linux/drivers/vfio/vfio.c

2474 lines
62 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO core
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
*/
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
vfio: Fix WARNING "do not call blocking ops when !TASK_RUNNING" vfio_dev_present() which is the condition to wait_event_interruptible_timeout(), will call vfio_group_get_device and try to acquire the mutex group->device_lock. wait_event_interruptible_timeout() will set the state of the current task to TASK_INTERRUPTIBLE, before doing the condition check. This means that we will try to acquire the mutex while already in a sleeping state. The scheduler warns us by giving the following warning: [ 4050.264464] ------------[ cut here ]------------ [ 4050.264508] do not call blocking ops when !TASK_RUNNING; state=1 set at [<00000000b33c00e2>] prepare_to_wait_event+0x14a/0x188 [ 4050.264529] WARNING: CPU: 12 PID: 35924 at kernel/sched/core.c:6112 __might_sleep+0x76/0x90 .... 4050.264756] Call Trace: [ 4050.264765] ([<000000000017bbaa>] __might_sleep+0x72/0x90) [ 4050.264774] [<0000000000b97edc>] __mutex_lock+0x44/0x8c0 [ 4050.264782] [<0000000000b9878a>] mutex_lock_nested+0x32/0x40 [ 4050.264793] [<000003ff800d7abe>] vfio_group_get_device+0x36/0xa8 [vfio] [ 4050.264803] [<000003ff800d87c0>] vfio_del_group_dev+0x238/0x378 [vfio] [ 4050.264813] [<000003ff8015f67c>] mdev_remove+0x3c/0x68 [mdev] [ 4050.264825] [<00000000008e01b0>] device_release_driver_internal+0x168/0x268 [ 4050.264834] [<00000000008de692>] bus_remove_device+0x162/0x190 [ 4050.264843] [<00000000008daf42>] device_del+0x1e2/0x368 [ 4050.264851] [<00000000008db12c>] device_unregister+0x64/0x88 [ 4050.264862] [<000003ff8015ed84>] mdev_device_remove+0xec/0x130 [mdev] [ 4050.264872] [<000003ff8015f074>] remove_store+0x6c/0xa8 [mdev] [ 4050.264881] [<000000000046f494>] kernfs_fop_write+0x14c/0x1f8 [ 4050.264890] [<00000000003c1530>] __vfs_write+0x38/0x1a8 [ 4050.264899] [<00000000003c187c>] vfs_write+0xb4/0x198 [ 4050.264908] [<00000000003c1af2>] ksys_write+0x5a/0xb0 [ 4050.264916] [<0000000000b9e270>] system_call+0xdc/0x2d8 [ 4050.264925] 4 locks held by sh/35924: [ 4050.264933] #0: 000000001ef90325 (sb_writers#4){.+.+}, at: vfs_write+0x9e/0x198 [ 4050.264948] #1: 000000005c1ab0b3 (&of->mutex){+.+.}, at: kernfs_fop_write+0x1cc/0x1f8 [ 4050.264963] #2: 0000000034831ab8 (kn->count#297){++++}, at: kernfs_remove_self+0x12e/0x150 [ 4050.264979] #3: 00000000e152484f (&dev->mutex){....}, at: device_release_driver_internal+0x5c/0x268 [ 4050.264993] Last Breaking-Event-Address: [ 4050.265002] [<000000000017bbaa>] __might_sleep+0x72/0x90 [ 4050.265010] irq event stamp: 7039 [ 4050.265020] hardirqs last enabled at (7047): [<00000000001cee7a>] console_unlock+0x6d2/0x740 [ 4050.265029] hardirqs last disabled at (7054): [<00000000001ce87e>] console_unlock+0xd6/0x740 [ 4050.265040] softirqs last enabled at (6416): [<0000000000b8fe26>] __udelay+0xb6/0x100 [ 4050.265049] softirqs last disabled at (6415): [<0000000000b8fe06>] __udelay+0x96/0x100 [ 4050.265057] ---[ end trace d04a07d39d99a9f9 ]--- Let's fix this as described in the article https://lwn.net/Articles/628628/. Signed-off-by: Farhan Ali <alifm@linux.ibm.com> [remove now redundant vfio_dev_present()] Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2019-04-04 02:22:27 +08:00
#include <linux/sched/signal.h>
#define DRIVER_VERSION "0.3"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO - User Level meta-driver"
static struct vfio {
struct class *class;
struct list_head iommu_drivers_list;
struct mutex iommu_drivers_lock;
struct list_head group_list;
struct idr group_idr;
struct mutex group_lock;
struct cdev group_cdev;
dev_t group_devt;
} vfio;
struct vfio_iommu_driver {
const struct vfio_iommu_driver_ops *ops;
struct list_head vfio_next;
};
struct vfio_container {
struct kref kref;
struct list_head group_list;
struct rw_semaphore group_lock;
struct vfio_iommu_driver *iommu_driver;
void *iommu_data;
bool noiommu;
};
struct vfio_unbound_dev {
struct device *dev;
struct list_head unbound_next;
};
struct vfio_group {
struct kref kref;
int minor;
atomic_t container_users;
struct iommu_group *iommu_group;
struct vfio_container *container;
struct list_head device_list;
struct mutex device_lock;
struct device *dev;
struct notifier_block nb;
struct list_head vfio_next;
struct list_head container_next;
struct list_head unbound_list;
struct mutex unbound_lock;
atomic_t opened;
wait_queue_head_t container_q;
bool noiommu;
unsigned int dev_counter;
struct kvm *kvm;
struct blocking_notifier_head notifier;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
static DEFINE_XARRAY(vfio_device_set_xa);
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
unsigned long idx = (unsigned long)set_id;
struct vfio_device_set *new_dev_set;
struct vfio_device_set *dev_set;
if (WARN_ON(!set_id))
return -EINVAL;
/*
* Atomically acquire a singleton object in the xarray for this set_id
*/
xa_lock(&vfio_device_set_xa);
dev_set = xa_load(&vfio_device_set_xa, idx);
if (dev_set)
goto found_get_ref;
xa_unlock(&vfio_device_set_xa);
new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
if (!new_dev_set)
return -ENOMEM;
mutex_init(&new_dev_set->lock);
INIT_LIST_HEAD(&new_dev_set->device_list);
new_dev_set->set_id = set_id;
xa_lock(&vfio_device_set_xa);
dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
GFP_KERNEL);
if (!dev_set) {
dev_set = new_dev_set;
goto found_get_ref;
}
kfree(new_dev_set);
if (xa_is_err(dev_set)) {
xa_unlock(&vfio_device_set_xa);
return xa_err(dev_set);
}
found_get_ref:
dev_set->device_count++;
xa_unlock(&vfio_device_set_xa);
mutex_lock(&dev_set->lock);
device->dev_set = dev_set;
list_add_tail(&device->dev_set_list, &dev_set->device_list);
mutex_unlock(&dev_set->lock);
return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
static void vfio_release_device_set(struct vfio_device *device)
{
struct vfio_device_set *dev_set = device->dev_set;
if (!dev_set)
return;
mutex_lock(&dev_set->lock);
list_del(&device->dev_set_list);
mutex_unlock(&dev_set->lock);
xa_lock(&vfio_device_set_xa);
if (!--dev_set->device_count) {
__xa_erase(&vfio_device_set_xa,
(unsigned long)dev_set->set_id);
mutex_destroy(&dev_set->lock);
kfree(dev_set);
}
xa_unlock(&vfio_device_set_xa);
}
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
if (arg != VFIO_NOIOMMU_IOMMU)
return ERR_PTR(-EINVAL);
if (!capable(CAP_SYS_RAWIO))
return ERR_PTR(-EPERM);
return NULL;
}
static void vfio_noiommu_release(void *iommu_data)
{
}
static long vfio_noiommu_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
{
if (cmd == VFIO_CHECK_EXTENSION)
return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
return -ENOTTY;
}
static int vfio_noiommu_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
return 0;
}
static void vfio_noiommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
}
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
.name = "vfio-noiommu",
.owner = THIS_MODULE,
.open = vfio_noiommu_open,
.release = vfio_noiommu_release,
.ioctl = vfio_noiommu_ioctl,
.attach_group = vfio_noiommu_attach_group,
.detach_group = vfio_noiommu_detach_group,
};
/*
* Only noiommu containers can use vfio-noiommu and noiommu containers can only
* use vfio-noiommu.
*/
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
const struct vfio_iommu_driver *driver)
{
return container->noiommu == (driver->ops == &vfio_noiommu_ops);
}
#else
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
const struct vfio_iommu_driver *driver)
{
return true;
}
#endif /* CONFIG_VFIO_NOIOMMU */
/**
* IOMMU driver registration
*/
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
struct vfio_iommu_driver *driver, *tmp;
driver = kzalloc(sizeof(*driver), GFP_KERNEL);
if (!driver)
return -ENOMEM;
driver->ops = ops;
mutex_lock(&vfio.iommu_drivers_lock);
/* Check for duplicates */
list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
if (tmp->ops == ops) {
mutex_unlock(&vfio.iommu_drivers_lock);
kfree(driver);
return -EINVAL;
}
}
list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
mutex_unlock(&vfio.iommu_drivers_lock);
return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
struct vfio_iommu_driver *driver;
mutex_lock(&vfio.iommu_drivers_lock);
list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
if (driver->ops == ops) {
list_del(&driver->vfio_next);
mutex_unlock(&vfio.iommu_drivers_lock);
kfree(driver);
return;
}
}
mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
/**
* Group minor allocation/free - both called with vfio.group_lock held
*/
static int vfio_alloc_group_minor(struct vfio_group *group)
{
return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}
static void vfio_free_group_minor(int minor)
{
idr_remove(&vfio.group_idr, minor);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/**
* Container objects - containers are created when /dev/vfio/vfio is
* opened, but their lifecycle extends until the last user is done, so
* it's freed via kref. Must support container/group/device being
* closed in any order.
*/
static void vfio_container_get(struct vfio_container *container)
{
kref_get(&container->kref);
}
static void vfio_container_release(struct kref *kref)
{
struct vfio_container *container;
container = container_of(kref, struct vfio_container, kref);
kfree(container);
}
static void vfio_container_put(struct vfio_container *container)
{
kref_put(&container->kref, vfio_container_release);
}
static void vfio_group_unlock_and_free(struct vfio_group *group)
{
mutex_unlock(&vfio.group_lock);
/*
* Unregister outside of lock. A spurious callback is harmless now
* that the group is no longer in vfio.group_list.
*/
iommu_group_unregister_notifier(group->iommu_group, &group->nb);
kfree(group);
}
/**
* Group objects - create, release, get, put, search
*/
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
bool noiommu)
{
struct vfio_group *group, *tmp;
struct device *dev;
int ret, minor;
group = kzalloc(sizeof(*group), GFP_KERNEL);
if (!group)
return ERR_PTR(-ENOMEM);
kref_init(&group->kref);
INIT_LIST_HEAD(&group->device_list);
mutex_init(&group->device_lock);
INIT_LIST_HEAD(&group->unbound_list);
mutex_init(&group->unbound_lock);
atomic_set(&group->container_users, 0);
atomic_set(&group->opened, 0);
init_waitqueue_head(&group->container_q);
group->iommu_group = iommu_group;
group->noiommu = noiommu;
BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
group->nb.notifier_call = vfio_iommu_group_notifier;
/*
* blocking notifiers acquire a rwsem around registering and hold
* it around callback. Therefore, need to register outside of
* vfio.group_lock to avoid A-B/B-A contention. Our callback won't
* do anything unless it can find the group in vfio.group_list, so
* no harm in registering early.
*/
ret = iommu_group_register_notifier(iommu_group, &group->nb);
if (ret) {
kfree(group);
return ERR_PTR(ret);
}
mutex_lock(&vfio.group_lock);
/* Did we race creating this group? */
list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
if (tmp->iommu_group == iommu_group) {
vfio_group_get(tmp);
vfio_group_unlock_and_free(group);
return tmp;
}
}
minor = vfio_alloc_group_minor(group);
if (minor < 0) {
vfio_group_unlock_and_free(group);
return ERR_PTR(minor);
}
dev = device_create(vfio.class, NULL,
MKDEV(MAJOR(vfio.group_devt), minor),
group, "%s%d", group->noiommu ? "noiommu-" : "",
iommu_group_id(iommu_group));
if (IS_ERR(dev)) {
vfio_free_group_minor(minor);
vfio_group_unlock_and_free(group);
return ERR_CAST(dev);
}
group->minor = minor;
group->dev = dev;
list_add(&group->vfio_next, &vfio.group_list);
mutex_unlock(&vfio.group_lock);
return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
struct vfio_group *group = container_of(kref, struct vfio_group, kref);
struct vfio_unbound_dev *unbound, *tmp;
struct iommu_group *iommu_group = group->iommu_group;
WARN_ON(!list_empty(&group->device_list));
WARN_ON(group->notifier.head);
list_for_each_entry_safe(unbound, tmp,
&group->unbound_list, unbound_next) {
list_del(&unbound->unbound_next);
kfree(unbound);
}
device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
list_del(&group->vfio_next);
vfio_free_group_minor(group->minor);
vfio_group_unlock_and_free(group);
iommu_group_put(iommu_group);
}
static void vfio_group_put(struct vfio_group *group)
{
kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}
struct vfio_group_put_work {
struct work_struct work;
struct vfio_group *group;
};
static void vfio_group_put_bg(struct work_struct *work)
{
struct vfio_group_put_work *do_work;
do_work = container_of(work, struct vfio_group_put_work, work);
vfio_group_put(do_work->group);
kfree(do_work);
}
static void vfio_group_schedule_put(struct vfio_group *group)
{
struct vfio_group_put_work *do_work;
do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
if (WARN_ON(!do_work))
return;
INIT_WORK(&do_work->work, vfio_group_put_bg);
do_work->group = group;
schedule_work(&do_work->work);
}
/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
kref_get(&group->kref);
}
/*
* Not really a try as we will sleep for mutex, but we need to make
* sure the group pointer is valid under lock and get a reference.
*/
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
struct vfio_group *target = group;
mutex_lock(&vfio.group_lock);
list_for_each_entry(group, &vfio.group_list, vfio_next) {
if (group == target) {
vfio_group_get(group);
mutex_unlock(&vfio.group_lock);
return group;
}
}
mutex_unlock(&vfio.group_lock);
return NULL;
}
static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
struct vfio_group *group;
mutex_lock(&vfio.group_lock);
list_for_each_entry(group, &vfio.group_list, vfio_next) {
if (group->iommu_group == iommu_group) {
vfio_group_get(group);
mutex_unlock(&vfio.group_lock);
return group;
}
}
mutex_unlock(&vfio.group_lock);
return NULL;
}
static struct vfio_group *vfio_group_get_from_minor(int minor)
{
struct vfio_group *group;
mutex_lock(&vfio.group_lock);
group = idr_find(&vfio.group_idr, minor);
if (!group) {
mutex_unlock(&vfio.group_lock);
return NULL;
}
vfio_group_get(group);
mutex_unlock(&vfio.group_lock);
return group;
}
static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
struct iommu_group *iommu_group;
struct vfio_group *group;
iommu_group = iommu_group_get(dev);
if (!iommu_group)
return NULL;
group = vfio_group_get_from_iommu(iommu_group);
iommu_group_put(iommu_group);
return group;
}
/**
* Device objects - create, release, get, put, search
*/
/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
if (refcount_dec_and_test(&device->refcount))
complete(&device->comp);
}
EXPORT_SYMBOL_GPL(vfio_device_put);
static bool vfio_device_try_get(struct vfio_device *device)
{
return refcount_inc_not_zero(&device->refcount);
}
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
struct device *dev)
{
struct vfio_device *device;
mutex_lock(&group->device_lock);
list_for_each_entry(device, &group->device_list, group_next) {
if (device->dev == dev && vfio_device_try_get(device)) {
mutex_unlock(&group->device_lock);
return device;
}
}
mutex_unlock(&group->device_lock);
return NULL;
}
/*
* Some drivers, like pci-stub, are only used to prevent other drivers from
* claiming a device and are therefore perfectly legitimate for a user owned
* group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
* of the device, but it does prevent the user from having direct access to
* the device, which is useful in some circumstances.
*
* We also assume that we can include PCI interconnect devices, ie. bridges.
* IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
* then all of the downstream devices will be part of the same IOMMU group as
* the bridge. Thus, if placing the bridge into the user owned IOVA space
* breaks anything, it only does so for user owned devices downstream. Note
* that error notification via MSI can be affected for platforms that handle
* MSI within the same IOVA space as DMA.
*/
static const char * const vfio_driver_allowed[] = { "pci-stub" };
static bool vfio_dev_driver_allowed(struct device *dev,
struct device_driver *drv)
{
if (dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(dev);
if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
return true;
}
return match_string(vfio_driver_allowed,
ARRAY_SIZE(vfio_driver_allowed),
drv->name) >= 0;
}
/*
* A vfio group is viable for use by userspace if all devices are in
* one of the following states:
* - driver-less
* - bound to a vfio driver
* - bound to an otherwise allowed driver
* - a PCI interconnect device
*
* We use two methods to determine whether a device is bound to a vfio
* driver. The first is to test whether the device exists in the vfio
* group. The second is to test if the device exists on the group
* unbound_list, indicating it's in the middle of transitioning from
* a vfio driver to driver-less.
*/
static int vfio_dev_viable(struct device *dev, void *data)
{
struct vfio_group *group = data;
struct vfio_device *device;
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
struct device_driver *drv = READ_ONCE(dev->driver);
struct vfio_unbound_dev *unbound;
int ret = -EINVAL;
mutex_lock(&group->unbound_lock);
list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
if (dev == unbound->dev) {
ret = 0;
break;
}
}
mutex_unlock(&group->unbound_lock);
if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
return 0;
device = vfio_group_get_device(group, dev);
if (device) {
vfio_device_put(device);
return 0;
}
return ret;
}
/**
* Async device support
*/
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
struct vfio_device *device;
/* Do we already know about it? We shouldn't */
device = vfio_group_get_device(group, dev);
if (WARN_ON_ONCE(device)) {
vfio_device_put(device);
return 0;
}
/* Nothing to do for idle groups */
if (!atomic_read(&group->container_users))
return 0;
/* TODO Prevent device auto probing */
dev_WARN(dev, "Device added to live group %d!\n",
iommu_group_id(group->iommu_group));
return 0;
}
static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
/* We don't care what happens when the group isn't in use */
if (!atomic_read(&group->container_users))
return 0;
return vfio_dev_viable(dev, group);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct vfio_group *group = container_of(nb, struct vfio_group, nb);
struct device *dev = data;
struct vfio_unbound_dev *unbound;
/*
* Need to go through a group_lock lookup to get a reference or we
* risk racing a group being removed. Ignore spurious notifies.
*/
group = vfio_group_try_get(group);
if (!group)
return NOTIFY_OK;
switch (action) {
case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
vfio_group_nb_add_dev(group, dev);
break;
case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
/*
* Nothing to do here. If the device is in use, then the
* vfio sub-driver should block the remove callback until
* it is unused. If the device is unused or attached to a
* stub driver, then it should be released and we don't
* care that it will be going away.
*/
break;
case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
iommu_group_id(group->iommu_group));
break;
case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
iommu_group_id(group->iommu_group), dev->driver->name);
BUG_ON(vfio_group_nb_verify(group, dev));
break;
case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
__func__, iommu_group_id(group->iommu_group),
dev->driver->name);
break;
case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
iommu_group_id(group->iommu_group));
/*
* XXX An unbound device in a live group is ok, but we'd
* really like to avoid the above BUG_ON by preventing other
* drivers from binding to it. Once that occurs, we have to
* stop the system to maintain isolation. At a minimum, we'd
* want a toggle to disable driver auto probe for this device.
*/
mutex_lock(&group->unbound_lock);
list_for_each_entry(unbound,
&group->unbound_list, unbound_next) {
if (dev == unbound->dev) {
list_del(&unbound->unbound_next);
kfree(unbound);
break;
}
}
mutex_unlock(&group->unbound_lock);
break;
}
/*
* If we're the last reference to the group, the group will be
* released, which includes unregistering the iommu group notifier.
* We hold a read-lock on that notifier list, unregistering needs
* a write-lock... deadlock. Release our reference asynchronously
* to avoid that situation.
*/
vfio_group_schedule_put(group);
return NOTIFY_OK;
}
/**
* VFIO driver API
*/
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
const struct vfio_device_ops *ops)
{
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
init_completion(&device->comp);
device->dev = dev;
device->ops = ops;
}
EXPORT_SYMBOL_GPL(vfio_init_group_dev);
void vfio_uninit_group_dev(struct vfio_device *device)
{
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
#ifdef CONFIG_VFIO_NOIOMMU
static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev)
{
struct iommu_group *iommu_group;
struct vfio_group *group;
int ret;
iommu_group = iommu_group_alloc();
if (IS_ERR(iommu_group))
return ERR_CAST(iommu_group);
iommu_group_set_name(iommu_group, "vfio-noiommu");
ret = iommu_group_add_device(iommu_group, dev);
if (ret)
goto out_put_group;
group = vfio_create_group(iommu_group, true);
if (IS_ERR(group)) {
ret = PTR_ERR(group);
goto out_remove_device;
}
return group;
out_remove_device:
iommu_group_remove_device(dev);
out_put_group:
iommu_group_put(iommu_group);
return ERR_PTR(ret);
}
#endif
static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
{
struct iommu_group *iommu_group;
struct vfio_group *group;
iommu_group = iommu_group_get(dev);
#ifdef CONFIG_VFIO_NOIOMMU
if (!iommu_group && noiommu && !iommu_present(dev->bus)) {
/*
* With noiommu enabled, create an IOMMU group for devices that
* don't already have one and don't have an iommu_ops on their
* bus. Taint the kernel because we're about to give a DMA
* capable device to a user without IOMMU protection.
*/
group = vfio_noiommu_group_alloc(dev);
if (!IS_ERR(group)) {
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
}
return group;
}
#endif
if (!iommu_group)
return ERR_PTR(-EINVAL);
/* a found vfio_group already holds a reference to the iommu_group */
group = vfio_group_get_from_iommu(iommu_group);
if (group)
goto out_put;
/* a newly created vfio_group keeps the reference. */
group = vfio_create_group(iommu_group, false);
if (IS_ERR(group))
goto out_put;
return group;
out_put:
iommu_group_put(iommu_group);
return group;
}
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
int vfio_register_group_dev(struct vfio_device *device)
{
struct vfio_device *existing_device;
struct vfio_group *group;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
/*
* If the driver doesn't specify a set then the device is added to a
* singleton set just for itself.
*/
if (!device->dev_set)
vfio_assign_device_set(device, device);
group = vfio_group_find_or_alloc(device->dev);
if (IS_ERR(group))
return PTR_ERR(group);
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
existing_device = vfio_group_get_device(group, device->dev);
if (existing_device) {
dev_WARN(device->dev, "Device already exists on group %d\n",
iommu_group_id(group->iommu_group));
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
vfio_device_put(existing_device);
if (group->noiommu)
iommu_group_remove_device(device->dev);
vfio_group_put(group);
return -EBUSY;
}
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
/* Our reference on group is moved to the device */
device->group = group;
/* Refcounting can't start until the driver calls register */
refcount_set(&device->refcount, 1);
mutex_lock(&group->device_lock);
list_add(&device->group_next, &group->device_list);
group->dev_counter++;
mutex_unlock(&group->device_lock);
return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
/**
* Get a reference to the vfio_device for a device. Even if the
* caller thinks they own the device, they could be racing with a
* release call path, so we can't trust drvdata for the shortcut.
* Go the long way around, from the iommu_group to the vfio_group
* to the vfio_device.
*/
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
struct vfio_group *group;
struct vfio_device *device;
group = vfio_group_get_from_dev(dev);
if (!group)
return NULL;
device = vfio_group_get_device(group, dev);
vfio_group_put(group);
return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
char *buf)
{
struct vfio_device *it, *device = ERR_PTR(-ENODEV);
mutex_lock(&group->device_lock);
list_for_each_entry(it, &group->device_list, group_next) {
int ret;
if (it->ops->match) {
ret = it->ops->match(it, buf);
if (ret < 0) {
device = ERR_PTR(ret);
break;
}
} else {
ret = !strcmp(dev_name(it->dev), buf);
}
if (ret && vfio_device_try_get(it)) {
device = it;
break;
}
}
mutex_unlock(&group->device_lock);
return device;
}
/*
* Decrement the device reference count and wait for the device to be
* removed. Open file descriptors for the device... */
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
void vfio_unregister_group_dev(struct vfio_device *device)
{
struct vfio_group *group = device->group;
struct vfio_unbound_dev *unbound;
unsigned int i = 0;
bool interrupted = false;
long rc;
/*
* When the device is removed from the group, the group suddenly
* becomes non-viable; the device has a driver (until the unbind
* completes), but it's not present in the group. This is bad news
* for any external users that need to re-acquire a group reference
* in order to match and release their existing reference. To
* solve this, we track such devices on the unbound_list to bridge
* the gap until they're fully unbound.
*/
unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
if (unbound) {
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
unbound->dev = device->dev;
mutex_lock(&group->unbound_lock);
list_add(&unbound->unbound_next, &group->unbound_list);
mutex_unlock(&group->unbound_lock);
}
WARN_ON(!unbound);
vfio_device_put(device);
rc = try_wait_for_completion(&device->comp);
while (rc <= 0) {
if (device->ops->request)
device->ops->request(device, i++);
if (interrupted) {
rc = wait_for_completion_timeout(&device->comp,
HZ * 10);
} else {
rc = wait_for_completion_interruptible_timeout(
&device->comp, HZ * 10);
if (rc < 0) {
interrupted = true;
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
dev_warn(device->dev,
"Device is currently in use, task"
" \"%s\" (%d) "
"blocked until device is released",
current->comm, task_pid_nr(current));
}
}
}
mutex_lock(&group->device_lock);
list_del(&device->group_next);
group->dev_counter--;
mutex_unlock(&group->device_lock);
vfio: Fix WARNING "do not call blocking ops when !TASK_RUNNING" vfio_dev_present() which is the condition to wait_event_interruptible_timeout(), will call vfio_group_get_device and try to acquire the mutex group->device_lock. wait_event_interruptible_timeout() will set the state of the current task to TASK_INTERRUPTIBLE, before doing the condition check. This means that we will try to acquire the mutex while already in a sleeping state. The scheduler warns us by giving the following warning: [ 4050.264464] ------------[ cut here ]------------ [ 4050.264508] do not call blocking ops when !TASK_RUNNING; state=1 set at [<00000000b33c00e2>] prepare_to_wait_event+0x14a/0x188 [ 4050.264529] WARNING: CPU: 12 PID: 35924 at kernel/sched/core.c:6112 __might_sleep+0x76/0x90 .... 4050.264756] Call Trace: [ 4050.264765] ([<000000000017bbaa>] __might_sleep+0x72/0x90) [ 4050.264774] [<0000000000b97edc>] __mutex_lock+0x44/0x8c0 [ 4050.264782] [<0000000000b9878a>] mutex_lock_nested+0x32/0x40 [ 4050.264793] [<000003ff800d7abe>] vfio_group_get_device+0x36/0xa8 [vfio] [ 4050.264803] [<000003ff800d87c0>] vfio_del_group_dev+0x238/0x378 [vfio] [ 4050.264813] [<000003ff8015f67c>] mdev_remove+0x3c/0x68 [mdev] [ 4050.264825] [<00000000008e01b0>] device_release_driver_internal+0x168/0x268 [ 4050.264834] [<00000000008de692>] bus_remove_device+0x162/0x190 [ 4050.264843] [<00000000008daf42>] device_del+0x1e2/0x368 [ 4050.264851] [<00000000008db12c>] device_unregister+0x64/0x88 [ 4050.264862] [<000003ff8015ed84>] mdev_device_remove+0xec/0x130 [mdev] [ 4050.264872] [<000003ff8015f074>] remove_store+0x6c/0xa8 [mdev] [ 4050.264881] [<000000000046f494>] kernfs_fop_write+0x14c/0x1f8 [ 4050.264890] [<00000000003c1530>] __vfs_write+0x38/0x1a8 [ 4050.264899] [<00000000003c187c>] vfs_write+0xb4/0x198 [ 4050.264908] [<00000000003c1af2>] ksys_write+0x5a/0xb0 [ 4050.264916] [<0000000000b9e270>] system_call+0xdc/0x2d8 [ 4050.264925] 4 locks held by sh/35924: [ 4050.264933] #0: 000000001ef90325 (sb_writers#4){.+.+}, at: vfs_write+0x9e/0x198 [ 4050.264948] #1: 000000005c1ab0b3 (&of->mutex){+.+.}, at: kernfs_fop_write+0x1cc/0x1f8 [ 4050.264963] #2: 0000000034831ab8 (kn->count#297){++++}, at: kernfs_remove_self+0x12e/0x150 [ 4050.264979] #3: 00000000e152484f (&dev->mutex){....}, at: device_release_driver_internal+0x5c/0x268 [ 4050.264993] Last Breaking-Event-Address: [ 4050.265002] [<000000000017bbaa>] __might_sleep+0x72/0x90 [ 4050.265010] irq event stamp: 7039 [ 4050.265020] hardirqs last enabled at (7047): [<00000000001cee7a>] console_unlock+0x6d2/0x740 [ 4050.265029] hardirqs last disabled at (7054): [<00000000001ce87e>] console_unlock+0xd6/0x740 [ 4050.265040] softirqs last enabled at (6416): [<0000000000b8fe26>] __udelay+0xb6/0x100 [ 4050.265049] softirqs last disabled at (6415): [<0000000000b8fe06>] __udelay+0x96/0x100 [ 4050.265057] ---[ end trace d04a07d39d99a9f9 ]--- Let's fix this as described in the article https://lwn.net/Articles/628628/. Signed-off-by: Farhan Ali <alifm@linux.ibm.com> [remove now redundant vfio_dev_present()] Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2019-04-04 02:22:27 +08:00
/*
* In order to support multiple devices per group, devices can be
* plucked from the group while other devices in the group are still
* in use. The container persists with this group and those remaining
* devices still attached. If the user creates an isolation violation
* by binding this device to another driver while the group is still in
* use, that's their fault. However, in the case of removing the last,
* or potentially the only, device in the group there can be no other
* in-use devices in the group. The user has done their due diligence
* and we should lay no claims to those devices. In order to do that,
* we need to make sure the group is detached from the container.
* Without this stall, we're potentially racing with a user process
* that may attempt to immediately bind this device to another driver.
*/
if (list_empty(&group->device_list))
wait_event(group->container_q, !group->container);
if (group->noiommu)
iommu_group_remove_device(device->dev);
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
/* Matches the get in vfio_register_group_dev() */
vfio_group_put(group);
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 23:53:05 +08:00
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
/**
* VFIO base fd, /dev/vfio/vfio
*/
static long vfio_ioctl_check_extension(struct vfio_container *container,
unsigned long arg)
{
struct vfio_iommu_driver *driver;
long ret = 0;
down_read(&container->group_lock);
driver = container->iommu_driver;
switch (arg) {
/* No base extensions yet */
default:
/*
* If no driver is set, poll all registered drivers for
* extensions and return the first positive result. If
* a driver is already set, further queries will be passed
* only to that driver.
*/
if (!driver) {
mutex_lock(&vfio.iommu_drivers_lock);
list_for_each_entry(driver, &vfio.iommu_drivers_list,
vfio_next) {
if (!list_empty(&container->group_list) &&
!vfio_iommu_driver_allowed(container,
driver))
continue;
if (!try_module_get(driver->ops->owner))
continue;
ret = driver->ops->ioctl(NULL,
VFIO_CHECK_EXTENSION,
arg);
module_put(driver->ops->owner);
if (ret > 0)
break;
}
mutex_unlock(&vfio.iommu_drivers_lock);
} else
ret = driver->ops->ioctl(container->iommu_data,
VFIO_CHECK_EXTENSION, arg);
}
up_read(&container->group_lock);
return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
struct vfio_iommu_driver *driver,
void *data)
{
struct vfio_group *group;
int ret = -ENODEV;
list_for_each_entry(group, &container->group_list, container_next) {
ret = driver->ops->attach_group(data, group->iommu_group);
if (ret)
goto unwind;
}
return ret;
unwind:
list_for_each_entry_continue_reverse(group, &container->group_list,
container_next) {
driver->ops->detach_group(data, group->iommu_group);
}
return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
unsigned long arg)
{
struct vfio_iommu_driver *driver;
long ret = -ENODEV;
down_write(&container->group_lock);
/*
* The container is designed to be an unprivileged interface while
* the group can be assigned to specific users. Therefore, only by
* adding a group to a container does the user get the privilege of
* enabling the iommu, which may allocate finite resources. There
* is no unset_iommu, but by removing all the groups from a container,
* the container is deprivileged and returns to an unset state.
*/
if (list_empty(&container->group_list) || container->iommu_driver) {
up_write(&container->group_lock);
return -EINVAL;
}
mutex_lock(&vfio.iommu_drivers_lock);
list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
void *data;
if (!vfio_iommu_driver_allowed(container, driver))
continue;
if (!try_module_get(driver->ops->owner))
continue;
/*
* The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
* so test which iommu driver reported support for this
* extension and call open on them. We also pass them the
* magic, allowing a single driver to support multiple
* interfaces if they'd like.
*/
if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
module_put(driver->ops->owner);
continue;
}
data = driver->ops->open(arg);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
module_put(driver->ops->owner);
continue;
}
ret = __vfio_container_attach_groups(container, driver, data);
if (ret) {
driver->ops->release(data);
module_put(driver->ops->owner);
continue;
}
container->iommu_driver = driver;
container->iommu_data = data;
break;
}
mutex_unlock(&vfio.iommu_drivers_lock);
up_write(&container->group_lock);
return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver;
void *data;
long ret = -EINVAL;
if (!container)
return ret;
switch (cmd) {
case VFIO_GET_API_VERSION:
ret = VFIO_API_VERSION;
break;
case VFIO_CHECK_EXTENSION:
ret = vfio_ioctl_check_extension(container, arg);
break;
case VFIO_SET_IOMMU:
ret = vfio_ioctl_set_iommu(container, arg);
break;
default:
driver = container->iommu_driver;
data = container->iommu_data;
if (driver) /* passthrough all unrecognized ioctls */
ret = driver->ops->ioctl(data, cmd, arg);
}
return ret;
}
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
struct vfio_container *container;
container = kzalloc(sizeof(*container), GFP_KERNEL);
if (!container)
return -ENOMEM;
INIT_LIST_HEAD(&container->group_list);
init_rwsem(&container->group_lock);
kref_init(&container->kref);
filep->private_data = container;
return 0;
}
static int vfio_fops_release(struct inode *inode, struct file *filep)
{
struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver = container->iommu_driver;
if (driver && driver->ops->notify)
driver->ops->notify(container->iommu_data,
VFIO_IOMMU_CONTAINER_CLOSE);
filep->private_data = NULL;
vfio_container_put(container);
return 0;
}
/*
* Once an iommu driver is set, we optionally pass read/write/mmap
* on to the driver, allowing management interfaces beyond ioctl.
*/
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
size_t count, loff_t *ppos)
{
struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver;
ssize_t ret = -EINVAL;
driver = container->iommu_driver;
if (likely(driver && driver->ops->read))
ret = driver->ops->read(container->iommu_data,
buf, count, ppos);
return ret;
}
static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
size_t count, loff_t *ppos)
{
struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver;
ssize_t ret = -EINVAL;
driver = container->iommu_driver;
if (likely(driver && driver->ops->write))
ret = driver->ops->write(container->iommu_data,
buf, count, ppos);
return ret;
}
static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver;
int ret = -EINVAL;
driver = container->iommu_driver;
if (likely(driver && driver->ops->mmap))
ret = driver->ops->mmap(container->iommu_data, vma);
return ret;
}
static const struct file_operations vfio_fops = {
.owner = THIS_MODULE,
.open = vfio_fops_open,
.release = vfio_fops_release,
.read = vfio_fops_read,
.write = vfio_fops_write,
.unlocked_ioctl = vfio_fops_unl_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.mmap = vfio_fops_mmap,
};
/**
* VFIO Group fd, /dev/vfio/$GROUP
*/
static void __vfio_group_unset_container(struct vfio_group *group)
{
struct vfio_container *container = group->container;
struct vfio_iommu_driver *driver;
down_write(&container->group_lock);
driver = container->iommu_driver;
if (driver)
driver->ops->detach_group(container->iommu_data,
group->iommu_group);
group->container = NULL;
wake_up(&group->container_q);
list_del(&group->container_next);
/* Detaching the last group deprivileges a container, remove iommu */
if (driver && list_empty(&container->group_list)) {
driver->ops->release(container->iommu_data);
module_put(driver->ops->owner);
container->iommu_driver = NULL;
container->iommu_data = NULL;
}
up_write(&container->group_lock);
vfio_container_put(container);
}
/*
* VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
* if there was no container to unset. Since the ioctl is called on
* the group, we know that still exists, therefore the only valid
* transition here is 1->0.
*/
static int vfio_group_unset_container(struct vfio_group *group)
{
int users = atomic_cmpxchg(&group->container_users, 1, 0);
if (!users)
return -EINVAL;
if (users != 1)
return -EBUSY;
__vfio_group_unset_container(group);
return 0;
}
/*
* When removing container users, anything that removes the last user
* implicitly removes the group from the container. That is, if the
* group file descriptor is closed, as well as any device file descriptors,
* the group is free.
*/
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
if (0 == atomic_dec_if_positive(&group->container_users))
__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
struct fd f;
struct vfio_container *container;
struct vfio_iommu_driver *driver;
int ret = 0;
if (atomic_read(&group->container_users))
return -EINVAL;
if (group->noiommu && !capable(CAP_SYS_RAWIO))
return -EPERM;
f = fdget(container_fd);
if (!f.file)
return -EBADF;
/* Sanity check, is this really our fd? */
if (f.file->f_op != &vfio_fops) {
fdput(f);
return -EINVAL;
}
container = f.file->private_data;
WARN_ON(!container); /* fget ensures we don't race vfio_release */
down_write(&container->group_lock);
/* Real groups and fake groups cannot mix */
if (!list_empty(&container->group_list) &&
container->noiommu != group->noiommu) {
ret = -EPERM;
goto unlock_out;
}
driver = container->iommu_driver;
if (driver) {
ret = driver->ops->attach_group(container->iommu_data,
group->iommu_group);
if (ret)
goto unlock_out;
}
group->container = container;
container->noiommu = group->noiommu;
list_add(&group->container_next, &container->group_list);
/* Get a reference on the container and mark a user within the group */
vfio_container_get(container);
atomic_inc(&group->container_users);
unlock_out:
up_write(&container->group_lock);
fdput(f);
return ret;
}
static bool vfio_group_viable(struct vfio_group *group)
{
return (iommu_group_for_each_dev(group->iommu_group,
group, vfio_dev_viable) == 0);
}
static int vfio_group_add_container_user(struct vfio_group *group)
{
if (!atomic_inc_not_zero(&group->container_users))
return -EINVAL;
if (group->noiommu) {
atomic_dec(&group->container_users);
return -EPERM;
}
if (!group->container->iommu_driver || !vfio_group_viable(group)) {
atomic_dec(&group->container_users);
return -EINVAL;
}
return 0;
}
static const struct file_operations vfio_device_fops;
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
struct vfio_device *device;
struct file *filep;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
int fdno;
int ret = 0;
if (0 == atomic_read(&group->container_users) ||
!group->container->iommu_driver || !vfio_group_viable(group))
return -EINVAL;
if (group->noiommu && !capable(CAP_SYS_RAWIO))
return -EPERM;
device = vfio_device_get_from_name(group, buf);
if (IS_ERR(device))
return PTR_ERR(device);
if (!try_module_get(device->dev->driver->owner)) {
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
ret = -ENODEV;
goto err_device_put;
}
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
mutex_lock(&device->dev_set->lock);
device->open_count++;
if (device->open_count == 1 && device->ops->open_device) {
ret = device->ops->open_device(device);
if (ret)
goto err_undo_count;
}
mutex_unlock(&device->dev_set->lock);
/*
* We can't use anon_inode_getfd() because we need to modify
* the f_mode flags directly to allow more than just ioctls
*/
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
fdno = ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
goto err_close_device;
filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
device, O_RDWR);
if (IS_ERR(filep)) {
ret = PTR_ERR(filep);
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
goto err_fd;
}
/*
* TODO: add an anon_inode interface to do this.
* Appears to be missing by lack of need rather than
* explicitly prevented. Now there's need.
*/
filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
atomic_inc(&group->container_users);
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
fd_install(fdno, filep);
if (group->noiommu)
dev_warn(device->dev, "vfio-noiommu device opened by user "
"(%s:%d)\n", current->comm, task_pid_nr(current));
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
return fdno;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
err_fd:
put_unused_fd(fdno);
err_close_device:
mutex_lock(&device->dev_set->lock);
if (device->open_count == 1 && device->ops->close_device)
device->ops->close_device(device);
err_undo_count:
device->open_count--;
mutex_unlock(&device->dev_set->lock);
module_put(device->dev->driver->owner);
err_device_put:
vfio_device_put(device);
return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
struct vfio_group *group = filep->private_data;
long ret = -ENOTTY;
switch (cmd) {
case VFIO_GROUP_GET_STATUS:
{
struct vfio_group_status status;
unsigned long minsz;
minsz = offsetofend(struct vfio_group_status, flags);
if (copy_from_user(&status, (void __user *)arg, minsz))
return -EFAULT;
if (status.argsz < minsz)
return -EINVAL;
status.flags = 0;
if (vfio_group_viable(group))
status.flags |= VFIO_GROUP_FLAGS_VIABLE;
if (group->container)
status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
if (copy_to_user((void __user *)arg, &status, minsz))
return -EFAULT;
ret = 0;
break;
}
case VFIO_GROUP_SET_CONTAINER:
{
int fd;
if (get_user(fd, (int __user *)arg))
return -EFAULT;
if (fd < 0)
return -EINVAL;
ret = vfio_group_set_container(group, fd);
break;
}
case VFIO_GROUP_UNSET_CONTAINER:
ret = vfio_group_unset_container(group);
break;
case VFIO_GROUP_GET_DEVICE_FD:
{
char *buf;
buf = strndup_user((const char __user *)arg, PAGE_SIZE);
if (IS_ERR(buf))
return PTR_ERR(buf);
ret = vfio_group_get_device_fd(group, buf);
kfree(buf);
break;
}
}
return ret;
}
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
struct vfio_group *group;
int opened;
group = vfio_group_get_from_minor(iminor(inode));
if (!group)
return -ENODEV;
if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
vfio_group_put(group);
return -EPERM;
}
/* Do we need multiple instances of the group open? Seems not. */
opened = atomic_cmpxchg(&group->opened, 0, 1);
if (opened) {
vfio_group_put(group);
return -EBUSY;
}
/* Is something still in use from a previous open? */
if (group->container) {
atomic_dec(&group->opened);
vfio_group_put(group);
return -EBUSY;
}
/* Warn if previous user didn't cleanup and re-init to drop them */
if (WARN_ON(group->notifier.head))
BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
filep->private_data = group;
return 0;
}
static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
struct vfio_group *group = filep->private_data;
filep->private_data = NULL;
vfio_group_try_dissolve_container(group);
atomic_dec(&group->opened);
vfio_group_put(group);
return 0;
}
static const struct file_operations vfio_group_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = vfio_group_fops_unl_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.open = vfio_group_fops_open,
.release = vfio_group_fops_release,
};
/**
* VFIO Device fd
*/
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
struct vfio_device *device = filep->private_data;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
mutex_lock(&device->dev_set->lock);
if (!--device->open_count && device->ops->close_device)
device->ops->close_device(device);
mutex_unlock(&device->dev_set->lock);
module_put(device->dev->driver->owner);
vfio_group_try_dissolve_container(device->group);
vfio_device_put(device);
return 0;
}
static long vfio_device_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
struct vfio_device *device = filep->private_data;
if (unlikely(!device->ops->ioctl))
return -EINVAL;
return device->ops->ioctl(device, cmd, arg);
}
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
size_t count, loff_t *ppos)
{
struct vfio_device *device = filep->private_data;
if (unlikely(!device->ops->read))
return -EINVAL;
return device->ops->read(device, buf, count, ppos);
}
static ssize_t vfio_device_fops_write(struct file *filep,
const char __user *buf,
size_t count, loff_t *ppos)
{
struct vfio_device *device = filep->private_data;
if (unlikely(!device->ops->write))
return -EINVAL;
return device->ops->write(device, buf, count, ppos);
}
static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
struct vfio_device *device = filep->private_data;
if (unlikely(!device->ops->mmap))
return -EINVAL;
return device->ops->mmap(device, vma);
}
static const struct file_operations vfio_device_fops = {
.owner = THIS_MODULE,
.release = vfio_device_fops_release,
.read = vfio_device_fops_read,
.write = vfio_device_fops_write,
.unlocked_ioctl = vfio_device_fops_unl_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.mmap = vfio_device_fops_mmap,
};
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-06 00:52:36 +08:00
/**
* External user API, exported by symbols to be linked dynamically.
*
* The protocol includes:
* 1. do normal VFIO init operation:
* - opening a new container;
* - attaching group(s) to it;
* - setting an IOMMU driver for a container.
* When IOMMU is set for a container, all groups in it are
* considered ready to use by an external user.
*
* 2. User space passes a group fd to an external user.
* The external user calls vfio_group_get_external_user()
* to verify that:
* - the group is initialized;
* - IOMMU is set for it.
* If both checks passed, vfio_group_get_external_user()
* increments the container user counter to prevent
* the VFIO group from disposal before KVM exits.
*
* 3. The external user calls vfio_external_user_iommu_id()
* to know an IOMMU ID.
*
* 4. When the external KVM finishes, it calls
* vfio_group_put_external_user() to release the VFIO group.
* This call decrements the container user counter.
*/
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
struct vfio_group *group = filep->private_data;
int ret;
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-06 00:52:36 +08:00
if (filep->f_op != &vfio_group_fops)
return ERR_PTR(-EINVAL);
ret = vfio_group_add_container_user(group);
if (ret)
return ERR_PTR(ret);
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-06 00:52:36 +08:00
vfio_group_get(group);
return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
/**
* External user API, exported by symbols to be linked dynamically.
* The external user passes in a device pointer
* to verify that:
* - A VFIO group is assiciated with the device;
* - IOMMU is set for the group.
* If both checks passed, vfio_group_get_external_user_from_dev()
* increments the container user counter to prevent the VFIO group
* from disposal before external user exits and returns the pointer
* to the VFIO group.
*
* When the external user finishes using the VFIO group, it calls
* vfio_group_put_external_user() to release the VFIO group and
* decrement the container user counter.
*
* @dev [in] : device
* Return error PTR or pointer to VFIO group.
*/
struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
{
struct vfio_group *group;
int ret;
group = vfio_group_get_from_dev(dev);
if (!group)
return ERR_PTR(-ENODEV);
ret = vfio_group_add_container_user(group);
if (ret) {
vfio_group_put(group);
return ERR_PTR(ret);
}
return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-06 00:52:36 +08:00
void vfio_group_put_external_user(struct vfio_group *group)
{
vfio_group_try_dissolve_container(group);
vfio_group_put(group);
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-06 00:52:36 +08:00
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
bool vfio_external_group_match_file(struct vfio_group *test_group,
struct file *filep)
{
struct vfio_group *group = filep->private_data;
return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-06 00:52:36 +08:00
int vfio_external_user_iommu_id(struct vfio_group *group)
{
return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
/**
* Sub-module support
*/
/*
* Helper for managing a buffer of info chain capabilities, allocate or
* reallocate a buffer with additional @size, filling in @id and @version
* of the capability. A pointer to the new capability is returned.
*
* NB. The chain is based at the head of the buffer, so new entries are
* added to the tail, vfio_info_cap_shift() should be called to fixup the
* next offsets prior to copying to the user buffer.
*/
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
size_t size, u16 id, u16 version)
{
void *buf;
struct vfio_info_cap_header *header, *tmp;
buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
if (!buf) {
kfree(caps->buf);
caps->size = 0;
return ERR_PTR(-ENOMEM);
}
caps->buf = buf;
header = buf + caps->size;
/* Eventually copied to user buffer, zero */
memset(header, 0, size);
header->id = id;
header->version = version;
/* Add to the end of the capability chain */
for (tmp = buf; tmp->next; tmp = buf + tmp->next)
; /* nothing */
tmp->next = caps->size;
caps->size += size;
return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
struct vfio_info_cap_header *tmp;
void *buf = (void *)caps->buf;
for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
int vfio_info_add_capability(struct vfio_info_cap *caps,
struct vfio_info_cap_header *cap, size_t size)
{
struct vfio_info_cap_header *header;
header = vfio_info_cap_add(caps, size, cap->id, cap->version);
if (IS_ERR(header))
return PTR_ERR(header);
memcpy(header + 1, cap + 1, size - sizeof(*header));
return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
int max_irq_type, size_t *data_size)
{
unsigned long minsz;
size_t size;
minsz = offsetofend(struct vfio_irq_set, count);
if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
(hdr->count >= (U32_MAX - hdr->start)) ||
(hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
VFIO_IRQ_SET_ACTION_TYPE_MASK)))
return -EINVAL;
if (data_size)
*data_size = 0;
if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
return -EINVAL;
switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
case VFIO_IRQ_SET_DATA_NONE:
size = 0;
break;
case VFIO_IRQ_SET_DATA_BOOL:
size = sizeof(uint8_t);
break;
case VFIO_IRQ_SET_DATA_EVENTFD:
size = sizeof(int32_t);
break;
default:
return -EINVAL;
}
if (size) {
if (hdr->argsz - minsz < hdr->count * size)
return -EINVAL;
if (!data_size)
return -EINVAL;
*data_size = hdr->count * size;
}
return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
/*
* Pin a set of guest PFNs and return their associated host PFNs for local
* domain only.
* @dev [in] : device
* @user_pfn [in]: array of user/guest PFNs to be pinned.
* @npage [in] : count of elements in user_pfn array. This count should not
* be greater VFIO_PIN_PAGES_MAX_ENTRIES.
* @prot [in] : protection flags
* @phys_pfn[out]: array of host PFNs
* Return error or number of pages pinned.
*/
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
int prot, unsigned long *phys_pfn)
{
struct vfio_container *container;
struct vfio_group *group;
struct vfio_iommu_driver *driver;
int ret;
if (!dev || !user_pfn || !phys_pfn || !npage)
return -EINVAL;
if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
return -E2BIG;
group = vfio_group_get_from_dev(dev);
if (!group)
return -ENODEV;
if (group->dev_counter > 1) {
ret = -EINVAL;
goto err_pin_pages;
}
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
group->iommu_group, user_pfn,
npage, prot, phys_pfn);
else
ret = -ENOTTY;
vfio_group_try_dissolve_container(group);
err_pin_pages:
vfio_group_put(group);
return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
* Unpin set of host PFNs for local domain only.
* @dev [in] : device
* @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
* PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
* @npage [in] : count of elements in user_pfn array. This count should not
* be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
* Return error or number of pages unpinned.
*/
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
struct vfio_container *container;
struct vfio_group *group;
struct vfio_iommu_driver *driver;
int ret;
if (!dev || !user_pfn || !npage)
return -EINVAL;
if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
return -E2BIG;
group = vfio_group_get_from_dev(dev);
if (!group)
return -ENODEV;
ret = vfio_group_add_container_user(group);
if (ret)
goto err_unpin_pages;
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->unpin_pages))
ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
npage);
else
ret = -ENOTTY;
vfio_group_try_dissolve_container(group);
err_unpin_pages:
vfio_group_put(group);
return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
/*
* Pin a set of guest IOVA PFNs and return their associated host PFNs for a
* VFIO group.
*
* The caller needs to call vfio_group_get_external_user() or
* vfio_group_get_external_user_from_dev() prior to calling this interface,
* so as to prevent the VFIO group from disposal in the middle of the call.
* But it can keep the reference to the VFIO group for several calls into
* this interface.
* After finishing using of the VFIO group, the caller needs to release the
* VFIO group by calling vfio_group_put_external_user().
*
* @group [in] : VFIO group
* @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned.
* @npage [in] : count of elements in user_iova_pfn array.
* This count should not be greater
* VFIO_PIN_PAGES_MAX_ENTRIES.
* @prot [in] : protection flags
* @phys_pfn [out] : array of host PFNs
* Return error or number of pages pinned.
*/
int vfio_group_pin_pages(struct vfio_group *group,
unsigned long *user_iova_pfn, int npage,
int prot, unsigned long *phys_pfn)
{
struct vfio_container *container;
struct vfio_iommu_driver *driver;
int ret;
if (!group || !user_iova_pfn || !phys_pfn || !npage)
return -EINVAL;
if (group->dev_counter > 1)
return -EINVAL;
if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
return -E2BIG;
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
group->iommu_group, user_iova_pfn,
npage, prot, phys_pfn);
else
ret = -ENOTTY;
return ret;
}
EXPORT_SYMBOL(vfio_group_pin_pages);
/*
* Unpin a set of guest IOVA PFNs for a VFIO group.
*
* The caller needs to call vfio_group_get_external_user() or
* vfio_group_get_external_user_from_dev() prior to calling this interface,
* so as to prevent the VFIO group from disposal in the middle of the call.
* But it can keep the reference to the VFIO group for several calls into
* this interface.
* After finishing using of the VFIO group, the caller needs to release the
* VFIO group by calling vfio_group_put_external_user().
*
* @group [in] : vfio group
* @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned.
* @npage [in] : count of elements in user_iova_pfn array.
* This count should not be greater than
* VFIO_PIN_PAGES_MAX_ENTRIES.
* Return error or number of pages unpinned.
*/
int vfio_group_unpin_pages(struct vfio_group *group,
unsigned long *user_iova_pfn, int npage)
{
struct vfio_container *container;
struct vfio_iommu_driver *driver;
int ret;
if (!group || !user_iova_pfn || !npage)
return -EINVAL;
if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
return -E2BIG;
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->unpin_pages))
ret = driver->ops->unpin_pages(container->iommu_data,
user_iova_pfn, npage);
else
ret = -ENOTTY;
return ret;
}
EXPORT_SYMBOL(vfio_group_unpin_pages);
/*
* This interface allows the CPUs to perform some sort of virtual DMA on
* behalf of the device.
*
* CPUs read/write from/into a range of IOVAs pointing to user space memory
* into/from a kernel buffer.
*
* As the read/write of user space memory is conducted via the CPUs and is
* not a real device DMA, it is not necessary to pin the user space memory.
*
* The caller needs to call vfio_group_get_external_user() or
* vfio_group_get_external_user_from_dev() prior to calling this interface,
* so as to prevent the VFIO group from disposal in the middle of the call.
* But it can keep the reference to the VFIO group for several calls into
* this interface.
* After finishing using of the VFIO group, the caller needs to release the
* VFIO group by calling vfio_group_put_external_user().
*
* @group [in] : VFIO group
* @user_iova [in] : base IOVA of a user space buffer
* @data [in] : pointer to kernel buffer
* @len [in] : kernel buffer length
* @write : indicate read or write
* Return error code on failure or 0 on success.
*/
int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
void *data, size_t len, bool write)
{
struct vfio_container *container;
struct vfio_iommu_driver *driver;
int ret = 0;
if (!group || !data || len <= 0)
return -EINVAL;
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->dma_rw))
ret = driver->ops->dma_rw(container->iommu_data,
user_iova, data, len, write);
else
ret = -ENOTTY;
return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);
static int vfio_register_iommu_notifier(struct vfio_group *group,
unsigned long *events,
struct notifier_block *nb)
{
struct vfio_container *container;
struct vfio_iommu_driver *driver;
int ret;
ret = vfio_group_add_container_user(group);
if (ret)
return -EINVAL;
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->register_notifier))
ret = driver->ops->register_notifier(container->iommu_data,
events, nb);
else
ret = -ENOTTY;
vfio_group_try_dissolve_container(group);
return ret;
}
static int vfio_unregister_iommu_notifier(struct vfio_group *group,
struct notifier_block *nb)
{
struct vfio_container *container;
struct vfio_iommu_driver *driver;
int ret;
ret = vfio_group_add_container_user(group);
if (ret)
return -EINVAL;
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->unregister_notifier))
ret = driver->ops->unregister_notifier(container->iommu_data,
nb);
else
ret = -ENOTTY;
vfio_group_try_dissolve_container(group);
return ret;
}
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
group->kvm = kvm;
blocking_notifier_call_chain(&group->notifier,
VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
static int vfio_register_group_notifier(struct vfio_group *group,
unsigned long *events,
struct notifier_block *nb)
{
int ret;
bool set_kvm = false;
if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
set_kvm = true;
/* clear known events */
*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
/* refuse to continue if still events remaining */
if (*events)
return -EINVAL;
ret = vfio_group_add_container_user(group);
if (ret)
return -EINVAL;
ret = blocking_notifier_chain_register(&group->notifier, nb);
/*
* The attaching of kvm and vfio_group might already happen, so
* here we replay once upon registration.
*/
if (!ret && set_kvm && group->kvm)
blocking_notifier_call_chain(&group->notifier,
VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
vfio_group_try_dissolve_container(group);
return ret;
}
static int vfio_unregister_group_notifier(struct vfio_group *group,
struct notifier_block *nb)
{
int ret;
ret = vfio_group_add_container_user(group);
if (ret)
return -EINVAL;
ret = blocking_notifier_chain_unregister(&group->notifier, nb);
vfio_group_try_dissolve_container(group);
return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
unsigned long *events, struct notifier_block *nb)
{
struct vfio_group *group;
int ret;
if (!dev || !nb || !events || (*events == 0))
return -EINVAL;
group = vfio_group_get_from_dev(dev);
if (!group)
return -ENODEV;
switch (type) {
case VFIO_IOMMU_NOTIFY:
ret = vfio_register_iommu_notifier(group, events, nb);
break;
case VFIO_GROUP_NOTIFY:
ret = vfio_register_group_notifier(group, events, nb);
break;
default:
ret = -EINVAL;
}
vfio_group_put(group);
return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);
int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
struct notifier_block *nb)
{
struct vfio_group *group;
int ret;
if (!dev || !nb)
return -EINVAL;
group = vfio_group_get_from_dev(dev);
if (!group)
return -ENODEV;
switch (type) {
case VFIO_IOMMU_NOTIFY:
ret = vfio_unregister_iommu_notifier(group, nb);
break;
case VFIO_GROUP_NOTIFY:
ret = vfio_unregister_group_notifier(group, nb);
break;
default:
ret = -EINVAL;
}
vfio_group_put(group);
return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
{
struct vfio_container *container;
struct vfio_iommu_driver *driver;
if (!group)
return ERR_PTR(-EINVAL);
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->group_iommu_domain))
return driver->ops->group_iommu_domain(container->iommu_data,
group->iommu_group);
return ERR_PTR(-ENOTTY);
}
EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
/**
* Module/class support
*/
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}
static struct miscdevice vfio_dev = {
.minor = VFIO_MINOR,
.name = "vfio",
.fops = &vfio_fops,
.nodename = "vfio/vfio",
.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
int ret;
idr_init(&vfio.group_idr);
mutex_init(&vfio.group_lock);
mutex_init(&vfio.iommu_drivers_lock);
INIT_LIST_HEAD(&vfio.group_list);
INIT_LIST_HEAD(&vfio.iommu_drivers_list);
ret = misc_register(&vfio_dev);
if (ret) {
pr_err("vfio: misc device register failed\n");
return ret;
}
/* /dev/vfio/$GROUP */
vfio.class = class_create(THIS_MODULE, "vfio");
if (IS_ERR(vfio.class)) {
ret = PTR_ERR(vfio.class);
goto err_class;
}
vfio.class->devnode = vfio_devnode;
ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
if (ret)
goto err_alloc_chrdev;
cdev_init(&vfio.group_cdev, &vfio_group_fops);
ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
if (ret)
goto err_cdev_add;
pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
#ifdef CONFIG_VFIO_NOIOMMU
vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
return 0;
err_cdev_add:
unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
class_destroy(vfio.class);
vfio.class = NULL;
err_class:
misc_deregister(&vfio_dev);
return ret;
}
static void __exit vfio_cleanup(void)
{
WARN_ON(!list_empty(&vfio.group_list));
#ifdef CONFIG_VFIO_NOIOMMU
vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
idr_destroy(&vfio.group_idr);
cdev_del(&vfio.group_cdev);
unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
class_destroy(vfio.class);
vfio.class = NULL;
misc_deregister(&vfio_dev);
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 09:19:00 +08:00
xa_destroy(&vfio_device_set_xa);
}
module_init(vfio_init);
module_exit(vfio_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");