2012-07-31 22:16:24 +08:00
/*
* Copyright ( C ) 2012 Red Hat , Inc . All rights reserved .
* Author : Alex Williamson < alex . williamson @ redhat . com >
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
*
* Derived from original vfio :
* Copyright 2010 Cisco Systems , Inc . All rights reserved .
* Author : Tom Lyon , pugs @ cisco . com
*/
# include <linux/device.h>
# include <linux/eventfd.h>
2013-09-05 01:28:04 +08:00
# include <linux/file.h>
2012-07-31 22:16:24 +08:00
# include <linux/interrupt.h>
# include <linux/iommu.h>
# include <linux/module.h>
# include <linux/mutex.h>
# include <linux/notifier.h>
# include <linux/pci.h>
# include <linux/pm_runtime.h>
# include <linux/slab.h>
# include <linux/types.h>
# include <linux/uaccess.h>
# include <linux/vfio.h>
2015-04-08 01:14:41 +08:00
# include <linux/vgaarb.h>
2012-07-31 22:16:24 +08:00
# include "vfio_pci_private.h"
/* Module identity strings, consumed by the MODULE_* macros at the end of the file. */
# define DRIVER_VERSION "0.2"
# define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
# define DRIVER_DESC "VFIO PCI - User Level meta-driver"
/*
 * nointxmask: module parameter registered with S_IRUGO | S_IWUSR.  When it
 * is set, vfio_pci_enable() skips the pci_intx_mask_supported() probe.
 */
static bool nointxmask ;
module_param_named ( nointxmask , nointxmask , bool , S_IRUGO | S_IWUSR ) ;
MODULE_PARM_DESC ( nointxmask ,
" Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag. " ) ;
2015-04-08 01:14:40 +08:00
/*
 * disable_vga: read-only (S_IRUGO) module parameter that only exists when
 * CONFIG_VFIO_PCI_VGA is defined.  It is read through vfio_vga_disabled().
 */
# ifdef CONFIG_VFIO_PCI_VGA
static bool disable_vga ;
module_param ( disable_vga , bool , S_IRUGO ) ;
MODULE_PARM_DESC ( disable_vga , " Disable VGA resource access through vfio-pci " ) ;
# endif
2014-08-08 01:12:04 +08:00
/*
 * Taken around the per-device refcnt changes in vfio_pci_open() and
 * vfio_pci_release(), and therefore also held across the enable/disable and
 * bus reset attempts those paths perform.
 */
static DEFINE_MUTEX ( driver_lock ) ;
2015-04-08 01:14:40 +08:00
/*
 * Report whether VGA resource access through vfio-pci is turned off.
 *
 * Without CONFIG_VFIO_PCI_VGA this is always true; with it, the answer is
 * the current value of the disable_vga module parameter.
 */
static inline bool vfio_vga_disabled(void)
{
	bool disabled = true;

#ifdef CONFIG_VFIO_PCI_VGA
	disabled = disable_vga;
#endif

	return disabled;
}
2015-04-08 01:14:41 +08:00
/*
 * VGA arbiter callback: report which VGA resources this device decodes.
 *
 * We know nothing about the device itself, so our participation in VGA
 * arbitration is limited.  The one case we can improve on: when VFIO VGA
 * support is disabled and the device is the only VGA device downstream of
 * its bridge, the user cannot reach legacy VGA IO/memory and routing can be
 * disabled externally at the bridge, so legacy ranges are reported as not
 * decoded.
 *
 * @opaque:     the struct vfio_pci_device registered as the arbiter cookie
 * @single_vga: passed by the arbiter; when true everything is reported
 *
 * Returns a mask of VGA_RSRC_* bits.
 */
static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
{
	const unsigned int vga_class = PCI_CLASS_DISPLAY_VGA << 8;
	struct vfio_pci_device *vdev = opaque;
	struct pci_dev *pdev = vdev->pdev;
	struct pci_dev *other;
	unsigned int decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	unsigned char max_busnr;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);

	/* Look for another VGA device within our bus number range. */
	for (other = pci_get_class(vga_class, NULL); other != NULL;
	     other = pci_get_class(vga_class, other)) {
		bool candidate =
			other != pdev &&
			pci_domain_nr(other->bus) == pci_domain_nr(pdev->bus) &&
			!pci_is_root_bus(other->bus);

		if (!candidate)
			continue;

		if (other->bus->number < pdev->bus->number ||
		    other->bus->number > max_busnr)
			continue;

		/* Leaving the iteration early: drop the reference we hold. */
		pci_dev_put(other);
		decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
		break;
	}

	return decodes;
}
static inline bool vfio_pci_is_vga ( struct pci_dev * pdev )
{
return ( pdev - > class > > 8 ) = = PCI_CLASS_DISPLAY_VGA ;
}
2014-08-08 01:12:07 +08:00
/* Forward declaration: defined near the end of the file, called from vfio_pci_disable(). */
static void vfio_pci_try_bus_reset ( struct vfio_pci_device * vdev ) ;
2012-07-31 22:16:24 +08:00
static int vfio_pci_enable ( struct vfio_pci_device * vdev )
{
struct pci_dev * pdev = vdev - > pdev ;
int ret ;
u16 cmd ;
u8 msix_pos ;
2014-08-08 01:12:02 +08:00
/* Don't allow our initial saved state to include busmaster */
pci_clear_master ( pdev ) ;
2012-12-08 04:43:51 +08:00
ret = pci_enable_device ( pdev ) ;
if ( ret )
return ret ;
2012-07-31 22:16:24 +08:00
vdev - > reset_works = ( pci_reset_function ( pdev ) = = 0 ) ;
pci_save_state ( pdev ) ;
vdev - > pci_saved_state = pci_store_saved_state ( pdev ) ;
if ( ! vdev - > pci_saved_state )
pr_debug ( " %s: Couldn't store %s saved state \n " ,
__func__ , dev_name ( & pdev - > dev ) ) ;
ret = vfio_config_init ( vdev ) ;
2012-12-08 04:43:51 +08:00
if ( ret ) {
2014-05-31 01:35:53 +08:00
kfree ( vdev - > pci_saved_state ) ;
vdev - > pci_saved_state = NULL ;
2012-12-08 04:43:51 +08:00
pci_disable_device ( pdev ) ;
return ret ;
}
2012-07-31 22:16:24 +08:00
if ( likely ( ! nointxmask ) )
vdev - > pci_2_3 = pci_intx_mask_supported ( pdev ) ;
pci_read_config_word ( pdev , PCI_COMMAND , & cmd ) ;
if ( vdev - > pci_2_3 & & ( cmd & PCI_COMMAND_INTX_DISABLE ) ) {
cmd & = ~ PCI_COMMAND_INTX_DISABLE ;
pci_write_config_word ( pdev , PCI_COMMAND , cmd ) ;
}
2013-04-19 05:12:58 +08:00
msix_pos = pdev - > msix_cap ;
2012-07-31 22:16:24 +08:00
if ( msix_pos ) {
u16 flags ;
u32 table ;
pci_read_config_word ( pdev , msix_pos + PCI_MSIX_FLAGS , & flags ) ;
pci_read_config_dword ( pdev , msix_pos + PCI_MSIX_TABLE , & table ) ;
2013-04-19 02:42:58 +08:00
vdev - > msix_bar = table & PCI_MSIX_TABLE_BIR ;
vdev - > msix_offset = table & PCI_MSIX_TABLE_OFFSET ;
2012-07-31 22:16:24 +08:00
vdev - > msix_size = ( ( flags & PCI_MSIX_FLAGS_QSIZE ) + 1 ) * 16 ;
} else
vdev - > msix_bar = 0xFF ;
2015-04-08 01:14:41 +08:00
if ( ! vfio_vga_disabled ( ) & & vfio_pci_is_vga ( pdev ) )
2013-02-19 01:11:13 +08:00
vdev - > has_vga = true ;
2012-12-08 04:43:51 +08:00
return 0 ;
2012-07-31 22:16:24 +08:00
}
static void vfio_pci_disable ( struct vfio_pci_device * vdev )
{
2012-12-08 04:43:50 +08:00
struct pci_dev * pdev = vdev - > pdev ;
2012-07-31 22:16:24 +08:00
int bar ;
2014-08-08 01:12:02 +08:00
/* Stop the device from further DMA */
pci_clear_master ( pdev ) ;
2012-07-31 22:16:24 +08:00
vfio_pci_set_irqs_ioctl ( vdev , VFIO_IRQ_SET_DATA_NONE |
VFIO_IRQ_SET_ACTION_TRIGGER ,
vdev - > irq_type , 0 , 0 , NULL ) ;
vdev - > virq_disabled = false ;
vfio_config_free ( vdev ) ;
for ( bar = PCI_STD_RESOURCES ; bar < = PCI_STD_RESOURCE_END ; bar + + ) {
if ( ! vdev - > barmap [ bar ] )
continue ;
2012-12-08 04:43:50 +08:00
pci_iounmap ( pdev , vdev - > barmap [ bar ] ) ;
pci_release_selected_regions ( pdev , 1 < < bar ) ;
2012-07-31 22:16:24 +08:00
vdev - > barmap [ bar ] = NULL ;
}
2012-12-08 04:43:50 +08:00
2014-08-08 01:12:07 +08:00
vdev - > needs_reset = true ;
2012-12-08 04:43:50 +08:00
/*
* If we have saved state , restore it . If we can reset the device ,
* even better . Resetting with current state seems better than
* nothing , but saving and restoring current state without reset
* is just busy work .
*/
if ( pci_load_and_free_saved_state ( pdev , & vdev - > pci_saved_state ) ) {
pr_info ( " %s: Couldn't reload %s saved state \n " ,
__func__ , dev_name ( & pdev - > dev ) ) ;
if ( ! vdev - > reset_works )
2014-08-08 01:12:02 +08:00
goto out ;
2012-12-08 04:43:50 +08:00
pci_save_state ( pdev ) ;
}
/*
* Disable INTx and MSI , presumably to avoid spurious interrupts
* during reset . Stolen from pci_reset_function ( )
*/
pci_write_config_word ( pdev , PCI_COMMAND , PCI_COMMAND_INTX_DISABLE ) ;
2013-06-11 06:40:57 +08:00
/*
2014-01-15 11:45:09 +08:00
* Try to reset the device . The success of this is dependent on
* being able to lock the device , which is not always possible .
2013-06-11 06:40:57 +08:00
*/
if ( vdev - > reset_works ) {
2014-01-15 11:45:09 +08:00
int ret = pci_try_reset_function ( pdev ) ;
if ( ret )
pr_warn ( " %s: Failed to reset device %s (%d) \n " ,
__func__ , dev_name ( & pdev - > dev ) , ret ) ;
2014-08-08 01:12:07 +08:00
else
vdev - > needs_reset = false ;
2013-06-11 06:40:57 +08:00
}
2012-12-08 04:43:50 +08:00
pci_restore_state ( pdev ) ;
2014-08-08 01:12:02 +08:00
out :
pci_disable_device ( pdev ) ;
2014-08-08 01:12:07 +08:00
vfio_pci_try_bus_reset ( vdev ) ;
2012-07-31 22:16:24 +08:00
}
static void vfio_pci_release ( void * device_data )
{
struct vfio_pci_device * vdev = device_data ;
2014-08-08 01:12:04 +08:00
mutex_lock ( & driver_lock ) ;
if ( ! ( - - vdev - > refcnt ) ) {
2014-06-10 09:41:57 +08:00
vfio_spapr_pci_eeh_release ( vdev - > pdev ) ;
2012-07-31 22:16:24 +08:00
vfio_pci_disable ( vdev ) ;
2014-06-10 09:41:57 +08:00
}
2012-07-31 22:16:24 +08:00
2014-08-08 01:12:04 +08:00
mutex_unlock ( & driver_lock ) ;
2012-07-31 22:16:24 +08:00
module_put ( THIS_MODULE ) ;
}
static int vfio_pci_open ( void * device_data )
{
struct vfio_pci_device * vdev = device_data ;
2014-08-08 01:12:04 +08:00
int ret = 0 ;
2012-07-31 22:16:24 +08:00
if ( ! try_module_get ( THIS_MODULE ) )
return - ENODEV ;
2014-08-08 01:12:04 +08:00
mutex_lock ( & driver_lock ) ;
if ( ! vdev - > refcnt ) {
2014-06-10 09:41:57 +08:00
ret = vfio_pci_enable ( vdev ) ;
if ( ret )
goto error ;
2014-08-09 00:39:16 +08:00
vfio_spapr_pci_eeh_open ( vdev - > pdev ) ;
2012-07-31 22:16:24 +08:00
}
2014-08-08 01:12:04 +08:00
vdev - > refcnt + + ;
2014-06-10 09:41:57 +08:00
error :
2014-08-08 01:12:04 +08:00
mutex_unlock ( & driver_lock ) ;
if ( ret )
module_put ( THIS_MODULE ) ;
2014-06-10 09:41:57 +08:00
return ret ;
2012-07-31 22:16:24 +08:00
}
static int vfio_pci_get_irq_count ( struct vfio_pci_device * vdev , int irq_type )
{
if ( irq_type = = VFIO_PCI_INTX_IRQ_INDEX ) {
u8 pin ;
pci_read_config_byte ( vdev - > pdev , PCI_INTERRUPT_PIN , & pin ) ;
2014-11-08 00:52:22 +08:00
if ( IS_ENABLED ( CONFIG_VFIO_PCI_INTX ) & & pin )
2012-07-31 22:16:24 +08:00
return 1 ;
} else if ( irq_type = = VFIO_PCI_MSI_IRQ_INDEX ) {
u8 pos ;
u16 flags ;
2013-04-19 05:12:58 +08:00
pos = vdev - > pdev - > msi_cap ;
2012-07-31 22:16:24 +08:00
if ( pos ) {
pci_read_config_word ( vdev - > pdev ,
pos + PCI_MSI_FLAGS , & flags ) ;
2014-05-31 01:35:54 +08:00
return 1 < < ( ( flags & PCI_MSI_FLAGS_QMASK ) > > 1 ) ;
2012-07-31 22:16:24 +08:00
}
} else if ( irq_type = = VFIO_PCI_MSIX_IRQ_INDEX ) {
u8 pos ;
u16 flags ;
2013-04-19 05:12:58 +08:00
pos = vdev - > pdev - > msix_cap ;
2012-07-31 22:16:24 +08:00
if ( pos ) {
pci_read_config_word ( vdev - > pdev ,
pos + PCI_MSIX_FLAGS , & flags ) ;
return ( flags & PCI_MSIX_FLAGS_QSIZE ) + 1 ;
}
2015-02-07 06:05:08 +08:00
} else if ( irq_type = = VFIO_PCI_ERR_IRQ_INDEX ) {
2013-03-11 23:31:22 +08:00
if ( pci_is_pcie ( vdev - > pdev ) )
return 1 ;
2015-02-07 06:05:08 +08:00
} else if ( irq_type = = VFIO_PCI_REQ_IRQ_INDEX ) {
return 1 ;
}
2012-07-31 22:16:24 +08:00
return 0 ;
}
2013-09-05 01:28:04 +08:00
/* Bus-walk callback: bump the int counter that @data points at; never fails. */
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	int *counter = data;

	++*counter;

	return 0;
}
/* Cursor handed to vfio_pci_fill_devs() while collecting dependent devices. */
struct vfio_pci_fill_info {
/* capacity of @devices; filling stops with -EAGAIN once cur reaches it */
int max ;
/* index of the next entry to write, i.e. number of entries filled so far */
int cur ;
/* output array written by vfio_pci_fill_devs() */
struct vfio_pci_dependent_device * devices ;
} ;
static int vfio_pci_fill_devs ( struct pci_dev * pdev , void * data )
{
struct vfio_pci_fill_info * fill = data ;
struct iommu_group * iommu_group ;
if ( fill - > cur = = fill - > max )
return - EAGAIN ; /* Something changed, try again */
iommu_group = iommu_group_get ( & pdev - > dev ) ;
if ( ! iommu_group )
return - EPERM ; /* Cannot reset non-isolated devices */
fill - > devices [ fill - > cur ] . group_id = iommu_group_id ( iommu_group ) ;
fill - > devices [ fill - > cur ] . segment = pci_domain_nr ( pdev - > bus ) ;
fill - > devices [ fill - > cur ] . bus = pdev - > bus - > number ;
fill - > devices [ fill - > cur ] . devfn = pdev - > devfn ;
fill - > cur + + ;
iommu_group_put ( iommu_group ) ;
return 0 ;
}
/* One user-supplied group, as resolved by the VFIO_DEVICE_PCI_HOT_RESET ioctl. */
struct vfio_pci_group_entry {
/* reference obtained through vfio_group_get_external_user() */
struct vfio_group * group ;
/* value of vfio_external_user_iommu_id() for @group */
int id ;
} ;
/* Set of groups that vfio_pci_validate_devs() checks each device against. */
struct vfio_pci_group_info {
/* number of valid entries in @groups */
int count ;
/* array of resolved groups */
struct vfio_pci_group_entry * groups ;
} ;
static int vfio_pci_validate_devs ( struct pci_dev * pdev , void * data )
{
struct vfio_pci_group_info * info = data ;
struct iommu_group * group ;
int id , i ;
group = iommu_group_get ( & pdev - > dev ) ;
if ( ! group )
return - EPERM ;
id = iommu_group_id ( group ) ;
for ( i = 0 ; i < info - > count ; i + + )
if ( info - > groups [ i ] . id = = id )
break ;
iommu_group_put ( group ) ;
return ( i = = info - > count ) ? - EINVAL : 0 ;
}
static bool vfio_pci_dev_below_slot ( struct pci_dev * pdev , struct pci_slot * slot )
{
for ( ; pdev ; pdev = pdev - > bus - > self )
if ( pdev - > bus = = slot - > bus )
return ( pdev - > slot = = slot ) ;
return false ;
}
/* State carried through pci_walk_bus() by vfio_pci_for_each_slot_or_bus(). */
struct vfio_pci_walk_info {
/* callback invoked for each device in scope */
int ( * fn ) ( struct pci_dev * , void * data ) ;
/* opaque argument forwarded to @fn */
void * data ;
/* device whose slot defines the scope when @slot is true */
struct pci_dev * pdev ;
/* true: only visit devices below @pdev's slot; false: visit every device walked */
bool slot ;
/* return value of the most recent @fn call; returned to pci_walk_bus() */
int ret ;
} ;
static int vfio_pci_walk_wrapper ( struct pci_dev * pdev , void * data )
{
struct vfio_pci_walk_info * walk = data ;
if ( ! walk - > slot | | vfio_pci_dev_below_slot ( pdev , walk - > pdev - > slot ) )
walk - > ret = walk - > fn ( pdev , walk - > data ) ;
return walk - > ret ;
}
/*
 * Call @fn(dev, @data) for the devices found by walking @pdev's bus; when
 * @slot is true, only for those below @pdev's slot.
 *
 * Returns the last value stored from @fn (0 if it was never called).
 */
static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk;

	walk.fn = fn;
	walk.data = data;
	walk.pdev = pdev;
	walk.slot = slot;
	walk.ret = 0;

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}
2012-07-31 22:16:24 +08:00
static long vfio_pci_ioctl ( void * device_data ,
unsigned int cmd , unsigned long arg )
{
struct vfio_pci_device * vdev = device_data ;
unsigned long minsz ;
if ( cmd = = VFIO_DEVICE_GET_INFO ) {
struct vfio_device_info info ;
minsz = offsetofend ( struct vfio_device_info , num_irqs ) ;
if ( copy_from_user ( & info , ( void __user * ) arg , minsz ) )
return - EFAULT ;
if ( info . argsz < minsz )
return - EINVAL ;
info . flags = VFIO_DEVICE_FLAGS_PCI ;
if ( vdev - > reset_works )
info . flags | = VFIO_DEVICE_FLAGS_RESET ;
info . num_regions = VFIO_PCI_NUM_REGIONS ;
info . num_irqs = VFIO_PCI_NUM_IRQS ;
return copy_to_user ( ( void __user * ) arg , & info , minsz ) ;
} else if ( cmd = = VFIO_DEVICE_GET_REGION_INFO ) {
struct pci_dev * pdev = vdev - > pdev ;
struct vfio_region_info info ;
minsz = offsetofend ( struct vfio_region_info , offset ) ;
if ( copy_from_user ( & info , ( void __user * ) arg , minsz ) )
return - EFAULT ;
if ( info . argsz < minsz )
return - EINVAL ;
switch ( info . index ) {
case VFIO_PCI_CONFIG_REGION_INDEX :
info . offset = VFIO_PCI_INDEX_TO_OFFSET ( info . index ) ;
info . size = pdev - > cfg_size ;
info . flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE ;
break ;
case VFIO_PCI_BAR0_REGION_INDEX . . . VFIO_PCI_BAR5_REGION_INDEX :
info . offset = VFIO_PCI_INDEX_TO_OFFSET ( info . index ) ;
info . size = pci_resource_len ( pdev , info . index ) ;
if ( ! info . size ) {
info . flags = 0 ;
break ;
}
info . flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE ;
2014-11-08 00:52:22 +08:00
if ( IS_ENABLED ( CONFIG_VFIO_PCI_MMAP ) & &
pci_resource_flags ( pdev , info . index ) &
2012-07-31 22:16:24 +08:00
IORESOURCE_MEM & & info . size > = PAGE_SIZE )
info . flags | = VFIO_REGION_INFO_FLAG_MMAP ;
break ;
case VFIO_PCI_ROM_REGION_INDEX :
{
void __iomem * io ;
size_t size ;
info . offset = VFIO_PCI_INDEX_TO_OFFSET ( info . index ) ;
info . flags = 0 ;
/* Report the BAR size, not the ROM size */
info . size = pci_resource_len ( pdev , info . index ) ;
if ( ! info . size )
break ;
/* Is it really there? */
io = pci_map_rom ( pdev , & size ) ;
if ( ! io | | ! size ) {
info . size = 0 ;
break ;
}
pci_unmap_rom ( pdev , io ) ;
info . flags = VFIO_REGION_INFO_FLAG_READ ;
break ;
}
2013-02-19 01:11:13 +08:00
case VFIO_PCI_VGA_REGION_INDEX :
if ( ! vdev - > has_vga )
return - EINVAL ;
info . offset = VFIO_PCI_INDEX_TO_OFFSET ( info . index ) ;
info . size = 0xc0000 ;
info . flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE ;
break ;
2012-07-31 22:16:24 +08:00
default :
return - EINVAL ;
}
return copy_to_user ( ( void __user * ) arg , & info , minsz ) ;
} else if ( cmd = = VFIO_DEVICE_GET_IRQ_INFO ) {
struct vfio_irq_info info ;
minsz = offsetofend ( struct vfio_irq_info , count ) ;
if ( copy_from_user ( & info , ( void __user * ) arg , minsz ) )
return - EFAULT ;
if ( info . argsz < minsz | | info . index > = VFIO_PCI_NUM_IRQS )
return - EINVAL ;
2013-03-11 23:31:22 +08:00
switch ( info . index ) {
case VFIO_PCI_INTX_IRQ_INDEX . . . VFIO_PCI_MSIX_IRQ_INDEX :
2015-02-07 06:05:08 +08:00
case VFIO_PCI_REQ_IRQ_INDEX :
2013-03-11 23:31:22 +08:00
break ;
case VFIO_PCI_ERR_IRQ_INDEX :
if ( pci_is_pcie ( vdev - > pdev ) )
break ;
/* pass thru to return error */
default :
return - EINVAL ;
}
2012-07-31 22:16:24 +08:00
info . flags = VFIO_IRQ_INFO_EVENTFD ;
info . count = vfio_pci_get_irq_count ( vdev , info . index ) ;
if ( info . index = = VFIO_PCI_INTX_IRQ_INDEX )
info . flags | = ( VFIO_IRQ_INFO_MASKABLE |
VFIO_IRQ_INFO_AUTOMASKED ) ;
else
info . flags | = VFIO_IRQ_INFO_NORESIZE ;
return copy_to_user ( ( void __user * ) arg , & info , minsz ) ;
} else if ( cmd = = VFIO_DEVICE_SET_IRQS ) {
struct vfio_irq_set hdr ;
u8 * data = NULL ;
int ret = 0 ;
minsz = offsetofend ( struct vfio_irq_set , count ) ;
if ( copy_from_user ( & hdr , ( void __user * ) arg , minsz ) )
return - EFAULT ;
if ( hdr . argsz < minsz | | hdr . index > = VFIO_PCI_NUM_IRQS | |
hdr . flags & ~ ( VFIO_IRQ_SET_DATA_TYPE_MASK |
VFIO_IRQ_SET_ACTION_TYPE_MASK ) )
return - EINVAL ;
if ( ! ( hdr . flags & VFIO_IRQ_SET_DATA_NONE ) ) {
size_t size ;
2013-03-27 01:33:16 +08:00
int max = vfio_pci_get_irq_count ( vdev , hdr . index ) ;
2012-07-31 22:16:24 +08:00
if ( hdr . flags & VFIO_IRQ_SET_DATA_BOOL )
size = sizeof ( uint8_t ) ;
else if ( hdr . flags & VFIO_IRQ_SET_DATA_EVENTFD )
size = sizeof ( int32_t ) ;
else
return - EINVAL ;
if ( hdr . argsz - minsz < hdr . count * size | |
2013-03-27 01:33:16 +08:00
hdr . start > = max | | hdr . start + hdr . count > max )
2012-07-31 22:16:24 +08:00
return - EINVAL ;
2012-12-08 04:43:49 +08:00
data = memdup_user ( ( void __user * ) ( arg + minsz ) ,
hdr . count * size ) ;
if ( IS_ERR ( data ) )
return PTR_ERR ( data ) ;
2012-07-31 22:16:24 +08:00
}
mutex_lock ( & vdev - > igate ) ;
ret = vfio_pci_set_irqs_ioctl ( vdev , hdr . flags , hdr . index ,
hdr . start , hdr . count , data ) ;
mutex_unlock ( & vdev - > igate ) ;
kfree ( data ) ;
return ret ;
2013-09-05 01:28:04 +08:00
} else if ( cmd = = VFIO_DEVICE_RESET ) {
2012-07-31 22:16:24 +08:00
return vdev - > reset_works ?
2014-01-15 11:45:09 +08:00
pci_try_reset_function ( vdev - > pdev ) : - EINVAL ;
2012-07-31 22:16:24 +08:00
2013-09-05 01:28:04 +08:00
} else if ( cmd = = VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ) {
struct vfio_pci_hot_reset_info hdr ;
struct vfio_pci_fill_info fill = { 0 } ;
struct vfio_pci_dependent_device * devices = NULL ;
bool slot = false ;
int ret = 0 ;
minsz = offsetofend ( struct vfio_pci_hot_reset_info , count ) ;
if ( copy_from_user ( & hdr , ( void __user * ) arg , minsz ) )
return - EFAULT ;
if ( hdr . argsz < minsz )
return - EINVAL ;
hdr . flags = 0 ;
/* Can we do a slot or bus reset or neither? */
if ( ! pci_probe_reset_slot ( vdev - > pdev - > slot ) )
slot = true ;
else if ( pci_probe_reset_bus ( vdev - > pdev - > bus ) )
return - ENODEV ;
/* How many devices are affected? */
ret = vfio_pci_for_each_slot_or_bus ( vdev - > pdev ,
vfio_pci_count_devs ,
& fill . max , slot ) ;
if ( ret )
return ret ;
WARN_ON ( ! fill . max ) ; /* Should always be at least one */
/*
* If there ' s enough space , fill it now , otherwise return
* - ENOSPC and the number of devices affected .
*/
if ( hdr . argsz < sizeof ( hdr ) + ( fill . max * sizeof ( * devices ) ) ) {
ret = - ENOSPC ;
hdr . count = fill . max ;
goto reset_info_exit ;
}
devices = kcalloc ( fill . max , sizeof ( * devices ) , GFP_KERNEL ) ;
if ( ! devices )
return - ENOMEM ;
fill . devices = devices ;
ret = vfio_pci_for_each_slot_or_bus ( vdev - > pdev ,
vfio_pci_fill_devs ,
& fill , slot ) ;
/*
* If a device was removed between counting and filling ,
* we may come up short of fill . max . If a device was
* added , we ' ll have a return of - EAGAIN above .
*/
if ( ! ret )
hdr . count = fill . cur ;
reset_info_exit :
if ( copy_to_user ( ( void __user * ) arg , & hdr , minsz ) )
ret = - EFAULT ;
if ( ! ret ) {
if ( copy_to_user ( ( void __user * ) ( arg + minsz ) , devices ,
hdr . count * sizeof ( * devices ) ) )
ret = - EFAULT ;
}
kfree ( devices ) ;
return ret ;
} else if ( cmd = = VFIO_DEVICE_PCI_HOT_RESET ) {
struct vfio_pci_hot_reset hdr ;
int32_t * group_fds ;
struct vfio_pci_group_entry * groups ;
struct vfio_pci_group_info info ;
bool slot = false ;
int i , count = 0 , ret = 0 ;
minsz = offsetofend ( struct vfio_pci_hot_reset , count ) ;
if ( copy_from_user ( & hdr , ( void __user * ) arg , minsz ) )
return - EFAULT ;
if ( hdr . argsz < minsz | | hdr . flags )
return - EINVAL ;
/* Can we do a slot or bus reset or neither? */
if ( ! pci_probe_reset_slot ( vdev - > pdev - > slot ) )
slot = true ;
else if ( pci_probe_reset_bus ( vdev - > pdev - > bus ) )
return - ENODEV ;
/*
* We can ' t let userspace give us an arbitrarily large
* buffer to copy , so verify how many we think there
* could be . Note groups can have multiple devices so
* one group per device is the max .
*/
ret = vfio_pci_for_each_slot_or_bus ( vdev - > pdev ,
vfio_pci_count_devs ,
& count , slot ) ;
if ( ret )
return ret ;
/* Somewhere between 1 and count is OK */
if ( ! hdr . count | | hdr . count > count )
return - EINVAL ;
group_fds = kcalloc ( hdr . count , sizeof ( * group_fds ) , GFP_KERNEL ) ;
groups = kcalloc ( hdr . count , sizeof ( * groups ) , GFP_KERNEL ) ;
if ( ! group_fds | | ! groups ) {
kfree ( group_fds ) ;
kfree ( groups ) ;
return - ENOMEM ;
}
if ( copy_from_user ( group_fds , ( void __user * ) ( arg + minsz ) ,
hdr . count * sizeof ( * group_fds ) ) ) {
kfree ( group_fds ) ;
kfree ( groups ) ;
return - EFAULT ;
}
/*
* For each group_fd , get the group through the vfio external
* user interface and store the group and iommu ID . This
* ensures the group is held across the reset .
*/
for ( i = 0 ; i < hdr . count ; i + + ) {
struct vfio_group * group ;
struct fd f = fdget ( group_fds [ i ] ) ;
if ( ! f . file ) {
ret = - EBADF ;
break ;
}
group = vfio_group_get_external_user ( f . file ) ;
fdput ( f ) ;
if ( IS_ERR ( group ) ) {
ret = PTR_ERR ( group ) ;
break ;
}
groups [ i ] . group = group ;
groups [ i ] . id = vfio_external_user_iommu_id ( group ) ;
}
kfree ( group_fds ) ;
/* release reference to groups on error */
if ( ret )
goto hot_reset_release ;
info . count = hdr . count ;
info . groups = groups ;
/*
* Test whether all the affected devices are contained
* by the set of groups provided by the user .
*/
ret = vfio_pci_for_each_slot_or_bus ( vdev - > pdev ,
vfio_pci_validate_devs ,
& info , slot ) ;
if ( ! ret )
/* User has access, do the reset */
2014-01-15 11:45:09 +08:00
ret = slot ? pci_try_reset_slot ( vdev - > pdev - > slot ) :
pci_try_reset_bus ( vdev - > pdev - > bus ) ;
2013-09-05 01:28:04 +08:00
hot_reset_release :
for ( i - - ; i > = 0 ; i - - )
vfio_group_put_external_user ( groups [ i ] . group ) ;
kfree ( groups ) ;
return ret ;
}
2012-07-31 22:16:24 +08:00
return - ENOTTY ;
}
2013-02-15 05:02:12 +08:00
/*
 * Common read/write path: decode the region index from *@ppos and forward
 * to the matching accessor.  The ROM region rejects writes; an index with
 * no accessor yields -EINVAL.
 */
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned int region = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	ssize_t done = -EINVAL;

	if (region >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (region) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		done = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
		break;

	case VFIO_PCI_ROM_REGION_INDEX:
		if (!iswrite)
			done = vfio_pci_bar_rw(vdev, buf, count, ppos, false);
		break;

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		done = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
		break;

	case VFIO_PCI_VGA_REGION_INDEX:
		done = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
		break;
	}

	return done;
}
2013-02-15 05:02:12 +08:00
/* vfio_device_ops.read: zero-length reads return 0, the rest go to vfio_pci_rw(). */
static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	return count ? vfio_pci_rw(device_data, buf, count, ppos, false) : 0;
}
2012-07-31 22:16:24 +08:00
/*
 * vfio_device_ops.write: zero-length writes return 0, the rest go to
 * vfio_pci_rw().  The const qualifier is cast away only because the shared
 * helper takes a non-const user pointer for both directions.
 */
static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	char __user *ubuf = (char __user *)buf;

	return count ? vfio_pci_rw(device_data, ubuf, count, ppos, true) : 0;
}
static int vfio_pci_mmap ( void * device_data , struct vm_area_struct * vma )
{
struct vfio_pci_device * vdev = device_data ;
struct pci_dev * pdev = vdev - > pdev ;
unsigned int index ;
2012-10-10 23:10:31 +08:00
u64 phys_len , req_len , pgoff , req_start ;
2012-07-31 22:16:24 +08:00
int ret ;
index = vma - > vm_pgoff > > ( VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT ) ;
if ( vma - > vm_end < vma - > vm_start )
return - EINVAL ;
if ( ( vma - > vm_flags & VM_SHARED ) = = 0 )
return - EINVAL ;
if ( index > = VFIO_PCI_ROM_REGION_INDEX )
return - EINVAL ;
if ( ! ( pci_resource_flags ( pdev , index ) & IORESOURCE_MEM ) )
return - EINVAL ;
phys_len = pci_resource_len ( pdev , index ) ;
req_len = vma - > vm_end - vma - > vm_start ;
pgoff = vma - > vm_pgoff &
( ( 1U < < ( VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT ) ) - 1 ) ;
req_start = pgoff < < PAGE_SHIFT ;
if ( phys_len < PAGE_SIZE | | req_start + req_len > phys_len )
return - EINVAL ;
if ( index = = vdev - > msix_bar ) {
/*
* Disallow mmaps overlapping the MSI - X table ; users don ' t
* get to touch this directly . We could find somewhere
* else to map the overlap , but page granularity is only
* a recommendation , not a requirement , so the user needs
* to know which bits are real . Requiring them to mmap
* around the table makes that clear .
*/
/* If neither entirely above nor below, then it overlaps */
if ( ! ( req_start > = vdev - > msix_offset + vdev - > msix_size | |
req_start + req_len < = vdev - > msix_offset ) )
return - EINVAL ;
}
/*
* Even though we don ' t make use of the barmap for the mmap ,
* we need to request the region and the barmap tracks that .
*/
if ( ! vdev - > barmap [ index ] ) {
ret = pci_request_selected_regions ( pdev ,
1 < < index , " vfio-pci " ) ;
if ( ret )
return ret ;
vdev - > barmap [ index ] = pci_iomap ( pdev , index , 0 ) ;
}
vma - > vm_private_data = vdev ;
vma - > vm_page_prot = pgprot_noncached ( vma - > vm_page_prot ) ;
2012-10-10 23:10:31 +08:00
vma - > vm_pgoff = ( pci_resource_start ( pdev , index ) > > PAGE_SHIFT ) + pgoff ;
2012-07-31 22:16:24 +08:00
2012-10-10 23:10:31 +08:00
return remap_pfn_range ( vma , vma - > vm_start , vma - > vm_pgoff ,
2012-07-31 22:16:24 +08:00
req_len , vma - > vm_page_prot ) ;
}
2015-02-07 06:05:08 +08:00
static void vfio_pci_request ( void * device_data , unsigned int count )
{
struct vfio_pci_device * vdev = device_data ;
mutex_lock ( & vdev - > igate ) ;
if ( vdev - > req_trigger ) {
dev_dbg ( & vdev - > pdev - > dev , " Requesting device from user \n " ) ;
eventfd_signal ( vdev - > req_trigger , 1 ) ;
}
mutex_unlock ( & vdev - > igate ) ;
}
2012-07-31 22:16:24 +08:00
static const struct vfio_device_ops vfio_pci_ops = {
. name = " vfio-pci " ,
. open = vfio_pci_open ,
. release = vfio_pci_release ,
. ioctl = vfio_pci_ioctl ,
. read = vfio_pci_read ,
. write = vfio_pci_write ,
. mmap = vfio_pci_mmap ,
2015-02-07 06:05:08 +08:00
. request = vfio_pci_request ,
2012-07-31 22:16:24 +08:00
} ;
static int vfio_pci_probe ( struct pci_dev * pdev , const struct pci_device_id * id )
{
struct vfio_pci_device * vdev ;
struct iommu_group * group ;
int ret ;
2015-01-08 01:29:11 +08:00
if ( pdev - > hdr_type ! = PCI_HEADER_TYPE_NORMAL )
2012-07-31 22:16:24 +08:00
return - EINVAL ;
group = iommu_group_get ( & pdev - > dev ) ;
if ( ! group )
return - EINVAL ;
vdev = kzalloc ( sizeof ( * vdev ) , GFP_KERNEL ) ;
if ( ! vdev ) {
iommu_group_put ( group ) ;
return - ENOMEM ;
}
vdev - > pdev = pdev ;
vdev - > irq_type = VFIO_PCI_NUM_IRQS ;
mutex_init ( & vdev - > igate ) ;
spin_lock_init ( & vdev - > irqlock ) ;
ret = vfio_add_group_dev ( & pdev - > dev , & vfio_pci_ops , vdev ) ;
if ( ret ) {
iommu_group_put ( group ) ;
kfree ( vdev ) ;
}
2015-04-08 01:14:41 +08:00
if ( vfio_pci_is_vga ( pdev ) ) {
vga_client_register ( pdev , vdev , NULL , vfio_pci_set_vga_decode ) ;
vga_set_legacy_decoding ( pdev ,
vfio_pci_set_vga_decode ( vdev , false ) ) ;
}
2012-07-31 22:16:24 +08:00
return ret ;
}
static void vfio_pci_remove ( struct pci_dev * pdev )
{
struct vfio_pci_device * vdev ;
vdev = vfio_del_group_dev ( & pdev - > dev ) ;
2015-04-08 01:14:41 +08:00
if ( ! vdev )
return ;
iommu_group_put ( pdev - > dev . iommu_group ) ;
kfree ( vdev ) ;
if ( vfio_pci_is_vga ( pdev ) ) {
vga_client_register ( pdev , NULL , NULL , NULL ) ;
vga_set_legacy_decoding ( pdev ,
VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM ) ;
2014-08-08 01:12:04 +08:00
}
2012-07-31 22:16:24 +08:00
}
2013-03-11 23:31:22 +08:00
/*
 * pci_error_handlers.error_detected: forward the event to the user by
 * signalling the error eventfd, if one is set.
 *
 * Returns PCI_ERS_RESULT_DISCONNECT when the vfio device or its private
 * data cannot be found, PCI_ERS_RESULT_CAN_RECOVER otherwise.
 */
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	pci_ers_result_t result = PCI_ERS_RESULT_DISCONNECT;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return result;

	vdev = vfio_device_data(device);
	if (vdev) {
		mutex_lock(&vdev->igate);

		if (vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);

		mutex_unlock(&vdev->igate);

		result = PCI_ERS_RESULT_CAN_RECOVER;
	}

	vfio_device_put(device);

	return result;
}
/* PCI error callbacks; only error_detected is provided. Referenced by vfio_pci_driver. */
static struct pci_error_handlers vfio_err_handlers = {
. error_detected = vfio_pci_aer_err_detected ,
} ;
2012-07-31 22:16:24 +08:00
static struct pci_driver vfio_pci_driver = {
. name = " vfio-pci " ,
. id_table = NULL , /* only dynamic ids */
. probe = vfio_pci_probe ,
. remove = vfio_pci_remove ,
2013-03-11 23:31:22 +08:00
. err_handler = & vfio_err_handlers ,
2012-07-31 22:16:24 +08:00
} ;
2014-09-30 07:18:39 +08:00
/* Collection of vfio_device references gathered by vfio_pci_get_devs(). */
struct vfio_devices {
/* array of references obtained with vfio_device_get_from_dev() */
struct vfio_device * * devices ;
/* number of entries stored so far */
int cur_index ;
/* capacity of @devices */
int max_index ;
} ;
2014-08-08 01:12:07 +08:00
2014-09-30 07:18:39 +08:00
static int vfio_pci_get_devs ( struct pci_dev * pdev , void * data )
2014-08-08 01:12:07 +08:00
{
2014-09-30 07:18:39 +08:00
struct vfio_devices * devs = data ;
2014-08-08 01:12:07 +08:00
struct pci_driver * pci_drv = ACCESS_ONCE ( pdev - > driver ) ;
2014-09-30 07:18:39 +08:00
if ( pci_drv ! = & vfio_pci_driver )
return - EBUSY ;
2014-08-08 01:12:07 +08:00
2014-09-30 07:18:39 +08:00
if ( devs - > cur_index = = devs - > max_index )
return - ENOSPC ;
2014-08-08 01:12:07 +08:00
2014-09-30 07:18:39 +08:00
devs - > devices [ devs - > cur_index ] = vfio_device_get_from_dev ( & pdev - > dev ) ;
if ( ! devs - > devices [ devs - > cur_index ] )
return - EINVAL ;
2014-08-08 01:12:07 +08:00
2014-09-30 07:18:39 +08:00
devs - > cur_index + + ;
2014-08-08 01:12:07 +08:00
return 0 ;
}
/*
* Attempt to do a bus / slot reset if there are devices affected by a reset for
* this device that are needs_reset and all of the affected devices are unused
2014-09-30 07:18:39 +08:00
* ( ! refcnt ) . Callers are required to hold driver_lock when calling this to
* prevent device opens and concurrent bus reset attempts . We prevent device
* unbinds by acquiring and holding a reference to the vfio_device .
*
* NB : vfio - core considers a group to be viable even if some devices are
* bound to drivers like pci - stub or pcieport . Here we require all devices
* to be bound to vfio_pci since that ' s the only way we can be sure they
* stay put .
2014-08-08 01:12:07 +08:00
*/
static void vfio_pci_try_bus_reset ( struct vfio_pci_device * vdev )
{
2014-09-30 07:18:39 +08:00
struct vfio_devices devs = { . cur_index = 0 } ;
int i = 0 , ret = - EINVAL ;
2014-08-08 01:12:07 +08:00
bool needs_reset = false , slot = false ;
2014-09-30 07:18:39 +08:00
struct vfio_pci_device * tmp ;
2014-08-08 01:12:07 +08:00
if ( ! pci_probe_reset_slot ( vdev - > pdev - > slot ) )
slot = true ;
else if ( pci_probe_reset_bus ( vdev - > pdev - > bus ) )
return ;
2014-09-30 07:18:39 +08:00
if ( vfio_pci_for_each_slot_or_bus ( vdev - > pdev , vfio_pci_count_devs ,
& i , slot ) | | ! i )
2014-08-08 01:12:07 +08:00
return ;
2014-09-30 07:18:39 +08:00
devs . max_index = i ;
devs . devices = kcalloc ( i , sizeof ( struct vfio_device * ) , GFP_KERNEL ) ;
if ( ! devs . devices )
2014-08-08 01:12:07 +08:00
return ;
2014-09-30 07:18:39 +08:00
if ( vfio_pci_for_each_slot_or_bus ( vdev - > pdev ,
vfio_pci_get_devs , & devs , slot ) )
goto put_devs ;
for ( i = 0 ; i < devs . cur_index ; i + + ) {
tmp = vfio_device_data ( devs . devices [ i ] ) ;
if ( tmp - > needs_reset )
needs_reset = true ;
if ( tmp - > refcnt )
goto put_devs ;
}
if ( needs_reset )
ret = slot ? pci_try_reset_slot ( vdev - > pdev - > slot ) :
pci_try_reset_bus ( vdev - > pdev - > bus ) ;
put_devs :
for ( i = 0 ; i < devs . cur_index ; i + + ) {
if ( ! ret ) {
tmp = vfio_device_data ( devs . devices [ i ] ) ;
tmp - > needs_reset = false ;
}
vfio_device_put ( devs . devices [ i ] ) ;
}
kfree ( devs . devices ) ;
2014-08-08 01:12:07 +08:00
}
2012-07-31 22:16:24 +08:00
/* Module exit: unregister the PCI driver, then free the shared config permission data. */
static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);

	vfio_pci_uninit_perm_bits();
}
/*
 * Module init: set up the shared config space permission data and register
 * the PCI driver.  If registration fails the permission data is torn down
 * again and the error is returned.
 */
static int __init vfio_pci_init(void)
{
	int err;

	/* Allocate shared config space permission data used by all devices */
	err = vfio_pci_init_perm_bits();
	if (err)
		return err;

	/* Register and scan for devices */
	err = pci_register_driver(&vfio_pci_driver);
	if (err)
		vfio_pci_uninit_perm_bits();

	return err;
}
/* Module entry/exit points and metadata (strings come from the DRIVER_* macros). */
module_init ( vfio_pci_init ) ;
module_exit ( vfio_pci_cleanup ) ;
MODULE_VERSION ( DRIVER_VERSION ) ;
MODULE_LICENSE ( " GPL v2 " ) ;
MODULE_AUTHOR ( DRIVER_AUTHOR ) ;
MODULE_DESCRIPTION ( DRIVER_DESC ) ;