xen: branch for v6.1-rc1

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQRTLbB6QfY48x44uB6AXGG7T9hjvgUCY0ZjFAAKCRCAXGG7T9hj
 vjEsAP4rFMnqc6AXy4Mpvv8cxBtEuQZbwEqgBrMJUvK1jZQrBQD/dOJK2GBCVcfD
 2yaVlefFiJGTw5WUlbPeohUlTZ8pJwg=
 =xsHV
 -----END PGP SIGNATURE-----

Merge tag 'for-linus-6.1-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from Juergen Gross:

 - Some minor typo fixes

 - A fix of the Xen pcifront driver to support running the device model
   in a Linux stub domain

 - A cleanup of the pcifront driver

 - A series to enable grant-based virtio with Xen on x86

 - A cleanup of Xen PV guests to distinguish between safe and faulting
   MSR accesses

 - Two fixes of the Xen gntdev driver

 - Two fixes of the new xen grant DMA driver

* tag 'for-linus-6.1-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
  xen: Kconfig: Fix spelling mistake "Maxmium" -> "Maximum"
  xen/pv: support selecting safe/unsafe msr accesses
  xen/pv: refactor msr access functions to support safe and unsafe accesses
  xen/pv: fix vendor checks for pmu emulation
  xen/pv: add fault recovery control to pmu msr accesses
  xen/virtio: enable grant based virtio on x86
  xen/virtio: use dom0 as default backend for CONFIG_XEN_VIRTIO_FORCE_GRANT
  xen/virtio: restructure xen grant dma setup
  xen/pcifront: move xenstore config scanning into sub-function
  xen/gntdev: Accommodate VMA splitting
  xen/gntdev: Prevent leaking grants
  xen/virtio: Fix potential deadlock when accessing xen_grant_dma_devices
  xen/virtio: Fix n_pages calculation in xen_grant_dma_map(unmap)_page()
  xen/xenbus: Fix spelling mistake "hardward" -> "hardware"
  xen-pcifront: Handle missed Connected state
Linus Torvalds 2022-10-12 14:39:38 -07:00
commit 778ce723e9
12 changed files with 318 additions and 247 deletions


@@ -6851,6 +6851,12 @@
 			Crash from Xen panic notifier, without executing late
 			panic() code such as dumping handler.
 
+	xen_msr_safe=	[X86,XEN]
+			Format: <bool>
+			Select whether to always use non-faulting (safe) MSR
+			access functions when running as Xen PV guest. The
+			default value is controlled by CONFIG_XEN_PV_MSR_SAFE.
+
 	xen_nopvspin	[X86,XEN]
 			Disables the qspinlock slowpath using Xen PV optimizations.
 			This parameter is obsoleted by "nopvspin" parameter, which


@@ -92,3 +92,12 @@ config XEN_DOM0
     select X86_X2APIC if XEN_PVH && X86_64
     help
       Support running as a Xen Dom0 guest.
+
+config XEN_PV_MSR_SAFE
+    bool "Always use safe MSR accesses in PV guests"
+    default y
+    depends on XEN_PV
+    help
+      Use safe (not faulting) MSR access functions even if the MSR access
+      should not fault anyway.
+      The default can be changed by using the "xen_msr_safe" boot parameter.
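For illustration only (not part of the patches above): a minimal user-space sketch of how the build-time default and the boot parameter combine. parse_bool_arg() stands in for the kernel's strtobool(), and the macro below stands in for IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); both names are made up.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); "default y" above. */
#define XEN_PV_MSR_SAFE_DEFAULT true

static bool xen_msr_safe = XEN_PV_MSR_SAFE_DEFAULT;

/* Simplified stand-in for strtobool(): accepts "0"/"n" and "1"/"y". */
static int parse_bool_arg(const char *s, bool *res)
{
    if (!strcmp(s, "1") || !strcmp(s, "y")) { *res = true;  return 0; }
    if (!strcmp(s, "0") || !strcmp(s, "n")) { *res = false; return 0; }
    return -1;
}

int main(void)
{
    /* "xen_msr_safe=0" on the command line overrides the build-time default. */
    parse_bool_arg("0", &xen_msr_safe);
    printf("always use safe MSR accesses: %s\n", xen_msr_safe ? "yes" : "no");
    return 0;
}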


@@ -212,7 +212,7 @@ static void __init xen_hvm_guest_init(void)
         return;
 
     if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
-        virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+        virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
 
     init_hvm_pv_info();


@@ -108,11 +108,21 @@ struct tls_descs {
  */
 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
 
+static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE);
+
+static int __init parse_xen_msr_safe(char *str)
+{
+    if (str)
+        return strtobool(str, &xen_msr_safe);
+
+    return -EINVAL;
+}
+early_param("xen_msr_safe", parse_xen_msr_safe);
+
 static void __init xen_pv_init_platform(void)
 {
     /* PV guests can't operate virtio devices without grants. */
     if (IS_ENABLED(CONFIG_XEN_VIRTIO))
-        virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+        virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
 
     populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
 
@@ -917,14 +927,18 @@ static void xen_write_cr4(unsigned long cr4)
         native_write_cr4(cr4);
 }
 
-static u64 xen_read_msr_safe(unsigned int msr, int *err)
+static u64 xen_do_read_msr(unsigned int msr, int *err)
 {
-    u64 val;
+    u64 val = 0;    /* Avoid uninitialized value for safe variant. */
 
     if (pmu_msr_read(msr, &val, err))
         return val;
 
-    val = native_read_msr_safe(msr, err);
+    if (err)
+        val = native_read_msr_safe(msr, err);
+    else
+        val = native_read_msr(msr);
+
     switch (msr) {
     case MSR_IA32_APICBASE:
         val &= ~X2APIC_ENABLE;
@@ -933,23 +947,39 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
     return val;
 }
 
-static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+static void set_seg(unsigned int which, unsigned int low, unsigned int high,
+                    int *err)
 {
-    int ret;
-    unsigned int which;
-    u64 base;
+    u64 base = ((u64)high << 32) | low;
 
-    ret = 0;
+    if (HYPERVISOR_set_segment_base(which, base) == 0)
+        return;
 
+    if (err)
+        *err = -EIO;
+    else
+        WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base);
+}
+
+/*
+ * Support write_msr_safe() and write_msr() semantics.
+ * With err == NULL write_msr() semantics are selected.
+ * Supplying an err pointer requires err to be pre-initialized with 0.
+ */
+static void xen_do_write_msr(unsigned int msr, unsigned int low,
+                             unsigned int high, int *err)
+{
     switch (msr) {
-    case MSR_FS_BASE:        which = SEGBASE_FS; goto set;
-    case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
-    case MSR_GS_BASE:        which = SEGBASE_GS_KERNEL; goto set;
-    set:
-        base = ((u64)high << 32) | low;
-        if (HYPERVISOR_set_segment_base(which, base) != 0)
-            ret = -EIO;
+    case MSR_FS_BASE:
+        set_seg(SEGBASE_FS, low, high, err);
+        break;
+
+    case MSR_KERNEL_GS_BASE:
+        set_seg(SEGBASE_GS_USER, low, high, err);
+        break;
+
+    case MSR_GS_BASE:
+        set_seg(SEGBASE_GS_KERNEL, low, high, err);
         break;
 
     case MSR_STAR:
@@ -965,31 +995,42 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
         break;
 
     default:
-        if (!pmu_msr_write(msr, low, high, &ret))
-            ret = native_write_msr_safe(msr, low, high);
+        if (!pmu_msr_write(msr, low, high, err)) {
+            if (err)
+                *err = native_write_msr_safe(msr, low, high);
+            else
+                native_write_msr(msr, low, high);
+        }
     }
+}
 
-    return ret;
+static u64 xen_read_msr_safe(unsigned int msr, int *err)
+{
+    return xen_do_read_msr(msr, err);
+}
+
+static int xen_write_msr_safe(unsigned int msr, unsigned int low,
+                              unsigned int high)
+{
+    int err = 0;
+
+    xen_do_write_msr(msr, low, high, &err);
+
+    return err;
 }
 
 static u64 xen_read_msr(unsigned int msr)
 {
-    /*
-     * This will silently swallow a #GP from RDMSR.  It may be worth
-     * changing that.
-     */
     int err;
 
-    return xen_read_msr_safe(msr, &err);
+    return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
 }
 
 static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
 {
-    /*
-     * This will silently swallow a #GP from WRMSR.  It may be worth
-     * changing that.
-     */
-    xen_write_msr_safe(msr, low, high);
+    int err;
+
+    xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL);
 }
 
 /* This is called once we have the cpu_possible_mask */
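The err-pointer convention used above (err == NULL selects the faulting write_msr()/read_msr() semantics, a zero-initialized err selects the *_safe() variants) can be modeled outside the kernel. The sketch below is illustrative only; msr_exists() and the MSR numbers are made up.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool msr_exists(unsigned int msr)
{
    return msr == 0xc0000100;        /* pretend only FS_BASE exists */
}

/* err == NULL: unsafe semantics (a PV guest would take the #GP).
 * err != NULL: safe semantics (the error is reported, no fault). */
static uint64_t do_read_msr(unsigned int msr, int *err)
{
    uint64_t val = 0;        /* avoid an uninitialized value on the safe path */

    if (!msr_exists(msr)) {
        if (err)
            *err = -EIO;
        else
            fprintf(stderr, "#GP: RDMSR 0x%x would fault\n", msr);
        return val;
    }
    return 0x1234;
}

int main(void)
{
    int err = 0;

    do_read_msr(0xdeadbeef, &err);        /* safe variant: err becomes -EIO */
    printf("safe read err = %d\n", err);
    do_read_msr(0xdeadbeef, NULL);        /* unsafe variant: reports the fault */
    return 0;
}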


@@ -131,6 +131,10 @@ static inline uint32_t get_fam15h_addr(u32 addr)
 
 static inline bool is_amd_pmu_msr(unsigned int msr)
 {
+    if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+        boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+        return false;
+
     if ((msr >= MSR_F15H_PERF_CTL &&
          msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
         (msr >= MSR_K7_EVNTSEL0 &&
@@ -140,10 +144,15 @@ static inline bool is_amd_pmu_msr(unsigned int msr)
         return false;
 }
 
-static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index)
 {
     u32 msr_index_pmc;
 
+    if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+        boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR &&
+        boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+        return false;
+
     switch (msr_index) {
     case MSR_CORE_PERF_FIXED_CTR_CTRL:
     case MSR_IA32_DS_AREA:
@@ -290,48 +299,52 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
     return false;
 }
 
+static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read,
+                                 bool *emul)
+{
+    int type, index;
+
+    if (is_amd_pmu_msr(msr))
+        *emul = xen_amd_pmu_emulate(msr, val, is_read);
+    else if (is_intel_pmu_msr(msr, &type, &index))
+        *emul = xen_intel_pmu_emulate(msr, val, type, index, is_read);
+    else
+        return false;
+
+    return true;
+}
+
 bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
 {
-    if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
-        if (is_amd_pmu_msr(msr)) {
-            if (!xen_amd_pmu_emulate(msr, val, 1))
-                *val = native_read_msr_safe(msr, err);
-            return true;
-        }
-    } else {
-        int type, index;
+    bool emulated;
 
-        if (is_intel_pmu_msr(msr, &type, &index)) {
-            if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
-                *val = native_read_msr_safe(msr, err);
-            return true;
-        }
+    if (!pmu_msr_chk_emulated(msr, val, true, &emulated))
+        return false;
+
+    if (!emulated) {
+        *val = err ? native_read_msr_safe(msr, err)
+                   : native_read_msr(msr);
     }
 
-    return false;
+    return true;
 }
 
 bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
 {
     uint64_t val = ((uint64_t)high << 32) | low;
+    bool emulated;
 
-    if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
-        if (is_amd_pmu_msr(msr)) {
-            if (!xen_amd_pmu_emulate(msr, &val, 0))
-                *err = native_write_msr_safe(msr, low, high);
-            return true;
-        }
-    } else {
-        int type, index;
-
-        if (is_intel_pmu_msr(msr, &type, &index)) {
-            if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
-                *err = native_write_msr_safe(msr, low, high);
-            return true;
-        }
+    if (!pmu_msr_chk_emulated(msr, &val, false, &emulated))
+        return false;
+
+    if (!emulated) {
+        if (err)
+            *err = native_write_msr_safe(msr, low, high);
+        else
+            native_write_msr(msr, low, high);
     }
 
-    return false;
+    return true;
 }
 
 static unsigned long long xen_amd_read_pmc(int counter)
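The refactored flow above has three outcomes: not a PMU MSR at all (fall back to the native accessor), a PMU MSR handled by emulation, or a PMU MSR passed through. A user-space sketch of that decision follows; the predicates and MSR numbers are made up, the real code checks the CPU vendor and the vendor-specific MSR ranges.

#include <stdbool.h>
#include <stdio.h>

enum pmu_msr_result { NOT_PMU_MSR, PMU_EMULATED, PMU_PASS_THROUGH };

/* Made-up predicates standing in for is_amd_pmu_msr()/is_intel_pmu_msr()
 * and the xen_*_pmu_emulate() helpers. */
static bool is_pmu_msr(unsigned int msr)        { return msr >= 0xc0010000 && msr < 0xc0010008; }
static bool emulation_handles(unsigned int msr) { return msr == 0xc0010000; }

static enum pmu_msr_result classify(unsigned int msr)
{
    if (!is_pmu_msr(msr))
        return NOT_PMU_MSR;        /* caller uses native_{read,write}_msr*() */
    return emulation_handles(msr) ? PMU_EMULATED : PMU_PASS_THROUGH;
}

int main(void)
{
    printf("%d %d %d\n", classify(0x10), classify(0xc0010000), classify(0xc0010004));
    return 0;
}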


@@ -521,24 +521,14 @@ static int pcifront_rescan_root(struct pcifront_device *pdev,
     int err;
     struct pci_bus *b;
 
-#ifndef CONFIG_PCI_DOMAINS
-    if (domain != 0) {
-        dev_err(&pdev->xdev->dev,
-            "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
-        dev_err(&pdev->xdev->dev,
-            "Please compile with CONFIG_PCI_DOMAINS\n");
-        return -EINVAL;
-    }
-#endif
-
-    dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
-         domain, bus);
-
     b = pci_find_bus(domain, bus);
     if (!b)
         /* If the bus is unknown, create it. */
         return pcifront_scan_root(pdev, domain, bus);
 
+    dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
+         domain, bus);
+
     err = pcifront_scan_bus(pdev, domain, bus, b);
 
     /* Claim resources before going "live" with our devices */
 
@@ -819,76 +809,73 @@ out:
     return err;
 }
 
-static int pcifront_try_connect(struct pcifront_device *pdev)
+static void pcifront_connect(struct pcifront_device *pdev)
 {
-    int err = -EFAULT;
+    int err;
     int i, num_roots, len;
     char str[64];
     unsigned int domain, bus;
 
-    /* Only connect once */
-    if (xenbus_read_driver_state(pdev->xdev->nodename) !=
-        XenbusStateInitialised)
-        goto out;
-
-    err = pcifront_connect_and_init_dma(pdev);
-    if (err && err != -EEXIST) {
-        xenbus_dev_fatal(pdev->xdev, err,
-                 "Error setting up PCI Frontend");
-        goto out;
-    }
-
     err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
                "root_num", "%d", &num_roots);
     if (err == -ENOENT) {
         xenbus_dev_error(pdev->xdev, err,
                  "No PCI Roots found, trying 0000:00");
-        err = pcifront_scan_root(pdev, 0, 0);
+        err = pcifront_rescan_root(pdev, 0, 0);
         if (err) {
             xenbus_dev_fatal(pdev->xdev, err,
                      "Error scanning PCI root 0000:00");
-            goto out;
+            return;
         }
         num_roots = 0;
     } else if (err != 1) {
-        if (err == 0)
-            err = -EINVAL;
-        xenbus_dev_fatal(pdev->xdev, err,
+        xenbus_dev_fatal(pdev->xdev, err >= 0 ? -EINVAL : err,
                  "Error reading number of PCI roots");
-        goto out;
+        return;
     }
 
     for (i = 0; i < num_roots; i++) {
         len = snprintf(str, sizeof(str), "root-%d", i);
-        if (unlikely(len >= (sizeof(str) - 1))) {
-            err = -ENOMEM;
-            goto out;
-        }
+        if (unlikely(len >= (sizeof(str) - 1)))
+            return;
 
         err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
                    "%x:%x", &domain, &bus);
         if (err != 2) {
-            if (err >= 0)
-                err = -EINVAL;
-            xenbus_dev_fatal(pdev->xdev, err,
+            xenbus_dev_fatal(pdev->xdev, err >= 0 ? -EINVAL : err,
                      "Error reading PCI root %d", i);
-            goto out;
+            return;
         }
 
-        err = pcifront_scan_root(pdev, domain, bus);
+        err = pcifront_rescan_root(pdev, domain, bus);
         if (err) {
             xenbus_dev_fatal(pdev->xdev, err,
                      "Error scanning PCI root %04x:%02x",
                      domain, bus);
-            goto out;
+            return;
         }
     }
 
-    err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+    xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+}
 
-out:
-    return err;
+static void pcifront_try_connect(struct pcifront_device *pdev)
+{
+    int err;
+
+    /* Only connect once */
+    if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+        XenbusStateInitialised)
+        return;
+
+    err = pcifront_connect_and_init_dma(pdev);
+    if (err && err != -EEXIST) {
+        xenbus_dev_fatal(pdev->xdev, err,
+                 "Error setting up PCI Frontend");
+        return;
+    }
+
+    pcifront_connect(pdev);
 }
 
 static int pcifront_try_disconnect(struct pcifront_device *pdev)
 
@@ -914,80 +901,37 @@ out:
     return err;
 }
 
-static int pcifront_attach_devices(struct pcifront_device *pdev)
+static void pcifront_attach_devices(struct pcifront_device *pdev)
 {
-    int err = -EFAULT;
-    int i, num_roots, len;
-    unsigned int domain, bus;
-    char str[64];
-
-    if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+    if (xenbus_read_driver_state(pdev->xdev->nodename) ==
         XenbusStateReconfiguring)
-        goto out;
-
-    err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
-               "root_num", "%d", &num_roots);
-    if (err == -ENOENT) {
-        xenbus_dev_error(pdev->xdev, err,
-                 "No PCI Roots found, trying 0000:00");
-        err = pcifront_rescan_root(pdev, 0, 0);
-        if (err) {
-            xenbus_dev_fatal(pdev->xdev, err,
-                     "Error scanning PCI root 0000:00");
-            goto out;
-        }
-        num_roots = 0;
-    } else if (err != 1) {
-        if (err == 0)
-            err = -EINVAL;
-        xenbus_dev_fatal(pdev->xdev, err,
-                 "Error reading number of PCI roots");
-        goto out;
-    }
-
-    for (i = 0; i < num_roots; i++) {
-        len = snprintf(str, sizeof(str), "root-%d", i);
-        if (unlikely(len >= (sizeof(str) - 1))) {
-            err = -ENOMEM;
-            goto out;
-        }
-
-        err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
-                   "%x:%x", &domain, &bus);
-        if (err != 2) {
-            if (err >= 0)
-                err = -EINVAL;
-            xenbus_dev_fatal(pdev->xdev, err,
-                     "Error reading PCI root %d", i);
-            goto out;
-        }
-
-        err = pcifront_rescan_root(pdev, domain, bus);
-        if (err) {
-            xenbus_dev_fatal(pdev->xdev, err,
-                     "Error scanning PCI root %04x:%02x",
-                     domain, bus);
-            goto out;
-        }
-    }
-
-    xenbus_switch_state(pdev->xdev, XenbusStateConnected);
-
-out:
-    return err;
+        pcifront_connect(pdev);
 }
 
 static int pcifront_detach_devices(struct pcifront_device *pdev)
 {
     int err = 0;
     int i, num_devs;
+    enum xenbus_state state;
     unsigned int domain, bus, slot, func;
     struct pci_dev *pci_dev;
     char str[64];
 
-    if (xenbus_read_driver_state(pdev->xdev->nodename) !=
-        XenbusStateConnected)
+    state = xenbus_read_driver_state(pdev->xdev->nodename);
+    if (state == XenbusStateInitialised) {
+        dev_dbg(&pdev->xdev->dev, "Handle skipped connect.\n");
+        /* We missed Connected and need to initialize. */
+        err = pcifront_connect_and_init_dma(pdev);
+        if (err && err != -EEXIST) {
+            xenbus_dev_fatal(pdev->xdev, err,
+                     "Error setting up PCI Frontend");
+            goto out;
+        }
+
+        goto out_switch_state;
+    } else if (state != XenbusStateConnected) {
         goto out;
+    }
 
     err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
                &num_devs);
 
@@ -1048,6 +992,7 @@ static int pcifront_detach_devices(struct pcifront_device *pdev)
                  domain, bus, slot, func);
     }
 
+ out_switch_state:
     err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
 
 out:
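A rough user-space model of the xenbus state handling above (the enum values and function names below are illustrative, not the driver's API): connect only runs from Initialised, attach re-runs connect from Reconfiguring, and detach now copes with a backend that never reported Connected.

#include <stdio.h>

enum state { INITIALISED, CONNECTED, RECONFIGURING };

static void frontend_connect(void) { printf("scan roots, switch to Connected\n"); }
static void late_init_dma(void)    { printf("late frontend init\n"); }

static int detach_devices(enum state s)
{
    if (s == INITIALISED) {
        /* Backend skipped Connected: finish init, then reconfigure. */
        late_init_dma();
        printf("switch to Reconfiguring\n");
        return 0;
    }
    if (s != CONNECTED)
        return -1;        /* nothing to do yet */
    printf("detach devices, switch to Reconfiguring\n");
    return 0;
}

int main(void)
{
    frontend_connect();
    detach_devices(INITIALISED);        /* missed Connected case */
    detach_devices(CONNECTED);          /* normal case */
    return 0;
}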


@@ -56,7 +56,7 @@ config XEN_MEMORY_HOTPLUG_LIMIT
     depends on XEN_HAVE_PVMMU
     depends on MEMORY_HOTPLUG
     help
-      Maxmium amount of memory (in GiB) that a PV guest can be
+      Maximum amount of memory (in GiB) that a PV guest can be
       expanded to when using memory hotplug.
 
       A PV guest can have more memory than this limit if is


@@ -44,9 +44,10 @@ struct gntdev_unmap_notify {
 };
 
 struct gntdev_grant_map {
+    atomic_t in_use;
     struct mmu_interval_notifier notifier;
+    bool notifier_init;
     struct list_head next;
-    struct vm_area_struct *vma;
     int index;
     int count;
     int flags;


@@ -286,6 +286,9 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
          */
     }
 
+    if (use_ptemod && map->notifier_init)
+        mmu_interval_notifier_remove(&map->notifier);
+
     if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
         notify_remote_via_evtchn(map->notify.event);
         evtchn_put(map->notify.event);
 
@@ -298,7 +301,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
 static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data)
 {
     struct gntdev_grant_map *map = data;
-    unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+    unsigned int pgnr = (addr - map->pages_vm_start) >> PAGE_SHIFT;
     int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte |
             (1 << _GNTMAP_guest_avail0);
     u64 pte_maddr;
 
@@ -367,8 +370,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
     for (i = 0; i < map->count; i++) {
         if (map->map_ops[i].status == GNTST_okay) {
             map->unmap_ops[i].handle = map->map_ops[i].handle;
-            if (!use_ptemod)
-                alloced++;
+            alloced++;
         } else if (!err)
             err = -EINVAL;
 
@@ -377,8 +379,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 
         if (use_ptemod) {
             if (map->kmap_ops[i].status == GNTST_okay) {
-                if (map->map_ops[i].status == GNTST_okay)
-                    alloced++;
+                alloced++;
                 map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
             } else if (!err)
                 err = -EINVAL;
 
@@ -394,8 +395,14 @@ static void __unmap_grant_pages_done(int result,
     unsigned int i;
     struct gntdev_grant_map *map = data->data;
     unsigned int offset = data->unmap_ops - map->unmap_ops;
+    int successful_unmaps = 0;
+    int live_grants;
 
     for (i = 0; i < data->count; i++) {
+        if (map->unmap_ops[offset + i].status == GNTST_okay &&
+            map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
+            successful_unmaps++;
+
         WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay &&
             map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE);
         pr_debug("unmap handle=%d st=%d\n",
 
@@ -403,6 +410,10 @@ static void __unmap_grant_pages_done(int result,
             map->unmap_ops[offset+i].status);
         map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
         if (use_ptemod) {
+            if (map->kunmap_ops[offset + i].status == GNTST_okay &&
+                map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
+                successful_unmaps++;
+
             WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay &&
                 map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE);
             pr_debug("kunmap handle=%u st=%d\n",
 
@@ -411,11 +422,15 @@ static void __unmap_grant_pages_done(int result,
             map->kunmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
         }
     }
+
     /*
      * Decrease the live-grant counter. This must happen after the loop to
      * prevent premature reuse of the grants by gnttab_mmap().
      */
-    atomic_sub(data->count, &map->live_grants);
+    live_grants = atomic_sub_return(successful_unmaps, &map->live_grants);
+    if (WARN_ON(live_grants < 0))
+        pr_err("%s: live_grants became negative (%d) after unmapping %d pages!\n",
+               __func__, live_grants, successful_unmaps);
 
     /* Release reference taken by __unmap_grant_pages */
     gntdev_put_map(NULL, map);
 
@@ -496,11 +511,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
     struct gntdev_priv *priv = file->private_data;
 
     pr_debug("gntdev_vma_close %p\n", vma);
-    if (use_ptemod) {
-        WARN_ON(map->vma != vma);
-        mmu_interval_notifier_remove(&map->notifier);
-        map->vma = NULL;
-    }
+
     vma->vm_private_data = NULL;
     gntdev_put_map(priv, map);
 }
 
@@ -528,29 +539,30 @@ static bool gntdev_invalidate(struct mmu_interval_notifier *mn,
     struct gntdev_grant_map *map =
         container_of(mn, struct gntdev_grant_map, notifier);
     unsigned long mstart, mend;
+    unsigned long map_start, map_end;
 
     if (!mmu_notifier_range_blockable(range))
         return false;
 
+    map_start = map->pages_vm_start;
+    map_end = map->pages_vm_start + (map->count << PAGE_SHIFT);
+
     /*
      * If the VMA is split or otherwise changed the notifier is not
      * updated, but we don't want to process VA's outside the modified
      * VMA. FIXME: It would be much more understandable to just prevent
      * modifying the VMA in the first place.
      */
-    if (map->vma->vm_start >= range->end ||
-        map->vma->vm_end <= range->start)
+    if (map_start >= range->end || map_end <= range->start)
         return true;
 
-    mstart = max(range->start, map->vma->vm_start);
-    mend = min(range->end, map->vma->vm_end);
+    mstart = max(range->start, map_start);
+    mend = min(range->end, map_end);
     pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
-         map->index, map->count,
-         map->vma->vm_start, map->vma->vm_end,
-         range->start, range->end, mstart, mend);
-    unmap_grant_pages(map,
-            (mstart - map->vma->vm_start) >> PAGE_SHIFT,
-            (mend - mstart) >> PAGE_SHIFT);
+         map->index, map->count, map_start, map_end,
+         range->start, range->end, mstart, mend);
+    unmap_grant_pages(map, (mstart - map_start) >> PAGE_SHIFT,
+              (mend - mstart) >> PAGE_SHIFT);
 
     return true;
 }
 
@@ -1030,18 +1042,15 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
         return -EINVAL;
 
     pr_debug("map %d+%d at %lx (pgoff %lx)\n",
          index, count, vma->vm_start, vma->vm_pgoff);
 
     mutex_lock(&priv->lock);
     map = gntdev_find_map_index(priv, index, count);
     if (!map)
         goto unlock_out;
-    if (use_ptemod && map->vma)
+    if (!atomic_add_unless(&map->in_use, 1, 1))
         goto unlock_out;
-    if (atomic_read(&map->live_grants)) {
-        err = -EAGAIN;
-        goto unlock_out;
-    }
+
     refcount_inc(&map->users);
 
     vma->vm_ops = &gntdev_vmops;
 
@@ -1062,15 +1071,16 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
             map->flags |= GNTMAP_readonly;
     }
 
+    map->pages_vm_start = vma->vm_start;
+
     if (use_ptemod) {
-        map->vma = vma;
         err = mmu_interval_notifier_insert_locked(
             &map->notifier, vma->vm_mm, vma->vm_start,
             vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
-        if (err) {
-            map->vma = NULL;
+        if (err)
             goto out_unlock_put;
-        }
+
+        map->notifier_init = true;
     }
     mutex_unlock(&priv->lock);
 
@@ -1087,7 +1097,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
      */
     mmu_interval_read_begin(&map->notifier);
 
-    map->pages_vm_start = vma->vm_start;
     err = apply_to_page_range(vma->vm_mm, vma->vm_start,
                   vma->vm_end - vma->vm_start,
                   find_grant_ptes, map);
 
@@ -1116,13 +1125,8 @@ unlock_out:
 out_unlock_put:
     mutex_unlock(&priv->lock);
 out_put_map:
-    if (use_ptemod) {
+    if (use_ptemod)
         unmap_grant_pages(map, 0, map->count);
-        if (map->vma) {
-            mmu_interval_notifier_remove(&map->notifier);
-            map->vma = NULL;
-        }
-    }
+
     gntdev_put_map(priv, map);
     return err;
 }
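Worked example of the clamping arithmetic used in gntdev_invalidate() above, with made-up addresses: the map covers [map_start, map_end) and, once the VMA has been split, an invalidation range may overlap only part of it.

#include <stdio.h>

#define PAGE_SHIFT 12UL

int main(void)
{
    unsigned long map_start = 0x7f0000100000UL;
    unsigned long map_count = 8;                              /* pages in the map */
    unsigned long map_end   = map_start + (map_count << PAGE_SHIFT);

    /* Invalidation covering the second half of the mapping and beyond. */
    unsigned long range_start = map_start + (4UL << PAGE_SHIFT);
    unsigned long range_end   = map_end + (16UL << PAGE_SHIFT);

    unsigned long mstart = range_start > map_start ? range_start : map_start;
    unsigned long mend   = range_end   < map_end   ? range_end   : map_end;

    printf("unmap offset %lu, %lu pages\n",
           (mstart - map_start) >> PAGE_SHIFT,                /* 4 */
           (mend - mstart) >> PAGE_SHIFT);                    /* 4 */
    return 0;
}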


@@ -25,7 +25,7 @@ struct xen_grant_dma_data {
     bool broken;
 };
 
-static DEFINE_XARRAY(xen_grant_dma_devices);
+static DEFINE_XARRAY_FLAGS(xen_grant_dma_devices, XA_FLAGS_LOCK_IRQ);
 
 #define XEN_GRANT_DMA_ADDR_OFF    (1ULL << 63)
 
@@ -42,14 +42,29 @@ static inline grant_ref_t dma_to_grant(dma_addr_t dma)
 static struct xen_grant_dma_data *find_xen_grant_dma_data(struct device *dev)
 {
     struct xen_grant_dma_data *data;
+    unsigned long flags;
 
-    xa_lock(&xen_grant_dma_devices);
+    xa_lock_irqsave(&xen_grant_dma_devices, flags);
     data = xa_load(&xen_grant_dma_devices, (unsigned long)dev);
-    xa_unlock(&xen_grant_dma_devices);
+    xa_unlock_irqrestore(&xen_grant_dma_devices, flags);
 
     return data;
 }
 
+static int store_xen_grant_dma_data(struct device *dev,
+                                    struct xen_grant_dma_data *data)
+{
+    unsigned long flags;
+    int ret;
+
+    xa_lock_irqsave(&xen_grant_dma_devices, flags);
+    ret = xa_err(__xa_store(&xen_grant_dma_devices, (unsigned long)dev, data,
+            GFP_ATOMIC));
+    xa_unlock_irqrestore(&xen_grant_dma_devices, flags);
+
+    return ret;
+}
+
 /*
  * DMA ops for Xen frontends (e.g. virtio).
  *
 
@@ -153,7 +168,7 @@ static dma_addr_t xen_grant_dma_map_page(struct device *dev, struct page *page,
                      unsigned long attrs)
 {
     struct xen_grant_dma_data *data;
-    unsigned int i, n_pages = PFN_UP(size);
+    unsigned int i, n_pages = PFN_UP(offset + size);
     grant_ref_t grant;
     dma_addr_t dma_handle;
 
@@ -185,7 +200,8 @@ static void xen_grant_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
                      unsigned long attrs)
 {
     struct xen_grant_dma_data *data;
-    unsigned int i, n_pages = PFN_UP(size);
+    unsigned long offset = dma_handle & (PAGE_SIZE - 1);
+    unsigned int i, n_pages = PFN_UP(offset + size);
     grant_ref_t grant;
 
     if (WARN_ON(dir == DMA_NONE))
 
@@ -273,34 +289,68 @@ static const struct dma_map_ops xen_grant_dma_ops = {
     .dma_supported = xen_grant_dma_supported,
 };
 
-bool xen_is_grant_dma_device(struct device *dev)
+static bool xen_is_dt_grant_dma_device(struct device *dev)
 {
     struct device_node *iommu_np;
     bool has_iommu;
 
-    /* XXX Handle only DT devices for now */
-    if (!dev->of_node)
-        return false;
-
     iommu_np = of_parse_phandle(dev->of_node, "iommus", 0);
-    has_iommu = iommu_np && of_device_is_compatible(iommu_np, "xen,grant-dma");
+    has_iommu = iommu_np &&
+            of_device_is_compatible(iommu_np, "xen,grant-dma");
     of_node_put(iommu_np);
 
     return has_iommu;
 }
 
+bool xen_is_grant_dma_device(struct device *dev)
+{
+    /* XXX Handle only DT devices for now */
+    if (dev->of_node)
+        return xen_is_dt_grant_dma_device(dev);
+
+    return false;
+}
+
 bool xen_virtio_mem_acc(struct virtio_device *dev)
 {
-    if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
+    if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain())
         return true;
 
     return xen_is_grant_dma_device(dev->dev.parent);
 }
 
+static int xen_dt_grant_init_backend_domid(struct device *dev,
+                                           struct xen_grant_dma_data *data)
+{
+    struct of_phandle_args iommu_spec;
+
+    if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells",
+            0, &iommu_spec)) {
+        dev_err(dev, "Cannot parse iommus property\n");
+        return -ESRCH;
+    }
+
+    if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") ||
+            iommu_spec.args_count != 1) {
+        dev_err(dev, "Incompatible IOMMU node\n");
+        of_node_put(iommu_spec.np);
+        return -ESRCH;
+    }
+
+    of_node_put(iommu_spec.np);
+
+    /*
+     * The endpoint ID here means the ID of the domain where the
+     * corresponding backend is running
+     */
+    data->backend_domid = iommu_spec.args[0];
+
+    return 0;
+}
+
 void xen_grant_setup_dma_ops(struct device *dev)
 {
     struct xen_grant_dma_data *data;
-    struct of_phandle_args iommu_spec;
 
     data = find_xen_grant_dma_data(dev);
     if (data) {
 
@@ -308,37 +358,22 @@ void xen_grant_setup_dma_ops(struct device *dev)
         return;
     }
 
-    /* XXX ACPI device unsupported for now */
-    if (!dev->of_node)
-        goto err;
-
-    if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells",
-            0, &iommu_spec)) {
-        dev_err(dev, "Cannot parse iommus property\n");
-        goto err;
-    }
-
-    if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") ||
-            iommu_spec.args_count != 1) {
-        dev_err(dev, "Incompatible IOMMU node\n");
-        of_node_put(iommu_spec.np);
-        goto err;
-    }
-
-    of_node_put(iommu_spec.np);
-
     data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
     if (!data)
         goto err;
 
-    /*
-     * The endpoint ID here means the ID of the domain where the corresponding
-     * backend is running
-     */
-    data->backend_domid = iommu_spec.args[0];
+    if (dev->of_node) {
+        if (xen_dt_grant_init_backend_domid(dev, data))
+            goto err;
+    } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) {
+        dev_info(dev, "Using dom0 as backend\n");
+        data->backend_domid = 0;
+    } else {
+        /* XXX ACPI device unsupported for now */
+        goto err;
+    }
 
-    if (xa_err(xa_store(&xen_grant_dma_devices, (unsigned long)dev, data,
-            GFP_KERNEL))) {
+    if (store_xen_grant_dma_data(dev, data)) {
         dev_err(dev, "Cannot store Xen grant DMA data\n");
         goto err;
     }
 
@@ -348,9 +383,20 @@ void xen_grant_setup_dma_ops(struct device *dev)
     return;
 
 err:
+    devm_kfree(dev, data);
     dev_err(dev, "Cannot set up Xen grant DMA ops, retain platform DMA ops\n");
 }
 
+bool xen_virtio_restricted_mem_acc(struct virtio_device *dev)
+{
+    bool ret = xen_virtio_mem_acc(dev);
+
+    if (ret)
+        xen_grant_setup_dma_ops(dev->dev.parent);
+
+    return ret;
+}
+
 MODULE_DESCRIPTION("Xen grant DMA-mapping layer");
 MODULE_AUTHOR("Juergen Gross <jgross@suse.com>");
 MODULE_LICENSE("GPL");
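Worked example of why the map/unmap paths above need the in-page offset (values are illustrative): a buffer that starts near the end of a page spans one more page than PFN_UP(size) alone suggests, which is exactly the n_pages fix in this hunk.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PFN_UP(x) (((x) + PAGE_SIZE - 1) / PAGE_SIZE)

int main(void)
{
    unsigned long offset = 0xf00;    /* 3840 bytes into the first page */
    unsigned long size   = 0x200;    /* 512-byte buffer */

    printf("PFN_UP(size)          = %lu\n", PFN_UP(size));          /* 1 */
    printf("PFN_UP(offset + size) = %lu\n", PFN_UP(offset + size)); /* 2 */
    return 0;
}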


@@ -31,7 +31,7 @@ MODULE_PARM_DESC(passthrough,
     " frontend (for example, a device at 06:01.b will still appear at\n"\
     " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\
     " exposed PCI devices to its driver domains. This may be required\n"\
-    " for drivers which depend on finding their hardward in certain\n"\
+    " for drivers which depend on finding their hardware in certain\n"\
     " bus/slot locations.");
 
 static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)


@@ -219,6 +219,7 @@ static inline void xen_preemptible_hcall_end(void) { }
 void xen_grant_setup_dma_ops(struct device *dev);
 bool xen_is_grant_dma_device(struct device *dev);
 bool xen_virtio_mem_acc(struct virtio_device *dev);
+bool xen_virtio_restricted_mem_acc(struct virtio_device *dev);
 #else
 static inline void xen_grant_setup_dma_ops(struct device *dev)
 {
 
@@ -234,6 +235,11 @@ static inline bool xen_virtio_mem_acc(struct virtio_device *dev)
 {
     return false;
 }
+
+static inline bool xen_virtio_restricted_mem_acc(struct virtio_device *dev)
+{
+    return false;
+}
 #endif /* CONFIG_XEN_GRANT_DMA_OPS */
 
 #endif /* INCLUDE_XEN_OPS_H */
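The #else branch above uses the usual compile-out idiom: when CONFIG_XEN_GRANT_DMA_OPS is not set, static inline stubs keep callers building without #ifdefs at each call site. A self-contained sketch of the same pattern (CONFIG_FEATURE_X and feature_x_available() are made up):

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_FEATURE_X 0

#if CONFIG_FEATURE_X
bool feature_x_available(void);        /* real implementation provided elsewhere */
#else
/* Compiled-out stub: callers build and the compiler folds the constant. */
static inline bool feature_x_available(void)
{
    return false;
}
#endif

int main(void)
{
    printf("feature x: %s\n", feature_x_available() ? "yes" : "no");
    return 0;
}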