hyperv-fixes for 6.4-rc8

-----BEGIN PGP SIGNATURE-----
 
 iQFHBAABCgAxFiEEIbPD0id6easf0xsudhRwX5BBoF4FAmSQ3ioTHHdlaS5saXVA
 a2VybmVsLm9yZwAKCRB2FHBfkEGgXpREB/9nMJ5PbgsxpqKiV3ckodXZp7wLkFAv
 VK12KBZcjAr8kbZON0CHXWssC/QLBV9+UYDjvA7ciEjkzBZoIY8GMAjFZ4NNveTm
 ssZPaxg0DHX7SzVO6qDrZBwjyGmjPh8vH5TDsb6QPYk8WMuwYy+QZMWTEcxr7QU4
 o3GRbt+JShS05s5Q1B3pSeztyxDxJh1potyoTfaY1sbih0c+r6mtewlpRW3KgoSc
 ukssybTmNyRRpDos/PlT2e0gRpIzlYQnzE+sj4mGOOQFh4wGOR8wGcNPr4yirrcI
 gy/4nvIwxp0uLW0C30FBlqzNt9dirOSRflXq/Pp4MdQcSM3hpeONbDxx
 =0aJ5
 -----END PGP SIGNATURE-----

Merge tag 'hyperv-fixes-signed-20230619' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv fixes from Wei Liu:

 - Fix races in Hyper-V PCI controller (Dexuan Cui)

 - Fix handling of hyperv_pcpu_input_arg (Michael Kelley)

 - Fix vmbus_wait_for_unload to scan present CPUs (Michael Kelley)

 - Call hv_synic_free in the failure path of hv_synic_alloc (Dexuan Cui)

 - Add noop for real mode handlers for virtual trust level code (Saurabh
   Sengar)

* tag 'hyperv-fixes-signed-20230619' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux:
  PCI: hv: Add a per-bus mutex state_lock
  Revert "PCI: hv: Fix a timing issue which causes kdump to fail occasionally"
  PCI: hv: Remove the useless hv_pcichild_state from struct hv_pci_dev
  PCI: hv: Fix a race condition in hv_irq_unmask() that can cause panic
  PCI: hv: Fix a race condition bug in hv_pci_query_relations()
  arm64/hyperv: Use CPUHP_AP_HYPERV_ONLINE state to fix CPU online sequencing
  x86/hyperv: Fix hyperv_pcpu_input_arg handling when CPUs go online/offline
  Drivers: hv: vmbus: Fix vmbus_wait_for_unload() to scan present CPUs
  Drivers: hv: vmbus: Call hv_synic_free() if hv_synic_alloc() fails
  x86/hyperv/vtl: Add noop for realmode pointers
This commit is contained in:
Linus Torvalds 2023-06-19 17:05:43 -07:00
commit 692b7dc87c
8 changed files with 129 additions and 88 deletions

View File

@ -67,7 +67,7 @@ static int __init hyperv_init(void)
if (ret)
return ret;
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64/hyperv_init:online",
ret = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "arm64/hyperv_init:online",
hv_common_cpu_init, hv_common_cpu_die);
if (ret < 0) {
hv_common_free();

View File

@ -416,7 +416,7 @@ void __init hyperv_init(void)
goto free_vp_assist_page;
}
cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online",
hv_cpu_init, hv_cpu_die);
if (cpuhp < 0)
goto free_ghcb_page;

View File

@ -20,6 +20,8 @@ void __init hv_vtl_init_platform(void)
{
pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
x86_platform.realmode_reserve = x86_init_noop;
x86_platform.realmode_init = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.timers.timer_init = x86_init_noop;

View File

@ -829,11 +829,22 @@ static void vmbus_wait_for_unload(void)
if (completion_done(&vmbus_connection.unload_event))
goto completed;
for_each_online_cpu(cpu) {
for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
/*
* In a CoCo VM the synic_message_page is not allocated
* in hv_synic_alloc(). Instead it is set/cleared in
* hv_synic_enable_regs() and hv_synic_disable_regs()
* such that it is set only when the CPU is online. If
* not all present CPUs are online, the message page
* might be NULL, so skip such CPUs.
*/
page_addr = hv_cpu->synic_message_page;
if (!page_addr)
continue;
msg = (struct hv_message *)page_addr
+ VMBUS_MESSAGE_SINT;
@ -867,11 +878,14 @@ completed:
* maybe-pending messages on all CPUs to be able to receive new
* messages after we reconnect.
*/
for_each_online_cpu(cpu) {
for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
page_addr = hv_cpu->synic_message_page;
if (!page_addr)
continue;
msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
msg->header.message_type = HVMSG_NONE;
}

View File

@ -364,13 +364,20 @@ int hv_common_cpu_init(unsigned int cpu)
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
*inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
if (!(*inputarg))
return -ENOMEM;
if (hv_root_partition) {
outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
/*
* hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
* allocated if this CPU was previously online and then taken offline
*/
if (!*inputarg) {
*inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
if (!(*inputarg))
return -ENOMEM;
if (hv_root_partition) {
outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
}
}
msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
@ -385,24 +392,17 @@ int hv_common_cpu_init(unsigned int cpu)
int hv_common_cpu_die(unsigned int cpu)
{
unsigned long flags;
void **inputarg, **outputarg;
void *mem;
local_irq_save(flags);
inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
mem = *inputarg;
*inputarg = NULL;
if (hv_root_partition) {
outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*outputarg = NULL;
}
local_irq_restore(flags);
kfree(mem);
/*
* The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
* is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
* may be used by the Hyper-V vPCI driver in reassigning interrupts
* as part of the offlining process. The interrupt reassignment
* happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and
* called this function.
*
* If a previously offlined CPU is brought back online again, the
* originally allocated memory is reused in hv_common_cpu_init().
*/
return 0;
}

View File

@ -1372,7 +1372,7 @@ static int vmbus_bus_init(void)
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
hv_synic_init, hv_synic_cleanup);
if (ret < 0)
goto err_cpuhp;
goto err_alloc;
hyperv_cpuhp_online = ret;
ret = vmbus_connect();
@ -1392,9 +1392,8 @@ static int vmbus_bus_init(void)
err_connect:
cpuhp_remove_state(hyperv_cpuhp_online);
err_cpuhp:
hv_synic_free();
err_alloc:
hv_synic_free();
if (vmbus_irq == -1) {
hv_remove_vmbus_handler();
} else {

View File

@ -489,7 +489,10 @@ struct hv_pcibus_device {
struct fwnode_handle *fwnode;
/* Protocol version negotiated with the host */
enum pci_protocol_version_t protocol_version;
struct mutex state_lock;
enum hv_pcibus_state state;
struct hv_device *hdev;
resource_size_t low_mmio_space;
resource_size_t high_mmio_space;
@ -545,19 +548,10 @@ struct hv_dr_state {
struct hv_pcidev_description func[];
};
enum hv_pcichild_state {
hv_pcichild_init = 0,
hv_pcichild_requirements,
hv_pcichild_resourced,
hv_pcichild_ejecting,
hv_pcichild_maximum
};
struct hv_pci_dev {
/* List protected by pci_rescan_remove_lock */
struct list_head list_entry;
refcount_t refs;
enum hv_pcichild_state state;
struct pci_slot *pci_slot;
struct hv_pcidev_description desc;
bool reported_missing;
@ -635,6 +629,11 @@ static void hv_arch_irq_unmask(struct irq_data *data)
pbus = pdev->bus;
hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
int_desc = data->chip_data;
if (!int_desc) {
dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n",
__func__, data->irq);
return;
}
local_irq_save(flags);
@ -2004,12 +2003,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
hv_pci_onchannelcallback(hbus);
spin_unlock_irqrestore(&channel->sched_lock, flags);
if (hpdev->state == hv_pcichild_ejecting) {
dev_err_once(&hbus->hdev->device,
"the device is being ejected\n");
goto enable_tasklet;
}
udelay(100);
}
@ -2615,6 +2608,8 @@ static void pci_devices_present_work(struct work_struct *work)
if (!dr)
return;
mutex_lock(&hbus->state_lock);
/* First, mark all existing children as reported missing. */
spin_lock_irqsave(&hbus->device_list_lock, flags);
list_for_each_entry(hpdev, &hbus->children, list_entry) {
@ -2696,6 +2691,8 @@ static void pci_devices_present_work(struct work_struct *work)
break;
}
mutex_unlock(&hbus->state_lock);
kfree(dr);
}
@ -2844,7 +2841,7 @@ static void hv_eject_device_work(struct work_struct *work)
hpdev = container_of(work, struct hv_pci_dev, wrk);
hbus = hpdev->hbus;
WARN_ON(hpdev->state != hv_pcichild_ejecting);
mutex_lock(&hbus->state_lock);
/*
* Ejection can come before or after the PCI bus has been set up, so
@ -2882,6 +2879,8 @@ static void hv_eject_device_work(struct work_struct *work)
put_pcichild(hpdev);
put_pcichild(hpdev);
/* hpdev has been freed. Do not use it any more. */
mutex_unlock(&hbus->state_lock);
}
/**
@ -2902,7 +2901,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
return;
}
hpdev->state = hv_pcichild_ejecting;
get_pcichild(hpdev);
INIT_WORK(&hpdev->wrk, hv_eject_device_work);
queue_work(hbus->wq, &hpdev->wrk);
@ -3331,8 +3329,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev)
struct pci_bus_d0_entry *d0_entry;
struct hv_pci_compl comp_pkt;
struct pci_packet *pkt;
bool retry = true;
int ret;
enter_d0_retry:
/*
* Tell the host that the bus is ready to use, and moved into the
* powered-on state. This includes telling the host which region
@ -3359,6 +3359,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev)
if (ret)
goto exit;
/*
* In certain case (Kdump) the pci device of interest was
* not cleanly shut down and resource is still held on host
* side, the host could return invalid device status.
* We need to explicitly request host to release the resource
* and try to enter D0 again.
*/
if (comp_pkt.completion_status < 0 && retry) {
retry = false;
dev_err(&hdev->device, "Retrying D0 Entry\n");
/*
* Hv_pci_bus_exit() calls hv_send_resource_released()
* to free up resources of its child devices.
* In the kdump kernel we need to set the
* wslot_res_allocated to 255 so it scans all child
* devices to release resources allocated in the
* normal kernel before panic happened.
*/
hbus->wslot_res_allocated = 255;
ret = hv_pci_bus_exit(hdev, true);
if (ret == 0) {
kfree(pkt);
goto enter_d0_retry;
}
dev_err(&hdev->device,
"Retrying D0 failed with ret %d\n", ret);
}
if (comp_pkt.completion_status < 0) {
dev_err(&hdev->device,
"PCI Pass-through VSP failed D0 Entry with status %x\n",
@ -3401,6 +3433,24 @@ static int hv_pci_query_relations(struct hv_device *hdev)
if (!ret)
ret = wait_for_response(hdev, &comp);
/*
* In the case of fast device addition/removal, it's possible that
* vmbus_sendpacket() or wait_for_response() returns -ENODEV but we
* already got a PCI_BUS_RELATIONS* message from the host and the
* channel callback already scheduled a work to hbus->wq, which can be
* running pci_devices_present_work() -> survey_child_resources() ->
* complete(&hbus->survey_event), even after hv_pci_query_relations()
* exits and the stack variable 'comp' is no longer valid; as a result,
* a hang or a page fault may happen when the complete() calls
* raw_spin_lock_irqsave(). Flush hbus->wq before we exit from
* hv_pci_query_relations() to avoid the issues. Note: if 'ret' is
* -ENODEV, there can't be any more work item scheduled to hbus->wq
* after the flush_workqueue(): see vmbus_onoffer_rescind() ->
* vmbus_reset_channel_cb(), vmbus_rescind_cleanup() ->
* channel->rescind = true.
*/
flush_workqueue(hbus->wq);
return ret;
}
@ -3586,7 +3636,6 @@ static int hv_pci_probe(struct hv_device *hdev,
struct hv_pcibus_device *hbus;
u16 dom_req, dom;
char *name;
bool enter_d0_retry = true;
int ret;
bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
@ -3598,6 +3647,7 @@ static int hv_pci_probe(struct hv_device *hdev,
return -ENOMEM;
hbus->bridge = bridge;
mutex_init(&hbus->state_lock);
hbus->state = hv_pcibus_init;
hbus->wslot_res_allocated = -1;
@ -3703,49 +3753,15 @@ static int hv_pci_probe(struct hv_device *hdev,
if (ret)
goto free_fwnode;
retry:
ret = hv_pci_query_relations(hdev);
if (ret)
goto free_irq_domain;
mutex_lock(&hbus->state_lock);
ret = hv_pci_enter_d0(hdev);
/*
* In certain case (Kdump) the pci device of interest was
* not cleanly shut down and resource is still held on host
* side, the host could return invalid device status.
* We need to explicitly request host to release the resource
* and try to enter D0 again.
* Since the hv_pci_bus_exit() call releases structures
* of all its child devices, we need to start the retry from
* hv_pci_query_relations() call, requesting host to send
* the synchronous child device relations message before this
* information is needed in hv_send_resources_allocated()
* call later.
*/
if (ret == -EPROTO && enter_d0_retry) {
enter_d0_retry = false;
dev_err(&hdev->device, "Retrying D0 Entry\n");
/*
* Hv_pci_bus_exit() calls hv_send_resources_released()
* to free up resources of its child devices.
* In the kdump kernel we need to set the
* wslot_res_allocated to 255 so it scans all child
* devices to release resources allocated in the
* normal kernel before panic happened.
*/
hbus->wslot_res_allocated = 255;
ret = hv_pci_bus_exit(hdev, true);
if (ret == 0)
goto retry;
dev_err(&hdev->device,
"Retrying D0 failed with ret %d\n", ret);
}
if (ret)
goto free_irq_domain;
goto release_state_lock;
ret = hv_pci_allocate_bridge_windows(hbus);
if (ret)
@ -3763,12 +3779,15 @@ retry:
if (ret)
goto free_windows;
mutex_unlock(&hbus->state_lock);
return 0;
free_windows:
hv_pci_free_bridge_windows(hbus);
exit_d0:
(void) hv_pci_bus_exit(hdev, true);
release_state_lock:
mutex_unlock(&hbus->state_lock);
free_irq_domain:
irq_domain_remove(hbus->irq_domain);
free_fwnode:
@ -4018,20 +4037,26 @@ static int hv_pci_resume(struct hv_device *hdev)
if (ret)
goto out;
mutex_lock(&hbus->state_lock);
ret = hv_pci_enter_d0(hdev);
if (ret)
goto out;
goto release_state_lock;
ret = hv_send_resources_allocated(hdev);
if (ret)
goto out;
goto release_state_lock;
prepopulate_bars(hbus);
hv_pci_restore_msi_state(hbus);
hbus->state = hv_pcibus_installed;
mutex_unlock(&hbus->state_lock);
return 0;
release_state_lock:
mutex_unlock(&hbus->state_lock);
out:
vmbus_close(hdev->channel);
return ret;

View File

@ -200,6 +200,7 @@ enum cpuhp_state {
/* Online section invoked on the hotplugged CPU from the hotplug thread */
CPUHP_AP_ONLINE_IDLE,
CPUHP_AP_HYPERV_ONLINE,
CPUHP_AP_KVM_ONLINE,
CPUHP_AP_SCHED_WAIT_EMPTY,
CPUHP_AP_SMPBOOT_THREADS,