From cf11372949434fc4a6194b1ed9eb861d08032d08 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Wed, 19 Oct 2016 22:02:11 -0700 Subject: [PATCH 1/5] x86/vmware: Read tsc_khz only once at boot time Re-factor the vmware platform setup code to query the hypervisor for tsc frequency only once during boot. Since the VMware hypervisor guarantees constant TSC, calibrate_tsc now uses the saved value. Signed-off-by: Alexey Makhalov Acked-by: Alok N Kataria Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/20161020050211.GA25304@amakhalov-virtual-machine Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/vmware.c | 37 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 81160578b91a..4e34da4be297 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -47,6 +47,8 @@ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); +static unsigned long vmware_tsc_khz __ro_after_init; + static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; @@ -56,35 +58,32 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz, lpj; - uint32_t eax, ebx, ecx, edx; - - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", - (unsigned long) tsc_hz / 1000, - (unsigned long) tsc_hz % 1000); - - if (!preset_lpj) { - lpj = ((u64)tsc_hz * 1000); - do_div(lpj, HZ); - preset_lpj = lpj; - } - - return tsc_hz; + return vmware_tsc_khz; } static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; + uint64_t lpj, tsc_khz; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); if (ebx != UINT_MAX) { + lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_khz, 1000); + WARN_ON(tsc_khz >> 32); + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_khz / 1000, + (unsigned long) tsc_khz % 1000); + + if (!preset_lpj) { + do_div(lpj, HZ); + preset_lpj = lpj; + } + + vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ lapic_timer_frequency = ecx / HZ; From 5ccd5f7057d28cdd5a8eadd8d5d158984a0a13a8 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 25 Oct 2016 08:04:32 +0200 Subject: [PATCH 2/5] x86/platform/intel-mid: Unexport intel_mid_pci_set_power_state() There's no module user of this. Signed-off-by: Lukas Wunner Cc: Andy Shevchenko Cc: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b8cf9b508c89d3c69d20a61ff540e666d4243747.1477374931.git.lukas@wunner.de Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/pwr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 5d3b45ad1c03..59628e4b5551 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -270,7 +270,6 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) return 0; } -EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state); void intel_mid_pwr_power_off(void) { From 687bca8d664ac9b098005b57846773eb62040ae0 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Fri, 28 Oct 2016 00:54:30 -0700 Subject: [PATCH 3/5] x86/vmware: Use tsc_khz value for calibrate_cpu() Commit aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") separated the calibration mechanisms for cpu_khz and tsc_khz. Since the vmware hypervisor provides a constant frequency TSC to the guest, this change can lead to divergence between the tsc and the cpu frequency after vmotion, which might confuse the user. Solve this by overriding the x86 platform cpu calibration callback with the vmware specific tsc calibration function. Signed-off-by: Alexey Makhalov Acked-by: Alok N Kataria Cc: linux-doc@vger.kernel.org Cc: pv-drivers@vmware.com Cc: corbet@lwn.net Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/20161028075432.90579-2-amakhalov@vmware.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/vmware.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 4e34da4be297..480790fe2463 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -83,6 +83,7 @@ static void __init vmware_platform_setup(void) vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ From 91d1e54ebd1615d216b7f57324a5e69166a344e0 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Fri, 28 Oct 2016 00:54:31 -0700 Subject: [PATCH 4/5] x86/vmware: Add basic paravirt ops support Add basic paravirt support: 1. Set pv_info.name to "VMware hypervisor" to have proper boot log message Booting paravirtualized kernel on VMware hypervisor instead of "... on bare hardware" 2. Set pv_cpu_ops.io_delay() to empty function - paravirt_noop() to avoid vm-exits on IO delays because io delays they are not required. Signed-off-by: Alexey Makhalov Acked-by: Alok N Kataria Cc: linux-doc@vger.kernel.org Cc: pv-drivers@vmware.com Cc: corbet@lwn.net Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/20161028075432.90579-3-amakhalov@vmware.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/vmware.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 480790fe2463..098a524a1646 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -61,6 +61,16 @@ static unsigned long vmware_get_tsc_khz(void) return vmware_tsc_khz; } +#ifdef CONFIG_PARAVIRT +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware hypervisor"; + pv_cpu_ops.io_delay = paravirt_nop; +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -94,6 +104,8 @@ static void __init vmware_platform_setup(void) } else { pr_warn("Failed to get TSC freq from the hypervisor\n"); } + + vmware_paravirt_ops_setup(); } /* From 80e9a4f21fd7ccce7e9b8439986fd028c9946dda Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Fri, 28 Oct 2016 00:54:32 -0700 Subject: [PATCH 5/5] x86/vmware: Add paravirt sched clock The default sched_clock() implementation is native_sched_clock(). It contains code to handle non constant frequency TSCs, which creates overhead for systems with constant frequency TSCs. The vmware hypervisor guarantees a constant frequency TSC, so native_sched_clock() is not required and slower than a dedicated function which operates with one time calculated conversion factors. Calculate the conversion factors at boot time from the tsc frequency and install an optimized sched_clock() function via paravirt ops. The paravirtualized clock can be disabled on the kernel command line with the new 'no-vmw-sched-clock' option. Signed-off-by: Alexey Makhalov Acked-by: Alok N Kataria Cc: linux-doc@vger.kernel.org Cc: pv-drivers@vmware.com Cc: corbet@lwn.net Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/20161028075432.90579-4-amakhalov@vmware.com Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 4 +++ arch/x86/kernel/cpu/vmware.c | 42 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 37babf91f2cb..b3b2ec00646f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2754,6 +2754,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page fault handling. + no-vmw-sched-clock + [X86,PV_OPS] Disable paravirtualized VMware scheduler + clock and use the default one. + no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting. steal time is computed, but won't influence scheduler behaviour diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 098a524a1646..cdbe38be28fd 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,10 +24,15 @@ #include #include #include +#include #include #include #include #include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "vmware: " fmt #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -62,10 +67,47 @@ static unsigned long vmware_get_tsc_khz(void) } #ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; +static int vmw_sched_clock __initdata = 1; + +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = 0; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static unsigned long long vmware_sched_clock(void) +{ + unsigned long long ns; + + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns -= vmware_cyc2ns.cyc2ns_offset; + return ns; +} + +static void __init vmware_sched_clock_setup(void) +{ + struct cyc2ns_data *d = &vmware_cyc2ns; + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift, + vmware_tsc_khz, NSEC_PER_MSEC, 0); + d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, + d->cyc2ns_shift); + + pv_time_ops.sched_clock = vmware_sched_clock; + pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); +} + static void __init vmware_paravirt_ops_setup(void) { pv_info.name = "VMware hypervisor"; pv_cpu_ops.io_delay = paravirt_nop; + + if (vmware_tsc_khz && vmw_sched_clock) + vmware_sched_clock_setup(); } #else #define vmware_paravirt_ops_setup() do {} while (0)