From 8bcedb4ce04750e1ccc9a6b6433387f6a9166a56 Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Mon, 6 Jun 2022 23:33:34 +0530 Subject: [PATCH 1/4] x86: Handle idle=nomwait cmdline properly for x86_idle When kernel is booted with idle=nomwait do not use MWAIT as the default idle state. If the user boots the kernel with idle=nomwait, it is a clear direction to not use mwait as the default idle state. However, the current code does not take this into consideration while selecting the default idle state on x86. Fix it by checking for the idle=nomwait boot option in prefer_mwait_c1_over_halt(). Also update the documentation around idle=nomwait appropriately. [ dhansen: tweak commit message ] Signed-off-by: Wyes Karny Signed-off-by: Dave Hansen Tested-by: Zhang Rui Link: https://lkml.kernel.org/r/fdc2dc2d0a1bc21c2f53d989ea2d2ee3ccbc0dbe.1654538381.git-series.wyes.karny@amd.com --- Documentation/admin-guide/pm/cpuidle.rst | 15 +++++++++------ arch/x86/kernel/process.c | 9 ++++++--- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index aec2cd2aaea7..19754beb5a4e 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -612,8 +612,8 @@ the ``menu`` governor to be used on the systems that use the ``ladder`` governor by default this way, for example. The other kernel command line parameters controlling CPU idle time management -described below are only relevant for the *x86* architecture and some of -them affect Intel processors only. +described below are only relevant for the *x86* architecture and references +to ``intel_idle`` affect Intel processors only. The *x86* architecture support code recognizes three kernel command line options related to CPU idle time management: ``idle=poll``, ``idle=halt``, @@ -635,10 +635,13 @@ idle, so it very well may hurt single-thread computations performance as well as energy-efficiency. Thus using it for performance reasons may not be a good idea at all.] -The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes -``acpi_idle`` to be used (as long as all of the information needed by it is -there in the system's ACPI tables), but it is not allowed to use the -``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states. +The ``idle=nomwait`` option prevents the use of ``MWAIT`` instruction of +the CPU to enter idle states. When this option is used, the ``acpi_idle`` +driver will use the ``HLT`` instruction instead of ``MWAIT``. On systems +running Intel processors, this option disables the ``intel_idle`` driver +and forces the use of the ``acpi_idle`` driver instead. Note that in either +case, ``acpi_idle`` driver will function only if all the information needed +by it is in the system's ACPI tables. In addition to the architecture-level kernel command line options affecting CPU idle time management, there are parameters affecting individual ``CPUIdle`` diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..dca2e5e2fae2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -824,6 +824,10 @@ static void amd_e400_idle(void) */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) + return 0; + if (c->x86_vendor != X86_VENDOR_INTEL) return 0; @@ -932,9 +936,8 @@ static int __init idle_setup(char *str) } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C2/C3 - * states. In such case it won't touch the variable - * of boot_option_idle_override. + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else From aebef63cf7ff222660fd321c47e24452e8437673 Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Mon, 6 Jun 2022 23:33:35 +0530 Subject: [PATCH 2/4] x86: Remove vendor checks from prefer_mwait_c1_over_halt Remove vendor checks from prefer_mwait_c1_over_halt function. Restore the decision tree to support MWAIT C1 as the default idle state based on CPUID checks as done by Thomas Gleixner in commit 09fd4b4ef5bc ("x86: use cpuid to check MWAIT support for C1") The decision tree is removed in commit 69fb3676df33 ("x86 idle: remove mwait_idle() and "idle=mwait" cmdline param") Prefer MWAIT when the following conditions are satisfied: 1. CPUID_Fn00000001_ECX [Monitor] should be set 2. CPUID_Fn00000005 should be supported 3. If CPUID_Fn00000005_ECX [EMX] is set then there should be at least one C1 substate available, indicated by CPUID_Fn00000005_EDX [MWaitC1SubStates] bits. Otherwise use HLT for default_idle function. HPC customers who want to optimize for lower latency are known to disable Global C-States in the BIOS. In fact, some vendors allow choosing a BIOS 'performance' profile which explicitly disables C-States. In this scenario, the cpuidle driver will not be loaded and the kernel will continue with the default idle state chosen at boot time. On AMD systems currently the default idle state is HLT which has a higher exit latency compared to MWAIT. The reason for the choice of HLT over MWAIT on AMD systems is: 1. Families prior to 10h didn't support MWAIT 2. Families 10h-15h supported MWAIT, but not MWAIT C1. Hence it was preferable to use HLT as the default state on these systems. However, AMD Family 17h onwards supports MWAIT as well as MWAIT C1. And it is preferable to use MWAIT as the default idle state on these systems, as it has lower exit latencies. The below table represents the exit latency for HLT and MWAIT on AMD Zen 3 system. Exit latency is measured by issuing a wakeup (IPI) to other CPU and measuring how many clock cycles it took to wakeup. Each iteration measures 10K wakeups by pinning source and destination. HLT: 25.0000th percentile : 1900 ns 50.0000th percentile : 2000 ns 75.0000th percentile : 2300 ns 90.0000th percentile : 2500 ns 95.0000th percentile : 2600 ns 99.0000th percentile : 2800 ns 99.5000th percentile : 3000 ns 99.9000th percentile : 3400 ns 99.9500th percentile : 3600 ns 99.9900th percentile : 5900 ns Min latency : 1700 ns Max latency : 5900 ns Total Samples 9999 MWAIT: 25.0000th percentile : 1400 ns 50.0000th percentile : 1500 ns 75.0000th percentile : 1700 ns 90.0000th percentile : 1800 ns 95.0000th percentile : 1900 ns 99.0000th percentile : 2300 ns 99.5000th percentile : 2500 ns 99.9000th percentile : 3200 ns 99.9500th percentile : 3500 ns 99.9900th percentile : 4600 ns Min latency : 1200 ns Max latency : 4600 ns Total Samples 9997 Improvement (99th percentile): 21.74% Below is another result for context_switch2 micro-benchmark, which brings out the impact of improved wakeup latency through increased context-switches per second. with HLT: ------------------------------- 50.0000th percentile : 190184 75.0000th percentile : 191032 90.0000th percentile : 192314 95.0000th percentile : 192520 99.0000th percentile : 192844 MIN : 190148 MAX : 192852 with MWAIT: ------------------------------- 50.0000th percentile : 277444 75.0000th percentile : 278268 90.0000th percentile : 278888 95.0000th percentile : 279164 99.0000th percentile : 280504 MIN : 273278 MAX : 281410 Improvement(99th percentile): ~ 45.46% Signed-off-by: Wyes Karny Signed-off-by: Dave Hansen Acked-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Link: https://ozlabs.org/~anton/junkcode/context_switch2.c Link: https://lkml.kernel.org/r/0cc675d8fd1f55e41b510e10abf2e21b6e9803d5.1654538381.git-series.wyes.karny@amd.com --- arch/x86/include/asm/mwait.h | 1 + arch/x86/kernel/process.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 29dd27b5a339..3a8fdf881313 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -13,6 +13,7 @@ #define MWAIT_SUBSTATE_SIZE 4 #define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) #define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) +#define MWAIT_C1_SUBSTATE_MASK 0xf0 #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index dca2e5e2fae2..5cbfbe566270 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -813,28 +813,43 @@ static void amd_e400_idle(void) } /* - * Intel Core2 and older machines prefer MWAIT over HALT for C1. - * We can't rely on cpuidle installing MWAIT, because it will not load - * on systems that support only C1 -- so the boot default must be MWAIT. + * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf + * exists and whenever MONITOR/MWAIT extensions are present there is at + * least one C1 substate. * - * Some AMD machines are the opposite, they depend on using HALT. - * - * So for default C1, which is used during boot until cpuidle loads, - * use MWAIT-C1 on Intel HW that has it, else use HALT. + * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait + * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { + u32 eax, ebx, ecx, edx; + /* User has disallowed the use of MWAIT. Fallback to HALT */ if (boot_option_idle_override == IDLE_NOMWAIT) return 0; - if (c->x86_vendor != X86_VENDOR_INTEL) + /* MWAIT is not supported on this platform. Fallback to HALT */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) + /* Monitor has a bug. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; - return 1; + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* + * If MWAIT extensions are not available, it is safe to use MWAIT + * with EAX=0, ECX=0. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) + return 1; + + /* + * If MWAIT extensions are available, there should be at least one + * MWAIT C1 substate present. + */ + return (edx & MWAIT_C1_SUBSTATE_MASK); } /* From 6f33a9daff9f07906365c4054c90b225f346cd0e Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Mon, 6 Jun 2022 23:33:36 +0530 Subject: [PATCH 3/4] x86: Fix comment for X86_FEATURE_ZEN The feature X86_FEATURE_ZEN implies that the CPU based on Zen microarchitecture. Call this out explicitly in the comment. Signed-off-by: Wyes Karny Signed-off-by: Dave Hansen Tested-by: Zhang Rui Link: https://lkml.kernel.org/r/9931b01a85120a0d1faf0f244e8de3f2190e774c.1654538381.git-series.wyes.karny@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..6141457cda38 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ +#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ From 3f2adf00f52b5f2e9e9f23bb5c77608fc9ee297c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 19 Jul 2022 13:47:14 -0400 Subject: [PATCH 4/4] x86/cpu: Use MSR_IA32_MISC_ENABLE constants Instead of the magic numbers 1<<11 and 1<<12 use the constants from msr-index.h. This makes it obvious where those bits of MSR_IA32_MISC_ENABLE are consumed (and in fact that Linux consumes them at all) to simple minds that grep for MSR_IA32_MISC_ENABLE_.*_UNAVAIL. Signed-off-by: Paolo Bonzini Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220719174714.2410374-1-pbonzini@redhat.com --- arch/x86/kernel/cpu/intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8321c43554a1..a00dd3e2ab55 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -647,9 +647,9 @@ static void init_intel(struct cpuinfo_x86 *c) unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) + if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) + if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_PEBS); }