From bb6ab52f2befe1fb29ac198f27d8a6aadf510f81 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 31 Mar 2016 17:42:15 +0200 Subject: [PATCH 01/18] intel_pstate: Do not set utilization update hook too early The utilization update hook in the intel_pstate driver is set too early, as it only should be set after the policy has been fully initialized by the core. That may cause intel_pstate_update_util() to use incorrect data and put the CPUs into incorrect P-states as a result. To prevent that from happening, make intel_pstate_set_policy() set the utilization update hook instead of intel_pstate_init_cpu() so intel_pstate_update_util() only runs when all things have been initialized as appropriate. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4b644526fd59..81057e48c4de 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1103,7 +1103,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_sample(cpu, 0); cpu->update_util.func = intel_pstate_update_util; - cpufreq_set_update_util_data(cpunum, &cpu->update_util); pr_debug("intel_pstate: controlling: cpu %d\n", cpunum); @@ -1122,18 +1121,29 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) return get_avg_frequency(cpu); } +static void intel_pstate_set_update_util_hook(unsigned int cpu) +{ + cpufreq_set_update_util_data(cpu, &all_cpu_data[cpu]->update_util); +} + +static void intel_pstate_clear_update_util_hook(unsigned int cpu) +{ + cpufreq_set_update_util_data(cpu, NULL); + synchronize_sched(); +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { if (!policy->cpuinfo.max_freq) return -ENODEV; + intel_pstate_clear_update_util_hook(policy->cpu); + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE && policy->max >= policy->cpuinfo.max_freq) { pr_debug("intel_pstate: set performance\n"); limits = &performance_limits; - if (hwp_active) - intel_pstate_hwp_set(policy->cpus); - return 0; + goto out; } pr_debug("intel_pstate: set powersave\n"); @@ -1163,6 +1173,9 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) limits->max_perf = div_fp(int_tofp(limits->max_perf_pct), int_tofp(100)); + out: + intel_pstate_set_update_util_hook(policy->cpu); + if (hwp_active) intel_pstate_hwp_set(policy->cpus); @@ -1187,8 +1200,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) pr_debug("intel_pstate: CPU %d exiting\n", cpu_num); - cpufreq_set_update_util_data(cpu_num, NULL); - synchronize_sched(); + intel_pstate_clear_update_util_hook(cpu_num); if (hwp_active) return; @@ -1455,8 +1467,7 @@ out: get_online_cpus(); for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { - cpufreq_set_update_util_data(cpu, NULL); - synchronize_sched(); + intel_pstate_clear_update_util_hook(cpu); kfree(all_cpu_data[cpu]); } } From febce40febcff3ccdb33f63456ffc4cfc61640c8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 2 Apr 2016 01:06:21 +0200 Subject: [PATCH 02/18] intel_pstate: Avoid extra invocation of intel_pstate_sample() The initialization of intel_pstate for a given CPU involves populating the fields of its struct cpudata that represent the previous sample, but currently that is done in a problematic way. Namely, intel_pstate_init_cpu() makes an extra call to intel_pstate_sample() so it reads the current register values that will be used to populate the "previous sample" record during the next invocation of intel_pstate_sample(). However, after commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) that doesn't work for last_sample_time, because the time value is passed to intel_pstate_sample() as an argument now. Passing 0 to it from intel_pstate_init_cpu() is problematic, because that causes cpu->last_sample_time == 0 to be visible in get_target_pstate_use_performance() (and hence the extra cpu->last_sample_time > 0 check in there) and effectively allows the first invocation of intel_pstate_sample() from intel_pstate_update_util() to happen immediately after the initialization which may lead to a significant "turn on" effect in the governor algorithm. To mitigate that issue, rework the initialization to avoid the extra intel_pstate_sample() call from intel_pstate_init_cpu(). Instead, make intel_pstate_sample() return false if it has been called with cpu->sample.time equal to zero, which will make intel_pstate_update_util() skip the sample in that case, and reset cpu->sample.time from intel_pstate_set_update_util_hook() to make the algorithm start properly every time the hook is set. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 81057e48c4de..9ae159631f52 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -910,7 +910,14 @@ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) cpu->prev_aperf = aperf; cpu->prev_mperf = mperf; cpu->prev_tsc = tsc; - return true; + /* + * First time this function is invoked in a given cycle, all of the + * previous sample data fields are equal to zero or stale and they must + * be populated with meaningful numbers for things to work, so assume + * that sample.time will always be reset before setting the utilization + * update hook and make the caller skip the sample then. + */ + return !!cpu->last_sample_time; } static inline int32_t get_avg_frequency(struct cpudata *cpu) @@ -984,8 +991,7 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) * enough period of time to adjust our busyness. */ duration_ns = cpu->sample.time - cpu->last_sample_time; - if ((s64)duration_ns > pid_params.sample_rate_ns * 3 - && cpu->last_sample_time > 0) { + if ((s64)duration_ns > pid_params.sample_rate_ns * 3) { sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns), int_tofp(duration_ns)); core_busy = mul_fp(core_busy, sample_ratio); @@ -1100,7 +1106,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_get_cpu_pstates(cpu); intel_pstate_busy_pid_reset(cpu); - intel_pstate_sample(cpu, 0); cpu->update_util.func = intel_pstate_update_util; @@ -1121,9 +1126,13 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) return get_avg_frequency(cpu); } -static void intel_pstate_set_update_util_hook(unsigned int cpu) +static void intel_pstate_set_update_util_hook(unsigned int cpu_num) { - cpufreq_set_update_util_data(cpu, &all_cpu_data[cpu]->update_util); + struct cpudata *cpu = all_cpu_data[cpu_num]; + + /* Prevent intel_pstate_update_util() from using stale data. */ + cpu->sample.time = 0; + cpufreq_set_update_util_data(cpu_num, &cpu->update_util); } static void intel_pstate_clear_update_util_hook(unsigned int cpu) From 30a3915385f124a6e3c81df4070f531d4f84299b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 3 Apr 2016 19:42:11 -0700 Subject: [PATCH 03/18] cpufreq: intel_pstate: fix inconsistency in setting policy limits When user sets performance policy using cpufreq interface, it is possible that because of policy->max limits, the actual performance is still limited. But the current implementation will silently switch the policy to powersave and start using powersave limits. If user modifies any limits using intel_pstate sysfs, this is actually changing powersave limits. The current implementation tracks limits under powersave and performance policy using two different variables. When policy->max is less than policy->cpuinfo.max_freq, only powersave limit variable is used. This fix causes the performance limits variable to be used always when the policy is performance. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9ae159631f52..c6c169a1b2fc 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1141,6 +1141,20 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu) synchronize_sched(); } +static void intel_pstate_set_performance_limits(struct perf_limits *limits) +{ + limits->no_turbo = 0; + limits->turbo_disabled = 0; + limits->max_perf_pct = 100; + limits->max_perf = int_tofp(1); + limits->min_perf_pct = 100; + limits->min_perf = int_tofp(1); + limits->max_policy_pct = 100; + limits->max_sysfs_pct = 100; + limits->min_policy_pct = 0; + limits->min_sysfs_pct = 0; +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { if (!policy->cpuinfo.max_freq) @@ -1148,15 +1162,18 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_clear_update_util_hook(policy->cpu); - if (policy->policy == CPUFREQ_POLICY_PERFORMANCE && - policy->max >= policy->cpuinfo.max_freq) { - pr_debug("intel_pstate: set performance\n"); + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { limits = &performance_limits; - goto out; + if (policy->max >= policy->cpuinfo.max_freq) { + pr_debug("intel_pstate: set performance\n"); + intel_pstate_set_performance_limits(limits); + goto out; + } + } else { + pr_debug("intel_pstate: set powersave\n"); + limits = &powersave_limits; } - pr_debug("intel_pstate: set powersave\n"); - limits = &powersave_limits; limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100); limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, From 13ad7701f9d0ab7806eb91f1fe1ca43d41b31fa2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 3 Apr 2016 13:06:46 -0700 Subject: [PATCH 04/18] cpufreq: intel_pstate: Documenation for structures No code change. Only added kernel doc style comments for structures. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 135 +++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index c6c169a1b2fc..8b5a415ee14a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -64,6 +64,25 @@ static inline int ceiling_fp(int32_t x) return ret; } +/** + * struct sample - Store performance sample + * @core_pct_busy: Ratio of APERF/MPERF in percent, which is actual + * performance during last sample period + * @busy_scaled: Scaled busy value which is used to calculate next + * P state. This can be different than core_pct_busy + * to account for cpu idle period + * @aperf: Difference of actual performance frequency clock count + * read from APERF MSR between last and current sample + * @mperf: Difference of maximum performance frequency clock count + * read from MPERF MSR between last and current sample + * @tsc: Difference of time stamp counter between last and + * current sample + * @freq: Effective frequency calculated from APERF/MPERF + * @time: Current time from scheduler + * + * This structure is used in the cpudata structure to store performance sample + * data for choosing next P State. + */ struct sample { int32_t core_pct_busy; int32_t busy_scaled; @@ -74,6 +93,20 @@ struct sample { u64 time; }; +/** + * struct pstate_data - Store P state data + * @current_pstate: Current requested P state + * @min_pstate: Min P state possible for this platform + * @max_pstate: Max P state possible for this platform + * @max_pstate_physical:This is physical Max P state for a processor + * This can be higher than the max_pstate which can + * be limited by platform thermal design power limits + * @scaling: Scaling factor to convert frequency to cpufreq + * frequency units + * @turbo_pstate: Max Turbo P state possible for this platform + * + * Stores the per cpu model P state limits and current P state. + */ struct pstate_data { int current_pstate; int min_pstate; @@ -83,6 +116,19 @@ struct pstate_data { int turbo_pstate; }; +/** + * struct vid_data - Stores voltage information data + * @min: VID data for this platform corresponding to + * the lowest P state + * @max: VID data corresponding to the highest P State. + * @turbo: VID data for turbo P state + * @ratio: Ratio of (vid max - vid min) / + * (max P state - Min P State) + * + * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) + * This data is used in Atom platforms, where in addition to target P state, + * the voltage data needs to be specified to select next P State. + */ struct vid_data { int min; int max; @@ -90,6 +136,18 @@ struct vid_data { int32_t ratio; }; +/** + * struct _pid - Stores PID data + * @setpoint: Target set point for busyness or performance + * @integral: Storage for accumulated error values + * @p_gain: PID proportional gain + * @i_gain: PID integral gain + * @d_gain: PID derivative gain + * @deadband: PID deadband + * @last_err: Last error storage for integral part of PID calculation + * + * Stores PID coefficients and last error for PID controller. + */ struct _pid { int setpoint; int32_t integral; @@ -100,6 +158,23 @@ struct _pid { int32_t last_err; }; +/** + * struct cpudata - Per CPU instance data storage + * @cpu: CPU number for this instance data + * @update_util: CPUFreq utility callback information + * @pstate: Stores P state limits for this CPU + * @vid: Stores VID limits for this CPU + * @pid: Stores PID parameters for this CPU + * @last_sample_time: Last Sample time + * @prev_aperf: Last APERF value read from APERF MSR + * @prev_mperf: Last MPERF value read from MPERF MSR + * @prev_tsc: Last timestamp counter (TSC) value + * @prev_cummulative_iowait: IO Wait time difference from last and + * current sample + * @sample: Storage for storing last Sample data + * + * This structure stores per CPU instance data for all CPUs. + */ struct cpudata { int cpu; @@ -118,6 +193,19 @@ struct cpudata { }; static struct cpudata **all_cpu_data; + +/** + * struct pid_adjust_policy - Stores static PID configuration data + * @sample_rate_ms: PID calculation sample rate in ms + * @sample_rate_ns: Sample rate calculation in ns + * @deadband: PID deadband + * @setpoint: PID Setpoint + * @p_gain_pct: PID proportional gain + * @i_gain_pct: PID integral gain + * @d_gain_pct: PID derivative gain + * + * Stores per CPU model static PID configuration data. + */ struct pstate_adjust_policy { int sample_rate_ms; s64 sample_rate_ns; @@ -128,6 +216,20 @@ struct pstate_adjust_policy { int i_gain_pct; }; +/** + * struct pstate_funcs - Per CPU model specific callbacks + * @get_max: Callback to get maximum non turbo effective P state + * @get_max_physical: Callback to get maximum non turbo physical P state + * @get_min: Callback to get minimum P state + * @get_turbo: Callback to get turbo P state + * @get_scaling: Callback to get frequency scaling factor + * @get_val: Callback to convert P state to actual MSR write value + * @get_vid: Callback to get VID data for Atom platforms + * @get_target_pstate: Callback to a function to calculate next P state to use + * + * Core and Atom CPU models have different way to get P State limits. This + * structure is used to store those callbacks. + */ struct pstate_funcs { int (*get_max)(void); int (*get_max_physical)(void); @@ -139,6 +241,11 @@ struct pstate_funcs { int32_t (*get_target_pstate)(struct cpudata *); }; +/** + * struct cpu_defaults- Per CPU model default config data + * @pid_policy: PID config data + * @funcs: Callback function data + */ struct cpu_defaults { struct pstate_adjust_policy pid_policy; struct pstate_funcs funcs; @@ -151,6 +258,34 @@ static struct pstate_adjust_policy pid_params; static struct pstate_funcs pstate_funcs; static int hwp_active; + +/** + * struct perf_limits - Store user and policy limits + * @no_turbo: User requested turbo state from intel_pstate sysfs + * @turbo_disabled: Platform turbo status either from msr + * MSR_IA32_MISC_ENABLE or when maximum available pstate + * matches the maximum turbo pstate + * @max_perf_pct: Effective maximum performance limit in percentage, this + * is minimum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @min_perf_pct: Effective minimum performance limit in percentage, this + * is maximum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct + * This value is used to limit max pstate + * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct + * This value is used to limit min pstate + * @max_policy_pct: The maximum performance in percentage enforced by + * cpufreq setpolicy interface + * @max_sysfs_pct: The maximum performance in percentage enforced by + * intel pstate sysfs interface + * @min_policy_pct: The minimum performance in percentage enforced by + * cpufreq setpolicy interface + * @min_sysfs_pct: The minimum performance in percentage enforced by + * intel pstate sysfs interface + * + * Storage for user and policy defined limits. + */ struct perf_limits { int no_turbo; int turbo_disabled; From b318556479cc923970a79d6c2311138581c0db83 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 24 Mar 2016 09:51:57 +0530 Subject: [PATCH 05/18] cpufreq: dt: Drop stale comment The comment in file header doesn't hold true anymore, drop it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index f951f911786e..5f8dbe640a20 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -4,9 +4,6 @@ * Copyright (C) 2014 Linaro. * Viresh Kumar * - * The OPP code in function set_target() is reused from - * drivers/cpufreq/omap-cpufreq.c - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. From 169b38373fd4a7d47cac0e5fdeeae90b8c076f32 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Wed, 6 Apr 2016 12:49:24 -0500 Subject: [PATCH 06/18] mailbox: pcc: Don't access an unmapped memory address space The acpi_pcc_probe() may end up accessing memory outside of the PCCT table space causing the kernel panic(). Increment the pcct_entry pointer after parsing 'HW-reduced Communications Subspace' to fix the problem. This change also enables the parsing of subtable at index 0. Signed-off-by: Shanker Donthineni Acked-by: Ashwin Chaugule Signed-off-by: Rafael J. Wysocki --- drivers/mailbox/pcc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index ac11a9b395d3..9d3955e6ab44 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -367,8 +367,6 @@ static int __init acpi_pcc_probe(void) struct acpi_generic_address *db_reg; struct acpi_pcct_hw_reduced *pcct_ss; pcc_mbox_channels[i].con_priv = pcct_entry; - pcct_entry = (struct acpi_subtable_header *) - ((unsigned long) pcct_entry + pcct_entry->length); /* If doorbell is in system memory cache the virt address */ pcct_ss = (struct acpi_pcct_hw_reduced *)pcct_entry; @@ -376,6 +374,8 @@ static int __init acpi_pcc_probe(void) if (db_reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) pcc_doorbell_vaddr[i] = acpi_os_ioremap(db_reg->address, db_reg->bit_width/8); + pcct_entry = (struct acpi_subtable_header *) + ((unsigned long) pcct_entry + pcct_entry->length); } pcc_mbox_ctrl.num_chans = count; From f70415496d5ddf06fe7e0a22250d60bab2b2d7cc Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:48 -0400 Subject: [PATCH 07/18] intel_idle: remove useless return from void function. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ba947df5a8c7..9d5ed3264135 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1260,8 +1260,6 @@ static void __exit intel_idle_exit(void) __unregister_cpu_notifier(&cpu_hotplug_notifier); cpu_notifier_register_done(); - - return; } module_init(intel_idle_init); From 5469c827d20ab013f43d4f5f94e101d0cf7afd2c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:49 -0400 Subject: [PATCH 08/18] intel_idle: Fix a helper function's return value. The function, intel_idle_cpuidle_driver_init, delivers no error codes at all. This patch changes the function to return 'void' instead of returning zero. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9d5ed3264135..dfa90558babf 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1111,7 +1111,7 @@ static void intel_idle_state_table_update(void) * intel_idle_cpuidle_driver_init() * allocate, initialize cpuidle_states */ -static int __init intel_idle_cpuidle_driver_init(void) +static void __init intel_idle_cpuidle_driver_init(void) { int cstate; struct cpuidle_driver *drv = &intel_idle_driver; @@ -1173,8 +1173,6 @@ static int __init intel_idle_cpuidle_driver_init(void) if (icpu->disable_promotion_to_c1e) /* each-cpu is redundant */ on_each_cpu(c1e_promotion_disable, NULL, 1); - - return 0; } From 4a3dfb3fc0fb0fc9acd36c94b7145f9c9dd4d93a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:50 -0400 Subject: [PATCH 09/18] intel_idle: Remove redundant initialization calls. The function, intel_idle_cpuidle_driver_init, makes calls on each CPU to auto_demotion_disable() and c1e_promotion_disable(). These calls are redundant, as intel_idle_cpu_init() does the same calls just a bit later on. They are also premature, as the driver registration may yet fail. This patch removes the redundant code. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index dfa90558babf..cb85c4c42ff9 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1163,16 +1163,10 @@ static void __init intel_idle_cpuidle_driver_init(void) drv->state_count += 1; } - if (icpu->auto_demotion_disable_flags) - on_each_cpu(auto_demotion_disable, NULL, 1); - if (icpu->byt_auto_demotion_disable_flag) { wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0); wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0); } - - if (icpu->disable_promotion_to_c1e) /* each-cpu is redundant */ - on_each_cpu(c1e_promotion_disable, NULL, 1); } From 51319918bcc31f901646fc66348d41cf74ee0566 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:51 -0400 Subject: [PATCH 10/18] intel_idle: Fix deallocation order on the driver exit path. In the module_exit() method, this driver first frees its per-CPU pointer, then unregisters a callback making use of the pointer. Furthermore, the function, intel_idle_cpuidle_devices_uninit, is racy against CPU hot plugging as it calls for_each_online_cpu(). This patch corrects the issues by unregistering first on the exit path while holding the hot plug lock. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cb85c4c42ff9..4a1de3dcbf85 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1242,16 +1242,16 @@ static int __init intel_idle_init(void) static void __exit intel_idle_exit(void) { - intel_idle_cpuidle_devices_uninit(); - cpuidle_unregister_driver(&intel_idle_driver); - cpu_notifier_register_begin(); if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) on_each_cpu(__setup_broadcast_timer, (void *)false, 1); __unregister_cpu_notifier(&cpu_hotplug_notifier); + intel_idle_cpuidle_devices_uninit(); cpu_notifier_register_done(); + + cpuidle_unregister_driver(&intel_idle_driver); } module_init(intel_idle_init); From e9df69ccd1322e87eee10f28036fad9e6c71f8dd Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:52 -0400 Subject: [PATCH 11/18] intel_idle: Fix dangling registration on error path. In the module_init() method, if the per-CPU allocation fails, then the active cpuidle registration is not cleaned up. This patch fixes the issue by attempting the allocation before registration, and then cleaning it up again on registration failure. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4a1de3dcbf85..5dd741ff4de3 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1210,19 +1210,20 @@ static int __init intel_idle_init(void) if (retval) return retval; + intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); + if (intel_idle_cpuidle_devices == NULL) + return -ENOMEM; + intel_idle_cpuidle_driver_init(); retval = cpuidle_register_driver(&intel_idle_driver); if (retval) { struct cpuidle_driver *drv = cpuidle_get_driver(); printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", drv ? drv->name : "none"); + free_percpu(intel_idle_cpuidle_devices); return retval; } - intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); - if (intel_idle_cpuidle_devices == NULL) - return -ENOMEM; - cpu_notifier_register_begin(); for_each_online_cpu(i) { From ca42489d9ee3262482717c83428e087322fdc39c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:53 -0400 Subject: [PATCH 12/18] intel_idle: Avoid a double free of the per-CPU data. The helper function, intel_idle_cpuidle_devices_uninit, frees the globally allocated per-CPU data. However, this function is invoked from the hot plug notifier callback at a time when freeing that data is not safe. If the call to cpuidle_register_driver() should fail (say, due to lack of memory), then the driver will free its per-CPU region. On the *next* CPU_ONLINE event, the driver will happily use the region again and even free it again if the failure repeats. This patch fixes the issue by moving the call to free_percpu() outside of the helper function at the two call sites that actually need to free the per-CPU data. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5dd741ff4de3..0b568728867e 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1002,7 +1002,7 @@ static int __init intel_idle_probe(void) /* * intel_idle_cpuidle_devices_uninit() - * unregister, free cpuidle_devices + * Unregisters the cpuidle devices. */ static void intel_idle_cpuidle_devices_uninit(void) { @@ -1013,9 +1013,6 @@ static void intel_idle_cpuidle_devices_uninit(void) dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); cpuidle_unregister_device(dev); } - - free_percpu(intel_idle_cpuidle_devices); - return; } /* @@ -1231,6 +1228,7 @@ static int __init intel_idle_init(void) if (retval) { cpu_notifier_register_done(); cpuidle_unregister_driver(&intel_idle_driver); + free_percpu(intel_idle_cpuidle_devices); return retval; } } @@ -1253,6 +1251,7 @@ static void __exit intel_idle_exit(void) cpu_notifier_register_done(); cpuidle_unregister_driver(&intel_idle_driver); + free_percpu(intel_idle_cpuidle_devices); } module_init(intel_idle_init); From 2259a819a8d37e472f08c88bc0dd22194754adb4 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:54 -0400 Subject: [PATCH 13/18] intel_idle: Setup the timer broadcast only on successful driver load. This driver sets the broadcast tick quite early on during probe and does not clean up again in cast of failure. This patch moves the setup call after the registration, placing the on_each_cpu() calls within the global CPU lock region. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 0b568728867e..ab34cd8b933c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -987,16 +987,9 @@ static int __init intel_idle_probe(void) icpu = (const struct idle_cpu *)id->driver_data; cpuidle_state_table = icpu->state_table; - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ - lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - else - on_each_cpu(__setup_broadcast_timer, (void *)true, 1); - pr_debug(PREFIX "v" INTEL_IDLE_VERSION " model 0x%X\n", boot_cpu_data.x86_model); - pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", - lapic_timer_reliable_states); return 0; } @@ -1234,8 +1227,16 @@ static int __init intel_idle_init(void) } __register_cpu_notifier(&cpu_hotplug_notifier); + if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ + lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; + else + on_each_cpu(__setup_broadcast_timer, (void *)true, 1); + cpu_notifier_register_done(); + pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", + lapic_timer_reliable_states); + return 0; } From b69ef2c099c3e5f11bd5c33a9530d6522f72c9aa Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:55 -0400 Subject: [PATCH 14/18] intel_idle: Don't overreact to a cpuidle registration failure. The helper function, intel_idle_cpu_init, registers one new device with the cpuidle layer. If the registration should fail, that function immediately calls intel_idle_cpuidle_devices_uninit() to unregister every last CPU's device. However, it makes no sense to do so, when called from the hot plug notifier callback. This patch moves the call to intel_idle_cpuidle_devices_uninit() outside of the helper function to the one call site that actually needs to perform the de-registrations. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ab34cd8b933c..4418cfa2a4fc 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1175,7 +1175,6 @@ static int intel_idle_cpu_init(int cpu) if (cpuidle_register_device(dev)) { pr_debug(PREFIX "cpuidle_register_device %d failed!\n", cpu); - intel_idle_cpuidle_devices_uninit(); return -EIO; } @@ -1219,6 +1218,7 @@ static int __init intel_idle_init(void) for_each_online_cpu(i) { retval = intel_idle_cpu_init(i); if (retval) { + intel_idle_cpuidle_devices_uninit(); cpu_notifier_register_done(); cpuidle_unregister_driver(&intel_idle_driver); free_percpu(intel_idle_cpuidle_devices); From 08820546e4c30c84d0a1f1a49df055e1719c07ea Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:56 -0400 Subject: [PATCH 15/18] intel_idle: Propagate hot plug errors. If a cpuidle registration error occurs during the hot plug notifier callback, we should really inform the hot plug machinery instead of just ignoring the error. This patch changes the callback to properly return on error. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4418cfa2a4fc..8420ba10a434 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -818,8 +818,11 @@ static int cpu_hotplug_notify(struct notifier_block *n, * driver in this case */ dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); - if (!dev->registered) - intel_idle_cpu_init(hotcpu); + if (dev->registered) + break; + + if (intel_idle_cpu_init(hotcpu)) + return NOTIFY_BAD; break; } From 3e66a9ab53641a0f7a440e56f7b35bf5d77494b3 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 6 Apr 2016 17:00:57 -0400 Subject: [PATCH 16/18] intel_idle: Clean up all registered devices on exit. This driver registers cpuidle devices when a CPU comes online, but it leaves the registrations in place when a CPU goes offline. The module exit code only unregisters the currently online CPUs, leaving the devices for offline CPUs dangling. This patch changes the driver to clean up all registrations on exit, even those from CPUs that are offline. Signed-off-by: Richard Cochran Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 8420ba10a434..862346bd053a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1245,12 +1245,19 @@ static int __init intel_idle_init(void) static void __exit intel_idle_exit(void) { + struct cpuidle_device *dev; + int i; + cpu_notifier_register_begin(); if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) on_each_cpu(__setup_broadcast_timer, (void *)false, 1); __unregister_cpu_notifier(&cpu_hotplug_notifier); - intel_idle_cpuidle_devices_uninit(); + + for_each_possible_cpu(i) { + dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); + cpuidle_unregister_device(dev); + } cpu_notifier_register_done(); From f9e71657c2c0a8f1c50884ab45794be2854e158e Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:00:58 -0400 Subject: [PATCH 17/18] intel_idle: Add SKX support SKX is similar to BDX Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 862346bd053a..4677c3da03a0 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -660,6 +660,35 @@ static struct cpuidle_state skl_cstates[] = { .enter = NULL } }; +static struct cpuidle_state skx_cstates[] = { + { + .name = "C1-SKX", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-SKX", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-SKX", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, + .target_residency = 600, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] = { { .name = "C1E-ATM", @@ -907,6 +936,10 @@ static const struct idle_cpu idle_cpu_skl = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_skx = { + .state_table = skx_cstates, + .disable_promotion_to_c1e = true, +}; static const struct idle_cpu idle_cpu_avn = { .state_table = avn_cstates, @@ -948,6 +981,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x55, idle_cpu_skx), ICPU(0x57, idle_cpu_knl), {} }; From 3ce093d4de753d6c92cc09366e29d0618a62f542 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:00:59 -0400 Subject: [PATCH 18/18] intel_idle: Add KBL support KBL is similar to SKL Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4677c3da03a0..c6935de425fa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -981,6 +981,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x8e, idle_cpu_skl), + ICPU(0x9e, idle_cpu_skl), ICPU(0x55, idle_cpu_skx), ICPU(0x57, idle_cpu_knl), {}