From d689fe222a858c767cb8594faf280048e532b53f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Nov 2013 21:01:57 +0100 Subject: [PATCH 01/11] NOHZ: Check for nohz active instead of nohz enabled RCU and the fine grained idle time accounting functions check tick_nohz_enabled. But that variable is merily telling that NOHZ has been enabled in the config and not been disabled on the command line. But it does not tell anything about nohz being active. That's what all this should check for. Matthew reported, that the idle accounting on his old P1 machine showed bogus values, when he enabled NOHZ in the config and did not disable it on the kernel command line. The reason is that his machine uses (refined) jiffies as a clocksource which explains why the "fine" grained accounting went into lala land, because it depends on when the system goes and leaves idle relative to the jiffies increment. Provide a tick_nohz_active indicator and let RCU and the accounting code use this instead of tick_nohz_enable. Reported-and-tested-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Paul E. McKenney Cc: john.stultz@linaro.org Cc: mwhitehe@redhat.com Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311132052240.30673@ionos.tec.linutronix.de --- kernel/rcu/tree_plugin.h | 4 ++-- kernel/time/tick-sched.c | 21 +++++++++------------ 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6abb03dff5c0..08a765232432 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_enabled; +extern int tick_nohz_active; /* * Try to advance callbacks for all flavors of RCU on the current CPU, but @@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu) int tne; /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_enabled); + tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc77f834..a12df5abde0b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -361,8 +361,8 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; - +static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -799,11 +799,6 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -973,7 +968,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return; local_irq_disable(); @@ -981,7 +976,7 @@ static void tick_nohz_switch_to_nohz(void) local_irq_enable(); return; } - + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; /* @@ -1139,8 +1134,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } #endif } #endif /* HIGH_RES_TIMERS */ From da554eba2e68c8ec051977db5ee1f42d384a01ed Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 15 Nov 2013 14:15:31 -0800 Subject: [PATCH 02/11] timer: Convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...) Use the helper function instead of __GFP_ZERO. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index 6582b82fa966..accfd241b9e5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + base = kzalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); if (!base) return -ENOMEM; From 050ded1bbaea3331745cf2782315f5bc2582d083 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2013 14:15:33 -0800 Subject: [PATCH 03/11] tick: Document tick_do_timer_cpu Taken straight from a tglx email ;) Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ecdfe0e..162b03ab0ad2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; /* From 1c283531115bda77a9b559271311c1983fbcffaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 8 Oct 2013 16:38:53 +0200 Subject: [PATCH 04/11] ARM: at91: rm9200: switch back to clockevents_config_and_register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The timer code for at91rm9200 was already converted some time ago by Shawn Guo in commit 838a2ae (ARM: use clockevents_config_and_register() where possible) but because of a rounding issue in the timer core this resulted in an easily reproducible oops. So it was reverted (commit b7a8ca5 (ARM: at91: rm9200 fix time support)) which stopped the oops from happening because min_delta_ns is increased by one in arch code which stopped from problem from happening. Now that the timer core problem is fixed (commit a4578ea (clockevents: Sanitize ticks to nsec conversion)), we can switch back to the clockevents_config_and_register helper. Tested-by: Nicolas Ferre Signed-off-by: Uwe Kleine-König Signed-off-by: Daniel Lezcano --- arch/arm/mach-at91/at91rm9200_time.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c index f607deb40f4d..bc7b363a3083 100644 --- a/arch/arm/mach-at91/at91rm9200_time.c +++ b/arch/arm/mach-at91/at91rm9200_time.c @@ -174,7 +174,6 @@ clkevt32k_next_event(unsigned long delta, struct clock_event_device *dev) static struct clock_event_device clkevt = { .name = "at91_tick", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .shift = 32, .rating = 150, .set_next_event = clkevt32k_next_event, .set_mode = clkevt32k_mode, @@ -265,11 +264,9 @@ void __init at91rm9200_timer_init(void) at91_st_write(AT91_ST_RTMR, 1); /* Setup timer clockevent, with minimum of two ticks (important!!) */ - clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift); - clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt); - clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1; clkevt.cpumask = cpumask_of(0); - clockevents_register_device(&clkevt); + clockevents_config_and_register(&clkevt, AT91_SLOW_CLOCK, + 2, AT91_ST_ALMV); /* register clocksource */ clocksource_register_hz(&clk32k, AT91_SLOW_CLOCK); From a4a5fc3b64cd553820d97667056506636eaaba77 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 8 Nov 2013 11:07:59 +0100 Subject: [PATCH 05/11] clocksource: sh_mtu2: Release clock when sh_mtu2_register() fails Fix the probe error path to release the clock resource when the sh_mtu2_register() call fails. Cc: Daniel Lezcano Cc: linux-kernel@vger.kernel.org Signed-off-by: Laurent Pinchart Acked-by: Simon Horman Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_mtu2.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c index 4aac9ee0d0c0..e6cfb328eb2e 100644 --- a/drivers/clocksource/sh_mtu2.c +++ b/drivers/clocksource/sh_mtu2.c @@ -313,8 +313,15 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) goto err1; } - return sh_mtu2_register(p, (char *)dev_name(&p->pdev->dev), - cfg->clockevent_rating); + ret = sh_mtu2_register(p, (char *)dev_name(&p->pdev->dev), + cfg->clockevent_rating); + if (ret < 0) + goto err2; + + return 0; + + err2: + clk_put(p->clk); err1: iounmap(p->mapbase); err0: From bd7549308eed47b3750b3dab692034a553e75663 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 8 Nov 2013 11:07:59 +0100 Subject: [PATCH 06/11] clocksource: sh_mtu2: Add clk_prepare/unprepare support Prepare the clock at probe time, as there is no other appropriate place in the driver where we're allowed to sleep. Cc: Daniel Lezcano Cc: linux-kernel@vger.kernel.org Signed-off-by: Laurent Pinchart Acked-by: Simon Horman Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_mtu2.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c index e6cfb328eb2e..3cf12834681e 100644 --- a/drivers/clocksource/sh_mtu2.c +++ b/drivers/clocksource/sh_mtu2.c @@ -313,13 +313,18 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) goto err1; } - ret = sh_mtu2_register(p, (char *)dev_name(&p->pdev->dev), - cfg->clockevent_rating); + ret = clk_prepare(p->clk); if (ret < 0) goto err2; - return 0; + ret = sh_mtu2_register(p, (char *)dev_name(&p->pdev->dev), + cfg->clockevent_rating); + if (ret < 0) + goto err3; + return 0; + err3: + clk_unprepare(p->clk); err2: clk_put(p->clk); err1: From 394a4486f009a184b58fc2f2435d6f5f800870bb Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 8 Nov 2013 11:07:59 +0100 Subject: [PATCH 07/11] clocksource: sh_tmu: Release clock when sh_tmu_register() fails Fix the probe error path to release the clock resource when the sh_tmu_register() call fails. Cc: Daniel Lezcano Cc: linux-kernel@vger.kernel.org Signed-off-by: Laurent Pinchart Acked-by: Simon Horman Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_tmu.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index 78b8dae49628..15978372c937 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -475,9 +475,16 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) p->cs_enabled = false; p->enable_count = 0; - return sh_tmu_register(p, (char *)dev_name(&p->pdev->dev), - cfg->clockevent_rating, - cfg->clocksource_rating); + ret = sh_tmu_register(p, (char *)dev_name(&p->pdev->dev), + cfg->clockevent_rating, + cfg->clocksource_rating); + if (ret < 0) + goto err2; + + return 0; + + err2: + clk_put(p->clk); err1: iounmap(p->mapbase); err0: From 1c09eb3e2d761ffd152faa6b9d06caf560e7d445 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 8 Nov 2013 11:08:00 +0100 Subject: [PATCH 08/11] clocksource: sh_tmu: Add clk_prepare/unprepare support Prepare the clock at probe time, as there is no other appropriate place in the driver where we're allowed to sleep. Cc: Daniel Lezcano Cc: linux-kernel@vger.kernel.org Signed-off-by: Laurent Pinchart Acked-by: Simon Horman Signed-off-by: Daniel Lezcano --- drivers/clocksource/sh_tmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index 15978372c937..63557cda0a7d 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -472,6 +472,11 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) ret = PTR_ERR(p->clk); goto err1; } + + ret = clk_prepare(p->clk); + if (ret < 0) + goto err2; + p->cs_enabled = false; p->enable_count = 0; @@ -479,10 +484,12 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) cfg->clockevent_rating, cfg->clocksource_rating); if (ret < 0) - goto err2; + goto err3; return 0; + err3: + clk_unprepare(p->clk); err2: clk_put(p->clk); err1: From 77f7ce9a9f636daaa65731a833bae1311a7b80e5 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 20 Nov 2013 12:02:03 -0800 Subject: [PATCH 09/11] clocksource: arm_arch_timer: Hide eventstream Kconfig on non-ARM Pavel Machek reports that this config is exposed on x86 where the ARM architected timers aren't even present. Make it depend on the ARM architected timers being selected so that non-ARM builds aren't asked about it. Reported-by: Pavel Machek Reviewed-by: Pavel Machek Signed-off-by: Stephen Boyd Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index bdb953e15d2a..5c07a56962db 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -87,6 +87,7 @@ config ARM_ARCH_TIMER config ARM_ARCH_TIMER_EVTSTREAM bool "Support for ARM architected timer event stream generation" default y if ARM_ARCH_TIMER + depends on ARM_ARCH_TIMER help This option enables support for event stream generation based on the ARM architected timer. It is used for waking up CPUs executing From 4be77398ac9d948773116b6be4a3c91b3d6ea18c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 22 Nov 2013 11:44:51 -0800 Subject: [PATCH 10/11] time: Fix 1ns/tick drift w/ GENERIC_TIME_VSYSCALL_OLD Since commit 1e75fa8be9f (time: Condense timekeeper.xtime into xtime_sec - merged in v3.6), there has been an problem with the error accounting in the timekeeping code, such that when truncating to nanoseconds, we round up to the next nsec, but the balancing adjustment to the ntp_error value was dropped. This causes 1ns per tick drift forward of the clock. In 3.7, this logic was isolated to only GENERIC_TIME_VSYSCALL_OLD architectures (s390, ia64, powerpc). The fix is simply to balance the accounting and to subtract the added nanosecond from ntp_error. This allows the internal long-term clock steering to keep the clock accurate. While this fix removes the regression added in 1e75fa8be9f, the ideal solution is to move away from GENERIC_TIME_VSYSCALL_OLD and use the new VSYSCALL method, which avoids entirely the nanosecond granular rounding, and the resulting short-term clock adjustment oscillation needed to keep long term accurate time. [ jstultz: Many thanks to Martin for his efforts identifying this subtle bug, and providing the fix. ] Originally-from: Martin Schwidefsky Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Fenghua Yu Cc: Thomas Gleixner Cc: stable #v3.6+ Link: http://lkml.kernel.org/r/1385149491-20307-1-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3abf53418b67..87b4f00284c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) tk->xtime_nsec -= remainder; tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; - + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) From 0e576acbc1d9600cf2d9b4a141a2554639959d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Nov 2013 12:18:13 +0100 Subject: [PATCH 11/11] nohz: Fix another inconsistency between CONFIG_NO_HZ=n and nohz=off If CONFIG_NO_HZ=n tick_nohz_get_sleep_length() returns NSEC_PER_SEC/HZ. If CONFIG_NO_HZ=y and the nohz functionality is disabled via the command line option "nohz=off" or not enabled due to missing hardware support, then tick_nohz_get_sleep_length() returns 0. That happens because ts->sleep_length is never set in that case. Set it to NSEC_PER_SEC/HZ when the NOHZ mode is inactive. Reported-by: Michal Hocko Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a12df5abde0b..ea20f7d1ac2c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; return false; + } if (need_resched()) return false;