2007-05-08 15:27:59 +08:00
/*
* linux / kernel / time / timekeeping . c
*
* Kernel timekeeping code and accessor functions
*
* This code was moved from linux / kernel / timer . c .
* Please see that file for copyright and history logs .
*
*/
2012-09-05 03:12:07 +08:00
# include <linux/timekeeper_internal.h>
2007-05-08 15:27:59 +08:00
# include <linux/module.h>
# include <linux/interrupt.h>
# include <linux/percpu.h>
# include <linux/init.h>
# include <linux/mm.h>
2009-10-07 21:09:06 +08:00
# include <linux/sched.h>
2011-03-24 05:16:04 +08:00
# include <linux/syscore_ops.h>
2007-05-08 15:27:59 +08:00
# include <linux/clocksource.h>
# include <linux/jiffies.h>
# include <linux/time.h>
# include <linux/tick.h>
2009-08-14 21:47:30 +08:00
# include <linux/stop_machine.h>
2012-11-28 09:28:59 +08:00
# include <linux/pvclock_gtod.h>
2014-04-08 06:39:20 +08:00
# include <linux/compiler.h>
2007-05-08 15:27:59 +08:00
2013-02-22 06:51:36 +08:00
# include "tick-internal.h"
2013-03-23 02:31:29 +08:00
# include "ntp_internal.h"
2013-05-22 13:32:14 +08:00
# include "timekeeping_internal.h"
2009-08-14 21:47:26 +08:00
2013-06-27 18:35:45 +08:00
# define TK_CLEAR_NTP (1 << 0)
# define TK_MIRROR (1 << 1)
2013-06-27 18:35:46 +08:00
# define TK_CLOCK_WAS_SET (1 << 2)
2013-06-27 18:35:45 +08:00
2014-07-17 05:04:07 +08:00
/*
* The most important data for readout fits into a single 64 byte
* cache line .
*/
/*
 * tk_core bundles the core timekeeper with the sequence counter that
 * readers in this file sample around their accesses (see the
 * read_seqcount_begin ( ) / read_seqcount_retry ( ) pair on tk_core . seq) .
 */
static struct {
/* sequence counter sampled by readers of the timekeeper below */
seqcount_t seq ;
/* the core timekeeper; code in this file reaches it as & tk_core . timekeeper */
struct timekeeper timekeeper ;
} tk_core ____cacheline_aligned ;
2013-02-22 06:51:38 +08:00
static DEFINE_RAW_SPINLOCK ( timekeeper_lock ) ;
2013-02-22 06:51:40 +08:00
static struct timekeeper shadow_timekeeper ;
2009-08-14 21:47:26 +08:00
2014-07-17 05:05:23 +08:00
/**
* struct tk_fast - NMI safe timekeeper
* @ seq : Sequence counter for protecting updates . The lowest bit
* is the index for the tk_read_base array
* @ base : tk_read_base array . Access is indexed by the lowest bit of
* @ seq .
*
* See @ update_fast_timekeeper ( ) below .
*/
struct tk_fast {
/* latch sequence counter; readers use ( seq & 0x01 ) to select a base [ ] slot */
seqcount_t seq ;
/* two copies of the readout base, rewritten one after the other on update */
struct tk_read_base base [ 2 ] ;
} ;
/* fast timekeeper instance read by ktime_get_mono_fast_ns ( ) */
static struct tk_fast tk_fast_mono ____cacheline_aligned ;
2015-03-19 16:39:08 +08:00
static struct tk_fast tk_fast_raw ____cacheline_aligned ;
2014-07-17 05:05:23 +08:00
2011-11-15 03:46:39 +08:00
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended ;
2012-07-13 13:21:53 +08:00
static inline void tk_normalize_xtime ( struct timekeeper * tk )
{
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = ( ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ) ) {
tk - > tkr_mono . xtime_nsec - = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
tk - > xtime_sec + + ;
}
}
2014-07-17 05:04:05 +08:00
static inline struct timespec64 tk_xtime ( struct timekeeper * tk )
{
struct timespec64 ts ;
ts . tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
ts . tv_nsec = ( long ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-07-17 05:04:05 +08:00
return ts ;
}
2014-07-17 05:04:01 +08:00
static void tk_set_xtime ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
}
2014-07-17 05:04:01 +08:00
static void tk_xtime_add ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec + = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-08-22 08:30:46 +08:00
tk_normalize_xtime ( tk ) ;
2012-07-13 13:21:53 +08:00
}
2011-11-15 03:46:39 +08:00
2014-07-17 05:04:01 +08:00
static void tk_set_wall_to_mono ( struct timekeeper * tk , struct timespec64 wtm )
2012-07-28 02:48:12 +08:00
{
2014-07-17 05:04:01 +08:00
struct timespec64 tmp ;
2012-07-28 02:48:12 +08:00
/*
* Verify consistency of : offset_real = - wall_to_monotonic
* before modifying anything
*/
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - tk - > wall_to_monotonic . tv_sec ,
2012-07-28 02:48:12 +08:00
- tk - > wall_to_monotonic . tv_nsec ) ;
2014-07-17 05:04:01 +08:00
WARN_ON_ONCE ( tk - > offs_real . tv64 ! = timespec64_to_ktime ( tmp ) . tv64 ) ;
2012-07-28 02:48:12 +08:00
tk - > wall_to_monotonic = wtm ;
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - wtm . tv_sec , - wtm . tv_nsec ) ;
tk - > offs_real = timespec64_to_ktime ( tmp ) ;
2013-12-11 09:13:35 +08:00
tk - > offs_tai = ktime_add ( tk - > offs_real , ktime_set ( tk - > tai_offset , 0 ) ) ;
2012-07-28 02:48:12 +08:00
}
2014-07-17 05:05:00 +08:00
/* Accumulate @delta into the timekeeper's offs_boot offset. */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	ktime_t boot_offset = ktime_add(tk->offs_boot, delta);

	tk->offs_boot = boot_offset;
}
2015-03-12 12:16:32 +08:00
# ifdef CONFIG_DEBUG_TIMEKEEPING
2015-03-12 12:16:35 +08:00
# define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
2015-03-12 12:16:32 +08:00
/*
 * Debug-only sanity reporting for a timekeeping update.
 *
 * @tk:     timekeeper whose monotonic clocksource is being checked
 * @offset: cycle offset about to be accumulated
 *
 * Warns when @offset exceeds the clocksource's max_cycles, and emits an
 * informational message when it exceeds half of that. It also reports
 * (rate-limited via WARNING_FREQ) and then clears the underflow_seen /
 * overflow_seen flags recorded by timekeeping_get_delta().
 */
static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
{
	cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
	const char *name = tk->tkr_mono.clock->name;

	if (offset > max_cycles) {
		printk_deferred(" WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger \n ",
				offset, name, max_cycles);
		printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates \n ");
	} else if (offset > (max_cycles >> 1)) {
		/* message previously read "larger than the the '%s' ..." */
		printk_deferred(" INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld) \n ",
				offset, name, max_cycles >> 1);
		printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous \n ");
	}

	if (tk->underflow_seen) {
		/* unsigned jiffies subtraction: print at most once per WARNING_FREQ */
		if (jiffies - tk->last_warning > WARNING_FREQ) {
			printk_deferred(" WARNING: Underflow in clocksource '%s' observed, time update ignored. \n ", name);
			printk_deferred(" Please report this, consider using a different clocksource, if possible. \n ");
			printk_deferred(" Your kernel is probably still fine. \n ");
			tk->last_warning = jiffies;
		}
		/* the flag is consumed even when the message was suppressed */
		tk->underflow_seen = 0;
	}

	if (tk->overflow_seen) {
		if (jiffies - tk->last_warning > WARNING_FREQ) {
			printk_deferred(" WARNING: Overflow in clocksource '%s' observed, time update capped. \n ", name);
			printk_deferred(" Please report this, consider using a different clocksource, if possible. \n ");
			printk_deferred(" Your kernel is probably still fine. \n ");
			tk->last_warning = jiffies;
		}
		tk->overflow_seen = 0;
	}
}
2015-03-12 12:16:33 +08:00
static inline cycle_t timekeeping_get_delta ( struct tk_read_base * tkr )
{
2015-05-14 07:04:47 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2015-03-12 12:16:35 +08:00
cycle_t now , last , mask , max , delta ;
unsigned int seq ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
/*
* Since we ' re called holding a seqlock , the data may shift
* under us while we ' re doing the calculation . This can cause
* false positives , since we ' d note a problem but throw the
* results away . So nest another seqlock here to atomically
* grab the points we are checking with .
*/
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
now = tkr - > read ( tkr - > clock ) ;
last = tkr - > cycle_last ;
mask = tkr - > mask ;
max = tkr - > clock - > max_cycles ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
delta = clocksource_delta ( now , last , mask ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:34 +08:00
/*
* Try to catch underflows by checking if we are seeing small
* mask - relative negative values .
*/
2015-03-12 12:16:35 +08:00
if ( unlikely ( ( ~ delta & mask ) < ( mask > > 3 ) ) ) {
2015-05-14 07:04:47 +08:00
tk - > underflow_seen = 1 ;
2015-03-12 12:16:34 +08:00
delta = 0 ;
2015-03-12 12:16:35 +08:00
}
2015-03-12 12:16:34 +08:00
2015-03-12 12:16:33 +08:00
/* Cap delta value to the max_cycles values to avoid mult overflows */
2015-03-12 12:16:35 +08:00
if ( unlikely ( delta > max ) ) {
2015-05-14 07:04:47 +08:00
tk - > overflow_seen = 1 ;
2015-03-12 12:16:33 +08:00
delta = tkr - > clock - > max_cycles ;
2015-03-12 12:16:35 +08:00
}
2015-03-12 12:16:33 +08:00
return delta ;
}
2015-03-12 12:16:32 +08:00
# else
/* Without CONFIG_DEBUG_TIMEKEEPING there is nothing to check. */
static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
{
	/* intentionally empty */
}
2015-03-12 12:16:33 +08:00
static inline cycle_t timekeeping_get_delta ( struct tk_read_base * tkr )
{
cycle_t cycle_now , delta ;
/* read clocksource */
cycle_now = tkr - > read ( tkr - > clock ) ;
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta ( cycle_now , tkr - > cycle_last , tkr - > mask ) ;
return delta ;
}
2015-03-12 12:16:32 +08:00
# endif
2009-08-14 21:47:26 +08:00
/**
2013-11-28 16:28:55 +08:00
* tk_setup_internals - Set up internals to use clocksource clock .
2009-08-14 21:47:26 +08:00
*
2013-11-28 16:28:55 +08:00
* @ tk : The target timekeeper to setup .
2009-08-14 21:47:26 +08:00
* @ clock : Pointer to clocksource .
*
* Calculates a fixed cycle / nsec interval for a given clocksource / adjustment
* pair and interval request .
*
* Unless you ' re the timekeeping code , you should not be using this !
*/
2012-07-13 13:21:57 +08:00
static void tk_setup_internals ( struct timekeeper * tk , struct clocksource * clock )
2009-08-14 21:47:26 +08:00
{
cycle_t interval ;
2010-10-21 06:55:15 +08:00
u64 tmp , ntpinterval ;
2012-07-13 13:21:53 +08:00
struct clocksource * old_clock ;
2009-08-14 21:47:26 +08:00
2015-03-19 17:09:06 +08:00
old_clock = tk - > tkr_mono . clock ;
tk - > tkr_mono . clock = clock ;
tk - > tkr_mono . read = clock - > read ;
tk - > tkr_mono . mask = clock - > mask ;
tk - > tkr_mono . cycle_last = tk - > tkr_mono . read ( clock ) ;
2009-08-14 21:47:26 +08:00
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . clock = clock ;
tk - > tkr_raw . read = clock - > read ;
tk - > tkr_raw . mask = clock - > mask ;
tk - > tkr_raw . cycle_last = tk - > tkr_mono . cycle_last ;
2009-08-14 21:47:26 +08:00
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH ;
tmp < < = clock - > shift ;
2010-10-21 06:55:15 +08:00
ntpinterval = tmp ;
2009-08-14 21:47:28 +08:00
tmp + = clock - > mult / 2 ;
do_div ( tmp , clock - > mult ) ;
2009-08-14 21:47:26 +08:00
if ( tmp = = 0 )
tmp = 1 ;
interval = ( cycle_t ) tmp ;
2012-07-13 13:21:57 +08:00
tk - > cycle_interval = interval ;
2009-08-14 21:47:26 +08:00
/* Go back from cycles -> shifted ns */
2012-07-13 13:21:57 +08:00
tk - > xtime_interval = ( u64 ) interval * clock - > mult ;
tk - > xtime_remainder = ntpinterval - tk - > xtime_interval ;
tk - > raw_interval =
2009-08-14 21:47:28 +08:00
( ( u64 ) interval * clock - > mult ) > > clock - > shift ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:53 +08:00
/* if changing clocks, convert xtime_nsec shift units */
if ( old_clock ) {
int shift_change = clock - > shift - old_clock - > shift ;
if ( shift_change < 0 )
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec > > = - shift_change ;
2012-07-13 13:21:53 +08:00
else
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec < < = shift_change ;
2012-07-13 13:21:53 +08:00
}
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . xtime_nsec = 0 ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . shift = clock - > shift ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . shift = clock - > shift ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
tk - > ntp_error_shift = NTP_SCALE_SHIFT - clock - > shift ;
2014-04-24 11:53:29 +08:00
tk - > ntp_tick = ntpinterval < < tk - > ntp_error_shift ;
2009-08-14 21:47:28 +08:00
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource . These value will be adjusted via NTP
* to counteract clock drifting .
*/
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . mult = clock - > mult ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . mult = clock - > mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
tk - > ntp_err_mult = 0 ;
2009-08-14 21:47:26 +08:00
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:29 +08:00
/* Timekeeper helper functions. */
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independent mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
# ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
2014-07-17 05:03:50 +08:00
/* Default hook: contributes no additional offset. */
static u32 default_arch_gettimeoffset(void)
{
	return 0;
}

/* Offset hook used by timekeeping_get_ns(); starts out as the default above. */
u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independent mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
# else
2014-07-17 05:03:50 +08:00
/* Without CONFIG_ARCH_USES_GETTIMEOFFSET the extra offset is always zero. */
static inline u32 arch_gettimeoffset(void)
{
	return 0;
}
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independent mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
# endif
2014-07-17 05:05:18 +08:00
static inline s64 timekeeping_get_ns ( struct tk_read_base * tkr )
2009-08-14 21:47:29 +08:00
{
2015-03-12 12:16:33 +08:00
cycle_t delta ;
2012-07-13 13:21:53 +08:00
s64 nsec ;
2009-08-14 21:47:29 +08:00
2015-03-12 12:16:33 +08:00
delta = timekeeping_get_delta ( tkr ) ;
2009-08-14 21:47:29 +08:00
2014-07-17 05:05:18 +08:00
nsec = delta * tkr - > mult + tkr - > xtime_nsec ;
nsec > > = tkr - > shift ;
2012-07-13 13:21:55 +08:00
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independant mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
/* If arch requires, add in get_arch_timeoffset() */
2014-07-17 05:03:50 +08:00
return nsec + arch_gettimeoffset ( ) ;
2009-08-14 21:47:29 +08:00
}
2014-07-17 05:05:23 +08:00
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper .
2015-02-11 12:01:52 +08:00
* @ tkr : Timekeeping readout base from which we take the update
* @ tkf : Fast timekeeper whose two base [ ] copies are refreshed from @ tkr
2014-07-17 05:05:23 +08:00
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself .
*
2015-05-27 09:39:36 +08:00
* Employ the latch technique ; see @ raw_write_seqcount_latch .
2014-07-17 05:05:23 +08:00
*
* So if a NMI hits the update of base [ 0 ] then it will use base [ 1 ]
* which is still consistent . In the worst case this can result in a
* slightly wrong timestamp ( a few nanoseconds ) . See
* @ ktime_get_mono_fast_ns .
*/
2015-03-19 16:36:19 +08:00
static void update_fast_timekeeper ( struct tk_read_base * tkr , struct tk_fast * tkf )
2014-07-17 05:05:23 +08:00
{
2015-03-19 16:36:19 +08:00
struct tk_read_base * base = tkf - > base ;
2014-07-17 05:05:23 +08:00
/* Force readers off to base[1] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[0] */
2015-02-11 12:01:52 +08:00
memcpy ( base , tkr , sizeof ( * base ) ) ;
2014-07-17 05:05:23 +08:00
/* Force readers back to base[0] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[1] */
memcpy ( base + 1 , base , sizeof ( * base ) ) ;
}
/**
* ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
*
* This timestamp is not guaranteed to be monotonic across an update .
* The timestamp is calculated by :
*
* now = base_mono + clock_delta * slope
*
* So if the update lowers the slope , readers who are forced to the
* not yet updated second array are still using the old steeper slope .
*
* tmono
* ^
* | o n
* | o n
* | u
* | o
* | o
* | 12345678 - - - > reader order
*
* o = old slope
* u = update
* n = new slope
*
* So reader 6 will observe time going backwards versus reader 5.
*
* While other CPUs are likely to be able to observe that , the only way
* for a CPU local observation is when an NMI hits in the middle of
* the update . Timestamps taken from that NMI context might be ahead
* of the following timestamps . Callers need to be aware of that and
* deal with it .
*/
2015-03-19 16:36:19 +08:00
/*
 * Common reader for the fast timekeepers: pick the base[] slot selected
 * by the low bit of the latch sequence, compute base + elapsed ns from
 * it, and retry if the sequence moved while reading.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *snap;
	unsigned int seq;
	u64 ns;

	for (;;) {
		seq = raw_read_seqcount_latch(&tkf->seq);
		snap = &tkf->base[seq & 0x01];
		ns = ktime_to_ns(snap->base) + timekeeping_get_ns(snap);

		if (!read_seqcount_retry(&tkf->seq, seq))
			break;
	}

	return ns;
}
2015-03-19 16:36:19 +08:00
/* Fast-path read of the monotonic fast timekeeper (tk_fast_mono). */
u64 ktime_get_mono_fast_ns(void)
{
	struct tk_fast *mono = &tk_fast_mono;

	return __ktime_get_fast_ns(mono);
}
2014-07-17 05:05:23 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_mono_fast_ns ) ;
2015-03-19 16:39:08 +08:00
u64 ktime_get_raw_fast_ns ( void )
{
return __ktime_get_fast_ns ( & tk_fast_raw ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw_fast_ns ) ;
2015-02-13 21:49:02 +08:00
/* Suspend-time cycles value for halted fast timekeeper. */
static cycle_t cycles_at_suspend ;
/*
 * Stand-in clocksource read callback: ignores @cs and always returns
 * the cycle count stored in cycles_at_suspend.
 */
static cycle_t dummy_clock_read(struct clocksource *cs)
{
	const cycle_t frozen = cycles_at_suspend;

	return frozen;
}
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource .
* @ tk : Timekeeper to snapshot .
*
* It generally is unsafe to access the clocksource after timekeeping has been
* suspended , so take a snapshot of the readout base of @ tk and use it as the
* fast timekeeper ' s readout base while suspended . It will return the same
* number of cycles every time until timekeeping is resumed at which time the
* proper readout base for the fast timekeeper will be restored automatically .
*/
static void halt_fast_timekeeper ( struct timekeeper * tk )
{
static struct tk_read_base tkr_dummy ;
2015-03-19 17:09:06 +08:00
struct tk_read_base * tkr = & tk - > tkr_mono ;
2015-02-13 21:49:02 +08:00
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
cycles_at_suspend = tkr - > read ( tkr - > clock ) ;
tkr_dummy . read = dummy_clock_read ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tkr_dummy , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
tkr = & tk - > tkr_raw ;
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
tkr_dummy . read = dummy_clock_read ;
update_fast_timekeeper ( & tkr_dummy , & tk_fast_raw ) ;
2015-02-13 21:49:02 +08:00
}
2014-07-17 05:04:05 +08:00
# ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
static inline void update_vsyscall ( struct timekeeper * tk )
{
2014-08-14 03:47:14 +08:00
struct timespec xt , wm ;
2014-07-17 05:04:05 +08:00
2014-07-24 05:35:39 +08:00
xt = timespec64_to_timespec ( tk_xtime ( tk ) ) ;
2014-08-14 03:47:14 +08:00
wm = timespec64_to_timespec ( tk - > wall_to_monotonic ) ;
2015-03-19 17:09:06 +08:00
update_vsyscall_old ( & xt , & wm , tk - > tkr_mono . clock , tk - > tkr_mono . mult ,
tk - > tkr_mono . cycle_last ) ;
2014-07-17 05:04:05 +08:00
}
static inline void old_vsyscall_fixup ( struct timekeeper * tk )
{
s64 remainder ;
/*
* Store only full nanoseconds into xtime_nsec after rounding
* it up and add the remainder to the error difference .
* XXX - This is necessary to avoid small 1 ns inconsistnecies caused
* by truncating the remainder in vsyscalls . However , it causes
* additional work to be done in timekeeping_adjust ( ) . Once
* the vsyscall implementations are converted to use xtime_nsec
* ( shifted nanoseconds ) , and CONFIG_GENERIC_TIME_VSYSCALL_OLD
* users are removed , this can be killed .
*/
2015-03-19 17:09:06 +08:00
remainder = tk - > tkr_mono . xtime_nsec & ( ( 1ULL < < tk - > tkr_mono . shift ) - 1 ) ;
tk - > tkr_mono . xtime_nsec - = remainder ;
tk - > tkr_mono . xtime_nsec + = 1ULL < < tk - > tkr_mono . shift ;
2014-07-17 05:04:05 +08:00
tk - > ntp_error + = remainder < < tk - > ntp_error_shift ;
2015-03-19 17:09:06 +08:00
tk - > ntp_error - = ( 1ULL < < tk - > tkr_mono . shift ) < < tk - > ntp_error_shift ;
2014-07-17 05:04:05 +08:00
}
# else
# define old_vsyscall_fixup(tk)
# endif
2012-11-28 09:28:59 +08:00
static RAW_NOTIFIER_HEAD ( pvclock_gtod_chain ) ;
2013-06-27 18:35:46 +08:00
/*
 * Run the pvclock_gtod notifier chain, passing @was_set as the
 * notifier action value and @tk as the data pointer.
 */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
	struct raw_notifier_head *chain = &pvclock_gtod_chain;

	raw_notifier_call_chain(chain, was_set, tk);
}
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
*/
int pvclock_gtod_register_notifier ( struct notifier_block * nb )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-11-28 09:28:59 +08:00
unsigned long flags ;
int ret ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
ret = raw_notifier_chain_register ( & pvclock_gtod_chain , nb ) ;
2013-06-27 18:35:46 +08:00
update_pvclock_gtod ( tk , true ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
return ret ;
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_register_notifier ) ;
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
*/
int pvclock_gtod_unregister_notifier ( struct notifier_block * nb )
{
unsigned long flags ;
int ret ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
ret = raw_notifier_chain_unregister ( & pvclock_gtod_chain , nb ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
return ret ;
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_unregister_notifier ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/*
* tk_update_leap_state - helper to update the next_leap_ktime
*/
static inline void tk_update_leap_state ( struct timekeeper * tk )
{
tk - > next_leap_ktime = ntp_get_next_leap ( ) ;
if ( tk - > next_leap_ktime . tv64 ! = KTIME_MAX )
/* Convert to monotonic time */
tk - > next_leap_ktime = ktime_sub ( tk - > next_leap_ktime , tk - > offs_real ) ;
}
2014-07-17 05:04:10 +08:00
/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data ( struct timekeeper * tk )
{
2014-10-29 18:31:16 +08:00
u64 seconds ;
u32 nsec ;
2014-07-17 05:04:10 +08:00
/*
* The xtime based monotonic readout is :
* nsec = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec + now ( ) ;
* The ktime based monotonic readout is :
* nsec = base_mono + now ( ) ;
* = = > base_mono = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec
*/
2014-10-29 18:31:16 +08:00
seconds = ( u64 ) ( tk - > xtime_sec + tk - > wall_to_monotonic . tv_sec ) ;
nsec = ( u32 ) tk - > wall_to_monotonic . tv_nsec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . base = ns_to_ktime ( seconds * NSEC_PER_SEC + nsec ) ;
2014-07-17 05:05:04 +08:00
/* Update the monotonic raw base */
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . base = timespec64_to_ktime ( tk - > raw_time ) ;
2014-10-29 18:31:16 +08:00
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater / equal one second . Take
* this into account before updating tk - > ktime_sec .
*/
2015-03-19 17:09:06 +08:00
nsec + = ( u32 ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-10-29 18:31:16 +08:00
if ( nsec > = NSEC_PER_SEC )
seconds + + ;
tk - > ktime_sec = seconds ;
2014-07-17 05:04:10 +08:00
}
2013-02-22 06:51:38 +08:00
/*
 * timekeeping_update - refresh everything derived from a modified timekeeper
 * @tk:     timekeeper that was just changed
 * @action: bitmask of TK_CLEAR_NTP, TK_MIRROR and TK_CLOCK_WAS_SET
 *
 * TK_CLEAR_NTP zeroes tk->ntp_error and calls ntp_clear().
 * TK_CLOCK_WAS_SET is forwarded to update_pvclock_gtod() and increments
 * tk->clock_was_set_seq.
 * TK_MIRROR copies tk_core.timekeeper into shadow_timekeeper as the very
 * last step.
 *
 * Must hold timekeeper_lock.
 */
2013-06-27 18:35:45 +08:00
static void timekeeping_update ( struct timekeeper * tk , unsigned int action )
2011-11-14 07:19:49 +08:00
{
2013-06-27 18:35:45 +08:00
/* Optionally discard the accumulated NTP error and call ntp_clear() */
if ( action & TK_CLEAR_NTP ) {
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
2011-11-14 07:19:49 +08:00
ntp_clear ( ) ;
}
2013-02-22 06:51:40 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/*
 * Recompute the cached leap time and the ktime bases, then hand the
 * timekeeper to the vsyscall, pvclock and fast-timekeeper update hooks.
 */
tk_update_leap_state ( tk ) ;
2014-07-17 05:04:10 +08:00
tk_update_ktime_data ( tk ) ;
2014-09-06 18:24:49 +08:00
update_vsyscall ( tk ) ;
update_pvclock_gtod ( tk , action & TK_CLOCK_WAS_SET ) ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tk - > tkr_mono , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
update_fast_timekeeper ( & tk - > tkr_raw , & tk_fast_raw ) ;
2015-04-15 05:08:37 +08:00
/* Count every update that was flagged as a clock set */
if ( action & TK_CLOCK_WAS_SET )
tk - > clock_was_set_seq + + ;
2015-06-12 06:54:53 +08:00
/*
* The mirroring of the data to the shadow - timekeeper needs
* to happen last here to ensure we don ' t over - write the
* timekeeper structure on the next update with stale data
*/
if ( action & TK_MIRROR )
memcpy ( & shadow_timekeeper , & tk_core . timekeeper ,
sizeof ( tk_core . timekeeper ) ) ;
2011-11-14 07:19:49 +08:00
}
2007-05-08 15:27:59 +08:00
/**
2009-08-14 21:47:26 +08:00
* timekeeping_forward_now - update clock to the current time
2007-05-08 15:27:59 +08:00
* @tk: timekeeper to forward
*
2008-08-21 07:37:28 +08:00
* Forward the current clock to update its state since the last call to
* update_wall_time ( ) . This is useful before significant clock changes ,
* as it avoids having to deal with this time offset explicitly .
*
* Both tkr_mono and tkr_raw get the new cycle_last ; xtime and raw_time
* are advanced by the elapsed cycles .
2007-05-08 15:27:59 +08:00
*/
2012-07-13 13:21:57 +08:00
static void timekeeping_forward_now ( struct timekeeper * tk )
2007-05-08 15:27:59 +08:00
{
2015-03-19 17:09:06 +08:00
struct clocksource * clock = tk - > tkr_mono . clock ;
2014-07-17 05:05:10 +08:00
cycle_t cycle_now , delta ;
2008-08-21 07:37:28 +08:00
s64 nsec ;
2007-05-08 15:27:59 +08:00
2015-03-19 17:09:06 +08:00
/* Cycles elapsed between the current counter reading and cycle_last */
cycle_now = tk - > tkr_mono . read ( clock ) ;
delta = clocksource_delta ( cycle_now , tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
/* Both read bases restart from the same counter value */
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2007-05-08 15:27:59 +08:00
2015-03-19 17:09:06 +08:00
/* xtime_nsec is kept in shifted units, so no shift is applied here */
tk - > tkr_mono . xtime_nsec + = delta * tk - > tkr_mono . mult ;
2009-05-02 04:10:26 +08:00
time: convert arch_gettimeoffset to a pointer
Currently, whenever CONFIG_ARCH_USES_GETTIMEOFFSET is enabled, each
arch core provides a single implementation of arch_gettimeoffset(). In
many cases, different sub-architectures, different machines, or
different timer providers exist, and so the arch ends up implementing
arch_gettimeoffset() as a call-through-pointer anyway. Examples are
ARM, Cris, M68K, and it's arguable that the remaining architectures,
M32R and Blackfin, should be doing this anyway.
Modify arch_gettimeoffset so that it itself is a function pointer, which
the arch initializes. This will allow later changes to move the
initialization of this function into individual machine support or timer
drivers. This is particularly useful for code in drivers/clocksource
which should rely on an arch-independant mechanism to register their
implementation of arch_gettimeoffset().
This patch also converts the Cris architecture to set arch_gettimeoffset
directly to the final implementation in time_init(), because Cris already
had separate time_init() functions per sub-architecture. M68K and ARM
are converted to set arch_gettimeoffset to the final implementation in
later patches, because they already have function pointers in place for
this purpose.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
2012-11-08 08:58:54 +08:00
/* If arch requires, add in arch_gettimeoffset() (nanoseconds, shifted up) */
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) arch_gettimeoffset ( ) < < tk - > tkr_mono . shift ;
2009-05-02 04:10:26 +08:00
2012-07-13 13:21:57 +08:00
tk_normalize_xtime ( tk ) ;
2008-08-21 07:37:30 +08:00
2015-03-19 16:28:44 +08:00
/* Advance raw_time by the same cycle delta, using the raw mult and shift */
nsec = clocksource_cyc2ns ( delta , tk - > tkr_raw . mult , tk - > tkr_raw . shift ) ;
2014-07-17 05:04:01 +08:00
timespec64_add_ns ( & tk - > raw_time , nsec ) ;
2007-05-08 15:27:59 +08:00
}
/**
2014-07-17 05:04:04 +08:00
* __getnstimeofday64 - Returns the time of day in a timespec64 .
2007-05-08 15:27:59 +08:00
* @ ts : pointer to the timespec to be set
*
2012-11-20 02:26:16 +08:00
* Updates the time of day in the timespec .
* Returns 0 on success , or - ve when suspended ( timespec will be undefined ) .
2007-05-08 15:27:59 +08:00
*/
2014-07-17 05:04:04 +08:00
int __getnstimeofday64 ( struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2007-05-08 15:27:59 +08:00
unsigned long seq ;
2012-07-13 13:21:53 +08:00
s64 nsecs = 0 ;
2007-05-08 15:27:59 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
2012-09-12 07:26:03 +08:00
ts - > tv_nsec = 0 ;
2014-07-17 05:04:04 +08:00
timespec64_add_ns ( ts , nsecs ) ;
2012-11-20 02:26:16 +08:00
/*
* Do not bail out early , in case there were callers still using
* the value , even in the face of the WARN_ON .
*/
if ( unlikely ( timekeeping_suspended ) )
return - EAGAIN ;
return 0 ;
}
2014-07-17 05:04:04 +08:00
EXPORT_SYMBOL ( __getnstimeofday64 ) ;
2012-11-20 02:26:16 +08:00
/**
 * getnstimeofday64 - Read the time of day into a timespec64.
 * @ts:		timespec64 that receives the result
 *
 * Wrapper around __getnstimeofday64() that WARNs when that function
 * reports an error (timekeeping suspended).
 */
void getnstimeofday64(struct timespec64 *ts)
{
	int err = __getnstimeofday64(ts);

	WARN_ON(err);
}
EXPORT_SYMBOL(getnstimeofday64);
2007-05-08 15:27:59 +08:00
2009-07-07 17:27:28 +08:00
ktime_t ktime_get ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
2014-07-17 05:04:12 +08:00
ktime_t base ;
s64 nsecs ;
2009-07-07 17:27:28 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2014-07-17 05:03:53 +08:00
2014-07-17 05:04:12 +08:00
return ktime_add_ns ( base , nsecs ) ;
2009-07-07 17:27:28 +08:00
}
EXPORT_SYMBOL_GPL ( ktime_get ) ;
2015-04-07 19:12:35 +08:00
u32 ktime_get_resolution_ns ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
u32 nsecs ;
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
nsecs = tk - > tkr_mono . mult > > tk - > tkr_mono . shift ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return nsecs ;
}
EXPORT_SYMBOL_GPL ( ktime_get_resolution_ns ) ;
2014-07-17 05:04:13 +08:00
/*
 * Lookup table from enum tk_offsets to the matching offset member of
 * tk_core.timekeeper. Indexed by ktime_get_with_offset() and
 * ktime_mono_to_any().
 */
static ktime_t * offsets [ TK_OFFS_MAX ] = {
[ TK_OFFS_REAL ] = & tk_core . timekeeper . offs_real ,
[ TK_OFFS_BOOT ] = & tk_core . timekeeper . offs_boot ,
[ TK_OFFS_TAI ] = & tk_core . timekeeper . offs_tai ,
} ;
ktime_t ktime_get_with_offset ( enum tk_offsets offs )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base , * offset = offsets [ offs ] ;
s64 nsecs ;
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = ktime_add ( tk - > tkr_mono . base , * offset ) ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2014-07-17 05:04:13 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_with_offset ) ;
2014-07-17 05:04:22 +08:00
/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:	time to convert.
 * @offs:	which offset to use (index into the offsets[] table)
 *
 * Returns @tmono plus the selected offset, with the offset read under
 * the tk_core sequence counter.
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
	ktime_t *delta = offsets[offs];
	unsigned long seq;
	ktime_t converted;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		converted = ktime_add(tmono, *delta);
	} while (read_seqcount_retry(&tk_core.seq, seq));

	return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
2014-07-17 05:05:04 +08:00
/**
* ktime_get_raw - Returns the raw monotonic time in ktime_t format
*/
ktime_t ktime_get_raw ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base ;
s64 nsecs ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 16:28:44 +08:00
base = tk - > tkr_raw . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2014-07-17 05:05:04 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw ) ;
2009-07-07 17:27:28 +08:00
/**
2014-07-17 05:04:04 +08:00
* ktime_get_ts64 - get the monotonic clock in timespec64 format
2009-07-07 17:27:28 +08:00
* @ ts : pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
2014-11-08 05:13:04 +08:00
* in normalized timespec64 format in the variable pointed to by @ ts .
2009-07-07 17:27:28 +08:00
*/
2014-07-17 05:04:04 +08:00
void ktime_get_ts64 ( struct timespec64 * ts )
2009-07-07 17:27:28 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:04 +08:00
struct timespec64 tomono ;
2012-09-12 07:26:03 +08:00
s64 nsec ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2014-07-17 05:04:04 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsec = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2012-07-28 02:48:13 +08:00
tomono = tk - > wall_to_monotonic ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:04 +08:00
ts - > tv_sec + = tomono . tv_sec ;
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsec + tomono . tv_nsec ) ;
2009-07-07 17:27:28 +08:00
}
2014-07-17 05:04:04 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_ts64 ) ;
2009-07-07 17:27:28 +08:00
2014-10-29 18:31:16 +08:00
/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * Returns tk->ktime_sec with a single, non serialized read (no sequence
 * counter loop). tk->ktime_sec is of type 'unsigned long', so this works
 * on both 32 and 64 bit systems; on 32 bit the readout covers ~136 years
 * of uptime, which should be enough to prevent premature wrap arounds.
 *
 * WARNs if timekeeping is suspended.
 */
time64_t ktime_get_seconds(void)
{
	WARN_ON(timekeeping_suspended);

	return tk_core.timekeeper.ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);
2014-10-29 18:31:50 +08:00
/**
* ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
*
* Returns the wall clock seconds since 1970. This replaces the
* get_seconds ( ) interface which is not y2038 safe on 32 bit systems .
*
* For 64 bit systems the fast access to tk - > xtime_sec is preserved . On
* 32 bit systems the access must be protected with the sequence
* counter to provide " atomic " access to the 64 bit tk - > xtime_sec
* value .
*/
time64_t ktime_get_real_seconds ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
time64_t seconds ;
unsigned int seq ;
if ( IS_ENABLED ( CONFIG_64BIT ) )
return tk - > xtime_sec ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
seconds = tk - > xtime_sec ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return seconds ;
}
EXPORT_SYMBOL_GPL ( ktime_get_real_seconds ) ;
2011-01-13 09:00:57 +08:00
#ifdef CONFIG_NTP_PPS

/**
 * getnstime_raw_and_real - get day and raw monotonic time in timespec format
 * @ts_raw:	pointer to the timespec to be set to raw monotonic time
 * @ts_real:	pointer to the timespec to be set to the time of day
 *
 * Both clocks are sampled inside the same sequence counter section, so
 * the two timestamps describe the same instant. WARNs once if
 * timekeeping is suspended.
 */
void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned long seq;
	s64 raw_ns, real_ns;

	WARN_ON_ONCE(timekeeping_suspended);

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		/* Coarse parts first ... */
		*ts_raw = timespec64_to_timespec(tk->raw_time);
		ts_real->tv_sec = tk->xtime_sec;

		/* ... then the nanoseconds elapsed on each read base */
		raw_ns = timekeeping_get_ns(&tk->tkr_raw);
		real_ns = timekeeping_get_ns(&tk->tkr_mono);
	} while (read_seqcount_retry(&tk_core.seq, seq));

	ts_real->tv_nsec = 0;

	timespec_add_ns(ts_raw, raw_ns);
	timespec_add_ns(ts_real, real_ns);
}
EXPORT_SYMBOL(getnstime_raw_and_real);

#endif /* CONFIG_NTP_PPS */
2007-05-08 15:27:59 +08:00
/**
* do_gettimeofday - Returns the time of day in a timeval
* @ tv : pointer to the timeval to be set
*
2008-01-30 20:30:01 +08:00
* NOTE : Users should be converted to using getnstimeofday ( )
2007-05-08 15:27:59 +08:00
*/
void do_gettimeofday ( struct timeval * tv )
{
2014-07-17 05:04:04 +08:00
struct timespec64 now ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:04 +08:00
getnstimeofday64 ( & now ) ;
2007-05-08 15:27:59 +08:00
tv - > tv_sec = now . tv_sec ;
tv - > tv_usec = now . tv_nsec / 1000 ;
}
EXPORT_SYMBOL ( do_gettimeofday ) ;
2012-04-27 16:12:42 +08:00
2007-05-08 15:27:59 +08:00
/**
2014-11-18 19:15:16 +08:00
* do_settimeofday64 - Sets the time of day .
* @ ts : pointer to the timespec64 variable containing the new time
2007-05-08 15:27:59 +08:00
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
2014-11-18 19:15:16 +08:00
int do_settimeofday64 ( const struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-11-18 19:15:16 +08:00
struct timespec64 ts_delta , xt ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2015-06-23 18:38:54 +08:00
int ret = 0 ;
2007-05-08 15:27:59 +08:00
2014-11-18 19:15:16 +08:00
if ( ! timespec64_valid_strict ( ts ) )
2007-05-08 15:27:59 +08:00
return - EINVAL ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2008-08-21 07:37:28 +08:00
2012-07-28 02:48:13 +08:00
xt = tk_xtime ( tk ) ;
2014-11-18 19:15:16 +08:00
ts_delta . tv_sec = ts - > tv_sec - xt . tv_sec ;
ts_delta . tv_nsec = ts - > tv_nsec - xt . tv_nsec ;
2012-07-13 13:21:53 +08:00
2015-06-23 18:38:54 +08:00
if ( timespec64_compare ( & tk - > wall_to_monotonic , & ts_delta ) > 0 ) {
ret = - EINVAL ;
goto out ;
}
2014-07-17 05:04:01 +08:00
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , ts_delta ) ) ;
2007-05-08 15:27:59 +08:00
2014-11-18 19:15:16 +08:00
tk_set_xtime ( tk , ts ) ;
2015-06-23 18:38:54 +08:00
out :
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
/* signal hrtimers about time change */
clock_was_set ( ) ;
2015-06-23 18:38:54 +08:00
return ret ;
2007-05-08 15:27:59 +08:00
}
2014-11-18 19:15:16 +08:00
EXPORT_SYMBOL ( do_settimeofday64 ) ;
2007-05-08 15:27:59 +08:00
2011-02-01 21:52:17 +08:00
/**
* timekeeping_inject_offset - Adds or subtracts from the current time .
* @ tv : pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time .
*/
int timekeeping_inject_offset ( struct timespec * ts )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts64 , tmp ;
2012-08-09 03:36:20 +08:00
int ret = 0 ;
2011-02-01 21:52:17 +08:00
if ( ( unsigned long ) ts - > tv_nsec > = NSEC_PER_SEC )
return - EINVAL ;
2014-07-17 05:04:01 +08:00
ts64 = timespec_to_timespec64 ( * ts ) ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-02-01 21:52:17 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
/* Make sure the proposed value is valid */
2014-07-17 05:04:01 +08:00
tmp = timespec64_add ( tk_xtime ( tk ) , ts64 ) ;
2015-06-23 18:38:54 +08:00
if ( timespec64_compare ( & tk - > wall_to_monotonic , & ts64 ) > 0 | |
! timespec64_valid_strict ( & tmp ) ) {
2012-08-09 03:36:20 +08:00
ret = - EINVAL ;
goto error ;
}
2012-07-13 13:21:53 +08:00
2014-07-17 05:04:01 +08:00
tk_xtime_add ( tk , & ts64 ) ;
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , ts64 ) ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
error : /* even if we error out, we forwarded the time, so call update */
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2011-02-01 21:52:17 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2011-02-01 21:52:17 +08:00
/* signal hrtimers about time change */
clock_was_set ( ) ;
2012-08-09 03:36:20 +08:00
return ret ;
2011-02-01 21:52:17 +08:00
}
EXPORT_SYMBOL ( timekeeping_inject_offset ) ;
2012-05-04 03:30:07 +08:00
/**
* timekeeping_get_tai_offset - Returns current TAI offset from UTC
*
*/
s32 timekeeping_get_tai_offset ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-05-04 03:30:07 +08:00
unsigned int seq ;
s32 ret ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2012-05-04 03:30:07 +08:00
ret = tk - > tai_offset ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2012-05-04 03:30:07 +08:00
return ret ;
}
/**
* __timekeeping_set_tai_offset - Lock free worker function
*
*/
2013-03-26 03:24:24 +08:00
static void __timekeeping_set_tai_offset ( struct timekeeper * tk , s32 tai_offset )
2012-05-04 03:30:07 +08:00
{
tk - > tai_offset = tai_offset ;
2013-12-11 09:13:35 +08:00
tk - > offs_tai = ktime_add ( tk - > offs_real , ktime_set ( tai_offset , 0 ) ) ;
2012-05-04 03:30:07 +08:00
}
/**
 * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
 * @tai_offset:	new TAI offset, in seconds
 *
 * Applies the offset to tk_core.timekeeper under timekeeper_lock and the
 * write side of the sequence counter, then calls clock_was_set() once
 * the lock is dropped.
 */
void timekeeping_set_tai_offset(s32 tai_offset)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&tk_core.seq);

	__timekeeping_set_tai_offset(tk, tai_offset);
	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);

	write_seqcount_end(&tk_core.seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

	clock_was_set();
}
2007-05-08 15:27:59 +08:00
/**
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
*/
2009-08-14 21:47:30 +08:00
static int change_clocksource ( void * data )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2009-04-22 03:24:02 +08:00
struct clocksource * new , * old ;
2012-03-15 07:38:15 +08:00
unsigned long flags ;
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:30 +08:00
new = ( struct clocksource * ) data ;
2007-05-08 15:27:59 +08:00
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2012-03-15 07:38:15 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2013-04-26 04:31:44 +08:00
/*
* If the cs is in module , get a module reference . Succeeds
* for built - in code ( owner = = NULL ) as well .
*/
if ( try_module_get ( new - > owner ) ) {
if ( ! new - > enable | | new - > enable ( new ) = = 0 ) {
2015-03-19 17:09:06 +08:00
old = tk - > tkr_mono . clock ;
2013-04-26 04:31:44 +08:00
tk_setup_internals ( tk , new ) ;
if ( old - > disable )
old - > disable ( old ) ;
module_put ( old - > owner ) ;
} else {
module_put ( new - > owner ) ;
}
2009-08-14 21:47:30 +08:00
}
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2012-03-15 07:38:15 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-03-15 07:38:15 +08:00
2009-08-14 21:47:30 +08:00
return 0 ;
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:30 +08:00
/**
* timekeeping_notify - Install a new clock source
* @ clock : pointer to the clock source
*
* This function is called from clocksource . c after a new , better clock
* source has been registered . The caller holds the clocksource_mutex .
*/
2013-04-26 04:31:44 +08:00
int timekeeping_notify ( struct clocksource * clock )
2009-08-14 21:47:30 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-28 02:48:13 +08:00
2015-03-19 17:09:06 +08:00
if ( tk - > tkr_mono . clock = = clock )
2013-04-26 04:31:44 +08:00
return 0 ;
2009-08-14 21:47:30 +08:00
stop_machine ( change_clocksource , clock , NULL ) ;
2007-05-08 15:27:59 +08:00
tick_clock_notify ( ) ;
2015-03-19 17:09:06 +08:00
return tk - > tkr_mono . clock = = clock ? 0 : - 1 ;
2007-05-08 15:27:59 +08:00
}
2009-08-14 21:47:30 +08:00
2008-08-21 07:37:30 +08:00
/**
2014-11-08 03:03:20 +08:00
* getrawmonotonic64 - Returns the raw monotonic time in a timespec
* @ ts : pointer to the timespec64 to be set
2008-08-21 07:37:30 +08:00
*
* Returns the raw monotonic time ( completely un - modified by ntp )
*/
2014-11-08 03:03:20 +08:00
void getrawmonotonic64 ( struct timespec64 * ts )
2008-08-21 07:37:30 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts64 ;
2008-08-21 07:37:30 +08:00
unsigned long seq ;
s64 nsecs ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 16:28:44 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2014-07-17 05:04:01 +08:00
ts64 = tk - > raw_time ;
2008-08-21 07:37:30 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2008-08-21 07:37:30 +08:00
2014-07-17 05:04:01 +08:00
timespec64_add_ns ( & ts64 , nsecs ) ;
2014-11-08 03:03:20 +08:00
* ts = ts64 ;
2008-08-21 07:37:30 +08:00
}
2014-11-08 03:03:20 +08:00
EXPORT_SYMBOL ( getrawmonotonic64 ) ;
2008-08-21 07:37:30 +08:00
2007-05-08 15:27:59 +08:00
/**
2008-02-08 20:19:24 +08:00
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
2007-05-08 15:27:59 +08:00
*/
2008-02-08 20:19:24 +08:00
int timekeeping_valid_for_hres ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2007-05-08 15:27:59 +08:00
unsigned long seq ;
int ret ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > flags & CLOCK_SOURCE_VALID_FOR_HRES ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
return ret ;
}
2009-08-19 01:45:10 +08:00
/**
* timekeeping_max_deferment - Returns max time the clocksource can be deferred
*/
u64 timekeeping_max_deferment ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 04:48:10 +08:00
unsigned long seq ;
u64 ret ;
2012-07-13 13:21:51 +08:00
2011-11-15 04:48:10 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > max_idle_ns ;
2011-11-15 04:48:10 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2011-11-15 04:48:10 +08:00
return ret ;
2009-08-19 01:45:10 +08:00
}
2007-05-08 15:27:59 +08:00
/**
2009-08-14 21:47:31 +08:00
* read_persistent_clock - Return time from the persistent clock .
2007-05-08 15:27:59 +08:00
*
* Weak dummy function for arches that do not yet support it .
2009-08-14 21:47:31 +08:00
* Reads the time from the battery backed persistent clock .
* Returns a timespec with tv_sec = 0 and tv_nsec = 0 if unsupported .
2007-05-08 15:27:59 +08:00
*
* XXX - Do be sure to remove it once all arches implement it .
*/
2014-04-08 06:39:20 +08:00
void __weak read_persistent_clock ( struct timespec * ts )
2007-05-08 15:27:59 +08:00
{
2009-08-14 21:47:31 +08:00
ts - > tv_sec = 0 ;
ts - > tv_nsec = 0 ;
2007-05-08 15:27:59 +08:00
}
2015-04-02 11:34:22 +08:00
/*
 * read_persistent_clock64 - timespec64 flavour of read_persistent_clock()
 *
 * Weak default: calls read_persistent_clock() and converts the result
 * to a timespec64.
 */
void __weak read_persistent_clock64(struct timespec64 *ts64)
{
	struct timespec legacy;

	read_persistent_clock(&legacy);
	*ts64 = timespec_to_timespec64(legacy);
}
2009-08-14 21:47:32 +08:00
/**
2015-04-09 09:04:42 +08:00
* read_boot_clock64 - Return time of the system start .
2009-08-14 21:47:32 +08:00
*
* Weak dummy function for arches that do not yet support it .
* Function to read the exact time the system has been started .
2015-04-09 09:04:42 +08:00
* Returns a timespec64 with tv_sec = 0 and tv_nsec = 0 if unsupported .
2009-08-14 21:47:32 +08:00
*
* XXX - Do be sure to remove it once all arches implement it .
*/
2015-04-09 09:04:42 +08:00
void __weak read_boot_clock64 ( struct timespec64 * ts )
2009-08-14 21:47:32 +08:00
{
ts - > tv_sec = 0 ;
ts - > tv_nsec = 0 ;
}
2015-04-02 11:34:38 +08:00
/* Flag for if timekeeping_resume() has injected sleeptime */
static bool sleeptime_injected ;
/*
 * Flag for if there is a persistent clock on this platform. Set by
 * timekeeping_init() when read_persistent_clock64() returns a valid,
 * non-zero time.
 */
static bool persistent_clock_exists ;
2007-05-08 15:27:59 +08:00
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*
* Reads the persistent clock and the boot clock, falls back to zero for
* either one when the value fails timespec64_valid_strict(), and then,
* under timekeeper_lock and inside the tk_core.seq write section, sets up
* the default clocksource, the initial wall time and wall_to_monotonic.
*/
void __init timekeeping_init ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2009-08-14 21:47:26 +08:00
struct clocksource * clock ;
2007-05-08 15:27:59 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 now , boot , tmp ;
2013-01-16 00:09:47 +08:00
/* Initial wall time: persistent clock reading, or zero if it is invalid */
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & now ) ;
2014-07-17 05:04:01 +08:00
if ( ! timespec64_valid_strict ( & now ) ) {
2012-08-09 03:36:20 +08:00
pr_warn ( " WARNING: Persistent clock returned invalid value! \n "
" Check your CMOS/BIOS settings. \n " ) ;
now . tv_sec = 0 ;
now . tv_nsec = 0 ;
/* A valid, non-zero reading is taken as proof that a persistent clock exists */
2013-01-16 00:09:47 +08:00
} else if ( now . tv_sec | | now . tv_nsec )
2015-04-02 11:34:38 +08:00
persistent_clock_exists = true ;
2012-08-09 03:36:20 +08:00
/* Boot time: boot clock reading, or zero if it is invalid */
2015-04-02 11:34:21 +08:00
read_boot_clock64 ( & boot ) ;
2014-07-17 05:04:01 +08:00
if ( ! timespec64_valid_strict ( & boot ) ) {
2012-08-09 03:36:20 +08:00
pr_warn ( " WARNING: Boot clock returned invalid value! \n "
" Check your CMOS/BIOS settings. \n " ) ;
boot . tv_sec = 0 ;
boot . tv_nsec = 0 ;
}
2007-05-08 15:27:59 +08:00
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
ntp_init ( ) ;
/* Start from the default clocksource; its enable hook is optional */
2009-08-14 21:47:21 +08:00
clock = clocksource_default_clock ( ) ;
2009-08-14 21:47:19 +08:00
if ( clock - > enable )
clock - > enable ( clock ) ;
2012-07-28 02:48:13 +08:00
tk_setup_internals ( tk , clock ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
tk_set_xtime ( tk , & now ) ;
tk - > raw_time . tv_sec = 0 ;
tk - > raw_time . tv_nsec = 0 ;
/* Without a boot clock reading, use the initial wall time as the boot time */
2012-07-13 13:21:53 +08:00
if ( boot . tv_sec = = 0 & & boot . tv_nsec = = 0 )
2012-07-28 02:48:13 +08:00
boot = tk_xtime ( tk ) ;
2012-07-13 13:21:53 +08:00
/* wall_to_monotonic is set to the negated (normalized) boot time */
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - boot . tv_sec , - boot . tv_nsec ) ;
2012-07-28 02:48:13 +08:00
tk_set_wall_to_mono ( tk , tmp ) ;
2012-07-28 02:48:12 +08:00
2014-07-17 05:04:09 +08:00
timekeeping_update ( tk , TK_MIRROR ) ;
2013-02-22 06:51:40 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
}
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic: in timekeeping_resume(), if there is no
nonstop clocksource for example, the code takes the else leg and
injects an improper sleeptime, which is wrong.
So, we can solve this problem by only running the delta-related code
when a persistent clock exists. Actually, that code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
/*
* Persistent clock reading taken when suspend began (a full timespec64,
* not only seconds). timekeeping_suspend() stores it and may shift it by a
* small drift correction; timekeeping_resume() subtracts it from the
* persistent clock reading taken at resume to obtain the sleep time.
*/
2014-07-17 05:04:01 +08:00
static struct timespec64 timekeeping_suspend_time ;
2007-05-08 15:27:59 +08:00
2011-04-02 05:32:09 +08:00
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
* @tk: timekeeper whose time values are adjusted
* @delta: pointer to a timespec64 delta value
*
* Takes a timespec64 offset measuring a suspend interval and adds the
* sleep offset to the timekeeping variables. An offset rejected by
* timespec64_valid_strict() is ignored after a warning has been logged.
*
* The callers in this file invoke it with timekeeper_lock held and inside
* the tk_core.seq write section.
*/
2012-07-13 13:21:57 +08:00
static void __timekeeping_inject_sleeptime ( struct timekeeper * tk ,
2014-07-17 05:04:01 +08:00
struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:01 +08:00
if ( ! timespec64_valid_strict ( delta ) ) {
2014-06-05 07:11:43 +08:00
printk_deferred ( KERN_WARNING
" __timekeeping_inject_sleeptime: Invalid "
" sleep delta value! \n " ) ;
2011-06-02 09:18:09 +08:00
return ;
}
/* Add the slept interval to the wall time */
2012-07-13 13:21:57 +08:00
tk_xtime_add ( tk , delta ) ;
/* Subtract the same interval from wall_to_monotonic */
2014-07-17 05:04:01 +08:00
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , * delta ) ) ;
/* Hand the interval, as a ktime, to the sleep time accounting */
2014-07-17 05:05:00 +08:00
tk_update_sleep_time ( tk , timespec64_to_ktime ( * delta ) ) ;
2013-05-22 13:32:14 +08:00
tk_debug_account_sleep_time ( delta ) ;
2011-04-02 05:32:09 +08:00
}
2015-04-02 11:34:35 +08:00
# if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
2015-04-02 11:34:38 +08:00
/**
 * timekeeping_rtc_skipresume - report whether rtc_resume() can be skipped
 *
 * Sleep time can be injected from three kinds of time sources; in order
 * of preference:
 *   1) non-stop clocksource
 *   2) persistent clock (ie: RTC accessible when irqs are off)
 *   3) RTC
 *
 * 1) and 2) are used by timekeeping, 3) by the RTC subsystem. 3) is only
 * the last resort for systems that have neither 1) nor 2).
 *
 * Once timekeeping has injected the sleep time via 1) or 2), source 3)
 * is needless and rtc_resume() does not have to be called.
 *
 * Return: true if timekeeping_resume() has injected the sleep time.
 */
bool timekeeping_rtc_skipresume(void)
{
	return sleeptime_injected;
}
/**
 * timekeeping_rtc_skipsuspend - report whether rtc_suspend() can be skipped
 *
 * Whether a non-stop clocksource can be used is only determined in
 * timekeeping_resume(), which is invoked after rtc_suspend(), so having
 * such a clocksource never allows rtc_suspend() to be skipped.
 *
 * A persistent clock, however, will definitely be used when the system
 * has one, and in that case rtc_suspend() does not have to be called.
 *
 * Return: true if a persistent clock has been detected.
 */
bool timekeeping_rtc_skipsuspend(void)
{
	return persistent_clock_exists;
}
2011-04-02 05:32:09 +08:00
/**
2014-11-18 19:15:17 +08:00
* timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
* @ delta : pointer to a timespec64 delta value
2011-04-02 05:32:09 +08:00
*
2015-04-02 11:34:22 +08:00
* This hook is for architectures that cannot support read_persistent_clock64
2011-04-02 05:32:09 +08:00
* because their RTC / persistent clock is only accessible when irqs are enabled .
2015-04-02 11:34:38 +08:00
* and also don ' t have an effective nonstop clocksource .
2011-04-02 05:32:09 +08:00
*
* This function should only be called by rtc_resume ( ) , and allows
* a suspend offset to be injected into the timekeeping values .
*/
2014-11-18 19:15:17 +08:00
void timekeeping_inject_sleeptime64 ( struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2011-04-02 05:32:09 +08:00
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-04-02 05:32:09 +08:00
2014-11-18 19:15:17 +08:00
__timekeeping_inject_sleeptime ( tk , delta ) ;
2011-04-02 05:32:09 +08:00
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2011-04-02 05:32:09 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2011-04-02 05:32:09 +08:00
/* signal hrtimers about time change */
clock_was_set ( ) ;
}
2015-04-02 11:34:35 +08:00
# endif
2011-04-02 05:32:09 +08:00
2007-05-08 15:27:59 +08:00
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem .
*/
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
void timekeeping_resume ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2015-03-19 17:09:06 +08:00
struct clocksource * clock = tk - > tkr_mono . clock ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts_new , ts_delta ;
2013-03-12 11:56:48 +08:00
cycle_t cycle_now , cycle_delta ;
2009-08-14 21:47:31 +08:00
2015-04-02 11:34:38 +08:00
sleeptime_injected = false ;
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & ts_new ) ;
2007-05-08 15:27:59 +08:00
2012-08-06 07:40:41 +08:00
clockevents_resume ( ) ;
2007-05-14 17:10:02 +08:00
clocksource_resume ( ) ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2013-03-12 11:56:48 +08:00
/*
* After system resumes , we need to calculate the suspended time and
* compensate it for the OS time . There are 3 sources that could be
* used : Nonstop clocksource during suspend , persistent clock and rtc
* device .
*
* One specific platform may have 1 or 2 or all of them , and the
* preference will be :
* suspend - nonstop clocksource - > persistent clock - > rtc
* The less preferred source will only be tried if there is no better
* usable source . The rtc part is handled separately in rtc core code .
*/
2015-03-19 17:09:06 +08:00
cycle_now = tk - > tkr_mono . read ( clock ) ;
2013-03-12 11:56:48 +08:00
if ( ( clock - > flags & CLOCK_SOURCE_SUSPEND_NONSTOP ) & &
2015-03-19 17:09:06 +08:00
cycle_now > tk - > tkr_mono . cycle_last ) {
2013-03-12 11:56:48 +08:00
u64 num , max = ULLONG_MAX ;
u32 mult = clock - > mult ;
u32 shift = clock - > shift ;
s64 nsec = 0 ;
2015-03-19 17:09:06 +08:00
cycle_delta = clocksource_delta ( cycle_now , tk - > tkr_mono . cycle_last ,
tk - > tkr_mono . mask ) ;
2013-03-12 11:56:48 +08:00
/*
* " cycle_delta * mutl " may cause 64 bits overflow , if the
* suspended time is too long . In that case we need do the
* 64 bits math carefully
*/
do_div ( max , mult ) ;
if ( cycle_delta > max ) {
num = div64_u64 ( cycle_delta , max ) ;
nsec = ( ( ( u64 ) max * mult ) > > shift ) * num ;
cycle_delta - = num * max ;
}
nsec + = ( ( u64 ) cycle_delta * mult ) > > shift ;
2014-07-17 05:04:01 +08:00
ts_delta = ns_to_timespec64 ( nsec ) ;
2015-04-02 11:34:38 +08:00
sleeptime_injected = true ;
2014-07-17 05:04:01 +08:00
} else if ( timespec64_compare ( & ts_new , & timekeeping_suspend_time ) > 0 ) {
ts_delta = timespec64_sub ( ts_new , timekeeping_suspend_time ) ;
2015-04-02 11:34:38 +08:00
sleeptime_injected = true ;
2007-05-08 15:27:59 +08:00
}
2013-03-12 11:56:48 +08:00
2015-04-02 11:34:38 +08:00
if ( sleeptime_injected )
2013-03-12 11:56:48 +08:00
__timekeeping_inject_sleeptime ( tk , & ts_delta ) ;
/* Re-base the last cycle value */
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2012-07-28 02:48:13 +08:00
tk - > ntp_error = 0 ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 0 ;
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
touch_softlockup_watchdog ( ) ;
2015-03-25 20:09:16 +08:00
tick_resume ( ) ;
2011-05-02 22:48:57 +08:00
hrtimers_resume ( ) ;
2007-05-08 15:27:59 +08:00
}
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
int timekeeping_suspend ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 delta , delta_delta ;
static struct timespec64 old_delta ;
2007-05-08 15:27:59 +08:00
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & timekeeping_suspend_time ) ;
2007-09-16 21:36:43 +08:00
2013-05-18 02:24:05 +08:00
/*
* On some systems the persistent_clock can not be detected at
* timekeeping_init by its return value , so if we see a valid
* value returned , update the persistent_clock_exists flag .
*/
if ( timekeeping_suspend_time . tv_sec | | timekeeping_suspend_time . tv_nsec )
2015-04-02 11:34:38 +08:00
persistent_clock_exists = true ;
2013-05-18 02:24:05 +08:00
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 1 ;
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
2015-04-02 11:34:38 +08:00
if ( persistent_clock_exists ) {
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
/*
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
* To avoid drift caused by repeated suspend / resumes ,
* which each can add ~ 1 second drift error ,
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant .
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
*/
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
delta = timespec64_sub ( tk_xtime ( tk ) , timekeeping_suspend_time ) ;
delta_delta = timespec64_sub ( delta , old_delta ) ;
if ( abs ( delta_delta . tv_sec ) > = 2 ) {
/*
* if delta_delta is too large , assume time correction
* has occurred and set old_delta to the current delta .
*/
old_delta = delta ;
} else {
/* Otherwise try to adjust old_system to compensate */
timekeeping_suspend_time =
timespec64_add ( timekeeping_suspend_time , delta_delta ) ;
}
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
}
2013-12-12 11:10:36 +08:00
timekeeping_update ( tk , TK_MIRROR ) ;
2015-02-13 21:49:02 +08:00
halt_fast_timekeeper ( tk ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
2015-03-25 20:09:16 +08:00
tick_suspend ( ) ;
2010-02-03 06:41:41 +08:00
clocksource_suspend ( ) ;
2012-08-06 07:40:41 +08:00
clockevents_suspend ( ) ;
2007-05-08 15:27:59 +08:00
return 0 ;
}
/* sysfs resume/suspend bits for timekeeping */
2011-03-24 05:16:04 +08:00
static struct syscore_ops timekeeping_syscore_ops = {
2007-05-08 15:27:59 +08:00
. resume = timekeeping_resume ,
. suspend = timekeeping_suspend ,
} ;
2011-03-24 05:16:04 +08:00
/* Registers the timekeeping syscore callbacks; run as a device initcall. */
static int __init timekeeping_init_ops(void)
{
	register_syscore_ops(&timekeeping_syscore_ops);

	return 0;
}
device_initcall(timekeeping_init_ops);
2007-05-08 15:27:59 +08:00
/*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* Apply a multiplier adjustment to the timekeeper
2007-05-08 15:27:59 +08:00
*/
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
							 s64 offset,
							 bool negative,
							 int adj_scale)
{
	s64 interval = tk->cycle_interval;
	s32 mult_adj = 1;

	/*
	 * Scale the magnitudes first and apply the sign afterwards.
	 * Left-shifting a negative signed value is undefined behaviour
	 * in C, so the shift is done while the values are still the
	 * unsigned-magnitude form handed in by the caller.
	 */
	mult_adj <<= adj_scale;
	interval <<= adj_scale;
	offset <<= adj_scale;

	if (negative) {
		mult_adj = -mult_adj;
		interval = -interval;
		offset = -offset;
	}

	/*
	 * So the following can be confusing.
	 *
	 * To keep things simple, let's assume mult_adj == 1 for now.
	 *
	 * When mult_adj != 1, remember that the interval and offset values
	 * have been appropriately scaled so the math is the same.
	 *
	 * The basic idea here is that we're increasing the multiplier
	 * by one, this causes the xtime_interval to be incremented by
	 * one cycle_interval. This is because:
	 *	xtime_interval = cycle_interval * mult
	 * So if mult is being incremented by one:
	 *	xtime_interval = cycle_interval * (mult + 1)
	 * It's the same as:
	 *	xtime_interval = (cycle_interval * mult) + cycle_interval
	 * Which can be shortened to:
	 *	xtime_interval += cycle_interval
	 *
	 * So offset stores the non-accumulated cycles. Thus the current
	 * time (in shifted nanoseconds) is:
	 *	now = (offset * adj) + xtime_nsec
	 * Now, even though we're adjusting the clock frequency, we have
	 * to keep time consistent. In other words, we can't jump back
	 * in time, and we also want to avoid jumping forward in time.
	 *
	 * So given the same offset value, we need the time to be the same
	 * both before and after the freq adjustment.
	 *	now = (offset * adj_1) + xtime_nsec_1
	 *	now = (offset * adj_2) + xtime_nsec_2
	 * So:
	 *	(offset * adj_1) + xtime_nsec_1 =
	 *		(offset * adj_2) + xtime_nsec_2
	 * And we know:
	 *	adj_2 = adj_1 + 1
	 * So:
	 *	(offset * adj_1) + xtime_nsec_1 =
	 *		(offset * (adj_1 + 1)) + xtime_nsec_2
	 *	(offset * adj_1) + xtime_nsec_1 =
	 *		(offset * adj_1) + offset + xtime_nsec_2
	 * Canceling the sides:
	 *	xtime_nsec_1 = offset + xtime_nsec_2
	 * Which gives us:
	 *	xtime_nsec_2 = xtime_nsec_1 - offset
	 * Which simplifies to:
	 *	xtime_nsec -= offset
	 *
	 * XXX - TODO: Doc ntp_error calculation.
	 */
	if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
		/* NTP adjustment caused clocksource mult overflow */
		WARN_ON_ONCE(1);
		return;
	}

	tk->tkr_mono.mult += mult_adj;
	tk->xtime_interval += interval;
	tk->tkr_mono.xtime_nsec -= offset;
	tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}
/*
 * Calculate the multiplier adjustment needed to match the frequency
 * specified by NTP.
 *
 * @tk:     timekeeper whose multiplier is to be adjusted
 * @offset: passed through unchanged to timekeeping_apply_adjustment()
 *
 * Records the current NTP tick length in tk->ntp_tick, works out the
 * per-tick error between that length and the timekeeper's own interval,
 * and, when the error falls outside [0, cycle_interval], applies a
 * power-of-two scaled multiplier adjustment in the matching direction.
 */
static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
						   s64 offset)
{
	s64 interval = tk->cycle_interval;
	s64 xinterval = tk->xtime_interval;
	s64 tick_error;
	bool negative;
	u32 adj;

	/* Remove any current error adj from freq calculation */
	if (tk->ntp_err_mult)
		xinterval -= tk->cycle_interval;

	tk->ntp_tick = ntp_tick_length();

	/* Calculate current error per tick */
	tick_error = ntp_tick_length() >> tk->ntp_error_shift;
	tick_error -= (xinterval + tk->xtime_remainder);

	/* Don't worry about correcting it if it's small */
	if (likely((tick_error >= 0) && (tick_error <= interval)))
		return;

	/* Preserve the direction of correction */
	negative = (tick_error < 0);

	/*
	 * Sort out the magnitude of the correction.
	 *
	 * tick_error is a 64-bit value, so take its magnitude with an
	 * explicit 64-bit negation; abs() operates on int/long and could
	 * truncate the value where long is 32 bits wide.
	 */
	if (negative)
		tick_error = -tick_error;

	/* Find the smallest shift that brings the error within one interval */
	for (adj = 0; tick_error > interval; adj++)
		tick_error >>= 1;

	/* Scale the corrections */
	timekeeping_apply_adjustment(tk, offset, negative, adj);
}
/*
 * Bring the timekeeper's multiplier in line with the requested frequency
 * and nudge it by a single unit to work off any accumulated error.
 *
 * @tk:     timekeeper to adjust
 * @offset: handed on to the frequency and error adjustment helpers
 */
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
	s64 nsec;

	/* Step one: deal with the current frequency error */
	timekeeping_freqadjust(tk, offset);

	/* Step two: a one-unit tweak against the cumulative error */
	if (!tk->ntp_err_mult) {
		if (tk->ntp_error > 0) {
			/* Flag the extra unit before applying it */
			tk->ntp_err_mult = 1;
			timekeeping_apply_adjustment(tk, offset, false, 0);
		}
	} else if (tk->ntp_error <= 0) {
		/* Error is gone: take the extra unit back out again */
		timekeeping_apply_adjustment(tk, offset, true, 0);
		tk->ntp_err_mult = 0;
	}

	/* Complain (once) if we drifted further from the base mult than maxadj */
	if (unlikely(tk->tkr_mono.clock->maxadj &&
		     (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
		      > tk->tkr_mono.clock->maxadj))) {
		printk_once(KERN_WARNING
			    "Adjusting %s more than 11%% (%ld vs %ld)\n",
			    tk->tkr_mono.clock->name,
			    (long)tk->tkr_mono.mult,
			    (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
	}

	/*
	 * xtime_nsec may have been tiny when we got here, and speeding the
	 * clocksource up slightly above subtracts a corrective amount from
	 * it, so it can end up below zero.
	 *
	 * The second has already been accumulated and the NTP subsystem has
	 * been told about it via second_overflow, so rolling the second back
	 * is not an option. Instead, clamp xtime_nsec to zero and book the
	 * amount we fell short by as error, to be corrected on a later pass
	 * when xtime_nsec has more headroom.
	 */
	nsec = (s64)tk->tkr_mono.xtime_nsec;
	if (unlikely(nsec < 0)) {
		tk->tkr_mono.xtime_nsec = 0;
		tk->ntp_error += (-nsec) << tk->ntp_error_shift;
	}
}
2012-07-13 13:21:54 +08:00
/**
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
* Helper function that accumulates a the nsecs greater then a second
* from the xtime_nsec field to the xtime_secs field .
* It also calls into the NTP code to handle leapsecond processing .
*
*/
2013-06-27 18:35:46 +08:00
static inline unsigned int accumulate_nsecs_to_secs ( struct timekeeper * tk )
2012-07-13 13:21:54 +08:00
{
2015-03-19 17:09:06 +08:00
u64 nsecps = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2012-07-13 13:21:54 +08:00
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = nsecps ) {
2012-07-13 13:21:54 +08:00
int leap ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec - = nsecps ;
2012-07-13 13:21:54 +08:00
tk - > xtime_sec + + ;
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow ( tk - > xtime_sec ) ;
2012-07-28 02:48:12 +08:00
if ( unlikely ( leap ) ) {
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2012-07-28 02:48:12 +08:00
tk - > xtime_sec + = leap ;
2012-07-13 13:21:54 +08:00
2012-07-28 02:48:12 +08:00
ts . tv_sec = leap ;
ts . tv_nsec = 0 ;
tk_set_wall_to_mono ( tk ,
2014-07-17 05:04:01 +08:00
timespec64_sub ( tk - > wall_to_monotonic , ts ) ) ;
2012-07-28 02:48:12 +08:00
2012-05-04 03:30:07 +08:00
__timekeeping_set_tai_offset ( tk , tk - > tai_offset - leap ) ;
2013-12-12 12:07:49 +08:00
clock_set = TK_CLOCK_WAS_SET ;
2012-07-28 02:48:12 +08:00
}
2012-07-13 13:21:54 +08:00
}
2013-12-12 12:07:49 +08:00
return clock_set ;
2012-07-13 13:21:54 +08:00
}
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
/**
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
* into a shifted interval nanoseconds . Allows for O ( log ) accumulation
* loop .
*
* Returns the unconsumed cycles .
*/
2012-07-13 13:21:57 +08:00
static cycle_t logarithmic_accumulation ( struct timekeeper * tk , cycle_t offset ,
2013-12-12 12:07:49 +08:00
u32 shift ,
unsigned int * clock_set )
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
{
2013-02-22 06:51:36 +08:00
cycle_t interval = tk - > cycle_interval < < shift ;
2010-08-10 05:20:09 +08:00
u64 raw_nsecs ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2012-07-13 13:21:57 +08:00
/* If the offset is smaller then a shifted interval, do nothing */
2013-02-22 06:51:36 +08:00
if ( offset < interval )
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
return offset ;
/* Accumulate one shifted interval */
2013-02-22 06:51:36 +08:00
offset - = interval ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last + = interval ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last + = interval ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = tk - > xtime_interval < < shift ;
2013-12-12 12:07:49 +08:00
* clock_set | = accumulate_nsecs_to_secs ( tk ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2010-08-10 05:20:09 +08:00
/* Accumulate raw time */
2012-10-09 15:18:23 +08:00
raw_nsecs = ( u64 ) tk - > raw_interval < < shift ;
2012-07-13 13:21:57 +08:00
raw_nsecs + = tk - > raw_time . tv_nsec ;
2010-08-14 02:30:58 +08:00
if ( raw_nsecs > = NSEC_PER_SEC ) {
u64 raw_secs = raw_nsecs ;
raw_nsecs = do_div ( raw_secs , NSEC_PER_SEC ) ;
2012-07-13 13:21:57 +08:00
tk - > raw_time . tv_sec + = raw_secs ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
}
2012-07-13 13:21:57 +08:00
tk - > raw_time . tv_nsec = raw_nsecs ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
/* Accumulate error between NTP and clock interval */
2014-04-24 11:53:29 +08:00
tk - > ntp_error + = tk - > ntp_tick < < shift ;
2012-07-13 13:21:57 +08:00
tk - > ntp_error - = ( tk - > xtime_interval + tk - > xtime_remainder ) < <
( tk - > ntp_error_shift + shift ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
return offset ;
}
2007-05-08 15:27:59 +08:00
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
*/
2013-12-13 05:10:55 +08:00
void update_wall_time ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * real_tk = & tk_core . timekeeper ;
2013-02-22 06:51:40 +08:00
struct timekeeper * tk = & shadow_timekeeper ;
2007-05-08 15:27:59 +08:00
cycle_t offset ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
int shift = 0 , maxshift ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2011-11-15 04:48:10 +08:00
unsigned long flags ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
/* Make sure we're fully resumed: */
if ( unlikely ( timekeeping_suspended ) )
2011-11-15 04:48:10 +08:00
goto out ;
2007-05-08 15:27:59 +08:00
2010-07-14 08:56:20 +08:00
# ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
2013-02-22 06:51:40 +08:00
offset = real_tk - > cycle_interval ;
2010-07-14 08:56:20 +08:00
# else
2015-03-19 17:09:06 +08:00
offset = clocksource_delta ( tk - > tkr_mono . read ( tk - > tkr_mono . clock ) ,
tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
2007-05-08 15:27:59 +08:00
# endif
2012-08-22 08:30:49 +08:00
/* Check if there's really nothing to do */
2013-02-22 06:51:40 +08:00
if ( offset < real_tk - > cycle_interval )
2012-08-22 08:30:49 +08:00
goto out ;
2015-03-12 12:16:32 +08:00
/* Do some additional sanity checking */
timekeeping_check_update ( real_tk , offset ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* ( think " ticks " ) worth of time at once . To do this efficiently ,
* we calculate the largest doubling multiple of cycle_intervals
2012-03-15 11:28:56 +08:00
* that is smaller than the offset . We then accumulate that
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* chunk in one go , and then try to consume the next smaller
* doubled multiple .
2007-05-08 15:27:59 +08:00
*/
2012-07-28 02:48:13 +08:00
shift = ilog2 ( offset ) - ilog2 ( tk - > cycle_interval ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = max ( 0 , shift ) ;
2012-03-15 11:28:56 +08:00
/* Bound shift to one less than what overflows tick_length */
2011-11-15 05:18:07 +08:00
maxshift = ( 64 - ( ilog2 ( ntp_tick_length ( ) ) + 1 ) ) - 1 ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = min ( shift , maxshift ) ;
2012-07-28 02:48:13 +08:00
while ( offset > = tk - > cycle_interval ) {
2013-12-12 12:07:49 +08:00
offset = logarithmic_accumulation ( tk , offset , shift ,
& clock_set ) ;
2012-07-28 02:48:13 +08:00
if ( offset < tk - > cycle_interval < < shift )
2010-03-19 05:47:30 +08:00
shift - - ;
2007-05-08 15:27:59 +08:00
}
/* correct the clock when NTP error is too big */
2012-07-28 02:48:13 +08:00
timekeeping_adjust ( tk , offset ) ;
2007-05-08 15:27:59 +08:00
2010-04-07 05:30:51 +08:00
/*
2012-09-05 03:38:12 +08:00
* XXX This can be killed once everyone converts
* to the new update_vsyscall .
*/
old_vsyscall_fixup ( tk ) ;
2007-05-08 15:27:59 +08:00
2010-04-07 05:30:51 +08:00
/*
* Finally , make sure that after the rounding
2012-07-13 13:21:53 +08:00
* xtime_nsec isn ' t larger than NSEC_PER_SEC
2010-04-07 05:30:51 +08:00
*/
2013-12-12 12:07:49 +08:00
clock_set | = accumulate_nsecs_to_secs ( tk ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-02-22 06:51:40 +08:00
/*
* Update the real timekeeper .
*
* We could avoid this memcpy by switching pointers , but that
* requires changes to all other timekeeper usage sites as
* well , i . e . move the timekeeper pointer getter into the
* spinlocked / seqcount protected sections . And we trade this
2014-07-17 05:04:07 +08:00
* memcpy under the tk_core . seq against one before we start
2013-02-22 06:51:40 +08:00
* updating .
*/
timekeeping: Copy the shadow-timekeeper over the real timekeeper last
The fix in d151832650ed9 (time: Move clock_was_set_seq update
before updating shadow-timekeeper) was unfortunately incomplete.
The main gist of that change was to do the shadow-copy update
last, so that any state changes were properly duplicated, and
we wouldn't accidentally have stale data in the shadow.
Unfortunately, in the main update_wall_time() logic, we
use the shadow-timekeeper to calculate the next update values,
then while holding the lock, copy the shadow-timekeeper over,
then call timekeeping_update() to do some additional
bookkeeping, (skipping the shadow mirror). The bug with this is
the additional bookkeeping isn't all read-only, and some
changes timkeeper state. Thus we might then overwrite this state
change on the next update.
To avoid this problem, do the timekeeping_update() on the
shadow-timekeeper prior to copying the full state over to
the real-timekeeper.
This avoids problems with both the clock_was_set_seq and
next_leap_ktime being overwritten and possibly the
fast-timekeepers as well.
Many thanks to Prarit for his rigorous testing, which discovered
this problem, along with Prarit and Daniel's work validating this
fix.
Reported-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-18 01:05:53 +08:00
timekeeping_update ( tk , clock_set ) ;
2013-02-22 06:51:40 +08:00
memcpy ( real_tk , tk , sizeof ( * tk ) ) ;
timekeeping: Copy the shadow-timekeeper over the real timekeeper last
The fix in d151832650ed9 (time: Move clock_was_set_seq update
before updating shadow-timekeeper) was unfortunately incomplete.
The main gist of that change was to do the shadow-copy update
last, so that any state changes were properly duplicated, and
we wouldn't accidentally have stale data in the shadow.
Unfortunately in the main update_wall_time() logic, we
use the shadow-timekeeper to calculate the next update values,
then while holding the lock, copy the shadow-timekeeper over,
then call timekeeping_update() to do some additional
bookkeeping (skipping the shadow mirror). The bug with this is
the additional bookkeeping isn't all read-only, and some of it
changes timekeeper state. Thus we might then overwrite this state
change on the next update.
To avoid this problem, do the timekeeping_update() on the
shadow-timekeeper prior to copying the full state over to
the real-timekeeper.
This avoids problems with both the clock_was_set_seq and
next_leap_ktime being overwritten and possibly the
fast-timekeepers as well.
Many thanks to Prarit for his rigorous testing, which discovered
this problem, along with Prarit and Daniel's work validating this
fix.
Reported-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-18 01:05:53 +08:00
/* The memcpy must come last. Do not put anything here! */
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:40 +08:00
out :
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2013-12-13 05:10:55 +08:00
if ( clock_set )
2014-03-28 07:30:49 +08:00
/* Have to call _delayed version, since in irq context*/
clock_was_set_delayed ( ) ;
2007-05-08 15:27:59 +08:00
}
2007-07-16 14:39:41 +08:00
/**
2014-12-09 04:00:09 +08:00
* getboottime64 - Return the real time of system boot .
* @ ts : pointer to the timespec64 to be set
2007-07-16 14:39:41 +08:00
*
2014-12-09 04:00:09 +08:00
* Returns the wall - time of boot in a timespec64 .
2007-07-16 14:39:41 +08:00
*
* This is based on the wall_to_monotonic offset and the total suspend
* time . Calls to settimeofday will affect the value returned ( which
* basically means that however wrong your real time clock is at boot time ,
* you get the right time here ) .
*/
2014-12-09 04:00:09 +08:00
void getboottime64 ( struct timespec64 * ts )
2007-07-16 14:39:41 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:58 +08:00
/*
 * The value is computed as offs_real minus offs_boot, read directly
 * from tk_core.timekeeper. Unlike current_kernel_time64(), no
 * tk_core.seq read/retry loop is used around these reads.
 */
ktime_t t = ktime_sub ( tk - > offs_real , tk - > offs_boot ) ;
2014-12-09 04:00:09 +08:00
/* Convert the ktime_t difference into the caller's timespec64. */
* ts = ktime_to_timespec64 ( t ) ;
2007-07-16 14:39:41 +08:00
}
2014-12-09 04:00:09 +08:00
EXPORT_SYMBOL_GPL ( getboottime64 ) ;
2007-07-16 14:39:41 +08:00
2007-07-25 09:38:34 +08:00
/*
 * get_seconds - return the xtime_sec field of the core timekeeper.
 *
 * The field is read directly from tk_core.timekeeper; this function does
 * not take timekeeper_lock and does not use a tk_core.seq retry loop.
 * The result is returned as unsigned long.
 */
unsigned long get_seconds ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-28 02:48:13 +08:00
return tk - > xtime_sec ;
2007-07-25 09:38:34 +08:00
}
EXPORT_SYMBOL ( get_seconds ) ;
2009-08-20 10:13:34 +08:00
/*
 * __current_kernel_time - return tk_xtime() of the core timekeeper as a
 * struct timespec.
 *
 * Unlike current_kernel_time64(), this variant does not wrap the read in
 * a tk_core.seq begin/retry loop.
 * NOTE(review): presumably callers are expected to provide their own
 * serialization against timekeeper updates - confirm against call sites.
 */
struct timespec __current_kernel_time ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-28 02:48:13 +08:00
2014-07-17 05:04:01 +08:00
/* tk_xtime() yields a timespec64; narrow it to struct timespec. */
return timespec64_to_timespec ( tk_xtime ( tk ) ) ;
2009-08-20 10:13:34 +08:00
}
2007-07-25 09:38:34 +08:00
2015-07-29 20:09:43 +08:00
struct timespec64 current_kernel_time64 ( void )
2007-07-25 08:47:43 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:01 +08:00
struct timespec64 now ;
2007-07-25 08:47:43 +08:00
unsigned long seq ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2012-07-28 02:48:13 +08:00
now = tk_xtime ( tk ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-07-25 08:47:43 +08:00
2015-07-29 20:09:43 +08:00
return now ;
2007-07-25 08:47:43 +08:00
}
2015-07-29 20:09:43 +08:00
EXPORT_SYMBOL ( current_kernel_time64 ) ;
2009-08-20 10:13:34 +08:00
2014-11-08 03:20:40 +08:00
struct timespec64 get_monotonic_coarse64 ( void )
2009-08-20 10:13:34 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:01 +08:00
struct timespec64 now , mono ;
2009-08-20 10:13:34 +08:00
unsigned long seq ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2012-07-28 02:48:13 +08:00
now = tk_xtime ( tk ) ;
mono = tk - > wall_to_monotonic ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-08-20 10:13:34 +08:00
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & now , now . tv_sec + mono . tv_sec ,
2009-08-20 10:13:34 +08:00
now . tv_nsec + mono . tv_nsec ) ;
2014-07-17 05:04:01 +08:00
2014-11-08 03:20:40 +08:00
return now ;
2009-08-20 10:13:34 +08:00
}
2011-01-27 22:58:55 +08:00
/*
2012-02-29 08:50:11 +08:00
* Must hold jiffies_lock
*
* do_timer - account for elapsed ticks.
* @ticks: number of ticks to add.
*
* Adds @ticks to jiffies_64 and then passes the same count to
* calc_global_load().
2011-01-27 22:58:55 +08:00
*/
void do_timer ( unsigned long ticks )
{
jiffies_64 + = ticks ;
calc_global_load ( ticks ) ;
}
2011-01-27 22:59:05 +08:00
2012-07-11 06:43:24 +08:00
/**
2014-07-17 05:03:52 +08:00
* ktime_get_update_offsets_now - hrtimer helper
2015-04-15 05:08:37 +08:00
* @ cwsseq : pointer to check and store the clock was set sequence number
2012-07-11 06:43:24 +08:00
* @ offs_real : pointer to storage for monotonic - > realtime offset
* @ offs_boot : pointer to storage for monotonic - > boottime offset
2013-10-18 09:13:30 +08:00
* @ offs_tai : pointer to storage for monotonic - > clock tai offset
2012-07-11 06:43:24 +08:00
*
2015-04-15 05:08:37 +08:00
* Returns current monotonic time and updates the offsets if the
* sequence number in @ cwsseq and timekeeper . clock_was_set_seq are
* different .
*
2013-10-18 09:13:30 +08:00
* Called from hrtimer_interrupt ( ) or retrigger_next_event ( )
2012-07-11 06:43:24 +08:00
*/
2015-04-15 05:08:37 +08:00
ktime_t ktime_get_update_offsets_now ( unsigned int * cwsseq , ktime_t * offs_real ,
ktime_t * offs_boot , ktime_t * offs_tai )
2012-07-11 06:43:24 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-11 06:43:24 +08:00
unsigned int seq ;
2014-07-17 05:04:19 +08:00
ktime_t base ;
u64 nsecs ;
2012-07-11 06:43:24 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2012-07-11 06:43:24 +08:00
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
base = ktime_add_ns ( base , nsecs ) ;
2015-04-15 05:08:37 +08:00
if ( * cwsseq ! = tk - > clock_was_set_seq ) {
* cwsseq = tk - > clock_was_set_seq ;
* offs_real = tk - > offs_real ;
* offs_boot = tk - > offs_boot ;
* offs_tai = tk - > offs_tai ;
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Handle leapsecond insertion adjustments */
if ( unlikely ( base . tv64 > = tk - > next_leap_ktime . tv64 ) )
* offs_real = ktime_sub ( tk - > offs_real , ktime_set ( 1 , 0 ) ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2012-07-11 06:43:24 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
return base ;
2012-07-11 06:43:24 +08:00
}
2013-03-23 02:31:29 +08:00
/**
* do_adjtimex ( ) - Accessor function to NTP __do_adjtimex function
*/
int do_adjtimex ( struct timex * txc )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2013-03-23 02:37:28 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2013-04-11 03:41:49 +08:00
s32 orig_tai , tai ;
2013-03-23 03:08:52 +08:00
int ret ;
/* Validate the data before disabling interrupts */
ret = ntp_validate_timex ( txc ) ;
if ( ret )
return ret ;
2013-03-23 06:04:13 +08:00
if ( txc - > modes & ADJ_SETOFFSET ) {
struct timespec delta ;
delta . tv_sec = txc - > time . tv_sec ;
delta . tv_nsec = txc - > time . tv_usec ;
if ( ! ( txc - > modes & ADJ_NANO ) )
delta . tv_nsec * = 1000 ;
ret = timekeeping_inject_offset ( & delta ) ;
if ( ret )
return ret ;
}
2014-07-17 05:04:04 +08:00
getnstimeofday64 ( & ts ) ;
2013-03-23 03:28:15 +08:00
2013-03-23 02:37:28 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
2013-04-11 03:41:49 +08:00
orig_tai = tai = tk - > tai_offset ;
2013-03-23 03:28:15 +08:00
ret = __do_adjtimex ( txc , & ts , & tai ) ;
2013-03-23 02:31:29 +08:00
2013-04-11 03:41:49 +08:00
if ( tai ! = orig_tai ) {
__timekeeping_set_tai_offset ( tk , tai ) ;
2013-12-12 10:50:25 +08:00
timekeeping_update ( tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2013-04-11 03:41:49 +08:00
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
tk_update_leap_state ( tk ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
timekeeping: Avoid possible deadlock from clock_was_set_delayed
As part of normal operations, the hrtimer subsystem frequently calls
into the timekeeping code, creating a locking order of
hrtimer locks -> timekeeping locks
clock_was_set_delayed() was supposed to allow us to avoid deadlocks
between the timekeeping and the hrtimer subsystem, so that we could
notify the hrtimer subsystem the time had changed while holding
the timekeeping locks. This was done by scheduling delayed work
that would run later once we were out of the timekeeping code.
But unfortunately the lock chains are complex enough that in
scheduling delayed work, we end up eventually trying to grab
an hrtimer lock.
Sasha Levin noticed this in testing when the new seqlock lockdep
enablement triggered the following (somewhat abbreviated) message:
[ 251.100221] ======================================================
[ 251.100221] [ INFO: possible circular locking dependency detected ]
[ 251.100221] 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053 Not tainted
[ 251.101967] -------------------------------------------------------
[ 251.101967] kworker/10:1/4506 is trying to acquire lock:
[ 251.101967] (timekeeper_seq){----..}, at: [<ffffffff81160e96>] retrigger_next_event+0x56/0x70
[ 251.101967]
[ 251.101967] but task is already holding lock:
[ 251.101967] (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] which lock already depends on the new lock.
[ 251.101967]
[ 251.101967]
[ 251.101967] the existing dependency chain (in reverse order) is:
[ 251.101967]
-> #5 (hrtimer_bases.lock#11){-.-...}:
[snipped]
-> #4 (&rt_b->rt_runtime_lock){-.-...}:
[snipped]
-> #3 (&rq->lock){-.-.-.}:
[snipped]
-> #2 (&p->pi_lock){-.-.-.}:
[snipped]
-> #1 (&(&pool->lock)->rlock){-.-...}:
[ 251.101967] [<ffffffff81194803>] validate_chain+0x6c3/0x7b0
[ 251.101967] [<ffffffff81194d9d>] __lock_acquire+0x4ad/0x580
[ 251.101967] [<ffffffff81194ff2>] lock_acquire+0x182/0x1d0
[ 251.101967] [<ffffffff84398500>] _raw_spin_lock+0x40/0x80
[ 251.101967] [<ffffffff81153e69>] __queue_work+0x1a9/0x3f0
[ 251.101967] [<ffffffff81154168>] queue_work_on+0x98/0x120
[ 251.101967] [<ffffffff81161351>] clock_was_set_delayed+0x21/0x30
[ 251.101967] [<ffffffff811c4bd1>] do_adjtimex+0x111/0x160
[ 251.101967] [<ffffffff811e2711>] compat_sys_adjtimex+0x41/0x70
[ 251.101967] [<ffffffff843a4b49>] ia32_sysret+0x0/0x5
[ 251.101967]
-> #0 (timekeeper_seq){----..}:
[snipped]
[ 251.101967] other info that might help us debug this:
[ 251.101967]
[ 251.101967] Chain exists of:
timekeeper_seq --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock#11
[ 251.101967] Possible unsafe locking scenario:
[ 251.101967]
[ 251.101967] CPU0 CPU1
[ 251.101967] ---- ----
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(&rt_b->rt_runtime_lock);
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(timekeeper_seq);
[ 251.101967]
[ 251.101967] *** DEADLOCK ***
[ 251.101967]
[ 251.101967] 3 locks held by kworker/10:1/4506:
[ 251.101967] #0: (events){.+.+.+}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #1: (hrtimer_work){+.+...}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #2: (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] stack backtrace:
[ 251.101967] CPU: 10 PID: 4506 Comm: kworker/10:1 Not tainted 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053
[ 251.101967] Workqueue: events clock_was_set_work
So the best solution is to avoid calling clock_was_set_delayed() while
holding the timekeeping lock, and instead using a flag variable to
decide if we should call clock_was_set() once we've released the locks.
This works for the case here, where the do_adjtimex() was the deadlock
trigger point. Unfortunately, in update_wall_time() we still hold
the jiffies lock, which would deadlock with the ipi triggered by
clock_was_set(), preventing us from calling it even after we drop the
timekeeping lock. So instead call clock_was_set_delayed() at that point.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: stable <stable@vger.kernel.org> #3.10+
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-11 09:18:18 +08:00
if ( tai ! = orig_tai )
clock_was_set ( ) ;
2013-09-12 07:50:56 +08:00
ntp_notify_cmos_timer ( ) ;
2013-03-23 03:28:15 +08:00
return ret ;
}
2013-03-23 02:31:29 +08:00
# ifdef CONFIG_NTP_PPS
/**
* hardpps ( ) - Accessor function to NTP __hardpps function
* @phase_ts: passed through unchanged to __hardpps()
* @raw_ts: passed through unchanged to __hardpps()
*
* Only built when CONFIG_NTP_PPS is defined. The call to __hardpps() is
* made with timekeeper_lock held (interrupt state saved and restored) and
* inside a tk_core.seq write section.
*/
void hardpps ( const struct timespec * phase_ts , const struct timespec * raw_ts )
{
2013-03-23 02:37:28 +08:00
unsigned long flags ;
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
2013-03-23 02:31:29 +08:00
__hardpps ( phase_ts , raw_ts ) ;
2013-03-23 02:37:28 +08:00
2014-07-17 05:04:07 +08:00
/* Release in the reverse order of acquisition: seqcount, then lock. */
write_seqcount_end ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2013-03-23 02:31:29 +08:00
}
EXPORT_SYMBOL ( hardpps ) ;
# endif
2011-01-27 22:59:10 +08:00
/**
* xtime_update ( ) - advances the timekeeping infrastructure
* @ ticks : number of ticks , that have elapsed since the last call .
*
* Must be called with interrupts disabled .
*
* do_timer() runs inside the jiffies_lock write section;
* update_wall_time() is called only after that section has been left.
*/
void xtime_update ( unsigned long ticks )
{
2012-02-29 08:50:11 +08:00
write_seqlock ( & jiffies_lock ) ;
2011-01-27 22:59:10 +08:00
do_timer ( ticks ) ;
2012-02-29 08:50:11 +08:00
write_sequnlock ( & jiffies_lock ) ;
2013-12-13 05:10:55 +08:00
/* Called after jiffies_lock has been released. */
update_wall_time ( ) ;
2011-01-27 22:59:10 +08:00
}