2018-11-01 02:21:09 +08:00
// SPDX-License-Identifier: GPL-2.0
2007-05-08 15:27:59 +08:00
/*
2018-11-01 02:21:08 +08:00
* Kernel timekeeping code and accessor functions . Based on code from
* timer . c , moved in commit 8524070 b7982 .
2007-05-08 15:27:59 +08:00
*/
2012-09-05 03:12:07 +08:00
# include <linux/timekeeper_internal.h>
2007-05-08 15:27:59 +08:00
# include <linux/module.h>
# include <linux/interrupt.h>
# include <linux/percpu.h>
# include <linux/init.h>
# include <linux/mm.h>
2017-02-09 01:51:31 +08:00
# include <linux/nmi.h>
2009-10-07 21:09:06 +08:00
# include <linux/sched.h>
2017-02-08 15:45:17 +08:00
# include <linux/sched/loadavg.h>
2018-07-20 04:55:34 +08:00
# include <linux/sched/clock.h>
2011-03-24 05:16:04 +08:00
# include <linux/syscore_ops.h>
2007-05-08 15:27:59 +08:00
# include <linux/clocksource.h>
# include <linux/jiffies.h>
# include <linux/time.h>
timekeeping: Add raw clock fallback for random_get_entropy()
The addition of random_get_entropy_fallback() provides access to
whichever time source has the highest frequency, which is useful for
gathering entropy on platforms without available cycle counters. It's
not necessarily as good as being able to quickly access a cycle counter
that the CPU has, but it's still something, even when it falls back to
being jiffies-based.
In the event that a given arch does not define get_cycles(), falling
back to the get_cycles() default implementation that returns 0 is really
not the best we can do. Instead, at least calling
random_get_entropy_fallback() would be preferable, because that always
needs to return _something_, even falling back to jiffies eventually.
It's not as though random_get_entropy_fallback() is super high precision
or guaranteed to be entropic, but basically anything that's not zero all
the time is better than returning zero all the time.
Finally, since random_get_entropy_fallback() is used during extremely
early boot when randomizing freelists in mm_init(), it can be called
before timekeeping has been initialized. In that case there really is
nothing we can do; jiffies hasn't even started ticking yet. So just give
up and return 0.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Theodore Ts'o <tytso@mit.edu>
2022-04-10 22:49:50 +08:00
# include <linux/timex.h>
2007-05-08 15:27:59 +08:00
# include <linux/tick.h>
2009-08-14 21:47:30 +08:00
# include <linux/stop_machine.h>
2012-11-28 09:28:59 +08:00
# include <linux/pvclock_gtod.h>
2014-04-08 06:39:20 +08:00
# include <linux/compiler.h>
2019-04-10 17:14:19 +08:00
# include <linux/audit.h>
2022-07-18 05:53:34 +08:00
# include <linux/random.h>
2007-05-08 15:27:59 +08:00
2013-02-22 06:51:36 +08:00
# include "tick-internal.h"
2013-03-23 02:31:29 +08:00
# include "ntp_internal.h"
2013-05-22 13:32:14 +08:00
# include "timekeeping_internal.h"
2009-08-14 21:47:26 +08:00
2013-06-27 18:35:45 +08:00
# define TK_CLEAR_NTP (1 << 0)
# define TK_MIRROR (1 << 1)
2013-06-27 18:35:46 +08:00
# define TK_CLOCK_WAS_SET (1 << 2)
2013-06-27 18:35:45 +08:00
2024-10-09 16:29:07 +08:00
# define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET)
2018-06-04 21:34:21 +08:00
/* Modes in which the timekeeper can be advanced. */
enum timekeeping_adv_mode {
/* Update timekeeper when a tick has passed */
TK_ADV_TICK ,
/* Update timekeeper on a direct frequency change */
TK_ADV_FREQ
} ;
2014-07-17 05:04:07 +08:00
/*
* The most important data for readout fits into a single 64 byte
* cache line .
*/
2024-10-09 16:29:03 +08:00
struct tk_data {
2020-07-20 23:55:23 +08:00
seqcount_raw_spinlock_t seq ;
2014-07-17 05:04:07 +08:00
struct timekeeper timekeeper ;
2024-10-09 16:29:00 +08:00
struct timekeeper shadow_timekeeper ;
2024-10-09 16:29:02 +08:00
raw_spinlock_t lock ;
2024-10-09 16:29:03 +08:00
} ____cacheline_aligned ;
static struct tk_data tk_core ;
2009-08-14 21:47:26 +08:00
2020-08-14 18:19:34 +08:00
/*
 * Flag telling whether timekeeping is suspended. While it is non-zero,
 * dummy_clock_read() returns the cycle value saved at suspend time.
 */
int __read_mostly timekeeping_suspended ;
2014-07-17 05:05:23 +08:00
/**
* struct tk_fast - NMI safe timekeeper
* @ seq : Sequence counter for protecting updates . The lowest bit
* is the index for the tk_read_base array
* @ base : tk_read_base array . Access is indexed by the lowest bit of
* @ seq .
*
* See @ update_fast_timekeeper ( ) below .
*/
struct tk_fast {
2020-08-27 19:40:41 +08:00
seqcount_latch_t seq ;
2014-07-17 05:05:23 +08:00
struct tk_read_base base [ 2 ] ;
} ;
2017-08-28 20:21:53 +08:00
/* Suspend-time cycles value for halted fast timekeeper. */
static u64 cycles_at_suspend ;
static u64 dummy_clock_read ( struct clocksource * cs )
{
2020-08-14 18:19:34 +08:00
if ( timekeeping_suspended )
return cycles_at_suspend ;
return local_clock ( ) ;
2017-08-28 20:21:53 +08:00
}
static struct clocksource dummy_clock = {
. read = dummy_clock_read ,
} ;
2020-08-14 18:19:34 +08:00
/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed then
 * the fast time keepers are updated with the correct values.
 */
#define FAST_TK_INIT						\
	{							\
		.clock		= &dummy_clock,			\
		.mask		= CLOCKSOURCE_MASK(64),		\
		.mult		= 1,				\
		.shift		= 0,				\
	}
2017-08-28 20:21:53 +08:00
/* Fast timekeeper read by ktime_get_mono_fast_ns() */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
	.seq	= SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
	.base	= {
		[0] = FAST_TK_INIT,
		[1] = FAST_TK_INIT,
	},
};
/* Fast timekeeper read by ktime_get_raw_fast_ns() */
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
	.seq	= SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
	.base	= {
		[0] = FAST_TK_INIT,
		[1] = FAST_TK_INIT,
	},
};
2014-07-17 05:05:23 +08:00
2024-10-09 16:29:01 +08:00
unsigned long timekeeper_lock_irqsave ( void )
{
unsigned long flags ;
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2024-10-09 16:29:01 +08:00
return flags ;
}
void timekeeper_unlock_irqrestore ( unsigned long flags )
{
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2024-10-09 16:29:01 +08:00
}
timekeeping: Add interfaces for handling timestamps with a floor value
Multigrain timestamps allow the kernel to use fine-grained timestamps when
an inode's attributes are being actively observed via ->getattr(). With
this support, it's possible for a file to get a fine-grained timestamp, and
another modified after it to get a coarse-grained stamp that is earlier
than the fine-grained time. If this happens then the files can appear to
have been modified in reverse order, which breaks VFS ordering guarantees
[1].
To prevent this, maintain a floor value for multigrain timestamps.
Whenever a fine-grained timestamp is handed out, record it, and when later
coarse-grained stamps are handed out, ensure they are not earlier than that
value. If the coarse-grained timestamp is earlier than the fine-grained
floor, return the floor value instead.
Add a static singleton atomic64_t into timekeeper.c that is used to keep
track of the latest fine-grained time ever handed out. This is tracked as a
monotonic ktime_t value to ensure that it isn't affected by clock
jumps. Because it is updated at different times than the rest of the
timekeeper object, the floor value is managed independently of the
timekeeper via a cmpxchg() operation, and sits on its own cacheline.
Add two new public interfaces:
- ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
coarse-grained clock and the floor time
- ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
to swap it into the floor. A timespec64 is filled with the result.
The floor value is global and updated via a single try_cmpxchg(). If
that fails then the operation raced with a concurrent update. Any
concurrent update must be later than the existing floor value, so any
racing tasks can accept any resulting floor value without retrying.
[1]: POSIX requires that files be stamped with realtime clock values, and
makes no provision for dealing with backward clock jumps. If a backward
realtime clock jump occurs, then files can appear to have been modified
in reverse order.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org
2024-10-03 05:27:16 +08:00
/*
 * Multigrain timestamps require tracking the latest fine-grained timestamp
 * that has been issued, and never returning a coarse-grained timestamp that
 * is earlier than that value.
 *
 * mg_floor represents the latest fine-grained time that has been handed out
 * as a file timestamp on the system. This is tracked as a monotonic ktime_t,
 * and converted to a realtime clock value on an as-needed basis.
 *
 * Maintaining mg_floor ensures the multigrain interfaces never issue a
 * timestamp earlier than one that has been previously issued.
 *
 * The exception to this rule is when there is a backward realtime clock
 * jump. If such an event occurs, a timestamp can appear to be earlier than
 * a previous one.
 */
static __cacheline_aligned_in_smp atomic64_t mg_floor ;
2012-07-13 13:21:53 +08:00
static inline void tk_normalize_xtime ( struct timekeeper * tk )
{
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = ( ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ) ) {
tk - > tkr_mono . xtime_nsec - = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
tk - > xtime_sec + + ;
}
2017-05-23 08:20:20 +08:00
while ( tk - > tkr_raw . xtime_nsec > = ( ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ) ) {
tk - > tkr_raw . xtime_nsec - = ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ;
tk - > raw_sec + + ;
}
2012-07-13 13:21:53 +08:00
}
2018-07-13 20:06:42 +08:00
static inline struct timespec64 tk_xtime ( const struct timekeeper * tk )
2014-07-17 05:04:05 +08:00
{
struct timespec64 ts ;
ts . tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
ts . tv_nsec = ( long ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-07-17 05:04:05 +08:00
return ts ;
}
2014-07-17 05:04:01 +08:00
static void tk_set_xtime ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
}
2014-07-17 05:04:01 +08:00
static void tk_xtime_add ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec + = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-08-22 08:30:46 +08:00
tk_normalize_xtime ( tk ) ;
2012-07-13 13:21:53 +08:00
}
2011-11-15 03:46:39 +08:00
2014-07-17 05:04:01 +08:00
static void tk_set_wall_to_mono ( struct timekeeper * tk , struct timespec64 wtm )
2012-07-28 02:48:12 +08:00
{
2014-07-17 05:04:01 +08:00
struct timespec64 tmp ;
2012-07-28 02:48:12 +08:00
/*
* Verify consistency of : offset_real = - wall_to_monotonic
* before modifying anything
*/
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - tk - > wall_to_monotonic . tv_sec ,
2012-07-28 02:48:12 +08:00
- tk - > wall_to_monotonic . tv_nsec ) ;
2016-12-25 18:38:40 +08:00
WARN_ON_ONCE ( tk - > offs_real ! = timespec64_to_ktime ( tmp ) ) ;
2012-07-28 02:48:12 +08:00
tk - > wall_to_monotonic = wtm ;
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - wtm . tv_sec , - wtm . tv_nsec ) ;
2024-09-11 01:43:34 +08:00
/* Paired with READ_ONCE() in ktime_mono_to_any() */
WRITE_ONCE ( tk - > offs_real , timespec64_to_ktime ( tmp ) ) ;
WRITE_ONCE ( tk - > offs_tai , ktime_add ( tk - > offs_real , ktime_set ( tk - > tai_offset , 0 ) ) ) ;
2012-07-28 02:48:12 +08:00
}
2014-07-17 05:05:00 +08:00
/*
 * Add @delta to the boot offset and refresh the timespec64 copy of it.
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	ktime_t boot_offset = ktime_add(tk->offs_boot, delta);

	/* Paired with READ_ONCE() in ktime_mono_to_any() */
	WRITE_ONCE(tk->offs_boot, boot_offset);

	/*
	 * Keep a timespec representation for the VDSO update, which avoids
	 * a 64 bit division on every update.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
2017-06-09 07:44:20 +08:00
/*
* tk_clock_read - atomic clocksource read ( ) helper
*
* This helper is necessary to use in the read paths because , while the
2020-07-20 23:55:23 +08:00
* seqcount ensures we don ' t return a bad value while structures are updated ,
2017-06-09 07:44:20 +08:00
* it doesn ' t protect from potential crashes . There is the possibility that
* the tkr ' s clocksource may change between the read reference , and the
* clock reference passed to the read function . This can cause crashes if
* the wrong clocksource is passed to the wrong read function .
2024-10-09 16:29:02 +08:00
* This isn ' t necessary to use when holding the tk_core . lock or doing
2017-06-09 07:44:20 +08:00
* a read of the fast - timekeeper tkrs ( which is protected by its own locking
* and update logic ) .
*/
2018-07-13 20:06:42 +08:00
static inline u64 tk_clock_read ( const struct tk_read_base * tkr )
2017-06-09 07:44:20 +08:00
{
struct clocksource * clock = READ_ONCE ( tkr - > clock ) ;
return clock - > read ( clock ) ;
}
2015-03-12 12:16:32 +08:00
# ifdef CONFIG_DEBUG_TIMEKEEPING
2015-03-12 12:16:35 +08:00
# define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
2016-12-22 03:32:01 +08:00
/*
 * Debug check for a timekeeper update.
 *
 * @tk:		timekeeper being updated
 * @offset:	cycle offset of this update
 *
 * Complains when @offset exceeds the max_cycles value of the tkr_mono
 * clocksource, or more than half of it, and reports the underflow and
 * overflow flags of @tk. Each of the two flag reports is printed only when
 * more than WARNING_FREQ jiffies passed since tk->last_warning; the flag
 * itself is cleared on every call.
 *
 * The cycle values are u64 and therefore printed with %llu.
 */
static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
	u64 max_cycles = tk->tkr_mono.clock->max_cycles;
	const char *name = tk->tkr_mono.clock->name;

	if (offset > max_cycles) {
		printk_deferred("WARNING: timekeeping: Cycle offset (%llu) is larger than allowed by the '%s' clock's max_cycles value (%llu): time overflow danger\n",
				offset, name, max_cycles);
		printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
	} else if (offset > (max_cycles >> 1)) {
		printk_deferred("INFO: timekeeping: Cycle offset (%llu) is larger than the '%s' clock's 50%% safety margin (%llu)\n",
				offset, name, max_cycles >> 1);
		printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
	}

	if (tk->underflow_seen) {
		if (jiffies - tk->last_warning > WARNING_FREQ) {
			printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
			printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
			printk_deferred(" Your kernel is probably still fine.\n");
			tk->last_warning = jiffies;
		}
		tk->underflow_seen = 0;
	}

	if (tk->overflow_seen) {
		if (jiffies - tk->last_warning > WARNING_FREQ) {
			printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
			printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
			printk_deferred(" Your kernel is probably still fine.\n");
			tk->last_warning = jiffies;
		}
		tk->overflow_seen = 0;
	}
}
2015-03-12 12:16:33 +08:00
2024-03-25 14:40:18 +08:00
static inline u64 timekeeping_cycles_to_ns ( const struct tk_read_base * tkr , u64 cycles ) ;
static inline u64 timekeeping_debug_get_ns ( const struct tk_read_base * tkr )
2015-03-12 12:16:33 +08:00
{
2015-05-14 07:04:47 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2016-12-22 03:32:01 +08:00
u64 now , last , mask , max , delta ;
2015-03-12 12:16:35 +08:00
unsigned int seq ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
/*
2020-07-20 23:55:23 +08:00
* Since we ' re called holding a seqcount , the data may shift
2015-03-12 12:16:35 +08:00
* under us while we ' re doing the calculation . This can cause
* false positives , since we ' d note a problem but throw the
2020-07-20 23:55:23 +08:00
* results away . So nest another seqcount here to atomically
2015-03-12 12:16:35 +08:00
* grab the points we are checking with .
*/
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( tkr ) ;
2015-03-12 12:16:35 +08:00
last = tkr - > cycle_last ;
mask = tkr - > mask ;
max = tkr - > clock - > max_cycles ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
delta = clocksource_delta ( now , last , mask ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:34 +08:00
/*
* Try to catch underflows by checking if we are seeing small
* mask - relative negative values .
*/
2024-03-25 14:40:22 +08:00
if ( unlikely ( ( ~ delta & mask ) < ( mask > > 3 ) ) )
2015-05-14 07:04:47 +08:00
tk - > underflow_seen = 1 ;
2015-03-12 12:16:34 +08:00
2024-03-25 14:40:22 +08:00
/* Check for multiplication overflows */
if ( unlikely ( delta > max ) )
2015-05-14 07:04:47 +08:00
tk - > overflow_seen = 1 ;
2015-03-12 12:16:33 +08:00
2024-03-25 14:40:22 +08:00
/* timekeeping_cycles_to_ns() handles both under and overflow */
2024-03-25 14:40:18 +08:00
return timekeeping_cycles_to_ns ( tkr , now ) ;
2015-03-12 12:16:33 +08:00
}
2015-03-12 12:16:32 +08:00
# else
2016-12-22 03:32:01 +08:00
/* Without CONFIG_DEBUG_TIMEKEEPING there is nothing to check. */
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
2024-03-25 14:40:18 +08:00
/*
 * Stub for !CONFIG_DEBUG_TIMEKEEPING. timekeeping_get_ns() only calls the
 * debug variant when IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING), so ending up
 * here is a bug.
 */
static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
	BUG();
}
2015-03-12 12:16:32 +08:00
# endif
2009-08-14 21:47:26 +08:00
/**
2013-11-28 16:28:55 +08:00
* tk_setup_internals - Set up internals to use clocksource clock .
2009-08-14 21:47:26 +08:00
*
2013-11-28 16:28:55 +08:00
* @ tk : The target timekeeper to setup .
2009-08-14 21:47:26 +08:00
* @ clock : Pointer to clocksource .
*
* Calculates a fixed cycle / nsec interval for a given clocksource / adjustment
* pair and interval request .
*
* Unless you ' re the timekeeping code , you should not be using this !
*/
2012-07-13 13:21:57 +08:00
static void tk_setup_internals ( struct timekeeper * tk , struct clocksource * clock )
2009-08-14 21:47:26 +08:00
{
2016-12-22 03:32:01 +08:00
u64 interval ;
2010-10-21 06:55:15 +08:00
u64 tmp , ntpinterval ;
2012-07-13 13:21:53 +08:00
struct clocksource * old_clock ;
2009-08-14 21:47:26 +08:00
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
+ + tk - > cs_was_changed_seq ;
2015-03-19 17:09:06 +08:00
old_clock = tk - > tkr_mono . clock ;
tk - > tkr_mono . clock = clock ;
tk - > tkr_mono . mask = clock - > mask ;
2017-06-09 07:44:20 +08:00
tk - > tkr_mono . cycle_last = tk_clock_read ( & tk - > tkr_mono ) ;
2009-08-14 21:47:26 +08:00
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . clock = clock ;
tk - > tkr_raw . mask = clock - > mask ;
tk - > tkr_raw . cycle_last = tk - > tkr_mono . cycle_last ;
2009-08-14 21:47:26 +08:00
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH ;
tmp < < = clock - > shift ;
2010-10-21 06:55:15 +08:00
ntpinterval = tmp ;
2009-08-14 21:47:28 +08:00
tmp + = clock - > mult / 2 ;
do_div ( tmp , clock - > mult ) ;
2009-08-14 21:47:26 +08:00
if ( tmp = = 0 )
tmp = 1 ;
2016-12-22 03:32:01 +08:00
interval = ( u64 ) tmp ;
2012-07-13 13:21:57 +08:00
tk - > cycle_interval = interval ;
2009-08-14 21:47:26 +08:00
/* Go back from cycles -> shifted ns */
2016-12-09 04:49:36 +08:00
tk - > xtime_interval = interval * clock - > mult ;
2012-07-13 13:21:57 +08:00
tk - > xtime_remainder = ntpinterval - tk - > xtime_interval ;
2017-06-09 07:44:21 +08:00
tk - > raw_interval = interval * clock - > mult ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:53 +08:00
/* if changing clocks, convert xtime_nsec shift units */
if ( old_clock ) {
int shift_change = clock - > shift - old_clock - > shift ;
2017-05-23 08:20:20 +08:00
if ( shift_change < 0 ) {
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec > > = - shift_change ;
2017-05-23 08:20:20 +08:00
tk - > tkr_raw . xtime_nsec > > = - shift_change ;
} else {
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec < < = shift_change ;
2017-05-23 08:20:20 +08:00
tk - > tkr_raw . xtime_nsec < < = shift_change ;
}
2012-07-13 13:21:53 +08:00
}
2015-03-19 16:28:44 +08:00
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . shift = clock - > shift ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . shift = clock - > shift ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
tk - > ntp_error_shift = NTP_SCALE_SHIFT - clock - > shift ;
2014-04-24 11:53:29 +08:00
tk - > ntp_tick = ntpinterval < < tk - > ntp_error_shift ;
2009-08-14 21:47:28 +08:00
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource . These value will be adjusted via NTP
* to counteract clock drifting .
*/
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . mult = clock - > mult ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . mult = clock - > mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
tk - > ntp_err_mult = 0 ;
2018-03-10 02:42:48 +08:00
tk - > skip_second_overflow = 0 ;
2009-08-14 21:47:26 +08:00
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:29 +08:00
/* Timekeeper helper functions. */
2024-03-25 14:40:21 +08:00
/*
 * Out of line conversion used by timekeeping_cycles_to_ns() when @delta
 * exceeds the clocksource's max_cycles. Hands delta, mult, xtime_nsec and
 * shift to mul_u64_u32_add_u64_shr().
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
	return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec,
				       tkr->shift);
}
2024-03-25 14:40:13 +08:00
/*
 * Convert the clocksource reading @cycles into nanoseconds, based on the
 * cycle_last, mult, xtime_nsec and shift values of @tkr.
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
	u64 mask = tkr->mask;
	/* Calculate the delta since the last update_wall_time() */
	u64 delta = (cycles - tkr->cycle_last) & mask;

	/*
	 * This detects both negative motion and the case where the delta
	 * overflows the multiplication with tkr->mult.
	 */
	if (unlikely(delta > tkr->clock->max_cycles)) {
		/*
		 * No MSB of the mask set in the delta: just a large forward
		 * delta, which the overflow safe conversion can handle.
		 */
		if (!(delta & ~(mask >> 1)))
			return delta_to_ns_safe(tkr, delta);

		/*
		 * Handle clocksource inconsistency between CPUs to prevent
		 * time from going backwards: the MSB of the mask is set in
		 * the delta, so return the base value only.
		 */
		return tkr->xtime_nsec >> tkr->shift;
	}

	return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}
2009-08-14 21:47:29 +08:00
2024-03-25 14:40:14 +08:00
/* Read the clocksource of @tkr and convert the reading to nanoseconds. */
static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
{
	u64 cycles = tk_clock_read(tkr);

	return timekeeping_cycles_to_ns(tkr, cycles);
}
2024-03-25 14:40:13 +08:00
static inline u64 timekeeping_get_ns ( const struct tk_read_base * tkr )
{
2024-03-25 14:40:17 +08:00
if ( IS_ENABLED ( CONFIG_DEBUG_TIMEKEEPING ) )
2024-03-25 14:40:18 +08:00
return timekeeping_debug_get_ns ( tkr ) ;
2024-03-25 14:40:13 +08:00
2024-03-25 14:40:17 +08:00
return __timekeeping_get_ns ( tkr ) ;
2024-03-25 14:40:13 +08:00
}
2014-07-17 05:05:23 +08:00
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper .
2015-02-11 12:01:52 +08:00
* @ tkr : Timekeeping readout base from which we take the update
2020-11-13 15:24:31 +08:00
* @ tkf : Pointer to NMI safe timekeeper
2014-07-17 05:05:23 +08:00
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself .
*
2015-05-27 09:39:36 +08:00
* Employ the latch technique ; see @ raw_write_seqcount_latch .
2014-07-17 05:05:23 +08:00
*
* So if a NMI hits the update of base [ 0 ] then it will use base [ 1 ]
* which is still consistent . In the worst case this can result is a
* slightly wrong timestamp ( a few nanoseconds ) . See
* @ ktime_get_mono_fast_ns .
*/
2018-07-13 20:06:42 +08:00
static void update_fast_timekeeper ( const struct tk_read_base * tkr ,
struct tk_fast * tkf )
2014-07-17 05:05:23 +08:00
{
2015-03-19 16:36:19 +08:00
struct tk_read_base * base = tkf - > base ;
2014-07-17 05:05:23 +08:00
/* Force readers off to base[1] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[0] */
2015-02-11 12:01:52 +08:00
memcpy ( base , tkr , sizeof ( * base ) ) ;
2014-07-17 05:05:23 +08:00
/* Force readers back to base[0] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[1] */
memcpy ( base + 1 , base , sizeof ( * base ) ) ;
}
2020-11-16 04:09:31 +08:00
/*
 * Latch based readout of a fast timekeeper: pick the readout base selected
 * by the lowest bit of the sequence count, add the nanoseconds elapsed on
 * its clocksource to its base time and retry if the latch changed
 * meanwhile.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *rb;
	unsigned int latch_seq;
	u64 ns;

	do {
		latch_seq = raw_read_seqcount_latch(&tkf->seq);
		rb = &tkf->base[latch_seq & 0x01];
		ns = ktime_to_ns(rb->base);
		ns += __timekeeping_get_ns(rb);
	} while (raw_read_seqcount_latch_retry(&tkf->seq, latch_seq));

	return ns;
}
2014-07-17 05:05:23 +08:00
/**
* ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
*
* This timestamp is not guaranteed to be monotonic across an update .
* The timestamp is calculated by :
*
* now = base_mono + clock_delta * slope
*
* So if the update lowers the slope , readers who are forced to the
* not yet updated second array are still using the old steeper slope .
*
* tmono
* ^
* | o n
* | o n
* | u
* | o
* | o
* | 12345678 - - - > reader order
*
* o = old slope
* u = update
* n = new slope
*
* So reader 6 will observe time going backwards versus reader 5.
*
2020-11-16 04:09:31 +08:00
* While other CPUs are likely to be able to observe that , the only way
2014-07-17 05:05:23 +08:00
* for a CPU local observation is when an NMI hits in the middle of
* the update . Timestamps taken from that NMI context might be ahead
* of the following timestamps . Callers need to be aware of that and
* deal with it .
*/
2022-04-28 14:24:32 +08:00
u64 notrace ktime_get_mono_fast_ns ( void )
2015-03-19 16:36:19 +08:00
{
return __ktime_get_fast_ns ( & tk_fast_mono ) ;
}
2014-07-17 05:05:23 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_mono_fast_ns ) ;
2020-11-16 04:09:31 +08:00
/**
* ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
*
* Contrary to ktime_get_mono_fast_ns ( ) this is always correct because the
* conversion factor is not affected by NTP / PTP correction .
*/
2022-04-28 14:24:32 +08:00
u64 notrace ktime_get_raw_fast_ns ( void )
2015-03-19 16:39:08 +08:00
{
return __ktime_get_fast_ns ( & tk_fast_raw ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw_fast_ns ) ;
2018-04-25 21:33:38 +08:00
/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp is taken after the boot offset is
 *     updated but before the timekeeper is updated. If this happens, the new
 *     boot offset is added to the old timekeeping making the clock appear to
 *     update slightly earlier:
 *        CPU 0                                        CPU 1
 *        timekeeping_inject_sleeptime64()
 *        __timekeeping_inject_sleeptime(tk, delta);
 *                                                     timestamp();
 *        timekeeping_update(tkd, tk, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 *     partially updated. Since the tk->offs_boot update is a rare event, this
 *     should be a rare occurrence which postprocessing should be able to
 *     handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
	/* The boot offset is deliberately read without synchronization */
	return ktime_get_mono_fast_ns() +
	       ktime_to_ns(data_race(tk_core.timekeeper.offs_boot));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
2022-04-14 17:18:03 +08:00
/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The limitations described for ktime_get_boot_fast_ns() apply here too.
 * The monotonic time and the TAI offset are not read atomically, which may
 * yield wrong readouts. However, an update of the TAI offset is a rare
 * event, e.g. caused by settime or adjtimex with an offset. Users of this
 * function have to deal with the possibility of wrong timestamps in post
 * processing.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
	u64 mono_ns = ktime_get_mono_fast_ns();
	/* Deliberately unsynchronized read of the TAI offset */
	s64 tai_offs_ns = ktime_to_ns(data_race(tk_core.timekeeper.offs_tai));

	return mono_ns + tai_offs_ns;
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
2020-08-14 18:19:35 +08:00
/*
 * Read clock realtime from the fast timekeeper @tkf. When @mono is not
 * NULL, the monotonic timestamp computed from the same readout base and
 * the same delta is stored there.
 */
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
	u64 mono_base, real_base, ns_delta;
	struct tk_read_base *rb;
	unsigned int latch_seq;

	do {
		latch_seq = raw_read_seqcount_latch(&tkf->seq);
		/* The lowest bit of the sequence selects the readout base */
		rb = &tkf->base[latch_seq & 0x01];
		mono_base = ktime_to_ns(rb->base);
		real_base = ktime_to_ns(rb->base_real);
		ns_delta = __timekeeping_get_ns(rb);
	} while (raw_read_seqcount_latch_retry(&tkf->seq, latch_seq));

	if (mono)
		*mono = mono_base + ns_delta;

	return real_base + ns_delta;
}
/**
* ktime_get_real_fast_ns : - NMI safe and fast access to clock realtime .
2020-11-16 04:09:31 +08:00
*
2023-04-26 21:43:34 +08:00
* See ktime_get_mono_fast_ns ( ) for documentation of the time stamp ordering .
2017-08-31 23:12:48 +08:00
*/
u64 ktime_get_real_fast_ns ( void )
{
2020-08-14 18:19:35 +08:00
return __ktime_get_real_fast ( & tk_fast_mono , NULL ) ;
2017-08-31 23:12:48 +08:00
}
2017-11-10 23:25:04 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_real_fast_ns ) ;
2017-08-31 23:12:48 +08:00
2020-08-14 18:19:35 +08:00
/**
* ktime_get_fast_timestamps : - NMI safe timestamps
* @ snapshot : Pointer to timestamp storage
*
* Stores clock monotonic , boottime and realtime timestamps .
*
* Boot time is a racy access on 32 bit systems if the sleep time injection
* happens late during resume and not in timekeeping_resume ( ) . That could
* be avoided by expanding struct tk_read_base with boot offset for 32 bit
* and adding more overhead to the update . As this is a hard to observe
* once per resume event which can be filtered with reasonable effort using
* the accurate mono / real timestamps , it ' s probably not worth the trouble .
*
* Aside of that it might be possible on 32 and 64 bit to observe the
* following when the sleep time injection happens late :
*
* CPU 0 CPU 1
* timekeeping_resume ( )
* ktime_get_fast_timestamps ( )
* mono , real = __ktime_get_real_fast ( )
* inject_sleep_time ( )
* update boot offset
* boot = mono + bootoffset ;
*
* That means that boot time already has the sleep time adjustment , but
* real time does not . On the next readout both are in sync again .
*
* Preventing this for 64 bit is not really feasible without destroying the
* careful cache layout of the timekeeper because the sequence count and
* struct tk_read_base would then need two cache lines instead of one .
*
2021-03-23 05:39:03 +08:00
* Access to the time keeper clock source is disabled across the innermost
2020-08-14 18:19:35 +08:00
* steps of suspend / resume . The accessors still work , but the timestamps
* are frozen until time keeping is resumed which happens very early .
*
* For regular suspend / resume there is no observable difference vs . sched
* clock , but it might affect some of the nasty low level debug printks .
*
2021-03-23 05:39:03 +08:00
* OTOH , access to sched clock is not guaranteed across suspend / resume on
2020-08-14 18:19:35 +08:00
* all systems either so it depends on the hardware in use .
*
* If that turns out to be a real problem then this could be mitigated by
* using sched clock in a similar way as during early boot . But it ' s not as
* trivial as on early boot because it needs some careful protection
* against the clock monotonic timestamp jumping backwards on resume .
*/
void ktime_get_fast_timestamps ( struct ktime_timestamps * snapshot )
{
struct timekeeper * tk = & tk_core . timekeeper ;
snapshot - > real = __ktime_get_real_fast ( & tk_fast_mono , & snapshot - > mono ) ;
snapshot - > boot = snapshot - > mono + ktime_to_ns ( data_race ( tk - > offs_boot ) ) ;
}
2015-02-13 21:49:02 +08:00
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource .
* @ tk : Timekeeper to snapshot .
*
* It generally is unsafe to access the clocksource after timekeeping has been
* suspended , so take a snapshot of the readout base of @ tk and use it as the
* fast timekeeper ' s readout base while suspended . It will return the same
* number of cycles every time until timekeeping is resumed at which time the
* proper readout base for the fast timekeeper will be restored automatically .
*/
2018-07-13 20:06:42 +08:00
static void halt_fast_timekeeper ( const struct timekeeper * tk )
2015-02-13 21:49:02 +08:00
{
static struct tk_read_base tkr_dummy ;
2018-07-13 20:06:42 +08:00
const struct tk_read_base * tkr = & tk - > tkr_mono ;
2015-02-13 21:49:02 +08:00
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
2017-06-09 07:44:20 +08:00
cycles_at_suspend = tk_clock_read ( tkr ) ;
tkr_dummy . clock = & dummy_clock ;
2017-08-31 23:12:48 +08:00
tkr_dummy . base_real = tkr - > base + tk - > offs_real ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tkr_dummy , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
tkr = & tk - > tkr_raw ;
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
2017-06-09 07:44:20 +08:00
tkr_dummy . clock = & dummy_clock ;
2015-03-19 16:39:08 +08:00
update_fast_timekeeper ( & tkr_dummy , & tk_fast_raw ) ;
2015-02-13 21:49:02 +08:00
}
2012-11-28 09:28:59 +08:00
static RAW_NOTIFIER_HEAD ( pvclock_gtod_chain ) ;
2013-06-27 18:35:46 +08:00
static void update_pvclock_gtod ( struct timekeeper * tk , bool was_set )
2012-11-28 09:28:59 +08:00
{
2013-06-27 18:35:46 +08:00
raw_notifier_call_chain ( & pvclock_gtod_chain , was_set , tk ) ;
2012-11-28 09:28:59 +08:00
}
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
2020-11-13 15:24:32 +08:00
* @ nb : Pointer to the notifier block to register
2012-11-28 09:28:59 +08:00
*/
int pvclock_gtod_register_notifier ( struct notifier_block * nb )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-11-28 09:28:59 +08:00
int ret ;
2024-10-09 16:29:02 +08:00
guard ( raw_spinlock_irqsave ) ( & tk_core . lock ) ;
2012-11-28 09:28:59 +08:00
ret = raw_notifier_chain_register ( & pvclock_gtod_chain , nb ) ;
2013-06-27 18:35:46 +08:00
update_pvclock_gtod ( tk , true ) ;
2012-11-28 09:28:59 +08:00
return ret ;
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_register_notifier ) ;
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
2020-11-13 15:24:32 +08:00
* @ nb : Pointer to the notifier block to unregister
2012-11-28 09:28:59 +08:00
*/
int pvclock_gtod_unregister_notifier ( struct notifier_block * nb )
{
2024-10-09 16:29:02 +08:00
guard ( raw_spinlock_irqsave ) ( & tk_core . lock ) ;
return raw_notifier_chain_unregister ( & pvclock_gtod_chain , nb ) ;
2012-11-28 09:28:59 +08:00
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_unregister_notifier ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/*
* tk_update_leap_state - helper to update the next_leap_ktime
*/
static inline void tk_update_leap_state ( struct timekeeper * tk )
{
tk - > next_leap_ktime = ntp_get_next_leap ( ) ;
2016-12-25 18:38:40 +08:00
if ( tk - > next_leap_ktime ! = KTIME_MAX )
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Convert to monotonic time */
tk - > next_leap_ktime = ktime_sub ( tk - > next_leap_ktime , tk - > offs_real ) ;
}
2014-07-17 05:04:10 +08:00
/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data ( struct timekeeper * tk )
{
2014-10-29 18:31:16 +08:00
u64 seconds ;
u32 nsec ;
2014-07-17 05:04:10 +08:00
/*
* The xtime based monotonic readout is :
* nsec = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec + now ( ) ;
* The ktime based monotonic readout is :
* nsec = base_mono + now ( ) ;
* = = > base_mono = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec
*/
2014-10-29 18:31:16 +08:00
seconds = ( u64 ) ( tk - > xtime_sec + tk - > wall_to_monotonic . tv_sec ) ;
nsec = ( u32 ) tk - > wall_to_monotonic . tv_nsec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . base = ns_to_ktime ( seconds * NSEC_PER_SEC + nsec ) ;
2014-07-17 05:05:04 +08:00
2014-10-29 18:31:16 +08:00
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater / equal one second . Take
* this into account before updating tk - > ktime_sec .
*/
2015-03-19 17:09:06 +08:00
nsec + = ( u32 ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-10-29 18:31:16 +08:00
if ( nsec > = NSEC_PER_SEC )
seconds + + ;
tk - > ktime_sec = seconds ;
2017-05-23 08:20:20 +08:00
/* Update the monotonic raw base */
2017-08-26 06:57:04 +08:00
tk - > tkr_raw . base = ns_to_ktime ( tk - > raw_sec * NSEC_PER_SEC ) ;
2014-07-17 05:04:10 +08:00
}
2024-10-09 16:29:05 +08:00
static void timekeeping_update ( struct tk_data * tkd , struct timekeeper * tk , unsigned int action )
2011-11-14 07:19:49 +08:00
{
2024-10-09 16:29:05 +08:00
lockdep_assert_held ( & tkd - > lock ) ;
2013-06-27 18:35:45 +08:00
if ( action & TK_CLEAR_NTP ) {
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
2011-11-14 07:19:49 +08:00
ntp_clear ( ) ;
}
2013-02-22 06:51:40 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
tk_update_leap_state ( tk ) ;
2014-07-17 05:04:10 +08:00
tk_update_ktime_data ( tk ) ;
2014-09-06 18:24:49 +08:00
update_vsyscall ( tk ) ;
update_pvclock_gtod ( tk , action & TK_CLOCK_WAS_SET ) ;
2017-08-31 23:12:48 +08:00
tk - > tkr_mono . base_real = tk - > tkr_mono . base + tk - > offs_real ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tk - > tkr_mono , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
update_fast_timekeeper ( & tk - > tkr_raw , & tk_fast_raw ) ;
2015-04-15 05:08:37 +08:00
if ( action & TK_CLOCK_WAS_SET )
tk - > clock_was_set_seq + + ;
2015-06-12 06:54:53 +08:00
/*
* The mirroring of the data to the shadow - timekeeper needs
* to happen last here to ensure we don ' t over - write the
* timekeeper structure on the next update with stale data
*/
if ( action & TK_MIRROR )
2024-10-09 16:29:06 +08:00
memcpy ( & tkd - > shadow_timekeeper , tk , sizeof ( * tk ) ) ;
}
static void timekeeping_update_from_shadow ( struct tk_data * tkd , unsigned int action )
{
/*
* Block out readers before invoking timekeeping_update ( ) because
* that updates VDSO and other time related infrastructure . Not
* blocking the readers might let a reader see time going backwards
* when reading from the VDSO after the VDSO update and then
* reading in the kernel from the timekeeper before that got updated .
*/
write_seqcount_begin ( & tkd - > seq ) ;
timekeeping_update ( tkd , & tkd - > shadow_timekeeper , action ) ;
/*
* Update the real timekeeper .
*
* We could avoid this memcpy ( ) by switching pointers , but that has
* the downside that the reader side does not longer benefit from
* the cacheline optimized data layout of the timekeeper and requires
* another indirection .
*/
memcpy ( & tkd - > timekeeper , & tkd - > shadow_timekeeper , sizeof ( tkd - > shadow_timekeeper ) ) ;
write_seqcount_end ( & tkd - > seq ) ;
2011-11-14 07:19:49 +08:00
}
2007-05-08 15:27:59 +08:00
/**
2009-08-14 21:47:26 +08:00
* timekeeping_forward_now - update clock to the current time
2020-11-13 15:24:34 +08:00
* @ tk : Pointer to the timekeeper to update
2007-05-08 15:27:59 +08:00
*
2008-08-21 07:37:28 +08:00
* Forward the current clock to update its state since the last call to
* update_wall_time ( ) . This is useful before significant clock changes ,
* as it avoids having to deal with this time offset explicitly .
2007-05-08 15:27:59 +08:00
*/
2012-07-13 13:21:57 +08:00
static void timekeeping_forward_now ( struct timekeeper * tk )
2007-05-08 15:27:59 +08:00
{
2016-12-22 03:32:01 +08:00
u64 cycle_now , delta ;
2007-05-08 15:27:59 +08:00
2017-06-09 07:44:20 +08:00
cycle_now = tk_clock_read ( & tk - > tkr_mono ) ;
2015-03-19 17:09:06 +08:00
delta = clocksource_delta ( cycle_now , tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2007-05-08 15:27:59 +08:00
2024-03-25 14:40:21 +08:00
while ( delta > 0 ) {
u64 max = tk - > tkr_mono . clock - > max_cycles ;
u64 incr = delta < max ? delta : max ;
2017-05-23 08:20:20 +08:00
2024-03-25 14:40:21 +08:00
tk - > tkr_mono . xtime_nsec + = incr * tk - > tkr_mono . mult ;
tk - > tkr_raw . xtime_nsec + = incr * tk - > tkr_raw . mult ;
tk_normalize_xtime ( tk ) ;
delta - = incr ;
}
2007-05-08 15:27:59 +08:00
}
/**
2018-04-27 21:40:13 +08:00
* ktime_get_real_ts64 - Returns the time of day in a timespec64 .
2007-05-08 15:27:59 +08:00
* @ ts : pointer to the timespec to be set
*
2018-04-27 21:40:13 +08:00
* Returns the time of day in a timespec64 ( WARN if suspended ) .
2007-05-08 15:27:59 +08:00
*/
2018-04-27 21:40:13 +08:00
void ktime_get_real_ts64 ( struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2007-05-08 15:27:59 +08:00
2018-04-27 21:40:13 +08:00
WARN_ON ( timekeeping_suspended ) ;
2007-05-08 15:27:59 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
2012-09-12 07:26:03 +08:00
ts - > tv_nsec = 0 ;
2014-07-17 05:04:04 +08:00
timespec64_add_ns ( ts , nsecs ) ;
2007-05-08 15:27:59 +08:00
}
2018-04-27 21:40:13 +08:00
EXPORT_SYMBOL ( ktime_get_real_ts64 ) ;
2007-05-08 15:27:59 +08:00
2009-07-07 17:27:28 +08:00
ktime_t ktime_get ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
2014-07-17 05:04:12 +08:00
ktime_t base ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2009-07-07 17:27:28 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2014-07-17 05:03:53 +08:00
2014-07-17 05:04:12 +08:00
return ktime_add_ns ( base , nsecs ) ;
2009-07-07 17:27:28 +08:00
}
EXPORT_SYMBOL_GPL ( ktime_get ) ;
2015-04-07 19:12:35 +08:00
u32 ktime_get_resolution_ns ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
u32 nsecs ;
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
nsecs = tk - > tkr_mono . mult > > tk - > tkr_mono . shift ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return nsecs ;
}
EXPORT_SYMBOL_GPL ( ktime_get_resolution_ns ) ;
2014-07-17 05:04:13 +08:00
static ktime_t * offsets [ TK_OFFS_MAX ] = {
[ TK_OFFS_REAL ] = & tk_core . timekeeper . offs_real ,
2018-04-25 21:33:38 +08:00
[ TK_OFFS_BOOT ] = & tk_core . timekeeper . offs_boot ,
2014-07-17 05:04:13 +08:00
[ TK_OFFS_TAI ] = & tk_core . timekeeper . offs_tai ,
} ;
ktime_t ktime_get_with_offset ( enum tk_offsets offs )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base , * offset = offsets [ offs ] ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2014-07-17 05:04:13 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = ktime_add ( tk - > tkr_mono . base , * offset ) ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2014-07-17 05:04:13 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_with_offset ) ;
2018-04-27 21:40:15 +08:00
ktime_t ktime_get_coarse_with_offset ( enum tk_offsets offs )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base , * offset = offsets [ offs ] ;
2019-06-14 03:40:45 +08:00
u64 nsecs ;
2018-04-27 21:40:15 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
base = ktime_add ( tk - > tkr_mono . base , * offset ) ;
2019-06-14 03:40:45 +08:00
nsecs = tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ;
2018-04-27 21:40:15 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2019-06-22 04:32:47 +08:00
return ktime_add_ns ( base , nsecs ) ;
2018-04-27 21:40:15 +08:00
}
EXPORT_SYMBOL_GPL ( ktime_get_coarse_with_offset ) ;
2014-07-17 05:04:22 +08:00
/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:	time to convert.
 * @offs:	which offset to use
 *
 * Return: @tmono with the offset selected by @offs added.
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
	ktime_t *offs_ptr = offsets[offs];
	ktime_t converted;
	unsigned int seq;

	if (IS_ENABLED(CONFIG_64BIT)) {
		/*
		 * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and
		 * tk_update_sleep_time().
		 */
		return ktime_add(tmono, READ_ONCE(*offs_ptr));
	}

	/* Otherwise the offset is read under the sequence count */
	do {
		seq = read_seqcount_begin(&tk_core.seq);
		converted = ktime_add(tmono, *offs_ptr);
	} while (read_seqcount_retry(&tk_core.seq, seq));

	return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
2014-07-17 05:05:04 +08:00
/**
* ktime_get_raw - Returns the raw monotonic time in ktime_t format
*/
ktime_t ktime_get_raw ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2014-07-17 05:05:04 +08:00
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 16:28:44 +08:00
base = tk - > tkr_raw . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2014-07-17 05:05:04 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw ) ;
2009-07-07 17:27:28 +08:00
/**
2014-07-17 05:04:04 +08:00
* ktime_get_ts64 - get the monotonic clock in timespec64 format
2009-07-07 17:27:28 +08:00
* @ ts : pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
2014-11-08 05:13:04 +08:00
* in normalized timespec64 format in the variable pointed to by @ ts .
2009-07-07 17:27:28 +08:00
*/
2014-07-17 05:04:04 +08:00
void ktime_get_ts64 ( struct timespec64 * ts )
2009-07-07 17:27:28 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:04 +08:00
struct timespec64 tomono ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsec ;
2009-07-07 17:27:28 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2014-07-17 05:04:04 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsec = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2012-07-28 02:48:13 +08:00
tomono = tk - > wall_to_monotonic ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:04 +08:00
ts - > tv_sec + = tomono . tv_sec ;
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsec + tomono . tv_nsec ) ;
2009-07-07 17:27:28 +08:00
}
2014-07-17 05:04:04 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_ts64 ) ;
2009-07-07 17:27:28 +08:00
2014-10-29 18:31:16 +08:00
/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * Returns the seconds portion of CLOCK_MONOTONIC with a single, non
 * serialized read. tk->ktime_sec is of type 'unsigned long', so this works
 * on both 32 and 64 bit systems. On 32 bit systems the readout covers
 * ~136 years of uptime, which should be enough to prevent premature wrap
 * arounds.
 */
time64_t ktime_get_seconds(void)
{
	WARN_ON(timekeeping_suspended);

	/* Plain read without the sequence counter, see above */
	return tk_core.timekeeper.ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);
2014-10-29 18:31:50 +08:00
/**
* ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
*
2020-12-01 17:52:31 +08:00
* Returns the wall clock seconds since 1970.
2014-10-29 18:31:50 +08:00
*
* For 64 bit systems the fast access to tk - > xtime_sec is preserved . On
* 32 bit systems the access must be protected with the sequence
* counter to provide " atomic " access to the 64 bit tk - > xtime_sec
* value .
*/
time64_t ktime_get_real_seconds ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
time64_t seconds ;
unsigned int seq ;
if ( IS_ENABLED ( CONFIG_64BIT ) )
return tk - > xtime_sec ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
seconds = tk - > xtime_sec ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return seconds ;
}
EXPORT_SYMBOL_GPL ( ktime_get_real_seconds ) ;
2015-12-13 12:24:18 +08:00
/**
* __ktime_get_real_seconds - The same as ktime_get_real_seconds
* but without the sequence counter protect . This internal function
* is called just when timekeeping lock is already held .
*/
2020-04-22 03:22:36 +08:00
noinstr time64_t __ktime_get_real_seconds ( void )
2015-12-13 12:24:18 +08:00
{
struct timekeeper * tk = & tk_core . timekeeper ;
return tk - > xtime_sec ;
}
2016-02-22 19:15:20 +08:00
/**
* ktime_get_snapshot - snapshots the realtime / monotonic raw clocks with counter
* @ systime_snapshot : pointer to struct receiving the system time snapshot
*/
void ktime_get_snapshot ( struct system_time_snapshot * systime_snapshot )
{
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-02-22 19:15:20 +08:00
ktime_t base_raw ;
ktime_t base_real ;
2024-09-11 17:30:20 +08:00
ktime_t base_boot ;
2016-12-09 04:49:34 +08:00
u64 nsec_raw ;
u64 nsec_real ;
2016-12-22 03:32:01 +08:00
u64 now ;
2016-02-22 19:15:20 +08:00
2016-02-22 19:15:21 +08:00
WARN_ON_ONCE ( timekeeping_suspended ) ;
2016-02-22 19:15:20 +08:00
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( & tk - > tkr_mono ) ;
2020-12-09 14:09:27 +08:00
systime_snapshot - > cs_id = tk - > tkr_mono . clock - > id ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
systime_snapshot - > cs_was_changed_seq = tk - > cs_was_changed_seq ;
systime_snapshot - > clock_was_set_seq = tk - > clock_was_set_seq ;
2016-02-22 19:15:20 +08:00
base_real = ktime_add ( tk - > tkr_mono . base ,
tk_core . timekeeper . offs_real ) ;
2024-09-11 17:30:20 +08:00
base_boot = ktime_add ( tk - > tkr_mono . base ,
tk_core . timekeeper . offs_boot ) ;
2016-02-22 19:15:20 +08:00
base_raw = tk - > tkr_raw . base ;
nsec_real = timekeeping_cycles_to_ns ( & tk - > tkr_mono , now ) ;
nsec_raw = timekeeping_cycles_to_ns ( & tk - > tkr_raw , now ) ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
systime_snapshot - > cycles = now ;
systime_snapshot - > real = ktime_add_ns ( base_real , nsec_real ) ;
2024-09-11 17:30:20 +08:00
systime_snapshot - > boot = ktime_add_ns ( base_boot , nsec_real ) ;
2016-02-22 19:15:20 +08:00
systime_snapshot - > raw = ktime_add_ns ( base_raw , nsec_raw ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_snapshot ) ;
2015-12-13 12:24:18 +08:00
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
/*
 * Scale *base by mult/div, checking for overflow.
 *
 * *base is split into quotient and remainder with respect to div. Each part
 * is multiplied by mult separately, so a multiplication is only performed
 * when the bit widths of its operands add up to at most 64 bits.
 *
 * div is used as a divisor and is not validated here.
 *
 * Returns 0 with the scaled value stored in *base, or -EOVERFLOW with *base
 * left unmodified when one of the multiplications could overflow.
 */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
	/* Number of bits left in a u64 once mult has been accounted for. */
	const int headroom = (int)sizeof(u64) * 8 - fls64(mult);
	u64 quot, rem;

	quot = div64_u64_rem(*base, div, &rem);

	if (headroom < fls64(quot) || headroom < fls64(rem))
		return -EOVERFLOW;

	quot *= mult;
	rem = div64_u64(rem * mult, div);

	*base = quot + rem;
	return 0;
}
/**
* adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
* @ history : Snapshot representing start of history
* @ partial_history_cycles : Cycle offset into history ( fractional part )
* @ total_history_cycles : Total history length in cycles
* @ discontinuity : True indicates clock was set on history period
* @ ts : Cross timestamp that should be adjusted using
* partial / total ratio
*
* Helper function used by get_device_system_crosststamp ( ) to correct the
* crosstimestamp corresponding to the start of the current interval to the
* system counter value ( timestamp point ) provided by the driver . The
* total_history_ * quantities are the total history starting at the provided
* reference point and ending at the start of the current interval . The cycle
* count between the driver timestamp point and the start of the current
* interval is partial_history_cycles .
*/
static int adjust_historical_crosststamp ( struct system_time_snapshot * history ,
2016-12-22 03:32:01 +08:00
u64 partial_history_cycles ,
u64 total_history_cycles ,
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool discontinuity ,
struct system_device_crosststamp * ts )
{
struct timekeeper * tk = & tk_core . timekeeper ;
u64 corr_raw , corr_real ;
bool interp_forward ;
int ret ;
if ( total_history_cycles = = 0 | | partial_history_cycles = = 0 )
return 0 ;
/* Interpolate shortest distance from beginning or end of history */
2017-03-25 03:03:35 +08:00
interp_forward = partial_history_cycles > total_history_cycles / 2 ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles ;
/*
* Scale the monotonic raw time delta by :
* partial_history_cycles / total_history_cycles
*/
corr_raw = ( u64 ) ktime_to_ns (
ktime_sub ( ts - > sys_monoraw , history - > raw ) ) ;
ret = scale64_check_overflow ( partial_history_cycles ,
total_history_cycles , & corr_raw ) ;
if ( ret )
return ret ;
/*
* If there is a discontinuity in the history , scale monotonic raw
* correction by :
* mult ( real ) / mult ( raw ) yielding the realtime correction
* Otherwise , calculate the realtime correction similar to monotonic
* raw calculation
*/
if ( discontinuity ) {
corr_real = mul_u64_u32_div
( corr_raw , tk - > tkr_mono . mult , tk - > tkr_raw . mult ) ;
} else {
corr_real = ( u64 ) ktime_to_ns (
ktime_sub ( ts - > sys_realtime , history - > real ) ) ;
ret = scale64_check_overflow ( partial_history_cycles ,
total_history_cycles , & corr_real ) ;
if ( ret )
return ret ;
}
/* Fixup monotonic raw and real time time values */
if ( interp_forward ) {
ts - > sys_monoraw = ktime_add_ns ( history - > raw , corr_raw ) ;
ts - > sys_realtime = ktime_add_ns ( history - > real , corr_real ) ;
} else {
ts - > sys_monoraw = ktime_sub_ns ( ts - > sys_monoraw , corr_raw ) ;
ts - > sys_realtime = ktime_sub_ns ( ts - > sys_realtime , corr_real ) ;
}
return 0 ;
}
/*
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
* timestamp_in_interval - true if ts is chronologically in [ start , end ]
*
* True if ts occurs chronologically at or after start , and before or at end .
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
*/
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
/*
 * timestamp_in_interval - check whether a timestamp lies in a closed interval
 * @start:	first counter value of the interval (inclusive)
 * @end:	last counter value of the interval (inclusive)
 * @ts:		counter value to test
 *
 * The test is made against the closed interval [start, end], so a timestamp
 * equal to either boundary is accepted. If start > end the interval is taken
 * to span a counter wrap-around, i.e. it covers [start, max] and [0, end].
 *
 * Return: true if @ts is inside the interval, false otherwise.
 */
static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
	/* Plain case: start <= ts <= end */
	if (ts >= start && ts <= end)
		return true;

	/* Wrapped case: ts is at/after start or at/before end */
	if (start > end && (ts >= start || ts <= end))
		return true;

	return false;
}
2024-05-13 18:38:02 +08:00
/*
 * convert_clock - scale a counter value by the ratio numerator / denominator
 * @val:		in: value to convert, out: converted value
 * @numerator:		numerator of the conversion ratio
 * @denominator:	denominator of the conversion ratio
 *
 * Return: false if either @numerator or @denominator is zero (in which case
 * *@val is left untouched), true otherwise.
 */
static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
{
	u64 rem, res;

	if (!numerator || !denominator)
		return false;

	/*
	 * Scale the quotient and the remainder separately instead of computing
	 * *val * numerator directly; the two partial results are then summed.
	 */
	res = div64_u64_rem(*val, denominator, &rem) * numerator;
	*val = res + div_u64(rem * numerator, denominator);

	return true;
}
static bool convert_base_to_cs ( struct system_counterval_t * scv )
{
struct clocksource * cs = tk_core . timekeeper . tkr_mono . clock ;
struct clocksource_base * base ;
u32 num , den ;
/* The timestamp was taken from the time keeper clock source */
if ( cs - > id = = scv - > cs_id )
return true ;
/*
* Check whether cs_id matches the base clock . Prevent the compiler from
* re - evaluating @ base as the clocksource might change concurrently .
*/
base = READ_ONCE ( cs - > base ) ;
if ( ! base | | base - > id ! = scv - > cs_id )
return false ;
num = scv - > use_nsecs ? cs - > freq_khz : base - > numerator ;
den = scv - > use_nsecs ? USEC_PER_SEC : base - > denominator ;
if ( ! convert_clock ( & scv - > cycles , num , den ) )
return false ;
scv - > cycles + = base - > offset ;
return true ;
}
2024-05-13 18:38:10 +08:00
/*
 * convert_cs_to_base - convert timekeeper clocksource cycles into base clock units
 * @cycles:	in: clocksource cycles, out: base clock value
 * @base_id:	id of the base clock the caller expects
 *
 * Reverses the steps of convert_base_to_cs(): the base offset is subtracted
 * first, then the value is scaled by denominator / numerator.
 *
 * Return: false if the current clocksource has no base clock, if the base
 * clock's id differs from @base_id, or if convert_clock() rejected the ratio;
 * true otherwise. Note that *@cycles has already had the offset subtracted
 * when convert_clock() fails.
 */
static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
{
	struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
	struct clocksource_base *base;

	/*
	 * Check whether base_id matches the base clock. Prevent the compiler from
	 * re-evaluating @base as the clocksource might change concurrently.
	 */
	base = READ_ONCE(cs->base);
	if (!base || base->id != base_id)
		return false;

	*cycles -= base->offset;

	return convert_clock(cycles, base->denominator, base->numerator);
}
/*
 * convert_ns_to_cs - convert a delta into cycles using the tkr_mono mult/shift pair
 * @delta:	in: delta to convert (nanoseconds, going by the function name),
 *		out: (delta << shift - xtime_nsec) / mult
 *
 * Return: false if shifting *@delta left by tkr_mono.shift is considered
 * unsafe for a u64 (in which case *@delta is left untouched), true otherwise.
 */
static bool convert_ns_to_cs(u64 *delta)
{
	struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;

	/*
	 * Bail out when the number of significant bits of *delta plus the
	 * shift, rounded to bytes, reaches the size of *delta.
	 */
	if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta))
		return false;

	*delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult);

	return true;
}
/**
 * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
 * @treal:	CLOCK_REALTIME timestamp to convert
 * @base_id:	base clocksource id
 * @cycles:	pointer to store the converted base clock timestamp
 *
 * Converts a supplied, future realtime clock value to the corresponding base clock value.
 *
 * The conversion fails if @treal lies before the timekeeper's current
 * realtime base, if the nanosecond delta cannot be converted to clocksource
 * cycles, or if the current clocksource has no base clock matching @base_id.
 *
 * Return: true if the conversion is successful, false otherwise.
 */
bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	u64 delta;

	do {
		seq = read_seqcount_begin(&tk_core.seq);

		/* Only timestamps at or after the realtime base are accepted */
		if ((u64)treal < tk->tkr_mono.base_real)
			return false;

		delta = (u64)treal - tk->tkr_mono.base_real;
		if (!convert_ns_to_cs(&delta))
			return false;

		/* Cycle value in clocksource units, then translated to the base clock */
		*cycles = tk->tkr_mono.cycle_last + delta;
		if (!convert_cs_to_base(cycles, base_id))
			return false;
	} while (read_seqcount_retry(&tk_core.seq, seq));

	return true;
}
EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);
2016-02-22 19:15:22 +08:00
/**
* get_device_system_crosststamp - Synchronously capture system / device timestamp
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result it takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* @ get_time_fn : Callback to get simultaneous device time and
2016-02-22 19:15:22 +08:00
* system counter from the device driver
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result it takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* @ ctx : Context passed to get_time_fn ( )
* @ history_begin : Historical reference point used to interpolate system
* time when counter provided by the driver is before the current interval
2016-02-22 19:15:22 +08:00
* @ xtstamp : Receives simultaneously captured system and device time
*
* Reads a timestamp from a device and correlates it to system time
*/
int get_device_system_crosststamp ( int ( * get_time_fn )
( ktime_t * device_time ,
struct system_counterval_t * sys_counterval ,
void * ctx ) ,
void * ctx ,
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result it takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
struct system_time_snapshot * history_begin ,
2016-02-22 19:15:22 +08:00
struct system_device_crosststamp * xtstamp )
{
struct system_counterval_t system_counterval ;
struct timekeeper * tk = & tk_core . timekeeper ;
2016-12-22 03:32:01 +08:00
u64 cycles , now , interval_start ;
2016-03-08 18:09:53 +08:00
unsigned int clock_was_set_seq = 0 ;
2016-02-22 19:15:22 +08:00
ktime_t base_real , base_raw ;
2016-12-09 04:49:34 +08:00
u64 nsec_real , nsec_raw ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result it takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
u8 cs_was_changed_seq ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool do_interp ;
2016-02-22 19:15:22 +08:00
int ret ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
/*
* Try to synchronously capture device time and a system
* counter value calling back into the device driver
*/
ret = get_time_fn ( & xtstamp - > device , & system_counterval , ctx ) ;
if ( ret )
return ret ;
/*
2024-02-01 09:04:51 +08:00
* Verify that the clocksource ID associated with the captured
* system counter value is the same as for the currently
* installed timekeeper clocksource
2016-02-22 19:15:22 +08:00
*/
2024-02-01 09:04:51 +08:00
if ( system_counterval . cs_id = = CSID_GENERIC | |
2024-05-13 18:38:02 +08:00
! convert_base_to_cs ( & system_counterval ) )
2016-02-22 19:15:22 +08:00
return - ENODEV ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
cycles = system_counterval . cycles ;
/*
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval .
*/
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( & tk - > tkr_mono ) ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
interval_start = tk - > tkr_mono . cycle_last ;
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
if ( ! timestamp_in_interval ( interval_start , now , cycles ) ) {
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
clock_was_set_seq = tk - > clock_was_set_seq ;
cs_was_changed_seq = tk - > cs_was_changed_seq ;
cycles = interval_start ;
do_interp = true ;
} else {
do_interp = false ;
}
2016-02-22 19:15:22 +08:00
base_real = ktime_add ( tk - > tkr_mono . base ,
tk_core . timekeeper . offs_real ) ;
base_raw = tk - > tkr_raw . base ;
timekeeping: Fix cross-timestamp interpolation for non-x86
So far, get_device_system_crosststamp() unconditionally passes
system_counterval.cycles to timekeeping_cycles_to_ns(). But when
interpolating system time (do_interp == true), system_counterval.cycles is
before tkr_mono.cycle_last, contrary to the timekeeping_cycles_to_ns()
expectations.
On x86, CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE mitigates this when
interpolating, by setting delta to 0. With delta == 0, xtstamp->sys_monoraw
and xtstamp->sys_realtime are then set to the last update time, as
implicitly expected by adjust_historical_crosststamp(). On other
architectures, the resulting nonsense xtstamp->sys_monoraw and
xtstamp->sys_realtime corrupt the xtstamp (ts) adjustment in
adjust_historical_crosststamp().
Fix this by deriving xtstamp->sys_monoraw and xtstamp->sys_realtime from
the last update time when interpolating, by using the local variable
"cycles". The local variable already has the right value when
interpolating, unlike system_counterval.cycles.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/r/20231218073849.35294-4-peter.hilber@opensynergy.com
2023-12-18 15:38:41 +08:00
nsec_real = timekeeping_cycles_to_ns ( & tk - > tkr_mono , cycles ) ;
nsec_raw = timekeeping_cycles_to_ns ( & tk - > tkr_raw , cycles ) ;
2016-02-22 19:15:22 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
xtstamp - > sys_realtime = ktime_add_ns ( base_real , nsec_real ) ;
xtstamp - > sys_monoraw = ktime_add_ns ( base_raw , nsec_raw ) ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
/*
* Interpolate if necessary , adjusting back from the start of the
* current interval
*/
if ( do_interp ) {
2016-12-22 03:32:01 +08:00
u64 partial_history_cycles , total_history_cycles ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool discontinuity ;
/*
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
* Check that the counter value is not before the provided
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_begin and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* history reference and that the history doesn ' t cross a
* clocksource change
*/
if ( ! history_begin | |
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
! timestamp_in_interval ( history_begin - > cycles ,
cycles , system_counterval . cycles ) | |
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result it takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
history_begin - > cs_was_changed_seq ! = cs_was_changed_seq )
return - EINVAL ;
partial_history_cycles = cycles - system_counterval . cycles ;
total_history_cycles = cycles - history_begin - > cycles ;
discontinuity =
history_begin - > clock_was_set_seq ! = clock_was_set_seq ;
ret = adjust_historical_crosststamp ( history_begin ,
partial_history_cycles ,
total_history_cycles ,
discontinuity , xtstamp ) ;
if ( ret )
return ret ;
}
2016-02-22 19:15:22 +08:00
return 0 ;
}
EXPORT_SYMBOL_GPL ( get_device_system_crosststamp ) ;
2024-05-13 18:38:10 +08:00
/**
* timekeeping_clocksource_has_base - Check whether the current clocksource
* is based on given a base clock
* @ id : base clocksource ID
*
* Note : The return value is a snapshot which can become invalid right
* after the function returns .
*
* Return : true if the timekeeper clocksource has a base clock with @ id ,
* false otherwise
*/
bool timekeeping_clocksource_has_base ( enum clocksource_ids id )
{
/*
* This is a snapshot , so no point in using the sequence
* count . Just prevent the compiler from re - evaluating @ base as the
* clocksource might change concurrently .
*/
struct clocksource_base * base = READ_ONCE ( tk_core . timekeeper . tkr_mono . clock - > base ) ;
return base ? base - > id = = id : false ;
}
EXPORT_SYMBOL_GPL ( timekeeping_clocksource_has_base ) ;
2007-05-08 15:27:59 +08:00
/**
2014-11-18 19:15:16 +08:00
* do_settimeofday64 - Sets the time of day .
* @ ts : pointer to the timespec64 variable containing the new time
2007-05-08 15:27:59 +08:00
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
2014-11-18 19:15:16 +08:00
int do_settimeofday64 ( const struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-11-18 19:15:16 +08:00
struct timespec64 ts_delta , xt ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2015-06-23 18:38:54 +08:00
int ret = 0 ;
2007-05-08 15:27:59 +08:00
2019-03-23 18:36:19 +08:00
if ( ! timespec64_valid_settod ( ts ) )
2007-05-08 15:27:59 +08:00
return - EINVAL ;
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2008-08-21 07:37:28 +08:00
2012-07-28 02:48:13 +08:00
xt = tk_xtime ( tk ) ;
timekeeping: Really make sure wall_to_monotonic isn't positive
Even after commit e1d7ba873555 ("time: Always make sure wall_to_monotonic
isn't positive") it is still possible to make wall_to_monotonic positive
by running the following code:
int main(void)
{
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
time.tv_nsec = 0;
clock_settime(CLOCK_REALTIME, &time);
return 0;
}
The reason is that the second parameter of timespec64_compare(), ts_delta,
may be unnormalized because the delta is calculated with an open coded
substraction which causes the comparison of tv_sec to yield the wrong
result:
wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 }
ts_delta = { .tv_sec = -9, .tv_nsec = -900000000 }
That makes timespec64_compare() claim that wall_to_monotonic < ts_delta,
but actually the result should be wall_to_monotonic > ts_delta.
After normalization, the result of timespec64_compare() is correct because
the tv_sec comparison is not longer misleading:
wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 }
ts_delta = { .tv_sec = -10, .tv_nsec = 100000000 }
Use timespec64_sub() to ensure that ts_delta is normalized, which fixes the
issue.
Fixes: e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive")
Signed-off-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211213135727.1656662-1-liaoyu15@huawei.com
2021-12-13 21:57:27 +08:00
ts_delta = timespec64_sub ( * ts , xt ) ;
2012-07-13 13:21:53 +08:00
2015-06-23 18:38:54 +08:00
if ( timespec64_compare ( & tk - > wall_to_monotonic , & ts_delta ) > 0 ) {
ret = - EINVAL ;
goto out ;
}
2014-07-17 05:04:01 +08:00
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , ts_delta ) ) ;
2007-05-08 15:27:59 +08:00
2014-11-18 19:15:16 +08:00
tk_set_xtime ( tk , ts ) ;
2015-06-23 18:38:54 +08:00
out :
2024-10-09 16:29:07 +08:00
timekeeping_update ( & tk_core , tk , TK_UPDATE_ALL | TK_MIRROR ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2007-05-08 15:27:59 +08:00
2021-07-13 21:39:53 +08:00
/* Signal hrtimers about time change */
clock_was_set ( CLOCK_SET_WALL ) ;
2007-05-08 15:27:59 +08:00
2022-07-18 05:53:34 +08:00
if ( ! ret ) {
2019-04-10 17:14:19 +08:00
audit_tk_injoffset ( ts_delta ) ;
2022-07-18 05:53:34 +08:00
add_device_randomness ( ts , sizeof ( * ts ) ) ;
}
2019-04-10 17:14:19 +08:00
2015-06-23 18:38:54 +08:00
return ret ;
2007-05-08 15:27:59 +08:00
}
2014-11-18 19:15:16 +08:00
EXPORT_SYMBOL ( do_settimeofday64 ) ;
2007-05-08 15:27:59 +08:00
2011-02-01 21:52:17 +08:00
/**
* timekeeping_inject_offset - Adds or subtracts from the current time .
2020-11-13 15:24:34 +08:00
* @ ts : Pointer to the timespec variable containing the offset
2011-02-01 21:52:17 +08:00
*
* Adds or subtracts an offset value from the current time .
*/
2018-07-13 20:06:42 +08:00
static int timekeeping_inject_offset ( const struct timespec64 * ts )
2011-02-01 21:52:17 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2017-10-19 19:14:45 +08:00
struct timespec64 tmp ;
2012-08-09 03:36:20 +08:00
int ret = 0 ;
2011-02-01 21:52:17 +08:00
2017-10-19 19:14:45 +08:00
if ( ts - > tv_nsec < 0 | | ts - > tv_nsec > = NSEC_PER_SEC )
2011-02-01 21:52:17 +08:00
return - EINVAL ;
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-02-01 21:52:17 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
/* Make sure the proposed value is valid */
2017-10-19 19:14:45 +08:00
tmp = timespec64_add ( tk_xtime ( tk ) , * ts ) ;
if ( timespec64_compare ( & tk - > wall_to_monotonic , ts ) > 0 | |
2019-03-23 18:36:19 +08:00
! timespec64_valid_settod ( & tmp ) ) {
2012-08-09 03:36:20 +08:00
ret = - EINVAL ;
goto error ;
}
2012-07-13 13:21:53 +08:00
2017-10-19 19:14:45 +08:00
tk_xtime_add ( tk , ts ) ;
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , * ts ) ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
error : /* even if we error out, we forwarded the time, so call update */
2024-10-09 16:29:07 +08:00
timekeeping_update ( & tk_core , tk , TK_UPDATE_ALL | TK_MIRROR ) ;
2011-02-01 21:52:17 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2011-02-01 21:52:17 +08:00
2021-07-13 21:39:53 +08:00
/* Signal hrtimers about time change */
clock_was_set ( CLOCK_SET_WALL ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
return ret ;
2011-02-01 21:52:17 +08:00
}
2017-10-19 19:14:44 +08:00
/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc.
 *
 * Zero by default; timekeeping_warp_clock() sets it to 1 when
 * sys_tz.tz_minuteswest is non-zero.
 */
int persistent_clock_is_local ;
/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time .
*
* This is ugly , but preferable to the alternatives . Otherwise we
* would either need to write a program to do it in / etc / rc ( and risk
* confusion if the program gets run more than once ; it would also be
* hard to make the program warp the clock precisely n hours ) or
* compile in the timezone information into the kernel . Bad , bad . . . .
*
* - TYT , 1992 - 01 - 01
*
* The best thing to do is to keep the CMOS clock in universal time ( UTC )
* as real UNIX machines always do it . This avoids all headaches about
* daylight saving times and warping kernel clocks .
*/
void timekeeping_warp_clock ( void )
{
if ( sys_tz . tz_minuteswest ! = 0 ) {
2017-10-19 19:14:45 +08:00
struct timespec64 adjust ;
2017-10-19 19:14:44 +08:00
persistent_clock_is_local = 1 ;
adjust . tv_sec = sys_tz . tz_minuteswest * 60 ;
adjust . tv_nsec = 0 ;
timekeeping_inject_offset ( & adjust ) ;
}
}
2011-02-01 21:52:17 +08:00
2020-11-13 15:24:33 +08:00
/*
2016-12-08 06:33:23 +08:00
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
2012-05-04 03:30:07 +08:00
*/
2013-03-26 03:24:24 +08:00
static void __timekeeping_set_tai_offset ( struct timekeeper * tk , s32 tai_offset )
2012-05-04 03:30:07 +08:00
{
tk - > tai_offset = tai_offset ;
2013-12-11 09:13:35 +08:00
tk - > offs_tai = ktime_add ( tk - > offs_real , ktime_set ( tai_offset , 0 ) ) ;
2012-05-04 03:30:07 +08:00
}
2020-11-13 15:24:33 +08:00
/*
2007-05-08 15:27:59 +08:00
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
*/
2009-08-14 21:47:30 +08:00
static int change_clocksource ( void * data )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2024-10-09 16:28:57 +08:00
struct clocksource * new = data , * old = NULL ;
2012-03-15 07:38:15 +08:00
unsigned long flags ;
2007-05-08 15:27:59 +08:00
2013-04-26 04:31:44 +08:00
/*
2024-10-09 16:28:57 +08:00
* If the clocksource is in a module , get a module reference .
* Succeeds for built - in code ( owner = = NULL ) as well . Abort if the
* reference can ' t be acquired .
2013-04-26 04:31:44 +08:00
*/
2024-10-09 16:28:57 +08:00
if ( ! try_module_get ( new - > owner ) )
return 0 ;
/* Abort if the device can't be enabled */
if ( new - > enable & & new - > enable ( new ) ! = 0 ) {
module_put ( new - > owner ) ;
return 0 ;
2009-08-14 21:47:30 +08:00
}
2021-02-11 21:43:18 +08:00
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2021-02-11 21:43:18 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
timekeeping_forward_now ( tk ) ;
2024-10-09 16:28:57 +08:00
old = tk - > tkr_mono . clock ;
tk_setup_internals ( tk , new ) ;
2024-10-09 16:29:07 +08:00
timekeeping_update ( & tk_core , tk , TK_UPDATE_ALL | TK_MIRROR ) ;
2012-03-15 07:38:15 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2012-03-15 07:38:15 +08:00
2021-02-11 21:43:18 +08:00
if ( old ) {
if ( old - > disable )
old - > disable ( old ) ;
module_put ( old - > owner ) ;
}
2009-08-14 21:47:30 +08:00
return 0 ;
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:30 +08:00
/**
* timekeeping_notify - Install a new clock source
* @ clock : pointer to the clock source
*
* This function is called from clocksource . c after a new , better clock
* source has been registered . The caller holds the clocksource_mutex .
*/
2013-04-26 04:31:44 +08:00
int timekeeping_notify ( struct clocksource * clock )
2009-08-14 21:47:30 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-28 02:48:13 +08:00
2015-03-19 17:09:06 +08:00
if ( tk - > tkr_mono . clock = = clock )
2013-04-26 04:31:44 +08:00
return 0 ;
2009-08-14 21:47:30 +08:00
stop_machine ( change_clocksource , clock , NULL ) ;
2007-05-08 15:27:59 +08:00
tick_clock_notify ( ) ;
2015-03-19 17:09:06 +08:00
return tk - > tkr_mono . clock = = clock ? 0 : - 1 ;
2007-05-08 15:27:59 +08:00
}
2009-08-14 21:47:30 +08:00
2008-08-21 07:37:30 +08:00
/**
2018-04-27 21:40:14 +08:00
* ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
2014-11-08 03:03:20 +08:00
* @ ts : pointer to the timespec64 to be set
2008-08-21 07:37:30 +08:00
*
* Returns the raw monotonic time ( completely un - modified by ntp )
*/
2018-04-27 21:40:14 +08:00
void ktime_get_raw_ts64 ( struct timespec64 * ts )
2008-08-21 07:37:30 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2008-08-21 07:37:30 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-05-23 08:20:20 +08:00
ts - > tv_sec = tk - > raw_sec ;
2015-03-19 16:28:44 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2008-08-21 07:37:30 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2008-08-21 07:37:30 +08:00
2017-05-23 08:20:20 +08:00
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsecs ) ;
2008-08-21 07:37:30 +08:00
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_raw_ts64 ) ;
2014-11-08 03:03:20 +08:00
2008-08-21 07:37:30 +08:00
2007-05-08 15:27:59 +08:00
/**
2008-02-08 20:19:24 +08:00
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
2007-05-08 15:27:59 +08:00
*/
2008-02-08 20:19:24 +08:00
int timekeeping_valid_for_hres ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2007-05-08 15:27:59 +08:00
int ret ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > flags & CLOCK_SOURCE_VALID_FOR_HRES ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
return ret ;
}
2009-08-19 01:45:10 +08:00
/**
* timekeeping_max_deferment - Returns max time the clocksource can be deferred
*/
u64 timekeeping_max_deferment ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2011-11-15 04:48:10 +08:00
u64 ret ;
2012-07-13 13:21:51 +08:00
2011-11-15 04:48:10 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > max_idle_ns ;
2011-11-15 04:48:10 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2011-11-15 04:48:10 +08:00
return ret ;
2009-08-19 01:45:10 +08:00
}
2007-05-08 15:27:59 +08:00
/**
2018-08-14 20:15:23 +08:00
* read_persistent_clock64 - Return time from the persistent clock .
2020-11-13 15:24:34 +08:00
* @ ts : Pointer to the storage for the readout value
2007-05-08 15:27:59 +08:00
*
* Weak dummy function for arches that do not yet support it .
2009-08-14 21:47:31 +08:00
* Reads the time from the battery backed persistent clock .
* Returns a timespec with tv_sec = 0 and tv_nsec = 0 if unsupported .
2007-05-08 15:27:59 +08:00
*
* XXX - Do be sure to remove it once all arches implement it .
*/
2018-08-14 20:15:23 +08:00
void __weak read_persistent_clock64 ( struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2009-08-14 21:47:31 +08:00
ts - > tv_sec = 0 ;
ts - > tv_nsec = 0 ;
2007-05-08 15:27:59 +08:00
}
2009-08-14 21:47:32 +08:00
/**
2018-07-20 04:55:34 +08:00
* read_persistent_wall_and_boot_offset - Read persistent clock , and also offset
* from the boot .
2023-01-03 11:28:49 +08:00
* @ wall_time : current time as returned by persistent clock
* @ boot_offset : offset that is defined as wall_time - boot_time
2009-08-14 21:47:32 +08:00
*
* Weak dummy function for arches that do not yet support it .
2020-11-13 15:24:35 +08:00
*
2018-07-20 04:55:35 +08:00
* The default function calculates offset based on the current value of
* local_clock ( ) . This way architectures that support sched_clock ( ) but don ' t
* support dedicated boot time clock will provide the best estimate of the
* boot time .
2009-08-14 21:47:32 +08:00
*/
2018-07-20 04:55:34 +08:00
void __weak __init
read_persistent_wall_and_boot_offset ( struct timespec64 * wall_time ,
struct timespec64 * boot_offset )
2009-08-14 21:47:32 +08:00
{
2018-07-20 04:55:34 +08:00
read_persistent_clock64 ( wall_time ) ;
2018-07-20 04:55:35 +08:00
* boot_offset = ns_to_timespec64 ( local_clock ( ) ) ;
2009-08-14 21:47:32 +08:00
}
2024-10-09 16:29:04 +08:00
/*
 * tkd_basic_setup - Initialize the locking primitives of a tk_data instance
 *
 * Initializes the raw spinlock in @tkd first and then the sequence counter,
 * which is handed a pointer to that same lock. timekeeping_init() calls this
 * for tk_core before anything else.
 */
static __init void tkd_basic_setup ( struct tk_data * tkd )
{
raw_spin_lock_init ( & tkd - > lock ) ;
seqcount_raw_spinlock_init ( & tkd - > seq , & tkd - > lock ) ;
}
2018-07-17 14:31:29 +08:00
/*
* Flag reflecting whether timekeeping_resume ( ) has injected sleeptime .
*
* The flag starts off false and is only set when a suspend reaches
* timekeeping_suspend ( ) , timekeeping_resume ( ) sets it to false when the
* timekeeper clocksource is not stopping across suspend and has been
* used to update sleep time . If the timekeeper clocksource has stopped
* then the flag stays true and is used by the RTC resume code to decide
* whether sleeptime must be injected and if so the flag gets false then .
*
* If a suspend fails before reaching timekeeping_resume ( ) then the flag
* stays false and prevents erroneous sleeptime injection .
*/
static bool suspend_timing_needed ;
2015-04-02 11:34:38 +08:00
/* Flag for if there is a persistent clock on this platform */
static bool persistent_clock_exists ;
2007-05-08 15:27:59 +08:00
/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Sets up the tk_core locking primitives, reads the persistent clock and
 * boot offset, and then, under tk_core.lock and the write side of
 * tk_core.seq, initializes NTP, installs the default clocksource and seeds
 * xtime, raw_sec and wall_to_monotonic.
 */
void __init timekeeping_init(void)
{
	struct timespec64 wall_time, boot_offset, wall_to_mono;
	struct timekeeper *tk = &tk_core.timekeeper;
	struct clocksource *clock;

	tkd_basic_setup(&tk_core);

	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
	if (timespec64_valid_settod(&wall_time) &&
	    timespec64_to_ns(&wall_time) > 0) {
		persistent_clock_exists = true;
	} else if (timespec64_to_ns(&wall_time) != 0) {
		/*
		 * A non-zero but unusable reading: warn and fall back to
		 * zero. An all-zero reading is silently accepted without
		 * marking the persistent clock as existing.
		 */
		pr_warn("Persistent clock returned invalid value\n");
		wall_time = (struct timespec64){0};
	}

	/* A boot offset larger than the wall time is discarded */
	if (timespec64_compare(&wall_time, &boot_offset) < 0)
		boot_offset = (struct timespec64){0};

	/*
	 * We want set wall_to_mono, so the following is true:
	 * wall time + wall_to_mono = boot time
	 */
	wall_to_mono = timespec64_sub(boot_offset, wall_time);

	/* Scope-based guard: no explicit unlock appears in this function */
	guard(raw_spinlock_irqsave)(&tk_core.lock);

	write_seqcount_begin(&tk_core.seq);
	ntp_init();

	clock = clocksource_default_clock();
	/* The return value of the enable callback is not checked here */
	if (clock->enable)
		clock->enable(clock);
	tk_setup_internals(tk, clock);

	tk_set_xtime(tk, &wall_time);
	tk->raw_sec = 0;

	tk_set_wall_to_mono(tk, wall_to_mono);

	timekeeping_update(&tk_core, tk, TK_MIRROR | TK_CLOCK_WAS_SET);

	write_seqcount_end(&tk_core.seq);
}
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
/*
 * time in seconds when suspend began for persistent clock
 *
 * timekeeping_resume() compares a fresh read_persistent_clock64() value
 * against this and, if the fresh value is later, uses the difference as the
 * sleep time to inject.
 */
2014-07-17 05:04:01 +08:00
static struct timespec64 timekeeping_suspend_time ;
2007-05-08 15:27:59 +08:00
2011-04-02 05:32:09 +08:00
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
2020-11-13 15:24:34 +08:00
* @ tk : Pointer to the timekeeper to be updated
* @ delta : Pointer to the delta value in timespec64 format
2011-04-02 05:32:09 +08:00
*
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables .
*/
2012-07-13 13:21:57 +08:00
static void __timekeeping_inject_sleeptime ( struct timekeeper * tk ,
2018-07-13 20:06:42 +08:00
const struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:01 +08:00
if ( ! timespec64_valid_strict ( delta ) ) {
2014-06-05 07:11:43 +08:00
printk_deferred ( KERN_WARNING
" __timekeeping_inject_sleeptime: Invalid "
" sleep delta value! \n " ) ;
2011-06-02 09:18:09 +08:00
return ;
}
2012-07-13 13:21:57 +08:00
tk_xtime_add ( tk , delta ) ;
2018-04-25 21:33:38 +08:00
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , * delta ) ) ;
2014-07-17 05:05:00 +08:00
tk_update_sleep_time ( tk , timespec64_to_ktime ( * delta ) ) ;
2013-05-22 13:32:14 +08:00
tk_debug_account_sleep_time ( delta ) ;
2011-04-02 05:32:09 +08:00
}
2015-04-02 11:34:35 +08:00
# if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
2023-01-03 11:28:49 +08:00
/*
2015-04-02 11:34:38 +08:00
* We have three kinds of time sources to use for sleep time
* injection , the preference order is :
* 1 ) non - stop clocksource
* 2 ) persistent clock ( ie : RTC accessible when irqs are off )
* 3 ) RTC
*
* 1 ) and 2 ) are used by timekeeping , 3 ) by RTC subsystem .
* If system has neither 1 ) nor 2 ) , 3 ) will be used finally .
*
*
* If timekeeping has injected sleeptime via either 1 ) or 2 ) ,
* 3 ) becomes needless , so in this case we don ' t need to call
* rtc_resume ( ) , and this is what timekeeping_rtc_skipresume ( )
* means .
*/
bool timekeeping_rtc_skipresume ( void )
{
2018-07-17 14:31:29 +08:00
return ! suspend_timing_needed ;
2015-04-02 11:34:38 +08:00
}
2023-01-03 11:28:49 +08:00
/*
 * Whether a non-stop clocksource can be used is only known when doing
 * timekeeping_resume(), which is invoked after rtc_suspend(), so having
 * such a clocksource is not a reason to skip rtc_suspend().
 *
 * But if the system has a persistent clock, that clock will definitely be
 * used, so in this case rtc_suspend() does not need to be called, and this
 * is what timekeeping_rtc_skipsuspend() means.
 *
 * Returns the value of persistent_clock_exists, which timekeeping_init()
 * sets when the persistent clock delivers a valid, positive wall time.
 */
bool timekeeping_rtc_skipsuspend ( void )
{
return persistent_clock_exists ;
}
2011-04-02 05:32:09 +08:00
/**
2014-11-18 19:15:17 +08:00
* timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
* @ delta : pointer to a timespec64 delta value
2011-04-02 05:32:09 +08:00
*
2015-04-02 11:34:22 +08:00
* This hook is for architectures that cannot support read_persistent_clock64
2011-04-02 05:32:09 +08:00
* because their RTC / persistent clock is only accessible when irqs are enabled .
2015-04-02 11:34:38 +08:00
* and also don ' t have an effective nonstop clocksource .
2011-04-02 05:32:09 +08:00
*
* This function should only be called by rtc_resume ( ) , and allows
* a suspend offset to be injected into the timekeeping values .
*/
2018-07-13 20:06:42 +08:00
void timekeeping_inject_sleeptime64 ( const struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2011-04-02 05:32:09 +08:00
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2018-07-17 14:31:29 +08:00
suspend_timing_needed = false ;
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-04-02 05:32:09 +08:00
2014-11-18 19:15:17 +08:00
__timekeeping_inject_sleeptime ( tk , delta ) ;
2011-04-02 05:32:09 +08:00
2024-10-09 16:29:07 +08:00
timekeeping_update ( & tk_core , tk , TK_UPDATE_ALL | TK_MIRROR ) ;
2011-04-02 05:32:09 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2011-04-02 05:32:09 +08:00
2021-07-13 21:39:53 +08:00
/* Signal hrtimers about time change */
clock_was_set ( CLOCK_SET_WALL | CLOCK_SET_BOOT ) ;
2011-04-02 05:32:09 +08:00
}
2015-04-02 11:34:35 +08:00
# endif
2011-04-02 05:32:09 +08:00
2007-05-08 15:27:59 +08:00
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem .
*/
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
/*
 * timekeeping_resume - restart timekeeping after a suspend.
 *
 * Resumes the clockevent and clocksource layers, then, with tk_core.lock
 * held and the tk_core.seq write section open, works out how long the
 * system was suspended and injects that interval as sleep time. The
 * interval comes from clocksource_stop_suspend_timing() when it returns a
 * non-zero value, otherwise from the difference between the persistent
 * clock read here and timekeeping_suspend_time. Afterwards the cycle_last
 * values are re-based, ntp_error is reset, timekeeping_suspended is
 * cleared, and the tick and timerfd are resumed once the lock is dropped.
 */
void timekeeping_resume ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2015-03-19 17:09:06 +08:00
struct clocksource * clock = tk - > tkr_mono . clock ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts_new , ts_delta ;
2018-07-17 15:55:16 +08:00
u64 cycle_now , nsec ;
2018-07-17 14:31:29 +08:00
bool inject_sleeptime = false ;
2009-08-14 21:47:31 +08:00
2015-04-02 11:34:22 +08:00
/* Sample the persistent clock now; it is only used below as the fallback source */
read_persistent_clock64 ( & ts_new ) ;
2007-05-08 15:27:59 +08:00
2012-08-06 07:40:41 +08:00
clockevents_resume ( ) ;
2007-05-14 17:10:02 +08:00
clocksource_resume ( ) ;
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2013-03-12 11:56:48 +08:00
/*
* After system resumes , we need to calculate the suspended time and
* compensate it for the OS time . There are 3 sources that could be
* used : Nonstop clocksource during suspend , persistent clock and rtc
* device .
*
* One specific platform may have 1 or 2 or all of them , and the
* preference will be :
* suspend - nonstop clocksource - > persistent clock - > rtc
* The less preferred source will only be tried if there is no better
* usable source . The rtc part is handled separately in rtc core code .
*/
2017-06-09 07:44:20 +08:00
cycle_now = tk_clock_read ( & tk - > tkr_mono ) ;
2018-07-17 15:55:16 +08:00
nsec = clocksource_stop_suspend_timing ( clock , cycle_now ) ;
/* A non-zero nsec is used as the sleep interval; otherwise fall back to the persistent clock delta */
if ( nsec > 0 ) {
2014-07-17 05:04:01 +08:00
ts_delta = ns_to_timespec64 ( nsec ) ;
2018-07-17 14:31:29 +08:00
inject_sleeptime = true ;
2014-07-17 05:04:01 +08:00
} else if ( timespec64_compare ( & ts_new , & timekeeping_suspend_time ) > 0 ) {
ts_delta = timespec64_sub ( ts_new , timekeeping_suspend_time ) ;
2018-07-17 14:31:29 +08:00
inject_sleeptime = true ;
2007-05-08 15:27:59 +08:00
}
2013-03-12 11:56:48 +08:00
2018-07-17 14:31:29 +08:00
if ( inject_sleeptime ) {
/* Sleep time is injected here, so clear the flag that timekeeping_suspend() set */
suspend_timing_needed = false ;
2013-03-12 11:56:48 +08:00
__timekeeping_inject_sleeptime ( tk , & ts_delta ) ;
2018-07-17 14:31:29 +08:00
}
2013-03-12 11:56:48 +08:00
/* Re-base the last cycle value */
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2012-07-28 02:48:13 +08:00
tk - > ntp_error = 0 ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 0 ;
2024-10-09 16:29:05 +08:00
timekeeping_update ( & tk_core , tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2007-05-08 15:27:59 +08:00
touch_softlockup_watchdog ( ) ;
2021-07-13 21:39:51 +08:00
/* Resume the clockevent device(s) and hrtimers */
2015-03-25 20:09:16 +08:00
tick_resume ( ) ;
2021-07-13 21:39:51 +08:00
/* Notify timerfd as resume is equivalent to clock_was_set() */
timerfd_resume ( ) ;
2007-05-08 15:27:59 +08:00
}
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
/*
 * timekeeping_suspend - stop timekeeping ahead of a suspend.
 *
 * Reads the persistent clock into timekeeping_suspend_time and marks the
 * persistent clock as present if that reading is non-zero. With
 * tk_core.lock held and the tk_core.seq write section open, it forwards
 * the timekeeper to the current time, sets timekeeping_suspended, starts
 * clocksource suspend timing from the cycle value just read and, when a
 * persistent clock exists, adjusts timekeeping_suspend_time to limit
 * drift across repeated suspend/resume cycles. The fast timekeeper is
 * halted before the lock is dropped; the tick, clocksource and clockevent
 * layers are suspended afterwards.
 *
 * Always returns 0.
 */
int timekeeping_suspend ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 delta , delta_delta ;
/* static: keeps the delta recorded at an earlier suspend for comparison on the next call */
static struct timespec64 old_delta ;
2018-07-17 15:55:16 +08:00
struct clocksource * curr_clock ;
u64 cycle_now ;
2007-05-08 15:27:59 +08:00
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & timekeeping_suspend_time ) ;
2007-09-16 21:36:43 +08:00
2013-05-18 02:24:05 +08:00
/*
* On some systems the persistent_clock can not be detected at
* timekeeping_init by its return value , so if we see a valid
* value returned , update the persistent_clock_exists flag .
*/
if ( timekeeping_suspend_time . tv_sec | | timekeeping_suspend_time . tv_nsec )
2015-04-02 11:34:38 +08:00
persistent_clock_exists = true ;
2013-05-18 02:24:05 +08:00
2018-07-17 14:31:29 +08:00
/* Cleared again in timekeeping_resume() once sleep time has been injected there */
suspend_timing_needed = true ;
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 1 ;
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
2018-07-17 15:55:16 +08:00
/*
* Since we ' ve called forward_now , cycle_last stores the value
* just read from the current clocksource . Save this to potentially
* use in suspend timing .
*/
curr_clock = tk - > tkr_mono . clock ;
cycle_now = tk - > tkr_mono . cycle_last ;
clocksource_start_suspend_timing ( curr_clock , cycle_now ) ;
2015-04-02 11:34:38 +08:00
if ( persistent_clock_exists ) {
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
/*
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
* To avoid drift caused by repeated suspend / resumes ,
* which each can add ~ 1 second drift error ,
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant .
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
*/
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
/* delta: system time (tk_xtime) minus the persistent clock reading taken above */
delta = timespec64_sub ( tk_xtime ( tk ) , timekeeping_suspend_time ) ;
delta_delta = timespec64_sub ( delta , old_delta ) ;
if ( abs ( delta_delta . tv_sec ) > = 2 ) {
/*
* if delta_delta is too large , assume time correction
* has occurred and set old_delta to the current delta .
*/
old_delta = delta ;
} else {
/* Otherwise shift timekeeping_suspend_time by delta_delta to compensate */
timekeeping_suspend_time =
timespec64_add ( timekeeping_suspend_time , delta_delta ) ;
}
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
}
2013-12-12 11:10:36 +08:00
2024-10-09 16:29:05 +08:00
timekeeping_update ( & tk_core , tk , TK_MIRROR ) ;
2015-02-13 21:49:02 +08:00
halt_fast_timekeeper ( tk ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2007-05-08 15:27:59 +08:00
2015-03-25 20:09:16 +08:00
tick_suspend ( ) ;
2010-02-03 06:41:41 +08:00
clocksource_suspend ( ) ;
2012-08-06 07:40:41 +08:00
clockevents_suspend ( ) ;
2007-05-08 15:27:59 +08:00
return 0 ;
}
/*
 * syscore resume/suspend callbacks for timekeeping: wires
 * timekeeping_resume() and timekeeping_suspend() into a struct syscore_ops.
 */
2011-03-24 05:16:04 +08:00
static struct syscore_ops timekeeping_syscore_ops = {
2007-05-08 15:27:59 +08:00
. resume = timekeeping_resume ,
. suspend = timekeeping_suspend ,
} ;
2011-03-24 05:16:04 +08:00
/*
 * Register timekeeping_syscore_ops via register_syscore_ops(). Invoked
 * through device_initcall(); always returns 0.
 */
static int __init timekeeping_init_ops ( void )
2007-05-08 15:27:59 +08:00
{
2011-03-24 05:16:04 +08:00
register_syscore_ops ( & timekeeping_syscore_ops ) ;
return 0 ;
2007-05-08 15:27:59 +08:00
}
2011-03-24 05:16:04 +08:00
device_initcall ( timekeeping_init_ops ) ;
2007-05-08 15:27:59 +08:00
/*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, it's not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* Apply a multiplier adjustment to the timekeeper
2007-05-08 15:27:59 +08:00
*/
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, it's not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
static __always_inline void timekeeping_apply_adjustment ( struct timekeeper * tk ,
s64 offset ,
2018-03-10 02:42:48 +08:00
s32 mult_adj )
2007-05-08 15:27:59 +08:00
{
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, it's not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
s64 interval = tk - > cycle_interval ;
2007-05-08 15:27:59 +08:00
2018-03-10 02:42:48 +08:00
if ( mult_adj = = 0 ) {
return ;
} else if ( mult_adj = = - 1 ) {
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, it's not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
interval = - interval ;
2018-03-10 02:42:48 +08:00
offset = - offset ;
} else if ( mult_adj ! = 1 ) {
interval * = mult_adj ;
offset * = mult_adj ;
2012-08-05 03:21:14 +08:00
}
2007-05-08 15:27:59 +08:00
2011-10-28 09:12:42 +08:00
/*
* So the following can be confusing .
*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, it's not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* To keep things simple , lets assume mult_adj = = 1 for now .
2011-10-28 09:12:42 +08:00
*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, it's not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* When mult_adj ! = 1 , remember that the interval and offset values
2011-10-28 09:12:42 +08:00
* have been appropriately scaled so the math is the same .
*
* The basic idea here is that we ' re increasing the multiplier
* by one , this causes the xtime_interval to be incremented by
* one cycle_interval . This is because :
* xtime_interval = cycle_interval * mult
* So if mult is being incremented by one :
* xtime_interval = cycle_interval * ( mult + 1 )
* Its the same as :
* xtime_interval = ( cycle_interval * mult ) + cycle_interval
* Which can be shortened to :
* xtime_interval + = cycle_interval
*
* So offset stores the non - accumulated cycles . Thus the current
* time ( in shifted nanoseconds ) is :
* now = ( offset * adj ) + xtime_nsec
* Now , even though we ' re adjusting the clock frequency , we have
* to keep time consistent . In other words , we can ' t jump back
* in time , and we also want to avoid jumping forward in time .
*
* So given the same offset value , we need the time to be the same
* both before and after the freq adjustment .
* now = ( offset * adj_1 ) + xtime_nsec_1
* now = ( offset * adj_2 ) + xtime_nsec_2
* So :
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * adj_2 ) + xtime_nsec_2
* And we know :
* adj_2 = adj_1 + 1
* So :
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * ( adj_1 + 1 ) ) + xtime_nsec_2
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * adj_1 ) + offset + xtime_nsec_2
* Canceling the sides :
* xtime_nsec_1 = offset + xtime_nsec_2
* Which gives us :
* xtime_nsec_2 = xtime_nsec_1 - offset
2021-03-23 05:39:03 +08:00
* Which simplifies to :
2011-10-28 09:12:42 +08:00
* xtime_nsec - = offset
*/
2015-03-19 17:09:06 +08:00
if ( ( mult_adj > 0 ) & & ( tk - > tkr_mono . mult + mult_adj < mult_adj ) ) {
time: Avoid possible NTP adjustment mult overflow.
Ideally, __clocksource_updatefreq_scale selects the largest shift
value possible for a clocksource. This results in the mult member of
struct clocksource being particularly large, although not so large
that NTP would adjust the clock to cause it to overflow.
That said, nothing actually prohibits an overflow from occurring; it's
just that it "shouldn't" occur.
So while very unlikely, and so far never observed, the value of
(cs->mult+cs->maxadj) may have a chance to reach very near 0xFFFFFFFF,
so there is a possibility it may overflow when doing NTP positive
adjustment
See the following detail: When NTP slewes the clock, kernel goes
through update_wall_time()->...->timekeeping_apply_adjustment():
tk->tkr.mult += mult_adj;
Since there is no guard against it, its possible tk->tkr.mult may
overflow during this operation.
This patch avoids any possible mult overflow by judging the overflow
case before adding mult_adj to mult, also adds the WARNING message
when capturing such case.
Signed-off-by: pang.xunlei <pang.xunlei@linaro.org>
[jstultz: Reworded commit message]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2014-10-08 15:03:34 +08:00
/* NTP adjustment caused clocksource mult overflow */
WARN_ON_ONCE ( 1 ) ;
return ;
}
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . mult + = mult_adj ;
2012-07-13 13:21:57 +08:00
tk - > xtime_interval + = interval ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec - = offset ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
}
/*
2018-03-10 02:42:48 +08:00
* Adjust the timekeeper ' s multiplier to the correct frequency
* and also to reduce the accumulated error value .
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
*/
2018-03-10 02:42:48 +08:00
/*
 * timekeeping_adjust() performs these steps, in order:
 *  1) derive the base multiplier from the NTP tick length (the length is
 *     cached in tk->ntp_tick so the division only runs when it changes),
 *  2) add one to it while tk->ntp_error is positive,
 *  3) hand the difference between that value and the current
 *     tkr_mono.mult to timekeeping_apply_adjustment(),
 *  4) warn once if tkr_mono.mult then deviates from the clocksource mult
 *     by more than the clocksource maxadj,
 *  5) repair a negative xtime_nsec by borrowing one second from xtime_sec.
 *
 * @tk:     timekeeper to adjust
 * @offset: passed through unchanged to timekeeping_apply_adjustment()
 */
static void timekeeping_adjust ( struct timekeeper * tk , s64 offset )
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
{
2024-10-09 16:28:54 +08:00
u64 ntp_tl = ntp_tick_length ( ) ;
2018-03-10 02:42:48 +08:00
u32 mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2015-12-04 02:23:30 +08:00
/*
2018-03-10 02:42:48 +08:00
* Determine the multiplier from the current NTP tick length .
* Avoid expensive division when the tick length doesn ' t change .
2015-12-04 02:23:30 +08:00
*/
2024-10-09 16:28:54 +08:00
if ( likely ( tk - > ntp_tick = = ntp_tl ) ) {
/*
 * Tick length unchanged: remove the error step (0 or 1) that the
 * previous call added, which yields the base multiplier again.
 */
2018-03-10 02:42:48 +08:00
mult = tk - > tkr_mono . mult - tk - > ntp_err_mult ;
} else {
/*
 * Tick length changed: cache it, then recompute the base multiplier
 * as ((ntp_tick >> ntp_error_shift) - xtime_remainder) divided by
 * cycle_interval.
 */
2024-10-09 16:28:54 +08:00
tk - > ntp_tick = ntp_tl ;
2018-03-10 02:42:48 +08:00
mult = div64_u64 ( ( tk - > ntp_tick > > tk - > ntp_error_shift ) -
tk - > xtime_remainder , tk - > cycle_interval ) ;
2015-12-04 02:23:30 +08:00
}
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2018-03-10 02:42:48 +08:00
/*
* If the clock is behind the NTP time , increase the multiplier by 1
* to catch up with it . If it ' s ahead and there was a remainder in the
* tick division , the clock will slow down . Otherwise it will stay
* ahead until the tick length changes to a non - divisible value .
*/
/*
 * The chosen step is stored in tk->ntp_err_mult so that the next call
 * can subtract it again on the unchanged-tick-length path.
 */
tk - > ntp_err_mult = tk - > ntp_error > 0 ? 1 : 0 ;
mult + = tk - > ntp_err_mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2018-03-10 02:42:48 +08:00
timekeeping_apply_adjustment ( tk , offset , mult - tk - > tkr_mono . mult ) ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
/*
 * Diagnostic only: when the clocksource declares a non-zero maxadj and
 * the multiplier now differs from the clocksource mult by more than
 * that, print a one-time warning. The adjustment made above is not
 * reverted here.
 */
2015-03-19 17:09:06 +08:00
if ( unlikely ( tk - > tkr_mono . clock - > maxadj & &
( abs ( tk - > tkr_mono . mult - tk - > tkr_mono . clock - > mult )
> tk - > tkr_mono . clock - > maxadj ) ) ) {
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
printk_once ( KERN_WARNING
" Adjusting %s more than 11%% (%ld vs %ld) \n " ,
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . clock - > name , ( long ) tk - > tkr_mono . mult ,
( long ) tk - > tkr_mono . clock - > mult + tk - > tkr_mono . clock - > maxadj ) ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
}
2012-07-13 13:21:56 +08:00
/*
* It may be possible that when we entered this function , xtime_nsec
* was very small . Further , if we ' re slightly speeding the clocksource
* in the code above , it is possible the required corrective factor to
* xtime_nsec could cause it to underflow .
*
2018-03-10 02:42:48 +08:00
* Now , since we have already accumulated the second and the NTP
* subsystem has been notified via second_overflow ( ) , we need to skip
* the next update .
2012-07-13 13:21:56 +08:00
*/
2015-03-19 17:09:06 +08:00
if ( unlikely ( ( s64 ) tk - > tkr_mono . xtime_nsec < 0 ) ) {
/*
 * Borrow one second: add NSEC_PER_SEC (shifted) back to xtime_nsec
 * and take one second off xtime_sec. The flag set below is consumed
 * by accumulate_nsecs_to_secs(), which then skips one
 * second_overflow() call.
 */
2018-03-10 02:42:48 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) NSEC_PER_SEC < <
tk - > tkr_mono . shift ;
tk - > xtime_sec - - ;
tk - > skip_second_overflow = 1 ;
2012-07-13 13:21:56 +08:00
}
2007-05-08 15:27:59 +08:00
}
2020-11-13 15:24:33 +08:00
/*
2012-07-13 13:21:54 +08:00
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
2015-08-25 14:42:53 +08:00
* Helper function that accumulates the nsecs greater than a second
2012-07-13 13:21:54 +08:00
* from the tkr_mono . xtime_nsec field to the xtime_sec field .
* It also calls into the NTP code to handle leapsecond processing .
*
* @tk: timekeeper whose xtime_sec and tkr_mono . xtime_nsec are updated
*
* Returns TK_CLOCK_WAS_SET if at least one leap second adjustment was
* applied while accumulating , otherwise 0 .
*/
2013-06-27 18:35:46 +08:00
static inline unsigned int accumulate_nsecs_to_secs ( struct timekeeper * tk )
2012-07-13 13:21:54 +08:00
{
/* One second expressed in the same shifted-nanosecond unit as xtime_nsec */
2015-03-19 17:09:06 +08:00
u64 nsecps = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2012-07-13 13:21:54 +08:00
/* Move whole seconds out of xtime_nsec, one second per iteration */
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = nsecps ) {
2012-07-13 13:21:54 +08:00
int leap ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec - = nsecps ;
2012-07-13 13:21:54 +08:00
tk - > xtime_sec + + ;
2018-03-10 02:42:48 +08:00
/*
* Skip NTP update if this second was accumulated before ,
* i . e . xtime_nsec underflowed in timekeeping_adjust ( ) .
* The flag is cleared here , so only one update is skipped .
*/
if ( unlikely ( tk - > skip_second_overflow ) ) {
tk - > skip_second_overflow = 0 ;
continue ;
}
2012-07-13 13:21:54 +08:00
/*
* Figure out if it is a leap second and apply it if needed .
* A non - zero return from second_overflow ( ) is added to
* xtime_sec , subtracted from wall_to_monotonic , and
* subtracted from the TAI offset .
*/
leap = second_overflow ( tk - > xtime_sec ) ;
2012-07-28 02:48:12 +08:00
if ( unlikely ( leap ) ) {
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2012-07-28 02:48:12 +08:00
tk - > xtime_sec + = leap ;
2012-07-13 13:21:54 +08:00
2012-07-28 02:48:12 +08:00
ts . tv_sec = leap ;
ts . tv_nsec = 0 ;
tk_set_wall_to_mono ( tk ,
2014-07-17 05:04:01 +08:00
timespec64_sub ( tk - > wall_to_monotonic , ts ) ) ;
2012-07-28 02:48:12 +08:00
2012-05-04 03:30:07 +08:00
__timekeeping_set_tai_offset ( tk , tk - > tai_offset - leap ) ;
/* Report to the caller that the wall clock was stepped */
2013-12-12 12:07:49 +08:00
clock_set = TK_CLOCK_WAS_SET ;
2012-07-28 02:48:12 +08:00
}
2012-07-13 13:21:54 +08:00
}
2013-12-12 12:07:49 +08:00
return clock_set ;
2012-07-13 13:21:54 +08:00
}
2020-11-13 15:24:33 +08:00
/*
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* logarithmic_accumulation - shifted accumulation of cycles
*
* This function accumulates a shifted interval of cycles into
2020-08-07 11:32:48 +08:00
* a shifted interval nanoseconds . Allows for O ( log ) accumulation
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* loop .
*
* Returns the unconsumed cycles .
*/
2016-12-22 03:32:01 +08:00
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
				    u32 shift, unsigned int *clock_set)
{
	/* One chunk is cycle_interval scaled up by 2^shift */
	const u64 chunk = tk->cycle_interval << shift;
	u64 shifted_sec;

	/* Fewer cycles pending than one chunk: leave everything untouched */
	if (offset < chunk)
		return offset;

	/* Consume the chunk and move both cycle_last markers past it */
	offset -= chunk;
	tk->tkr_mono.cycle_last += chunk;
	tk->tkr_raw.cycle_last += chunk;

	/*
	 * Add the equally scaled xtime_interval to the monotonic base and
	 * OR the result of accumulate_nsecs_to_secs() into *clock_set.
	 */
	tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
	*clock_set |= accumulate_nsecs_to_secs(tk);

	/*
	 * Raw clock: add the scaled raw_interval, then carry every full
	 * second (NSEC_PER_SEC in shifted units) over into raw_sec.
	 */
	tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
	shifted_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
	for (; tk->tkr_raw.xtime_nsec >= shifted_sec; tk->raw_sec++)
		tk->tkr_raw.xtime_nsec -= shifted_sec;

	/*
	 * Track the error between NTP and the clock interval: credit the
	 * scaled NTP tick, debit what the clock actually advanced by.
	 */
	tk->ntp_error += tk->ntp_tick << shift;
	tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
			 (tk->ntp_error_shift + shift);

	/* Hand the not yet consumed cycles back to the caller */
	return offset;
}
2018-06-04 21:34:21 +08:00
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
2007-05-08 15:27:59 +08:00
*/
2021-07-13 21:39:52 +08:00
static bool timekeeping_advance ( enum timekeeping_adv_mode mode )
2007-05-08 15:27:59 +08:00
{
2024-10-09 16:29:00 +08:00
struct timekeeper * tk = & tk_core . shadow_timekeeper ;
2014-07-17 05:04:07 +08:00
struct timekeeper * real_tk = & tk_core . timekeeper ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2024-10-09 16:28:58 +08:00
int shift = 0 , maxshift ;
u64 offset ;
2011-11-15 04:48:10 +08:00
2024-10-09 16:29:02 +08:00
guard ( raw_spinlock_irqsave ) ( & tk_core . lock ) ;
2007-05-08 15:27:59 +08:00
/* Make sure we're fully resumed: */
if ( unlikely ( timekeeping_suspended ) )
2024-10-09 16:28:58 +08:00
return false ;
2007-05-08 15:27:59 +08:00
2017-06-09 07:44:20 +08:00
offset = clocksource_delta ( tk_clock_read ( & tk - > tkr_mono ) ,
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
2007-05-08 15:27:59 +08:00
2012-08-22 08:30:49 +08:00
/* Check if there's really nothing to do */
2018-06-04 21:34:21 +08:00
if ( offset < real_tk - > cycle_interval & & mode = = TK_ADV_TICK )
2024-10-09 16:28:58 +08:00
return false ;
2012-08-22 08:30:49 +08:00
2015-03-12 12:16:32 +08:00
/* Do some additional sanity checking */
2017-06-28 21:21:35 +08:00
timekeeping_check_update ( tk , offset ) ;
2015-03-12 12:16:32 +08:00
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* ( think " ticks " ) worth of time at once . To do this efficiently ,
* we calculate the largest doubling multiple of cycle_intervals
2012-03-15 11:28:56 +08:00
* that is smaller than the offset . We then accumulate that
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* chunk in one go , and then try to consume the next smaller
* doubled multiple .
2007-05-08 15:27:59 +08:00
*/
2012-07-28 02:48:13 +08:00
shift = ilog2 ( offset ) - ilog2 ( tk - > cycle_interval ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = max ( 0 , shift ) ;
2012-03-15 11:28:56 +08:00
/* Bound shift to one less than what overflows tick_length */
2011-11-15 05:18:07 +08:00
maxshift = ( 64 - ( ilog2 ( ntp_tick_length ( ) ) + 1 ) ) - 1 ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = min ( shift , maxshift ) ;
2012-07-28 02:48:13 +08:00
while ( offset > = tk - > cycle_interval ) {
2024-10-09 16:28:58 +08:00
offset = logarithmic_accumulation ( tk , offset , shift , & clock_set ) ;
2012-07-28 02:48:13 +08:00
if ( offset < tk - > cycle_interval < < shift )
2010-03-19 05:47:30 +08:00
shift - - ;
2007-05-08 15:27:59 +08:00
}
2018-03-10 02:42:48 +08:00
/* Adjust the multiplier to correct NTP error */
2012-07-28 02:48:13 +08:00
timekeeping_adjust ( tk , offset ) ;
2007-05-08 15:27:59 +08:00
2010-04-07 05:30:51 +08:00
/*
* Finally , make sure that after the rounding
2012-07-13 13:21:53 +08:00
* xtime_nsec isn ' t larger than NSEC_PER_SEC
2010-04-07 05:30:51 +08:00
*/
2013-12-12 12:07:49 +08:00
clock_set | = accumulate_nsecs_to_secs ( tk ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2024-10-09 16:29:06 +08:00
timekeeping_update_from_shadow ( & tk_core , clock_set ) ;
2021-07-13 21:39:52 +08:00
return ! ! clock_set ;
2007-05-08 15:27:59 +08:00
}
2007-07-16 14:39:41 +08:00
2018-06-04 21:34:21 +08:00
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
*/
void update_wall_time ( void )
{
2021-07-13 21:39:52 +08:00
if ( timekeeping_advance ( TK_ADV_TICK ) )
clock_was_set_delayed ( ) ;
2018-06-04 21:34:21 +08:00
}
2007-07-16 14:39:41 +08:00
/**
2014-12-09 04:00:09 +08:00
* getboottime64 - Return the real time of system boot .
* @ ts : pointer to the timespec64 to be set
2007-07-16 14:39:41 +08:00
*
2014-12-09 04:00:09 +08:00
* Returns the wall - time of boot in a timespec64 .
2007-07-16 14:39:41 +08:00
*
* This is based on the wall_to_monotonic offset and the total suspend
* time . Calls to settimeofday will affect the value returned ( which
* basically means that however wrong your real time clock is at boot time ,
* you get the right time here ) .
*/
2014-12-09 04:00:09 +08:00
void getboottime64 ( struct timespec64 * ts )
2007-07-16 14:39:41 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2018-04-25 21:33:38 +08:00
ktime_t t = ktime_sub ( tk - > offs_real , tk - > offs_boot ) ;
2014-07-17 05:04:58 +08:00
2014-12-09 04:00:09 +08:00
* ts = ktime_to_timespec64 ( t ) ;
2007-07-16 14:39:41 +08:00
}
2014-12-09 04:00:09 +08:00
EXPORT_SYMBOL_GPL ( getboottime64 ) ;
2007-07-16 14:39:41 +08:00
2018-04-27 21:40:14 +08:00
void ktime_get_coarse_real_ts64 ( struct timespec64 * ts )
2007-07-25 08:47:43 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2007-07-25 08:47:43 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2018-04-27 21:40:14 +08:00
* ts = tk_xtime ( tk ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-07-25 08:47:43 +08:00
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_coarse_real_ts64 ) ;
2009-08-20 10:13:34 +08:00
timekeeping: Add interfaces for handling timestamps with a floor value
Multigrain timestamps allow the kernel to use fine-grained timestamps when
an inode's attributes are being actively observed via ->getattr(). With
this support, it's possible for a file to get a fine-grained timestamp, and
another modified after it to get a coarse-grained stamp that is earlier
than the fine-grained time. If this happens then the files can appear to
have been modified in reverse order, which breaks VFS ordering guarantees
[1].
To prevent this, maintain a floor value for multigrain timestamps.
Whenever a fine-grained timestamp is handed out, record it, and when later
coarse-grained stamps are handed out, ensure they are not earlier than that
value. If the coarse-grained timestamp is earlier than the fine-grained
floor, return the floor value instead.
Add a static singleton atomic64_t into timekeeper.c that is used to keep
track of the latest fine-grained time ever handed out. This is tracked as a
monotonic ktime_t value to ensure that it isn't affected by clock
jumps. Because it is updated at different times than the rest of the
timekeeper object, the floor value is managed independently of the
timekeeper via a cmpxchg() operation, and sits on its own cacheline.
Add two new public interfaces:
- ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
coarse-grained clock and the floor time
- ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
to swap it into the floor. A timespec64 is filled with the result.
The floor value is global and updated via a single try_cmpxchg(). If
that fails then the operation raced with a concurrent update. Any
concurrent update must be later than the existing floor value, so any
racing tasks can accept any resulting floor value without retrying.
[1]: POSIX requires that files be stamped with realtime clock values, and
makes no provision for dealing with backward clock jumps. If a backward
realtime clock jump occurs, then files can appear to have been modified
in reverse order.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org
2024-10-03 05:27:16 +08:00
/**
* ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
* @ ts : timespec64 to be filled
*
* Fetch the global mg_floor value , convert it to realtime and compare it
* to the current coarse - grained time . Fill @ ts with whichever is
* latest . Note that this is a filesystem - specific interface and should be
* avoided outside of that context .
*/
void ktime_get_coarse_real_ts64_mg ( struct timespec64 * ts )
{
struct timekeeper * tk = & tk_core . timekeeper ;
u64 floor = atomic64_read ( & mg_floor ) ;
ktime_t f_real , offset , coarse ;
unsigned int seq ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
* ts = tk_xtime ( tk ) ;
offset = tk_core . timekeeper . offs_real ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
coarse = timespec64_to_ktime ( * ts ) ;
f_real = ktime_add ( floor , offset ) ;
if ( ktime_after ( f_real , coarse ) )
* ts = ktime_to_timespec64 ( f_real ) ;
}
/**
* ktime_get_real_ts64_mg - attempt to update floor value and return result
* @ ts : pointer to the timespec to be set
*
* Get a monotonic fine - grained time value and attempt to swap it into
* mg_floor . If that succeeds then accept the new floor value . If it fails
* then another task raced in during the interim time and updated the
* floor . Since any update to the floor must be later than the previous
* floor , either outcome is acceptable .
*
* Typically this will be called after calling ktime_get_coarse_real_ts64_mg ( ) ,
* and determining that the resulting coarse - grained timestamp did not effect
* a change in ctime . Any more recent floor value would effect a change to
* ctime , so there is no need to retry the atomic64_try_cmpxchg ( ) on failure .
*
* @ ts will be filled with the latest floor value , regardless of the outcome of
* the cmpxchg . Note that this is a filesystem specific interface and should be
* avoided outside of that context .
*/
void ktime_get_real_ts64_mg ( struct timespec64 * ts )
{
struct timekeeper * tk = & tk_core . timekeeper ;
ktime_t old = atomic64_read ( & mg_floor ) ;
ktime_t offset , mono ;
unsigned int seq ;
u64 nsecs ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
ts - > tv_sec = tk - > xtime_sec ;
mono = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
offset = tk_core . timekeeper . offs_real ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
mono = ktime_add_ns ( mono , nsecs ) ;
/*
* Attempt to update the floor with the new time value . As any
* update must be later then the existing floor , and would effect
* a change to ctime from the perspective of the current task ,
* accept the resulting floor value regardless of the outcome of
* the swap .
*/
if ( atomic64_try_cmpxchg ( & mg_floor , & old , mono ) ) {
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsecs ) ;
2024-10-03 05:27:17 +08:00
timekeeping_inc_mg_floor_swaps ( ) ;
timekeeping: Add interfaces for handling timestamps with a floor value
Multigrain timestamps allow the kernel to use fine-grained timestamps when
an inode's attributes is being actively observed via ->getattr(). With
this support, it's possible for a file to get a fine-grained timestamp, and
another modified after it to get a coarse-grained stamp that is earlier
than the fine-grained time. If this happens then the files can appear to
have been modified in reverse order, which breaks VFS ordering guarantees
[1].
To prevent this, maintain a floor value for multigrain timestamps.
Whenever a fine-grained timestamp is handed out, record it, and when later
coarse-grained stamps are handed out, ensure they are not earlier than that
value. If the coarse-grained timestamp is earlier than the fine-grained
floor, return the floor value instead.
Add a static singleton atomic64_t into timekeeper.c that is used to keep
track of the latest fine-grained time ever handed out. This is tracked as a
monotonic ktime_t value to ensure that it isn't affected by clock
jumps. Because it is updated at different times than the rest of the
timekeeper object, the floor value is managed independently of the
timekeeper via a cmpxchg() operation, and sits on its own cacheline.
Add two new public interfaces:
- ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the
coarse-grained clock and the floor time
- ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries
to swap it into the floor. A timespec64 is filled with the result.
The floor value is global and updated via a single try_cmpxchg(). If
that fails then the operation raced with a concurrent update. Any
concurrent update must be later than the existing floor value, so any
racing tasks can accept any resulting floor value without retrying.
[1]: POSIX requires that files be stamped with realtime clock values, and
makes no provision for dealing with backward clock jumps. If a backward
realtime clock jump occurs, then files can appear to have been modified
in reverse order.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # documentation bits
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org
2024-10-03 05:27:16 +08:00
} else {
/*
* Another task changed mg_floor since " old " was fetched .
* " old " has been updated with the latest value of " mg_floor " .
* That value is newer than the previous floor value , which
* is enough to effect a change to ctime . Accept it .
*/
* ts = ktime_to_timespec64 ( ktime_add ( old , offset ) ) ;
}
}
2018-04-27 21:40:14 +08:00
void ktime_get_coarse_ts64 ( struct timespec64 * ts )
2009-08-20 10:13:34 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:01 +08:00
struct timespec64 now , mono ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2009-08-20 10:13:34 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2012-07-28 02:48:13 +08:00
now = tk_xtime ( tk ) ;
mono = tk - > wall_to_monotonic ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-08-20 10:13:34 +08:00
2018-04-27 21:40:14 +08:00
set_normalized_timespec64 ( ts , now . tv_sec + mono . tv_sec ,
2009-08-20 10:13:34 +08:00
now . tv_nsec + mono . tv_nsec ) ;
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_coarse_ts64 ) ;
2011-01-27 22:58:55 +08:00
/*
 * do_timer - Account for @ticks elapsed ticks
 *
 * Adds @ticks to jiffies_64 and then calls calc_global_load().
 *
 * Must hold jiffies_lock
 */
void do_timer(unsigned long ticks)
{
	/* Advance the 64-bit jiffies counter before the load calculation. */
	jiffies_64 += ticks;

	calc_global_load();
}
2011-01-27 22:59:05 +08:00
2012-07-11 06:43:24 +08:00
/**
2014-07-17 05:03:52 +08:00
* ktime_get_update_offsets_now - hrtimer helper
2015-04-15 05:08:37 +08:00
* @ cwsseq : pointer to check and store the clock was set sequence number
2012-07-11 06:43:24 +08:00
* @ offs_real : pointer to storage for monotonic - > realtime offset
2018-04-25 21:33:38 +08:00
* @ offs_boot : pointer to storage for monotonic - > boottime offset
2013-10-18 09:13:30 +08:00
* @ offs_tai : pointer to storage for monotonic - > clock tai offset
2012-07-11 06:43:24 +08:00
*
2015-04-15 05:08:37 +08:00
* Returns current monotonic time and updates the offsets if the
* sequence number in @ cwsseq and timekeeper . clock_was_set_seq are
* different .
*
2013-10-18 09:13:30 +08:00
* Called from hrtimer_interrupt ( ) or retrigger_next_event ( )
2012-07-11 06:43:24 +08:00
*/
2015-04-15 05:08:37 +08:00
ktime_t ktime_get_update_offsets_now ( unsigned int * cwsseq , ktime_t * offs_real ,
2018-04-25 21:33:38 +08:00
ktime_t * offs_boot , ktime_t * offs_tai )
2012-07-11 06:43:24 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-11 06:43:24 +08:00
unsigned int seq ;
2014-07-17 05:04:19 +08:00
ktime_t base ;
u64 nsecs ;
2012-07-11 06:43:24 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2012-07-11 06:43:24 +08:00
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
base = ktime_add_ns ( base , nsecs ) ;
2015-04-15 05:08:37 +08:00
if ( * cwsseq ! = tk - > clock_was_set_seq ) {
* cwsseq = tk - > clock_was_set_seq ;
* offs_real = tk - > offs_real ;
2018-04-25 21:33:38 +08:00
* offs_boot = tk - > offs_boot ;
2015-04-15 05:08:37 +08:00
* offs_tai = tk - > offs_tai ;
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Handle leapsecond insertion adjustments */
2016-12-25 18:38:40 +08:00
if ( unlikely ( base > = tk - > next_leap_ktime ) )
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
* offs_real = ktime_sub ( tk - > offs_real , ktime_set ( 1 , 0 ) ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2012-07-11 06:43:24 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
return base ;
2012-07-11 06:43:24 +08:00
}
2020-11-13 15:24:33 +08:00
/*
2017-10-19 19:14:45 +08:00
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
2017-10-19 19:14:44 +08:00
*/
2018-07-03 13:44:21 +08:00
static int timekeeping_validate_timex ( const struct __kernel_timex * txc )
2017-10-19 19:14:44 +08:00
{
if ( txc - > modes & ADJ_ADJTIME ) {
/* singleshot must not be used with any other mode bits */
if ( ! ( txc - > modes & ADJ_OFFSET_SINGLESHOT ) )
return - EINVAL ;
if ( ! ( txc - > modes & ADJ_OFFSET_READONLY ) & &
! capable ( CAP_SYS_TIME ) )
return - EPERM ;
} else {
/* In order to modify anything, you gotta be super-user! */
if ( txc - > modes & & ! capable ( CAP_SYS_TIME ) )
return - EPERM ;
/*
* if the quartz is off by more than 10 % then
* something is VERY wrong !
*/
if ( txc - > modes & ADJ_TICK & &
( txc - > tick < 900000 / USER_HZ | |
txc - > tick > 1100000 / USER_HZ ) )
return - EINVAL ;
}
if ( txc - > modes & ADJ_SETOFFSET ) {
/* In order to inject time, you gotta be super-user! */
if ( ! capable ( CAP_SYS_TIME ) )
return - EPERM ;
2017-10-19 19:14:45 +08:00
/*
* Validate if a timespec / timeval used to inject a time
2021-03-23 05:39:03 +08:00
* offset is valid . Offsets can be positive or negative , so
2017-10-19 19:14:45 +08:00
* we don ' t check tv_sec . The value of the timeval / timespec
* is the sum of its fields , but * NOTE * :
* The field tv_usec / tv_nsec must always be non - negative and
* we can ' t have more nanoseconds / microseconds than a second .
*/
if ( txc - > time . tv_usec < 0 )
return - EINVAL ;
2017-10-19 19:14:44 +08:00
2017-10-19 19:14:45 +08:00
if ( txc - > modes & ADJ_NANO ) {
if ( txc - > time . tv_usec > = NSEC_PER_SEC )
2017-10-19 19:14:44 +08:00
return - EINVAL ;
} else {
2017-10-19 19:14:45 +08:00
if ( txc - > time . tv_usec > = USEC_PER_SEC )
2017-10-19 19:14:44 +08:00
return - EINVAL ;
}
}
/*
* Check for potential multiplication overflows that can
* only happen on 64 - bit systems :
*/
if ( ( txc - > modes & ADJ_FREQUENCY ) & & ( BITS_PER_LONG = = 64 ) ) {
if ( LLONG_MIN / PPM_SCALE > txc - > freq )
return - EINVAL ;
if ( LLONG_MAX / PPM_SCALE < txc - > freq )
return - EINVAL ;
}
return 0 ;
}
timekeeping: Add raw clock fallback for random_get_entropy()
The addition of random_get_entropy_fallback() provides access to
whichever time source has the highest frequency, which is useful for
gathering entropy on platforms without available cycle counters. It's
not necessarily as good as being able to quickly access a cycle counter
that the CPU has, but it's still something, even when it falls back to
being jiffies-based.
In the event that a given arch does not define get_cycles(), falling
back to the get_cycles() default implementation that returns 0 is really
not the best we can do. Instead, at least calling
random_get_entropy_fallback() would be preferable, because that always
needs to return _something_, even falling back to jiffies eventually.
It's not as though random_get_entropy_fallback() is super high precision
or guaranteed to be entropic, but basically anything that's not zero all
the time is better than returning zero all the time.
Finally, since random_get_entropy_fallback() is used during extremely
early boot when randomizing freelists in mm_init(), it can be called
before timekeeping has been initialized. In that case there really is
nothing we can do; jiffies hasn't even started ticking yet. So just give
up and return 0.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Theodore Ts'o <tytso@mit.edu>
2022-04-10 22:49:50 +08:00
/**
* random_get_entropy_fallback - Returns the raw clock source value ,
* used by random . c for platforms with no valid random_get_entropy ( ) .
*/
unsigned long random_get_entropy_fallback ( void )
{
struct tk_read_base * tkr = & tk_core . timekeeper . tkr_mono ;
struct clocksource * clock = READ_ONCE ( tkr - > clock ) ;
if ( unlikely ( timekeeping_suspended | | ! clock ) )
return 0 ;
return clock - > read ( clock ) ;
}
EXPORT_SYMBOL_GPL ( random_get_entropy_fallback ) ;
2017-10-19 19:14:44 +08:00
2013-03-23 02:31:29 +08:00
/**
* do_adjtimex ( ) - Accessor function to NTP __do_adjtimex function
2024-06-07 17:06:56 +08:00
* @ txc : Pointer to kernel_timex structure containing NTP parameters
2013-03-23 02:31:29 +08:00
*/
2018-07-03 13:44:21 +08:00
int do_adjtimex ( struct __kernel_timex * txc )
2013-03-23 02:31:29 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-04-10 17:14:20 +08:00
struct audit_ntp_data ad ;
ntp: Make sure RTC is synchronized when time goes backwards
sync_hw_clock() is normally called every 11 minutes when time is
synchronized. This issue is that this periodic timer uses the REALTIME
clock, so when time moves backwards (the NTP server jumps into the past),
the timer expires late.
If the timer expires late, which can be days later, the RTC will no longer
be updated, which is an issue if the device is abruptly powered OFF during
this period. When the device will restart (when powered ON), it will have
the date prior to the ADJ_SETOFFSET call.
A normal NTP server should not jump in the past like that, but it is
possible... Another way of reproducing this issue is to use phc2sys to
synchronize the REALTIME clock with, for example, an IRIG timecode with
the source always starting at the same date (not synchronized).
Also, if the time jump in the future by less than 11 minutes, the RTC may
not be updated immediately (minor issue). Consider the following scenario:
- Time is synchronized, and sync_hw_clock() was just called (the timer
expires in 11 minutes).
- A time jump is realized in the future by a couple of minutes.
- The time is synchronized again.
- Users may expect that RTC to be updated as soon as possible, and not
after 11 minutes (for the same reason, if a power loss occurs in this
period).
Cancel periodic timer on any time jump (ADJ_SETOFFSET) greater than or
equal to 1s. The timer will be relaunched at the end of do_adjtimex() if
NTP is still considered synced. Otherwise the timer will be relaunched
later when NTP is synced. This way, when the time is synchronized again,
the RTC is updated after less than 2 seconds.
Signed-off-by: Benjamin ROBIN <dev@benjarobin.fr>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240908140836.203911-1-dev@benjarobin.fr
2024-09-08 22:08:36 +08:00
bool offset_set = false ;
2021-07-13 21:39:52 +08:00
bool clock_set = false ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2021-07-13 21:39:52 +08:00
unsigned long flags ;
2013-04-11 03:41:49 +08:00
s32 orig_tai , tai ;
2013-03-23 03:08:52 +08:00
int ret ;
/* Validate the data before disabling interrupts */
2017-10-19 19:14:45 +08:00
ret = timekeeping_validate_timex ( txc ) ;
2013-03-23 03:08:52 +08:00
if ( ret )
return ret ;
2022-07-18 05:53:34 +08:00
add_device_randomness ( txc , sizeof ( * txc ) ) ;
2013-03-23 03:08:52 +08:00
2013-03-23 06:04:13 +08:00
if ( txc - > modes & ADJ_SETOFFSET ) {
2017-10-19 19:14:45 +08:00
struct timespec64 delta ;
2013-03-23 06:04:13 +08:00
delta . tv_sec = txc - > time . tv_sec ;
delta . tv_nsec = txc - > time . tv_usec ;
if ( ! ( txc - > modes & ADJ_NANO ) )
delta . tv_nsec * = 1000 ;
ret = timekeeping_inject_offset ( & delta ) ;
if ( ret )
return ret ;
2019-04-10 17:14:19 +08:00
ntp: Make sure RTC is synchronized when time goes backwards
sync_hw_clock() is normally called every 11 minutes when time is
synchronized. This issue is that this periodic timer uses the REALTIME
clock, so when time moves backwards (the NTP server jumps into the past),
the timer expires late.
If the timer expires late, which can be days later, the RTC will no longer
be updated, which is an issue if the device is abruptly powered OFF during
this period. When the device will restart (when powered ON), it will have
the date prior to the ADJ_SETOFFSET call.
A normal NTP server should not jump in the past like that, but it is
possible... Another way of reproducing this issue is to use phc2sys to
synchronize the REALTIME clock with, for example, an IRIG timecode with
the source always starting at the same date (not synchronized).
Also, if the time jump in the future by less than 11 minutes, the RTC may
not be updated immediately (minor issue). Consider the following scenario:
- Time is synchronized, and sync_hw_clock() was just called (the timer
expires in 11 minutes).
- A time jump is realized in the future by a couple of minutes.
- The time is synchronized again.
- Users may expect that RTC to be updated as soon as possible, and not
after 11 minutes (for the same reason, if a power loss occurs in this
period).
Cancel periodic timer on any time jump (ADJ_SETOFFSET) greater than or
equal to 1s. The timer will be relaunched at the end of do_adjtimex() if
NTP is still considered synced. Otherwise the timer will be relaunched
later when NTP is synced. This way, when the time is synchronized again,
the RTC is updated after less than 2 seconds.
Signed-off-by: Benjamin ROBIN <dev@benjarobin.fr>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240908140836.203911-1-dev@benjarobin.fr
2024-09-08 22:08:36 +08:00
offset_set = delta . tv_sec ! = 0 ;
2019-04-10 17:14:19 +08:00
audit_tk_injoffset ( delta ) ;
2013-03-23 06:04:13 +08:00
}
2019-04-10 17:14:20 +08:00
audit_ntp_init ( & ad ) ;
2018-06-18 22:08:01 +08:00
ktime_get_real_ts64 ( & ts ) ;
2022-07-18 05:53:34 +08:00
add_device_randomness ( & ts , sizeof ( ts ) ) ;
2013-03-23 03:28:15 +08:00
2024-10-09 16:29:02 +08:00
raw_spin_lock_irqsave ( & tk_core . lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
2013-04-11 03:41:49 +08:00
orig_tai = tai = tk - > tai_offset ;
2019-04-10 17:14:20 +08:00
ret = __do_adjtimex ( txc , & ts , & tai , & ad ) ;
2013-03-23 02:31:29 +08:00
2013-04-11 03:41:49 +08:00
if ( tai ! = orig_tai ) {
__timekeeping_set_tai_offset ( tk , tai ) ;
2024-10-09 16:29:05 +08:00
timekeeping_update ( & tk_core , tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2021-07-13 21:39:52 +08:00
clock_set = true ;
2024-10-09 16:28:56 +08:00
} else {
tk_update_leap_state ( tk ) ;
2013-04-11 03:41:49 +08:00
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2024-10-09 16:29:02 +08:00
raw_spin_unlock_irqrestore ( & tk_core . lock , flags ) ;
2013-03-23 02:37:28 +08:00
2019-04-10 17:14:20 +08:00
audit_ntp_log ( & ad ) ;
2018-06-04 21:34:21 +08:00
/* Update the multiplier immediately if frequency was set directly */
if ( txc - > modes & ( ADJ_FREQUENCY | ADJ_TICK ) )
2021-07-13 21:39:52 +08:00
clock_set | = timekeeping_advance ( TK_ADV_FREQ ) ;
2018-06-04 21:34:21 +08:00
2021-07-13 21:39:52 +08:00
if ( clock_set )
2024-08-03 23:07:51 +08:00
clock_was_set ( CLOCK_SET_WALL ) ;
timekeeping: Avoid possible deadlock from clock_was_set_delayed
As part of normal operaions, the hrtimer subsystem frequently calls
into the timekeeping code, creating a locking order of
hrtimer locks -> timekeeping locks
clock_was_set_delayed() was suppoed to allow us to avoid deadlocks
between the timekeeping the hrtimer subsystem, so that we could
notify the hrtimer subsytem the time had changed while holding
the timekeeping locks. This was done by scheduling delayed work
that would run later once we were out of the timekeeing code.
But unfortunately the lock chains are complex enoguh that in
scheduling delayed work, we end up eventually trying to grab
an hrtimer lock.
Sasha Levin noticed this in testing when the new seqlock lockdep
enablement triggered the following (somewhat abrieviated) message:
[ 251.100221] ======================================================
[ 251.100221] [ INFO: possible circular locking dependency detected ]
[ 251.100221] 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053 Not tainted
[ 251.101967] -------------------------------------------------------
[ 251.101967] kworker/10:1/4506 is trying to acquire lock:
[ 251.101967] (timekeeper_seq){----..}, at: [<ffffffff81160e96>] retrigger_next_event+0x56/0x70
[ 251.101967]
[ 251.101967] but task is already holding lock:
[ 251.101967] (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] which lock already depends on the new lock.
[ 251.101967]
[ 251.101967]
[ 251.101967] the existing dependency chain (in reverse order) is:
[ 251.101967]
-> #5 (hrtimer_bases.lock#11){-.-...}:
[snipped]
-> #4 (&rt_b->rt_runtime_lock){-.-...}:
[snipped]
-> #3 (&rq->lock){-.-.-.}:
[snipped]
-> #2 (&p->pi_lock){-.-.-.}:
[snipped]
-> #1 (&(&pool->lock)->rlock){-.-...}:
[ 251.101967] [<ffffffff81194803>] validate_chain+0x6c3/0x7b0
[ 251.101967] [<ffffffff81194d9d>] __lock_acquire+0x4ad/0x580
[ 251.101967] [<ffffffff81194ff2>] lock_acquire+0x182/0x1d0
[ 251.101967] [<ffffffff84398500>] _raw_spin_lock+0x40/0x80
[ 251.101967] [<ffffffff81153e69>] __queue_work+0x1a9/0x3f0
[ 251.101967] [<ffffffff81154168>] queue_work_on+0x98/0x120
[ 251.101967] [<ffffffff81161351>] clock_was_set_delayed+0x21/0x30
[ 251.101967] [<ffffffff811c4bd1>] do_adjtimex+0x111/0x160
[ 251.101967] [<ffffffff811e2711>] compat_sys_adjtimex+0x41/0x70
[ 251.101967] [<ffffffff843a4b49>] ia32_sysret+0x0/0x5
[ 251.101967]
-> #0 (timekeeper_seq){----..}:
[snipped]
[ 251.101967] other info that might help us debug this:
[ 251.101967]
[ 251.101967] Chain exists of:
timekeeper_seq --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock#11
[ 251.101967] Possible unsafe locking scenario:
[ 251.101967]
[ 251.101967] CPU0 CPU1
[ 251.101967] ---- ----
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(&rt_b->rt_runtime_lock);
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(timekeeper_seq);
[ 251.101967]
[ 251.101967] *** DEADLOCK ***
[ 251.101967]
[ 251.101967] 3 locks held by kworker/10:1/4506:
[ 251.101967] #0: (events){.+.+.+}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #1: (hrtimer_work){+.+...}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #2: (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] stack backtrace:
[ 251.101967] CPU: 10 PID: 4506 Comm: kworker/10:1 Not tainted 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053
[ 251.101967] Workqueue: events clock_was_set_work
So the best solution is to avoid calling clock_was_set_delayed() while
holding the timekeeping lock, and instead using a flag variable to
decide if we should call clock_was_set() once we've released the locks.
This works for the case here, where the do_adjtimex() was the deadlock
trigger point. Unfortuantely, in update_wall_time() we still hold
the jiffies lock, which would deadlock with the ipi triggered by
clock_was_set(), preventing us from calling it even after we drop the
timekeeping lock. So instead call clock_was_set_delayed() at that point.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: stable <stable@vger.kernel.org> #3.10+
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-11 09:18:18 +08:00
ntp: Make sure RTC is synchronized when time goes backwards
sync_hw_clock() is normally called every 11 minutes when time is
synchronized. This issue is that this periodic timer uses the REALTIME
clock, so when time moves backwards (the NTP server jumps into the past),
the timer expires late.
If the timer expires late, which can be days later, the RTC will no longer
be updated, which is an issue if the device is abruptly powered OFF during
this period. When the device will restart (when powered ON), it will have
the date prior to the ADJ_SETOFFSET call.
A normal NTP server should not jump in the past like that, but it is
possible... Another way of reproducing this issue is to use phc2sys to
synchronize the REALTIME clock with, for example, an IRIG timecode with
the source always starting at the same date (not synchronized).
Also, if the time jump in the future by less than 11 minutes, the RTC may
not be updated immediately (minor issue). Consider the following scenario:
- Time is synchronized, and sync_hw_clock() was just called (the timer
expires in 11 minutes).
- A time jump is realized in the future by a couple of minutes.
- The time is synchronized again.
- Users may expect that RTC to be updated as soon as possible, and not
after 11 minutes (for the same reason, if a power loss occurs in this
period).
Cancel periodic timer on any time jump (ADJ_SETOFFSET) greater than or
equal to 1s. The timer will be relaunched at the end of do_adjtimex() if
NTP is still considered synced. Otherwise the timer will be relaunched
later when NTP is synced. This way, when the time is synchronized again,
the RTC is updated after less than 2 seconds.
Signed-off-by: Benjamin ROBIN <dev@benjarobin.fr>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240908140836.203911-1-dev@benjarobin.fr
2024-09-08 22:08:36 +08:00
ntp_notify_cmos_timer ( offset_set ) ;
2013-09-12 07:50:56 +08:00
2013-03-23 03:28:15 +08:00
return ret ;
}
2013-03-23 02:31:29 +08:00
# ifdef CONFIG_NTP_PPS
/**
* hardpps ( ) - Accessor function to NTP __hardpps function
2024-06-07 17:06:56 +08:00
* @ phase_ts : Pointer to timespec64 structure representing phase timestamp
* @ raw_ts : Pointer to timespec64 structure representing raw timestamp
2013-03-23 02:31:29 +08:00
*/
2015-09-29 04:21:28 +08:00
void hardpps ( const struct timespec64 * phase_ts , const struct timespec64 * raw_ts )
2013-03-23 02:31:29 +08:00
{
2024-10-09 16:29:02 +08:00
guard ( raw_spinlock_irqsave ) ( & tk_core . lock ) ;
2013-03-23 02:31:29 +08:00
__hardpps ( phase_ts , raw_ts ) ;
}
EXPORT_SYMBOL ( hardpps ) ;
2017-09-09 07:17:19 +08:00
# endif /* CONFIG_NTP_PPS */