2018-11-01 02:21:09 +08:00
// SPDX-License-Identifier: GPL-2.0
2007-05-08 15:27:59 +08:00
/*
2018-11-01 02:21:08 +08:00
* Kernel timekeeping code and accessor functions. Based on code from
* timer.c, moved in commit 8524070b7982.
2007-05-08 15:27:59 +08:00
*/
2012-09-05 03:12:07 +08:00
# include <linux/timekeeper_internal.h>
2007-05-08 15:27:59 +08:00
# include <linux/module.h>
# include <linux/interrupt.h>
# include <linux/percpu.h>
# include <linux/init.h>
# include <linux/mm.h>
2017-02-09 01:51:31 +08:00
# include <linux/nmi.h>
2009-10-07 21:09:06 +08:00
# include <linux/sched.h>
2017-02-08 15:45:17 +08:00
# include <linux/sched/loadavg.h>
2018-07-20 04:55:34 +08:00
# include <linux/sched/clock.h>
2011-03-24 05:16:04 +08:00
# include <linux/syscore_ops.h>
2007-05-08 15:27:59 +08:00
# include <linux/clocksource.h>
# include <linux/jiffies.h>
# include <linux/time.h>
timekeeping: Add raw clock fallback for random_get_entropy()
The addition of random_get_entropy_fallback() provides access to
whichever time source has the highest frequency, which is useful for
gathering entropy on platforms without available cycle counters. It's
not necessarily as good as being able to quickly access a cycle counter
that the CPU has, but it's still something, even when it falls back to
being jiffies-based.
In the event that a given arch does not define get_cycles(), falling
back to the get_cycles() default implementation that returns 0 is really
not the best we can do. Instead, at least calling
random_get_entropy_fallback() would be preferable, because that always
needs to return _something_, even falling back to jiffies eventually.
It's not as though random_get_entropy_fallback() is super high precision
or guaranteed to be entropic, but basically anything that's not zero all
the time is better than returning zero all the time.
Finally, since random_get_entropy_fallback() is used during extremely
early boot when randomizing freelists in mm_init(), it can be called
before timekeeping has been initialized. In that case there really is
nothing we can do; jiffies hasn't even started ticking yet. So just give
up and return 0.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Theodore Ts'o <tytso@mit.edu>
2022-04-10 22:49:50 +08:00
# include <linux/timex.h>
2007-05-08 15:27:59 +08:00
# include <linux/tick.h>
2009-08-14 21:47:30 +08:00
# include <linux/stop_machine.h>
2012-11-28 09:28:59 +08:00
# include <linux/pvclock_gtod.h>
2014-04-08 06:39:20 +08:00
# include <linux/compiler.h>
2019-04-10 17:14:19 +08:00
# include <linux/audit.h>
2022-07-18 05:53:34 +08:00
# include <linux/random.h>
2007-05-08 15:27:59 +08:00
2013-02-22 06:51:36 +08:00
# include "tick-internal.h"
2013-03-23 02:31:29 +08:00
# include "ntp_internal.h"
2013-05-22 13:32:14 +08:00
# include "timekeeping_internal.h"
2009-08-14 21:47:26 +08:00
2013-06-27 18:35:45 +08:00
/*
 * Single-bit flags that can be OR-ed together.
 * NOTE(review): the consumers of these flags are outside this view.
 */
#define TK_CLEAR_NTP		(1 << 0)
#define TK_MIRROR		(1 << 1)
#define TK_CLOCK_WAS_SET	(1 << 2)
2013-06-27 18:35:45 +08:00
2018-06-04 21:34:21 +08:00
/* Reason for which the timekeeper is being updated. */
enum timekeeping_adv_mode {
	TK_ADV_TICK,	/* a tick has passed */
	TK_ADV_FREQ,	/* the frequency was changed directly */
};
2020-08-15 05:26:08 +08:00
/* Raw spinlock that the tk_core.seq sequence counter is associated with. */
DEFINE_RAW_SPINLOCK(timekeeper_lock);
2020-07-20 23:55:23 +08:00
2014-07-17 05:04:07 +08:00
/*
* The most important data for readout fits into a single 64 byte
* cache line .
*/
static struct {
2020-07-20 23:55:23 +08:00
seqcount_raw_spinlock_t seq ;
2014-07-17 05:04:07 +08:00
struct timekeeper timekeeper ;
2018-11-29 07:43:09 +08:00
} tk_core ____cacheline_aligned = {
2020-07-20 23:55:23 +08:00
. seq = SEQCNT_RAW_SPINLOCK_ZERO ( tk_core . seq , & timekeeper_lock ) ,
2018-11-29 07:43:09 +08:00
} ;
2014-07-17 05:04:07 +08:00
2013-02-22 06:51:40 +08:00
/*
 * Second timekeeper instance.
 * NOTE(review): its users are outside this view; presumably a working copy
 * of tk_core.timekeeper - confirm.
 */
static struct timekeeper shadow_timekeeper;
2009-08-14 21:47:26 +08:00
2020-08-14 18:19:34 +08:00
/* Non-zero while timekeeping is suspended. */
int __read_mostly timekeeping_suspended;
2014-07-17 05:05:23 +08:00
/**
* struct tk_fast - NMI safe timekeeper
* @ seq : Sequence counter for protecting updates . The lowest bit
* is the index for the tk_read_base array
* @ base : tk_read_base array . Access is indexed by the lowest bit of
* @ seq .
*
* See @ update_fast_timekeeper ( ) below .
*/
struct tk_fast {
2020-08-27 19:40:41 +08:00
seqcount_latch_t seq ;
2014-07-17 05:05:23 +08:00
struct tk_read_base base [ 2 ] ;
} ;
2017-08-28 20:21:53 +08:00
/*
 * Cycle value reported by dummy_clock_read() for the halted fast
 * timekeeper while timekeeping is suspended.
 */
static u64 cycles_at_suspend;
static u64 dummy_clock_read ( struct clocksource * cs )
{
2020-08-14 18:19:34 +08:00
if ( timekeeping_suspended )
return cycles_at_suspend ;
return local_clock ( ) ;
2017-08-28 20:21:53 +08:00
}
/* Placeholder clocksource whose only populated member is the read callback. */
static struct clocksource dummy_clock = {
	.read	= dummy_clock_read,
};
2020-08-14 18:19:34 +08:00
/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed then
 * the fast time keepers are updated with the correct values.
 */
#define FAST_TK_INIT						\
	{							\
		.clock		= &dummy_clock,			\
		.mask		= CLOCKSOURCE_MASK(64),		\
		.mult		= 1,				\
		.shift		= 0,				\
	}
2017-08-28 20:21:53 +08:00
/* Fast timekeeper read by ktime_get_mono_fast_ns(); both slots start on the dummy clock. */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
	.seq	= SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
	.base	= {
		[0] = FAST_TK_INIT,
		[1] = FAST_TK_INIT,
	},
};
/* Fast timekeeper read by ktime_get_raw_fast_ns(); both slots start on the dummy clock. */
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
	.seq	= SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
	.base	= {
		[0] = FAST_TK_INIT,
		[1] = FAST_TK_INIT,
	},
};
2014-07-17 05:05:23 +08:00
2012-07-13 13:21:53 +08:00
static inline void tk_normalize_xtime ( struct timekeeper * tk )
{
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = ( ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ) ) {
tk - > tkr_mono . xtime_nsec - = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
tk - > xtime_sec + + ;
}
2017-05-23 08:20:20 +08:00
while ( tk - > tkr_raw . xtime_nsec > = ( ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ) ) {
tk - > tkr_raw . xtime_nsec - = ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ;
tk - > raw_sec + + ;
}
2012-07-13 13:21:53 +08:00
}
2018-07-13 20:06:42 +08:00
static inline struct timespec64 tk_xtime ( const struct timekeeper * tk )
2014-07-17 05:04:05 +08:00
{
struct timespec64 ts ;
ts . tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
ts . tv_nsec = ( long ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-07-17 05:04:05 +08:00
return ts ;
}
2014-07-17 05:04:01 +08:00
static void tk_set_xtime ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-07-13 13:21:53 +08:00
}
2014-07-17 05:04:01 +08:00
static void tk_xtime_add ( struct timekeeper * tk , const struct timespec64 * ts )
2012-07-13 13:21:53 +08:00
{
tk - > xtime_sec + = ts - > tv_sec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) ts - > tv_nsec < < tk - > tkr_mono . shift ;
2012-08-22 08:30:46 +08:00
tk_normalize_xtime ( tk ) ;
2012-07-13 13:21:53 +08:00
}
2011-11-15 03:46:39 +08:00
2014-07-17 05:04:01 +08:00
static void tk_set_wall_to_mono ( struct timekeeper * tk , struct timespec64 wtm )
2012-07-28 02:48:12 +08:00
{
2014-07-17 05:04:01 +08:00
struct timespec64 tmp ;
2012-07-28 02:48:12 +08:00
/*
* Verify consistency of : offset_real = - wall_to_monotonic
* before modifying anything
*/
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - tk - > wall_to_monotonic . tv_sec ,
2012-07-28 02:48:12 +08:00
- tk - > wall_to_monotonic . tv_nsec ) ;
2016-12-25 18:38:40 +08:00
WARN_ON_ONCE ( tk - > offs_real ! = timespec64_to_ktime ( tmp ) ) ;
2012-07-28 02:48:12 +08:00
tk - > wall_to_monotonic = wtm ;
2014-07-17 05:04:01 +08:00
set_normalized_timespec64 ( & tmp , - wtm . tv_sec , - wtm . tv_nsec ) ;
tk - > offs_real = timespec64_to_ktime ( tmp ) ;
2013-12-11 09:13:35 +08:00
tk - > offs_tai = ktime_add ( tk - > offs_real , ktime_set ( tk - > tai_offset , 0 ) ) ;
2012-07-28 02:48:12 +08:00
}
2014-07-17 05:05:00 +08:00
/*
 * Advance the boot offset by @delta and refresh its timespec64 copy.
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	ktime_t boot_offset = ktime_add(tk->offs_boot, delta);

	tk->offs_boot = boot_offset;

	/*
	 * Keep a timespec representation for the VDSO update so that a
	 * 64 bit division is not needed on every update.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(boot_offset);
}
2017-06-09 07:44:20 +08:00
/*
* tk_clock_read - atomic clocksource read ( ) helper
*
* This helper is necessary to use in the read paths because , while the
2020-07-20 23:55:23 +08:00
* seqcount ensures we don ' t return a bad value while structures are updated ,
2017-06-09 07:44:20 +08:00
* it doesn ' t protect from potential crashes . There is the possibility that
* the tkr ' s clocksource may change between the read reference , and the
* clock reference passed to the read function . This can cause crashes if
* the wrong clocksource is passed to the wrong read function .
* This isn ' t necessary to use when holding the timekeeper_lock or doing
* a read of the fast - timekeeper tkrs ( which is protected by its own locking
* and update logic ) .
*/
2018-07-13 20:06:42 +08:00
static inline u64 tk_clock_read ( const struct tk_read_base * tkr )
2017-06-09 07:44:20 +08:00
{
struct clocksource * clock = READ_ONCE ( tkr - > clock ) ;
return clock - > read ( clock ) ;
}
2015-03-12 12:16:32 +08:00
# ifdef CONFIG_DEBUG_TIMEKEEPING
2015-03-12 12:16:35 +08:00
# define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
2016-12-22 03:32:01 +08:00
static void timekeeping_check_update ( struct timekeeper * tk , u64 offset )
2015-03-12 12:16:32 +08:00
{
2016-12-22 03:32:01 +08:00
u64 max_cycles = tk - > tkr_mono . clock - > max_cycles ;
2015-03-19 17:09:06 +08:00
const char * name = tk - > tkr_mono . clock - > name ;
2015-03-12 12:16:32 +08:00
if ( offset > max_cycles ) {
2015-03-12 12:16:33 +08:00
printk_deferred ( " WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger \n " ,
2015-03-12 12:16:32 +08:00
offset , name , max_cycles ) ;
2015-03-12 12:16:33 +08:00
printk_deferred ( " timekeeping: Your kernel is sick, but tries to cope by capping time updates \n " ) ;
2015-03-12 12:16:32 +08:00
} else {
if ( offset > ( max_cycles > > 1 ) ) {
2015-12-13 14:26:11 +08:00
printk_deferred ( " INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld) \n " ,
2015-03-12 12:16:32 +08:00
offset , name , max_cycles > > 1 ) ;
printk_deferred ( " timekeeping: Your kernel is still fine, but is feeling a bit nervous \n " ) ;
}
}
2015-03-12 12:16:35 +08:00
2015-05-14 07:04:47 +08:00
if ( tk - > underflow_seen ) {
if ( jiffies - tk - > last_warning > WARNING_FREQ ) {
2015-03-12 12:16:35 +08:00
printk_deferred ( " WARNING: Underflow in clocksource '%s' observed, time update ignored. \n " , name ) ;
printk_deferred ( " Please report this, consider using a different clocksource, if possible. \n " ) ;
printk_deferred ( " Your kernel is probably still fine. \n " ) ;
2015-05-14 07:04:47 +08:00
tk - > last_warning = jiffies ;
2015-03-12 12:16:35 +08:00
}
2015-05-14 07:04:47 +08:00
tk - > underflow_seen = 0 ;
2015-03-12 12:16:35 +08:00
}
2015-05-14 07:04:47 +08:00
if ( tk - > overflow_seen ) {
if ( jiffies - tk - > last_warning > WARNING_FREQ ) {
2015-03-12 12:16:35 +08:00
printk_deferred ( " WARNING: Overflow in clocksource '%s' observed, time update capped. \n " , name ) ;
printk_deferred ( " Please report this, consider using a different clocksource, if possible. \n " ) ;
printk_deferred ( " Your kernel is probably still fine. \n " ) ;
2015-05-14 07:04:47 +08:00
tk - > last_warning = jiffies ;
2015-03-12 12:16:35 +08:00
}
2015-05-14 07:04:47 +08:00
tk - > overflow_seen = 0 ;
2015-03-12 12:16:35 +08:00
}
2015-03-12 12:16:32 +08:00
}
2015-03-12 12:16:33 +08:00
2024-03-25 14:40:18 +08:00
static inline u64 timekeeping_cycles_to_ns ( const struct tk_read_base * tkr , u64 cycles ) ;
static inline u64 timekeeping_debug_get_ns ( const struct tk_read_base * tkr )
2015-03-12 12:16:33 +08:00
{
2015-05-14 07:04:47 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2016-12-22 03:32:01 +08:00
u64 now , last , mask , max , delta ;
2015-03-12 12:16:35 +08:00
unsigned int seq ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
/*
2020-07-20 23:55:23 +08:00
* Since we ' re called holding a seqcount , the data may shift
2015-03-12 12:16:35 +08:00
* under us while we ' re doing the calculation . This can cause
* false positives , since we ' d note a problem but throw the
2020-07-20 23:55:23 +08:00
* results away . So nest another seqcount here to atomically
2015-03-12 12:16:35 +08:00
* grab the points we are checking with .
*/
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( tkr ) ;
2015-03-12 12:16:35 +08:00
last = tkr - > cycle_last ;
mask = tkr - > mask ;
max = tkr - > clock - > max_cycles ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:35 +08:00
delta = clocksource_delta ( now , last , mask ) ;
2015-03-12 12:16:33 +08:00
2015-03-12 12:16:34 +08:00
/*
* Try to catch underflows by checking if we are seeing small
* mask - relative negative values .
*/
2024-03-25 14:40:22 +08:00
if ( unlikely ( ( ~ delta & mask ) < ( mask > > 3 ) ) )
2015-05-14 07:04:47 +08:00
tk - > underflow_seen = 1 ;
2015-03-12 12:16:34 +08:00
2024-03-25 14:40:22 +08:00
/* Check for multiplication overflows */
if ( unlikely ( delta > max ) )
2015-05-14 07:04:47 +08:00
tk - > overflow_seen = 1 ;
2015-03-12 12:16:33 +08:00
2024-03-25 14:40:22 +08:00
/* timekeeping_cycles_to_ns() handles both under and overflow */
2024-03-25 14:40:18 +08:00
return timekeeping_cycles_to_ns ( tkr , now ) ;
2015-03-12 12:16:33 +08:00
}
2015-03-12 12:16:32 +08:00
# else
2016-12-22 03:32:01 +08:00
/* Variant for builds without CONFIG_DEBUG_TIMEKEEPING: does nothing. */
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
2024-03-25 14:40:18 +08:00
/*
 * Variant for builds without CONFIG_DEBUG_TIMEKEEPING. timekeeping_get_ns()
 * only calls the debug readout when that option is enabled, so reaching
 * this function is treated as a bug.
 */
static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
	BUG();
}
2015-03-12 12:16:32 +08:00
# endif
2009-08-14 21:47:26 +08:00
/**
2013-11-28 16:28:55 +08:00
* tk_setup_internals - Set up internals to use clocksource clock .
2009-08-14 21:47:26 +08:00
*
2013-11-28 16:28:55 +08:00
* @ tk : The target timekeeper to setup .
2009-08-14 21:47:26 +08:00
* @ clock : Pointer to clocksource .
*
* Calculates a fixed cycle / nsec interval for a given clocksource / adjustment
* pair and interval request .
*
* Unless you ' re the timekeeping code , you should not be using this !
*/
2012-07-13 13:21:57 +08:00
static void tk_setup_internals ( struct timekeeper * tk , struct clocksource * clock )
2009-08-14 21:47:26 +08:00
{
2016-12-22 03:32:01 +08:00
u64 interval ;
2010-10-21 06:55:15 +08:00
u64 tmp , ntpinterval ;
2012-07-13 13:21:53 +08:00
struct clocksource * old_clock ;
2009-08-14 21:47:26 +08:00
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
+ + tk - > cs_was_changed_seq ;
2015-03-19 17:09:06 +08:00
old_clock = tk - > tkr_mono . clock ;
tk - > tkr_mono . clock = clock ;
tk - > tkr_mono . mask = clock - > mask ;
2017-06-09 07:44:20 +08:00
tk - > tkr_mono . cycle_last = tk_clock_read ( & tk - > tkr_mono ) ;
2009-08-14 21:47:26 +08:00
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . clock = clock ;
tk - > tkr_raw . mask = clock - > mask ;
tk - > tkr_raw . cycle_last = tk - > tkr_mono . cycle_last ;
2009-08-14 21:47:26 +08:00
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH ;
tmp < < = clock - > shift ;
2010-10-21 06:55:15 +08:00
ntpinterval = tmp ;
2009-08-14 21:47:28 +08:00
tmp + = clock - > mult / 2 ;
do_div ( tmp , clock - > mult ) ;
2009-08-14 21:47:26 +08:00
if ( tmp = = 0 )
tmp = 1 ;
2016-12-22 03:32:01 +08:00
interval = ( u64 ) tmp ;
2012-07-13 13:21:57 +08:00
tk - > cycle_interval = interval ;
2009-08-14 21:47:26 +08:00
/* Go back from cycles -> shifted ns */
2016-12-09 04:49:36 +08:00
tk - > xtime_interval = interval * clock - > mult ;
2012-07-13 13:21:57 +08:00
tk - > xtime_remainder = ntpinterval - tk - > xtime_interval ;
2017-06-09 07:44:21 +08:00
tk - > raw_interval = interval * clock - > mult ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:53 +08:00
/* if changing clocks, convert xtime_nsec shift units */
if ( old_clock ) {
int shift_change = clock - > shift - old_clock - > shift ;
2017-05-23 08:20:20 +08:00
if ( shift_change < 0 ) {
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec > > = - shift_change ;
2017-05-23 08:20:20 +08:00
tk - > tkr_raw . xtime_nsec > > = - shift_change ;
} else {
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec < < = shift_change ;
2017-05-23 08:20:20 +08:00
tk - > tkr_raw . xtime_nsec < < = shift_change ;
}
2012-07-13 13:21:53 +08:00
}
2015-03-19 16:28:44 +08:00
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . shift = clock - > shift ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . shift = clock - > shift ;
2009-08-14 21:47:26 +08:00
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
tk - > ntp_error_shift = NTP_SCALE_SHIFT - clock - > shift ;
2014-04-24 11:53:29 +08:00
tk - > ntp_tick = ntpinterval < < tk - > ntp_error_shift ;
2009-08-14 21:47:28 +08:00
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource . These value will be adjusted via NTP
* to counteract clock drifting .
*/
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . mult = clock - > mult ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . mult = clock - > mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
tk - > ntp_err_mult = 0 ;
2018-03-10 02:42:48 +08:00
tk - > skip_second_overflow = 0 ;
2009-08-14 21:47:26 +08:00
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:29 +08:00
/* Timekeeper helper functions. */
2024-03-25 14:40:21 +08:00
/*
 * Slow-path cycle delta -> nanosecond conversion, used by
 * timekeeping_cycles_to_ns() when @delta exceeds the clock's max_cycles.
 * The arithmetic is delegated to mul_u64_u32_add_u64_shr()
 * (NOTE(review): presumably (delta * mult + xtime_nsec) >> shift without
 * overflowing the intermediate - confirm against the helper).
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
	return mul_u64_u32_add_u64_shr(delta, tkr->mult,
				       tkr->xtime_nsec, tkr->shift);
}
2024-03-25 14:40:13 +08:00
/*
 * Convert a clocksource reading into nanoseconds relative to @tkr.
 *
 * @tkr:	readout base supplying cycle_last, mask, mult, shift, xtime_nsec
 * @cycles:	raw clocksource value
 *
 * Returns ((cycles - cycle_last) & mask) * mult + xtime_nsec, shifted down
 * by shift, with special handling for deltas above the clock's max_cycles.
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
	/* Calculate the delta since the last update_wall_time() */
	u64 mask = tkr->mask;
	u64 delta = (cycles - tkr->cycle_last) & mask;

	/* Common case: the plain 64 bit multiplication is usable. */
	if (likely(delta <= tkr->clock->max_cycles))
		return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;

	/*
	 * Beyond max_cycles: this covers both negative motion and the case
	 * where the delta overflows the multiplication with tkr->mult.
	 *
	 * Handle clocksource inconsistency between CPUs to prevent
	 * time from going backwards by checking for the MSB of the
	 * mask being set in the delta.
	 */
	if (delta & ~(mask >> 1))
		return tkr->xtime_nsec >> tkr->shift;

	return delta_to_ns_safe(tkr, delta);
}
2009-08-14 21:47:29 +08:00
2024-03-25 14:40:14 +08:00
/* Read @tkr's clocksource and convert the reading to nanoseconds. */
static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
{
	u64 cycles = tk_clock_read(tkr);

	return timekeeping_cycles_to_ns(tkr, cycles);
}
2024-03-25 14:40:13 +08:00
static inline u64 timekeeping_get_ns ( const struct tk_read_base * tkr )
{
2024-03-25 14:40:17 +08:00
if ( IS_ENABLED ( CONFIG_DEBUG_TIMEKEEPING ) )
2024-03-25 14:40:18 +08:00
return timekeeping_debug_get_ns ( tkr ) ;
2024-03-25 14:40:13 +08:00
2024-03-25 14:40:17 +08:00
return __timekeeping_get_ns ( tkr ) ;
2024-03-25 14:40:13 +08:00
}
2014-07-17 05:05:23 +08:00
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper .
2015-02-11 12:01:52 +08:00
* @ tkr : Timekeeping readout base from which we take the update
2020-11-13 15:24:31 +08:00
* @ tkf : Pointer to NMI safe timekeeper
2014-07-17 05:05:23 +08:00
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself .
*
2015-05-27 09:39:36 +08:00
* Employ the latch technique ; see @ raw_write_seqcount_latch .
2014-07-17 05:05:23 +08:00
*
* So if a NMI hits the update of base [ 0 ] then it will use base [ 1 ]
* which is still consistent . In the worst case this can result is a
* slightly wrong timestamp ( a few nanoseconds ) . See
* @ ktime_get_mono_fast_ns .
*/
2018-07-13 20:06:42 +08:00
static void update_fast_timekeeper ( const struct tk_read_base * tkr ,
struct tk_fast * tkf )
2014-07-17 05:05:23 +08:00
{
2015-03-19 16:36:19 +08:00
struct tk_read_base * base = tkf - > base ;
2014-07-17 05:05:23 +08:00
/* Force readers off to base[1] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[0] */
2015-02-11 12:01:52 +08:00
memcpy ( base , tkr , sizeof ( * base ) ) ;
2014-07-17 05:05:23 +08:00
/* Force readers back to base[0] */
2015-03-19 16:36:19 +08:00
raw_write_seqcount_latch ( & tkf - > seq ) ;
2014-07-17 05:05:23 +08:00
/* Update base[1] */
memcpy ( base + 1 , base , sizeof ( * base ) ) ;
}
2020-11-16 04:09:31 +08:00
/*
 * Latch-protected readout of a fast timekeeper: pick the readout base
 * selected by the lowest bit of the sequence count, add the nanoseconds
 * elapsed on its clock to its base time, and retry if the latch sequence
 * changed meanwhile.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *slot;
	unsigned int seq;
	u64 ns;

	do {
		seq = raw_read_seqcount_latch(&tkf->seq);
		slot = &tkf->base[seq & 0x01];
		ns = ktime_to_ns(slot->base);
		ns += __timekeeping_get_ns(slot);
	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

	return ns;
}
2014-07-17 05:05:23 +08:00
/**
* ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
*
* This timestamp is not guaranteed to be monotonic across an update .
* The timestamp is calculated by :
*
* now = base_mono + clock_delta * slope
*
* So if the update lowers the slope , readers who are forced to the
* not yet updated second array are still using the old steeper slope .
*
* tmono
* ^
* | o n
* | o n
* | u
* | o
* | o
* | 12345678 - - - > reader order
*
* o = old slope
* u = update
* n = new slope
*
* So reader 6 will observe time going backwards versus reader 5.
*
2020-11-16 04:09:31 +08:00
* While other CPUs are likely to be able to observe that , the only way
2014-07-17 05:05:23 +08:00
* for a CPU local observation is when an NMI hits in the middle of
* the update . Timestamps taken from that NMI context might be ahead
* of the following timestamps . Callers need to be aware of that and
* deal with it .
*/
2022-04-28 14:24:32 +08:00
u64 notrace ktime_get_mono_fast_ns ( void )
2015-03-19 16:36:19 +08:00
{
return __ktime_get_fast_ns ( & tk_fast_mono ) ;
}
2014-07-17 05:05:23 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_mono_fast_ns ) ;
2020-11-16 04:09:31 +08:00
/**
* ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
*
* Contrary to ktime_get_mono_fast_ns ( ) this is always correct because the
* conversion factor is not affected by NTP / PTP correction .
*/
2022-04-28 14:24:32 +08:00
u64 notrace ktime_get_raw_fast_ns ( void )
2015-03-19 16:39:08 +08:00
{
return __ktime_get_fast_ns ( & tk_fast_raw ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw_fast_ns ) ;
2018-04-25 21:33:38 +08:00
/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp is taken after the boot offset is
 *     updated but before the timekeeper is updated. If this happens, the
 *     new boot offset is added to the old timekeeping, making the clock
 *     appear to update slightly earlier:
 *
 *	CPU 0						CPU 1
 *	timekeeping_inject_sleeptime64()
 *	__timekeeping_inject_sleeptime(tk, delta);
 *							timestamp();
 *	timekeeping_update(tk, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 *     partially updated. Since the tk->offs_boot update is a rare event,
 *     this should be a rare occurrence which postprocessing should be able
 *     to handle.
 *
 * The caveats vs. timestamp ordering as documented for
 * ktime_get_mono_fast_ns() apply as well.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
	/* Deliberately unsynchronized read of the boot offset, see above. */
	return ktime_get_mono_fast_ns() +
	       ktime_to_ns(data_race(tk_core.timekeeper.offs_boot));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
2022-04-14 17:18:03 +08:00
/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically which may yield wrong
 * readouts. However, an update of the TAI offset is a rare event, e.g. caused
 * by settime or adjtimex with an offset. The user of this function has to
 * deal with the possibility of wrong timestamps in post processing.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
	/* Deliberately unsynchronized read of the TAI offset, see above. */
	return ktime_get_mono_fast_ns() +
	       ktime_to_ns(data_race(tk_core.timekeeper.offs_tai));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
2020-08-14 18:19:35 +08:00
/*
 * Latch-protected readout returning base_real plus the nanoseconds elapsed
 * on the clock. If @mono is not NULL, the matching value based on the
 * monotonic base (base + the same delta) is stored through it.
 */
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
	struct tk_read_base *slot;
	u64 mono_base, real_base, elapsed;
	unsigned int seq;

	do {
		seq = raw_read_seqcount_latch(&tkf->seq);
		slot = &tkf->base[seq & 0x01];
		mono_base = ktime_to_ns(slot->base);
		real_base = ktime_to_ns(slot->base_real);
		elapsed = __timekeeping_get_ns(slot);
	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

	if (mono)
		*mono = mono_base + elapsed;

	return real_base + elapsed;
}
/**
* ktime_get_real_fast_ns : - NMI safe and fast access to clock realtime .
2020-11-16 04:09:31 +08:00
*
2023-04-26 21:43:34 +08:00
* See ktime_get_mono_fast_ns ( ) for documentation of the time stamp ordering .
2017-08-31 23:12:48 +08:00
*/
u64 ktime_get_real_fast_ns ( void )
{
2020-08-14 18:19:35 +08:00
return __ktime_get_real_fast ( & tk_fast_mono , NULL ) ;
2017-08-31 23:12:48 +08:00
}
2017-11-10 23:25:04 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_real_fast_ns ) ;
2017-08-31 23:12:48 +08:00
2020-08-14 18:19:35 +08:00
/**
* ktime_get_fast_timestamps : - NMI safe timestamps
* @ snapshot : Pointer to timestamp storage
*
* Stores clock monotonic , boottime and realtime timestamps .
*
* Boot time is a racy access on 32 bit systems if the sleep time injection
* happens late during resume and not in timekeeping_resume ( ) . That could
* be avoided by expanding struct tk_read_base with boot offset for 32 bit
* and adding more overhead to the update . As this is a hard to observe
* once per resume event which can be filtered with reasonable effort using
* the accurate mono / real timestamps , it ' s probably not worth the trouble .
*
* Aside of that it might be possible on 32 and 64 bit to observe the
* following when the sleep time injection happens late :
*
* CPU 0 CPU 1
* timekeeping_resume ( )
* ktime_get_fast_timestamps ( )
* mono , real = __ktime_get_real_fast ( )
* inject_sleep_time ( )
* update boot offset
* boot = mono + bootoffset ;
*
* That means that boot time already has the sleep time adjustment , but
* real time does not . On the next readout both are in sync again .
*
* Preventing this for 64 bit is not really feasible without destroying the
* careful cache layout of the timekeeper because the sequence count and
* struct tk_read_base would then need two cache lines instead of one .
*
2021-03-23 05:39:03 +08:00
* Access to the time keeper clock source is disabled across the innermost
2020-08-14 18:19:35 +08:00
* steps of suspend / resume . The accessors still work , but the timestamps
* are frozen until time keeping is resumed which happens very early .
*
* For regular suspend / resume there is no observable difference vs . sched
* clock , but it might affect some of the nasty low level debug printks .
*
2021-03-23 05:39:03 +08:00
* OTOH , access to sched clock is not guaranteed across suspend / resume on
2020-08-14 18:19:35 +08:00
* all systems either so it depends on the hardware in use .
*
* If that turns out to be a real problem then this could be mitigated by
* using sched clock in a similar way as during early boot . But it ' s not as
* trivial as on early boot because it needs some careful protection
* against the clock monotonic timestamp jumping backwards on resume .
*/
void ktime_get_fast_timestamps ( struct ktime_timestamps * snapshot )
{
struct timekeeper * tk = & tk_core . timekeeper ;
snapshot - > real = __ktime_get_real_fast ( & tk_fast_mono , & snapshot - > mono ) ;
snapshot - > boot = snapshot - > mono + ktime_to_ns ( data_race ( tk - > offs_boot ) ) ;
}
2015-02-13 21:49:02 +08:00
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource .
* @ tk : Timekeeper to snapshot .
*
* It generally is unsafe to access the clocksource after timekeeping has been
* suspended , so take a snapshot of the readout base of @ tk and use it as the
* fast timekeeper ' s readout base while suspended . It will return the same
* number of cycles every time until timekeeping is resumed at which time the
* proper readout base for the fast timekeeper will be restored automatically .
*/
2018-07-13 20:06:42 +08:00
static void halt_fast_timekeeper ( const struct timekeeper * tk )
2015-02-13 21:49:02 +08:00
{
static struct tk_read_base tkr_dummy ;
2018-07-13 20:06:42 +08:00
const struct tk_read_base * tkr = & tk - > tkr_mono ;
2015-02-13 21:49:02 +08:00
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
2017-06-09 07:44:20 +08:00
cycles_at_suspend = tk_clock_read ( tkr ) ;
tkr_dummy . clock = & dummy_clock ;
2017-08-31 23:12:48 +08:00
tkr_dummy . base_real = tkr - > base + tk - > offs_real ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tkr_dummy , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
tkr = & tk - > tkr_raw ;
memcpy ( & tkr_dummy , tkr , sizeof ( tkr_dummy ) ) ;
2017-06-09 07:44:20 +08:00
tkr_dummy . clock = & dummy_clock ;
2015-03-19 16:39:08 +08:00
update_fast_timekeeper ( & tkr_dummy , & tk_fast_raw ) ;
2015-02-13 21:49:02 +08:00
}
2012-11-28 09:28:59 +08:00
static RAW_NOTIFIER_HEAD ( pvclock_gtod_chain ) ;
2013-06-27 18:35:46 +08:00
/*
 * Run the pvclock_gtod notifier chain. @was_set is handed to the callbacks
 * as the notifier value and @tk as the notifier data.
 */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
	raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
2020-11-13 15:24:32 +08:00
* @ nb : Pointer to the notifier block to register
2012-11-28 09:28:59 +08:00
*/
int pvclock_gtod_register_notifier ( struct notifier_block * nb )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-11-28 09:28:59 +08:00
unsigned long flags ;
int ret ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
ret = raw_notifier_chain_register ( & pvclock_gtod_chain , nb ) ;
2013-06-27 18:35:46 +08:00
update_pvclock_gtod ( tk , true ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
return ret ;
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_register_notifier ) ;
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
2020-11-13 15:24:32 +08:00
* @ nb : Pointer to the notifier block to unregister
2012-11-28 09:28:59 +08:00
*/
int pvclock_gtod_unregister_notifier ( struct notifier_block * nb )
{
unsigned long flags ;
int ret ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
ret = raw_notifier_chain_unregister ( & pvclock_gtod_chain , nb ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-11-28 09:28:59 +08:00
return ret ;
}
EXPORT_SYMBOL_GPL ( pvclock_gtod_unregister_notifier ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather than exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this has since been avoided because it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/*
* tk_update_leap_state - helper to update the next_leap_ktime
*/
static inline void tk_update_leap_state ( struct timekeeper * tk )
{
tk - > next_leap_ktime = ntp_get_next_leap ( ) ;
2016-12-25 18:38:40 +08:00
if ( tk - > next_leap_ktime ! = KTIME_MAX )
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Convert to monotonic time */
tk - > next_leap_ktime = ktime_sub ( tk - > next_leap_ktime , tk - > offs_real ) ;
}
2014-07-17 05:04:10 +08:00
/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data ( struct timekeeper * tk )
{
2014-10-29 18:31:16 +08:00
u64 seconds ;
u32 nsec ;
2014-07-17 05:04:10 +08:00
/*
* The xtime based monotonic readout is :
* nsec = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec + now ( ) ;
* The ktime based monotonic readout is :
* nsec = base_mono + now ( ) ;
* = = > base_mono = ( xtime_sec + wtm_sec ) * 1e9 + wtm_nsec
*/
2014-10-29 18:31:16 +08:00
seconds = ( u64 ) ( tk - > xtime_sec + tk - > wall_to_monotonic . tv_sec ) ;
nsec = ( u32 ) tk - > wall_to_monotonic . tv_nsec ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . base = ns_to_ktime ( seconds * NSEC_PER_SEC + nsec ) ;
2014-07-17 05:05:04 +08:00
2014-10-29 18:31:16 +08:00
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater / equal one second . Take
* this into account before updating tk - > ktime_sec .
*/
2015-03-19 17:09:06 +08:00
nsec + = ( u32 ) ( tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ) ;
2014-10-29 18:31:16 +08:00
if ( nsec > = NSEC_PER_SEC )
seconds + + ;
tk - > ktime_sec = seconds ;
2017-05-23 08:20:20 +08:00
/* Update the monotonic raw base */
2017-08-26 06:57:04 +08:00
tk - > tkr_raw . base = ns_to_ktime ( tk - > raw_sec * NSEC_PER_SEC ) ;
2014-07-17 05:04:10 +08:00
}
2013-02-22 06:51:38 +08:00
/* must hold timekeeper_lock */
2013-06-27 18:35:45 +08:00
static void timekeeping_update ( struct timekeeper * tk , unsigned int action )
2011-11-14 07:19:49 +08:00
{
2013-06-27 18:35:45 +08:00
if ( action & TK_CLEAR_NTP ) {
2012-07-13 13:21:57 +08:00
tk - > ntp_error = 0 ;
2011-11-14 07:19:49 +08:00
ntp_clear ( ) ;
}
2013-02-22 06:51:40 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
tk_update_leap_state ( tk ) ;
2014-07-17 05:04:10 +08:00
tk_update_ktime_data ( tk ) ;
2014-09-06 18:24:49 +08:00
update_vsyscall ( tk ) ;
update_pvclock_gtod ( tk , action & TK_CLOCK_WAS_SET ) ;
2017-08-31 23:12:48 +08:00
tk - > tkr_mono . base_real = tk - > tkr_mono . base + tk - > offs_real ;
2015-03-19 16:36:19 +08:00
update_fast_timekeeper ( & tk - > tkr_mono , & tk_fast_mono ) ;
2015-03-19 16:39:08 +08:00
update_fast_timekeeper ( & tk - > tkr_raw , & tk_fast_raw ) ;
2015-04-15 05:08:37 +08:00
if ( action & TK_CLOCK_WAS_SET )
tk - > clock_was_set_seq + + ;
2015-06-12 06:54:53 +08:00
/*
* The mirroring of the data to the shadow - timekeeper needs
* to happen last here to ensure we don ' t over - write the
* timekeeper structure on the next update with stale data
*/
if ( action & TK_MIRROR )
memcpy ( & shadow_timekeeper , & tk_core . timekeeper ,
sizeof ( tk_core . timekeeper ) ) ;
2011-11-14 07:19:49 +08:00
}
2007-05-08 15:27:59 +08:00
/**
2009-08-14 21:47:26 +08:00
* timekeeping_forward_now - update clock to the current time
2020-11-13 15:24:34 +08:00
* @ tk : Pointer to the timekeeper to update
2007-05-08 15:27:59 +08:00
*
2008-08-21 07:37:28 +08:00
* Forward the current clock to update its state since the last call to
* update_wall_time ( ) . This is useful before significant clock changes ,
* as it avoids having to deal with this time offset explicitly .
2007-05-08 15:27:59 +08:00
*/
2012-07-13 13:21:57 +08:00
static void timekeeping_forward_now ( struct timekeeper * tk )
2007-05-08 15:27:59 +08:00
{
2016-12-22 03:32:01 +08:00
u64 cycle_now , delta ;
2007-05-08 15:27:59 +08:00
2017-06-09 07:44:20 +08:00
cycle_now = tk_clock_read ( & tk - > tkr_mono ) ;
2015-03-19 17:09:06 +08:00
delta = clocksource_delta ( cycle_now , tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2007-05-08 15:27:59 +08:00
2024-03-25 14:40:21 +08:00
while ( delta > 0 ) {
u64 max = tk - > tkr_mono . clock - > max_cycles ;
u64 incr = delta < max ? delta : max ;
2017-05-23 08:20:20 +08:00
2024-03-25 14:40:21 +08:00
tk - > tkr_mono . xtime_nsec + = incr * tk - > tkr_mono . mult ;
tk - > tkr_raw . xtime_nsec + = incr * tk - > tkr_raw . mult ;
tk_normalize_xtime ( tk ) ;
delta - = incr ;
}
2007-05-08 15:27:59 +08:00
}
/**
2018-04-27 21:40:13 +08:00
* ktime_get_real_ts64 - Returns the time of day in a timespec64 .
2007-05-08 15:27:59 +08:00
* @ ts : pointer to the timespec to be set
*
2018-04-27 21:40:13 +08:00
* Returns the time of day in a timespec64 ( WARN if suspended ) .
2007-05-08 15:27:59 +08:00
*/
2018-04-27 21:40:13 +08:00
void ktime_get_real_ts64 ( struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2007-05-08 15:27:59 +08:00
2018-04-27 21:40:13 +08:00
WARN_ON ( timekeeping_suspended ) ;
2007-05-08 15:27:59 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
2012-09-12 07:26:03 +08:00
ts - > tv_nsec = 0 ;
2014-07-17 05:04:04 +08:00
timespec64_add_ns ( ts , nsecs ) ;
2007-05-08 15:27:59 +08:00
}
2018-04-27 21:40:13 +08:00
EXPORT_SYMBOL ( ktime_get_real_ts64 ) ;
2007-05-08 15:27:59 +08:00
2009-07-07 17:27:28 +08:00
ktime_t ktime_get ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
2014-07-17 05:04:12 +08:00
ktime_t base ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2009-07-07 17:27:28 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2014-07-17 05:03:53 +08:00
2014-07-17 05:04:12 +08:00
return ktime_add_ns ( base , nsecs ) ;
2009-07-07 17:27:28 +08:00
}
EXPORT_SYMBOL_GPL ( ktime_get ) ;
2015-04-07 19:12:35 +08:00
u32 ktime_get_resolution_ns ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
u32 nsecs ;
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
nsecs = tk - > tkr_mono . mult > > tk - > tkr_mono . shift ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return nsecs ;
}
EXPORT_SYMBOL_GPL ( ktime_get_resolution_ns ) ;
2014-07-17 05:04:13 +08:00
static ktime_t * offsets [ TK_OFFS_MAX ] = {
[ TK_OFFS_REAL ] = & tk_core . timekeeper . offs_real ,
2018-04-25 21:33:38 +08:00
[ TK_OFFS_BOOT ] = & tk_core . timekeeper . offs_boot ,
2014-07-17 05:04:13 +08:00
[ TK_OFFS_TAI ] = & tk_core . timekeeper . offs_tai ,
} ;
ktime_t ktime_get_with_offset ( enum tk_offsets offs )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base , * offset = offsets [ offs ] ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2014-07-17 05:04:13 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 17:09:06 +08:00
base = ktime_add ( tk - > tkr_mono . base , * offset ) ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2014-07-17 05:04:13 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_with_offset ) ;
2018-04-27 21:40:15 +08:00
ktime_t ktime_get_coarse_with_offset ( enum tk_offsets offs )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base , * offset = offsets [ offs ] ;
2019-06-14 03:40:45 +08:00
u64 nsecs ;
2018-04-27 21:40:15 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
base = ktime_add ( tk - > tkr_mono . base , * offset ) ;
2019-06-14 03:40:45 +08:00
nsecs = tk - > tkr_mono . xtime_nsec > > tk - > tkr_mono . shift ;
2018-04-27 21:40:15 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2019-06-22 04:32:47 +08:00
return ktime_add_ns ( base , nsecs ) ;
2018-04-27 21:40:15 +08:00
}
EXPORT_SYMBOL_GPL ( ktime_get_coarse_with_offset ) ;
2014-07-17 05:04:22 +08:00
/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:	time to convert.
 * @offs:	which offset to use
 *
 * Return: @tmono plus the offset selected by @offs, with the offset read
 * under the timekeeper sequence counter.
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
	ktime_t *offset = offsets[offs];
	ktime_t converted;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		converted = ktime_add(tmono, *offset);
	} while (read_seqcount_retry(&tk_core.seq, seq));

	return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
2014-07-17 05:05:04 +08:00
/**
* ktime_get_raw - Returns the raw monotonic time in ktime_t format
*/
ktime_t ktime_get_raw ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
unsigned int seq ;
ktime_t base ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2014-07-17 05:05:04 +08:00
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2015-03-19 16:28:44 +08:00
base = tk - > tkr_raw . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2014-07-17 05:05:04 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return ktime_add_ns ( base , nsecs ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_raw ) ;
2009-07-07 17:27:28 +08:00
/**
2014-07-17 05:04:04 +08:00
* ktime_get_ts64 - get the monotonic clock in timespec64 format
2009-07-07 17:27:28 +08:00
* @ ts : pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
2014-11-08 05:13:04 +08:00
* in normalized timespec64 format in the variable pointed to by @ ts .
2009-07-07 17:27:28 +08:00
*/
2014-07-17 05:04:04 +08:00
void ktime_get_ts64 ( struct timespec64 * ts )
2009-07-07 17:27:28 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:04 +08:00
struct timespec64 tomono ;
2009-07-07 17:27:28 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsec ;
2009-07-07 17:27:28 +08:00
WARN_ON ( timekeeping_suspended ) ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2014-07-17 05:04:04 +08:00
ts - > tv_sec = tk - > xtime_sec ;
2015-03-19 17:09:06 +08:00
nsec = timekeeping_get_ns ( & tk - > tkr_mono ) ;
2012-07-28 02:48:13 +08:00
tomono = tk - > wall_to_monotonic ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-07-07 17:27:28 +08:00
2014-07-17 05:04:04 +08:00
ts - > tv_sec + = tomono . tv_sec ;
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsec + tomono . tv_nsec ) ;
2009-07-07 17:27:28 +08:00
}
2014-07-17 05:04:04 +08:00
EXPORT_SYMBOL_GPL ( ktime_get_ts64 ) ;
2009-07-07 17:27:28 +08:00
2014-10-29 18:31:16 +08:00
/**
* ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
*
* Returns the seconds portion of CLOCK_MONOTONIC with a single non
* serialized read . tk - > ktime_sec is of type ' unsigned long ' so this
* works on both 32 and 64 bit systems . On 32 bit systems the readout
* covers ~ 136 years of uptime which should be enough to prevent
* premature wrap arounds .
*/
time64_t ktime_get_seconds ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
WARN_ON ( timekeeping_suspended ) ;
return tk - > ktime_sec ;
}
EXPORT_SYMBOL_GPL ( ktime_get_seconds ) ;
2014-10-29 18:31:50 +08:00
/**
* ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
*
2020-12-01 17:52:31 +08:00
* Returns the wall clock seconds since 1970.
2014-10-29 18:31:50 +08:00
*
* For 64 bit systems the fast access to tk - > xtime_sec is preserved . On
* 32 bit systems the access must be protected with the sequence
* counter to provide " atomic " access to the 64 bit tk - > xtime_sec
* value .
*/
time64_t ktime_get_real_seconds ( void )
{
struct timekeeper * tk = & tk_core . timekeeper ;
time64_t seconds ;
unsigned int seq ;
if ( IS_ENABLED ( CONFIG_64BIT ) )
return tk - > xtime_sec ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
seconds = tk - > xtime_sec ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
return seconds ;
}
EXPORT_SYMBOL_GPL ( ktime_get_real_seconds ) ;
2015-12-13 12:24:18 +08:00
/**
* __ktime_get_real_seconds - The same as ktime_get_real_seconds
* but without the sequence counter protect . This internal function
* is called just when timekeeping lock is already held .
*/
2020-04-22 03:22:36 +08:00
noinstr time64_t __ktime_get_real_seconds ( void )
2015-12-13 12:24:18 +08:00
{
struct timekeeper * tk = & tk_core . timekeeper ;
return tk - > xtime_sec ;
}
2016-02-22 19:15:20 +08:00
/**
* ktime_get_snapshot - snapshots the realtime / monotonic raw clocks with counter
* @ systime_snapshot : pointer to struct receiving the system time snapshot
*/
void ktime_get_snapshot ( struct system_time_snapshot * systime_snapshot )
{
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-02-22 19:15:20 +08:00
ktime_t base_raw ;
ktime_t base_real ;
2016-12-09 04:49:34 +08:00
u64 nsec_raw ;
u64 nsec_real ;
2016-12-22 03:32:01 +08:00
u64 now ;
2016-02-22 19:15:20 +08:00
2016-02-22 19:15:21 +08:00
WARN_ON_ONCE ( timekeeping_suspended ) ;
2016-02-22 19:15:20 +08:00
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( & tk - > tkr_mono ) ;
2020-12-09 14:09:27 +08:00
systime_snapshot - > cs_id = tk - > tkr_mono . clock - > id ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
systime_snapshot - > cs_was_changed_seq = tk - > cs_was_changed_seq ;
systime_snapshot - > clock_was_set_seq = tk - > clock_was_set_seq ;
2016-02-22 19:15:20 +08:00
base_real = ktime_add ( tk - > tkr_mono . base ,
tk_core . timekeeper . offs_real ) ;
base_raw = tk - > tkr_raw . base ;
nsec_real = timekeeping_cycles_to_ns ( & tk - > tkr_mono , now ) ;
nsec_raw = timekeeping_cycles_to_ns ( & tk - > tkr_raw , now ) ;
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
systime_snapshot - > cycles = now ;
systime_snapshot - > real = ktime_add_ns ( base_real , nsec_real ) ;
systime_snapshot - > raw = ktime_add_ns ( base_raw , nsec_raw ) ;
}
EXPORT_SYMBOL_GPL ( ktime_get_snapshot ) ;
2015-12-13 12:24:18 +08:00
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; then the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
/*
 * Scale *base by mult/div, checking for overflow.
 *
 * *base is split into quotient and remainder of the division by div, and
 * each part is multiplied by mult separately. Returns -EOVERFLOW, leaving
 * *base untouched, when the bit-length check finds that either part has
 * more significant bits than mult leaves free in a u64. Otherwise stores
 * the scaled value in *base and returns 0.
 */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
	/* Number of bits still free in a u64 once mult's width is accounted for */
	const int headroom = (int)sizeof(u64) * 8 - fls64(mult);
	u64 quot, rem;

	quot = div64_u64_rem(*base, div, &rem);

	if (headroom < fls64(quot))
		return -EOVERFLOW;
	if (headroom < fls64(rem))
		return -EOVERFLOW;

	/* Whole part scaled directly; remainder scaled, then divided back down */
	*base = quot * mult + div64_u64(rem * mult, div);

	return 0;
}
/**
* adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
* @ history : Snapshot representing start of history
* @ partial_history_cycles : Cycle offset into history ( fractional part )
* @ total_history_cycles : Total history length in cycles
* @ discontinuity : True indicates clock was set on history period
* @ ts : Cross timestamp that should be adjusted using
* partial / total ratio
*
* Helper function used by get_device_system_crosststamp ( ) to correct the
* crosstimestamp corresponding to the start of the current interval to the
* system counter value ( timestamp point ) provided by the driver . The
* total_history_ * quantities are the total history starting at the provided
* reference point and ending at the start of the current interval . The cycle
* count between the driver timestamp point and the start of the current
* interval is partial_history_cycles .
*/
static int adjust_historical_crosststamp ( struct system_time_snapshot * history ,
2016-12-22 03:32:01 +08:00
u64 partial_history_cycles ,
u64 total_history_cycles ,
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool discontinuity ,
struct system_device_crosststamp * ts )
{
struct timekeeper * tk = & tk_core . timekeeper ;
u64 corr_raw , corr_real ;
bool interp_forward ;
int ret ;
if ( total_history_cycles = = 0 | | partial_history_cycles = = 0 )
return 0 ;
/* Interpolate shortest distance from beginning or end of history */
2017-03-25 03:03:35 +08:00
interp_forward = partial_history_cycles > total_history_cycles / 2 ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which to ensure uninterrupted audio
processing, communicates and response to the host only once every
millsecond. As a result is takes up to a millisecond for the DSP to
receive a request, the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operation occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles ;
/*
* Scale the monotonic raw time delta by :
* partial_history_cycles / total_history_cycles
*/
corr_raw = ( u64 ) ktime_to_ns (
ktime_sub ( ts - > sys_monoraw , history - > raw ) ) ;
ret = scale64_check_overflow ( partial_history_cycles ,
total_history_cycles , & corr_raw ) ;
if ( ret )
return ret ;
/*
* If there is a discontinuity in the history , scale monotonic raw
* correction by :
* mult ( real ) / mult ( raw ) yielding the realtime correction
* Otherwise , calculate the realtime correction similar to monotonic
* raw calculation
*/
if ( discontinuity ) {
corr_real = mul_u64_u32_div
( corr_raw , tk - > tkr_mono . mult , tk - > tkr_raw . mult ) ;
} else {
corr_real = ( u64 ) ktime_to_ns (
ktime_sub ( ts - > sys_realtime , history - > real ) ) ;
ret = scale64_check_overflow ( partial_history_cycles ,
total_history_cycles , & corr_real ) ;
if ( ret )
return ret ;
}
/* Fixup monotonic raw and real time time values */
if ( interp_forward ) {
ts - > sys_monoraw = ktime_add_ns ( history - > raw , corr_raw ) ;
ts - > sys_realtime = ktime_add_ns ( history - > real , corr_real ) ;
} else {
ts - > sys_monoraw = ktime_sub_ns ( ts - > sys_monoraw , corr_raw ) ;
ts - > sys_realtime = ktime_sub_ns ( ts - > sys_realtime , corr_real ) ;
}
return 0 ;
}
/*
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
* timestamp_in_interval - true if ts is chronologically in [ start , end ]
*
* True if ts occurs chronologically at or after start , and before or at end .
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; then the request is processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
*/
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
/*
 * timestamp_in_interval - check whether a timestamp lies in a closed interval
 * @start:	first counter value of the interval
 * @end:	last counter value of the interval
 * @ts:		counter value to test
 *
 * Returns true if @ts lies within [@start, @end], both bounds included.
 * When @start > @end the interval is treated as spanning a counter
 * wrap-around, i.e. @ts matches if it is at or after @start, or at or
 * before @end.
 */
static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
	/* Wrapped interval: [start, max] joined with [0, end] */
	if (start > end)
		return ts >= start || ts <= end;

	/* Regular interval: start <= ts <= end */
	return ts >= start && ts <= end;
}
2024-05-13 18:38:02 +08:00
/*
 * convert_clock - scale a counter value by numerator / denominator
 * @val:		value to convert; updated in place on success
 * @numerator:		multiplier of the conversion ratio
 * @denominator:	divisor of the conversion ratio
 *
 * Computes *@val * @numerator / @denominator in two steps: the integer
 * quotient of *@val / @denominator is scaled by @numerator, and the
 * remainder is scaled and divided separately. Since the remainder is smaller
 * than the 32-bit @denominator, remainder * @numerator fits into 64 bits.
 *
 * Returns false and leaves *@val untouched if either ratio term is zero,
 * true otherwise.
 */
static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
{
	u64 quot, remainder;

	if (numerator == 0 || denominator == 0)
		return false;

	quot = div64_u64_rem(*val, denominator, &remainder);
	*val = quot * numerator + div_u64(remainder * numerator, denominator);

	return true;
}
static bool convert_base_to_cs ( struct system_counterval_t * scv )
{
struct clocksource * cs = tk_core . timekeeper . tkr_mono . clock ;
struct clocksource_base * base ;
u32 num , den ;
/* The timestamp was taken from the time keeper clock source */
if ( cs - > id = = scv - > cs_id )
return true ;
/*
* Check whether cs_id matches the base clock . Prevent the compiler from
* re - evaluating @ base as the clocksource might change concurrently .
*/
base = READ_ONCE ( cs - > base ) ;
if ( ! base | | base - > id ! = scv - > cs_id )
return false ;
num = scv - > use_nsecs ? cs - > freq_khz : base - > numerator ;
den = scv - > use_nsecs ? USEC_PER_SEC : base - > denominator ;
if ( ! convert_clock ( & scv - > cycles , num , den ) )
return false ;
scv - > cycles + = base - > offset ;
return true ;
}
2016-02-22 19:15:22 +08:00
/**
* get_device_system_crosststamp - Synchronously capture system / device timestamp
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* @ get_time_fn : Callback to get simultaneous device time and
2016-02-22 19:15:22 +08:00
* system counter from the device driver
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* @ ctx : Context passed to get_time_fn ( )
* @ history_begin : Historical reference point used to interpolate system
* time when counter provided by the driver is before the current interval
2016-02-22 19:15:22 +08:00
* @ xtstamp : Receives simultaneously captured system and device time
*
* Reads a timestamp from a device and correlates it to system time
*/
int get_device_system_crosststamp ( int ( * get_time_fn )
( ktime_t * device_time ,
struct system_counterval_t * sys_counterval ,
void * ctx ) ,
void * ctx ,
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
struct system_time_snapshot * history_begin ,
2016-02-22 19:15:22 +08:00
struct system_device_crosststamp * xtstamp )
{
struct system_counterval_t system_counterval ;
struct timekeeper * tk = & tk_core . timekeeper ;
2016-12-22 03:32:01 +08:00
u64 cycles , now , interval_start ;
2016-03-08 18:09:53 +08:00
unsigned int clock_was_set_seq = 0 ;
2016-02-22 19:15:22 +08:00
ktime_t base_real , base_raw ;
2016-12-09 04:49:34 +08:00
u64 nsec_real , nsec_raw ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
u8 cs_was_changed_seq ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as a intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART, audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_system_device_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided as an
argument in history_ref and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used a history argument *must* be
provided. The history is initialized using ktime_get_snapshot() and
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool do_interp ;
2016-02-22 19:15:22 +08:00
int ret ;
do {
seq = read_seqcount_begin ( & tk_core . seq ) ;
/*
* Try to synchronously capture device time and a system
* counter value calling back into the device driver
*/
ret = get_time_fn ( & xtstamp - > device , & system_counterval , ctx ) ;
if ( ret )
return ret ;
/*
2024-02-01 09:04:51 +08:00
* Verify that the clocksource ID associated with the captured
* system counter value is the same as for the currently
* installed timekeeper clocksource
2016-02-22 19:15:22 +08:00
*/
2024-02-01 09:04:51 +08:00
if ( system_counterval . cs_id = = CSID_GENERIC | |
2024-05-13 18:38:02 +08:00
! convert_base_to_cs ( & system_counterval ) )
2016-02-22 19:15:22 +08:00
return - ENODEV ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided in the
history_begin argument and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
cycles = system_counterval . cycles ;
/*
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval .
*/
2017-06-09 07:44:20 +08:00
now = tk_clock_read ( & tk - > tkr_mono ) ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided in the
history_begin argument and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
interval_start = tk - > tkr_mono . cycle_last ;
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
if ( ! timestamp_in_interval ( interval_start , now , cycles ) ) {
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided in the
history_begin argument and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
clock_was_set_seq = tk - > clock_was_set_seq ;
cs_was_changed_seq = tk - > cs_was_changed_seq ;
cycles = interval_start ;
do_interp = true ;
} else {
do_interp = false ;
}
2016-02-22 19:15:22 +08:00
base_real = ktime_add ( tk - > tkr_mono . base ,
tk_core . timekeeper . offs_real ) ;
base_raw = tk - > tkr_raw . base ;
timekeeping: Fix cross-timestamp interpolation for non-x86
So far, get_device_system_crosststamp() unconditionally passes
system_counterval.cycles to timekeeping_cycles_to_ns(). But when
interpolating system time (do_interp == true), system_counterval.cycles is
before tkr_mono.cycle_last, contrary to the timekeeping_cycles_to_ns()
expectations.
On x86, CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE will mitigate on
interpolating, setting delta to 0. With delta == 0, xtstamp->sys_monoraw
and xtstamp->sys_realtime are then set to the last update time, as
implicitly expected by adjust_historical_crosststamp(). On other
architectures, the resulting nonsense xtstamp->sys_monoraw and
xtstamp->sys_realtime corrupt the xtstamp (ts) adjustment in
adjust_historical_crosststamp().
Fix this by deriving xtstamp->sys_monoraw and xtstamp->sys_realtime from
the last update time when interpolating, by using the local variable
"cycles". The local variable already has the right value when
interpolating, unlike system_counterval.cycles.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/r/20231218073849.35294-4-peter.hilber@opensynergy.com
2023-12-18 15:38:41 +08:00
nsec_real = timekeeping_cycles_to_ns ( & tk - > tkr_mono , cycles ) ;
nsec_raw = timekeeping_cycles_to_ns ( & tk - > tkr_raw , cycles ) ;
2016-02-22 19:15:22 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
xtstamp - > sys_realtime = ktime_add_ns ( base_real , nsec_real ) ;
xtstamp - > sys_monoraw = ktime_add_ns ( base_raw , nsec_raw ) ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided in the
history_begin argument and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
/*
* Interpolate if necessary , adjusting back from the start of the
* current interval
*/
if ( do_interp ) {
2016-12-22 03:32:01 +08:00
u64 partial_history_cycles , total_history_cycles ;
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided in the
history_begin argument and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
bool discontinuity ;
/*
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
* Check that the counter value is not before the provided
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided in the
history_begin argument and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
* history reference and that the history doesn ' t cross a
* clocksource change
*/
if ( ! history_begin | |
timekeeping: Fix cross-timestamp interpolation corner case decision
The cycle_between() helper checks if parameter test is in the open interval
(before, after). Colloquially speaking, this also applies to the counter
wrap-around special case before > after. get_device_system_crosststamp()
currently uses cycle_between() at the first call site to decide whether to
interpolate for older counter readings.
get_device_system_crosststamp() has the following problem with
cycle_between() testing against an open interval: Assume that, by chance,
cycles == tk->tkr_mono.cycle_last (in the following, "cycle_last" for
brevity). Then, cycle_between() at the first call site, with effective
argument values cycle_between(cycle_last, cycles, now), returns false,
enabling interpolation. During interpolation,
get_device_system_crosststamp() will then call cycle_between() at the
second call site (if a history_begin was supplied). The effective argument
values are cycle_between(history_begin->cycles, cycles, cycles), since
system_counterval.cycles == interval_start == cycles, per the assumption.
Due to the test against the open interval, cycle_between() returns false
again. This causes get_device_system_crosststamp() to return -EINVAL.
This failure should be avoided, since get_device_system_crosststamp() works
both when cycles follows cycle_last (no interpolation), and when cycles
precedes cycle_last (interpolation). For the case cycles == cycle_last,
interpolation is actually unneeded.
Fix this by changing cycle_between() into timestamp_in_interval(), which
now checks against the closed interval, rather than the open interval.
This changes the get_device_system_crosststamp() behavior for three corner
cases:
1. Bypass interpolation in the case cycles == tk->tkr_mono.cycle_last,
fixing the problem described above.
2. At the first timestamp_in_interval() call site, cycles == now no longer
causes failure.
3. At the second timestamp_in_interval() call site, history_begin->cycles
== system_counterval.cycles no longer causes failure.
adjust_historical_crosststamp() also works for this corner case,
where partial_history_cycles == total_history_cycles.
These behavioral changes should not cause any problems.
Fixes: 2c756feb18d9 ("time: Add history to cross timestamp interface supporting slower devices")
Signed-off-by: Peter Hilber <peter.hilber@opensynergy.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218073849.35294-3-peter.hilber@opensynergy.com
2023-12-18 15:38:40 +08:00
! timestamp_in_interval ( history_begin - > cycles ,
cycles , system_counterval . cycles ) | |
time: Add history to cross timestamp interface supporting slower devices
Another representative use case of time sync and the correlated
clocksource (in addition to PTP noted above) is PTP synchronized
audio.
In a streaming application, as an example, samples will be sent and/or
received by multiple devices with a presentation time that is in terms
of the PTP master clock. Synchronizing the audio output on these
devices requires correlating the audio clock with the PTP master
clock. The more precise this correlation is, the better the audio
quality (i.e. out of sync audio sounds bad).
From an application standpoint, to correlate the PTP master clock with
the audio device clock, the system clock is used as an intermediate
timebase. The transforms such an application would perform are:
System Clock <-> Audio clock
System Clock <-> Network Device Clock [<-> PTP Master Clock]
Modern Intel platforms can perform a more accurate cross timestamp in
hardware (ART,audio device clock). The audio driver requires
ART->system time transforms -- the same as required for the network
driver. These platforms offload audio processing (including
cross-timestamps) to a DSP which, to ensure uninterrupted audio
processing, communicates with and responds to the host only once every
millisecond. As a result, it takes up to a millisecond for the DSP to
receive a request; the request is then processed by the DSP, the audio
output hardware is polled for completion, the result is copied into
shared memory, and the host is notified. All of these operations occur
on a millisecond cadence. This transaction requires about 2 ms, but
under heavier workloads it may take up to 4 ms.
Adding a history allows these slow devices the option of providing an
ART value outside of the current interval. In this case, the callback
provided is an accessor function for the previously obtained counter
value. If get_device_system_crosststamp() receives a counter value
previous to cycle_last, it consults the history provided in the
history_begin argument and interpolates the realtime and monotonic
raw system time using the provided counter value. If there are any
clock discontinuities, e.g. from calling settimeofday(), the monotonic
raw time is interpolated in the usual way, but the realtime clock time
is adjusted by scaling the monotonic raw adjustment.
When an accessor function is used, a history argument *must* be
provided. The history is initialized using ktime_get_snapshot(), which
must be called before the counter values are read.
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: kevin.b.stanton@intel.com
Cc: kevin.j.clarke@intel.com
Cc: hpa@zytor.com
Cc: jeffrey.t.kirsher@intel.com
Cc: netdev@vger.kernel.org
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christopher S. Hall <christopher.s.hall@intel.com>
[jstultz: Fixed up cycles_t/cycle_t type confusion]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2016-02-22 19:15:23 +08:00
history_begin - > cs_was_changed_seq ! = cs_was_changed_seq )
return - EINVAL ;
partial_history_cycles = cycles - system_counterval . cycles ;
total_history_cycles = cycles - history_begin - > cycles ;
discontinuity =
history_begin - > clock_was_set_seq ! = clock_was_set_seq ;
ret = adjust_historical_crosststamp ( history_begin ,
partial_history_cycles ,
total_history_cycles ,
discontinuity , xtstamp ) ;
if ( ret )
return ret ;
}
2016-02-22 19:15:22 +08:00
return 0 ;
}
EXPORT_SYMBOL_GPL ( get_device_system_crosststamp ) ;
2007-05-08 15:27:59 +08:00
/**
2014-11-18 19:15:16 +08:00
* do_settimeofday64 - Sets the time of day .
* @ ts : pointer to the timespec64 variable containing the new time
2007-05-08 15:27:59 +08:00
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
2014-11-18 19:15:16 +08:00
int do_settimeofday64 ( const struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-11-18 19:15:16 +08:00
struct timespec64 ts_delta , xt ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2015-06-23 18:38:54 +08:00
int ret = 0 ;
2007-05-08 15:27:59 +08:00
2019-03-23 18:36:19 +08:00
if ( ! timespec64_valid_settod ( ts ) )
2007-05-08 15:27:59 +08:00
return - EINVAL ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2008-08-21 07:37:28 +08:00
2012-07-28 02:48:13 +08:00
xt = tk_xtime ( tk ) ;
timekeeping: Really make sure wall_to_monotonic isn't positive
Even after commit e1d7ba873555 ("time: Always make sure wall_to_monotonic
isn't positive") it is still possible to make wall_to_monotonic positive
by running the following code:
int main(void)
{
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
time.tv_nsec = 0;
clock_settime(CLOCK_REALTIME, &time);
return 0;
}
The reason is that the second parameter of timespec64_compare(), ts_delta,
may be unnormalized because the delta is calculated with an open coded
substraction which causes the comparison of tv_sec to yield the wrong
result:
wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 }
ts_delta = { .tv_sec = -9, .tv_nsec = -900000000 }
That makes timespec64_compare() claim that wall_to_monotonic < ts_delta,
but actually the result should be wall_to_monotonic > ts_delta.
After normalization, the result of timespec64_compare() is correct because
the tv_sec comparison is not longer misleading:
wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 }
ts_delta = { .tv_sec = -10, .tv_nsec = 100000000 }
Use timespec64_sub() to ensure that ts_delta is normalized, which fixes the
issue.
Fixes: e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive")
Signed-off-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211213135727.1656662-1-liaoyu15@huawei.com
2021-12-13 21:57:27 +08:00
ts_delta = timespec64_sub ( * ts , xt ) ;
2012-07-13 13:21:53 +08:00
2015-06-23 18:38:54 +08:00
if ( timespec64_compare ( & tk - > wall_to_monotonic , & ts_delta ) > 0 ) {
ret = - EINVAL ;
goto out ;
}
2014-07-17 05:04:01 +08:00
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , ts_delta ) ) ;
2007-05-08 15:27:59 +08:00
2014-11-18 19:15:16 +08:00
tk_set_xtime ( tk , ts ) ;
2015-06-23 18:38:54 +08:00
out :
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
2021-07-13 21:39:53 +08:00
/* Signal hrtimers about time change */
clock_was_set ( CLOCK_SET_WALL ) ;
2007-05-08 15:27:59 +08:00
2022-07-18 05:53:34 +08:00
if ( ! ret ) {
2019-04-10 17:14:19 +08:00
audit_tk_injoffset ( ts_delta ) ;
2022-07-18 05:53:34 +08:00
add_device_randomness ( ts , sizeof ( * ts ) ) ;
}
2019-04-10 17:14:19 +08:00
2015-06-23 18:38:54 +08:00
return ret ;
2007-05-08 15:27:59 +08:00
}
2014-11-18 19:15:16 +08:00
EXPORT_SYMBOL ( do_settimeofday64 ) ;
2007-05-08 15:27:59 +08:00
2011-02-01 21:52:17 +08:00
/**
* timekeeping_inject_offset - Adds or subtracts from the current time .
2020-11-13 15:24:34 +08:00
* @ ts : Pointer to the timespec variable containing the offset
2011-02-01 21:52:17 +08:00
*
* Adds or subtracts an offset value from the current time .
*/
2018-07-13 20:06:42 +08:00
static int timekeeping_inject_offset ( const struct timespec64 * ts )
2011-02-01 21:52:17 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2017-10-19 19:14:45 +08:00
struct timespec64 tmp ;
2012-08-09 03:36:20 +08:00
int ret = 0 ;
2011-02-01 21:52:17 +08:00
2017-10-19 19:14:45 +08:00
if ( ts - > tv_nsec < 0 | | ts - > tv_nsec > = NSEC_PER_SEC )
2011-02-01 21:52:17 +08:00
return - EINVAL ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-02-01 21:52:17 +08:00
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
/* Make sure the proposed value is valid */
2017-10-19 19:14:45 +08:00
tmp = timespec64_add ( tk_xtime ( tk ) , * ts ) ;
if ( timespec64_compare ( & tk - > wall_to_monotonic , ts ) > 0 | |
2019-03-23 18:36:19 +08:00
! timespec64_valid_settod ( & tmp ) ) {
2012-08-09 03:36:20 +08:00
ret = - EINVAL ;
goto error ;
}
2012-07-13 13:21:53 +08:00
2017-10-19 19:14:45 +08:00
tk_xtime_add ( tk , ts ) ;
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , * ts ) ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
error : /* even if we error out, we forwarded the time, so call update */
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2011-02-01 21:52:17 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2011-02-01 21:52:17 +08:00
2021-07-13 21:39:53 +08:00
/* Signal hrtimers about time change */
clock_was_set ( CLOCK_SET_WALL ) ;
2011-02-01 21:52:17 +08:00
2012-08-09 03:36:20 +08:00
return ret ;
2011-02-01 21:52:17 +08:00
}
2017-10-19 19:14:44 +08:00
/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc.
 *
 * Set to 1 by timekeeping_warp_clock() when sys_tz.tz_minuteswest is
 * non-zero, i.e. when the clock gets warped by the timezone offset.
 */
int persistent_clock_is_local ;
/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time .
*
* This is ugly , but preferable to the alternatives . Otherwise we
* would either need to write a program to do it in / etc / rc ( and risk
* confusion if the program gets run more than once ; it would also be
* hard to make the program warp the clock precisely n hours ) or
* compile in the timezone information into the kernel . Bad , bad . . . .
*
* - TYT , 1992 - 01 - 01
*
* The best thing to do is to keep the CMOS clock in universal time ( UTC )
* as real UNIX machines always do it . This avoids all headaches about
* daylight saving times and warping kernel clocks .
*/
void timekeeping_warp_clock ( void )
{
if ( sys_tz . tz_minuteswest ! = 0 ) {
2017-10-19 19:14:45 +08:00
struct timespec64 adjust ;
2017-10-19 19:14:44 +08:00
persistent_clock_is_local = 1 ;
adjust . tv_sec = sys_tz . tz_minuteswest * 60 ;
adjust . tv_nsec = 0 ;
timekeeping_inject_offset ( & adjust ) ;
}
}
2011-02-01 21:52:17 +08:00
2020-11-13 15:24:33 +08:00
/*
2016-12-08 06:33:23 +08:00
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
2012-05-04 03:30:07 +08:00
*/
2013-03-26 03:24:24 +08:00
static void __timekeeping_set_tai_offset ( struct timekeeper * tk , s32 tai_offset )
2012-05-04 03:30:07 +08:00
{
tk - > tai_offset = tai_offset ;
2013-12-11 09:13:35 +08:00
tk - > offs_tai = ktime_add ( tk - > offs_real , ktime_set ( tai_offset , 0 ) ) ;
2012-05-04 03:30:07 +08:00
}
2020-11-13 15:24:33 +08:00
/*
2007-05-08 15:27:59 +08:00
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
*/
2009-08-14 21:47:30 +08:00
static int change_clocksource ( void * data )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2021-02-11 21:43:18 +08:00
struct clocksource * new , * old = NULL ;
2012-03-15 07:38:15 +08:00
unsigned long flags ;
2021-02-11 21:43:18 +08:00
bool change = false ;
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:30 +08:00
new = ( struct clocksource * ) data ;
2007-05-08 15:27:59 +08:00
2013-04-26 04:31:44 +08:00
/*
* If the cs is in module , get a module reference . Succeeds
* for built - in code ( owner = = NULL ) as well .
*/
if ( try_module_get ( new - > owner ) ) {
2021-02-11 21:43:18 +08:00
if ( ! new - > enable | | new - > enable ( new ) = = 0 )
change = true ;
else
2013-04-26 04:31:44 +08:00
module_put ( new - > owner ) ;
2009-08-14 21:47:30 +08:00
}
2021-02-11 21:43:18 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
write_seqcount_begin ( & tk_core . seq ) ;
timekeeping_forward_now ( tk ) ;
if ( change ) {
old = tk - > tkr_mono . clock ;
tk_setup_internals ( tk , new ) ;
}
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2012-03-15 07:38:15 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2012-03-15 07:38:15 +08:00
2021-02-11 21:43:18 +08:00
if ( old ) {
if ( old - > disable )
old - > disable ( old ) ;
module_put ( old - > owner ) ;
}
2009-08-14 21:47:30 +08:00
return 0 ;
}
2007-05-08 15:27:59 +08:00
2009-08-14 21:47:30 +08:00
/**
* timekeeping_notify - Install a new clock source
* @ clock : pointer to the clock source
*
* This function is called from clocksource . c after a new , better clock
* source has been registered . The caller holds the clocksource_mutex .
*/
2013-04-26 04:31:44 +08:00
int timekeeping_notify ( struct clocksource * clock )
2009-08-14 21:47:30 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-28 02:48:13 +08:00
2015-03-19 17:09:06 +08:00
if ( tk - > tkr_mono . clock = = clock )
2013-04-26 04:31:44 +08:00
return 0 ;
2009-08-14 21:47:30 +08:00
stop_machine ( change_clocksource , clock , NULL ) ;
2007-05-08 15:27:59 +08:00
tick_clock_notify ( ) ;
2015-03-19 17:09:06 +08:00
return tk - > tkr_mono . clock = = clock ? 0 : - 1 ;
2007-05-08 15:27:59 +08:00
}
2009-08-14 21:47:30 +08:00
2008-08-21 07:37:30 +08:00
/**
2018-04-27 21:40:14 +08:00
* ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
2014-11-08 03:03:20 +08:00
* @ ts : pointer to the timespec64 to be set
2008-08-21 07:37:30 +08:00
*
* Returns the raw monotonic time ( completely un - modified by ntp )
*/
2018-04-27 21:40:14 +08:00
void ktime_get_raw_ts64 ( struct timespec64 * ts )
2008-08-21 07:37:30 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2016-12-09 04:49:34 +08:00
u64 nsecs ;
2008-08-21 07:37:30 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2017-05-23 08:20:20 +08:00
ts - > tv_sec = tk - > raw_sec ;
2015-03-19 16:28:44 +08:00
nsecs = timekeeping_get_ns ( & tk - > tkr_raw ) ;
2008-08-21 07:37:30 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2008-08-21 07:37:30 +08:00
2017-05-23 08:20:20 +08:00
ts - > tv_nsec = 0 ;
timespec64_add_ns ( ts , nsecs ) ;
2008-08-21 07:37:30 +08:00
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_raw_ts64 ) ;
2014-11-08 03:03:20 +08:00
2008-08-21 07:37:30 +08:00
2007-05-08 15:27:59 +08:00
/**
2008-02-08 20:19:24 +08:00
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
2007-05-08 15:27:59 +08:00
*/
2008-02-08 20:19:24 +08:00
int timekeeping_valid_for_hres ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2007-05-08 15:27:59 +08:00
int ret ;
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > flags & CLOCK_SOURCE_VALID_FOR_HRES ;
2007-05-08 15:27:59 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-05-08 15:27:59 +08:00
return ret ;
}
2009-08-19 01:45:10 +08:00
/**
* timekeeping_max_deferment - Returns max time the clocksource can be deferred
*/
u64 timekeeping_max_deferment ( void )
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2011-11-15 04:48:10 +08:00
u64 ret ;
2012-07-13 13:21:51 +08:00
2011-11-15 04:48:10 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2015-03-19 17:09:06 +08:00
ret = tk - > tkr_mono . clock - > max_idle_ns ;
2011-11-15 04:48:10 +08:00
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2011-11-15 04:48:10 +08:00
return ret ;
2009-08-19 01:45:10 +08:00
}
2007-05-08 15:27:59 +08:00
/**
2018-08-14 20:15:23 +08:00
* read_persistent_clock64 - Return time from the persistent clock .
2020-11-13 15:24:34 +08:00
* @ ts : Pointer to the storage for the readout value
2007-05-08 15:27:59 +08:00
*
* Weak dummy function for arches that do not yet support it .
2009-08-14 21:47:31 +08:00
* Reads the time from the battery backed persistent clock .
* Returns a timespec with tv_sec = 0 and tv_nsec = 0 if unsupported .
2007-05-08 15:27:59 +08:00
*
* XXX - Do be sure to remove it once all arches implement it .
*/
2018-08-14 20:15:23 +08:00
void __weak read_persistent_clock64 ( struct timespec64 * ts )
2007-05-08 15:27:59 +08:00
{
2009-08-14 21:47:31 +08:00
ts - > tv_sec = 0 ;
ts - > tv_nsec = 0 ;
2007-05-08 15:27:59 +08:00
}
2009-08-14 21:47:32 +08:00
/**
2018-07-20 04:55:34 +08:00
* read_persistent_wall_and_boot_offset - Read persistent clock , and also offset
* from the boot .
2023-01-03 11:28:49 +08:00
* @ wall_time : current time as returned by persistent clock
* @ boot_offset : offset that is defined as wall_time - boot_time
2009-08-14 21:47:32 +08:00
*
* Weak dummy function for arches that do not yet support it .
2020-11-13 15:24:35 +08:00
*
2018-07-20 04:55:35 +08:00
* The default function calculates offset based on the current value of
* local_clock ( ) . This way architectures that support sched_clock ( ) but don ' t
* support dedicated boot time clock will provide the best estimate of the
* boot time .
2009-08-14 21:47:32 +08:00
*/
2018-07-20 04:55:34 +08:00
void __weak __init
read_persistent_wall_and_boot_offset ( struct timespec64 * wall_time ,
struct timespec64 * boot_offset )
2009-08-14 21:47:32 +08:00
{
2018-07-20 04:55:34 +08:00
read_persistent_clock64 ( wall_time ) ;
2018-07-20 04:55:35 +08:00
* boot_offset = ns_to_timespec64 ( local_clock ( ) ) ;
2009-08-14 21:47:32 +08:00
}
2018-07-17 14:31:29 +08:00
/*
 * Flag reflecting whether sleep time still has to be injected after a
 * suspend.
 *
 * The flag starts off false and is only set to true by timekeeping_suspend().
 * timekeeping_resume() sets it back to false when it has injected the sleep
 * time itself. Otherwise the flag stays true, timekeeping_rtc_skipresume()
 * returns false, and the flag is cleared by timekeeping_inject_sleeptime64().
 *
 * If a suspend fails before timekeeping_suspend() runs, the flag is never
 * set and stays false, which prevents erroneous sleeptime injection.
 */
static bool suspend_timing_needed ;
2015-04-02 11:34:38 +08:00
/*
 * Flag for if there is a persistent clock on this platform. Set by
 * timekeeping_init() and timekeeping_suspend() when the persistent clock
 * returns a usable value; reported by timekeeping_rtc_skipsuspend().
 */
static bool persistent_clock_exists ;
2007-05-08 15:27:59 +08:00
/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Reads the persistent clock and the boot offset, sanitizes both, and then,
 * under the timekeeper lock, initializes NTP, enables the default
 * clocksource and seeds the timekeeper with the initial wall time and
 * wall_to_monotonic offset.
 */
void __init timekeeping_init(void)
{
	struct timespec64 wall_time, boot_offset, wall_to_mono;
	struct timekeeper *tk = &tk_core.timekeeper;
	struct clocksource *clock;
	unsigned long flags;

	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
	if (timespec64_valid_settod(&wall_time) &&
	    timespec64_to_ns(&wall_time) > 0) {
		persistent_clock_exists = true;
	} else if (timespec64_to_ns(&wall_time) != 0) {
		/*
		 * Non-zero but unusable reading: warn and start from zero.
		 * The message is newline terminated so that the log record
		 * is complete and cannot be continued by a later message.
		 */
		pr_warn("Persistent clock returned invalid value\n");
		wall_time = (struct timespec64){0};
	}

	/* A boot offset beyond the wall time is discarded */
	if (timespec64_compare(&wall_time, &boot_offset) < 0)
		boot_offset = (struct timespec64){0};

	/*
	 * We want to set wall_to_mono so that the following is true:
	 * wall time + wall_to_mono = boot time
	 */
	wall_to_mono = timespec64_sub(boot_offset, wall_time);

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&tk_core.seq);
	ntp_init();

	clock = clocksource_default_clock();
	if (clock->enable)
		clock->enable(clock);
	tk_setup_internals(tk, clock);

	tk_set_xtime(tk, &wall_time);
	tk->raw_sec = 0;

	tk_set_wall_to_mono(tk, wall_to_mono);

	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);

	write_seqcount_end(&tk_core.seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1 there was a system suspend, so old_delta was assigned T1.
Some time later a time adjustment happened and xtime got the
value T1-dt (0s < dt < 2s). If another system suspend follows
soon after this adjustment, we get a small negative delta_delta,
resulting in a negative timekeeping_suspend_time.
This is problematic: if, for example, there is no nonstop
clocksource, timekeeping_resume() will take the else branch and
inject an improper sleeptime, which is wrong.
Solve this by only running the delta related code when a
persistent clock exists. The code only makes sense for the
persistent clock case anyway.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
/*
 * Persistent clock reading taken by timekeeping_suspend() when suspend
 * began. timekeeping_resume() compares a fresh persistent clock reading
 * against it to derive the sleep interval when the clocksource did not
 * provide a suspend delta.
 */
2014-07-17 05:04:01 +08:00
static struct timespec64 timekeeping_suspend_time ;
2007-05-08 15:27:59 +08:00
2011-04-02 05:32:09 +08:00
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
2020-11-13 15:24:34 +08:00
* @ tk : Pointer to the timekeeper to be updated
* @ delta : Pointer to the delta value in timespec64 format
2011-04-02 05:32:09 +08:00
*
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables .
*/
2012-07-13 13:21:57 +08:00
static void __timekeeping_inject_sleeptime ( struct timekeeper * tk ,
2018-07-13 20:06:42 +08:00
const struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:01 +08:00
if ( ! timespec64_valid_strict ( delta ) ) {
2014-06-05 07:11:43 +08:00
printk_deferred ( KERN_WARNING
" __timekeeping_inject_sleeptime: Invalid "
" sleep delta value! \n " ) ;
2011-06-02 09:18:09 +08:00
return ;
}
2012-07-13 13:21:57 +08:00
tk_xtime_add ( tk , delta ) ;
2018-04-25 21:33:38 +08:00
tk_set_wall_to_mono ( tk , timespec64_sub ( tk - > wall_to_monotonic , * delta ) ) ;
2014-07-17 05:05:00 +08:00
tk_update_sleep_time ( tk , timespec64_to_ktime ( * delta ) ) ;
2013-05-22 13:32:14 +08:00
tk_debug_account_sleep_time ( delta ) ;
2011-04-02 05:32:09 +08:00
}
2015-04-02 11:34:35 +08:00
# if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
2023-01-03 11:28:49 +08:00
/*
2015-04-02 11:34:38 +08:00
* We have three kinds of time sources to use for sleep time
* injection , the preference order is :
* 1 ) non - stop clocksource
* 2 ) persistent clock ( ie : RTC accessible when irqs are off )
* 3 ) RTC
*
* 1 ) and 2 ) are used by timekeeping , 3 ) by RTC subsystem .
* If system has neither 1 ) nor 2 ) , 3 ) will be used finally .
*
*
* If timekeeping has injected sleeptime via either 1 ) or 2 ) ,
* 3 ) becomes needless , so in this case we don ' t need to call
* rtc_resume ( ) , and this is what timekeeping_rtc_skipresume ( )
* means .
*/
bool timekeeping_rtc_skipresume ( void )
{
2018-07-17 14:31:29 +08:00
return ! suspend_timing_needed ;
2015-04-02 11:34:38 +08:00
}
2023-01-03 11:28:49 +08:00
/*
 * Whether the non-stop clocksource, 1), can be used is only known in
 * timekeeping_resume(), which is invoked after rtc_suspend(), so
 * rtc_suspend() cannot be skipped just because the system has 1).
 *
 * But if the system has a persistent clock, 2), it will definitely be
 * used, so in this case we don't need to call rtc_suspend(), and this is
 * what timekeeping_rtc_skipsuspend() means.
 *
 * Returns the current value of persistent_clock_exists.
 */
bool timekeeping_rtc_skipsuspend(void)
{
	bool have_persistent_clock = persistent_clock_exists;

	return have_persistent_clock;
}
2011-04-02 05:32:09 +08:00
/**
2014-11-18 19:15:17 +08:00
* timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
* @ delta : pointer to a timespec64 delta value
2011-04-02 05:32:09 +08:00
*
2015-04-02 11:34:22 +08:00
* This hook is for architectures that cannot support read_persistent_clock64
2011-04-02 05:32:09 +08:00
* because their RTC / persistent clock is only accessible when irqs are enabled .
2015-04-02 11:34:38 +08:00
* and also don ' t have an effective nonstop clocksource .
2011-04-02 05:32:09 +08:00
*
* This function should only be called by rtc_resume ( ) , and allows
* a suspend offset to be injected into the timekeeping values .
*/
2018-07-13 20:06:42 +08:00
void timekeeping_inject_sleeptime64 ( const struct timespec64 * delta )
2011-04-02 05:32:09 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2011-04-02 05:32:09 +08:00
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2011-11-15 04:48:10 +08:00
2018-07-17 14:31:29 +08:00
suspend_timing_needed = false ;
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2011-04-02 05:32:09 +08:00
2014-11-18 19:15:17 +08:00
__timekeeping_inject_sleeptime ( tk , delta ) ;
2011-04-02 05:32:09 +08:00
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2011-04-02 05:32:09 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2011-04-02 05:32:09 +08:00
2021-07-13 21:39:53 +08:00
/* Signal hrtimers about time change */
clock_was_set ( CLOCK_SET_WALL | CLOCK_SET_BOOT ) ;
2011-04-02 05:32:09 +08:00
}
2015-04-02 11:34:35 +08:00
# endif
2011-04-02 05:32:09 +08:00
2007-05-08 15:27:59 +08:00
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem .
*/
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
void timekeeping_resume ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2015-03-19 17:09:06 +08:00
struct clocksource * clock = tk - > tkr_mono . clock ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts_new , ts_delta ;
2018-07-17 15:55:16 +08:00
u64 cycle_now , nsec ;
2018-07-17 14:31:29 +08:00
bool inject_sleeptime = false ;
2009-08-14 21:47:31 +08:00
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & ts_new ) ;
2007-05-08 15:27:59 +08:00
2012-08-06 07:40:41 +08:00
clockevents_resume ( ) ;
2007-05-14 17:10:02 +08:00
clocksource_resume ( ) ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2007-05-08 15:27:59 +08:00
2013-03-12 11:56:48 +08:00
/*
* After system resumes , we need to calculate the suspended time and
* compensate it for the OS time . There are 3 sources that could be
* used : Nonstop clocksource during suspend , persistent clock and rtc
* device .
*
* One specific platform may have 1 or 2 or all of them , and the
* preference will be :
* suspend - nonstop clocksource - > persistent clock - > rtc
* The less preferred source will only be tried if there is no better
* usable source . The rtc part is handled separately in rtc core code .
*/
2017-06-09 07:44:20 +08:00
cycle_now = tk_clock_read ( & tk - > tkr_mono ) ;
2018-07-17 15:55:16 +08:00
nsec = clocksource_stop_suspend_timing ( clock , cycle_now ) ;
if ( nsec > 0 ) {
2014-07-17 05:04:01 +08:00
ts_delta = ns_to_timespec64 ( nsec ) ;
2018-07-17 14:31:29 +08:00
inject_sleeptime = true ;
2014-07-17 05:04:01 +08:00
} else if ( timespec64_compare ( & ts_new , & timekeeping_suspend_time ) > 0 ) {
ts_delta = timespec64_sub ( ts_new , timekeeping_suspend_time ) ;
2018-07-17 14:31:29 +08:00
inject_sleeptime = true ;
2007-05-08 15:27:59 +08:00
}
2013-03-12 11:56:48 +08:00
2018-07-17 14:31:29 +08:00
if ( inject_sleeptime ) {
suspend_timing_needed = false ;
2013-03-12 11:56:48 +08:00
__timekeeping_inject_sleeptime ( tk , & ts_delta ) ;
2018-07-17 14:31:29 +08:00
}
2013-03-12 11:56:48 +08:00
/* Re-base the last cycle value */
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last = cycle_now ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last = cycle_now ;
2012-07-28 02:48:13 +08:00
tk - > ntp_error = 0 ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 0 ;
2013-06-27 18:35:46 +08:00
timekeeping_update ( tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
touch_softlockup_watchdog ( ) ;
2021-07-13 21:39:51 +08:00
/* Resume the clockevent device(s) and hrtimers */
2015-03-25 20:09:16 +08:00
tick_resume ( ) ;
2021-07-13 21:39:51 +08:00
/* Notify timerfd as resume is equivalent to clock_was_set() */
timerfd_resume ( ) ;
2007-05-08 15:27:59 +08:00
}
PM / sleep: Make it possible to quiesce timers during suspend-to-idle
The efficiency of suspend-to-idle depends on being able to keep CPUs
in the deepest available idle states for as much time as possible.
Ideally, they should only be brought out of idle by system wakeup
interrupts.
However, timer interrupts occurring periodically prevent that from
happening and it is not practical to chase all of the "misbehaving"
timers in a whack-a-mole fashion. A much more effective approach is
to suspend the local ticks for all CPUs and the entire timekeeping
along the lines of what is done during full suspend, which also
helps to keep suspend-to-idle and full suspend reasonably similar.
The idea is to suspend the local tick on each CPU executing
cpuidle_enter_freeze() and to make the last of them suspend the
entire timekeeping. That should prevent timer interrupts from
triggering until an IO interrupt wakes up one of the CPUs. It
needs to be done with interrupts disabled on all of the CPUs,
though, because otherwise the suspended clocksource might be
accessed by an interrupt handler which might lead to fatal
consequences.
Unfortunately, the existing ->enter callbacks provided by cpuidle
drivers generally cannot be used for implementing that, because some
of them re-enable interrupts temporarily and some idle entry methods
cause interrupts to be re-enabled automatically on exit. Also some
of these callbacks manipulate local clock event devices of the CPUs
which really shouldn't be done after suspending their ticks.
To overcome that difficulty, introduce a new cpuidle state callback,
->enter_freeze, that will be guaranteed (1) to keep interrupts
disabled all the time (and return with interrupts disabled) and (2)
not to touch the CPU timer devices. Modify cpuidle_enter_freeze() to
look for the deepest available idle state with ->enter_freeze present
and to make the CPU execute that callback with suspended tick (and the
last of the online CPUs to execute it with suspended timekeeping).
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2015-02-14 06:50:43 +08:00
int timekeeping_suspend ( void )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2011-11-15 06:05:44 +08:00
unsigned long flags ;
2014-07-17 05:04:01 +08:00
struct timespec64 delta , delta_delta ;
static struct timespec64 old_delta ;
2018-07-17 15:55:16 +08:00
struct clocksource * curr_clock ;
u64 cycle_now ;
2007-05-08 15:27:59 +08:00
2015-04-02 11:34:22 +08:00
read_persistent_clock64 ( & timekeeping_suspend_time ) ;
2007-09-16 21:36:43 +08:00
2013-05-18 02:24:05 +08:00
/*
* On some systems the persistent_clock can not be detected at
* timekeeping_init by its return value , so if we see a valid
* value returned , update the persistent_clock_exists flag .
*/
if ( timekeeping_suspend_time . tv_sec | | timekeeping_suspend_time . tv_nsec )
2015-04-02 11:34:38 +08:00
persistent_clock_exists = true ;
2013-05-18 02:24:05 +08:00
2018-07-17 14:31:29 +08:00
suspend_timing_needed = true ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2012-07-28 02:48:13 +08:00
timekeeping_forward_now ( tk ) ;
2007-05-08 15:27:59 +08:00
timekeeping_suspended = 1 ;
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
2018-07-17 15:55:16 +08:00
/*
* Since we ' ve called forward_now , cycle_last stores the value
* just read from the current clocksource . Save this to potentially
* use in suspend timing .
*/
curr_clock = tk - > tkr_mono . clock ;
cycle_now = tk - > tkr_mono . cycle_last ;
clocksource_start_suspend_timing ( curr_clock , cycle_now ) ;
2015-04-02 11:34:38 +08:00
if ( persistent_clock_exists ) {
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
/*
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
* To avoid drift caused by repeated suspend / resumes ,
* which each can add ~ 1 second drift error ,
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant .
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
*/
time: Fix a bug in timekeeping_suspend() with no persistent clock
When there's no persistent clock, normally
timekeeping_suspend_time should always be zero, but this can
break in timekeeping_suspend().
At T1, there was a system suspend, so old_delta was assigned T1.
After some time, one time adjustment happened, and xtime got the
value of T1-dt(0s<dt<2s). Then, there comes another system
suspend soon after this adjustment, obviously we will get a
small negative delta_delta, resulting in a negative
timekeeping_suspend_time.
This is problematic, when doing timekeeping_resume() if there is
no nonstop clocksource for example, it will hit the else leg and
inject the improper sleeptime which is the wrong logic.
So, we can solve this problem by only doing delta related code
when the persistent clock is existent. Actually the code only
makes sense for persistent clock cases.
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-02 11:34:37 +08:00
delta = timespec64_sub ( tk_xtime ( tk ) , timekeeping_suspend_time ) ;
delta_delta = timespec64_sub ( delta , old_delta ) ;
if ( abs ( delta_delta . tv_sec ) > = 2 ) {
/*
* if delta_delta is too large , assume time correction
* has occurred and set old_delta to the current delta .
*/
old_delta = delta ;
} else {
/* Otherwise try to adjust old_system to compensate */
timekeeping_suspend_time =
timespec64_add ( timekeeping_suspend_time , delta_delta ) ;
}
time: Avoid accumulating time drift in suspend/resume
Because the read_persistent_clock interface is usually backed by
only a second granular interface, each time we read from the persistent
clock for suspend/resume, we introduce a half second (on average) of error.
In order to avoid this error accumulating as the system is suspended
over and over, this patch measures the time delta between the persistent
clock and the system CLOCK_REALTIME.
If the delta is less then 2 seconds from the last suspend, we compensate
by using the previous time delta (keeping it close). If it is larger
then 2 seconds, we assume the clock was set or has been changed, so we
do no correction and update the delta.
Note: If NTP is running, ths could seem to "fight" with the NTP corrected
time, where as if the system time was off by 1 second, and NTP slewed the
value in, a suspend/resume cycle could undo this correction, by trying to
restore the previous offset from the persistent clock. However, without
this patch, since each read could cause almost a full second worth of
error, its possible to get almost 2 seconds of error just from the
suspend/resume cycle alone, so this about equal to any offset added by
the compensation.
Further on systems that suspend/resume frequently, this should keep time
closer then NTP could compensate for if the errors were allowed to
accumulate.
Credits to Arve Hjønnevåg for suggesting this solution.
CC: Arve Hjønnevåg <arve@android.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2011-06-01 13:53:23 +08:00
}
2013-12-12 11:10:36 +08:00
timekeeping_update ( tk , TK_MIRROR ) ;
2015-02-13 21:49:02 +08:00
halt_fast_timekeeper ( tk ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
2015-03-25 20:09:16 +08:00
tick_suspend ( ) ;
2010-02-03 06:41:41 +08:00
clocksource_suspend ( ) ;
2012-08-06 07:40:41 +08:00
clockevents_suspend ( ) ;
2007-05-08 15:27:59 +08:00
return 0 ;
}
/* sysfs resume/suspend bits for timekeeping */
2011-03-24 05:16:04 +08:00
static struct syscore_ops timekeeping_syscore_ops = {
2007-05-08 15:27:59 +08:00
. resume = timekeeping_resume ,
. suspend = timekeeping_suspend ,
} ;
2011-03-24 05:16:04 +08:00
/*
 * Register timekeeping_syscore_ops so that its suspend and resume
 * callbacks are known to the syscore layer. Wired up as a device
 * initcall; always returns 0.
 */
static int __init timekeeping_init_ops(void)
{
	register_syscore_ops(&timekeeping_syscore_ops);

	return 0;
}
device_initcall(timekeeping_init_ops);
2007-05-08 15:27:59 +08:00
/*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* Apply a multiplier adjustment to the timekeeper
2007-05-08 15:27:59 +08:00
*/
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
static __always_inline void timekeeping_apply_adjustment ( struct timekeeper * tk ,
s64 offset ,
2018-03-10 02:42:48 +08:00
s32 mult_adj )
2007-05-08 15:27:59 +08:00
{
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
s64 interval = tk - > cycle_interval ;
2007-05-08 15:27:59 +08:00
2018-03-10 02:42:48 +08:00
if ( mult_adj = = 0 ) {
return ;
} else if ( mult_adj = = - 1 ) {
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
interval = - interval ;
2018-03-10 02:42:48 +08:00
offset = - offset ;
} else if ( mult_adj ! = 1 ) {
interval * = mult_adj ;
offset * = mult_adj ;
2012-08-05 03:21:14 +08:00
}
2007-05-08 15:27:59 +08:00
2011-10-28 09:12:42 +08:00
/*
* So the following can be confusing .
*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* To keep things simple , lets assume mult_adj = = 1 for now .
2011-10-28 09:12:42 +08:00
*
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
* When mult_adj ! = 1 , remember that the interval and offset values
2011-10-28 09:12:42 +08:00
* have been appropriately scaled so the math is the same .
*
* The basic idea here is that we ' re increasing the multiplier
* by one , this causes the xtime_interval to be incremented by
* one cycle_interval . This is because :
* xtime_interval = cycle_interval * mult
* So if mult is being incremented by one :
* xtime_interval = cycle_interval * ( mult + 1 )
* Its the same as :
* xtime_interval = ( cycle_interval * mult ) + cycle_interval
* Which can be shortened to :
* xtime_interval + = cycle_interval
*
* So offset stores the non - accumulated cycles . Thus the current
* time ( in shifted nanoseconds ) is :
* now = ( offset * adj ) + xtime_nsec
* Now , even though we ' re adjusting the clock frequency , we have
* to keep time consistent . In other words , we can ' t jump back
* in time , and we also want to avoid jumping forward in time .
*
* So given the same offset value , we need the time to be the same
* both before and after the freq adjustment .
* now = ( offset * adj_1 ) + xtime_nsec_1
* now = ( offset * adj_2 ) + xtime_nsec_2
* So :
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * adj_2 ) + xtime_nsec_2
* And we know :
* adj_2 = adj_1 + 1
* So :
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * ( adj_1 + 1 ) ) + xtime_nsec_2
* ( offset * adj_1 ) + xtime_nsec_1 =
* ( offset * adj_1 ) + offset + xtime_nsec_2
* Canceling the sides :
* xtime_nsec_1 = offset + xtime_nsec_2
* Which gives us :
* xtime_nsec_2 = xtime_nsec_1 - offset
2021-03-23 05:39:03 +08:00
* Which simplifies to :
2011-10-28 09:12:42 +08:00
* xtime_nsec - = offset
*/
2015-03-19 17:09:06 +08:00
if ( ( mult_adj > 0 ) & & ( tk - > tkr_mono . mult + mult_adj < mult_adj ) ) {
time: Avoid possible NTP adjustment mult overflow.
Ideally, __clocksource_updatefreq_scale, selects the largest shift
value possible for a clocksource. This results in the mult memember of
struct clocksource being particularly large, although not so large
that NTP would adjust the clock to cause it to overflow.
That said, nothing actually prohibits an overflow from occuring, its
just that it "shouldn't" occur.
So while very unlikely, and so far never observed, the value of
(cs->mult+cs->maxadj) may have a chance to reach very near 0xFFFFFFFF,
so there is a possibility it may overflow when doing NTP positive
adjustment
See the following detail: When NTP slewes the clock, kernel goes
through update_wall_time()->...->timekeeping_apply_adjustment():
tk->tkr.mult += mult_adj;
Since there is no guard against it, its possible tk->tkr.mult may
overflow during this operation.
This patch avoids any possible mult overflow by judging the overflow
case before adding mult_adj to mult, also adds the WARNING message
when capturing such case.
Signed-off-by: pang.xunlei <pang.xunlei@linaro.org>
[jstultz: Reworded commit message]
Signed-off-by: John Stultz <john.stultz@linaro.org>
2014-10-08 15:03:34 +08:00
/* NTP adjustment caused clocksource mult overflow */
WARN_ON_ONCE ( 1 ) ;
return ;
}
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . mult + = mult_adj ;
2012-07-13 13:21:57 +08:00
tk - > xtime_interval + = interval ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec - = offset ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
}
/*
2018-03-10 02:42:48 +08:00
* Adjust the timekeeper ' s multiplier to the correct frequency
* and also to reduce the accumulated error value .
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
*/
2018-03-10 02:42:48 +08:00
static void timekeeping_adjust ( struct timekeeper * tk , s64 offset )
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
{
2018-03-10 02:42:48 +08:00
u32 mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2015-12-04 02:23:30 +08:00
/*
2018-03-10 02:42:48 +08:00
* Determine the multiplier from the current NTP tick length .
* Avoid expensive division when the tick length doesn ' t change .
2015-12-04 02:23:30 +08:00
*/
2018-03-10 02:42:48 +08:00
if ( likely ( tk - > ntp_tick = = ntp_tick_length ( ) ) ) {
mult = tk - > tkr_mono . mult - tk - > ntp_err_mult ;
} else {
tk - > ntp_tick = ntp_tick_length ( ) ;
mult = div64_u64 ( ( tk - > ntp_tick > > tk - > ntp_error_shift ) -
tk - > xtime_remainder , tk - > cycle_interval ) ;
2015-12-04 02:23:30 +08:00
}
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2018-03-10 02:42:48 +08:00
/*
* If the clock is behind the NTP time , increase the multiplier by 1
* to catch up with it . If it ' s ahead and there was a remainder in the
* tick division , the clock will slow down . Otherwise it will stay
* ahead until the tick length changes to a non - divisible value .
*/
tk - > ntp_err_mult = tk - > ntp_error > 0 ? 1 : 0 ;
mult + = tk - > ntp_err_mult ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2018-03-10 02:42:48 +08:00
timekeeping_apply_adjustment ( tk , offset , mult - tk - > tkr_mono . mult ) ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
2015-03-19 17:09:06 +08:00
if ( unlikely ( tk - > tkr_mono . clock - > maxadj & &
( abs ( tk - > tkr_mono . mult - tk - > tkr_mono . clock - > mult )
> tk - > tkr_mono . clock - > maxadj ) ) ) {
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
printk_once ( KERN_WARNING
" Adjusting %s more than 11%% (%ld vs %ld) \n " ,
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . clock - > name , ( long ) tk - > tkr_mono . mult ,
( long ) tk - > tkr_mono . clock - > mult + tk - > tkr_mono . clock - > maxadj ) ;
timekeeping: Rework frequency adjustments to work better w/ nohz
The existing timekeeping_adjust logic has always been complicated
to understand. Further, since it was developed prior to NOHZ becoming
common, its not surprising it performs poorly when NOHZ is enabled.
Since Miroslav pointed out the problematic nature of the existing code
in the NOHZ case, I've tried to refactor the code to perform better.
The problem with the previous approach was that it tried to adjust
for the total cumulative error using a scaled dampening factor. This
resulted in large errors to be corrected slowly, while small errors
were corrected quickly. With NOHZ the timekeeping code doesn't know
how far out the next tick will be, so this results in bad
over-correction to small errors, and insufficient correction to large
errors.
Inspired by Miroslav's patch, I've refactored the code to try to
address the correction in two steps.
1) Check the future freq error for the next tick, and if the frequency
error is large, try to make sure we correct it so it doesn't cause
much accumulated error.
2) Then make a small single unit adjustment to correct any cumulative
error that has collected over time.
This method performs fairly well in the simulator Miroslav created.
Major credit to Miroslav for pointing out the issue, providing the
original patch to resolve this, a simulator for testing, as well as
helping debug and resolve issues in my implementation so that it
performed closer to his original implementation.
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Reported-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-07 09:25:21 +08:00
}
2012-07-13 13:21:56 +08:00
/*
* It may be possible that when we entered this function , xtime_nsec
* was very small . Further , if we ' re slightly speeding the clocksource
* in the code above , its possible the required corrective factor to
* xtime_nsec could cause it to underflow .
*
2018-03-10 02:42:48 +08:00
* Now , since we have already accumulated the second and the NTP
* subsystem has been notified via second_overflow ( ) , we need to skip
* the next update .
2012-07-13 13:21:56 +08:00
*/
2015-03-19 17:09:06 +08:00
if ( unlikely ( ( s64 ) tk - > tkr_mono . xtime_nsec < 0 ) ) {
2018-03-10 02:42:48 +08:00
tk - > tkr_mono . xtime_nsec + = ( u64 ) NSEC_PER_SEC < <
tk - > tkr_mono . shift ;
tk - > xtime_sec - - ;
tk - > skip_second_overflow = 1 ;
2012-07-13 13:21:56 +08:00
}
2007-05-08 15:27:59 +08:00
}
2020-11-13 15:24:33 +08:00
/*
2012-07-13 13:21:54 +08:00
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
2015-08-25 14:42:53 +08:00
* Helper function that accumulates the nsecs greater than a second
2012-07-13 13:21:54 +08:00
* from the xtime_nsec field to the xtime_secs field .
* It also calls into the NTP code to handle leapsecond processing .
*/
2013-06-27 18:35:46 +08:00
static inline unsigned int accumulate_nsecs_to_secs ( struct timekeeper * tk )
2012-07-13 13:21:54 +08:00
{
2015-03-19 17:09:06 +08:00
u64 nsecps = ( u64 ) NSEC_PER_SEC < < tk - > tkr_mono . shift ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2012-07-13 13:21:54 +08:00
2015-03-19 17:09:06 +08:00
while ( tk - > tkr_mono . xtime_nsec > = nsecps ) {
2012-07-13 13:21:54 +08:00
int leap ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec - = nsecps ;
2012-07-13 13:21:54 +08:00
tk - > xtime_sec + + ;
2018-03-10 02:42:48 +08:00
/*
* Skip NTP update if this second was accumulated before ,
* i . e . xtime_nsec underflowed in timekeeping_adjust ( )
*/
if ( unlikely ( tk - > skip_second_overflow ) ) {
tk - > skip_second_overflow = 0 ;
continue ;
}
2012-07-13 13:21:54 +08:00
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow ( tk - > xtime_sec ) ;
2012-07-28 02:48:12 +08:00
if ( unlikely ( leap ) ) {
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2012-07-28 02:48:12 +08:00
tk - > xtime_sec + = leap ;
2012-07-13 13:21:54 +08:00
2012-07-28 02:48:12 +08:00
ts . tv_sec = leap ;
ts . tv_nsec = 0 ;
tk_set_wall_to_mono ( tk ,
2014-07-17 05:04:01 +08:00
timespec64_sub ( tk - > wall_to_monotonic , ts ) ) ;
2012-07-28 02:48:12 +08:00
2012-05-04 03:30:07 +08:00
__timekeeping_set_tai_offset ( tk , tk - > tai_offset - leap ) ;
2013-12-12 12:07:49 +08:00
clock_set = TK_CLOCK_WAS_SET ;
2012-07-28 02:48:12 +08:00
}
2012-07-13 13:21:54 +08:00
}
2013-12-12 12:07:49 +08:00
return clock_set ;
2012-07-13 13:21:54 +08:00
}
2020-11-13 15:24:33 +08:00
/*
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However, users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach than just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* logarithmic_accumulation - shifted accumulation of cycles
*
* This function accumulates a shifted interval of cycles into
2020-08-07 11:32:48 +08:00
* a shifted interval of nanoseconds. Allows for O(log) accumulation
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* loop .
*
* Returns the unconsumed cycles .
*/
2016-12-22 03:32:01 +08:00
static u64 logarithmic_accumulation ( struct timekeeper * tk , u64 offset ,
u32 shift , unsigned int * clock_set )
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
{
2016-12-22 03:32:01 +08:00
u64 interval = tk - > cycle_interval < < shift ;
2017-06-09 07:44:21 +08:00
u64 snsec_per_sec ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2015-08-25 14:42:53 +08:00
/* If the offset is smaller than a shifted interval, do nothing */
2013-02-22 06:51:36 +08:00
if ( offset < interval )
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
return offset ;
/* Accumulate one shifted interval */
2013-02-22 06:51:36 +08:00
offset - = interval ;
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last + = interval ;
2015-03-19 16:28:44 +08:00
tk - > tkr_raw . cycle_last + = interval ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . xtime_nsec + = tk - > xtime_interval < < shift ;
2013-12-12 12:07:49 +08:00
* clock_set | = accumulate_nsecs_to_secs ( tk ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for a while without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
2010-08-10 05:20:09 +08:00
/* Accumulate raw time */
2017-06-09 07:44:21 +08:00
tk - > tkr_raw . xtime_nsec + = tk - > raw_interval < < shift ;
snsec_per_sec = ( u64 ) NSEC_PER_SEC < < tk - > tkr_raw . shift ;
while ( tk - > tkr_raw . xtime_nsec > = snsec_per_sec ) {
tk - > tkr_raw . xtime_nsec - = snsec_per_sec ;
2017-05-23 08:20:20 +08:00
tk - > raw_sec + + ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However, users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach than just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for a while without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
}
/* Accumulate error between NTP and clock interval */
2014-04-24 11:53:29 +08:00
tk - > ntp_error + = tk - > ntp_tick < < shift ;
2012-07-13 13:21:57 +08:00
tk - > ntp_error - = ( tk - > xtime_interval + tk - > xtime_remainder ) < <
( tk - > ntp_error_shift + shift ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However, users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach than just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for a while without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
return offset ;
}
2018-06-04 21:34:21 +08:00
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
* @mode: advance mode. With TK_ADV_TICK the update is skipped when the
*        cycle delta since tkr_mono.cycle_last is below
*        real_tk->cycle_interval.
*
* Runs with timekeeper_lock held (irqsave) for the whole function. The
* elapsed cycles are accumulated into shadow_timekeeper; afterwards
* timekeeping_update() and the memcpy of the shadow over
* tk_core.timekeeper are done inside the tk_core.seq write section.
*
* Nothing is updated while timekeeping_suspended is set.
*
* Returns true when clock_set ended up non-zero, i.e. when
* logarithmic_accumulation() or accumulate_nsecs_to_secs() set bits in
* it, and false otherwise (including both early exits, where clock_set
* still holds its initial 0).
2007-05-08 15:27:59 +08:00
*/
2021-07-13 21:39:52 +08:00
static bool timekeeping_advance ( enum timekeeping_adv_mode mode )
2007-05-08 15:27:59 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * real_tk = & tk_core . timekeeper ;
2013-02-22 06:51:40 +08:00
struct timekeeper * tk = & shadow_timekeeper ;
2016-12-22 03:32:01 +08:00
u64 offset ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
int shift = 0 , maxshift ;
2013-12-12 12:07:49 +08:00
unsigned int clock_set = 0 ;
2011-11-15 04:48:10 +08:00
unsigned long flags ;
2013-02-22 06:51:38 +08:00
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2007-05-08 15:27:59 +08:00
/* Make sure we're fully resumed: */
if ( unlikely ( timekeeping_suspended ) )
2011-11-15 04:48:10 +08:00
goto out ;
2007-05-08 15:27:59 +08:00
2017-06-09 07:44:20 +08:00
offset = clocksource_delta ( tk_clock_read ( & tk - > tkr_mono ) ,
2015-03-19 17:09:06 +08:00
tk - > tkr_mono . cycle_last , tk - > tkr_mono . mask ) ;
2007-05-08 15:27:59 +08:00
2012-08-22 08:30:49 +08:00
/* Check if there's really nothing to do */
2018-06-04 21:34:21 +08:00
if ( offset < real_tk - > cycle_interval & & mode = = TK_ADV_TICK )
2012-08-22 08:30:49 +08:00
goto out ;
2015-03-12 12:16:32 +08:00
/* Do some additional sanity checking */
2017-06-28 21:21:35 +08:00
timekeeping_check_update ( tk , offset ) ;
2015-03-12 12:16:32 +08:00
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* ( think " ticks " ) worth of time at once . To do this efficiently ,
* we calculate the largest doubling multiple of cycle_intervals
2012-03-15 11:28:56 +08:00
* that is smaller than the offset . We then accumulate that
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
* chunk in one go , and then try to consume the next smaller
* doubled multiple .
2007-05-08 15:27:59 +08:00
*/
2012-07-28 02:48:13 +08:00
shift = ilog2 ( offset ) - ilog2 ( tk - > cycle_interval ) ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = max ( 0 , shift ) ;
2012-03-15 11:28:56 +08:00
/* Bound shift to one less than what overflows tick_length */
2011-11-15 05:18:07 +08:00
maxshift = ( 64 - ( ilog2 ( ntp_tick_length ( ) ) + 1 ) ) - 1 ;
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-03 07:17:53 +08:00
shift = min ( shift , maxshift ) ;
2012-07-28 02:48:13 +08:00
while ( offset > = tk - > cycle_interval ) {
2013-12-12 12:07:49 +08:00
offset = logarithmic_accumulation ( tk , offset , shift ,
& clock_set ) ;
2012-07-28 02:48:13 +08:00
if ( offset < tk - > cycle_interval < < shift )
2010-03-19 05:47:30 +08:00
shift - - ;
2007-05-08 15:27:59 +08:00
}
2018-03-10 02:42:48 +08:00
/* Adjust the multiplier to correct NTP error */
2012-07-28 02:48:13 +08:00
timekeeping_adjust ( tk , offset ) ;
2007-05-08 15:27:59 +08:00
2010-04-07 05:30:51 +08:00
/*
* Finally , make sure that after the rounding
2012-07-13 13:21:53 +08:00
* xtime_nsec isn ' t larger than NSEC_PER_SEC
2010-04-07 05:30:51 +08:00
*/
2013-12-12 12:07:49 +08:00
clock_set | = accumulate_nsecs_to_secs ( tk ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-02-22 06:51:40 +08:00
/*
* Update the real timekeeper .
*
* We could avoid this memcpy by switching pointers , but that
* requires changes to all other timekeeper usage sites as
* well , i . e . move the timekeeper pointer getter into the
* spinlocked / seqcount protected sections . And we trade this
2014-07-17 05:04:07 +08:00
* memcpy under the tk_core . seq against one before we start
2013-02-22 06:51:40 +08:00
* updating .
*/
timekeeping: Copy the shadow-timekeeper over the real timekeeper last
The fix in d151832650ed9 (time: Move clock_was_set_seq update
before updating shadow-timekeeper) was unfortunately incomplete.
The main gist of that change was to do the shadow-copy update
last, so that any state changes were properly duplicated, and
we wouldn't accidentally have stale data in the shadow.
Unfortunately in the main update_wall_time() logic, we update
use the shadow-timekeeper to calculate the next update values,
then while holding the lock, copy the shadow-timekeeper over,
then call timekeeping_update() to do some additional
bookkeeping, (skipping the shadow mirror). The bug with this is
the additional bookkeeping isn't all read-only, and some
changes timkeeper state. Thus we might then overwrite this state
change on the next update.
To avoid this problem, do the timekeeping_update() on the
shadow-timekeeper prior to copying the full state over to
the real-timekeeper.
This avoids problems with both the clock_was_set_seq and
next_leap_ktime being overwritten and possibly the
fast-timekeepers as well.
Many thanks to Prarit for his rigorous testing, which discovered
this problem, along with Prarit and Daniel's work validating this
fix.
Reported-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-18 01:05:53 +08:00
timekeeping_update ( tk , clock_set ) ;
2013-02-22 06:51:40 +08:00
memcpy ( real_tk , tk , sizeof ( * tk ) ) ;
timekeeping: Copy the shadow-timekeeper over the real timekeeper last
The fix in d151832650ed9 (time: Move clock_was_set_seq update
before updating shadow-timekeeper) was unfortunately incomplete.
The main gist of that change was to do the shadow-copy update
last, so that any state changes were properly duplicated, and
we wouldn't accidentally have stale data in the shadow.
Unfortunately in the main update_wall_time() logic, we update
use the shadow-timekeeper to calculate the next update values,
then while holding the lock, copy the shadow-timekeeper over,
then call timekeeping_update() to do some additional
bookkeeping, (skipping the shadow mirror). The bug with this is
the additional bookkeeping isn't all read-only, and some
changes timkeeper state. Thus we might then overwrite this state
change on the next update.
To avoid this problem, do the timekeeping_update() on the
shadow-timekeeper prior to copying the full state over to
the real-timekeeper.
This avoids problems with both the clock_was_set_seq and
next_leap_ktime being overwritten and possibly the
fast-timekeepers as well.
Many thanks to Prarit for his rigorous testing, which discovered
this problem, along with Prarit and Daniel's work validating this
fix.
Reported-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-18 01:05:53 +08:00
/* The memcpy must come last. Do not put anything here! */
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-02-22 06:51:40 +08:00
out :
2013-02-22 06:51:38 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2021-07-13 21:39:52 +08:00
return ! ! clock_set ;
2007-05-08 15:27:59 +08:00
}
2007-07-16 14:39:41 +08:00
2018-06-04 21:34:21 +08:00
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
* Calls timekeeping_advance() in TK_ADV_TICK mode and, when that returns
* true, calls clock_was_set_delayed().
*/
void update_wall_time ( void )
{
2021-07-13 21:39:52 +08:00
if ( timekeeping_advance ( TK_ADV_TICK ) )
clock_was_set_delayed ( ) ;
2018-06-04 21:34:21 +08:00
}
2007-07-16 14:39:41 +08:00
/**
2014-12-09 04:00:09 +08:00
* getboottime64 - Return the real time of system boot .
* @ ts : pointer to the timespec64 to be set
2007-07-16 14:39:41 +08:00
*
2014-12-09 04:00:09 +08:00
* Returns the wall - time of boot in a timespec64 .
2007-07-16 14:39:41 +08:00
*
* This is based on the wall_to_monotonic offset and the total suspend
* time . Calls to settimeofday will affect the value returned ( which
* basically means that however wrong your real time clock is at boot time ,
* you get the right time here ) .
*
* The value stored in @ts is ktime_sub(offs_real, offs_boot) of
* tk_core.timekeeper, converted with ktime_to_timespec64(). Both offsets
* are read directly here, without a tk_core.seq read/retry loop.
*/
2014-12-09 04:00:09 +08:00
void getboottime64 ( struct timespec64 * ts )
2007-07-16 14:39:41 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2018-04-25 21:33:38 +08:00
ktime_t t = ktime_sub ( tk - > offs_real , tk - > offs_boot ) ;
2014-07-17 05:04:58 +08:00
2014-12-09 04:00:09 +08:00
* ts = ktime_to_timespec64 ( t ) ;
2007-07-16 14:39:41 +08:00
}
2014-12-09 04:00:09 +08:00
EXPORT_SYMBOL_GPL ( getboottime64 ) ;
2007-07-16 14:39:41 +08:00
2018-04-27 21:40:14 +08:00
/*
* ktime_get_coarse_real_ts64 - store the timekeeper's xtime in @ts
* @ts: pointer to the timespec64 to be set
*
* Copies the value returned by tk_xtime() for tk_core.timekeeper into
* @ts. The copy is repeated for as long as read_seqcount_retry() on
* tk_core.seq asks for a retry.
*/
void ktime_get_coarse_real_ts64 ( struct timespec64 * ts )
2007-07-25 08:47:43 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2007-07-25 08:47:43 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2018-04-27 21:40:14 +08:00
* ts = tk_xtime ( tk ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2007-07-25 08:47:43 +08:00
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_coarse_real_ts64 ) ;
2009-08-20 10:13:34 +08:00
2018-04-27 21:40:14 +08:00
/*
* ktime_get_coarse_ts64 - store tk_xtime() plus wall_to_monotonic in @ts
* @ts: pointer to the timespec64 to be set
*
* Snapshots tk_xtime() and wall_to_monotonic of tk_core.timekeeper inside
* a tk_core.seq read/retry loop. After the loop the two snapshots are
* added field by field and the sum is stored in @ts through
* set_normalized_timespec64().
*/
void ktime_get_coarse_ts64 ( struct timespec64 * ts )
2009-08-20 10:13:34 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2014-07-17 05:04:01 +08:00
struct timespec64 now , mono ;
2019-03-19 03:55:56 +08:00
unsigned int seq ;
2009-08-20 10:13:34 +08:00
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
Revert "time: Remove xtime_cache"
This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as
requested by John Stultz. Quoting John:
"Petr Titěra reported an issue where he saw odd atime regressions with
2.6.33 where there were a full second worth of nanoseconds in the
nanoseconds field.
He also reviewed the time code and narrowed down the problem: unhandled
overflow of the nanosecond field caused by rounding up the
sub-nanosecond accumulated time.
Details:
* At the end of update_wall_time(), we currently round up the
sub-nanosecond portion of accumulated time when storing it into xtime.
This was added to avoid time inconsistencies caused when the
sub-nanosecond portion was truncated when storing into xtime.
Unfortunately we don't handle the possible second overflow caused by
that rounding.
* Previously the xtime_cache code hid this overflow by normalizing the
xtime value when storing into the xtime_cache.
* We could try to handle the second overflow after the rounding up, but
since this affects the timekeeping's internal state, this would further
complicate the next accumulation cycle, causing small errors in ntp
steering. As much as I'd like to get rid of it, the xtime_cache code is
known to work.
* The correct fix is really to include the sub-nanosecond portion in the
timekeeping accessor function, so we don't need to round up at during
accumulation. This would greatly simplify the accumulation code.
Unfortunately, we can't do this safely until the last three
non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those
patches are in -mm) and we kill off the spots where arches set xtime
directly. This is all 2.6.34 material, so I think reverting the
xtime_cache change is the best approach for now.
Many thanks to Petr for both reporting and finding the issue!"
Reported-by: Petr Titěra <P.Titera@century.cz>
Requested-by: john stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-23 06:10:37 +08:00
2012-07-28 02:48:13 +08:00
now = tk_xtime ( tk ) ;
mono = tk - > wall_to_monotonic ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2009-08-20 10:13:34 +08:00
2018-04-27 21:40:14 +08:00
set_normalized_timespec64 ( ts , now . tv_sec + mono . tv_sec ,
2009-08-20 10:13:34 +08:00
now . tv_nsec + mono . tv_nsec ) ;
}
2018-04-27 21:40:14 +08:00
EXPORT_SYMBOL ( ktime_get_coarse_ts64 ) ;
2011-01-27 22:58:55 +08:00
/*
* do_timer - add @ticks to jiffies_64, then call calc_global_load()
* @ticks: number of ticks to add
*
2012-02-29 08:50:11 +08:00
* Must hold jiffies_lock
2011-01-27 22:58:55 +08:00
*/
void do_timer ( unsigned long ticks )
{
jiffies_64 + = ticks ;
2020-07-02 02:34:18 +08:00
calc_global_load ( ) ;
2011-01-27 22:58:55 +08:00
}
2011-01-27 22:59:05 +08:00
2012-07-11 06:43:24 +08:00
/**
2014-07-17 05:03:52 +08:00
* ktime_get_update_offsets_now - hrtimer helper
2015-04-15 05:08:37 +08:00
* @ cwsseq : pointer to check and store the clock was set sequence number
2012-07-11 06:43:24 +08:00
* @ offs_real : pointer to storage for monotonic - > realtime offset
2018-04-25 21:33:38 +08:00
* @ offs_boot : pointer to storage for monotonic - > boottime offset
2013-10-18 09:13:30 +08:00
* @ offs_tai : pointer to storage for monotonic - > clock tai offset
2012-07-11 06:43:24 +08:00
*
2015-04-15 05:08:37 +08:00
* Returns current monotonic time and updates the offsets if the
* sequence number in @ cwsseq and timekeeper . clock_was_set_seq are
* different .
*
* Independently of that sequence check, once the computed time is at or
* past tk->next_leap_ktime, @offs_real is overwritten with tk->offs_real
* minus one second. The returned value itself is not adjusted.
*
2013-10-18 09:13:30 +08:00
* Called from hrtimer_interrupt ( ) or retrigger_next_event ( )
2012-07-11 06:43:24 +08:00
*/
2015-04-15 05:08:37 +08:00
ktime_t ktime_get_update_offsets_now ( unsigned int * cwsseq , ktime_t * offs_real ,
2018-04-25 21:33:38 +08:00
ktime_t * offs_boot , ktime_t * offs_tai )
2012-07-11 06:43:24 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2012-07-11 06:43:24 +08:00
unsigned int seq ;
2014-07-17 05:04:19 +08:00
ktime_t base ;
u64 nsecs ;
2012-07-11 06:43:24 +08:00
/*
 * Seqcount read loop: the whole body is re-run whenever
 * read_seqcount_retry() reports that tk_core.seq changed.
 */
do {
2014-07-17 05:04:07 +08:00
seq = read_seqcount_begin ( & tk_core . seq ) ;
2012-07-11 06:43:24 +08:00
2015-03-19 17:09:06 +08:00
base = tk - > tkr_mono . base ;
nsecs = timekeeping_get_ns ( & tk - > tkr_mono ) ;
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* base becomes tkr_mono.base plus the nanoseconds from timekeeping_get_ns() */
base = ktime_add_ns ( base , nsecs ) ;
2015-04-15 05:08:37 +08:00
/* The cached sequence number is stale: refresh it and all three offsets */
if ( * cwsseq ! = tk - > clock_was_set_seq ) {
* cwsseq = tk - > clock_was_set_seq ;
* offs_real = tk - > offs_real ;
2018-04-25 21:33:38 +08:00
* offs_boot = tk - > offs_boot ;
2015-04-15 05:08:37 +08:00
* offs_tai = tk - > offs_tai ;
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
/* Handle leapsecond insertion adjustments */
2016-12-25 18:38:40 +08:00
if ( unlikely ( base > = tk - > next_leap_ktime ) )
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
* offs_real = ktime_sub ( tk - > offs_real , ktime_set ( 1 , 0 ) ) ;
2014-07-17 05:04:07 +08:00
} while ( read_seqcount_retry ( & tk_core . seq , seq ) ) ;
2012-07-11 06:43:24 +08:00
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
return base ;
2012-07-11 06:43:24 +08:00
}
2020-11-13 15:24:33 +08:00
/*
2017-10-19 19:14:45 +08:00
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*
* Returns 0 when @txc passes every check, -EINVAL for an invalid mode
* combination or an out-of-range tick, time or freq value, and -EPERM when
* a modifying request is made without CAP_SYS_TIME.
2017-10-19 19:14:44 +08:00
*/
2018-07-03 13:44:21 +08:00
static int timekeeping_validate_timex ( const struct __kernel_timex * txc )
2017-10-19 19:14:44 +08:00
{
if ( txc - > modes & ADJ_ADJTIME ) {
/* singleshot must not be used with any other mode bits */
/* i.e. ADJ_ADJTIME is rejected unless ADJ_OFFSET_SINGLESHOT is also set */
if ( ! ( txc - > modes & ADJ_OFFSET_SINGLESHOT ) )
return - EINVAL ;
/* Without ADJ_OFFSET_READONLY the caller needs CAP_SYS_TIME */
if ( ! ( txc - > modes & ADJ_OFFSET_READONLY ) & &
! capable ( CAP_SYS_TIME ) )
return - EPERM ;
} else {
/* In order to modify anything, you gotta be super-user! */
if ( txc - > modes & & ! capable ( CAP_SYS_TIME ) )
return - EPERM ;
/*
* if the quartz is off by more than 10 % then
* something is VERY wrong !
*/
if ( txc - > modes & ADJ_TICK & &
( txc - > tick < 900000 / USER_HZ | |
txc - > tick > 1100000 / USER_HZ ) )
return - EINVAL ;
}
if ( txc - > modes & ADJ_SETOFFSET ) {
/* In order to inject time, you gotta be super-user! */
if ( ! capable ( CAP_SYS_TIME ) )
return - EPERM ;
2017-10-19 19:14:45 +08:00
/*
* Validate if a timespec / timeval used to inject a time
2021-03-23 05:39:03 +08:00
* offset is valid . Offsets can be positive or negative , so
2017-10-19 19:14:45 +08:00
* we don ' t check tv_sec . The value of the timeval / timespec
* is the sum of its fields , but * NOTE * :
* The field tv_usec / tv_nsec must always be non - negative and
* we can ' t have more nanoseconds / microseconds than a second .
*/
if ( txc - > time . tv_usec < 0 )
return - EINVAL ;
2017-10-19 19:14:44 +08:00
2017-10-19 19:14:45 +08:00
/* Upper bound depends on the unit: NSEC_PER_SEC with ADJ_NANO, USEC_PER_SEC otherwise */
if ( txc - > modes & ADJ_NANO ) {
if ( txc - > time . tv_usec > = NSEC_PER_SEC )
2017-10-19 19:14:44 +08:00
return - EINVAL ;
} else {
2017-10-19 19:14:45 +08:00
if ( txc - > time . tv_usec > = USEC_PER_SEC )
2017-10-19 19:14:44 +08:00
return - EINVAL ;
}
}
/*
* Check for potential multiplication overflows that can
* only happen on 64 - bit systems :
*/
if ( ( txc - > modes & ADJ_FREQUENCY ) & & ( BITS_PER_LONG = = 64 ) ) {
if ( LLONG_MIN / PPM_SCALE > txc - > freq )
return - EINVAL ;
if ( LLONG_MAX / PPM_SCALE < txc - > freq )
return - EINVAL ;
}
return 0 ;
}
timekeeping: Add raw clock fallback for random_get_entropy()
The addition of random_get_entropy_fallback() provides access to
whichever time source has the highest frequency, which is useful for
gathering entropy on platforms without available cycle counters. It's
not necessarily as good as being able to quickly access a cycle counter
that the CPU has, but it's still something, even when it falls back to
being jiffies-based.
In the event that a given arch does not define get_cycles(), falling
back to the get_cycles() default implementation that returns 0 is really
not the best we can do. Instead, at least calling
random_get_entropy_fallback() would be preferable, because that always
needs to return _something_, even falling back to jiffies eventually.
It's not as though random_get_entropy_fallback() is super high precision
or guaranteed to be entropic, but basically anything that's not zero all
the time is better than returning zero all the time.
Finally, since random_get_entropy_fallback() is used during extremely
early boot when randomizing freelists in mm_init(), it can be called
before timekeeping has been initialized. In that case there really is
nothing we can do; jiffies hasn't even started ticking yet. So just give
up and return 0.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Theodore Ts'o <tytso@mit.edu>
2022-04-10 22:49:50 +08:00
/**
* random_get_entropy_fallback - Returns the raw clock source value ,
* used by random . c for platforms with no valid random_get_entropy ( ) .
*/
unsigned long random_get_entropy_fallback ( void )
{
struct tk_read_base * tkr = & tk_core . timekeeper . tkr_mono ;
struct clocksource * clock = READ_ONCE ( tkr - > clock ) ;
if ( unlikely ( timekeeping_suspended | | ! clock ) )
return 0 ;
return clock - > read ( clock ) ;
}
EXPORT_SYMBOL_GPL ( random_get_entropy_fallback ) ;
2017-10-19 19:14:44 +08:00
2013-03-23 02:31:29 +08:00
/**
* do_adjtimex ( ) - Accessor function to NTP __do_adjtimex function
* @txc: timex request; checked by timekeeping_validate_timex() and then
*       handed to __do_adjtimex()
*
* Return: the non-zero result of timekeeping_validate_timex() or
* timekeeping_inject_offset() if either of them fails, otherwise the value
* returned by __do_adjtimex().
*/
2018-07-03 13:44:21 +08:00
int do_adjtimex ( struct __kernel_timex * txc )
2013-03-23 02:31:29 +08:00
{
2014-07-17 05:04:07 +08:00
struct timekeeper * tk = & tk_core . timekeeper ;
2019-04-10 17:14:20 +08:00
struct audit_ntp_data ad ;
2021-07-13 21:39:52 +08:00
bool clock_set = false ;
2014-07-17 05:04:01 +08:00
struct timespec64 ts ;
2021-07-13 21:39:52 +08:00
unsigned long flags ;
2013-04-11 03:41:49 +08:00
s32 orig_tai , tai ;
2013-03-23 03:08:52 +08:00
int ret ;
/* Validate the data before disabling interrupts */
2017-10-19 19:14:45 +08:00
ret = timekeeping_validate_timex ( txc ) ;
2013-03-23 03:08:52 +08:00
if ( ret )
return ret ;
2022-07-18 05:53:34 +08:00
add_device_randomness ( txc , sizeof ( * txc ) ) ;
2013-03-23 03:08:52 +08:00
2013-03-23 06:04:13 +08:00
/*
 * ADJ_SETOFFSET: build a timespec64 delta from txc->time and inject it.
 * tv_usec is multiplied by 1000 unless ADJ_NANO is set.
 */
if ( txc - > modes & ADJ_SETOFFSET ) {
2017-10-19 19:14:45 +08:00
struct timespec64 delta ;
2013-03-23 06:04:13 +08:00
delta . tv_sec = txc - > time . tv_sec ;
delta . tv_nsec = txc - > time . tv_usec ;
if ( ! ( txc - > modes & ADJ_NANO ) )
delta . tv_nsec * = 1000 ;
ret = timekeeping_inject_offset ( & delta ) ;
if ( ret )
return ret ;
2019-04-10 17:14:19 +08:00
audit_tk_injoffset ( delta ) ;
2013-03-23 06:04:13 +08:00
}
2019-04-10 17:14:20 +08:00
audit_ntp_init ( & ad ) ;
2018-06-18 22:08:01 +08:00
ktime_get_real_ts64 ( & ts ) ;
2022-07-18 05:53:34 +08:00
add_device_randomness ( & ts , sizeof ( ts ) ) ;
2013-03-23 03:28:15 +08:00
2013-03-23 02:37:28 +08:00
/* Update side: take timekeeper_lock (irqsave) and open the tk_core.seq write section */
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
2013-04-11 03:41:49 +08:00
/* __do_adjtimex() receives &tai; a changed value is applied to the timekeeper below */
orig_tai = tai = tk - > tai_offset ;
2019-04-10 17:14:20 +08:00
ret = __do_adjtimex ( txc , & ts , & tai , & ad ) ;
2013-03-23 02:31:29 +08:00
2013-04-11 03:41:49 +08:00
if ( tai ! = orig_tai ) {
__timekeeping_set_tai_offset ( tk , tai ) ;
2013-12-12 10:50:25 +08:00
timekeeping_update ( tk , TK_MIRROR | TK_CLOCK_WAS_SET ) ;
2021-07-13 21:39:52 +08:00
clock_set = true ;
2013-04-11 03:41:49 +08:00
}
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.
This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.
However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.
This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)
However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.
So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.
This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.
Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.
While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.
Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-06-12 06:54:55 +08:00
tk_update_leap_state ( tk ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2019-04-10 17:14:20 +08:00
audit_ntp_log ( & ad ) ;
2018-06-04 21:34:21 +08:00
/* Update the multiplier immediately if frequency was set directly */
if ( txc - > modes & ( ADJ_FREQUENCY | ADJ_TICK ) )
2021-07-13 21:39:52 +08:00
clock_set | = timekeeping_advance ( TK_ADV_FREQ ) ;
2018-06-04 21:34:21 +08:00
2021-07-13 21:39:52 +08:00
/* The locks are released by now: call clock_was_set() if anything above flagged a clock change */
if ( clock_set )
2021-07-13 21:39:53 +08:00
clock_was_set ( CLOCK_REALTIME ) ;
timekeeping: Avoid possible deadlock from clock_was_set_delayed
As part of normal operaions, the hrtimer subsystem frequently calls
into the timekeeping code, creating a locking order of
hrtimer locks -> timekeeping locks
clock_was_set_delayed() was suppoed to allow us to avoid deadlocks
between the timekeeping the hrtimer subsystem, so that we could
notify the hrtimer subsytem the time had changed while holding
the timekeeping locks. This was done by scheduling delayed work
that would run later once we were out of the timekeeing code.
But unfortunately the lock chains are complex enoguh that in
scheduling delayed work, we end up eventually trying to grab
an hrtimer lock.
Sasha Levin noticed this in testing when the new seqlock lockdep
enablement triggered the following (somewhat abrieviated) message:
[ 251.100221] ======================================================
[ 251.100221] [ INFO: possible circular locking dependency detected ]
[ 251.100221] 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053 Not tainted
[ 251.101967] -------------------------------------------------------
[ 251.101967] kworker/10:1/4506 is trying to acquire lock:
[ 251.101967] (timekeeper_seq){----..}, at: [<ffffffff81160e96>] retrigger_next_event+0x56/0x70
[ 251.101967]
[ 251.101967] but task is already holding lock:
[ 251.101967] (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] which lock already depends on the new lock.
[ 251.101967]
[ 251.101967]
[ 251.101967] the existing dependency chain (in reverse order) is:
[ 251.101967]
-> #5 (hrtimer_bases.lock#11){-.-...}:
[snipped]
-> #4 (&rt_b->rt_runtime_lock){-.-...}:
[snipped]
-> #3 (&rq->lock){-.-.-.}:
[snipped]
-> #2 (&p->pi_lock){-.-.-.}:
[snipped]
-> #1 (&(&pool->lock)->rlock){-.-...}:
[ 251.101967] [<ffffffff81194803>] validate_chain+0x6c3/0x7b0
[ 251.101967] [<ffffffff81194d9d>] __lock_acquire+0x4ad/0x580
[ 251.101967] [<ffffffff81194ff2>] lock_acquire+0x182/0x1d0
[ 251.101967] [<ffffffff84398500>] _raw_spin_lock+0x40/0x80
[ 251.101967] [<ffffffff81153e69>] __queue_work+0x1a9/0x3f0
[ 251.101967] [<ffffffff81154168>] queue_work_on+0x98/0x120
[ 251.101967] [<ffffffff81161351>] clock_was_set_delayed+0x21/0x30
[ 251.101967] [<ffffffff811c4bd1>] do_adjtimex+0x111/0x160
[ 251.101967] [<ffffffff811e2711>] compat_sys_adjtimex+0x41/0x70
[ 251.101967] [<ffffffff843a4b49>] ia32_sysret+0x0/0x5
[ 251.101967]
-> #0 (timekeeper_seq){----..}:
[snipped]
[ 251.101967] other info that might help us debug this:
[ 251.101967]
[ 251.101967] Chain exists of:
timekeeper_seq --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock#11
[ 251.101967] Possible unsafe locking scenario:
[ 251.101967]
[ 251.101967] CPU0 CPU1
[ 251.101967] ---- ----
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(&rt_b->rt_runtime_lock);
[ 251.101967] lock(hrtimer_bases.lock#11);
[ 251.101967] lock(timekeeper_seq);
[ 251.101967]
[ 251.101967] *** DEADLOCK ***
[ 251.101967]
[ 251.101967] 3 locks held by kworker/10:1/4506:
[ 251.101967] #0: (events){.+.+.+}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #1: (hrtimer_work){+.+...}, at: [<ffffffff81154960>] process_one_work+0x200/0x530
[ 251.101967] #2: (hrtimer_bases.lock#11){-.-...}, at: [<ffffffff81160e7c>] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] stack backtrace:
[ 251.101967] CPU: 10 PID: 4506 Comm: kworker/10:1 Not tainted 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053
[ 251.101967] Workqueue: events clock_was_set_work
So the best solution is to avoid calling clock_was_set_delayed() while
holding the timekeeping lock, and instead using a flag variable to
decide if we should call clock_was_set() once we've released the locks.
This works for the case here, where the do_adjtimex() was the deadlock
trigger point. Unfortuantely, in update_wall_time() we still hold
the jiffies lock, which would deadlock with the ipi triggered by
clock_was_set(), preventing us from calling it even after we drop the
timekeeping lock. So instead call clock_was_set_delayed() at that point.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: stable <stable@vger.kernel.org> #3.10+
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
2013-12-11 09:18:18 +08:00
2013-09-12 07:50:56 +08:00
ntp_notify_cmos_timer ( ) ;
2013-03-23 03:28:15 +08:00
return ret ;
}
2013-03-23 02:31:29 +08:00
# ifdef CONFIG_NTP_PPS
/**
* hardpps ( ) - Accessor function to NTP __hardpps function
* @phase_ts: passed through unchanged to __hardpps()
* @raw_ts: passed through unchanged to __hardpps()
*
* Calls __hardpps() with timekeeper_lock held (interrupt state saved and
* restored) and inside a tk_core.seq write section.
*/
2015-09-29 04:21:28 +08:00
void hardpps ( const struct timespec64 * phase_ts , const struct timespec64 * raw_ts )
2013-03-23 02:31:29 +08:00
{
2013-03-23 02:37:28 +08:00
unsigned long flags ;
raw_spin_lock_irqsave ( & timekeeper_lock , flags ) ;
2014-07-17 05:04:07 +08:00
write_seqcount_begin ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
2013-03-23 02:31:29 +08:00
__hardpps ( phase_ts , raw_ts ) ;
2013-03-23 02:37:28 +08:00
2014-07-17 05:04:07 +08:00
write_seqcount_end ( & tk_core . seq ) ;
2013-03-23 02:37:28 +08:00
raw_spin_unlock_irqrestore ( & timekeeper_lock , flags ) ;
2013-03-23 02:31:29 +08:00
}
EXPORT_SYMBOL ( hardpps ) ;
2017-09-09 07:17:19 +08:00
# endif /* CONFIG_NTP_PPS */