From 990518eb3a71c357ca4ff1ad3e747fb844d8094c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:15 +0200 Subject: [PATCH 01/39] timekeeping: Remove hardcoded access to tk_core This was overlooked in the initial conversion. Use the provided pointer to access the shadow timekeeper. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.652611452@linutronix.de --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a009c91f7b05..2ad78fbdc9ff 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -663,7 +663,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd) static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); From 506a54a0316ee4854b0ed113a8001477f5211d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:16 +0200 Subject: [PATCH 02/39] timekeeping: Cleanup kernel doc of __ktime_get_real_seconds() Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.715836017@linutronix.de --- kernel/time/timekeeping.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2ad78fbdc9ff..d88d19fb794c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -975,9 +975,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. This internal function - * is called just when timekeeping lock is already held. 
+ * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { From 7e55b6ba1fe6987638160e5f8216288f38043759 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:17 +0200 Subject: [PATCH 03/39] timekeeping: Avoid double notification in do_adjtimex() Consolidate do_adjtimex() so that it does not notify about clock changes twice. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083025.779267274@linutronix.de --- kernel/time/timekeeping.c | 98 ++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 42 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d88d19fb794c..fb1da87a92f1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1418,40 +1418,49 @@ int do_settimeofday64(const struct timespec64 *ts) EXPORT_SYMBOL(do_settimeofday64); /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
*/ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(const struct timespec64 *ts) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 tmp; + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - struct timespec64 tmp; - timekeeping_forward_now(tks); + timekeeping_forward_now(tks); - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tks), *ts); - if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); - return -EINVAL; - } - - tk_xtime_add(tks, ts); - tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; } - /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); return 0; } +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; + + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(ts); + + /* Signal hrtimers about time change */ + if (!ret) + clock_was_set(CLOCK_SET_WALL); + return ret; +} + /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. 
@@ -2186,7 +2195,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; @@ -2194,8 +2203,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) int shift = 0, maxshift; u64 offset, orig_offset; - guard(raw_spinlock_irqsave)(&tk_core.lock); - /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; @@ -2249,6 +2256,12 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -2537,10 +2550,10 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); */ int do_adjtimex(struct __kernel_timex *txc) { + struct timespec64 delta, ts; struct audit_ntp_data ad; bool offset_set = false; bool clock_set = false; - struct timespec64 ts; int ret; /* Validate the data before disabling interrupts */ @@ -2549,21 +2562,6 @@ int do_adjtimex(struct __kernel_timex *txc) return ret; add_device_randomness(txc, sizeof(*txc)); - if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; - - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); - if (ret) - return ret; - - offset_set = delta.tv_sec != 0; - audit_tk_injoffset(delta); - } - audit_ntp_init(&ad); ktime_get_real_ts64(&ts); @@ -2573,6 +2571,19 @@ int do_adjtimex(struct __kernel_timex *txc) struct timekeeper *tks = &tk_core.shadow_timekeeper; s32 orig_tai, 
tai; + if (txc->modes & ADJ_SETOFFSET) { + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(&delta); + if (ret) + return ret; + + offset_set = delta.tv_sec != 0; + clock_set = true; + } + orig_tai = tai = tks->tai_offset; ret = __do_adjtimex(txc, &ts, &tai, &ad); @@ -2583,13 +2594,16 @@ int do_adjtimex(struct __kernel_timex *txc) } else { tk_update_leap_state_all(&tk_core); } + + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + clock_set |= __timekeeping_advance(TK_ADV_FREQ); } - audit_ntp_log(&ad); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(delta); - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + audit_ntp_log(&ad); if (clock_set) clock_was_set(CLOCK_SET_WALL); From f12b45862c4dcb9c2937b83ed730e473b9a76cbf Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:19 +0200 Subject: [PATCH 04/39] timekeeping: Introduce timekeeper ID As long as there is only a single timekeeper, there is no need to clarify which timekeeper is used. But with the upcoming reuse of the timekeeper infrastructure for auxiliary clock timekeepers, an ID is required to differentiate. Introduce an enum for timekeeper IDs, introduce a field in struct tk_data to store this timekeeper id and also add its initialization. The id struct field is added at the end of the second cacheline, as there is a 4 byte hole anyway. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.842476378@linutronix.de --- include/linux/timekeeper_internal.h | 14 +++++++++++++- kernel/time/timekeeping.c | 5 +++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 785048a3b3e6..bfcecad0e279 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -11,6 +11,16 @@ #include #include +/** + * timekeeper_ids - IDs for various time keepers in the kernel + * @TIMEKEEPER_CORE: The central core timekeeper managing system time + * @TIMEKEEPERS_MAX: The maximum number of timekeepers managed + */ +enum timekeeper_ids { + TIMEKEEPER_CORE, + TIMEKEEPERS_MAX, +}; + /** * struct tk_read_base - base structure for timekeeping readout * @clock: Current clocksource used for timekeeping. @@ -52,6 +62,7 @@ struct tk_read_base { * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai * @coarse_nsec: The nanoseconds part for coarse time getters + * @id: The timekeeper ID * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds * @clock_was_set_seq: The sequence number of clock was set events @@ -101,7 +112,7 @@ struct tk_read_base { * which results in the following cacheline layout: * * 0: seqcount, tkr_mono - * 1: xtime_sec ... coarse_nsec + * 1: xtime_sec ... 
id * 2: tkr_raw, raw_sec * 3,4: Internal variables * @@ -123,6 +134,7 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; u32 coarse_nsec; + enum timekeeper_ids id; /* Cacheline 2: */ struct tk_read_base tkr_raw; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb1da87a92f1..f4692fc2ea6b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1663,10 +1663,11 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; } /* @@ -1696,7 +1697,7 @@ void __init timekeeping_init(void) struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && From 9094c72c3d81bf2416b7c79d12c8494ab8fbac20 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:20 +0200 Subject: [PATCH 05/39] time: Introduce auxiliary POSIX clocks To support auxiliary timekeeping and the related user space interfaces, it's required to define a clock ID range for them. Reserve 8 auxiliary clock IDs after the regular timekeeping clock ID space. This is the maximum number of auxiliary clocks the kernel can support. The actual number of supported clocks obviously depends on the presence of related devices and might be constrained by the available VDSO space. Add the corresponding timekeeper IDs as well. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.905800695@linutronix.de --- include/linux/timekeeper_internal.h | 10 ++++++++-- include/uapi/linux/time.h | 11 +++++++++++ kernel/time/Kconfig | 15 +++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index bfcecad0e279..4201ae818f57 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -13,11 +13,17 @@ /** * timekeeper_ids - IDs for various time keepers in the kernel - * @TIMEKEEPER_CORE: The central core timekeeper managing system time - * @TIMEKEEPERS_MAX: The maximum number of timekeepers managed + * @TIMEKEEPER_CORE: The central core timekeeper managing system time + * @TIMEKEEPER_AUX_FIRST: The first AUX timekeeper + * @TIMEKEEPER_AUX_LAST: The last AUX timekeeper + * @TIMEKEEPERS_MAX: The maximum number of timekeepers managed */ enum timekeeper_ids { TIMEKEEPER_CORE, +#ifdef CONFIG_POSIX_AUX_CLOCKS + TIMEKEEPER_AUX_FIRST, + TIMEKEEPER_AUX_LAST = TIMEKEEPER_AUX_FIRST + MAX_AUX_CLOCKS - 1, +#endif TIMEKEEPERS_MAX, }; diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 4f4b6e48e01c..16ca1ac206fd 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -64,6 +64,17 @@ struct timezone { #define CLOCK_TAI 11 #define MAX_CLOCKS 16 + +/* + * AUX clock support. AUXiliary clocks are dynamically configured by + * enabling a clock ID. These clock can be steered independently of the + * core timekeeper. The kernel can support up to 8 auxiliary clocks, but + * the actual limit depends on eventual architecture constraints vs. VDSO. 
+ */ +#define CLOCK_AUX MAX_CLOCKS +#define MAX_AUX_CLOCKS 8 +#define CLOCK_AUX_LAST (CLOCK_AUX + MAX_AUX_CLOCKS - 1) + #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) #define CLOCKS_MONO CLOCK_MONOTONIC diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b0b97a60aaa6..7c6a52f7836c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. @@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US interval and NTP's maximum frequency drift of 500 parts per million. If the clocksource is good enough for NTP, it is good enough for the clocksource watchdog! +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS + help + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time. endmenu -endif From 8515714b0f88a698a4c26f0f0ce7d43ad14dce16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:21 +0200 Subject: [PATCH 06/39] ntp: Add support for auxiliary timekeepers If auxiliary clocks are enabled, provide an array of NTP data so that the auxiliary timekeepers can be steered independently of the core timekeeper. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.969000914@linutronix.de --- kernel/time/ntp.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b837d3d9d325..5b5a0f76866d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -86,14 +87,16 @@ struct ntp_data { #endif }; -static struct ntp_data tk_ntp_data = { - .tick_usec = USER_TICK_USEC, - .time_state = TIME_OK, - .time_status = STA_UNSYNC, - .time_constant = 2, - .time_maxerror = NTP_PHASE_LIMIT, - .time_esterror = NTP_PHASE_LIMIT, - .ntp_next_leap_sec = TIME64_MAX, +static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = { + [ 0 ... TIMEKEEPERS_MAX - 1 ] = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, + }, }; #define SECS_PER_DAY 86400 @@ -351,13 +354,13 @@ static void __ntp_clear(struct ntp_data *ntpdata) */ void ntp_clear(void) { - __ntp_clear(&tk_ntp_data); + __ntp_clear(&tk_ntp_data[TIMEKEEPER_CORE]); } u64 ntp_tick_length(void) { - return tk_ntp_data.tick_length; + return tk_ntp_data[TIMEKEEPER_CORE].tick_length; } /** @@ -368,7 +371,7 @@ u64 ntp_tick_length(void) */ ktime_t ntp_get_next_leap(void) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; ktime_t ret; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) @@ -389,7 +392,7 @@ ktime_t ntp_get_next_leap(void) */ int second_overflow(time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; s64 delta; int leap = 0; s32 rem; @@ -605,7 +608,7 @@ static inline int update_rtc(struct 
timespec64 *to_set, unsigned long *offset_ns */ static inline bool ntp_synced(void) { - return !(tk_ntp_data.time_status & STA_UNSYNC); + return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC); } /* @@ -762,7 +765,7 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; int result; if (txc->modes & ADJ_ADJTIME) { @@ -1031,8 +1034,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; struct pps_normtime pts_norm, freq_norm; - struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); @@ -1083,18 +1086,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj); if (rc) return rc; - tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } - __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { - ntp_clear(); + for (int id = 0; id < TIMEKEEPERS_MAX; id++) + __ntp_clear(tk_ntp_data + id); ntp_init_cmos_sync(); } From 5ffa25f573cf524ff53660c5ff7a158ee10f23c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:22 +0200 Subject: [PATCH 07/39] ntp: Add timekeeper ID arguments to public functions In preparation for supporting auxiliary POSIX clocks, add a timekeeper ID to the relevant functions. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.032425931@linutronix.de --- kernel/time/ntp.c | 33 +++++++++++++++++++-------------- kernel/time/ntp_internal.h | 11 +++++------ kernel/time/timekeeping.c | 12 ++++++------ 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5b5a0f76866d..e28dc53194a7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -351,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata) /** * ntp_clear - Clears the NTP state variables + * @tkid: Timekeeper ID to be able to select proper ntp data array member */ -void ntp_clear(void) +void ntp_clear(unsigned int tkid) { - __ntp_clear(&tk_ntp_data[TIMEKEEPER_CORE]); + __ntp_clear(&tk_ntp_data[tkid]); } -u64 ntp_tick_length(void) +u64 ntp_tick_length(unsigned int tkid) { - return tk_ntp_data[TIMEKEEPER_CORE].tick_length; + return tk_ntp_data[tkid].tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * @tkid: Timekeeper ID * - * Provides the time of the next leapsecond against CLOCK_REALTIME in - * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next + * leap second against CLOCK_REALTIME in a ktime_t format if a + * leap second is pending. KTIME_MAX otherwise. 
*/ -ktime_t ntp_get_next_leap(void) +ktime_t ntp_get_next_leap(unsigned int tkid) { struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; - ktime_t ret; + + if (tkid != TIMEKEEPER_CORE) + return KTIME_MAX; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) return ktime_set(ntpdata->ntp_next_leap_sec, 0); - ret = KTIME_MAX; - return ret; + + return KTIME_MAX; } /* @@ -390,9 +395,9 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(time64_t secs) +int second_overflow(unsigned int tkid, time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; s64 delta; int leap = 0; s32 rem; @@ -762,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, +int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; if (txc->modes & ADJ_ADJTIME) { diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 5a633dce9057..2d3e9669730b 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -3,13 +3,12 @@ #define _LINUX_NTP_INTERNAL_H extern void ntp_init(void); -extern void ntp_clear(void); +extern void ntp_clear(unsigned int tkid); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. 
*/ -extern u64 ntp_tick_length(void); -extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, - const struct timespec64 *ts, +extern u64 ntp_tick_length(unsigned int tkid); +extern ktime_t ntp_get_next_leap(unsigned int tkid); +extern int second_overflow(unsigned int tkid, time64_t secs); +extern int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f4692fc2ea6b..e1b8e2618ca7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -601,7 +601,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); @@ -678,7 +678,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); @@ -2049,7 +2049,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - u64 ntp_tl = ntp_tick_length(); + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* @@ -2130,7 +2130,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; @@ -2227,7 +2227,7 @@ static bool __timekeeping_advance(enum timekeeping_adv_mode 
mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); @@ -2586,7 +2586,7 @@ int do_adjtimex(struct __kernel_timex *txc) } orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + ret = __do_adjtimex(tks->id, txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); From c7ebfbc440151ae4a66a03b0f879cbece45174c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:23 +0200 Subject: [PATCH 08/39] ntp: Rename __do_adjtimex() to ntp_adjtimex() Clean up the name space. No functional change. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.095637820@linutronix.de --- kernel/time/ntp.c | 4 ++-- kernel/time/ntp_internal.h | 4 ++-- kernel/time/timekeeping.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index e28dc53194a7..9aba1bc7b2a7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -767,8 +767,8 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. 
*/ -int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad) +int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 2d3e9669730b..7084d839c207 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,8 +8,8 @@ extern void ntp_clear(unsigned int tkid); extern u64 ntp_tick_length(unsigned int tkid); extern ktime_t ntp_get_next_leap(unsigned int tkid); extern int second_overflow(unsigned int tkid, time64_t secs); -extern int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad); +extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e1b8e2618ca7..99b4749f0665 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2586,7 +2586,7 @@ int do_adjtimex(struct __kernel_timex *txc) } orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(tks->id, txc, &ts, &tai, &ad); + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); From 926ad475169f5b24868438e4bff61ec6a73efd19 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:25 +0200 Subject: [PATCH 09/39] timekeeping: Make __timekeeping_advance() reusable In __timekeeping_advance() the pointer to struct tk_data is hardcoded by the use of &tk_core. 
As long as there is only a single timekeeper (tk_core), this is not a problem. But to reuse __timekeeping_advance() for auxiliary timekeepers, it needs to be generalized. Add a pointer to struct tk_data as function argument of __timekeeping_advance() and adapt all call sites. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.160967312@linutronix.de --- kernel/time/timekeeping.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 99b4749f0665..153f760dffb4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2196,10 +2196,10 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool __timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; - struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset, orig_offset; @@ -2252,7 +2252,7 @@ static bool __timekeeping_advance(enum timekeeping_adv_mode mode) if (orig_offset != offset) tk_update_coarse_nsecs(tk); - timekeeping_update_from_shadow(&tk_core, clock_set); + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } @@ -2260,7 +2260,7 @@ static bool __timekeeping_advance(enum timekeeping_adv_mode mode) static bool timekeeping_advance(enum timekeeping_adv_mode mode) { guard(raw_spinlock_irqsave)(&tk_core.lock); - return __timekeeping_advance(mode); + return __timekeeping_advance(&tk_core, mode); } /** @@ -2598,7 +2598,7 @@ int do_adjtimex(struct __kernel_timex 
*txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= __timekeeping_advance(TK_ADV_FREQ); + clock_set |= __timekeeping_advance(&tk_core, TK_ADV_FREQ); } if (txc->modes & ADJ_SETOFFSET) From 8c782acd3f47e21f9b03fd3720172d1f8e4fb796 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:26 +0200 Subject: [PATCH 10/39] timekeeping: Prepare timekeeping_update_from_shadow() Don't invoke the VDSO and paravirt updates when utilized for auxiliary clocks. This is a temporary workaround until the VDSO and paravirt interfaces have been worked out. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083026.223876435@linutronix.de --- kernel/time/timekeeping.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 153f760dffb4..e3c1a1c1d8c5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -683,13 +683,15 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act tk_update_leap_state(tk); tk_update_ktime_data(tk); - - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (tk->id == TIMEKEEPER_CORE) { + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; From 6168024604236cb2bb1004ea8459c8ece2c4ef5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:27 +0200 Subject: [PATCH 11/39] timekeeping: Add clock_valid flag to timekeeper In preparation for supporting independent auxiliary timekeepers, add a clock valid 
field and set it to true for the system timekeeper. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.287145536@linutronix.de --- include/linux/timekeeper_internal.h | 2 ++ kernel/time/timekeeping.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 4201ae818f57..1690eda1c7c3 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -73,6 +73,7 @@ struct tk_read_base { * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events + * @clock_valid: Indicator for valid clock * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -149,6 +150,7 @@ struct timekeeper { /* Cachline 3 and 4 (timekeeping internal variables): */ unsigned int clock_was_set_seq; u8 cs_was_changed_seq; + u8 clock_valid; struct timespec64 monotonic_to_boot; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e3c1a1c1d8c5..bf59bacc97db 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1665,11 +1665,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* @@ -1699,7 +1700,7 @@ void __init timekeeping_init(void) struct timekeeper *tks = 
&tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core, TIMEKEEPER_CORE); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && From 22c62b9a84b8f16ca0277e133a0cd62a259fee7c Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:28 +0200 Subject: [PATCH 12/39] timekeeping: Introduce auxiliary timekeepers Provide timekeepers for auxiliary clocks and initialize them during boot. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.350061049@linutronix.de --- kernel/time/timekeeping.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bf59bacc97db..19f4af1a37ea 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -53,7 +53,11 @@ struct tk_data { raw_spinlock_t lock; } ____cacheline_aligned; -static struct tk_data tk_core; +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -113,6 +117,12 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +#else +static inline void tk_aux_setup(void) { } +#endif + unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; @@ -1589,7 +1599,6 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); - /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ @@ -1701,6 +1710,7 @@ void __init timekeeping_init(void) struct clocksource *clock; tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); 
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -2630,3 +2640,11 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void) +{ + for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) + tkd_basic_setup(&timekeeper_data[i], i, false); +} +#endif /* CONFIG_POSIX_AUX_CLOCKS */ From ffa0519baaed48ca953bd201e1b17f15dae21b2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:30 +0200 Subject: [PATCH 13/39] timekeeping: Provide ktime_get_ntp_seconds() ntp_adjtimex() requires access to the actual time keeper per timekeeper ID. Provide an interface. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083026.411809421@linutronix.de --- kernel/time/timekeeping.c | 9 +++++++++ kernel/time/timekeeping_internal.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 19f4af1a37ea..7d3693a72a01 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2627,6 +2627,15 @@ int do_adjtimex(struct __kernel_timex *txc) return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. 
+ */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 8c9079108ffb..973ede670a36 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) unsigned long timekeeper_lock_irqsave(void); void timekeeper_unlock_irqrestore(unsigned long flags); +/* NTP specific interface to access the current seconds value */ +long ktime_get_ntp_seconds(unsigned int id); + #endif /* _TIMEKEEPING_INTERNAL_H */ From c85f5ab60820bde1510110e403d17456fbb8c266 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:31 +0200 Subject: [PATCH 14/39] ntp: Use ktime_get_ntp_seconds() Use ktime_get_ntp_seconds() to prepare for auxiliary clocks so that the readout becomes per timekeeper. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083026.472512636@linutronix.de --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 9aba1bc7b2a7..97fa99b96dd0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -303,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - real_secs = __ktime_get_real_seconds(); + real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); secs = (long)(real_secs - ntpdata->time_reftime); if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; @@ -710,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k * reference time to current time. 
*/ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) - ntpdata->time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); /* only set allowed bits */ ntpdata->time_status &= STA_RONLY; From 180d8b4ce91fe0cf7a9cb236bb01f14587ba4bf0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:32 +0200 Subject: [PATCH 15/39] timekeeping: Add AUX offset to struct timekeeper This offset will be used in the time getters of auxiliary clocks. It is added to the "monotonic" clock readout. As auxiliary clocks do not utilize the offset fields of the core time keeper, this is just an alias for offs_tai, so that the cache line layout stays the same. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.533486349@linutronix.de --- include/linux/timekeeper_internal.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 1690eda1c7c3..ca79938b62f3 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -67,6 +67,7 @@ struct tk_read_base { * @offs_real: Offset clock monotonic -> clock realtime * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai + * @offs_aux: Offset clock monotonic -> clock AUX * @coarse_nsec: The nanoseconds part for coarse time getters * @id: The timekeeper ID * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW @@ -113,6 +114,9 @@ struct tk_read_base { * @monotonic_to_boottime is a timespec64 representation of @offs_boot to * accelerate the VDSO update for CLOCK_BOOTTIME. * + * @offs_aux is used by the auxiliary timekeepers which do not utilize any + * of the regular timekeeper offset fields. 
+ * * The cacheline ordering of the structure is optimized for in kernel usage of * the ktime_get() and ktime_get_ts64() family of time accessors. Struct * timekeeper is prepended in the core timekeeping code with a sequence count, @@ -139,7 +143,10 @@ struct timekeeper { struct timespec64 wall_to_monotonic; ktime_t offs_real; ktime_t offs_boot; - ktime_t offs_tai; + union { + ktime_t offs_tai; + ktime_t offs_aux; + }; u32 coarse_nsec; enum timekeeper_ids id; From 9f7729480a2c771bbe49b7eab034a8eaa5e27bfb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:29 +0200 Subject: [PATCH 16/39] timekeeping: Update auxiliary timekeepers on clocksource change Propagate a system clocksource change to the auxiliary timekeepers so that they can pick up the new clocksource. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.803890875@linutronix.de --- kernel/time/timekeeping.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7d3693a72a01..ee9757018341 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -119,8 +119,10 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { #ifdef CONFIG_POSIX_AUX_CLOCKS static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); #else static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } #endif unsigned long timekeeper_lock_irqsave(void) @@ -1548,6 +1550,8 @@ static int change_clocksource(void *data) timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } + tk_aux_update_clocksource(); + if (old) { if (old->disable) old->disable(old); @@ -2651,6 +2655,35 @@ EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ #ifdef CONFIG_POSIX_AUX_CLOCKS + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache 
lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. + */ +static unsigned long aux_timekeepers; + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) From 05bc6e6290f91d2d40086ab4ef52da21c14ec4b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:31 +0200 Subject: [PATCH 17/39] timekeeping: Provide time getters for auxiliary clocks Provide interfaces similar to the ktime_get*() family which provide access to the auxiliary clocks. These interfaces have a boolean return value, which indicates whether the accessed clock is valid or not. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.868342628@linutronix.de --- include/linux/posix-timers.h | 5 +++ include/linux/timekeeping.h | 11 ++++++ kernel/time/timekeeping.c | 65 ++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index dd48c64b605e..4d3dbcef379e 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -37,6 +37,11 @@ static inline int clockid_to_fd(const clockid_t clk) return ~(clk >> 3); } +static inline bool clockid_aux_valid(clockid_t id) +{ + return IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) && id >= CLOCK_AUX && id <= CLOCK_AUX_LAST; +} + #ifdef CONFIG_POSIX_TIMERS #include diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 542773650200..de9a3b7d7d0d 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -263,6 +263,17 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); +/* + * Auxiliary clock interfaces + */ +#ifdef CONFIG_POSIX_AUX_CLOCKS +extern bool ktime_get_aux(clockid_t id, ktime_t *kt); +extern bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt); +#else +static inline bool ktime_get_aux(clockid_t id, ktime_t *kt) { return false; } +static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; } +#endif + /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ee9757018341..c7d2913e68c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2664,6 +2664,18 @@ EXPORT_SYMBOL(hardpps); */ static unsigned long aux_timekeepers; +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data 
*aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + /* Invoked from timekeeping after a clocksource change */ static void tk_aux_update_clocksource(void) { @@ -2684,6 +2696,59 @@ static void tk_aux_update_clocksource(void) } } +/** + * ktime_get_aux - Get time for an AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @kt: Pointer to ktime_t to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux(clockid_t id, ktime_t *kt) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tk; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + if (!aux_tkd) + return false; + + aux_tk = &aux_tkd->timekeeper; + do { + seq = read_seqcount_begin(&aux_tkd->seq); + if (!aux_tk->clock_valid) + return false; + + base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); + nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); + } while (read_seqcount_retry(&aux_tkd->seq, seq)); + + *kt = ktime_add_ns(base, nsecs); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux); + +/** + * ktime_get_aux_ts64 - Get time for an AUX clock + * @id: ID of the clock to read (CLOCK_AUX...)
+ * @ts: Pointer to timespec64 to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) +{ + ktime_t now; + + if (!ktime_get_aux(id, &now)) + return false; + *ts = ktime_to_timespec64(now); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) From 606424bf4ffd9d27865c45b5707c1edac6b187ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:32 +0200 Subject: [PATCH 18/39] timekeeping: Add minimal posix-timers support for auxiliary clocks Provide clock_getres(2) and clock_gettime(2) for auxiliary clocks. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.932220594@linutronix.de --- kernel/time/posix-timers.c | 3 +++ kernel/time/posix-timers.h | 1 + kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2053b1a4c9e4..8b582174b1f9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1526,6 +1526,9 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, +#ifdef CONFIG_POSIX_AUX_CLOCKS + [CLOCK_AUX ... 
CLOCK_AUX_LAST] = &clock_aux, +#endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 61906f0688c1..7f259e845d24 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic; extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; +extern const struct k_clock clock_aux; void posix_timer_queue_signal(struct k_itimer *timr); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c7d2913e68c3..10c6e37dc0dc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2655,6 +2655,7 @@ EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ #ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" /* * Bitmap for the activated auxiliary timekeepers to allow lockless quick @@ -2749,6 +2750,26 @@ bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) } EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); +static int aux_get_res(clockid_t id, struct timespec64 *tp) +{ + if (!clockid_aux_valid(id)) + return -ENODEV; + + tp->tv_sec = 0; + tp->tv_nsec = 1; + return 0; +} + +static int aux_get_timespec(clockid_t id, struct timespec64 *tp) +{ + return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; +} + +const struct k_clock clock_aux = { + .clock_getres = aux_get_res, + .clock_get_timespec = aux_get_timespec, +}; + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) From 60ecc26ec5af567a55f362ad92c0cac8b894541c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:34 +0200 Subject: [PATCH 19/39] timekeeping: Provide time setter for auxiliary clocks Add clock_settime(2) support for auxiliary clocks. The function affects the AUX offset which is added to the "monotonic" clock readout of these clocks. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.995688714@linutronix.de --- kernel/time/timekeeping.c | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 10c6e37dc0dc..b6ac7847bc0a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2765,9 +2765,53 @@ static int aux_get_timespec(clockid_t id, struct timespec64 *tp) return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; } +static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks; + ktime_t tnow, nsecs; + + if (!timespec64_valid_settod(tnew)) + return -EINVAL; + if (!aux_tkd) + return -ENODEV; + + aux_tks = &aux_tkd->shadow_timekeeper; + + guard(raw_spinlock_irq)(&aux_tkd->lock); + if (!aux_tks->clock_valid) + return -ENODEV; + + /* Forward the timekeeper base time */ + timekeeping_forward_now(aux_tks); + /* + * Get the updated base time. tkr_mono.base has not been + * updated yet, so do that first. That makes the update + * in timekeeping_update_from_shadow() redundant, but + * that's harmless. After that @tnow can be calculated + * by using tkr_mono::cycle_last, which has been set + * by timekeeping_forward_now(). + */ + tk_update_ktime_data(aux_tks); + nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); + tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); + + /* + * Calculate the new AUX offset as delta to @tnow ("monotonic"). + * That avoids all the tk::xtime back and forth conversions as + * xtime ("realtime") is not applicable for auxiliary clocks and + * kept in sync with "monotonic". 
+ */ + aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow); + + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); + return 0; +} + const struct k_clock clock_aux = { .clock_getres = aux_get_res, .clock_get_timespec = aux_get_timespec, + .clock_set = aux_clock_set, }; static __init void tk_aux_setup(void) From e8db3a55798d70f2c222c6103990776fca6a6ebc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:40 +0200 Subject: [PATCH 20/39] timekeeping: Make timekeeping_inject_offset() reusable Split out the inner workings for auxiliary clock support and feed the core time keeper into it. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.059934561@linutronix.de --- kernel/time/timekeeping.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b6ac7847bc0a..2d294cfe185e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1433,32 +1433,32 @@ EXPORT_SYMBOL(do_settimeofday64); /** * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
*/ -static int __timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timekeeper *tks = &tkd->shadow_timekeeper; struct timespec64 tmp; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - timekeeping_forward_now(tks); /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); + timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } @@ -1467,7 +1467,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) int ret; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) - ret = __timekeeping_inject_offset(ts); + ret = __timekeeping_inject_offset(&tk_core, ts); /* Signal hrtimers about time change */ if (!ret) @@ -2568,6 +2568,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); */ int do_adjtimex(struct __kernel_timex *txc) { + struct tk_data *tkd = &tk_core; struct timespec64 delta, ts; struct audit_ntp_data ad; bool offset_set = false; @@ -2585,16 +2586,19 @@ int do_adjtimex(struct __kernel_timex *txc) ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; + scoped_guard (raw_spinlock_irqsave, &tkd->lock) { + struct timekeeper *tks = &tkd->shadow_timekeeper; s32 orig_tai, tai; + if (!tks->clock_valid) + return -ENODEV; + if (txc->modes & ADJ_SETOFFSET) { delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; - ret = 
__timekeeping_inject_offset(&delta); + ret = __timekeeping_inject_offset(tkd, &delta); if (ret) return ret; @@ -2607,7 +2611,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); - timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); clock_set = true; } else { tk_update_leap_state_all(&tk_core); @@ -2615,7 +2619,7 @@ int do_adjtimex(struct __kernel_timex *txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= __timekeeping_advance(&tk_core, TK_ADV_FREQ); + clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); } if (txc->modes & ADJ_SETOFFSET) From 2c8aea59c206b12b436373861590baeda728be12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:42 +0200 Subject: [PATCH 21/39] timekeeping: Add auxiliary clock support to __timekeeping_inject_offset() Redirect the relative offset adjustment to the auxiliary clock offset instead of modifying CLOCK_REALTIME, which has no meaning in context of these clocks. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.124057787@linutronix.de --- kernel/time/timekeeping.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2d294cfe185e..e893557cd53f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1431,6 +1431,11 @@ int do_settimeofday64(const struct timespec64 *ts) } EXPORT_SYMBOL(do_settimeofday64); +static inline bool timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; +} + /** * __timekeeping_inject_offset - Adds or subtracts from the current time. 
* @tkd: Pointer to the timekeeper to modify @@ -1448,16 +1453,34 @@ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespe timekeeping_forward_now(tks); - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tks), *ts); - if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(tkd); - return -EINVAL; + if (timekeeper_is_core_tk(tks)) { + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; + + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + tks->offs_aux = offs; } - tk_xtime_add(tks, ts); - tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } From 775f71ebedd382da390dc16a4c28cffa5b937f79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:43 +0200 Subject: [PATCH 22/39] timekeeping: Make do_adjtimex() reusable Split out the actual functionality of adjtimex() and make do_adjtimex() a wrapper which feeds the core timekeeper into it and handles the result including audit at the call site. This allows to reuse the actual functionality for auxiliary clocks. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.187322876@linutronix.de --- kernel/time/timekeeping.c | 106 +++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e893557cd53f..0de61315b40c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2585,17 +2585,18 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - * @txc: Pointer to kernel_timex structure containing NTP parameters - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct tk_data *tkd = &tk_core; - struct timespec64 delta, ts; - struct audit_ntp_data ad; - bool offset_set = false; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + struct timespec64 ts; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ @@ -2604,56 +2605,65 @@ int do_adjtimex(struct __kernel_timex *txc) return ret; add_device_randomness(txc, sizeof(*txc)); - audit_ntp_init(&ad); - ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); - scoped_guard (raw_spinlock_irqsave, &tkd->lock) { - struct timekeeper *tks = &tkd->shadow_timekeeper; - s32 orig_tai, tai; + guard(raw_spinlock_irqsave)(&tkd->lock); - if (!tks->clock_valid) - return -ENODEV; + if (!tks->clock_valid) + return -ENODEV; - if (txc->modes & ADJ_SETOFFSET) { - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = __timekeeping_inject_offset(tkd, &delta); - if (ret) - return ret; - - offset_set = delta.tv_sec != 0; 
- clock_set = true; - } - - orig_tai = tai = tks->tai_offset; - ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &ad); - - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tks, tai); - timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); - clock_set = true; - } else { - tk_update_leap_state_all(&tk_core); - } - - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); + if (txc->modes & ADJ_SETOFFSET) { + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); + if (ret) + return ret; + result->clock_set = true; } + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); + } + + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); + + return ret; +} + +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { }; + int ret; + + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; + if (txc->modes & ADJ_SETOFFSET) - audit_tk_injoffset(delta); + audit_tk_injoffset(result.delta); - audit_ntp_log(&ad); + audit_ntp_log(&result.ad); - if (clock_set) + if (result.clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(offset_set); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } From 4eca49d0b621b314ac7c80f363932ec6f6c8abc8 Mon 
Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:45 +0200 Subject: [PATCH 23/39] timekeeping: Prepare do_adjtimex() for auxiliary clocks Exclude ADJ_TAI, leap seconds and PPS functionality as they make no sense in the context of auxiliary clocks. Provide a time stamp based on the actual clock. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.253203783@linutronix.de --- kernel/time/timekeeping.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0de61315b40c..6770544f8c0e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,6 +58,17 @@ static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; /* The core timekeeper */ #define tk_core (timekeeper_data[TIMEKEEPER_CORE]) +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} +#endif /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -2508,7 +2519,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2567,6 +2578,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI
offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2595,17 +2620,22 @@ static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, struct adjtimex_result *result) { struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); - ktime_get_real_ts64(&ts); + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); + add_device_randomness(&ts, sizeof(ts)); guard(raw_spinlock_irqsave)(&tkd->lock); From ecf3e70304911be1c14cd21baa0bc611a53ec50b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:46 +0200 Subject: [PATCH 24/39] timekeeping: Provide adjtimex() for auxiliary clocks The behaviour is close to clock_adjtime(CLOCK_REALTIME) with the following differences: 1) ADJ_SETOFFSET adjusts the auxiliary clock offset 2) ADJ_TAI is not supported 3) Leap seconds are not supported 4) PPS is not supported Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.317946543@linutronix.de --- kernel/time/timekeeping.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6770544f8c0e..523670ec0d2e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2875,10 +2875,26 @@ static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) return 0; } +static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct adjtimex_result result = { }; + + if (!aux_tkd) + return
-ENODEV; + + /* + * @result is ignored for now as there are neither hrtimers nor a + * RTC related to auxiliary clocks for now. + */ + return __do_adjtimex(aux_tkd, txc, &result); +} + const struct k_clock clock_aux = { .clock_getres = aux_get_res, .clock_get_timespec = aux_get_timespec, .clock_set = aux_clock_set, + .clock_adj = aux_clock_adj, }; static __init void tk_aux_setup(void) From e6d4c00719a6b1dda3fb358b4c973595f9dfd455 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:47 +0200 Subject: [PATCH 25/39] timekeeping: Provide update for auxiliary timekeepers Update the auxiliary timekeepers periodically. For now this is tied to the system timekeeper update from the tick. This might be revisited and moved out of the tick. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.382451331@linutronix.de --- kernel/time/timekeeping.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 523670ec0d2e..568ba1ffba0b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,9 +131,11 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { #ifdef CONFIG_POSIX_AUX_CLOCKS static __init void tk_aux_setup(void); static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); #else static inline void tk_aux_setup(void) { } static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } #endif unsigned long timekeeper_lock_irqsave(void) @@ -2317,11 +2319,13 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2764,6 +2768,21 @@ static void tk_aux_update_clocksource(void) } 
} +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + /** * ktime_get_aux - Get time for a AUX clock * @id: ID of the clock to read (CLOCK_AUX...) From 7b95663a3d96b39b40f169dba5faef3e20163c5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:49 +0200 Subject: [PATCH 26/39] timekeeping: Provide interface to control auxiliary clocks Auxiliary clocks are disabled by default and attempts to access them fail. Provide an interface to enable/disable them at run-time. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.444626478@linutronix.de --- .../ABI/stable/sysfs-kernel-time-aux-clocks | 5 + kernel/time/timekeeping.c | 116 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 Documentation/ABI/stable/sysfs-kernel-time-aux-clocks diff --git a/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks b/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks new file mode 100644 index 000000000000..825508f42af6 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks @@ -0,0 +1,5 @@ +What: /sys/kernel/time/aux_clocks//enable +Date: May 2025 +Contact: Thomas Gleixner +Description: + Controls the enablement of auxiliary clock timekeepers. 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 568ba1ffba0b..6a61887eb87e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -2916,6 +2917,121 @@ const struct k_clock clock_aux = { .clock_adj = aux_clock_adj, }; +static void aux_clock_enable(clockid_t id) +{ + struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; + + /* Prevent the core timekeeper from changing. */ + guard(raw_spinlock_irq)(&tk_core.lock); + + /* + * Setup the auxiliary clock assuming that the raw core timekeeper + * clock frequency conversion is close enough. Userspace has to + * adjust for the deviation via clock_adjtime(2). + */ + guard(raw_spinlock_nested)(&aux_tkd->lock); + + /* Remove leftovers of a previous registration */ + memset(aux_tks, 0, sizeof(*aux_tks)); + /* Restore the timekeeper id */ + aux_tks->id = aux_tkd->timekeeper.id; + /* Setup the timekeeper based on the current system clocksource */ + tk_setup_internals(aux_tks, tkr_raw->clock); + + /* Mark it valid and set it live */ + aux_tks->clock_valid = true; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static void aux_clock_disable(clockid_t id) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + + guard(raw_spinlock_irq)(&aux_tkd->lock); + aux_tkd->shadow_timekeeper.clock_valid = false; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static DEFINE_MUTEX(aux_clock_mutex); + +static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + bool enable; + + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (kstrtobool(buf, &enable) < 0) + return -EINVAL; + + guard(mutex)(&aux_clock_mutex); + if (enable == test_bit(id, 
&aux_timekeepers)) + return count; + + if (enable) { + aux_clock_enable(CLOCK_AUX + id); + set_bit(id, &aux_timekeepers); + } else { + aux_clock_disable(CLOCK_AUX + id); + clear_bit(id, &aux_timekeepers); + } + return count; +} + +static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + + return sysfs_emit(buf, "%d\n", test_bit(id, &active)); +} + +static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); + +static struct attribute *aux_clock_enable_attrs[] = { + &aux_clock_enable_attr.attr, + NULL +}; + +static const struct attribute_group aux_clock_enable_attr_group = { + .attrs = aux_clock_enable_attrs, +}; + +static int __init tk_aux_sysfs_init(void) +{ + struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + + if (!tko) + return -ENOMEM; + + auxo = kobject_create_and_add("aux_clocks", tko); + if (!auxo) { + kobject_put(tko); + return -ENOMEM; + } + + for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + + if (!clk) + return -ENOMEM; + + int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + + if (ret) + return ret; + } + return 0; +} +late_initcall(tk_aux_sysfs_init); + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) From 5b605dbee07dda8fd538af1f07cbf1baf0a49cbc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Jul 2025 15:26:58 +0200 Subject: [PATCH 27/39] timekeeping: Provide ktime_get_clock_ts64() PTP implements an inline switch case for taking timestamps from various POSIX clock IDs, which already consumes quite some text space. Expanding it for auxiliary clocks really becomes too big for inlining. Provide an out-of-line version.
The function invalidates the timestamp in case the clock is invalid. The invalidation allows to implement a validation check without the need to propagate a return value through deep existing call chains. Due to merge logistics this temporarily defines CLOCK_AUX[_LAST] if undefined, so that the plain branch, which does not contain any of the core timekeeper changes, can be pulled into the networking tree as prerequisite for the PTP side changes. These temporary defines are removed after that branch is merged into the tip::timers/ptp branch. That way the result in -next or upstream in the next merge window has zero dependencies. Signed-off-by: Thomas Gleixner Reviewed-by: Vadim Fedorenko Acked-by: John Stultz Link: https://lore.kernel.org/all/20250701132628.357686408@linutronix.de --- include/linux/timekeeping.h | 10 ++++++++++ kernel/time/timekeeping.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 542773650200..4a4c2778abae 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -44,6 +44,7 @@ extern void ktime_get_ts64(struct timespec64 *ts); extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); +extern void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); @@ -345,4 +346,13 @@ void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, extern int update_persistent_clock64(struct timespec64 now); #endif +/* Temporary workaround to avoid merge dependencies and cross tree messes */ +#ifndef CLOCK_AUX +#define CLOCK_AUX MAX_CLOCKS +#define MAX_AUX_CLOCKS 8 +#define CLOCK_AUX_LAST (CLOCK_AUX + MAX_AUX_CLOCKS - 1) + +static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return 
false; } +#endif + #endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a009c91f7b05..572e3bd0cc94 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1573,6 +1573,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->tv_sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres From 8959338617a85e35820e3a7fa21801cf55b068bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Jul 2025 14:39:28 +0200 Subject: [PATCH 28/39] timekeeping: Remove the temporary CLOCK_AUX workaround ktime_get_clock_ts64() was provided for the networking tree as a stand alone commit based on v6.16-rc1. It contains a temporary workaround for the CLOCK_AUX* defines, which are only available in the timekeeping tree. As this commit is now merged into the timers/ptp branch, which contains the real CLOCK_AUX* defines, the workaround is obsolete. Remove it.
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701130923.579834908@linutronix.de --- include/linux/timekeeping.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 6121924d93c4..aee2c1a46e47 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -357,13 +357,4 @@ void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, extern int update_persistent_clock64(struct timespec64 now); #endif -/* Temporary workaround to avoid merge dependencies and cross tree messes */ -#ifndef CLOCK_AUX -#define CLOCK_AUX MAX_CLOCKS -#define MAX_AUX_CLOCKS 8 -#define CLOCK_AUX_LAST (CLOCK_AUX + MAX_AUX_CLOCKS - 1) - -static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; } -#endif - #endif From 6fedaf682a5e1866efdaddc70ff0ada329825d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:57:56 +0200 Subject: [PATCH 29/39] vdso/vsyscall: Introduce a helper to fill clock configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The logic to configure a 'struct vdso_clock' from a 'struct tk_read_base' is copied two times. Split it into a shared function to reduce the duplication, especially as another user will be added for auxiliary clocks. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-2-df7d9f87b9b8@linutronix.de --- kernel/time/vsyscall.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 32ef27c71b57..d655df259733 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,26 +15,25 @@ #include "timekeeping_internal.h" +static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) +{ + vc->cycle_last = base->cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vc->max_cycles = base->clock->max_cycles; +#endif + vc->mask = base->mask; + vc->mult = base->mult; + vc->shift = base->shift; +} + static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; -#endif - vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; -#endif - vc[CS_RAW].mask = tk->tkr_raw.mask; - vc[CS_RAW].mult = tk->tkr_raw.mult; - vc[CS_RAW].shift = tk->tkr_raw.shift; + fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); + fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; From 76164ca0d113e6a9f3033f948c739586fc606ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:57:57 +0200 Subject: [PATCH 30/39] vdso/vsyscall: Split up __arch_update_vsyscall() 
into __arch_update_vdso_clock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upcoming auxiliary clocks need this hook, too. To separate the architecture hooks from the timekeeper internals, refactor the hook to only operate on a single vDSO clock. While at it, use a more robust #define for the hook override. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-3-df7d9f87b9b8@linutronix.de --- arch/arm64/include/asm/vdso/vsyscall.h | 7 +++---- include/asm-generic/vdso/vsyscall.h | 6 +++--- kernel/time/vsyscall.c | 3 ++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h index de58951b8df6..417aae5763a8 100644 --- a/arch/arm64/include/asm/vdso/vsyscall.h +++ b/arch/arm64/include/asm/vdso/vsyscall.h @@ -13,12 +13,11 @@ * Update the vDSO data page to keep in sync with kernel timekeeping. 
*/ static __always_inline -void __arm64_update_vsyscall(struct vdso_time_data *vdata) +void __arch_update_vdso_clock(struct vdso_clock *vc) { - vdata->clock_data[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK; - vdata->clock_data[CS_RAW].mask = VDSO_PRECISION_MASK; + vc->mask = VDSO_PRECISION_MASK; } -#define __arch_update_vsyscall __arm64_update_vsyscall +#define __arch_update_vdso_clock __arch_update_vdso_clock /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index b550afa15ecd..7fc0b560007d 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -22,11 +22,11 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo #endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ -#ifndef __arch_update_vsyscall -static __always_inline void __arch_update_vsyscall(struct vdso_time_data *vdata) +#ifndef __arch_update_vdso_clock +static __always_inline void __arch_update_vdso_clock(struct vdso_clock *vc) { } -#endif /* __arch_update_vsyscall */ +#endif /* __arch_update_vdso_clock */ #ifndef __arch_sync_vdso_time_data static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index d655df259733..df6bada2d58e 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -118,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk) if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_time_data(vdata, tk); - __arch_update_vsyscall(vdata); + __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); + __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); From ad64d71d7409a0602b50ee71c7f9663a3385c286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:57:58 +0200 Subject: [PATCH 31/39] vdso/helpers: Add helpers for seqlocks of single vdso_clock MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Auxiliary clocks will have their vDSO data in a dedicated 'struct vdso_clock', which needs to be synchronized independently. Add a helper to synchronize a single vDSO clock. [ tglx: Move the SMP memory barriers to the call sites and get rid of the confusing first/last arguments and conditional barriers ] Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-4-df7d9f87b9b8@linutronix.de --- include/vdso/helpers.h | 54 +++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 0a98fed550ba..1a5ee9d9052c 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -28,17 +28,47 @@ static __always_inline u32 vdso_read_retry(const struct vdso_clock *vc, return seq != start; } +static __always_inline void vdso_write_seq_begin(struct vdso_clock *vc) +{ + /* + * WRITE_ONCE() is required otherwise the compiler can validly tear + * updates to vc->seq and it is possible that the value seen by the + * reader is inconsistent. + */ + WRITE_ONCE(vc->seq, vc->seq + 1); +} + +static __always_inline void vdso_write_seq_end(struct vdso_clock *vc) +{ + /* + * WRITE_ONCE() is required otherwise the compiler can validly tear + * updates to vc->seq and it is possible that the value seen by the + * reader is inconsistent. 
+ */ + WRITE_ONCE(vc->seq, vc->seq + 1); +} + +static __always_inline void vdso_write_begin_clock(struct vdso_clock *vc) +{ + vdso_write_seq_begin(vc); + /* Ensure the sequence invalidation is visible before data is modified */ + smp_wmb(); +} + +static __always_inline void vdso_write_end_clock(struct vdso_clock *vc) +{ + /* Ensure the data update is visible before the sequence is set valid again */ + smp_wmb(); + vdso_write_seq_end(vc); +} + static __always_inline void vdso_write_begin(struct vdso_time_data *vd) { struct vdso_clock *vc = vd->clock_data; - /* - * WRITE_ONCE() is required otherwise the compiler can validly tear - * updates to vd[x].seq and it is possible that the value seen by the - * reader is inconsistent. - */ - WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); + vdso_write_seq_begin(&vc[CS_HRES_COARSE]); + vdso_write_seq_begin(&vc[CS_RAW]); + /* Ensure the sequence invalidation is visible before data is modified */ smp_wmb(); } @@ -46,14 +76,10 @@ static __always_inline void vdso_write_end(struct vdso_time_data *vd) { struct vdso_clock *vc = vd->clock_data; + /* Ensure the data update is visible before the sequence is set valid again */ smp_wmb(); - /* - * WRITE_ONCE() is required otherwise the compiler can validly tear - * updates to vd[x].seq and it is possible that the value seen by the - * reader is inconsistent. 
- */ - WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); + vdso_write_seq_end(&vc[CS_HRES_COARSE]); + vdso_write_seq_end(&vc[CS_RAW]); } #endif /* !__ASSEMBLY__ */ From 34f888e3405acefc3a353227aa850dd0a37e709d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:57:59 +0200 Subject: [PATCH 32/39] vdso/gettimeofday: Return bool from clock_getres() helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The internal helpers are effectively using boolean results, while pretending to use error numbers. Switch the return type to bool for more clarity. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-5-df7d9f87b9b8@linutronix.de --- lib/vdso/gettimeofday.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 93ef801a97ef..9b77f23566f6 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -396,8 +396,8 @@ static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time #ifdef VDSO_HAS_CLOCK_GETRES static __maybe_unused -int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock, - struct __kernel_timespec *res) +bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock, + struct __kernel_timespec *res) { const struct vdso_clock *vc = vd->clock_data; u32 msk; @@ -405,7 +405,7 @@ int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock /* Check for negative values or invalid clocks */ if (unlikely((u32) clock >= MAX_CLOCKS)) - return -1; + return false; if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) @@ -427,23 +427,25 @@ int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock */ ns = LOW_RES_NSEC; } else { - 
return -1; + return false; } if (likely(res)) { res->tv_sec = 0; res->tv_nsec = ns; } - return 0; + return true; } static __maybe_unused int __cvdso_clock_getres_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *res) { - int ret = __cvdso_clock_getres_common(vd, clock, res); + bool ok; - if (unlikely(ret)) + ok = __cvdso_clock_getres_common(vd, clock, res); + + if (unlikely(!ok)) return clock_getres_fallback(clock, res); return 0; } @@ -460,18 +462,18 @@ __cvdso_clock_getres_time32_data(const struct vdso_time_data *vd, clockid_t cloc struct old_timespec32 *res) { struct __kernel_timespec ts; - int ret; + bool ok; - ret = __cvdso_clock_getres_common(vd, clock, &ts); + ok = __cvdso_clock_getres_common(vd, clock, &ts); - if (unlikely(ret)) + if (unlikely(!ok)) return clock_getres32_fallback(clock, res); if (likely(res)) { res->tv_sec = ts.tv_sec; res->tv_nsec = ts.tv_nsec; } - return ret; + return 0; } static __maybe_unused int From fb61bdb27fd730c393a8bddbda2401c37a919667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:00 +0200 Subject: [PATCH 33/39] vdso/gettimeofday: Return bool from clock_gettime() helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The internal helpers are effectively using boolean results, while pretending to use error numbers. Switch the return type to bool for more clarity. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-6-df7d9f87b9b8@linutronix.de --- lib/vdso/gettimeofday.c | 70 +++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 9b77f23566f6..32e568db6d5a 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -82,8 +82,8 @@ const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_tim #endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ static __always_inline -int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); const struct timens_offset *offs = &vcns->offset[clk]; @@ -103,11 +103,11 @@ int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *v seq = vdso_read_begin(vc); if (unlikely(!vdso_clocksource_ok(vc))) - return -1; + return false; cycles = __arch_get_hw_counter(vc->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) - return -1; + return false; ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); sec = vdso_ts->sec; } while (unlikely(vdso_read_retry(vc, seq))); @@ -123,7 +123,7 @@ int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *v ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); ts->tv_nsec = ns; - return 0; + return true; } #else static __always_inline @@ -133,16 +133,16 @@ const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_tim } static __always_inline -int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_hres_timens(const struct vdso_time_data *vdns, const 
struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - return -EINVAL; + return false; } #endif static __always_inline -int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, - clockid_t clk, struct __kernel_timespec *ts) +bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; u64 cycles, sec, ns; @@ -150,7 +150,7 @@ int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, /* Allows to compile the high resolution parts out */ if (!__arch_vdso_hres_capable()) - return -1; + return false; do { /* @@ -173,11 +173,11 @@ int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, smp_rmb(); if (unlikely(!vdso_clocksource_ok(vc))) - return -1; + return false; cycles = __arch_get_hw_counter(vc->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) - return -1; + return false; ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); sec = vdso_ts->sec; } while (unlikely(vdso_read_retry(vc, seq))); @@ -189,13 +189,13 @@ int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); ts->tv_nsec = ns; - return 0; + return true; } #ifdef CONFIG_TIME_NS static __always_inline -int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); const struct timens_offset *offs = &vcns->offset[clk]; @@ -223,20 +223,20 @@ int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock */ ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); ts->tv_nsec = nsec; - return 0; + return true; } #else static __always_inline -int 
do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) +bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - return -1; + return false; } #endif static __always_inline -int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, - clockid_t clk, struct __kernel_timespec *ts) +bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; u32 seq; @@ -258,10 +258,10 @@ int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, ts->tv_nsec = vdso_ts->nsec; } while (unlikely(vdso_read_retry(vc, seq))); - return 0; + return true; } -static __always_inline int +static __always_inline bool __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { @@ -270,7 +270,7 @@ __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, /* Check for negative values or invalid clocks */ if (unlikely((u32) clock >= MAX_CLOCKS)) - return -1; + return false; /* * Convert the clockid to a bitmask and use it to check which @@ -284,7 +284,7 @@ __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, else if (msk & VDSO_RAW) vc = &vc[CS_RAW]; else - return -1; + return false; return do_hres(vd, vc, clock, ts); } @@ -293,9 +293,11 @@ static __maybe_unused int __cvdso_clock_gettime_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { - int ret = __cvdso_clock_gettime_common(vd, clock, ts); + bool ok; - if (unlikely(ret)) + ok = __cvdso_clock_gettime_common(vd, clock, ts); + + if (unlikely(!ok)) return clock_gettime_fallback(clock, ts); return 0; } @@ -312,18 +314,18 @@ __cvdso_clock_gettime32_data(const struct vdso_time_data *vd, clockid_t clock, struct 
old_timespec32 *res) { struct __kernel_timespec ts; - int ret; + bool ok; - ret = __cvdso_clock_gettime_common(vd, clock, &ts); + ok = __cvdso_clock_gettime_common(vd, clock, &ts); - if (unlikely(ret)) + if (unlikely(!ok)) return clock_gettime32_fallback(clock, res); - /* For ret == 0 */ + /* For ok == true */ res->tv_sec = ts.tv_sec; res->tv_nsec = ts.tv_nsec; - return ret; + return 0; } static __maybe_unused int @@ -342,7 +344,7 @@ __cvdso_gettimeofday_data(const struct vdso_time_data *vd, if (likely(tv != NULL)) { struct __kernel_timespec ts; - if (do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) + if (!do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) return gettimeofday_fallback(tv, tz); tv->tv_sec = ts.tv_sec; From 1a1cd5fe881fdf7b0391e5426f6bfcb663c90dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:01 +0200 Subject: [PATCH 34/39] vdso/gettimeofday: Introduce vdso_clockid_valid() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the clock ID validation check into a common helper. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-7-df7d9f87b9b8@linutronix.de --- lib/vdso/gettimeofday.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 32e568db6d5a..0271226aaa5c 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -71,6 +71,12 @@ static inline bool vdso_cycles_ok(u64 cycles) } #endif +static __always_inline bool vdso_clockid_valid(clockid_t clock) +{ + /* Check for negative values or invalid clocks */ + return likely((u32) clock < MAX_CLOCKS); +} + #ifdef CONFIG_TIME_NS #ifdef CONFIG_GENERIC_VDSO_DATA_STORE @@ -268,8 +274,7 @@ __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, const struct vdso_clock *vc = vd->clock_data; u32 msk; - /* Check for negative values or invalid clocks */ - if (unlikely((u32) clock >= MAX_CLOCKS)) + if (!vdso_clockid_valid(clock)) return false; /* @@ -405,8 +410,7 @@ bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t cloc u32 msk; u64 ns; - /* Check for negative values or invalid clocks */ - if (unlikely((u32) clock >= MAX_CLOCKS)) + if (!vdso_clockid_valid(clock)) return false; if (IS_ENABLED(CONFIG_TIME_NS) && From 381d96ccc1a52237e03ac97b4d2945997c9356e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:02 +0200 Subject: [PATCH 35/39] vdso/gettimeofday: Introduce vdso_set_timespec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code is duplicated and with the introduction of auxiliary clocks will be duplicated even more. Introduce a helper. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-8-df7d9f87b9b8@linutronix.de --- lib/vdso/gettimeofday.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 0271226aaa5c..9d7ac980107d 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -77,6 +77,16 @@ static __always_inline bool vdso_clockid_valid(clockid_t clock) return likely((u32) clock < MAX_CLOCKS); } +/* + * Must not be invoked within the sequence read section as a race inside + * that loop could result in __iter_div_u64_rem() being extremely slow. + */ +static __always_inline void vdso_set_timespec(struct __kernel_timespec *ts, u64 sec, u64 ns) +{ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; +} + #ifdef CONFIG_TIME_NS #ifdef CONFIG_GENERIC_VDSO_DATA_STORE @@ -122,12 +132,7 @@ bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock * sec += offs->sec; ns += offs->nsec; - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. - */ - ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; + vdso_set_timespec(ts, sec, ns); return true; } @@ -188,12 +193,7 @@ bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, sec = vdso_ts->sec; } while (unlikely(vdso_read_retry(vc, seq))); - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. 
- */ - ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; + vdso_set_timespec(ts, sec, ns); return true; } @@ -223,12 +223,8 @@ bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock sec += offs->sec; nsec += offs->nsec; - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. - */ - ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); - ts->tv_nsec = nsec; + vdso_set_timespec(ts, sec, nsec); + return true; } #else From 562f03ed967dc65e513a3e2e9821f656d5333b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:03 +0200 Subject: [PATCH 36/39] vdso/gettimeofday: Introduce vdso_get_timestamp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code is duplicated and with the introduction of auxiliary clocks will be duplicated even more. Introduce a helper. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-9-df7d9f87b9b8@linutronix.de --- lib/vdso/gettimeofday.c | 43 +++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 9d7ac980107d..fc0038e83b5c 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -87,6 +87,26 @@ static __always_inline void vdso_set_timespec(struct __kernel_timespec *ts, u64 ts->tv_nsec = ns; } +static __always_inline +bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock *vc, + unsigned int clkidx, u64 *sec, u64 *ns) +{ + const struct vdso_timestamp *vdso_ts = &vc->basetime[clkidx]; + u64 cycles; + + if (unlikely(!vdso_clocksource_ok(vc))) + return false; + + cycles = __arch_get_hw_counter(vc->clock_mode, vd); + if (unlikely(!vdso_cycles_ok(cycles))) + return false; + + *ns = vdso_calc_ns(vc, cycles, 
vdso_ts->nsec); + *sec = vdso_ts->sec; + + return true; +} + #ifdef CONFIG_TIME_NS #ifdef CONFIG_GENERIC_VDSO_DATA_STORE @@ -104,28 +124,20 @@ bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock * const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); const struct timens_offset *offs = &vcns->offset[clk]; const struct vdso_clock *vc = vd->clock_data; - const struct vdso_timestamp *vdso_ts; - u64 cycles, ns; u32 seq; s64 sec; + u64 ns; if (clk != CLOCK_MONOTONIC_RAW) vc = &vc[CS_HRES_COARSE]; else vc = &vc[CS_RAW]; - vdso_ts = &vc->basetime[clk]; do { seq = vdso_read_begin(vc); - if (unlikely(!vdso_clocksource_ok(vc))) + if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) return false; - - cycles = __arch_get_hw_counter(vc->clock_mode, vd); - if (unlikely(!vdso_cycles_ok(cycles))) - return false; - ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); - sec = vdso_ts->sec; } while (unlikely(vdso_read_retry(vc, seq))); /* Add the namespace offset */ @@ -155,8 +167,7 @@ static __always_inline bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; - u64 cycles, sec, ns; + u64 sec, ns; u32 seq; /* Allows to compile the high resolution parts out */ @@ -183,14 +194,8 @@ bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, } smp_rmb(); - if (unlikely(!vdso_clocksource_ok(vc))) + if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns)) return false; - - cycles = __arch_get_hw_counter(vc->clock_mode, vd); - if (unlikely(!vdso_cycles_ok(cycles))) - return false; - ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); - sec = vdso_ts->sec; } while (unlikely(vdso_read_retry(vc, seq))); vdso_set_timespec(ts, sec, ns); From 9b7fc3f14576c268f62fe0b882fac5e61239b659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:04 +0200 Subject: [PATCH 37/39] vdso: Introduce 
aux_clock_resolution_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the constant resolution to a shared header, so the vDSO can use it and return it without going through a syscall. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-10-df7d9f87b9b8@linutronix.de --- include/vdso/auxclock.h | 13 +++++++++++++ kernel/time/timekeeping.c | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 include/vdso/auxclock.h diff --git a/include/vdso/auxclock.h b/include/vdso/auxclock.h new file mode 100644 index 000000000000..6d6e74cbc400 --- /dev/null +++ b/include/vdso/auxclock.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VDSO_AUXCLOCK_H +#define _VDSO_AUXCLOCK_H + +#include +#include + +static __always_inline u64 aux_clock_resolution_ns(void) +{ + return 1; +} + +#endif /* _VDSO_AUXCLOCK_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c6fe89bded02..cbcf090bb4be 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,6 +26,8 @@ #include #include +#include + #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -2876,8 +2878,8 @@ static int aux_get_res(clockid_t id, struct timespec64 *tp) if (!clockid_aux_valid(id)) return -ENODEV; - tp->tv_sec = 0; - tp->tv_nsec = 1; + tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; + tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; return 0; } From 380b84e168e57c54d0a9e053a5558fddc43f0c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:05 +0200 Subject: [PATCH 38/39] vdso/vsyscall: Update auxiliary clock data in the datapage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the auxiliary clock data so it can be read from the vDSO. 
Architectures not using the generic vDSO time framework, namely SPARC64, are not supported. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-11-df7d9f87b9b8@linutronix.de --- include/linux/timekeeper_internal.h | 6 +++++ include/vdso/datapage.h | 3 +++ kernel/time/namespace.c | 5 ++++ kernel/time/timekeeping.c | 12 +++++++++ kernel/time/vsyscall.c | 40 +++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index ca79938b62f3..c27aac67cb3f 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -190,4 +190,10 @@ static inline void update_vsyscall_tz(void) } #endif +#if defined(CONFIG_GENERIC_GETTIMEOFDAY) && defined(CONFIG_POSIX_AUX_CLOCKS) +extern void vdso_time_update_aux(struct timekeeper *tk); +#else +static inline void vdso_time_update_aux(struct timekeeper *tk) { } +#endif + #endif /* _LINUX_TIMEKEEPER_INTERNAL_H */ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 1864e76e8f69..f4c96d9ce674 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -38,6 +38,7 @@ struct vdso_arch_data { #endif #define VDSO_BASES (CLOCK_TAI + 1) +#define VDSO_BASE_AUX 0 #define VDSO_HRES (BIT(CLOCK_REALTIME) | \ BIT(CLOCK_MONOTONIC) | \ BIT(CLOCK_BOOTTIME) | \ @@ -117,6 +118,7 @@ struct vdso_clock { * @arch_data: architecture specific data (optional, defaults * to an empty struct) * @clock_data: clocksource related data (array) + * @aux_clock_data: auxiliary clocksource related data (array) * @tz_minuteswest: minutes west of Greenwich * @tz_dsttime: type of DST correction * @hrtimer_res: hrtimer resolution @@ -133,6 +135,7 @@ struct vdso_time_data { struct arch_vdso_time_data arch_data; struct vdso_clock clock_data[CS_BASES]; + struct vdso_clock aux_clock_data[MAX_AUX_CLOCKS]; s32 tz_minuteswest; s32 tz_dsttime; diff --git 
a/kernel/time/namespace.c b/kernel/time/namespace.c index e3642278df43..667452768ed3 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -242,6 +242,11 @@ static void timens_set_vvar_page(struct task_struct *task, for (i = 0; i < CS_BASES; i++) timens_setup_vdso_clock_data(&vc[i], ns); + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } + out: mutex_unlock(&offset_lock); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbcf090bb4be..243fe25e680a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -66,11 +66,21 @@ static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); } + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} #else static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return false; } + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} #endif /* flag for if timekeeping is suspended */ @@ -719,6 +729,8 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); } if (action & TK_CLOCK_WAS_SET) diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index df6bada2d58e..8ba8b0d8a387 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -136,6 +136,46 @@ void update_vsyscall_tz(void) __arch_sync_vdso_time_data(vdata); } +#ifdef CONFIG_POSIX_AUX_CLOCKS +void vdso_time_update_aux(struct timekeeper *tk) +{ + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_timestamp *vdso_ts; + struct vdso_clock *vc; + s32 clock_mode; + u64 
nsec; + + vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; + vdso_ts = &vc->basetime[VDSO_BASE_AUX]; + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + if (!tk->clock_valid) + clock_mode = VDSO_CLOCKMODE_NONE; + + /* copy vsyscall data */ + vdso_write_begin_clock(vc); + + vc->clock_mode = clock_mode; + + if (clock_mode != VDSO_CLOCKMODE_NONE) { + fill_clock_configuration(vc, &tk->tkr_mono); + + vdso_ts->sec = tk->xtime_sec; + + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->offs_aux; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + nsec = nsec << tk->tkr_mono.shift; + vdso_ts->nsec = nsec; + } + + __arch_update_vdso_clock(vc); + + vdso_write_end_clock(vc); + + __arch_sync_vdso_time_data(vdata); +} +#endif + /** * vdso_update_begin - Start of a VDSO update section * From cd3557a7618bf5c1935e9f66b58a329f1f1f4b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:06 +0200 Subject: [PATCH 39/39] vdso/gettimeofday: Add support for auxiliary clocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the auxiliary clocks through the vDSO. Architectures not using the generic vDSO time framework, namely SPARC64, are not supported. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-12-df7d9f87b9b8@linutronix.de --- include/vdso/datapage.h | 2 ++ lib/vdso/gettimeofday.c | 49 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index f4c96d9ce674..02533038640e 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -5,6 +5,7 @@ #ifndef __ASSEMBLY__ #include +#include #include #include #include @@ -46,6 +47,7 @@ struct vdso_arch_data { #define VDSO_COARSE (BIT(CLOCK_REALTIME_COARSE) | \ BIT(CLOCK_MONOTONIC_COARSE)) #define VDSO_RAW (BIT(CLOCK_MONOTONIC_RAW)) +#define VDSO_AUX __GENMASK(CLOCK_AUX_LAST, CLOCK_AUX) #define CS_HRES_COARSE 0 #define CS_RAW 1 diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index fc0038e83b5c..02ea19f67164 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -2,6 +2,7 @@ /* * Generic userspace implementations of gettimeofday() and similar. */ +#include #include #include @@ -74,7 +75,7 @@ static inline bool vdso_cycles_ok(u64 cycles) static __always_inline bool vdso_clockid_valid(clockid_t clock) { /* Check for negative values or invalid clocks */ - return likely((u32) clock < MAX_CLOCKS); + return likely((u32) clock <= CLOCK_AUX_LAST); } /* @@ -268,6 +269,48 @@ bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, return true; } +static __always_inline +bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) +{ + const struct vdso_clock *vc; + u32 seq, idx; + u64 sec, ns; + + if (!IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + return false; + + idx = clock - CLOCK_AUX; + vc = &vd->aux_clock_data[idx]; + + do { + /* + * Open coded function vdso_read_begin() to handle + * VDSO_CLOCKMODE_TIMENS. See comment in do_hres().
+ */ + while ((seq = READ_ONCE(vc->seq)) & 1) { + if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) { + vd = __arch_get_vdso_u_timens_data(vd); + vc = &vd->aux_clock_data[idx]; + /* Re-read from the real time data page */ + continue; + } + cpu_relax(); + } + smp_rmb(); + + /* Auxclock disabled? */ + if (vc->clock_mode == VDSO_CLOCKMODE_NONE) + return false; + + if (!vdso_get_timestamp(vd, vc, VDSO_BASE_AUX, &sec, &ns)) + return false; + } while (unlikely(vdso_read_retry(vc, seq))); + + vdso_set_timespec(ts, sec, ns); + + return true; +} + static __always_inline bool __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) @@ -289,6 +332,8 @@ __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, return do_coarse(vd, &vc[CS_HRES_COARSE], clock, ts); else if (msk & VDSO_RAW) vc = &vc[CS_RAW]; + else if (msk & VDSO_AUX) + return do_aux(vd, clock, ts); else return false; @@ -433,6 +478,8 @@ bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t cloc * Preserves the behaviour of posix_get_coarse_res(). */ ns = LOW_RES_NSEC; + } else if (msk & VDSO_AUX) { + ns = aux_clock_resolution_ns(); } else { return false; }