From e560083c0467f86b72aecac377b27bd1e7d16c49 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 30 Jan 2026 12:49:40 +0530 Subject: [PATCH 1/9] OPP: debugfs: Use performance level if available to distinguish between rates Some OPP tables have entries with same rate and different performance level. For these entries, using only the rate as the debugfs directory name causes below error: debugfs: 'opp:5000000' already exists in 'soc@0-1c00000.pci' Fix it by appending the performance level to the dir name if available. Reported-by: Bjorn Andersson Closes: https://lore.kernel.org/linux-arm-msm/75lzykd37zdvrks5i2bb4zb2yzjtm25kv3hegmikndkbr772mz@w2ykff3ny45u/ Fixes: 05db35963eef ("OPP: Add support to find OPP for a set of keys") Signed-off-by: Manivannan Sadhasivam Signed-off-by: Viresh Kumar --- drivers/opp/debugfs.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c index 8fc6238b1728..61506d30d5ff 100644 --- a/drivers/opp/debugfs.c +++ b/drivers/opp/debugfs.c @@ -130,22 +130,24 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) { struct dentry *pdentry = opp_table->dentry; struct dentry *d; - unsigned long id; - char name[25]; /* 20 chars for 64 bit value + 5 (opp:\0) */ + char name[36]; /* "opp:"(4) + u64(20) + "-" (1) + u32(10) + NULL(1) */ /* * Get directory name for OPP. * - * - Normally rate is unique to each OPP, use it to get unique opp-name. + * - Normally rate is unique to each OPP, use it to get unique opp-name, + * together with performance level if available. * - For some devices rate isn't available or there are multiple, use * index instead for them. 
*/ - if (likely(opp_table->clk_count == 1 && opp->rates[0])) - id = opp->rates[0]; - else - id = _get_opp_count(opp_table); - - snprintf(name, sizeof(name), "opp:%lu", id); + if (likely(opp_table->clk_count == 1 && opp->rates[0])) { + if (opp->level == OPP_LEVEL_UNSET) + snprintf(name, sizeof(name), "opp:%lu", opp->rates[0]); + else + snprintf(name, sizeof(name), "opp:%lu-%u", opp->rates[0], opp->level); + } else { + snprintf(name, sizeof(name), "opp:%u", _get_opp_count(opp_table)); + } /* Create per-opp directory */ d = debugfs_create_dir(name, pdentry); From 3d2398f44a2d48fb1c575a6e0bc6b38f3e689e22 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 23 Feb 2026 11:05:59 +0530 Subject: [PATCH 2/9] OPP: Move break out of scoped_guard in dev_pm_opp_xlate_required_opp() The commit ff9c512041f2 ("OPP: Use mutex locking guards") unintentionally made the for loop run longer than required. scoped_guard() is implemented as a for loop. The break statement now breaks out of the scoped_guard() and not out of the outer for loop. The outer loop always iterates to completion. Fix it. Fixes: ff9c512041f2 ("OPP: Use mutex locking guards") Reported-by: David Lechner Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 866641666e41..da3f5eba4341 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2742,8 +2742,8 @@ struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, break; } } - break; } + break; } if (IS_ERR(dest_opp)) { From e57c2bf2e89df3b176ab579abfd3ed54fd27034c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:38:55 +0100 Subject: [PATCH 3/9] cpuidle: governors: menu: Refine stopped tick handling This change is based on the observation that it is not in fact necessary to select a deep idle state every time the scheduler tick has been stopped before the idle state selection takes place. 
Namely, if the time till the closest timer (that is not the tick) is short enough, a shallow idle state can be selected because the timer will kick the CPU out of that state, so the damage from a possible overly optimistic selection will be limited. Update the menu governor in accordance with the above and use twice the tick period length as the "safe timer range" for allowing the original predicted_ns value to be used even if the tick has been stopped. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/3341782.5fSG56mABF@rafael.j.wysocki --- drivers/cpuidle/governors/gov.h | 5 +++++ drivers/cpuidle/governors/menu.c | 15 +++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/governors/gov.h b/drivers/cpuidle/governors/gov.h index 99e067d9668c..cd06a2e7b506 100644 --- a/drivers/cpuidle/governors/gov.h +++ b/drivers/cpuidle/governors/gov.h @@ -10,5 +10,10 @@ * check the time till the closest expected timer event. */ #define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) +/* + * If the closest timer is in this range, the governor idle state selection need + * not be adjusted after the scheduler tick has been stopped. + */ +#define SAFE_TIMER_RANGE_NS (2 * TICK_NSEC) #endif /* __CPUIDLE_GOVERNOR_H */ diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 899ff16ff1fe..544a5d593007 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -261,13 +261,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); /* * If the tick is already stopped, the cost of possible short - * idle duration misprediction is much higher, because the CPU - * may be stuck in a shallow idle state for a long time as a - * result of it. In that case, say we might mispredict and use - * the known time till the closest timer event for the idle - * state selection. 
+ * idle duration misprediction is higher because the CPU may get + * stuck in a shallow idle state then. To avoid that, if + * predicted_ns is small enough, say it might be mispredicted + * and use the known time till the closest timer for idle state + * selection unless that timer is going to trigger within + * SAFE_TIMER_RANGE_NS in which case it can be regarded as a + * sufficient safety net. */ - if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC) + if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC && + data->next_timer_ns > SAFE_TIMER_RANGE_NS) predicted_ns = data->next_timer_ns; } else { /* From 106a2662e655363db0dd73d9a91e1ec9afe9f4b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:40:18 +0100 Subject: [PATCH 4/9] cpuidle: governors: teo: Rearrange stopped tick handling This change is based on the observation that it is not in fact necessary to select a deep idle state every time the scheduler tick has been stopped before the idle state selection takes place. Namely, if the time till the closest timer (that is not the tick) is short enough, a shallow idle state can be selected because the timer will kick the CPU out of that state, so the damage from a possible overly optimistic selection will be limited. Update the teo governor in accordance with the above, in analogy with the previous menu governor update. Among other things, this will cause the teo governor to call tick_nohz_get_sleep_length() every time when the tick has been stopped already and only change the original idle state selection if the time till the closest timer is beyond SAFE_TIMER_RANGE_NS, which is way more straightforward than the current code flow. Of course, this effectively throws away some of the recent teo governor changes, but the resulting simplification is worth it in my view. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/1865078.VLH7GnMWUR@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 81 ++++++++++++++------------------- 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bec0142377b8..ac43b9b013b3 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -407,50 +407,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * better choice. */ if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { - int min_idx = idx0; - - if (tick_nohz_tick_stopped()) { - /* - * Look for the shallowest idle state below the current - * candidate one whose target residency is at least - * equal to the tick period length. - */ - while (min_idx < idx && - drv->states[min_idx].target_residency_ns < TICK_NSEC) - min_idx++; - - /* - * Avoid selecting a state with a lower index, but with - * the same target residency as the current candidate - * one. - */ - if (drv->states[min_idx].target_residency_ns == - drv->states[idx].target_residency_ns) - goto constraint; - } - - /* - * If the minimum state index is greater than or equal to the - * index of the state with the maximum intercepts metric and - * the corresponding state is enabled, there is no need to look - * at the deeper states. - */ - if (min_idx >= intercept_max_idx && - !dev->states_usage[min_idx].disable) { - idx = min_idx; - goto constraint; - } - /* * Look for the deepest enabled idle state, at most as deep as * the one with the maximum intercepts metric, whose target * residency had not been greater than the idle duration in over * a half of the relevant cases in the past. - * - * Take the possible duration limitation present if the tick - * has been stopped already into account. 
*/ - for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) { + for (i = idx - 1, intercept_sum = 0; i >= idx0; i--) { intercept_sum += cpu_data->state_bins[i].intercepts; if (dev->states_usage[i].disable) @@ -463,7 +426,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } } -constraint: /* * If there is a latency constraint, it may be necessary to select an * idle state shallower than the current candidate one. @@ -472,13 +434,13 @@ constraint: idx = constraint_idx; /* - * If either the candidate state is state 0 or its target residency is - * low enough, there is basically nothing more to do, but if the sleep - * length is not updated, the subsequent wakeup will be counted as an - * "intercept" which may be problematic in the cases when timer wakeups - * are dominant. Namely, it may effectively prevent deeper idle states - * from being selected at one point even if no imminent timers are - * scheduled. + * If the tick has not been stopped and either the candidate state is + * state 0 or its target residency is low enough, there is basically + * nothing more to do, but if the sleep length is not updated, the + * subsequent wakeup will be counted as an "intercept". That may be + * problematic in the cases when timer wakeups are dominant because it + * may effectively prevent deeper idle states from being selected at one + * point even if no imminent timers are scheduled. * * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one * CPU are unlikely (user space has a default 50 us slack value for @@ -494,7 +456,8 @@ constraint: * shallow idle states regardless of the wakeup type, so the sleep * length need not be known in that case. 
*/ - if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && + if (!tick_nohz_tick_stopped() && (!idx || + drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && (2 * cpu_data->short_idles >= cpu_data->total || latency_req < LATENCY_THRESHOLD_NS)) goto out_tick; @@ -502,6 +465,30 @@ constraint: duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; + /* + * If the tick has been stopped and the closest timer is too far away, + * update the selection to prevent the CPU from getting stuck in a + * shallow idle state for too long. + */ + if (tick_nohz_tick_stopped() && duration_ns > SAFE_TIMER_RANGE_NS && + drv->states[idx].target_residency_ns < TICK_NSEC) { + /* + * Look for the deepest enabled idle state with exit latency + * within the PM QoS limit and with target residency within + * duration_ns. + */ + for (i = constraint_idx; i > idx; i--) { + if (dev->states_usage[i].disable) + continue; + + if (drv->states[i].target_residency_ns <= duration_ns) { + idx = i; + break; + } + } + return idx; + } + if (!idx) goto out_tick; From d51de21b4c3a34a2cc592319df63864e14b18b29 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 9 Mar 2026 10:38:18 +0200 Subject: [PATCH 5/9] intel_idle: Add Panther Lake C-states table Panther Lake supports the following requestable C-states: C1, C1E, C6S, C10. The parameters of these C-states should be consistent across all systems based on Panther Lake, so add a custom C-states table for it that will override C-state parameters supplied by platform firmware that may vary from one platform to another and may not represent the most optimum choice. Signed-off-by: Artem Bityutskiy [ rjw: Changelog expansion ] Link: https://patch.msgid.link/20260309083818.79588-1-dedekind1@gmail.com Signed-off-by: Rafael J. 
Wysocki --- drivers/idle/intel_idle.c | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f49c939d636f..f49354e37777 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -983,6 +983,43 @@ static struct cpuidle_state mtl_l_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state ptl_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, + .target_residency = 10, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6S", + .desc = "MWAIT 0x21", + .flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, + .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 370, + .target_residency = 2500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state gmt_cstates[] __initdata = { { .name = "C1", @@ -1561,6 +1598,10 @@ static const struct idle_cpu idle_cpu_mtl_l __initconst = { .state_table = mtl_l_cstates, }; +static const struct idle_cpu idle_cpu_ptl __initconst = { + .state_table = ptl_cstates, +}; + static const struct idle_cpu idle_cpu_gmt __initconst = { .state_table = gmt_cstates, }; @@ -1669,6 +1710,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_VFM(INTEL_ALDERLAKE, &idle_cpu_adl), X86_MATCH_VFM(INTEL_ALDERLAKE_L, &idle_cpu_adl_l), X86_MATCH_VFM(INTEL_METEORLAKE_L, &idle_cpu_mtl_l), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &idle_cpu_ptl), X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 
&idle_cpu_gmt), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr), From 9b0f1cd58fe9d2c95eae97f089040f1a5b02c097 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 Mar 2026 13:12:25 -0700 Subject: [PATCH 6/9] PM: hibernate: x86: Remove inclusion of crypto/hash.h hibernate_64.c does not do any cryptographic hashing, so the header crypto/hash.h is not needed at all. Signed-off-by: Eric Biggers [ rjw: Subject tweak ] Link: https://patch.msgid.link/20260314201225.38822-1-ebiggers@kernel.org Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a595953f1d6d..e72d26acae79 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -14,8 +14,6 @@ #include #include -#include - #include #include #include From 2b27ea5b644d7da9bc84f4539e53d1b31c601566 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 9 Mar 2026 18:39:42 +0100 Subject: [PATCH 7/9] PM: hibernate: return -ENODATA if the snapshot image is not loaded snapshot_image_loaded() is used in both the in-kernel and the userspace restore path to ensure that the snapshot image has been completely loaded. However the latter path returns -EPERM in such situations, which is meant for cases where the operation is neither write-only nor ready. This patch updates the check so the returned error code is -ENODATA in both cases. Suggested-by: Brian Geffon Signed-off-by: Alberto Garcia Acked-by: Brian Geffon Link: https://patch.msgid.link/8cfda38659c623f5392f3458cb32504ffd556a74.1773075892.git.berto@igalia.com Signed-off-by: Rafael J. 
Wysocki --- kernel/power/user.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/power/user.c b/kernel/power/user.c index 4401cfe26e5c..be77f3556bd7 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -322,11 +322,14 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = snapshot_write_finalize(&data->handle); if (error) break; - if (data->mode != O_WRONLY || !data->frozen || - !snapshot_image_loaded(&data->handle)) { + if (data->mode != O_WRONLY || !data->frozen) { error = -EPERM; break; } + if (!snapshot_image_loaded(&data->handle)) { + error = -ENODATA; + break; + } error = hibernation_restore(data->platform_support); break; From 9d3a068cc80c9ed374cb78ab8c861668c835a6bc Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Tue, 31 Mar 2026 08:49:20 +0100 Subject: [PATCH 8/9] cpuidle: clean up dead dependencies on CPU_IDLE in Kconfig The Kconfig in the parent directory already has the first 'if CPU_IDLE' gating the inclusion of this Kconfig, meaning that the 'depends on CPU_IDLE' statements in these config options are effectively dead code. Leave the 'if CPU_IDLE...endif' condition, and remove the individual 'depends on' statements in Kconfig.mips and Kconfig.powerpc. This dead code was found by kconfirm, a static analysis tool for Kconfig. Signed-off-by: Julian Braha [ rjw: Subject and changelog edits ] Link: https://patch.msgid.link/20260331074920.41269-1-julianbraha@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/Kconfig | 2 +- drivers/cpuidle/Kconfig.mips | 2 +- drivers/cpuidle/Kconfig.powerpc | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index cac5997dca50..d6d8386d3f02 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -81,7 +81,7 @@ config HALTPOLL_CPUIDLE before halting in the guest (more efficient than polling in the host via halt_poll_ns for some scenarios). 
-endif +endif # CPU_IDLE config ARCH_NEEDS_CPU_IDLE_COUPLED def_bool n diff --git a/drivers/cpuidle/Kconfig.mips b/drivers/cpuidle/Kconfig.mips index c3c011af4a35..88728b2b4ea0 100644 --- a/drivers/cpuidle/Kconfig.mips +++ b/drivers/cpuidle/Kconfig.mips @@ -4,7 +4,7 @@ # config MIPS_CPS_CPUIDLE bool "CPU Idle driver for MIPS CPS platforms" - depends on CPU_IDLE && MIPS_CPS + depends on MIPS_CPS depends on SYS_SUPPORTS_MIPS_CPS select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT || CPU_MIPSR6 select GENERIC_CLOCKEVENTS_BROADCAST if SMP diff --git a/drivers/cpuidle/Kconfig.powerpc b/drivers/cpuidle/Kconfig.powerpc index a797a02b7b6f..1931ac8faffb 100644 --- a/drivers/cpuidle/Kconfig.powerpc +++ b/drivers/cpuidle/Kconfig.powerpc @@ -4,7 +4,6 @@ # config PSERIES_CPUIDLE bool "Cpuidle driver for pSeries platforms" - depends on CPU_IDLE depends on PPC_PSERIES default y help @@ -13,7 +12,6 @@ config PSERIES_CPUIDLE config POWERNV_CPUIDLE bool "Cpuidle driver for powernv platforms" - depends on CPU_IDLE depends on PPC_POWERNV default y help From 629be87e0d6be4c3683d3b39811804f42a78f04b Mon Sep 17 00:00:00 2001 From: Huisong Li Date: Fri, 3 Apr 2026 16:45:42 +0800 Subject: [PATCH 9/9] cpuidle: Simplify cpuidle_register_device() with guard() Use guard() macro for mutex to simplify the control flow in cpuidle_register_device(). Signed-off-by: Huisong Li Link: https://patch.msgid.link/20260403084542.708104-1-lihuisong@huawei.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c7876e9e024f..8c037db46792 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -679,16 +679,16 @@ int cpuidle_register_device(struct cpuidle_device *dev) if (!dev) return -EINVAL; - mutex_lock(&cpuidle_lock); + guard(mutex)(&cpuidle_lock); if (dev->registered) - goto out_unlock; + return ret; __cpuidle_device_init(dev); ret = __cpuidle_register_device(dev); if (ret) - goto out_unlock; + return ret; ret = cpuidle_add_sysfs(dev); if (ret) @@ -700,16 +700,14 @@ int cpuidle_register_device(struct cpuidle_device *dev) cpuidle_install_idle_handler(); -out_unlock: - mutex_unlock(&cpuidle_lock); - return ret; out_sysfs: cpuidle_remove_sysfs(dev); out_unregister: __cpuidle_unregister_device(dev); - goto out_unlock; + + return ret; } EXPORT_SYMBOL_GPL(cpuidle_register_device);