From 1440648c0feed03cfd51c7dba92a77feb34bf27b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 31 Jul 2025 07:11:54 +0900 Subject: [PATCH 01/75] hung_task: dump blocker task if it is not hung Dump the lock blocker task if it is not hung because if the blocker task is also hung, it should be dumped by the detector. This will de-duplicate the same stackdumps if the blocker task is also blocked by another task (and hung). Link: https://lkml.kernel.org/r/175391351423.688839.11917911323784986774.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Suggested-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Acked-by: Lance Yang Signed-off-by: Andrew Morton --- kernel/hung_task.c | 78 ++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8708a1205f82..b2c1f14b8129 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,9 +95,41 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; +static bool task_is_hung(struct task_struct *t, unsigned long timeout) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + unsigned int state = READ_ONCE(t->__state); + + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer + */ + if (!(state & TASK_UNINTERRUPTIBLE) || + (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) + return false; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * musn't be checked. + */ + if (unlikely(!switch_count)) + return false; + + if (switch_count != t->last_switch_count) { + t->last_switch_count = switch_count; + t->last_switch_time = jiffies; + return false; + } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return false; + + return true; +} #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER -static void debug_show_blocker(struct task_struct *task) +static void debug_show_blocker(struct task_struct *task, unsigned long timeout) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; @@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task) t->pid, rwsem_blocked_by); break; } - sched_show_task(t); + /* Avoid duplicated task dump, skip if the task is also hung. */ + if (!task_is_hung(t, timeout)) + sched_show_task(t); return; } } #else -static inline void debug_show_blocker(struct task_struct *task) +static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) { } #endif static void check_hung_task(struct task_struct *t, unsigned long timeout) { - unsigned long switch_count = t->nvcsw + t->nivcsw; - - /* - * Ensure the task is not frozen. - * Also, skip vfork and any other user process that freezer should skip. - */ - if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. - */ - if (unlikely(!switch_count)) - return; - - if (switch_count != t->last_switch_count) { - t->last_switch_count = switch_count; - t->last_switch_time = jiffies; - return; - } - if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + if (!task_is_hung(t, timeout)) return; /* @@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_blocker(t); + debug_show_blocker(t, timeout); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) @@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { - unsigned int state; if (!max_count--) goto unlock; @@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* - * skip the TASK_KILLABLE tasks -- these can be killed - * skip the TASK_IDLE tasks -- those are genuinely idle - */ - state = READ_ONCE(t->__state); - if ((state & TASK_UNINTERRUPTIBLE) && - !(state & TASK_WAKEKILL) && - !(state & TASK_NOLOAD)) - check_hung_task(t, timeout); + + check_hung_task(t, timeout); } unlock: rcu_read_unlock(); From f367474b5884edbc42661e7fecf784cb131dd25d Mon Sep 17 00:00:00 2001 From: Brian Mak Date: Tue, 5 Aug 2025 14:15:27 -0700 Subject: [PATCH 02/75] x86/kexec: carry forward the boot DTB on kexec Currently, the kexec_file_load syscall on x86 does not support passing a device tree blob to the new kernel. Some embedded x86 systems use device trees. On these systems, failing to pass a device tree to the new kernel causes a boot failure. To add support for this, we copy the behavior of ARM64 and PowerPC and copy the current boot's device tree blob for use in the new kernel. We do this on x86 by passing the device tree blob as a setup_data entry in accordance with the x86 boot protocol. This behavior is gated behind the KEXEC_FILE_FORCE_DTB flag. Link: https://lkml.kernel.org/r/20250805211527.122367-3-makb@juniper.net Signed-off-by: Brian Mak Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Betkov Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Rob Herring Cc: Saravana Kannan Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- arch/x86/kernel/kexec-bzimage64.c | 47 +++++++++++++++++++++++++++++-- include/linux/kexec.h | 5 +++- include/uapi/linux/kexec.h | 4 +++ kernel/kexec_file.c | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 24a41f0e0cf1..c3244ac680d1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_OF_FLATTREE +static void setup_dtb(struct boot_params *params, + unsigned long params_load_addr, + unsigned int dtb_setup_data_offset) +{ + struct setup_data *sd = (void *)params + dtb_setup_data_offset; + unsigned long setup_data_phys, dtb_len; + + dtb_len = fdt_totalsize(initial_boot_params); + sd->type = SETUP_DTB; + sd->len = dtb_len; + + /* Carry over current boot DTB with setup_data */ + memcpy(sd->data, initial_boot_params, dtb_len); + + /* Add setup data */ + setup_data_phys = params_load_addr + dtb_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} +#endif /* CONFIG_OF_FLATTREE */ + static void setup_ima_state(const struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct efi_setup_data); #endif +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) { + setup_dtb(params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); + } else { + pr_debug("Not carrying over DTB, force_dtb = %d\n", + image->force_dtb); + } +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { /* Setup IMA log buffer state */ setup_ima_state(image, params, params_load_addr, @@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel, sizeof(struct setup_data) + RNG_SEED_LENGTH; +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) + kbuf.bufsz += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); @@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct kho_data); - params = kzalloc(kbuf.bufsz, GFP_KERNEL); + params = kvzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ldata; out_free_params: - kfree(params); + kvfree(params); return ERR_PTR(ret); } @@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data) if (!ldata) return 0; - kfree(ldata->bootparams_buf); + kvfree(ldata->bootparams_buf); ldata->bootparams_buf = NULL; return 0; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 39fe3e6cd282..ff7e231b0485 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -395,6 +395,9 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; + + /* Force carrying over the DTB from the current boot */ + bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG @@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ - KEXEC_FILE_NO_CMA) + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 8958ebfcff94..55749cb0b81d 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -22,12 +22,16 @@ * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd * fd field. + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new + * kernel on x86. This is already the default behavior on + * some other architectures, like ARM64 and PowerPC. */ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 #define KEXEC_FILE_NO_CMA 0x00000010 +#define KEXEC_FILE_FORCE_DTB 0x00000020 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); From 06ef8b9aa25ea7342ffd604784edcfdbc8348920 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 5 Aug 2025 10:30:31 +0800 Subject: [PATCH 03/75] ref_tracker: remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Link: https://lkml.kernel.org/r/20250805023031.331718-1-rongqianfeng@vivo.com Signed-off-by: Qianfeng Rong Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/ref_tracker.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c index cce12287708e..258fb0e7abdf 100644 --- a/lib/ref_tracker.c +++ b/lib/ref_tracker.c @@ -75,7 +75,7 @@ ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; @@ -159,7 +159,7 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, return; } - sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); + sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; @@ -306,7 +306,7 @@ int ref_tracker_free(struct ref_tracker_dir *dir, } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { From c2fe368b6eb24af72708890b04e9a773c8465703 Mon Sep 17 00:00:00 2001 From: Soham Bagchi Date: Mon, 28 Jul 2025 12:43:17 -0600 Subject: [PATCH 04/75] kcov: use write memory barrier after memcpy() in kcov_move_area() KCOV Remote uses two separate memory buffers, one private to the kernel space (kcov_remote_areas) and the second one shared between user and kernel space (kcov->area). After every pair of kcov_remote_start() and kcov_remote_stop(), the coverage data collected in the kcov_remote_areas is copied to kcov->area so the user can read the collected coverage data. This memcpy() is located in kcov_move_area(). The load/store pattern on the kernel-side [1] is: ``` /* dst_area === kcov->area, dst_area[0] is where the count is stored */ dst_len = READ_ONCE(*(unsigned long *)dst_area); ... memcpy(dst_entries, src_entries, ...); ... WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); ``` And for the user [2]: ``` /* cover is equivalent to kcov->area */ n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); ``` Without a write-memory barrier, the atomic load for the user can potentially read fresh values of the count stored at cover[0], but continue to read stale coverage data from the buffer itself. Hence, we recommend adding a write-memory barrier between the memcpy() and the WRITE_ONCE() in kcov_move_area(). Link: https://lkml.kernel.org/r/20250728184318.1839137-1-soham.bagchi@utah.edu Link: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/kernel/kcov.c?h=master#n978 [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/Documentation/dev-tools/kcov.rst#n364 [2] Signed-off-by: Soham Bagchi Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Jonathan Corbet Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- kernel/kcov.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/kcov.c b/kernel/kcov.c index 1d85597057e1..6563141f5de9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, memcpy(dst_entries, src_entries, bytes_to_move); entries_moved = bytes_to_move >> entry_size_log; + /* + * A write memory barrier is required here, to ensure + * that the writes from the memcpy() are visible before + * the count is updated. Without this, it is possible for + * a user to observe a new count value but stale + * coverage data. + */ + smp_wmb(); + switch (mode) { case KCOV_MODE_TRACE_PC: WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); From 1455b6ac210d5c094066264e7a6809c3a6a9a4d2 Mon Sep 17 00:00:00 2001 From: Soham Bagchi Date: Mon, 28 Jul 2025 12:43:18 -0600 Subject: [PATCH 05/75] kcov: load acquire coverage count in user-space code Update the KCOV documentation to use a load-acquire operation for the first element of the shared memory buffer between kernel-space and user-space. The load-acquire pairs with the write memory barrier used in kcov_move_area(). [soham.bagchi@utah.edu: v2] Link: https://lkml.kernel.org/r/20250803180558.2967962-1-soham.bagchi@utah.edu Link: https://lkml.kernel.org/r/20250728184318.1839137-2-soham.bagchi@utah.edu Signed-off-by: Soham Bagchi Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Jonathan Corbet Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- Documentation/dev-tools/kcov.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 6611434e2dd2..8127849d40f5 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -361,7 +361,12 @@ local tasks spawned by the process and the global task that handles USB bus #1: */ sleep(2); - n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + /* + * The load to the coverage count should be an acquire to pair with + * pair with the corresponding write memory barrier (smp_wmb()) on + * the kernel-side in kcov_move_area(). + */ + n = __atomic_load_n(&cover[0], __ATOMIC_ACQUIRE); for (i = 0; i < n; i++) printf("0x%lx\n", cover[i + 1]); if (ioctl(fd, KCOV_DISABLE, 0)) From 8f09142e4f9bacb7f3b9f41c864ef3eb2cfa27df Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 14 Jul 2025 10:17:08 +0200 Subject: [PATCH 06/75] idr test suite: remove usage of the deprecated ida_simple_xx() API Patch series "ida: Remove the ida_simple_xxx() API", v3. These are the final steps in removing the ida_simple_xxx() API. This series was last proposed in August 2024. Since then, some users of the old API have be re-introduced and then removed. A first time in drivers/misc/rpmb-core.c, added in commit 1e9046e3a154 ("rpmb: add Replay Protected Memory Block (RPMB) subsystem") (2024-08-26) and removed in commit dfc881abca42 ("rpmb: Remove usage of the deprecated ida_simple_xx() API") (2024-10-13). A second time in drivers/gpio/gpio-mpsse.c, added in commit c46a74ff05c0 ("gpio: add support for FTDI's MPSSE as GPIO") (2024-10-14) and removed in commit f57c08492866 (gpio: mpsse: Remove usage of the deprecated ida_simple_xx() API) (2024-11-22). Since then, I've not spotted any new usage. So things being stable now, it's time to end this story once and for all. This patch (of 3): ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, but the one of ida_alloc_range()/ida_alloc_max() is inclusive. But because of the ranges used for the tests, there is no need to adjust them. While at it remove some useless {}. Link: https://lkml.kernel.org/r/cover.1752480043.git.christophe.jaillet@wanadoo.fr Link: https://lkml.kernel.org/r/2904fa2006e4fe58eea63aef87fa7f832c7804a1.1752480043.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- tools/testing/radix-tree/idr-test.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index 84b8c3c92c79..2f830ff8396c 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c @@ -499,19 +499,17 @@ void ida_check_random(void) goto repeat; } -void ida_simple_get_remove_test(void) +void ida_alloc_free_test(void) { DEFINE_IDA(ida); unsigned long i; - for (i = 0; i < 10000; i++) { - assert(ida_simple_get(&ida, 0, 20000, GFP_KERNEL) == i); - } - assert(ida_simple_get(&ida, 5, 30, GFP_KERNEL) < 0); + for (i = 0; i < 10000; i++) + assert(ida_alloc_max(&ida, 20000, GFP_KERNEL) == i); + assert(ida_alloc_range(&ida, 5, 30, GFP_KERNEL) < 0); - for (i = 0; i < 10000; i++) { - ida_simple_remove(&ida, i); - } + for (i = 0; i < 10000; i++) + ida_free(&ida, i); assert(ida_is_empty(&ida)); ida_destroy(&ida); @@ -524,7 +522,7 @@ void user_ida_checks(void) ida_check_nomem(); ida_check_conv_user(); ida_check_random(); - ida_simple_get_remove_test(); + ida_alloc_free_test(); radix_tree_cpu_dead(1); } From c8a09fc9664f79eeb66cdf4a2a34d5b6a239b727 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 14 Jul 2025 10:17:09 +0200 Subject: [PATCH 07/75] ida: remove the ida_simple_xxx() API All users of the ida_simple_xxx() have been converted. In Linux 6.11-rc2, the only callers are in tools/testing/. So it is now time to remove the definition of this old and deprecated ida_simple_get() and ida_simple_remove(). Link: https://lkml.kernel.org/r/aa205f45fef70a9c948b6a98bad06da58e4de776.1752480043.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/idr.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 2267902d29a7..789e23e67444 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -334,14 +334,6 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } -/* - * ida_simple_get() and ida_simple_remove() are deprecated. Use - * ida_alloc() and ida_free() instead respectively. - */ -#define ida_simple_get(ida, start, end, gfp) \ - ida_alloc_range(ida, start, (end) - 1, gfp) -#define ida_simple_remove(ida, id) ida_free(ida, id) - static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); From baa96bcb180e979a821ce3ade87a3d2349b2d640 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 14 Jul 2025 10:17:10 +0200 Subject: [PATCH 08/75] nvmem: update a comment related to struct nvmem_config Update a comment to match the function used in nvmem_register(). ida_simple_get() was replaced by ida_alloc() in commit 1eb51d6a4fce ("nvmem: switch to simpler IDA interface") Link: https://lkml.kernel.org/r/27a9dec93a9f79140b11a77df38b1b45bd342e09.1752480043.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/nvmem-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 615a560d9edb..f3b13da78aac 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -103,7 +103,7 @@ struct nvmem_cell_info { * * Note: A default "nvmem" name will be assigned to the device if * no name is specified in its configuration. In such case "" is - * generated with ida_simple_get() and provided id field is ignored. + * generated with ida_alloc() and provided id field is ignored. * * Note: Specifying name and setting id to -1 implies a unique device * whose name is provided as-is (kept unaltered). From ca78a04ce5c42b00addc063a7610a5ae12dafc39 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 11 Aug 2025 16:27:38 +0800 Subject: [PATCH 09/75] lib/digsig: remove unnecessary memset kzalloc() has already been initialized to full 0 space, there is no need to use memset() to initialize again. Link: https://lkml.kernel.org/r/20250811082739.378284-1-liaoyuanhong@vivo.com Signed-off-by: Liao Yuanhong Signed-off-by: Andrew Morton --- lib/digsig.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/digsig.c b/lib/digsig.c index 04b5e55ed95f..2b36f9cc91e9 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -159,7 +159,6 @@ static int digsig_verify_rsa(struct key *key, len = mlen; head = len - l; - memset(out1, 0, head); memcpy(out1 + head, p, l); kfree(p); From e416f0ed3c500c05c55fb62ee62662717b1c7f71 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 21 Jul 2025 18:13:43 +0800 Subject: [PATCH 10/75] init: handle bootloader identifier in kernel parameters BootLoaders (Grub, LILO, etc) may pass an identifier such as "BOOT_IMAGE= /boot/vmlinuz-x.y.z" to kernel parameters. But these identifiers are not recognized by the kernel itself so will be passed to userspace. However user space init program also don't recognize it. KEXEC/KDUMP (kexec-tools) may also pass an identifier such as "kexec" on some architectures. We cannot change BootLoader's behavior, because this behavior exists for many years, and there are already user space programs search BOOT_IMAGE= in /proc/cmdline to obtain the kernel image locations: https://github.com/linuxdeepin/deepin-ab-recovery/blob/master/util.go (search getBootOptions) https://github.com/linuxdeepin/deepin-ab-recovery/blob/master/main.go (search getKernelReleaseWithBootOption) So the the best way is handle (ignore) it by the kernel itself, which can avoid such boot warnings (if we use something like init=/bin/bash, bootloader identifier can even cause a crash): Kernel command line: BOOT_IMAGE=(hd0,1)/vmlinuz-6.x root=/dev/sda3 ro console=tty Unknown kernel command line parameters "BOOT_IMAGE=(hd0,1)/vmlinuz-6.x", will be passed to user space. [chenhuacai@loongson.cn: use strstarts()] Link: https://lkml.kernel.org/r/20250815090120.1569947-1-chenhuacai@loongson.cn Link: https://lkml.kernel.org/r/20250721101343.3283480-1-chenhuacai@loongson.cn Signed-off-by: Huacai Chen Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Signed-off-by: Andrew Morton --- init/main.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c..9b5150166bcf 100644 --- a/init/main.c +++ b/init/main.c @@ -544,6 +544,12 @@ static int __init unknown_bootoption(char *param, char *val, const char *unused, void *arg) { size_t len = strlen(param); + /* + * Well-known bootloader identifiers: + * 1. LILO/Grub pass "BOOT_IMAGE=..."; + * 2. kexec/kdump (kexec-tools) pass "kexec". + */ + const char *bootloader[] = { "BOOT_IMAGE=", "kexec", NULL }; /* Handle params aliased to sysctls */ if (sysctl_is_alias(param)) @@ -551,6 +557,12 @@ static int __init unknown_bootoption(char *param, char *val, repair_env_string(param, val); + /* Handle bootloader identifier */ + for (int i = 0; bootloader[i]; i++) { + if (strstarts(param, bootloader[i])) + return 0; + } + /* Handle obsolete-style parameters */ if (obsolete_checksetup(param)) return 0; From a6cf527e66b8ea7b3119a354a873ebfa57ea225e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 9 Aug 2025 21:31:10 -0700 Subject: [PATCH 11/75] checkpatch: allow http links of any length in commit logs Dave Gilbert noticed that checkpatch warns about URL links over 75 chars in length in commit logs. Fix that. Link: https://lkml.kernel.org/r/3529faaf84a5a9a96c5c0ec4183ae0ba6e97673c.camel@perches.com Signed-off-by: Joe Perches Cc: Dave Gilbert Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e722dd6fa8ef..319cc5f85885 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3294,7 +3294,7 @@ sub process { # file delta changes $line =~ /^\s*(?:[\w\.\-\+]*\/)++[\w\.\-\+]+:/ || # filename then : - $line =~ /^\s*(?:Fixes:|$link_tags_search|$signature_tags)/i || + $line =~ /^\s*(?:Fixes:|https?:|$link_tags_search|$signature_tags)/i || # A Fixes:, link or signature tag line $commit_log_possible_stack_dump)) { WARN("COMMIT_LOG_LONG_LINE", From 6c0022d6dc341668d704b1490674a21805ef645b Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 12 Aug 2025 16:40:45 +0800 Subject: [PATCH 12/75] lib/fault-inject-usercopy.c: use PTR_ERR_OR_ZERO() to simplify code Use the standard error pointer macro to shorten the code and simplify. Link: https://lkml.kernel.org/r/20250812084045.64218-1-zhao.xichao@vivo.com Signed-off-by: Xichao Zhao Signed-off-by: Andrew Morton --- lib/fault-inject-usercopy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c index 77558b6c29ca..75403ec50f49 100644 --- a/lib/fault-inject-usercopy.c +++ b/lib/fault-inject-usercopy.c @@ -22,10 +22,8 @@ static int __init fail_usercopy_debugfs(void) dir = fault_create_debugfs_attr("fail_usercopy", NULL, &fail_usercopy.attr); - if (IS_ERR(dir)) - return PTR_ERR(dir); - return 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_usercopy_debugfs); From bc107a619f029415e0a87e6df16f995553da4568 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 13 Aug 2025 16:17:43 +0900 Subject: [PATCH 13/75] squashfs: verify inode mode when loading from disk The inode mode loaded from corrupted disk might by error contain the file type bits. Since the file type bits are set by squashfs_read_inode() using bitwise OR, the file type bits must not be set by squashfs_new_inode() from squashfs_read_inode(); otherwise, an invalid file type bits later confuses may_open(). Link: https://lkml.kernel.org/r/f63d8d11-2254-4fc3-9292-9a43a93b374e@I-love.SAKURA.ne.jp Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d Signed-off-by: Tetsuo Handa Reviewed-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index d5918eba27e3..dee8fa016930 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -68,6 +68,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; + /* File type must not be set at this moment, for it will later be set by the caller. */ + if (inode->i_mode & S_IFMT) + err = -EIO; + return err; } From 9a0ee378eb6ca39513dcfeffabd10037c2c9f2a0 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 14 Aug 2025 13:38:10 +0200 Subject: [PATCH 14/75] ocfs2: remove commented out mlog() statements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mlog() statements have been commented out ever since commit 6714d8e86bf44 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem") - remove them. Link: https://lkml.kernel.org/r/20250814113815.219064-1-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Mark Tinguely Reviewed-by: Mark Fasheh Cc: Ahelenia Ziemiańska Cc: Dmitry Antipov Cc: Joel Becker Cc: Joseph Qi Cc: Thorsten Blum Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmmaster.c | 11 ----------- fs/ocfs2/dlm/dlmrecovery.c | 1 - 2 files changed, 12 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 86bb1a03bcc1..4145e06d2c08 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1477,7 +1477,6 @@ way_up_top: goto send_response; } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { spin_unlock(&res->spinlock); - // mlog(0, "node %u is the master\n", res->owner); response = DLM_MASTER_RESP_NO; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1493,7 +1492,6 @@ way_up_top: BUG(); } - // mlog(0, "lockres is in progress...\n"); spin_lock(&dlm->master_lock); found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { @@ -1503,8 +1501,6 @@ way_up_top: set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->type == DLM_MLE_BLOCK) { - // mlog(0, "this node is waiting for " - // "lockres to be mastered\n"); response = DLM_MASTER_RESP_NO; } else if (tmpmle->type == DLM_MLE_MIGRATION) { mlog(0, "node %u is master, but trying to migrate to " @@ -1531,8 +1527,6 @@ way_up_top: } else response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "this node is attempting to " - // "master lockres\n"); response = DLM_MASTER_RESP_MAYBE; } if (set_maybe) @@ -1559,7 +1553,6 @@ way_up_top: found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { /* this lockid has never been seen on this node yet */ - // mlog(0, "no mle found\n"); if (!mle) { spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -1573,8 +1566,6 @@ way_up_top: goto way_up_top; } - // mlog(0, "this is second time thru, already allocated, " - // "add the block.\n"); dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); __dlm_insert_mle(dlm, mle); @@ -1897,8 +1888,6 @@ ok: spin_unlock(&res->spinlock); } - // mlog(0, "woo! got an assert_master from node %u!\n", - // assert->node_idx); if (mle) { int extra_ref = 0; int nn = -1; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 00f52812dbb0..843ee02bd85f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -464,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { - // mlog(0, "nothing to recover! sleeping now!\n"); spin_unlock(&dlm->spinlock); /* return to main thread loop and sleep. */ return 0; From 7da017eaa43a215f68d42ebfd08d381134ea6356 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Thu, 14 Aug 2025 17:50:33 +0800 Subject: [PATCH 15/75] test_firmware: use str_true_false() helper Replace ternary (condition ? "true" : "false") expressions with the str_true_false() helper from string_choices.h. This improves readability by replacing the three-operand ternary with a single function call, ensures consistent string output, and allows potential string deduplication by the linker, resulting in a slightly smaller binary. Link: https://lkml.kernel.org/r/20250814095033.244034-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Kuan-Wei Chiu Signed-off-by: Andrew Morton --- lib/test_firmware.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 211222e63328..be4f93124901 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -26,6 +26,7 @@ #include #include #include +#include MODULE_IMPORT_NS("TEST_FIRMWARE"); @@ -304,17 +305,17 @@ static ssize_t config_show(struct device *dev, "FW_ACTION_NOUEVENT"); len += scnprintf(buf + len, PAGE_SIZE - len, "into_buf:\t\t%s\n", - test_fw_config->into_buf ? "true" : "false"); + str_true_false(test_fw_config->into_buf)); len += scnprintf(buf + len, PAGE_SIZE - len, "buf_size:\t%zu\n", test_fw_config->buf_size); len += scnprintf(buf + len, PAGE_SIZE - len, "file_offset:\t%zu\n", test_fw_config->file_offset); len += scnprintf(buf + len, PAGE_SIZE - len, "partial:\t\t%s\n", - test_fw_config->partial ? "true" : "false"); + str_true_false(test_fw_config->partial)); len += scnprintf(buf + len, PAGE_SIZE - len, "sync_direct:\t\t%s\n", - test_fw_config->sync_direct ? "true" : "false"); + str_true_false(test_fw_config->sync_direct)); len += scnprintf(buf + len, PAGE_SIZE - len, "read_fw_idx:\t%u\n", test_fw_config->read_fw_idx); if (test_fw_config->upload_name) From 0ca863b7c638803722dc5596b2d520812cf283b7 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Thu, 14 Aug 2025 17:38:27 +0800 Subject: [PATCH 16/75] alloc_tag: use str_on_off() helper Replace the ternary (enable ? "on" : "off") with the str_on_off() helper from string_choices.h. This improves readability by replacing the three-operand ternary with a single function call, ensures consistent string output, and allows potential string deduplication by the linker, resulting in a slightly smaller binary. Link: https://lkml.kernel.org/r/20250814093827.237980-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Kent Overstreet Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index e9b33848700a..d4449205eca2 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -726,7 +727,7 @@ static int __init setup_early_mem_profiling(char *str) } mem_profiling_support = true; pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n", - compressed ? "with" : "without", enable ? "on" : "off"); + compressed ? "with" : "without", str_on_off(enable)); } if (enable != mem_alloc_profiling_enabled()) { From 41f88ddfd453fe894678e1f6909b9fb9e08e8c3d Mon Sep 17 00:00:00 2001 From: ZhenguoYao Date: Tue, 12 Aug 2025 15:41:32 +0800 Subject: [PATCH 17/75] watchdog/softlockup: fix wrong output when watchdog_thresh < 3 When watchdog_thresh is below 3, sample_period will be less than 1 second. So the following output will print when softlockup: CPU#3 Utilization every 0s during lockup Fix this by changing time unit from seconds to milliseconds. Link: https://lkml.kernel.org/r/20250812074132.27810-1-yaozhenguo@jd.com Signed-off-by: ZhenguoYao Cc: Bitao Hu Cc: Li Huafei Cc: Max Kellermann Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- kernel/watchdog.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 80b56c002c7f..9c7134f7d2c4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -455,17 +455,17 @@ static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); - u64 sample_period_second = sample_period; + u64 sample_period_msecond = sample_period; - do_div(sample_period_second, NSEC_PER_SEC); + do_div(sample_period_msecond, NSEC_PER_MSEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ - printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", - smp_processor_id(), sample_period_second); + printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n", + smp_processor_id(), sample_period_msecond); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; From 95f091274f3db39493e8b5c44671b9f1e02c0c25 Mon Sep 17 00:00:00 2001 From: ZhenguoYao Date: Tue, 12 Aug 2025 16:25:10 +0800 Subject: [PATCH 18/75] watchdog/softlockup: fix incorrect CPU utilization output during softlockup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we use 16-bit precision, the raw data will undergo integer division, which may sometimes result in data loss. This can lead to slightly inaccurate CPU utilization calculations. Under normal circumstances, this isn't an issue. However, when CPU utilization reaches 100%, the calculated result might exceed 100%. For example, with raw data like the following: sample_period 400000134 new_stat 83648414036 old_stat 83247417494 sample_period=400000134/2^24=23 new_stat=83648414036/2^24=4985 old_stat=83247417494/2^24=4961 util=105% Below log will output: CPU#3 Utilization every 0s during lockup: #1: 0% system, 0% softirq, 105% hardirq, 0% idle #2: 0% system, 0% softirq, 105% hardirq, 0% idle #3: 0% system, 0% softirq, 100% hardirq, 0% idle #4: 0% system, 0% softirq, 105% hardirq, 0% idle #5: 0% system, 0% softirq, 105% hardirq, 0% idle To avoid confusion, we enforce a 100% display cap when calculations exceed this threshold. We also round to the nearest multiple of 16.8 milliseconds to improve the accuracy. [yaozhenguo1@gmail.com: make get_16bit_precision() more accurate, fix comment layout] Link: https://lkml.kernel.org/r/20250818081438.40540-1-yaozhenguo@jd.com Link: https://lkml.kernel.org/r/20250812082510.32291-1-yaozhenguo@jd.com Signed-off-by: ZhenguoYao Cc: Bitao Hu Cc: Li Huafei Cc: Max Kellermann Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- kernel/watchdog.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9c7134f7d2c4..5413aa85e8a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail); */ static u16 get_16bit_precision(u64 data_ns) { - return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ + /* + * 2^24ns ~= 16.8ms + * Round to the nearest multiple of 16.8 milliseconds. + */ + return (data_ns + (1 << 23)) >> 24LL; } static void update_cpustat(void) @@ -444,6 +448,14 @@ static void update_cpustat(void) old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); + /* + * Since we use 16-bit precision, the raw data will undergo + * integer division, which may sometimes result in data loss, + * and then result might exceed 100%. To avoid confusion, + * we enforce a 100% display cap when calculations exceed this threshold. + */ + if (util > 100) + util = 100; __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } From 228bf041a7fdf8c94dfc57df1e2e918fefce1a25 Mon Sep 17 00:00:00 2001 From: zhoumin Date: Mon, 18 Aug 2025 22:44:49 +0800 Subject: [PATCH 19/75] vfat: remove unused variable Remove unused variable definition and related function definition and redundant variable assignments within functions. Link: https://lkml.kernel.org/r/tencent_9DE7CC9367096503F6ADD2BD960079267406@qq.com Signed-off-by: zhoumin Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/dir.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index acbec5bdd521..92b091783966 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1209,7 +1209,7 @@ EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, int *nr_cluster, struct msdos_dir_entry **de, - struct buffer_head **bh, loff_t *i_pos) + struct buffer_head **bh) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -1269,7 +1269,6 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, get_bh(bhs[n]); *bh = bhs[n]; *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); - *i_pos = fat_make_i_pos(sb, *bh, *de); /* Second stage: clear the rest of cluster, and write outs */ err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE); @@ -1298,7 +1297,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ struct msdos_dir_entry *de; int err, free_slots, i, nr_bhs; - loff_t pos, i_pos; + loff_t pos; sinfo->nr_slots = nr_slots; @@ -1386,7 +1385,7 @@ found: * add the cluster to dir. */ cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster, - &de, &bh, &i_pos); + &de, &bh); if (cluster < 0) { err = cluster; goto error_remove; From 6c609f36398adfe0de712a9025cc1065074b9abc Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 18 Aug 2025 20:35:28 +0800 Subject: [PATCH 20/75] x86/crash: remove redundant 0 value initialization The crash_mem struct is already zeroed by vzalloc(). It's redundant to initialize cmem->nr_ranges to 0. Link: https://lkml.kernel.org/r/20250818123530.635234-1-liaoyuanhong@vivo.com Signed-off-by: Liao Yuanhong Cc: Baoquan He Cc: Borislav Betkov Cc: Brian Gerst Cc: Coiby Xu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Bohac Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- arch/x86/kernel/crash.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c6b12bed173d..60bb24ddd36f 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -172,7 +172,6 @@ static struct crash_mem *fill_up_crash_elf_data(void) return NULL; cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; return cmem; } @@ -332,7 +331,6 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) return -ENOMEM; cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; From b1e34412998d628dfa8ba3da042bb60dee232b6c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 19 Aug 2025 21:19:17 +0300 Subject: [PATCH 21/75] proc: test lseek on /proc/net/dev This line in tools/testing/selftests/proc/read.c was added to catch oopses, not to verify lseek correctness: (void)lseek(fd, 0, SEEK_SET); Oh, well. Prevent more embarassement with simple test. Link: https://lkml.kernel.org/r/aKTCfMuRXOpjBXxI@p183 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + .../selftests/proc/proc-net-dev-lseek.c | 68 +++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-net-dev-lseek.c diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 19bb333e2485..243f4537a670 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -7,6 +7,7 @@ /proc-loadavg-001 /proc-maps-race /proc-multiple-procfs +/proc-net-dev-lseek /proc-empty-vm /proc-pid-vm /proc-self-map-files-001 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 50aba102201a..2a9547630115 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -10,6 +10,7 @@ TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-2-is-kthread TEST_GEN_PROGS += proc-loadavg-001 TEST_GEN_PROGS += proc-maps-race +TEST_GEN_PROGS += proc-net-dev-lseek TEST_GEN_PROGS += proc-empty-vm TEST_GEN_PROGS += proc-pid-vm TEST_GEN_PROGS += proc-self-map-files-001 diff --git a/tools/testing/selftests/proc/proc-net-dev-lseek.c b/tools/testing/selftests/proc/proc-net-dev-lseek.c new file mode 100644 index 000000000000..742a3e804451 --- /dev/null +++ b/tools/testing/selftests/proc/proc-net-dev-lseek.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2025 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#undef _GNU_SOURCE +#define _GNU_SOURCE +#undef NDEBUG +#include +#include +#include +#include +#include +#include +/* + * Test that lseek("/proc/net/dev/", 0, SEEK_SET) + * a) works, + * b) does what you think it does. + */ +int main(void) +{ + /* /proc/net/dev output is deterministic in fresh netns only. */ + if (unshare(CLONE_NEWNET) == -1) { + if (errno == ENOSYS || errno == EPERM) { + return 4; + } + return 1; + } + + const int fd = open("/proc/net/dev", O_RDONLY); + assert(fd >= 0); + + char buf1[4096]; + const ssize_t rv1 = read(fd, buf1, sizeof(buf1)); + /* + * Not "<=", this file can't be empty: + * there is header, "lo" interface with some zeroes. + */ + assert(0 < rv1); + assert(rv1 <= sizeof(buf1)); + + /* Believe it or not, this line broke one day. */ + assert(lseek(fd, 0, SEEK_SET) == 0); + + char buf2[4096]; + const ssize_t rv2 = read(fd, buf2, sizeof(buf2)); + /* Not "<=", see above. */ + assert(0 < rv2); + assert(rv2 <= sizeof(buf2)); + + /* Test that lseek rewinds to the beginning of the file. */ + assert(rv1 == rv2); + assert(memcmp(buf1, buf2, rv1) == 0); + + /* Contents of the file is not validated: this test is about lseek(). */ + + return 0; +} From 2a8c51bc9391ff3c701f06ae1b678419f843dc1a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Aug 2025 00:55:07 -0700 Subject: [PATCH 22/75] list.h: add missing kernel-doc for basic macros kernel-doc for the basic LIST_HEAD() and LIST_HEAD_INIT() macros has been missing forever (i.e., since git). Add them for completeness. Link: https://lkml.kernel.org/r/20250819075507.113639-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Nicolas Frattaroli Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/list.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/list.h b/include/linux/list.h index e7e28afd28f8..ca63bdea6c1a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -20,8 +20,16 @@ * using the generic single-entry routines. */ +/** + * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself + * @name: name of the list_head + */ #define LIST_HEAD_INIT(name) { &(name), &(name) } +/** + * LIST_HEAD - definition of a &struct list_head with initialization values + * @name: name of the list_head + */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) From b32730e68d326bef5c081c4b7cdd275c45b1902b Mon Sep 17 00:00:00 2001 From: Tio Zhang Date: Wed, 20 Aug 2025 18:18:46 +0800 Subject: [PATCH 23/75] fork: remove #ifdef CONFIG_LOCKDEP in copy_process() lockdep_init_task() is defined as an empty when CONFIG_LOCKDEP is not set. So the #ifdef here is redundant, remove it. Link: https://lkml.kernel.org/r/20250820101826.GA2484@didi-ThinkCentre-M930t-N000 Signed-off-by: Tio Zhang Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index af673856499d..e06cfaa85a84 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2124,9 +2124,7 @@ __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; -#ifdef CONFIG_LOCKDEP lockdep_init_task(p); -#endif p->blocked_on = NULL; /* not blocked yet */ From f7071db2fe3d20991a35043b32012e1b37d32cc0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2025 18:39:46 +0200 Subject: [PATCH 24/75] fork: kill the pointless lower_32_bits() in create_io_thread(), kernel_thread(), and user_mode_thread() Unlike sys_clone(), these helpers have only in kernel users which should pass the correct "flags" argument. lower_32_bits(flags) just adds the unnecessary confusion and doesn't allow to use the CLONE_ flags which don't fit into 32 bits. create_io_thread() looks especially confusing because: - "flags" is a compile-time constant, so lower_32_bits() simply has no effect - .exit_signal = (lower_32_bits(flags) & CSIGNAL) is harmless but doesn't look right, copy_process(CLONE_THREAD) will ignore this argument anyway. None of these helpers actually need CLONE_UNTRACED or "& ~CSIGNAL", but their presence does not add any confusion and improves code clarity. Link: https://lkml.kernel.org/r/20250820163946.GA18549@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Jens Axboe Cc: Christian Brauner Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/fork.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index e06cfaa85a84..a8674ba2b33b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2537,11 +2537,9 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| - CLONE_IO; + CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, @@ -2653,9 +2651,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, @@ -2671,9 +2668,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, }; From 171c04728993bac01666994cacaf4d840be97801 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Aug 2025 12:41:51 +0300 Subject: [PATCH 25/75] ocfs2: remove unnecessary NULL check in ocfs2_grab_folios() Smatch complains that checking "folios" for NULL doesn't make sense because it has already been dereferenced at this point. Really passing a NULL "folios" pointer to ocfs2_grab_folios() doesn't make sense, and fortunately none of the callers do that. Delete the unnecessary NULL check. Link: https://lkml.kernel.org/r/aKRG39hyvDJcN2G7@stanley.mountain Signed-off-by: Dan Carpenter Acked-by: Joseph Qi Reviewed-by: Mark Tinguely Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 821cb7874685..162711cc5b20 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6928,8 +6928,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, out: if (ret != 0) { - if (folios) - ocfs2_unlock_and_free_folios(folios, numfolios); + ocfs2_unlock_and_free_folios(folios, numfolios); numfolios = 0; } From 493977de1469b9898e178c9aab5fd498c4083da7 Mon Sep 17 00:00:00 2001 From: yili Date: Thu, 7 Aug 2025 23:57:49 +0800 Subject: [PATCH 26/75] ocfs2: fix super block reserved field offset comment The offset annotation for s_reserved2 in struct ocfs2_super_block was incorrect. After the preceding fields: - s_xattr_inline_size (2 bytes at 0xB8) - s_reserved0 (2 bytes at 0xBA) - s_dx_seed[3] (12 bytes at 0xBC) The actual offset of s_reserved2 is at 0xC8, when calculating from the start of the structure. Correct the offset comment from C0 to C8 to reflect the proper location in the super block structure. Link: https://lkml.kernel.org/r/20250807155749.164481-1-yili@winhong.com Signed-off-by: yili Reviewed-by: Joel Becker Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e8e94599e907..ae0e44e5f2ad 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -614,7 +614,7 @@ struct ocfs2_super_block { __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. */ -/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* From 13818f7b8c85c89aa97a430f8116490c1b833470 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 25 Aug 2025 20:33:05 +0800 Subject: [PATCH 27/75] kexec_core: remove redundant 0 value initialization The kimage struct is already zeroed by kzalloc(). It's redundant to initialize image->head to 0. Link: https://lkml.kernel.org/r/20250825123307.306634-1-liaoyuanhong@vivo.com Signed-off-by: Liao Yuanhong Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 31203f0bacaf..fa00b239c5d9 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void) if (!image) return NULL; - image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ From 31cf021b6161ac63c2ab8c64415cca0e3e136636 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 25 Aug 2025 10:56:58 +0800 Subject: [PATCH 28/75] lib/sys_info: handle sys_info_mask==0 case Generalization of panic_print's dump function [1] has been merged, and this patchset is to address some remaining issues, like adding note of the obsoletion of 'panic_print' cmdline parameter, refining the kernel document for panic_print, and hardening some string management. This patch (of 4): It is a normal case that bitmask parameter is 0, so pre-initialize the names[] to null string to cover this case. Also remove the superfluous "+1" in names[sizeof(sys_info_avail) + 1], which is needed for 'strlen()', but not for 'sizeof()'. Link: https://lkml.kernel.org/r/20250825025701.81921-1-feng.tang@linux.alibaba.com Link: https://lkml.kernel.org/r/20250825025701.81921-2-feng.tang@linux.alibaba.com Link: Link: https://lkml.kernel.org/r/20250703021004.42328-1-feng.tang@linux.alibaba.com [1] Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Askar Safin Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- lib/sys_info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/sys_info.c b/lib/sys_info.c index 5bf503fd7ec1..496f9151c9b6 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -55,7 +55,7 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - char names[sizeof(sys_info_avail) + 1]; + char names[sizeof(sys_info_avail)]; struct ctl_table table; unsigned long *si_bits_global; @@ -81,6 +81,7 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, char *delim = ""; int i, len = 0; + names[0] = '\0'; for (i = 0; i < ARRAY_SIZE(si_names); i++) { if (*si_bits_global & si_names[i].bit) { len += scnprintf(names + len, sizeof(names) - len, From 8c2b91fbb0078b0c33d9031ebbbced04a351ecfe Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 25 Aug 2025 10:56:59 +0800 Subject: [PATCH 29/75] panic: refine the document for 'panic_print' User reported current document about SYS_INFO_PANIC_CONSOLE_REPLAY is confusing, that people could expect all user space console messages to be replayed. Specify that only 'kernel' messages will be replayed to solve the confusion. Link: https://lkml.kernel.org/r/20250825025701.81921-3-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Reported-by: Askar Safin Reviewed-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/sysctl/kernel.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf494..86f395f2933b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4589,7 +4589,7 @@ bit 2: print timer info bit 3: print locks info if CONFIG_LOCKDEP is on bit 4: print ftrace buffer - bit 5: replay all messages on consoles at the end of panic + bit 5: replay all kernel messages on consoles at the end of panic bit 6: print all CPUs backtrace (if available in the arch) bit 7: print only tasks in uninterruptible (blocked) state *Be aware* that this option may print a _lot_ of lines, diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 8b49eab937d0..f3ee807b5d8b 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -890,7 +890,7 @@ bit 1 print system memory info bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer -bit 5 replay all messages on consoles at the end of panic +bit 5 replay all kernel messages on consoles at the end of panic bit 6 print all CPUs backtrace (if available in the arch) bit 7 print only tasks in uninterruptible (blocked) state ===== ============================================ From 2683df6539cbc3f0eeeba11154bc0cbf042a5cee Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 25 Aug 2025 10:57:00 +0800 Subject: [PATCH 30/75] panic: add note that 'panic_print' parameter is deprecated Just like for 'panic_print's systcl interface, add similar note for setup of kernel cmdline parameter and parameter under /sys/module/kernel/. Also add __core_param_cb() macro, which enables to add special get/set operation for a kernel parameter. Link: https://lkml.kernel.org/r/20250825025701.81921-4-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Askar Safin Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/moduleparam.h | 13 +++++++++++++ kernel/panic.c | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 3a25122d83e2..6907aedc4f74 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -349,6 +349,19 @@ static inline void kernel_param_unlock(struct module *mod) __module_param_call("", name, ¶m_ops_##type, &var, perm, \ -1, KERNEL_PARAM_FL_UNSAFE) +/** + * __core_param_cb - similar like core_param, with a set/get ops instead of type. + * @name: the name of the cmdline and sysfs parameter (often the same as var) + * @var: the variable + * @ops: the set & get operations for this parameter. + * @perm: visibility in sysfs + * + * Ideally this should be called 'core_param_cb', but the name has been + * used for module core parameter, so add the '__' prefix + */ +#define __core_param_cb(name, ops, arg, perm) \ + __module_param_call("", name, ops, arg, perm, -1, 0) + #endif /* !MODULE */ /** diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..12a10e17ab4a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -937,12 +937,29 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); core_param(panic_console_replay, panic_console_replay, bool, 0644); +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); + static int __init oops_setup(char *s) { if (!s) From e40d2014b2ccaf0f1a49ba0d0cfb59ac2a36cc6e Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 25 Aug 2025 10:57:01 +0800 Subject: [PATCH 31/75] panic: clean up message about deprecated 'panic_print' parameter Remove duplication of the message about deprecated 'panic_print' parameter. Also make the wording more direct. Make it clear that the new parameters already exist and should be used instead. Link: https://lkml.kernel.org/r/20250825025701.81921-5-feng.tang@linux.alibaba.com Signed-off-by: Petr Mladek Signed-off-by: Feng Tang Reviewed-by: Lance Yang Tested-by: Lance Yang Reviewed-by: Feng Tang Cc: Askar Safin Cc: John Ogness Cc: Jonathan Corbet Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/panic.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 12a10e17ab4a..24bca263f896 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -77,6 +77,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +static void panic_print_deprecated(void) +{ + pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n"); +} + #ifdef CONFIG_SYSCTL /* @@ -125,7 +130,7 @@ static int proc_taint(const struct ctl_table *table, int write, static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -944,13 +949,13 @@ core_param(panic_console_replay, panic_console_replay, bool, 0644); static int panic_print_set(const char *val, const struct kernel_param *kp) { - pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return param_set_ulong(val, kp); } static int panic_print_get(char *val, const struct kernel_param *kp) { - pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return param_get_ulong(val, kp); } From d0d9c7235548f1d772f1e48c9d5742c65d81c705 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:29 +0800 Subject: [PATCH 32/75] panic: introduce helper functions for panic state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "panic: introduce panic status function family", v2. This series introduces a family of helper functions to manage panic state and updates existing code to use them. Before this series, panic state helpers were scattered and inconsistent. For example, panic_in_progress() was defined in printk/printk.c, not in panic.c or panic.h. As a result, developers had to look in unexpected places to understand or re-use panic state logic. Other checks were open- coded, duplicating logic across panic, crash, and watchdog paths. The new helpers centralize the functionality in panic.c/panic.h: - panic_try_start() - panic_reset() - panic_in_progress() - panic_on_this_cpu() - panic_on_other_cpu() Patches 1–8 add the helpers and convert panic/crash and printk/nbcon code to use them. Patch 9 fixes a bug in the watchdog subsystem by skipping checks when a panic is in progress, avoiding interference with the panic CPU. Together, this makes panic state handling simpler, more discoverable, and more robust. This patch (of 9): This patch introduces four new helper functions to abstract the management of the panic_cpu variable. These functions will be used in subsequent patches to refactor existing code. The direct use of panic_cpu can be error-prone and ambiguous, as it requires manual checks to determine which CPU is handling the panic. The new helpers clarify intent: panic_try_start(): Atomically sets the current CPU as the panicking CPU. panic_reset(): Reset panic_cpu to PANIC_CPU_INVALID. panic_in_progress(): Checks if a panic has been triggered. panic_on_this_cpu(): Returns true if the current CPU is the panic originator. panic_on_other_cpu(): Returns true if a panic is on another CPU. This change lays the groundwork for improved code readability and robustness in the panic handling subsystem. Link: https://lkml.kernel.org/r/20250825022947.1596226-1-wangjinchao600@gmail.com Link: https://lkml.kernel.org/r/20250825022947.1596226-2-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) b Signed-off-by: Andrew Morton --- include/linux/panic.h | 6 +++++ kernel/panic.c | 53 ++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 5 ---- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/include/linux/panic.h b/include/linux/panic.h index 7be742628c25..6f972a66c13e 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -43,6 +43,12 @@ void abort(void); extern atomic_t panic_cpu; #define PANIC_CPU_INVALID -1 +bool panic_try_start(void); +void panic_reset(void); +bool panic_in_progress(void); +bool panic_on_this_cpu(void); +bool panic_on_other_cpu(void); + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/kernel/panic.c b/kernel/panic.c index 24bca263f896..010a1bfc4843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,59 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool panic_try_start(void) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); +} +EXPORT_SYMBOL(panic_try_start); + +void panic_reset(void) +{ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_reset); + +bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_in_progress); + +/* Return true if a panic is in progress on the current CPU. */ +bool panic_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} +EXPORT_SYMBOL(panic_on_this_cpu); + +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool panic_on_other_cpu(void) +{ + return (panic_in_progress() && !this_cpu_in_panic()); +} +EXPORT_SYMBOL(panic_on_other_cpu); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0efbcdda9aab..5fe35f377b79 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -345,11 +345,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { From f7998e7f03ff9846a5047743d0eae554f25fdbe8 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:30 +0800 Subject: [PATCH 33/75] fbdev: use panic_in_progress() helper Update the fbcon_skip_panic() function to use the panic_in_progress() helper. The previous direct access to panic_cpu is less readable and is being replaced by a dedicated function that more clearly expresses the intent. This change is part of a series to refactor the kernel's panic handling logic for better clarity and robustness. Link: https://lkml.kernel.org/r/20250825022947.1596226-3-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Acked-by Qianqiang Liu Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- drivers/video/fbdev/core/fbcon.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 55f5731e94c3..b062b05f4128 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -279,14 +279,7 @@ static int fbcon_get_rotate(struct fb_info *info) static bool fbcon_skip_panic(struct fb_info *info) { -/* panic_cpu is not exported, and can't be used if built as module. Use - * oops_in_progress instead, but non-fatal oops won't be printed. - */ -#if defined(MODULE) - return (info->skip_panic && unlikely(oops_in_progress)); -#else - return (info->skip_panic && unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID)); -#endif + return (info->skip_panic && unlikely(panic_in_progress())); } static inline bool fbcon_is_active(struct vc_data *vc, struct fb_info *info) From 33effbcaf110a68b49b7ab4f8720858e7598c216 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:31 +0800 Subject: [PATCH 34/75] crash_core: use panic_try_start() in crash_kexec() crash_kexec() had its own code to exclude parallel execution by setting panic_cpu. This is already handled by panic_try_start(). Switch to panic_try_start() to remove the duplication and keep the logic consistent. Link: https://lkml.kernel.org/r/20250825022947.1596226-4-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/crash_core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a4ef79591eb2..bb38bbaf3a26 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,6 +4,7 @@ * Copyright (C) 2002-2004 Eric Biederman */ +#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec); __bpf_kfunc void crash_kexec(struct pt_regs *regs) { - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); @@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * Reset panic_cpu to allow another panic()/crash_kexec() * call. */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); + panic_reset(); } } From 6b69c7ef96f1afc7b426195087f7488f1510c2a4 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:32 +0800 Subject: [PATCH 35/75] panic: use panic_try_start() in nmi_panic() nmi_panic() duplicated the logic to claim panic_cpu with atomic_try_cmpxchg. This is already wrapped in panic_try_start(). Replace the open-coded logic with panic_try_start(), and use panic_on_other_cpu() for the fallback path. This removes duplication and keeps panic handling code consistent. Link: https://lkml.kernel.org/r/20250825022947.1596226-5-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/panic.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 010a1bfc4843..f7ecb36cf2b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -360,15 +360,9 @@ EXPORT_SYMBOL(panic_on_other_cpu); */ void nmi_panic(struct pt_regs *regs, const char *msg) { - int old_cpu, this_cpu; - - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) + if (panic_try_start()) panic("%s", msg); - else if (old_cpu != this_cpu) + else if (panic_on_other_cpu()) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); From 6f313b558562161c181734c1d23b25bf71e574b9 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:33 +0800 Subject: [PATCH 36/75] panic: use panic_try_start() in vpanic() vpanic() had open-coded logic to claim panic_cpu with atomic_try_cmpxchg. This is already handled by panic_try_start(). Switch to panic_try_start() and use panic_on_other_cpu() for the fallback path. This removes duplicate code and makes panic handling consistent across functions. Link: https://lkml.kernel.org/r/20250825022947.1596226-6-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/panic.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index f7ecb36cf2b3..c4ef86fc643f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -420,7 +420,6 @@ void vpanic(const char *fmt, va_list args) static char buf[1024]; long i, i_next = 0, len; int state = 0; - int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { @@ -457,13 +456,10 @@ void vpanic(const char *fmt, va_list args) * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* go ahead */ - } else if (old_cpu != this_cpu) + } else if (panic_on_other_cpu()) panic_smp_self_stop(); console_verbose(); From 2325e8eadf7cd2a086855809ffcd054336369d47 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:34 +0800 Subject: [PATCH 37/75] printk/nbcon: use panic_on_this_cpu() helper nbcon_context_try_acquire() compared panic_cpu directly with smp_processor_id(). This open-coded check is now provided by panic_on_this_cpu(). Switch to panic_on_this_cpu() to simplify the code and improve readability. Link: https://lkml.kernel.org/r/20250825022947.1596226-7-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/printk/nbcon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 646801813415..7490865e2f44 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -2,6 +2,7 @@ // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner +#include "linux/panic.h" #include #include #include @@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs; */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { - unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; @@ -614,7 +614,7 @@ out: /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. */ - if (atomic_read(&panic_cpu) == cpu) + if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; From c6be36e2997662f423edfa3979a63935873ff648 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:35 +0800 Subject: [PATCH 38/75] panic/printk: replace this_cpu_in_panic() with panic_on_this_cpu() The helper this_cpu_in_panic() duplicated logic already provided by panic_on_this_cpu(). Remove this_cpu_in_panic() and switch all users to panic_on_this_cpu(). This simplifies the code and avoids having two helpers for the same check. Link: https://lkml.kernel.org/r/20250825022947.1596226-8-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- include/linux/printk.h | 2 -- kernel/panic.c | 2 +- kernel/printk/nbcon.c | 2 +- kernel/printk/printk.c | 15 ++------------- kernel/printk/printk_ringbuffer.c | 2 +- lib/dump_stack.c | 2 +- 6 files changed, 6 insertions(+), 19 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index 5d22b803f51e..45c663124c9b 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -330,8 +330,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) #endif -bool this_cpu_in_panic(void); - #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); diff --git a/kernel/panic.c b/kernel/panic.c index c4ef86fc643f..a8b1bf60e09f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -348,7 +348,7 @@ EXPORT_SYMBOL(panic_on_this_cpu); */ bool panic_on_other_cpu(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } EXPORT_SYMBOL(panic_on_other_cpu); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 7490865e2f44..c6d1a4a747e9 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5fe35f377b79..faa8b1f0585b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -17,6 +17,7 @@ * 01Mar01 Andrew Morton */ +#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -345,18 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - /* * Return true if a panic is in progress on a remote CPU. * @@ -365,7 +354,7 @@ bool this_cpu_in_panic(void) */ bool other_cpu_in_panic(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } /* diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..e2a1b2d34d2b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; diff --git a/lib/dump_stack.c b/lib/dump_stack.c index b3a85fe8b673..f0c78b5b5324 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -102,7 +102,7 @@ static void __dump_stack(const char *log_lvl) */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { - bool in_panic = this_cpu_in_panic(); + bool in_panic = panic_on_this_cpu(); unsigned long flags; /* From d4a36db5639db032a434aef968f9188a600139ec Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:36 +0800 Subject: [PATCH 39/75] panic/printk: replace other_cpu_in_panic() with panic_on_other_cpu() The helper other_cpu_in_panic() duplicated logic already provided by panic_on_other_cpu(). Remove other_cpu_in_panic() and update all users to call panic_on_other_cpu() instead. This removes redundant code and makes panic handling consistent. Link: https://lkml.kernel.org/r/20250825022947.1596226-9-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/printk/internal.h | 1 - kernel/printk/nbcon.c | 8 ++++---- kernel/printk/printk.c | 19 ++++--------------- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index ef282001f200..f72bbfa266d6 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -332,7 +332,6 @@ struct printk_message { unsigned long dropped; }; -bool other_cpu_in_panic(void); bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_supress); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index c6d1a4a747e9..171480135830 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -255,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } @@ -310,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Event #4 occurs when a non-panic CPU reacquires. - * Event #5 is not possible due to the other_cpu_in_panic() check + * Event #5 is not possible due to the panic_on_other_cpu() check * in nbcon_context_try_acquire_handover(). */ @@ -349,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state new; /* Note that the caller must still remove the request! */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* @@ -447,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* Handover is not possible on the same CPU. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index faa8b1f0585b..236f03937107 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -346,17 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -/* - * Return true if a panic is in progress on a remote CPU. - * - * On true, the local CPU should immediately release any printing resources - * that may be needed by the panic CPU. - */ -bool other_cpu_in_panic(void) -{ - return (panic_in_progress() && !panic_on_this_cpu()); -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2391,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; @@ -2827,7 +2816,7 @@ void console_lock(void) might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ - while (other_cpu_in_panic()) + while (panic_on_other_cpu()) msleep(1000); down_console_sem(); @@ -2847,7 +2836,7 @@ EXPORT_SYMBOL(console_lock); int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return 0; if (down_trylock_console_sem()) return 0; @@ -3227,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) goto abandon; if (do_cond_resched) From 3d5f4f15b778d6da9760d54455cc256ecf924c0a Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:37 +0800 Subject: [PATCH 40/75] watchdog: skip checks when panic is in progress This issue was found when an EFI pstore was configured for kdump logging with the NMI hard lockup detector enabled. The efi-pstore write operation was slow, and with a large number of logs, the pstore dump callback within kmsg_dump() took a long time. This delay triggered the NMI watchdog, leading to a nested panic. The call flow demonstrates how the secondary panic caused an emergency_restart() to be triggered before the initial pstore operation could finish, leading to a failure to dump the logs: real panic() { kmsg_dump() { ... pstore_dump() { start_dump(); ... // long time operation triggers NMI watchdog nmi panic() { ... emergency_restart(); // pstore unfinished } ... finish_dump(); // never reached } } } Both watchdog_buddy_check_hardlockup() and watchdog_overflow_callback() may trigger during a panic. This can lead to recursive panic handling. Add panic_in_progress() checks so watchdog activity is skipped once a panic has begun. This prevents recursive panic and keeps the panic path more reliable. Link: https://lkml.kernel.org/r/20250825022947.1596226-10-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Reviewed-by: Yury Norov (NVIDIA) Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Signed-off-by: Andrew Morton --- kernel/watchdog.c | 6 ++++++ kernel/watchdog_perf.c | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5413aa85e8a4..5b62d1002783 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -752,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (!watchdog_enabled) return HRTIMER_NORESTART; + /* + * pass the buddy check if a panic is in process + */ + if (panic_in_progress()) + return HRTIMER_NORESTART; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 9c58f5b4381d..d3ca70e3c256 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "NMI watchdog: " fmt +#include #include #include #include @@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; + if (panic_in_progress()) + return; + if (!watchdog_check_timestamp()) return; From 3e3f55f8b73fa0e5c4df50f8b6c001f0a1f18adf Mon Sep 17 00:00:00 2001 From: Guan-Chun Wu <409411716@gms.tku.edu.tw> Date: Wed, 27 Aug 2025 00:17:41 +0800 Subject: [PATCH 41/75] btree: simplify merge logic by using btree_last() return value Previously btree_merge() called btree_last() only to test existence, then performed an extra btree_lookup() to fetch the value. This patch changes it to directly use the value returned by btree_last(), avoiding redundant lookups and simplifying the merge loop. Link: https://lkml.kernel.org/r/20250826161741.686704-1-409411716@gms.tku.edu.tw Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Cc: Kuan-Wei Chiu Signed-off-by: Andrew Morton --- lib/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/btree.c b/lib/btree.c index bb81d3393ac5..9c80c0c7bba8 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -653,9 +653,9 @@ int btree_merge(struct btree_head *target, struct btree_head *victim, * walks to remove a single object from the victim. */ for (;;) { - if (!btree_last(victim, geo, key)) + val = btree_last(victim, geo, key); + if (!val) break; - val = btree_lookup(victim, geo, key); err = btree_insert(target, geo, key, val, gfp); if (err) return err; From 17bdc64c0d419b78bb400c221d36eaa391d16b3a Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Wed, 20 Aug 2025 23:26:10 +0530 Subject: [PATCH 42/75] selftests: proc: mark vsyscall strings maybe-unused The str_vsyscall_* constants in proc-pid-vm.c triggers -Wunused-const-variable warnings with gcc-13.32 and clang 18.1. Define and apply __maybe_unused locally to suppress the warnings. No functional change Fixes compiler warning: warning: `str_vsyscall_*' defined but not used[-Wunused-const-variable] Link: https://lkml.kernel.org/r/20250820175610.83014-1-reddybalavignesh9979@gmail.com Signed-off-by: Bala-Vignesh-Reddy Cc: Shuah Khan Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-pid-vm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index d04685771952..978cbcb3eb11 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -47,6 +47,10 @@ #include #include +#ifndef __maybe_unused +#define __maybe_unused __attribute__((__unused__)) +#endif + #include "../kselftest.h" static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) @@ -218,12 +222,12 @@ static int make_exe(const uint8_t *payload, size_t len) * 2: vsyscall VMA is r-xp vsyscall=emulate */ static volatile int g_vsyscall; -static const char *str_vsyscall; +static const char *str_vsyscall __maybe_unused; -static const char str_vsyscall_0[] = ""; -static const char str_vsyscall_1[] = +static const char str_vsyscall_0[] __maybe_unused = ""; +static const char str_vsyscall_1[] __maybe_unused = "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; -static const char str_vsyscall_2[] = +static const char str_vsyscall_2[] __maybe_unused = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ From fe7a283b39160153b6d1bd7f61b0a9d5d44987a8 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 26 Aug 2025 12:51:06 +0300 Subject: [PATCH 43/75] ocfs2: add suballoc slot check in ocfs2_validate_inode_block() In 'ocfs2_validate_inode_block()', add suballoc slot check similar to one in 'ocfs2_get_suballoc_slot_bit()', thus preventing an out-of-bounds accesses for 'local_system_inodes' in 'get_local_system_inode()'. Most likely this fixes https://syzkaller.appspot.com/bug?extid=a77d690840e60bc2ddd8 as well. Link: https://lkml.kernel.org/r/20250826095106.666980-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reported-by: syzbot+900962ac9bf1860033f2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=900962ac9bf1860033f2 Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 6c4f78f473fb..fcc89856ab95 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1495,6 +1495,14 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } + if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) { + rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->i_suballoc_slot)); + goto bail; + } + rc = 0; bail: From 652ab7c8fab36bd803d2947a3abf26155faa5dc5 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Fri, 29 Aug 2025 13:13:02 +0800 Subject: [PATCH 44/75] panic: use angle-bracket include for panic.h Replace quoted includes of panic.h with `#include ` for consistency across the kernel. Link: https://lkml.kernel.org/r/20250829051312.33773-1-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Reviewed-by: John Ogness Reviewed-by: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 +- kernel/printk/nbcon.c | 2 +- kernel/printk/printk.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index bb38bbaf3a26..a5e8523dd6eb 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,7 +4,6 @@ * Copyright (C) 2002-2004 Eric Biederman */ -#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -23,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 171480135830..558ef3177976 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -2,7 +2,6 @@ // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner -#include "linux/panic.h" #include #include #include @@ -13,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 236f03937107..5aee9ffb16b9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -17,7 +17,6 @@ * 01Mar01 Andrew Morton */ -#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -49,6 +48,7 @@ #include #include #include +#include #include #include From 37aa782df94d16277b45b9a62b748cd62b4bccb9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 3 Sep 2025 12:04:18 +0200 Subject: [PATCH 45/75] panic: remove redundant panic-cpu backtrace Backtraces from all CPUs are printed during panic() when SYS_INFO_ALL_CPU_BT is set. It shows the backtrace for the panic-CPU even when it has already been explicitly printed before. Do not change the legacy code which prints the backtrace in various contexts, for example, as part of Oops report, right after panic message. It will always be visible in the crash dump. Instead, remember when the backtrace was printed, and skip it when dumping the optional backtraces on all CPUs. [akpm@linux-foundation.org: make panic_this_cpu_backtrace_printed static] Closes: https://lore.kernel.org/oe-kbuild-all/202509050048.FMpVvh1u-lkp@intel.com/ [pmladek@suse.com: Handle situations when the backtrace was not printed for the panic CPU] Link: https://lkml.kernel.org/r/20250903100418.410026-1-pmladek@suse.com Signed-off-by: Sergey Senozhatsky Link: https://lore.kernel.org/r/20250731030314.3818040-1-senozhatsky@chromium.org Signed-off-by: Petr Mladek Tested-by: Feng Tang Reviewed-by: John Ogness Signed-off-by: Andrew Morton --- kernel/panic.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index a8b1bf60e09f..ebd81c259fa9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly; static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; +static bool panic_this_cpu_backtrace_printed; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -380,6 +381,19 @@ void check_panic_on_warn(const char *origin) origin, limit); } +static void panic_trigger_all_cpu_backtrace(void) +{ + /* Temporary allow non-panic CPUs to write their backtraces. */ + panic_triggering_all_cpu_backtrace = true; + + if (panic_this_cpu_backtrace_printed) + trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); + else + trigger_all_cpu_backtrace(); + + panic_triggering_all_cpu_backtrace = false; +} + /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have @@ -387,12 +401,8 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) { - /* Temporary allow non-panic CPUs to write their backtraces. */ - panic_triggering_all_cpu_backtrace = true; - trigger_all_cpu_backtrace(); - panic_triggering_all_cpu_backtrace = false; - } + if (panic_print & SYS_INFO_ALL_CPU_BT) + panic_trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, @@ -470,13 +480,15 @@ void vpanic(const char *fmt, va_list args) buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + panic_this_cpu_backtrace_printed = true; + } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); -#endif + panic_this_cpu_backtrace_printed = true; + } /* * If kgdb is enabled, give it a chance to run before we stop all From 13f23538ef49a96a1b8ba9d61bd778de65f00613 Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Wed, 3 Sep 2025 16:39:47 +0800 Subject: [PATCH 46/75] fs/proc/base.c: fix the wrong format specifier Use '%d' instead of '%u' for int. Link: https://lkml.kernel.org/r/20250903083948.2536-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: zhang jiao Cc: Al Viro Cc: Christian Brauner Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lorenzo Stoakes Cc: Thomas Gleinxer Cc: xu xin Signed-off-by: Andrew Morton --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 62d35631ba8c..569c93c9f927 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3945,7 +3945,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. */ - len = snprintf(name, sizeof(name), "%u", tid); + len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first From d337f4524861f4e74c31ee575d60f2eaa1e82388 Mon Sep 17 00:00:00 2001 From: fuqiang wang Date: Thu, 4 Sep 2025 17:38:52 +0800 Subject: [PATCH 47/75] x86/kexec: fix potential cmem->ranges out of memory In memmap_exclude_ranges(), elfheader will be excluded from crashk_res. In the current x86 architecture code, the elfheader is always allocated at crashk_res.start. It seems that there won't be a new split range. But it depends on the allocation position of elfheader in crashk_res. To avoid potential out of memory in future, add a extra slot. Otherwise loading the kdump kernel will fail because crash_exclude_mem_range will return -ENOMEM. random kexec_buf for passing dm crypt keys may cause a range split too, add another extra slot here. The similar issue also exists in fill_up_crash_elf_data(). The range to be excluded is [0, 1M], start (0) is special and will not appear in the middle of existing cmem->ranges[]. But in cast the low 1M could be changed in the future, add a extra slot too. Previous discussions: [1] https://lore.kernel.org/kexec/ZXk2oBf%2FT1Ul6o0c@MiWiFi-R3L-srv/ [2] https://lore.kernel.org/kexec/273284e8-7680-4f5f-8065-c5d780987e59@easystack.cn/ [3] https://lore.kernel.org/kexec/ZYQ6O%2F57sHAPxTHm@MiWiFi-R3L-srv/ Link: https://lkml.kernel.org/r/20250904093855.1180154-1-coxu@redhat.com Signed-off-by: fuqiang wang Signed-off-by: Baoquan He Signed-off-by: Coiby Xu Cc: Dave Young Cc: Vivek Goyal Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- arch/x86/kernel/crash.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 60bb24ddd36f..335fd2ee9766 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -165,8 +165,18 @@ static struct crash_mem *fill_up_crash_elf_data(void) /* * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges * may cause range splits. So add extra slots here. + * + * Exclusion of low 1M may not cause another range split, because the + * range of exclude is [0, 1M] and the condition for splitting a new + * region is that the start, end parameters are both in a certain + * existing region in cmem and cannot be equal to existing region's + * start or end. Obviously, the start of [0, 1M] cannot meet this + * condition. + * + * But in order to lest the low 1M could be changed in the future, + * (e.g. [start, 1M]), add a extra slot. */ - nr_ranges += 2 + crashk_cma_cnt; + nr_ranges += 3 + crashk_cma_cnt; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; @@ -322,10 +332,15 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_mem *cmem; /* - * Using random kexec_buf for passing dm crypt keys may cause a range - * split. So use two slots here. + * In the current x86 architecture code, the elfheader is always + * allocated at crashk_res.start. But it depends on the allocation + * position of elfheader in crashk_res. To avoid potential out of + * bounds in future, add an extra slot. + * + * And using random kexec_buf for passing dm crypt keys may cause a + * range split too, add another extra slot here. */ - nr_ranges = 2; + nr_ranges = 3; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return -ENOMEM; From 913e65a2fe1a16fa253c4a016e2306b2cf9ffef8 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Thu, 4 Sep 2025 17:38:53 +0800 Subject: [PATCH 48/75] crash: add KUnit tests for crash_exclude_mem_range crash_exclude_mem_range seems to be a simple function but there have been multiple attempts to fix it, - commit a2e9a95d2190 ("kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges") - commit 6dff31597264 ("crash_core: fix and simplify the logic of crash_exclude_mem_range()") So add a set of unit tests to verify the correctness of current implementation. Shall we change the function in the future, the unit tests can also help prevent any regression. For example, we may make the function smarter by allocating extra crash_mem range on demand thus there is no need for the caller to foresee any memory range split or address -ENOMEM failure. The testing strategy is to verify the correctness of base case. The base case is there is one to-be-excluded range A and one existing range B. Then we can exhaust all possibilities of the position of A regarding B. For example, here are two combinations, Case: A is completely inside B (causes split) Original: [----B----] Exclude: {--A--} Result: [B1] .. [B2] Case: A overlaps B's left part Original: [----B----] Exclude: {---A---} Result: [..B..] In theory we can prove the correctness by induction, - Base case: crash_exclude_mem_range is correct in the case where n=1 (n is the number of existing ranges). - Inductive step: If crash_exclude_mem_range is correct for n=k existing ranges, then the it's also correct for n=k+1 ranges. But for the sake of simplicity, simply use unit tests to cover the base case together with two regression tests. Note most of the exclude_single_range_test() code is generated by Google Gemini with some small tweaks. The function specification, function body and the exhausting test strategy are presented as prompts. [akpm@linux-foundation.org: export crash_exclude_mem_range() to modules, for kernel/crash_core_test.c] Link: https://lkml.kernel.org/r/20250904093855.1180154-2-coxu@redhat.com Signed-off-by: Coiby Xu Assisted-by: Google Gemini Cc: Baoquan He Cc: Borislav Betkov Cc: Dave Young Cc: fuqiang wang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleinxer Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 11 ++ kernel/Makefile | 1 + kernel/crash_core.c | 15 ++ kernel/crash_core_test.c | 343 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 370 insertions(+) create mode 100644 kernel/crash_core_test.c diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 1224dd937df0..422270d64820 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that is required to be built-in. +config CRASH_DUMP_KUNIT_TEST + tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS + depends on CRASH_DUMP && KUNIT + default KUNIT_ALL_TESTS + help + This option builds KUnit unit tests for kernel crash dumps. The unit + tests will be used to verify the correctness of covered functions and + also prevent any regression. + + If unsure, say N. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..216a7dfc3a68 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o +obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a5e8523dd6eb..3b1c43382eec 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -265,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, return 0; } +/** + * crash_exclude_mem_range - exclude a mem range for existing ranges + * @mem: mem->range contains an array of ranges sorted in ascending order + * @mstart: the start of to-be-excluded range + * @mend: the start of to-be-excluded range + * + * If you are unsure if a range split will happen, to avoid function call + * failure because of -ENOMEM, always make sure + * mem->max_nr_ranges == mem->nr_ranges + 1 + * before calling the function each time. + * + * returns 0 if a memory range is excluded successfully + * return -ENOMEM if mem->ranges doesn't have space to hold split ranges + */ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { @@ -324,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +EXPORT_SYMBOL_GPL(crash_exclude_mem_range); ssize_t crash_get_memory_size(void) { diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c new file mode 100644 index 000000000000..8aadf6801530 --- /dev/null +++ b/kernel/crash_core_test.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include // For struct crash_mem and struct range if defined there + +// Helper to create and initialize crash_mem +static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges, + unsigned int nr_initial_ranges, + const struct range *initial_ranges) +{ + struct crash_mem *mem; + size_t alloc_size; + + // Check if max_ranges can even hold initial_ranges + if (max_ranges < nr_initial_ranges) { + kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n", + max_ranges, nr_initial_ranges); + return NULL; + } + + alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range); + mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL); + if (!mem) { + kunit_err(test, "Failed to allocate crash_mem\n"); + return NULL; + } + + mem->max_nr_ranges = max_ranges; + mem->nr_ranges = nr_initial_ranges; + if (initial_ranges && nr_initial_ranges > 0) { + memcpy(mem->ranges, initial_ranges, + nr_initial_ranges * sizeof(struct range)); + } + + return mem; +} + +// Helper to compare ranges for assertions +static void assert_ranges_equal(struct kunit *test, + const struct range *actual_ranges, + unsigned int actual_nr_ranges, + const struct range *expected_ranges, + unsigned int expected_nr_ranges, + const char *case_name) +{ + unsigned int i; + + KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges, + "%s: Number of ranges mismatch.", case_name); + + for (i = 0; i < expected_nr_ranges; i++) { + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start, + "%s: Range %u start mismatch.", case_name, i); + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end, + "%s: Range %u end mismatch.", case_name, i); + } +} + +// Structure for test parameters +struct exclude_test_param { + const char *description; + unsigned long long exclude_start; + unsigned long long exclude_end; + unsigned int initial_max_ranges; + const struct range *initial_ranges; + unsigned int initial_nr_ranges; + const struct range *expected_ranges; + unsigned int expected_nr_ranges; + int expected_ret; +}; + +static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params) +{ + struct crash_mem *mem; + int ret; + + kunit_info(test, "%s", params->description); + + mem = create_crash_mem(test, params->initial_max_ranges, + params->initial_nr_ranges, params->initial_ranges); + if (!mem) + return; // Error already logged by create_crash_mem or kunit_kzalloc + + ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end); + + KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret, + "%s: Return value mismatch.", params->description); + + if (params->expected_ret == 0) { + assert_ranges_equal(test, mem->ranges, mem->nr_ranges, + params->expected_ranges, params->expected_nr_ranges, + params->description); + } else { + // If an error is expected, nr_ranges might still be relevant to check + // depending on the exact point of failure. For ENOMEM on split, + // nr_ranges shouldn't have changed. + KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges, + mem->nr_ranges, + "%s: Number of ranges mismatch on error.", + params->description); + } +} + +/* + * Test Strategy 1: One to-be-excluded range A and one existing range B. + * + * Exhaust all possibilities of the position of A regarding B. + */ + +static const struct range single_range_b = { .start = 100, .end = 199 }; + +static const struct exclude_test_param exclude_single_range_test_data[] = { + { + .description = "1.1: A is left of B, no overlap", + .exclude_start = 10, .exclude_end = 50, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.2: A's right boundary touches B's left boundary", + .exclude_start = 10, .exclude_end = 99, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.3: A overlaps B's left part", + .exclude_start = 50, .exclude_end = 149, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.4: A is completely inside B", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 119 }, + { .start = 180, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.5: A overlaps B's right part", + .exclude_start = 150, .exclude_end = 249, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.6: A's left boundary touches B's right boundary", + .exclude_start = 200, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.7: A is right of B, no overlap", + .exclude_start = 250, .exclude_end = 300, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.8: A completely covers B and extends beyond", + .exclude_start = 50, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.9: A covers B and extends to the left", + .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.10: A covers B and extends to the right", + .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.11: A is identical to B", + .exclude_start = 100, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.12: A is a point, left of B, no overlap", + .exclude_start = 10, .exclude_end = 10, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.13: A is a point, at start of B", + .exclude_start = 100, .exclude_end = 100, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.14: A is a point, in middle of B (causes split)", + .exclude_start = 150, .exclude_end = 150, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 149 }, + { .start = 151, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.15: A is a point, at end of B", + .exclude_start = 199, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.16: A is a point, right of B, no overlap", + .exclude_start = 250, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + // ENOMEM case for single range split + { + .description = "1.17: A completely inside B (split), no space (ENOMEM)", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 1, // Not enough for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 1, // Should remain unchanged + .expected_ret = -ENOMEM, + }, +}; + + +static void exclude_single_range_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description); + run_exclude_test_case(test, &exclude_single_range_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * Test Strategy 2: Regression test. + */ + +static const struct exclude_test_param exclude_range_regression_test_data[] = { + // Test data from commit a2e9a95d2190 + { + .description = "2.1: exclude low 1M", + .exclude_start = 0, .exclude_end = (1 << 20) - 1, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 0, .end = 0x3efff }, + { .start = 0x3f000, .end = 0x3ffff }, + { .start = 0x40000, .end = 0x9ffff } + }, + .initial_nr_ranges = 3, + .expected_nr_ranges = 0, + .expected_ret = 0, + }, + // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u + { + .description = "2.2: when range out of bound", + .exclude_start = 100, .exclude_end = 200, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 1, .end = 299 }, + { .start = 401, .end = 1000 }, + { .start = 1001, .end = 2000 } + }, + .initial_nr_ranges = 3, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 3, // Should remain unchanged + .expected_ret = -ENOMEM + }, + +}; + + +static void exclude_range_regression_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description); + run_exclude_test_case(test, &exclude_range_regression_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * KUnit Test Suite + */ +static struct kunit_case crash_exclude_mem_range_test_cases[] = { + KUNIT_CASE(exclude_single_range_test), + KUNIT_CASE(exclude_range_regression_test), + {} +}; + +static struct kunit_suite crash_exclude_mem_range_suite = { + .name = "crash_exclude_mem_range_tests", + .test_cases = crash_exclude_mem_range_test_cases, + // .init and .exit can be NULL if not needed globally for the suite +}; + +kunit_test_suite(crash_exclude_mem_range_suite); + +MODULE_DESCRIPTION("crash dump KUnit test suite"); +MODULE_LICENSE("GPL"); From 7b1e502eb17c23fa6459a19bfe5974bffdb95574 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 6 Sep 2025 21:38:57 -0700 Subject: [PATCH 49/75] kernel.h: add comments for enum system_states Provide some basic comments about the system_states and what they imply. Also convert the comments to kernel-doc format. Split the enum declaration from the definition of the system_state variable so that kernel-doc notation works cleanly with it. This is picked up by Documentation/driver-api/basics.rst so it does not need further inclusion in the kernel docbooks. Link: https://lkml.kernel.org/r/20250907043857.2941203-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Rafael J. Wysocki # v1 Reviewed-by: Mauro Carvalho Chehab [v5] Cc: "Brown, Len" Cc: Greg Kroah-Hartman Cc: James Bottomley Cc: Jani Nikula Cc: Jonathan Corbet Cc: Mauro Carvalho Chehab Cc: Pavel Machek Signed-off-by: Andrew Morton --- include/linux/kernel.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 989315dabb86..5b46924fdff5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -164,11 +164,23 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* - * Values used for system_state. Ordering of the states must not be changed +/** + * enum system_states - Values used for system_state. + * + * @SYSTEM_BOOTING: %0, no init needed + * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU + * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running + * @SYSTEM_RUNNING: system is up and running + * @SYSTEM_HALT: system entered clean system halt state + * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state + * @SYSTEM_RESTART: system entered emergency power off or normal restart + * @SYSTEM_SUSPEND: system entered suspend or hibernate state + * + * Note: + * Ordering of the states must not be changed * as code checks for <, <=, >, >= STATE. */ -extern enum system_states { +enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_FREEING_INITMEM, @@ -177,7 +189,8 @@ extern enum system_states { SYSTEM_POWER_OFF, SYSTEM_RESTART, SYSTEM_SUSPEND, -} system_state; +}; +extern enum system_states system_state; /* * General tracing related utility functions - trace_printk(), From 0471440c8061e9a7c0fd292c7c6598d6304efd53 Mon Sep 17 00:00:00 2001 From: Fan Yu Date: Sun, 7 Sep 2025 00:12:05 +0800 Subject: [PATCH 50/75] tools/delaytop: add flexible sorting by delay field Patch series "tools/delaytop: implement real-time keyboard interaction support", v2. Current Limitations =================== The current delaytop implementation has two main limitations: 1) Static sorting only by CPU delay Forcing users to restart with different parameters to analyze other resource bottlenecks. 2) Memory delay information is always expanded Causing information overload when only high-level memory pressure monitoring is needed. Improvements ============ 1) Implemented dynamic sorting capability - Interactive key 'o' triggers sort mode. - Supports sorting by CPU/IO/Memory/IRQ delays. - Memory subcategories available in verbose mode. * c - CPU delay (default) * i - IO delay * m - Total memory delay * q - IRQ delay * s/r/t/p/w - Memory subcategories (in verbose mode) 2) Added memory display modes - Compact view (default): shows aggregated memory delays. - Verbose view ('M' key): breaks down into memory sub-delays. * SWAP - swapin delays * RCL - freepages reclaim delays * THR - thrashing delays * CMP - compaction delays * WP - write-protect copy delays Practical benefits ================== 1) Dynamic Sorting for Real-Time Bottleneck Detection System administrators can now dynamically change sorting to identify different types of resource bottlenecks without restarting. 2) Enhanced Usability with On-Screen Keybindings More intuitive interactive usage with on-screen keybindings help. Reduced screen clutter when only memory overview is needed. Use Case ======== # ./delaytop System Pressure Information: (avg10/avg60vg300/total) CPU some: 0.0%/ 0.0%/ 0.0%/ 106817(ms) CPU full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory some: 0.0%/ 0.0%/ 0.0%/ 0(ms) IO full: 0.0%/ 0.0%/ 0.0%/ 2245(ms) IO some: 0.0%/ 0.0%/ 0.0%/ 2791(ms) IRQ full: 0.0%/ 0.0%/ 0.0%/ 0(ms) [o]sort [M]memverbose [q]quit Top 20 processes (sorted by cpu delay): PID TGID COMMAND CPU(ms) IO(ms) IRQ(ms) MEM(ms) ------------------------------------------------------------------------ 110 110 kworker/15:0H-s 27.91 0.00 0.00 0.00 57 57 cpuhp/7 3.18 0.00 0.00 0.00 99 99 cpuhp/14 2.97 0.00 0.00 0.00 51 51 cpuhp/6 0.90 0.00 0.00 0.00 44 44 kworker/4:0H-sy 0.80 0.00 0.00 0.00 76 76 idle_inject/10 0.31 0.00 0.00 0.00 100 100 idle_inject/14 0.30 0.00 0.00 0.00 1309 1309 systemsettings 0.29 0.00 0.00 0.00 60 60 ksoftirqd/7 0.28 0.00 0.00 0.00 45 45 cpuhp/5 0.22 0.00 0.00 0.00 63 63 cpuhp/8 0.20 0.00 0.00 0.00 87 87 cpuhp/12 0.18 0.00 0.00 0.00 93 93 cpuhp/13 0.17 0.00 0.00 0.00 1265 1265 acpid 0.17 0.00 0.00 0.00 1552 1552 sshd 0.17 0.00 0.00 0.00 2584 2584 sddm-helper 0.16 0.00 0.00 0.00 1284 1284 rtkit-daemon 0.15 0.00 0.00 0.00 1326 1326 nde-netfilter 0.14 0.00 0.00 0.00 27 27 cpuhp/2 0.13 0.00 0.00 0.00 631 631 kworker/11:2-rc 0.11 0.00 0.00 0.00 # ./delaytop -M System Pressure Information: (avg10/avg60vg300/total) CPU some: 0.0%/ 0.0%/ 0.0%/ 106827(ms) CPU full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory some: 0.0%/ 0.0%/ 0.0%/ 0(ms) IO full: 0.0%/ 0.0%/ 0.0%/ 2245(ms) IO some: 0.0%/ 0.0%/ 0.0%/ 2791(ms) IRQ full: 0.0%/ 0.0%/ 0.0%/ 0(ms) [o]sort [M]memverbose [q]quit Top 20 processes (sorted by mem delay): PID TGID COMMAND MEM(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) ------------------------------------------------------------------------------------------ 121732 121732 delaytop 0.01 0.00 0.00 0.00 0.00 0.01 95876 95876 top 0.00 0.00 0.00 0.00 0.00 0.00 121641 121641 systemd-userwor 0.00 0.00 0.00 0.00 0.00 0.00 121693 121693 systemd-userwor 0.00 0.00 0.00 0.00 0.00 0.00 121661 121661 systemd-userwor 0.00 0.00 0.00 0.00 0.00 0.00 1 1 systemd 0.00 0.00 0.00 0.00 0.00 0.00 2 2 kthreadd 0.00 0.00 0.00 0.00 0.00 0.00 3 3 pool_workqueue_ 0.00 0.00 0.00 0.00 0.00 0.00 4 4 kworker/R-rcu_g 0.00 0.00 0.00 0.00 0.00 0.00 5 5 kworker/R-rcu_p 0.00 0.00 0.00 0.00 0.00 0.00 6 6 kworker/R-slub_ 0.00 0.00 0.00 0.00 0.00 0.00 7 7 kworker/R-netns 0.00 0.00 0.00 0.00 0.00 0.00 9 9 kworker/0:0H-sy 0.00 0.00 0.00 0.00 0.00 0.00 11 11 kworker/u32:0-n 0.00 0.00 0.00 0.00 0.00 0.00 12 12 kworker/R-mm_pe 0.00 0.00 0.00 0.00 0.00 0.00 13 13 rcu_tasks_kthre 0.00 0.00 0.00 0.00 0.00 0.00 14 14 rcu_tasks_rude_ 0.00 0.00 0.00 0.00 0.00 0.00 15 15 rcu_tasks_trace 0.00 0.00 0.00 0.00 0.00 0.00 16 16 ksoftirqd/0 0.00 0.00 0.00 0.00 0.00 0.00 17 17 rcu_preempt 0.00 0.00 0.00 0.00 0.00 0.00 When psi is not enabled: # ./delaytop System Pressure Information: (avg10/avg60vg300/total) PSI not found: check if psi=1 enabled in cmdline This patch (of 5): The delaytop tool only supported sorting by CPU delay, which limited its usefulness when users needed to identify bottlenecks in other subsystems. Users had no way to sort processes by IO, IRQ, or other delay types to quickly pinpoint specific performance issues. Add -s/--sort option to allow sorting by different delay types. Users can now quickly identify bottlenecks in specific subsystems by sorting processes by the relevant delay metric. Link: https://lkml.kernel.org/r/20250907001101305vrTGnXaRNvtmsGkp-Ljk_@zte.com.cn Link: https://lkml.kernel.org/r/20250907001205573L3XpsQMIQnLgDqiiKYd3H@zte.com.cn Signed-off-by: Fan Yu Reviewed-by: xu xin Cc: Jonathan Corbet Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- tools/accounting/delaytop.c | 153 ++++++++++++++++++++++++++++-------- 1 file changed, 121 insertions(+), 32 deletions(-) diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c index 9afb1ffc00ba..52718714496b 100644 --- a/tools/accounting/delaytop.c +++ b/tools/accounting/delaytop.c @@ -42,6 +42,7 @@ #include #include #include +#include #define PSI_CPU_SOME "/proc/pressure/cpu" #define PSI_CPU_FULL "/proc/pressure/cpu" @@ -61,6 +62,7 @@ #define TASK_COMM_LEN 16 #define MAX_MSG_SIZE 1024 #define MAX_TASKS 1000 +#define MAX_BUF_LEN 256 #define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field #define BOOL_FPRINT(stream, fmt, ...) \ ({ \ @@ -68,17 +70,11 @@ ret >= 0; \ }) #define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" - -/* Program settings structure */ -struct config { - int delay; /* Update interval in seconds */ - int iterations; /* Number of iterations, 0 == infinite */ - int max_processes; /* Maximum number of processes to show */ - char sort_field; /* Field to sort by */ - int output_one_time; /* Output once and exit */ - int monitor_pid; /* Monitor specific PID */ - char *container_path; /* Path to container cgroup */ -}; +#define SORT_FIELD(name) \ + {#name, \ + offsetof(struct task_info, name##_delay_total), \ + offsetof(struct task_info, name##_count)} +#define END_FIELD {NULL, 0, 0} /* PSI statistics structure */ struct psi_stats { @@ -130,6 +126,24 @@ struct container_stats { int nr_io_wait; /* Number of processes in IO wait */ }; +/* Delay field structure */ +struct field_desc { + const char *name; /* Field name for cmdline argument */ + unsigned long total_offset; /* Offset of total delay in task_info */ + unsigned long count_offset; /* Offset of count in task_info */ +}; + +/* Program settings structure */ +struct config { + int delay; /* Update interval in seconds */ + int iterations; /* Number of iterations, 0 == infinite */ + int max_processes; /* Maximum number of processes to show */ + int output_one_time; /* Output once and exit */ + int monitor_pid; /* Monitor specific PID */ + char *container_path; /* Path to container cgroup */ + const struct field_desc *sort_field; /* Current sort field */ +}; + /* Global variables */ static struct config cfg; static struct psi_stats psi; @@ -137,6 +151,17 @@ static struct task_info tasks[MAX_TASKS]; static int task_count; static int running = 1; static struct container_stats container_stats; +static const struct field_desc sort_fields[] = { + SORT_FIELD(cpu), + SORT_FIELD(blkio), + SORT_FIELD(irq), + SORT_FIELD(swapin), + SORT_FIELD(freepages), + SORT_FIELD(thrashing), + SORT_FIELD(compact), + SORT_FIELD(wpcopy), + END_FIELD +}; /* Netlink socket variables */ static int nl_sd = -1; @@ -158,18 +183,59 @@ static void disable_raw_mode(void) tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); } +/* Find field descriptor by name with string comparison */ +static const struct field_desc *get_field_by_name(const char *name) +{ + const struct field_desc *field; + size_t field_len; + + for (field = sort_fields; field->name != NULL; field++) { + field_len = strlen(field->name); + if (field_len != strlen(name)) + continue; + if (strncmp(field->name, name, field_len) == 0) + return field; + } + + return NULL; +} + +/* Find display name for a field descriptor */ +static const char *get_name_by_field(const struct field_desc *field) +{ + return field ? field->name : "UNKNOWN"; +} + +/* Generate string of available field names */ +static void display_available_fields(void) +{ + const struct field_desc *field; + char buf[MAX_BUF_LEN]; + + buf[0] = '\0'; + + for (field = sort_fields; field->name != NULL; field++) { + strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1); + strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1); + buf[MAX_BUF_LEN - 1] = '\0'; + } + + fprintf(stderr, "Available fields: %s\n", buf); +} + /* Display usage information and command line options */ static void usage(void) { printf("Usage: delaytop [Options]\n" "Options:\n" - " -h, --help Show this help message and exit\n" - " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" - " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" - " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" - " -o, --once Display once and exit\n" - " -p, --pid=PID Monitor only the specified PID\n" - " -C, --container=PATH Monitor the container at specified cgroup path\n"); + " -h, --help Show this help message and exit\n" + " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" + " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" + " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" + " -o, --once Display once and exit\n" + " -p, --pid=PID Monitor only the specified PID\n" + " -C, --container=PATH Monitor the container at specified cgroup path\n" + " -s, --sort=FIELD Sort by delay field (default: cpu)\n"); exit(0); } @@ -177,6 +243,7 @@ static void usage(void) static void parse_args(int argc, char **argv) { int c; + const struct field_desc *field; struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"delay", required_argument, 0, 'd'}, @@ -184,6 +251,7 @@ static void parse_args(int argc, char **argv) {"pid", required_argument, 0, 'p'}, {"once", no_argument, 0, 'o'}, {"processes", required_argument, 0, 'P'}, + {"sort", required_argument, 0, 's'}, {"container", required_argument, 0, 'C'}, {0, 0, 0, 0} }; @@ -192,7 +260,7 @@ static void parse_args(int argc, char **argv) cfg.delay = 2; cfg.iterations = 0; cfg.max_processes = 20; - cfg.sort_field = 'c'; /* Default sort by CPU delay */ + cfg.sort_field = &sort_fields[0]; /* Default sorted by CPU delay */ cfg.output_one_time = 0; cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ cfg.container_path = NULL; @@ -200,7 +268,7 @@ static void parse_args(int argc, char **argv) while (1) { int option_index = 0; - c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); + c = getopt_long(argc, argv, "hd:n:p:oP:C:s:", long_options, &option_index); if (c == -1) break; @@ -247,6 +315,22 @@ static void parse_args(int argc, char **argv) case 'C': cfg.container_path = strdup(optarg); break; + case 's': + if (strlen(optarg) == 0) { + fprintf(stderr, "Error: empty sort field\n"); + exit(1); + } + + field = get_field_by_name(optarg); + /* Show available fields if invalid option provided */ + if (!field) { + fprintf(stderr, "Error: invalid sort field '%s'\n", optarg); + display_available_fields(); + exit(1); + } + + cfg.sort_field = field; + break; default: fprintf(stderr, "Try 'delaytop --help' for more information.\n"); exit(1); @@ -587,19 +671,23 @@ static int compare_tasks(const void *a, const void *b) { const struct task_info *t1 = (const struct task_info *)a; const struct task_info *t2 = (const struct task_info *)b; + unsigned long long total1; + unsigned long long total2; + unsigned long count1; + unsigned long count2; double avg1, avg2; - switch (cfg.sort_field) { - case 'c': /* CPU */ - avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); - avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); - if (avg1 != avg2) - return avg2 > avg1 ? 1 : -1; - return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset); + total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset); + count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset); + count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset); - default: - return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; - } + avg1 = average_ms(total1, count1); + avg2 = average_ms(total2, count2); + if (avg1 != avg2) + return avg2 > avg1 ? 1 : -1; + + return 0; } /* Sort tasks by selected field */ @@ -738,8 +826,9 @@ static void display_results(void) container_stats.nr_stopped, container_stats.nr_uninterruptible, container_stats.nr_io_wait); } - suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n", - cfg.max_processes); + /* Task delay output */ + suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n", + cfg.max_processes, get_name_by_field(cfg.sort_field)); suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", "TGID", "COMMAND"); suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n", "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)", From 99d9c55f88e69ebbfc90e05ce7c320bdb3901d03 Mon Sep 17 00:00:00 2001 From: Fan Yu Date: Sun, 7 Sep 2025 00:12:52 +0800 Subject: [PATCH 51/75] tools/delaytop: add memory verbose mode support The original delaytop tool always displayed detailed memory subsystem breakdown, which could be overwhelming for users who only need high-level overview. Add flexible display control allowing users to choose their preferred information granularity. The new flexibility provides: 1) For quick monitoring: use normal mode to reduce visual clutter 2) For deep analysis: use verbose mode to see all memory subsystem details Link: https://lkml.kernel.org/r/202509070012527934u0ySb3teQ4gOYKnocyNO@zte.com.cn Signed-off-by: Fan Yu Reviewed-by: xu xin Signed-off-by: Andrew Morton --- tools/accounting/delaytop.c | 128 +++++++++++++++++++++++++++--------- 1 file changed, 98 insertions(+), 30 deletions(-) diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c index 52718714496b..30dc95fb531a 100644 --- a/tools/accounting/delaytop.c +++ b/tools/accounting/delaytop.c @@ -69,13 +69,22 @@ int ret = fprintf(stream, fmt, ##__VA_ARGS__); \ ret >= 0; \ }) +#define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count) #define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" -#define SORT_FIELD(name) \ +#define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n" +#define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n" +#define SORT_FIELD(name, modes) \ {#name, \ offsetof(struct task_info, name##_delay_total), \ - offsetof(struct task_info, name##_count)} + offsetof(struct task_info, name##_count), \ + modes} #define END_FIELD {NULL, 0, 0} +/* Display mode types */ +#define MODE_TYPE_ALL (0xFFFFFFFF) +#define MODE_DEFAULT (1 << 0) +#define MODE_MEMVERBOSE (1 << 1) + /* PSI statistics structure */ struct psi_stats { double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300; @@ -115,6 +124,8 @@ struct task_info { unsigned long long wpcopy_delay_total; unsigned long long irq_count; unsigned long long irq_delay_total; + unsigned long long mem_count; + unsigned long long mem_delay_total; }; /* Container statistics structure */ @@ -131,6 +142,7 @@ struct field_desc { const char *name; /* Field name for cmdline argument */ unsigned long total_offset; /* Offset of total delay in task_info */ unsigned long count_offset; /* Offset of count in task_info */ + size_t supported_modes; /* Supported display modes */ }; /* Program settings structure */ @@ -142,6 +154,7 @@ struct config { int monitor_pid; /* Monitor specific PID */ char *container_path; /* Path to container cgroup */ const struct field_desc *sort_field; /* Current sort field */ + size_t display_mode; /* Current display mode */ }; /* Global variables */ @@ -152,14 +165,15 @@ static int task_count; static int running = 1; static struct container_stats container_stats; static const struct field_desc sort_fields[] = { - SORT_FIELD(cpu), - SORT_FIELD(blkio), - SORT_FIELD(irq), - SORT_FIELD(swapin), - SORT_FIELD(freepages), - SORT_FIELD(thrashing), - SORT_FIELD(compact), - SORT_FIELD(wpcopy), + SORT_FIELD(cpu, MODE_DEFAULT), + SORT_FIELD(blkio, MODE_DEFAULT), + SORT_FIELD(irq, MODE_DEFAULT), + SORT_FIELD(mem, MODE_DEFAULT | MODE_MEMVERBOSE), + SORT_FIELD(swapin, MODE_MEMVERBOSE), + SORT_FIELD(freepages, MODE_MEMVERBOSE), + SORT_FIELD(thrashing, MODE_MEMVERBOSE), + SORT_FIELD(compact, MODE_MEMVERBOSE), + SORT_FIELD(wpcopy, MODE_MEMVERBOSE), END_FIELD }; @@ -207,7 +221,7 @@ static const char *get_name_by_field(const struct field_desc *field) } /* Generate string of available field names */ -static void display_available_fields(void) +static void display_available_fields(size_t mode) { const struct field_desc *field; char buf[MAX_BUF_LEN]; @@ -215,6 +229,8 @@ static void display_available_fields(void) buf[0] = '\0'; for (field = sort_fields; field->name != NULL; field++) { + if (!(field->supported_modes & mode)) + continue; strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1); strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1); buf[MAX_BUF_LEN - 1] = '\0'; @@ -235,7 +251,8 @@ static void usage(void) " -o, --once Display once and exit\n" " -p, --pid=PID Monitor only the specified PID\n" " -C, --container=PATH Monitor the container at specified cgroup path\n" - " -s, --sort=FIELD Sort by delay field (default: cpu)\n"); + " -s, --sort=FIELD Sort by delay field (default: cpu)\n" + " -M, --memverbose Display memory detailed information\n"); exit(0); } @@ -253,6 +270,7 @@ static void parse_args(int argc, char **argv) {"processes", required_argument, 0, 'P'}, {"sort", required_argument, 0, 's'}, {"container", required_argument, 0, 'C'}, + {"memverbose", no_argument, 0, 'M'}, {0, 0, 0, 0} }; @@ -264,11 +282,12 @@ static void parse_args(int argc, char **argv) cfg.output_one_time = 0; cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ cfg.container_path = NULL; + cfg.display_mode = MODE_DEFAULT; while (1) { int option_index = 0; - c = getopt_long(argc, argv, "hd:n:p:oP:C:s:", long_options, &option_index); + c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index); if (c == -1) break; @@ -325,12 +344,16 @@ static void parse_args(int argc, char **argv) /* Show available fields if invalid option provided */ if (!field) { fprintf(stderr, "Error: invalid sort field '%s'\n", optarg); - display_available_fields(); + display_available_fields(MODE_TYPE_ALL); exit(1); } cfg.sort_field = field; break; + case 'M': + cfg.display_mode = MODE_MEMVERBOSE; + cfg.sort_field = get_field_by_name("mem"); + break; default: fprintf(stderr, "Try 'delaytop --help' for more information.\n"); exit(1); @@ -338,6 +361,25 @@ static void parse_args(int argc, char **argv) } } +/* Calculate average delay in milliseconds for overall memory */ +static void set_mem_delay_total(struct task_info *t) +{ + t->mem_delay_total = t->swapin_delay_total + + t->freepages_delay_total + + t->thrashing_delay_total + + t->compact_delay_total + + t->wpcopy_delay_total; +} + +static void set_mem_count(struct task_info *t) +{ + t->mem_count = t->swapin_count + + t->freepages_count + + t->thrashing_count + + t->compact_count + + t->wpcopy_count; +} + /* Create a raw netlink socket and bind */ static int create_nl_socket(void) { @@ -611,6 +653,8 @@ static void fetch_and_fill_task_info(int pid, const char *comm) SET_TASK_STAT(task_count, wpcopy_delay_total); SET_TASK_STAT(task_count, irq_count); SET_TASK_STAT(task_count, irq_delay_total); + set_mem_count(&tasks[task_count]); + set_mem_delay_total(&tasks[task_count]); task_count++; } break; @@ -829,27 +873,44 @@ static void display_results(void) /* Task delay output */ suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n", cfg.max_processes, get_name_by_field(cfg.sort_field)); - suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", "TGID", "COMMAND"); - suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n", - "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)", - "THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)"); - suc &= BOOL_FPRINT(out, "-----------------------------------------------"); - suc &= BOOL_FPRINT(out, "----------------------------------------------\n"); + suc &= BOOL_FPRINT(out, "%8s %8s %-17s", "PID", "TGID", "COMMAND"); + if (cfg.display_mode == MODE_MEMVERBOSE) { + suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n", + "MEM(ms)", "SWAP(ms)", "RCL(ms)", + "THR(ms)", "CMP(ms)", "WP(ms)"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "---------------------\n"); + } else { + suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n", + "CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "--------------------------\n"); + } + count = task_count < cfg.max_processes ? task_count : cfg.max_processes; for (i = 0; i < count; i++) { - suc &= BOOL_FPRINT(out, "%5d %5d %-15s", + suc &= BOOL_FPRINT(out, "%8d %8d %-15s", tasks[i].pid, tasks[i].tgid, tasks[i].command); - suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", - average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), - average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), - average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), - average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), - average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), - average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), - average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), - average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); + if (cfg.display_mode == MODE_MEMVERBOSE) { + suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE, + TASK_AVG(tasks[i], mem), + TASK_AVG(tasks[i], swapin), + TASK_AVG(tasks[i], freepages), + TASK_AVG(tasks[i], thrashing), + TASK_AVG(tasks[i], compact), + TASK_AVG(tasks[i], wpcopy)); + } else { + suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT, + TASK_AVG(tasks[i], cpu), + TASK_AVG(tasks[i], blkio), + TASK_AVG(tasks[i], irq), + TASK_AVG(tasks[i], mem)); + } } suc &= BOOL_FPRINT(out, "\n"); @@ -891,6 +952,13 @@ int main(int argc, char **argv) /* Main loop */ while (running) { + /* Exit when sort field do not match display mode */ + if (!(cfg.sort_field->supported_modes & cfg.display_mode)) { + fprintf(stderr, "Sort field not supported in this mode\n"); + display_available_fields(cfg.display_mode); + break; + } + /* Read PSI statistics */ read_psi_stats(); From 5e57515d81f9003555b7a4d246e02f1ee9c74ffa Mon Sep 17 00:00:00 2001 From: Fan Yu Date: Sun, 7 Sep 2025 00:13:38 +0800 Subject: [PATCH 52/75] tools/delaytop: add interactive mode with keyboard controls The original delaytop only supported static output with limited interaction. Users had to restart the tool with different command-line options to change sorting or display modes, which disrupted continuous monitoring and reduced productivity during performance investigations. Adds real-time interactive controls through keyboard input: 1) Add interactive menu system with visual prompts 2) Support dynamic sorting changes without restarting 3) Enable toggle of memory verbose mode with 'M' key The interactive mode transforms delaytop from a static monitoring tool into a dynamic investigation platform, allowing users to adapt the view in real-time based on observed performance patterns. Link: https://lkml.kernel.org/r/20250907001338580EURha20BxWFmBSrUpS8D1@zte.com.cn Signed-off-by: Fan Yu Reviewed-by: xu xin Signed-off-by: Andrew Morton --- tools/accounting/delaytop.c | 166 ++++++++++++++++++++++++++---------- 1 file changed, 121 insertions(+), 45 deletions(-) diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c index 30dc95fb531a..7bd1a1eeb354 100644 --- a/tools/accounting/delaytop.c +++ b/tools/accounting/delaytop.c @@ -73,8 +73,8 @@ #define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" #define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n" #define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n" -#define SORT_FIELD(name, modes) \ - {#name, \ +#define SORT_FIELD(name, cmd, modes) \ + {#name, #cmd, \ offsetof(struct task_info, name##_delay_total), \ offsetof(struct task_info, name##_count), \ modes} @@ -140,6 +140,7 @@ struct container_stats { /* Delay field structure */ struct field_desc { const char *name; /* Field name for cmdline argument */ + const char *cmd_char; /* Interactive command */ unsigned long total_offset; /* Offset of total delay in task_info */ unsigned long count_offset; /* Offset of count in task_info */ size_t supported_modes; /* Supported display modes */ @@ -165,17 +166,18 @@ static int task_count; static int running = 1; static struct container_stats container_stats; static const struct field_desc sort_fields[] = { - SORT_FIELD(cpu, MODE_DEFAULT), - SORT_FIELD(blkio, MODE_DEFAULT), - SORT_FIELD(irq, MODE_DEFAULT), - SORT_FIELD(mem, MODE_DEFAULT | MODE_MEMVERBOSE), - SORT_FIELD(swapin, MODE_MEMVERBOSE), - SORT_FIELD(freepages, MODE_MEMVERBOSE), - SORT_FIELD(thrashing, MODE_MEMVERBOSE), - SORT_FIELD(compact, MODE_MEMVERBOSE), - SORT_FIELD(wpcopy, MODE_MEMVERBOSE), + SORT_FIELD(cpu, c, MODE_DEFAULT), + SORT_FIELD(blkio, i, MODE_DEFAULT), + SORT_FIELD(irq, q, MODE_DEFAULT), + SORT_FIELD(mem, m, MODE_DEFAULT | MODE_MEMVERBOSE), + SORT_FIELD(swapin, s, MODE_MEMVERBOSE), + SORT_FIELD(freepages, r, MODE_MEMVERBOSE), + SORT_FIELD(thrashing, t, MODE_MEMVERBOSE), + SORT_FIELD(compact, p, MODE_MEMVERBOSE), + SORT_FIELD(wpcopy, w, MODE_MEMVERBOSE), END_FIELD }; +static int sort_selected; /* Netlink socket variables */ static int nl_sd = -1; @@ -197,6 +199,19 @@ static void disable_raw_mode(void) tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); } +/* Find field descriptor by command line */ +static const struct field_desc *get_field_by_cmd_char(char ch) +{ + const struct field_desc *field; + + for (field = sort_fields; field->name != NULL; field++) { + if (field->cmd_char[0] == ch) + return field; + } + + return NULL; +} + /* Find field descriptor by name with string comparison */ static const struct field_desc *get_field_by_name(const char *name) { @@ -870,6 +885,18 @@ static void display_results(void) container_stats.nr_stopped, container_stats.nr_uninterruptible, container_stats.nr_io_wait); } + + /* Interacive command */ + suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n"); + if (sort_selected) { + if (cfg.display_mode == MODE_MEMVERBOSE) + suc &= BOOL_FPRINT(out, + "sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n"); + else + suc &= BOOL_FPRINT(out, + "sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n"); + } + /* Task delay output */ suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n", cfg.max_processes, get_name_by_field(cfg.sort_field)); @@ -919,11 +946,78 @@ static void display_results(void) perror("Error writing to output"); } +/* Check for keyboard input with timeout based on cfg.delay */ +static char check_for_keypress(void) +{ + struct timeval tv = {cfg.delay, 0}; + fd_set readfds; + char ch = 0; + + FD_ZERO(&readfds); + FD_SET(STDIN_FILENO, &readfds); + int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv); + + if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { + read(STDIN_FILENO, &ch, 1); + return ch; + } + + return 0; +} + +#define MAX_MODE_SIZE 2 +static void toggle_display_mode(void) +{ + static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE}; + static size_t cur_index; + + cur_index = (cur_index + 1) % MAX_MODE_SIZE; + cfg.display_mode = modes[cur_index]; +} + +/* Handle keyboard input: sorting selection, mode toggle, or quit */ +static void handle_keypress(char ch, int *running) +{ + const struct field_desc *field; + + /* Change sort field */ + if (sort_selected) { + field = get_field_by_cmd_char(ch); + if (field && (field->supported_modes & cfg.display_mode)) + cfg.sort_field = field; + + sort_selected = 0; + /* Handle mode changes or quit */ + } else { + switch (ch) { + case 'o': + sort_selected = 1; + break; + case 'M': + toggle_display_mode(); + for (field = sort_fields; field->name != NULL; field++) { + if (field->supported_modes & cfg.display_mode) { + cfg.sort_field = field; + break; + } + } + break; + case 'q': + case 'Q': + *running = 0; + break; + default: + break; + } + } +} + /* Main function */ int main(int argc, char **argv) { + const struct field_desc *field; int iterations = 0; - int use_q_quit = 0; + char keypress; /* Parse command line arguments */ parse_args(argc, argv); @@ -943,20 +1037,20 @@ int main(int argc, char **argv) exit(1); } - if (!cfg.output_one_time) { - use_q_quit = 1; - enable_raw_mode(); - printf("Press 'q' to quit.\n"); - fflush(stdout); - } + /* Set terminal to non-canonical mode for interaction */ + enable_raw_mode(); /* Main loop */ while (running) { - /* Exit when sort field do not match display mode */ + /* Auto-switch sort field when not matching display mode */ if (!(cfg.sort_field->supported_modes & cfg.display_mode)) { - fprintf(stderr, "Sort field not supported in this mode\n"); - display_available_fields(cfg.display_mode); - break; + for (field = sort_fields; field->name != NULL; field++) { + if (field->supported_modes & cfg.display_mode) { + cfg.sort_field = field; + printf("Auto-switched sort field to: %s\n", field->name); + break; + } + } } /* Read PSI statistics */ @@ -983,32 +1077,14 @@ int main(int argc, char **argv) if (cfg.output_one_time) break; - /* Check for 'q' key to quit */ - if (use_q_quit) { - struct timeval tv = {cfg.delay, 0}; - fd_set readfds; - - FD_ZERO(&readfds); - FD_SET(STDIN_FILENO, &readfds); - int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); - - if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { - char ch = 0; - - read(STDIN_FILENO, &ch, 1); - if (ch == 'q' || ch == 'Q') { - running = 0; - break; - } - } - } else { - sleep(cfg.delay); - } + /* Keypress for interactive usage */ + keypress = check_for_keypress(); + if (keypress) + handle_keypress(keypress, &running); } /* Restore terminal mode */ - if (use_q_quit) - disable_raw_mode(); + disable_raw_mode(); /* Cleanup */ close(nl_sd); From 0c10f9cd812f6f32dca928010e132a7d89812666 Mon Sep 17 00:00:00 2001 From: Fan Yu Date: Sun, 7 Sep 2025 00:14:17 +0800 Subject: [PATCH 53/75] tools/delaytop: improve error handling for missing PSI support Enhanced display logic to conditionally show PSI information only when successfully read, with helpful guidance for users to enable PSI support (psi=1 cmdline parameter). Link: https://lkml.kernel.org/r/20250907001417537vSx6nUsb3ILqI0iQ-WnGp@zte.com.cn Signed-off-by: Fan Yu Reviewed-by: xu xin Signed-off-by: Andrew Morton --- tools/accounting/delaytop.c | 182 +++++++++++++++++++++++------------- 1 file changed, 116 insertions(+), 66 deletions(-) diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c index 7bd1a1eeb354..72cc500b44b1 100644 --- a/tools/accounting/delaytop.c +++ b/tools/accounting/delaytop.c @@ -44,13 +44,11 @@ #include #include -#define PSI_CPU_SOME "/proc/pressure/cpu" -#define PSI_CPU_FULL "/proc/pressure/cpu" -#define PSI_MEMORY_SOME "/proc/pressure/memory" -#define PSI_MEMORY_FULL "/proc/pressure/memory" -#define PSI_IO_SOME "/proc/pressure/io" -#define PSI_IO_FULL "/proc/pressure/io" -#define PSI_IRQ_FULL "/proc/pressure/irq" +#define PSI_PATH "/proc/pressure" +#define PSI_CPU_PATH "/proc/pressure/cpu" +#define PSI_MEMORY_PATH "/proc/pressure/memory" +#define PSI_IO_PATH "/proc/pressure/io" +#define PSI_IRQ_PATH "/proc/pressure/irq" #define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) #define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) @@ -499,87 +497,134 @@ static int get_family_id(int sd) return id; } -static void read_psi_stats(void) +static int read_psi_stats(void) { FILE *fp; char line[256]; int ret = 0; + int error_count = 0; + + /* Check if PSI path exists */ + if (access(PSI_PATH, F_OK) != 0) { + fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH); + fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n"); + return -1; + } + /* Zero all fields */ memset(&psi, 0, sizeof(psi)); + /* CPU pressure */ - fp = fopen(PSI_CPU_SOME, "r"); + fp = fopen(PSI_CPU_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "some", 4) == 0) { ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.cpu_some_avg10, &psi.cpu_some_avg60, &psi.cpu_some_avg300, &psi.cpu_some_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse CPU some PSI data\n"); + error_count++; + } } else if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.cpu_full_avg10, &psi.cpu_full_avg60, &psi.cpu_full_avg300, &psi.cpu_full_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse CPU full PSI data\n"); + error_count++; + } } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_CPU_PATH); + error_count++; } + /* Memory pressure */ - fp = fopen(PSI_MEMORY_SOME, "r"); + fp = fopen(PSI_MEMORY_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "some", 4) == 0) { ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.memory_some_avg10, &psi.memory_some_avg60, &psi.memory_some_avg300, &psi.memory_some_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse Memory some PSI data\n"); + error_count++; + } } else if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.memory_full_avg10, &psi.memory_full_avg60, &psi.memory_full_avg300, &psi.memory_full_total); - } - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse Memory full PSI data\n"); + error_count++; + } + } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_MEMORY_PATH); + error_count++; } + /* IO pressure */ - fp = fopen(PSI_IO_SOME, "r"); + fp = fopen(PSI_IO_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "some", 4) == 0) { ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.io_some_avg10, &psi.io_some_avg60, &psi.io_some_avg300, &psi.io_some_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse IO some PSI data\n"); + error_count++; + } } else if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.io_full_avg10, &psi.io_full_avg60, &psi.io_full_avg300, &psi.io_full_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse IO full PSI data\n"); + error_count++; + } } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_IO_PATH); + error_count++; } + /* IRQ pressure (only full) */ - fp = fopen(PSI_IRQ_FULL, "r"); + fp = fopen(PSI_IRQ_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.irq_full_avg10, &psi.irq_full_avg60, &psi.irq_full_avg300, &psi.irq_full_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse IRQ full PSI data\n"); + error_count++; + } } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_IRQ_PATH); + error_count++; } + + /* Return error count: 0 means success, >0 means warnings, -1 means fatal error */ + if (error_count > 0) { + fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count); + return error_count; + } + + return 0; } static int read_comm(int pid, char *comm_buf, size_t buf_size) @@ -820,7 +865,7 @@ static void get_container_stats(void) } /* Display results to stdout or log file */ -static void display_results(void) +static void display_results(int psi_ret) { time_t now = time(NULL); struct tm *tm_now = localtime(&now); @@ -833,49 +878,53 @@ static void display_results(void) suc &= BOOL_FPRINT(out, "\033[H\033[J"); /* PSI output (one-line, no cat style) */ - suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n"); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "CPU some:", - psi.cpu_some_avg10, - psi.cpu_some_avg60, - psi.cpu_some_avg300, - psi.cpu_some_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "CPU full:", - psi.cpu_full_avg10, - psi.cpu_full_avg60, - psi.cpu_full_avg300, - psi.cpu_full_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "Memory full:", - psi.memory_full_avg10, - psi.memory_full_avg60, - psi.memory_full_avg300, - psi.memory_full_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "Memory some:", - psi.memory_some_avg10, - psi.memory_some_avg60, - psi.memory_some_avg300, - psi.memory_some_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "IO full:", - psi.io_full_avg10, - psi.io_full_avg60, - psi.io_full_avg300, - psi.io_full_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "IO some:", - psi.io_some_avg10, - psi.io_some_avg60, - psi.io_some_avg300, - psi.io_some_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "IRQ full:", - psi.irq_full_avg10, - psi.irq_full_avg60, - psi.irq_full_avg300, - psi.irq_full_total / 1000); + suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60vg300/total)\n"); + if (psi_ret) { + suc &= BOOL_FPRINT(out, " PSI not found: check if psi=1 enabled in cmdline\n"); + } else { + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU some:", + psi.cpu_some_avg10, + psi.cpu_some_avg60, + psi.cpu_some_avg300, + psi.cpu_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU full:", + psi.cpu_full_avg10, + psi.cpu_full_avg60, + psi.cpu_full_avg300, + psi.cpu_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory full:", + psi.memory_full_avg10, + psi.memory_full_avg60, + psi.memory_full_avg300, + psi.memory_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory some:", + psi.memory_some_avg10, + psi.memory_some_avg60, + psi.memory_some_avg300, + psi.memory_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO full:", + psi.io_full_avg10, + psi.io_full_avg60, + psi.io_full_avg300, + psi.io_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO some:", + psi.io_some_avg10, + psi.io_some_avg60, + psi.io_some_avg300, + psi.io_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IRQ full:", + psi.irq_full_avg10, + psi.irq_full_avg60, + psi.irq_full_avg300, + psi.irq_full_total / 1000); + } if (cfg.container_path) { suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path); @@ -1017,6 +1066,7 @@ int main(int argc, char **argv) { const struct field_desc *field; int iterations = 0; + int psi_ret = 0; char keypress; /* Parse command line arguments */ @@ -1054,7 +1104,7 @@ int main(int argc, char **argv) } /* Read PSI statistics */ - read_psi_stats(); + psi_ret = read_psi_stats(); /* Get container stats if container path provided */ if (cfg.container_path) @@ -1067,7 +1117,7 @@ int main(int argc, char **argv) sort_tasks(); /* Display results to stdout or log file */ - display_results(); + display_results(psi_ret); /* Check for iterations */ if (cfg.iterations > 0 && ++iterations >= cfg.iterations) From c25822ccaa40fbc8ec38510397e0fbb8b1b0c0e2 Mon Sep 17 00:00:00 2001 From: Fan Yu Date: Sun, 7 Sep 2025 00:14:57 +0800 Subject: [PATCH 54/75] docs: update delaytop documentation for new interactive features This commit updates the delaytop documentation to reflect the newly added features: 1) Added comprehensive description of interactive keyboard controls 2) Documented all available sort fields 3) Added examples for advanced usage scenarios 4) Included PSI availability note Link: https://lkml.kernel.org/r/20250907001457696qAqUGGkV1VfEO6OkVMovW@zte.com.cn Signed-off-by: Fan Yu Reviewed-by: xu xin Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 89 ++++++++++++------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 8ccc5af5ea1e..86d7902a657f 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -134,47 +134,72 @@ The above command can be used with -v to get more debug information. After the system starts, use `delaytop` to get the system-wide delay information, which includes system-wide PSI information and Top-N high-latency tasks. +Note: PSI support requires `CONFIG_PSI=y` and `psi=1` for full functionality. -`delaytop` supports sorting by CPU latency in descending order by default, -displays the top 20 high-latency tasks by default, and refreshes the latency -data every 2 seconds by default. +`delaytop` is an interactive tool for monitoring system pressure and task delays. +It supports multiple sorting options, display modes, and real-time keyboard controls. -Get PSI information and Top-N tasks delay, since system boot:: +Basic usage with default settings (sorts by CPU delay, shows top 20 tasks, refreshes every 2 seconds):: bash# ./delaytop - System Pressure Information: (avg10/avg60/avg300/total) - CPU some: 0.0%/ 0.0%/ 0.0%/ 345(ms) + System Pressure Information: (avg10/avg60vg300/total) + CPU some: 0.0%/ 0.0%/ 0.0%/ 106137(ms) CPU full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory some: 0.0%/ 0.0%/ 0.0%/ 0(ms) - IO full: 0.0%/ 0.0%/ 0.0%/ 65(ms) - IO some: 0.0%/ 0.0%/ 0.0%/ 79(ms) + IO full: 0.0%/ 0.0%/ 0.0%/ 2240(ms) + IO some: 0.0%/ 0.0%/ 0.0%/ 2783(ms) IRQ full: 0.0%/ 0.0%/ 0.0%/ 0(ms) - Top 20 processes (sorted by CPU delay): - PID TGID COMMAND CPU(ms) IO(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms) - ---------------------------------------------------------------------------------------------- - 161 161 zombie_memcg_re 1.40 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 130 130 blkcg_punt_bio 1.37 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 444 444 scsi_tmf_0 0.73 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1280 1280 rsyslogd 0.53 0.04 0.00 0.00 0.00 0.00 0.00 0.00 - 12 12 ksoftirqd/0 0.47 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1277 1277 nbd-server 0.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 308 308 kworker/2:2-sys 0.41 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 55 55 netns 0.36 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1187 1187 acpid 0.31 0.03 0.00 0.00 0.00 0.00 0.00 0.00 - 6184 6184 kworker/1:2-sys 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 186 186 kaluad 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 18 18 ksoftirqd/1 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 185 185 kmpath_rdacd 0.23 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 190 190 kstrp 0.23 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 2759 2759 agetty 0.20 0.03 0.00 0.00 0.00 0.00 0.00 0.00 - 1190 1190 kworker/0:3-sys 0.19 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1272 1272 sshd 0.15 0.04 0.00 0.00 0.00 0.00 0.00 0.00 - 1156 1156 license 0.15 0.11 0.00 0.00 0.00 0.00 0.00 0.00 - 134 134 md 0.13 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 6142 6142 kworker/3:2-xfs 0.13 0.00 0.00 0.00 0.00 0.00 0.00 0.00 + [o]sort [M]memverbose [q]quit + Top 20 processes (sorted by cpu delay): + PID TGID COMMAND CPU(ms) IO(ms) IRQ(ms) MEM(ms) + ------------------------------------------------------------------------ + 110 110 kworker/15:0H-s 27.91 0.00 0.00 0.00 + 57 57 cpuhp/7 3.18 0.00 0.00 0.00 + 99 99 cpuhp/14 2.97 0.00 0.00 0.00 + 51 51 cpuhp/6 0.90 0.00 0.00 0.00 + 44 44 kworker/4:0H-sy 0.80 0.00 0.00 0.00 + 60 60 ksoftirqd/7 0.74 0.00 0.00 0.00 + 76 76 idle_inject/10 0.31 0.00 0.00 0.00 + 100 100 idle_inject/14 0.30 0.00 0.00 0.00 + 1309 1309 systemsettings 0.29 0.00 0.00 0.00 + 45 45 cpuhp/5 0.22 0.00 0.00 0.00 + 63 63 cpuhp/8 0.20 0.00 0.00 0.00 + 87 87 cpuhp/12 0.18 0.00 0.00 0.00 + 93 93 cpuhp/13 0.17 0.00 0.00 0.00 + 1265 1265 acpid 0.17 0.00 0.00 0.00 + 1552 1552 sshd 0.17 0.00 0.00 0.00 + 2584 2584 sddm-helper 0.16 0.00 0.00 0.00 + 1284 1284 rtkit-daemon 0.15 0.00 0.00 0.00 + 1326 1326 nde-netfilter 0.14 0.00 0.00 0.00 + 27 27 cpuhp/2 0.13 0.00 0.00 0.00 + 631 631 kworker/11:2-rc 0.11 0.00 0.00 0.00 -Dynamic interactive interface of delaytop:: +Interactive keyboard controls during runtime:: + + o - Select sort field (CPU, IO, IRQ, Memory, etc.) + M - Toggle display mode (Default/Memory Verbose) + q - Quit + +Available sort fields(use -s/--sort or interactive command):: + + cpu(c) - CPU delay + blkio(i) - I/O delay + irq(q) - IRQ delay + mem(m) - Total memory delay + swapin(s) - Swapin delay (memory verbose mode only) + freepages(r) - Freepages reclaim delay (memory verbose mode only) + thrashing(t) - Thrashing delay (memory verbose mode only) + compact(p) - Compaction delay (memory verbose mode only) + wpcopy(w) - Write page copy delay (memory verbose mode only) + +Advanced usage examples:: + + # ./delaytop -s blkio + Sorted by IO delay + + # ./delaytop -s mem -M + Sorted by memory delay in memory verbose mode # ./delaytop -p pid Print delayacct stats From d6d5116391857fc78fad9aa42317b36e4ce17b58 Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: Thu, 21 Aug 2025 17:58:59 +0000 Subject: [PATCH 55/75] kexec: introduce is_kho_boot() Patch series "efi: Fix EFI boot with kexec handover (KHO)", v3. This patch series fixes a kernel panic that occurs when booting with both EFI and KHO (Kexec HandOver) enabled. The issue arises because EFI's `reserve_regions()` clears all memory regions with `memblock_remove(0, PHYS_ADDR_MAX)` before rebuilding them from EFI data. This destroys KHO scratch regions that were set up early during device tree scanning, causing a panic as the kernel has no valid memory regions for early allocations. The first patch introduces `is_kho_boot()` to allow early boot components to reliably detect if the kernel was booted via KHO-enabled kexec. The existing `kho_is_enabled()` only checks the command line and doesn't verify if an actual KHO FDT was passed. The second patch modifies EFI's `reserve_regions()` to selectively remove only non-KHO memory regions when KHO is active, preserving the critical scratch regions while still allowing EFI to rebuild its memory map. This patch (of 3): During early initialisation, after a kexec, other components, like EFI need to know if a KHO enabled kexec is performed. The `kho_is_enabled` function is not enough as in the early stages, it only reflects whether the cmdline has KHO enabled, not if an actual KHO FDT exists. Extend the KHO API with `is_kho_boot()` to provide a way for components to check if a KHO enabled kexec is performed. Link: https://lkml.kernel.org/r/cover.1755721529.git.epetron@amazon.de Link: https://lkml.kernel.org/r/7dc6674a76bf6e68cca0222ccff32427699cc02e.1755721529.git.epetron@amazon.de Signed-off-by: Evangelos Petrongonas Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 6 ++++++ kernel/kexec_handover.c | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 348844cffb13..559d13a3bc44 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -40,6 +40,7 @@ struct kho_serialization; #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); +bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); int kho_preserve_phys(phys_addr_t phys, size_t size); @@ -60,6 +61,11 @@ static inline bool kho_is_enabled(void) return false; } +static inline bool is_kho_boot(void) +{ + return false; +} + static inline int kho_preserve_folio(struct folio *folio) { return -EOPNOTSUPP; diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index ecd1ac210dbd..49a39aee6a8e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -951,6 +951,26 @@ static const void *kho_get_fdt(void) return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; } +/** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + /** * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. * @name: the name of the sub FDT passed to kho_add_subtree(). From 5b86af1ded2d90402477dce6d4cf8dfa95cca6ac Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: Thu, 21 Aug 2025 17:59:00 +0000 Subject: [PATCH 56/75] efi: support booting with kexec handover (KHO) When KHO (Kexec HandOver) is enabled, it sets up scratch memory regions early during device tree scanning. After kexec, the new kernel exclusively uses this region for memory allocations during boot up to the initialization of the page allocator However, when booting with EFI, EFI's reserve_regions() uses memblock_remove(0, PHYS_ADDR_MAX) to clear all memory regions before rebuilding them from EFI data. This destroys KHO scratch regions and their flags, thus causing a kernel panic, as there are no scratch memory regions. Instead of wholesale removal, iterate through memory regions and only remove non-KHO ones. This preserves KHO scratch regions, which are good known memory, while still allowing EFI to rebuild its memory map. Link: https://lkml.kernel.org/r/b34da9fd50c89644cd4204136cfa6f5533445c56.1755721529.git.epetron@amazon.de Signed-off-by: Evangelos Petrongonas Acked-by: Mike Rapoport (Microsoft) Acked-by: Pratyush Yadav Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- drivers/firmware/efi/efi-init.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index a00e07b853f2..a65c2d5b9e7b 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -164,12 +165,32 @@ static __init void reserve_regions(void) pr_info("Processing EFI memory map:\n"); /* - * Discard memblocks discovered so far: if there are any at this - * point, they originate from memory nodes in the DT, and UEFI - * uses its own memory map instead. + * Discard memblocks discovered so far except for KHO scratch + * regions. Most memblocks at this point originate from memory nodes + * in the DT and UEFI uses its own memory map instead. However, if + * KHO is enabled, scratch regions, which are good known memory + * must be preserved. */ memblock_dump_all(); - memblock_remove(0, PHYS_ADDR_MAX); + + if (is_kho_boot()) { + struct memblock_region *r; + + /* Remove all non-KHO regions */ + for_each_mem_region(r) { + if (!memblock_is_kho_scratch(r)) { + memblock_remove(r->base, r->size); + r--; + } + } + } else { + /* + * KHO is disabled. Discard memblocks discovered so far: + * if there are any at this point, they originate from memory + * nodes in the DT, and UEFI uses its own memory map instead. + */ + memblock_remove(0, PHYS_ADDR_MAX); + } for_each_efi_memory_desc(md) { paddr = md->phys_addr; From 04ae01a80df616b0998bb6a81c8db310f3aef102 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 11 Sep 2025 01:23:51 +0200 Subject: [PATCH 57/75] lib/decompress: use designated initializers for struct compress_format Switch 'compressed_formats[]' to the more modern and flexible designated initializers. This improves readability and allows struct fields to be reordered. Also use a more concise sentinel marker. Remove the curly braces around the for loop while we're at it. No functional changes intended. Link: https://lkml.kernel.org/r/20250910232350.1308206-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Reviewed-by: Kuan-Wei Chiu Cc: David Sterba Cc: Nick Terrell Signed-off-by: Andrew Morton --- lib/decompress.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lib/decompress.c b/lib/decompress.c index ab3fc90ffc64..7785471586c6 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -49,15 +49,15 @@ struct compress_format { }; static const struct compress_format compressed_formats[] __initconst = { - { {0x1f, 0x8b}, "gzip", gunzip }, - { {0x1f, 0x9e}, "gzip", gunzip }, - { {0x42, 0x5a}, "bzip2", bunzip2 }, - { {0x5d, 0x00}, "lzma", unlzma }, - { {0xfd, 0x37}, "xz", unxz }, - { {0x89, 0x4c}, "lzo", unlzo }, - { {0x02, 0x21}, "lz4", unlz4 }, - { {0x28, 0xb5}, "zstd", unzstd }, - { {0, 0}, NULL, NULL } + { .magic = {0x1f, 0x8b}, .name = "gzip", .decompressor = gunzip }, + { .magic = {0x1f, 0x9e}, .name = "gzip", .decompressor = gunzip }, + { .magic = {0x42, 0x5a}, .name = "bzip2", .decompressor = bunzip2 }, + { .magic = {0x5d, 0x00}, .name = "lzma", .decompressor = unlzma }, + { .magic = {0xfd, 0x37}, .name = "xz", .decompressor = unxz }, + { .magic = {0x89, 0x4c}, .name = "lzo", .decompressor = unlzo }, + { .magic = {0x02, 0x21}, .name = "lz4", .decompressor = unlz4 }, + { .magic = {0x28, 0xb5}, .name = "zstd", .decompressor = unzstd }, + { /* sentinel */ } }; decompress_fn __init decompress_method(const unsigned char *inbuf, long len, @@ -73,11 +73,10 @@ decompress_fn __init decompress_method(const unsigned char *inbuf, long len, pr_debug("Compressed data magic: %#.2x %#.2x\n", inbuf[0], inbuf[1]); - for (cf = compressed_formats; cf->name; cf++) { + for (cf = compressed_formats; cf->name; cf++) if (!memcmp(inbuf, cf->magic, 2)) break; - } if (name) *name = cf->name; return cf->decompressor; From 347b564599fb01d8ae1e9f473b82820fea4c2767 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 11 Sep 2025 21:33:55 +0200 Subject: [PATCH 58/75] coccinelle: of_table: handle SPI device ID tables 'struct spi_device_id' tables also need to be NULL terminated. Link: https://lkml.kernel.org/r/20250911193354.56262-2-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Cc: Julia Lawall Cc: Nathan Chancellor Cc: Nicolas Palix Signed-off-by: Andrew Morton --- scripts/coccinelle/misc/of_table.cocci | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/coccinelle/misc/of_table.cocci b/scripts/coccinelle/misc/of_table.cocci index 4693ea744753..17881cb0884b 100644 --- a/scripts/coccinelle/misc/of_table.cocci +++ b/scripts/coccinelle/misc/of_table.cocci @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/// Make sure (of/i2c/platform)_device_id tables are NULL terminated +/// Make sure (of/i2c/platform/spi)_device_id tables are NULL terminated // // Keywords: of_table i2c_table platform_table // Confidence: Medium @@ -15,14 +15,14 @@ identifier var, arr; expression E; @@ ( -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { .var = E, * } }; | -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., * { ..., E, ... }, }; @@ -33,7 +33,7 @@ identifier var, arr; expression E; @@ ( -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { .var = E, @@ -42,7 +42,7 @@ struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { + { } }; | -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { ..., E, ... }, + { }, @@ -55,7 +55,7 @@ identifier var, arr; expression E; @@ ( -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { .var = E, @@ -63,7 +63,7 @@ struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { @p1 }; | -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { ..., E, ... } @p1 From f23e76a32dbfc45418a1448f94886eb608b35b98 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 11 Sep 2025 20:47:27 +0200 Subject: [PATCH 59/75] coccinelle: platform_no_drv_owner: handle also built-in drivers builtin_platform_driver() and others also use macro platform_driver_register() which sets the .owner=THIS_MODULE, so extend the cocci script to detect these as well. Link: https://lkml.kernel.org/r/20250911184726.23154-2-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Cc: Julia Lawall Cc: Nathan Chancellor Cc: Nicolas Palix Signed-off-by: Andrew Morton --- scripts/coccinelle/api/platform_no_drv_owner.cocci | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/coccinelle/api/platform_no_drv_owner.cocci b/scripts/coccinelle/api/platform_no_drv_owner.cocci index 8fa050eeb7e5..5e869858bda8 100644 --- a/scripts/coccinelle/api/platform_no_drv_owner.cocci +++ b/scripts/coccinelle/api/platform_no_drv_owner.cocci @@ -10,12 +10,21 @@ virtual org virtual report @match1@ +declarer name builtin_i2c_driver; +declarer name builtin_platform_driver; +declarer name builtin_platform_driver_probe; declarer name module_i2c_driver; declarer name module_platform_driver; declarer name module_platform_driver_probe; identifier __driver; @@ ( + builtin_i2c_driver(__driver); +| + builtin_platform_driver(__driver); +| + builtin_platform_driver_probe(__driver, ...); +| module_i2c_driver(__driver); | module_platform_driver(__driver); From 39f17c707454290900b608ee5a200b5db9245626 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 14 Sep 2025 13:09:08 +0200 Subject: [PATCH 60/75] sched/task.h: fix the wrong comment on task_lock() nesting with tasklist_lock The ancient comment above task_lock() states that it can be nested outside of read_lock(&tasklist_lock), but this is no longer true: CPU_0 CPU_1 CPU_2 task_lock() read_lock(tasklist) write_lock_irq(tasklist) read_lock(tasklist) task_lock() Unless CPU_0 calls read_lock() in IRQ context, queued_read_lock_slowpath() won't get the lock immediately, it will spin waiting for the pending writer on CPU_2, resulting in a deadlock. Link: https://lkml.kernel.org/r/20250914110908.GA18769@redhat.com Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Jiri Slaby Cc: Mateusz Guzik Signed-off-by: Andrew Morton --- include/linux/sched/task.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..8ff98b18b24b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -210,9 +210,8 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. + * Nests inside of read_lock(&tasklist_lock). It must not be nested with + * write_lock_irq(&tasklist_lock), neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { From a15f37a40145c986cdf289a4b88390f35efdecc4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Sep 2025 14:09:17 +0200 Subject: [PATCH 61/75] kernel/sys.c: fix the racy usage of task_lock(tsk->group_leader) in sys_prlimit64() paths The usage of task_lock(tsk->group_leader) in sys_prlimit64()->do_prlimit() path is very broken. sys_prlimit64() does get_task_struct(tsk) but this only protects task_struct itself. If tsk != current and tsk is not a leader, this process can exit/exec and task_lock(tsk->group_leader) may use the already freed task_struct. Another problem is that sys_prlimit64() can race with mt-exec which changes ->group_leader. In this case do_prlimit() may take the wrong lock, or (worse) ->group_leader may change between task_lock() and task_unlock(). Change sys_prlimit64() to take tasklist_lock when necessary. This is not nice, but I don't see a better fix for -stable. Link: https://lkml.kernel.org/r/20250915120917.GA27702@redhat.com Fixes: 18c91bb2d872 ("prlimit: do not grab the tasklist_lock") Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Jiri Slaby Cc: Mateusz Guzik Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 1e28b40053ce..36d66ff41611 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; + bool need_tasklist; int ret; if (old_rlim) @@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, get_task_struct(tsk); rcu_read_unlock(); - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, - old_rlim ? &old : NULL); + need_tasklist = !same_thread_group(tsk, current); + if (need_tasklist) { + /* + * Ensure we can't race with group exit or de_thread(), + * so tsk->group_leader can't be freed or changed until + * read_unlock(tasklist_lock) below. + */ + read_lock(&tasklist_lock); + if (!pid_alive(tsk)) + ret = -ESRCH; + } + + if (!ret) { + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + } + + if (need_tasklist) + read_unlock(&tasklist_lock); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); From 3437819c5e7a855b344030bfe15bbd513c28388f Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 17 Sep 2025 09:02:29 +0300 Subject: [PATCH 62/75] ocfs2: avoid extra calls to strlen() after ocfs2_sprintf_system_inode_name() Since 'ocfs2_sprintf_system_inode_name()' uses 'snprintf()' and returns the number of characters emitted, callers of the former are better to use that return value instead of an explicit calls to 'strlen()'. Link: https://lkml.kernel.org/r/20250917060229.1854335-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reviewed-by: Joseph Qi Reviewed-by: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ioctl.c | 18 +++++++----------- fs/ocfs2/move_extents.c | 8 ++++---- fs/ocfs2/sysfile.c | 12 ++++++------ 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index db14c92302a1..b6864602814c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -358,13 +358,11 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, i); + int len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; @@ -651,12 +649,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, - OCFS2_INVALID_SLOT); + int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), + type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index cbe2f8ed8897..86f2631e6360 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -364,7 +364,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, int *vict_bit, struct buffer_head **ret_bh) { - int ret, i, bits_per_unit = 0; + int ret, i, len, bits_per_unit = 0; u64 blkno; char namebuf[40]; @@ -375,9 +375,9 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, struct ocfs2_dinode *ac_dinode; struct ocfs2_group_desc *bg; - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); - ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno); + if (ret) { ret = -ENOENT; goto out; diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 53a945da873b..d53a6cc866be 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -127,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, char namebuf[40]; struct inode *inode = NULL; u64 blkno; - int status = 0; + int len, status = 0; - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, slot); + len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, len, &blkno); if (status < 0) { goto bail; } From f322a97aeb2a05b6b1ee17629145eb02e1a4c6a0 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Thu, 18 Sep 2025 19:06:15 +0200 Subject: [PATCH 63/75] kho: only fill kimage if KHO is finalized kho_fill_kimage() only checks for KHO being enabled before filling in the FDT to the image. KHO being enabled does not mean that the kernel has data to hand over. That happens when KHO is finalized. When a kexec is done with KHO enabled but not finalized, the FDT page is allocated but not initialized. FDT initialization happens after finalize. This means the KHO segment is filled in but the FDT contains garbage data. This leads to the below error messages in the next kernel: [ 0.000000] KHO: setup: handover FDT (0x10116b000) is invalid: -9 [ 0.000000] KHO: disabling KHO revival: -22 There is no problem in practice, and the next kernel boots and works fine. But this still leads to misleading error messages and garbage being handed over. Only fill in KHO segment when KHO is finalized. When KHO is not enabled, the debugfs interface is not created and there is no way to finalize it anyway. So the check for kho_enable is not needed, and kho_out.finalize alone is enough. Link: https://lkml.kernel.org/r/20250918170617.91413-1-pratyush@kernel.org Fixes: 3bdecc3c93f9 ("kexec: add KHO support to kexec file loads") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Jason Gunthorpe Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 49a39aee6a8e..e238ec6b470b 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1253,7 +1253,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_out.finalized) return 0; image->kho.fdt = page_to_phys(kho_out.ser.fdt); From 74058c0a9fc8b2b4d5f4a0ef7ee2cfa66a9e49cf Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 19 Sep 2025 00:33:08 +0100 Subject: [PATCH 64/75] Squashfs: fix uninit-value in squashfs_get_parent Syzkaller reports a "KMSAN: uninit-value in squashfs_get_parent" bug. This is caused by open_by_handle_at() being called with a file handle containing an invalid parent inode number. In particular the inode number is that of a symbolic link, rather than a directory. Squashfs_get_parent() gets called with that symbolic link inode, and accesses the parent member field. unsigned int parent_ino = squashfs_i(inode)->parent; Because non-directory inodes in Squashfs do not have a parent value, this is uninitialised, and this causes an uninitialised value access. The fix is to initialise parent with the invalid inode 0, which will cause an EINVAL error to be returned. Regular inodes used to share the parent field with the block_list_start field. This is removed in this commit to enable the parent field to contain the invalid inode number 0. Link: https://lkml.kernel.org/r/20250918233308.293861-1-phillip@squashfs.org.uk Fixes: 122601408d20 ("Squashfs: export operations") Signed-off-by: Phillip Lougher Reported-by: syzbot+157bdef5cf596ad0da2c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68cc2431.050a0220.139b6.0001.GAE@google.com/ Cc: Signed-off-by: Andrew Morton --- fs/squashfs/inode.c | 7 +++++++ fs/squashfs/squashfs_fs_i.h | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index dee8fa016930..bb0529b09adc 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -169,6 +169,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; + squashfs_i(inode)->parent = 0; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " @@ -216,6 +217,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; + squashfs_i(inode)->parent = 0; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " @@ -296,6 +298,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; squashfs_i(inode)->offset = offset; + squashfs_i(inode)->parent = 0; if (type == SQUASHFS_LSYMLINK_TYPE) { __le32 xattr; @@ -333,6 +336,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + squashfs_i(inode)->parent = 0; TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); @@ -357,6 +361,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + squashfs_i(inode)->parent = 0; TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); @@ -377,6 +382,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFSOCK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); + squashfs_i(inode)->parent = 0; break; } case SQUASHFS_LFIFO_TYPE: @@ -396,6 +402,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); + squashfs_i(inode)->parent = 0; break; } default: diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 2c82d6f2a456..8e497ac07b9a 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -16,6 +16,7 @@ struct squashfs_inode_info { u64 xattr; unsigned int xattr_size; int xattr_count; + int parent; union { struct { u64 fragment_block; @@ -27,7 +28,6 @@ struct squashfs_inode_info { u64 dir_idx_start; int dir_idx_offset; int dir_idx_cnt; - int parent; }; }; struct inode vfs_inode; From 634cdfd6b394cf4a5bfaeacf3b325998c752df45 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 13 Sep 2025 18:28:49 -0400 Subject: [PATCH 65/75] kernel: prevent prctl(PR_SET_PDEATHSIG) from racing with parent process exit If a process calls prctl(PR_SET_PDEATHSIG) at the same time that the parent process exits, the child will write to me->pdeath_sig at the same time the parent is reading it. Since there is no synchronization, this is a data race. Worse, it is possible that a subsequent call to getppid() can continue to return the previous parent process ID without the parent death signal being delivered. This happens in the following scenario: parent child forget_original_parent() prctl(PR_SET_PDEATHSIG, SIGKILL) sys_prctl() me->pdeath_sig = SIGKILL; getppid(); RCU_INIT_POINTER(t->real_parent, reaper); if (t->pdeath_signal) /* reads stale me->pdeath_sig */ group_send_sig_info(t->pdeath_signal, ...); And in the following: parent child forget_original_parent() RCU_INIT_POINTER(t->real_parent, reaper); /* also no barrier */ if (t->pdeath_signal) /* reads stale me->pdeath_sig */ group_send_sig_info(t->pdeath_signal, ...); prctl(PR_SET_PDEATHSIG, SIGKILL) sys_prctl() me->pdeath_sig = SIGKILL; getppid(); /* reads old ppid() */ As a result, the following pattern is racy: pid_t parent_pid = getpid(); pid_t child_pid = fork(); if (child_pid == -1) { /* handle error... */ return; } if (child_pid == 0) { if (prctl(PR_SET_PDEATHSIG, SIGKILL) != 0) { /* handle error */ _exit(126); } if (getppid() != parent_pid) { /* parent died already */ raise(SIGKILL); } /* keep going in child */ } /* keep going in parent */ If the parent is killed at exactly the wrong time, the child process can (wrongly) stay running. I didn't manage to reproduce this in my testing, but I'm pretty sure the race is real. KCSAN is probably the best way to spot the race. Fix the bug by holding tasklist_lock for reading whenever pdeath_signal is being written to. This prevents races on me->pdeath_sig, and the locking and unlocking of the rwlock provide the needed memory barriers. If prctl(PR_SET_PDEATHSIG) happens before the parent exits, the signal will be sent. If it happens afterwards, a subsequent getppid() will return the new value. Link: https://lkml.kernel.org/r/20250913-fix-prctl-pdeathsig-race-v1-1-44e2eb426fe9@gmail.com Signed-off-by: Demi Marie Obenour Cc: Oleg Nesterov Cc: Mateusz Guzik Signed-off-by: Andrew Morton --- kernel/sys.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index 36d66ff41611..bd25f39a6b57 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2488,7 +2488,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = -EINVAL; break; } + /* + * Ensure that either: + * + * 1. Subsequent getppid() calls reflect the parent process having died. + * 2. forget_original_parent() will send the new me->pdeath_signal. + * + * Also prevent the read of me->pdeath_signal from being a data race. + */ + read_lock(&tasklist_lock); me->pdeath_signal = arg2; + read_unlock(&tasklist_lock); break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); From 20a8e0454d833d80d0c0cae304841a50a2a126bd Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 23 Sep 2025 10:53:33 +1000 Subject: [PATCH 66/75] cramfs: fix incorrect physical page address calculation Commit 21aa65bf82a7 ("mm: remove callers of pfn_t functionality") incorrectly replaced the pfn with the physical address when calling vmf_insert_mixed(). Instead the phys_to_pfn_t() call should have been replaced with PHYS_PFN(). Found by inspection after a similar issue was noted in fuse virtio_fs. Link: https://lkml.kernel.org/r/20250923005333.3165032-1-apopple@nvidia.com Fixes: 21aa65bf82a7 ("mm: remove callers of pfn_t functionality") Signed-off-by: Alistair Popple Reviewed-by: Dev Jain Reviewed-by: David Hildenbrand Cc: Haiyue Wang Cc: Nicolas Pitre Signed-off-by: Andrew Morton --- fs/cramfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b002e9b734f9..56c8005b24a3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -412,7 +412,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; vmf = vmf_insert_mixed(vma, vma->vm_start + off, - address + off); + PHYS_PFN(address + off)); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } From 99b70ece33d87500ef7bee8e32cb99772c45ce14 Mon Sep 17 00:00:00 2001 From: Suchit Karunakaran Date: Tue, 23 Sep 2025 22:47:21 +0530 Subject: [PATCH 67/75] checkpatch: suppress strscpy warnings for userspace tools The checkpatch.pl script currently warns against the use of strcpy, strlcpy, and strncpy, recommending strscpy as a safer alternative. However, these warnings are also triggered for code under tools/ and scripts/, which are userspace utilities where strscpy is not available. This patch suppresses these warnings for files in tools/ and scripts/. Link: https://lkml.kernel.org/r/20250923171722.7798-1-suchitkarunakaran@gmail.com Signed-off-by: Suchit Karunakaran Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc: Shuah Khan Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 319cc5f85885..92669904eecc 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2636,6 +2636,11 @@ sub exclude_global_initialisers { $realfile =~ m@/bpf/.*\.bpf\.c$@; } +sub is_userspace { + my ($realfile) = @_; + return ($realfile =~ m@^tools/@ || $realfile =~ m@^scripts/@); +} + sub process { my $filename = shift; @@ -7018,21 +7023,20 @@ sub process { # } # } # } - # strcpy uses that should likely be strscpy - if ($line =~ /\bstrcpy\s*\(/) { + if ($line =~ /\bstrcpy\s*\(/ && !is_userspace($realfile)) { WARN("STRCPY", "Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88\n" . $herecurr); } # strlcpy uses that should likely be strscpy - if ($line =~ /\bstrlcpy\s*\(/) { + if ($line =~ /\bstrlcpy\s*\(/ && !is_userspace($realfile)) { WARN("STRLCPY", "Prefer strscpy over strlcpy - see: https://github.com/KSPP/linux/issues/89\n" . $herecurr); } # strncpy uses that should likely be strscpy or strscpy_pad - if ($line =~ /\bstrncpy\s*\(/) { + if ($line =~ /\bstrncpy\s*\(/ && !is_userspace($realfile)) { WARN("STRNCPY", "Prefer strscpy, strscpy_pad, or __nonstring over strncpy - see: https://github.com/KSPP/linux/issues/90\n" . $herecurr); } From 8f45f089337d924db24397f55697cda0e6960516 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 Sep 2025 14:26:07 +0300 Subject: [PATCH 68/75] ocfs2: fix double free in user_cluster_connect() user_cluster_disconnect() frees "conn->cc_private" which is "lc" but then the error handling frees "lc" a second time. Set "lc" to NULL on this path to avoid a double free. Link: https://lkml.kernel.org/r/aNKDz_7JF7aycZ0k@stanley.mountain Fixes: c994c2ebdbbc ("ocfs2: use the new DLM operation callbacks while requesting new lockspace") Signed-off-by: Dan Carpenter Reviewed-by: Joseph Qi Reviewed-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stack_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0f045e45fa0c..439742cec3c2 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1011,6 +1011,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) printk(KERN_ERR "ocfs2: Could not determine" " locking version\n"); user_cluster_disconnect(conn); + lc = NULL; goto out; } wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); From 1daf37592a050da046a03f78b20abb2a91f6d934 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 24 Sep 2025 11:43:04 +0200 Subject: [PATCH 69/75] panic: remove CONFIG_PANIC_ON_OOPS_VALUE There's really no need for this since it's 0 or 1 when CONFIG_PANIC_ON_OOPS is disabled/enabled, so just use IS_ENABLED() instead. The extra symbol goes back to the original code adding it in commit 2a01bb3885c9 ("panic: Make panic_on_oops configurable"). Link: https://lkml.kernel.org/r/20250924094303.18521-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Andrew Morton --- kernel/panic.c | 2 +- lib/Kconfig.debug | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index ebd81c259fa9..24cc3eec1805 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; #define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ -int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; +int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS); static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075..30761648e2de 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1067,12 +1067,6 @@ config PANIC_ON_OOPS Say N if unsure. -config PANIC_ON_OOPS_VALUE - int - range 0 1 - default 0 if !PANIC_ON_OOPS - default 1 if PANIC_ON_OOPS - config PANIC_TIMEOUT int "panic timeout" default 0 From 1260cbcffa608219fc9188a6cbe9c45a300ef8b5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 24 Sep 2025 10:02:07 +0200 Subject: [PATCH 70/75] lib/genalloc: fix device leak in of_gen_pool_get() Make sure to drop the reference taken when looking up the genpool platform device in of_gen_pool_get() before returning the pool. Note that holding a reference to a device does typically not prevent its devres managed resources from being released so there is no point in keeping the reference. Link: https://lkml.kernel.org/r/20250924080207.18006-1-johan@kernel.org Fixes: 9375db07adea ("genalloc: add devres support, allow to find a managed pool by device") Signed-off-by: Johan Hovold Cc: Philipp Zabel Cc: Vladimir Zapolskiy Cc: [3.10+] Signed-off-by: Andrew Morton --- lib/genalloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index 4fa5635bf81b..841f29783833 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -899,8 +899,11 @@ struct gen_pool *of_gen_pool_get(struct device_node *np, if (!name) name = of_node_full_name(np_pool); } - if (pdev) + if (pdev) { pool = gen_pool_get(&pdev->dev, name); + put_device(&pdev->dev); + } + of_node_put(np_pool); return pool; From 9ee94bfbe930a1b39df53fa2d7b31141b780eb5a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 23 Sep 2025 23:06:51 +0100 Subject: [PATCH 71/75] Squashfs: add additional inode sanity checking Patch series "Squashfs: performance improvement and a sanity check". This patchset adds an additional sanity check when reading regular file inodes, and adds support for SEEK_DATA/SEEK_HOLE lseek() whence values. This patch (of 2): Add an additional sanity check when reading regular file inodes. A regular file if the file size is an exact multiple of the filesystem block size cannot have a fragment. This is because by definition a fragment block stores tailends which are not a whole block in size. Link: https://lkml.kernel.org/r/20250923220652.568416-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20250923220652.568416-2-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/inode.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index bb0529b09adc..8f425dbf9052 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -144,8 +144,17 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + inode->i_size = le32_to_cpu(sqsh_ino->file_size); frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { + /* + * the file cannot have a fragment (tailend) and have a + * file size a multiple of the block size + */ + if ((inode->i_size & (msblk->block_size - 1)) == 0) { + err = -EINVAL; + goto failed_read; + } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { @@ -159,7 +168,6 @@ int squashfs_read_inode(struct inode *inode, long long ino) } set_nlink(inode, 1); - inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; @@ -188,8 +196,17 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + inode->i_size = le64_to_cpu(sqsh_ino->file_size); frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { + /* + * the file cannot have a fragment (tailend) and have a + * file size a multiple of the block size + */ + if ((inode->i_size & (msblk->block_size - 1)) == 0) { + err = -EINVAL; + goto failed_read; + } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { @@ -204,7 +221,6 @@ int squashfs_read_inode(struct inode *inode, long long ino) xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); - inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; From dec91e7ab10ec465d59144aabe69c01723ddd1c4 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Tue, 23 Sep 2025 23:06:52 +0100 Subject: [PATCH 72/75] Squashfs: add SEEK_DATA/SEEK_HOLE support Add support for SEEK_DATA and SEEK_HOLE lseek() whence values. These allow much faster searches for holes and data in sparse files, which can significantly speed up file copying, e.g. before (GNU coreutils, Debian 13): cp --sparse=always big-file / took real 11m58s, user 0m5.764s, sys 11m48s after: real 0.047s, user 0.000s, sys 0.027s Where big-file has a 256 GB hole followed by 47 KB of data. Link: https://lkml.kernel.org/r/20250923220652.568416-3-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 137 +++++++++++++++++++++++++++++++++++--- fs/squashfs/inode.c | 4 +- fs/squashfs/squashfs.h | 1 + fs/squashfs/squashfs_fs.h | 1 + 4 files changed, 130 insertions(+), 13 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index ce7d661d5ad8..1582e0637a7e 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -307,7 +307,8 @@ static int fill_meta_index(struct inode *inode, int index, all_done: *index_block = cur_index_block; *index_offset = cur_offset; - *data_block = cur_data_block; + if (data_block) + *data_block = cur_data_block; /* * Scale cache index (cache slot entry) to index @@ -324,17 +325,15 @@ failed: * Get the on-disk location and compressed size of the datablock * specified by index. Fill_meta_index() does most of the work. */ -static int read_blocklist(struct inode *inode, int index, u64 *block) +static int read_blocklist_ptrs(struct inode *inode, int index, u64 *start, + int *offset, u64 *block) { - u64 start; long long blks; - int offset; __le32 size; - int res = fill_meta_index(inode, index, &start, &offset, block); + int res = fill_meta_index(inode, index, start, offset, block); - TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" - " 0x%x, block 0x%llx\n", res, index, start, offset, - *block); + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset 0x%x, block 0x%llx\n", + res, index, *start, *offset, block ? *block : 0); if (res < 0) return res; @@ -346,22 +345,31 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) * extra block indexes needed. */ if (res < index) { - blks = read_indexes(inode->i_sb, index - res, &start, &offset); + blks = read_indexes(inode->i_sb, index - res, start, offset); if (blks < 0) return (int) blks; - *block += blks; + if (block) + *block += blks; } /* * Read length of block specified by index. */ - res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + res = squashfs_read_metadata(inode->i_sb, &size, start, offset, sizeof(size)); if (res < 0) return res; return squashfs_block_size(size); } +static inline int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + int offset; + + return read_blocklist_ptrs(inode, index, &start, &offset, block); +} + static bool squashfs_fill_page(struct folio *folio, struct squashfs_cache_entry *buffer, size_t offset, size_t avail) @@ -658,7 +666,114 @@ skip_pages: kfree(pages); } +static loff_t seek_hole_data(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start, index = offset >> msblk->block_log; + u64 file_end = (i_size_read(inode) + msblk->block_size - 1) >> msblk->block_log; + int s_offset, length; + __le32 *blist = NULL; + + /* reject offset if negative or beyond file end */ + if ((unsigned long long)offset >= i_size_read(inode)) + return -ENXIO; + + /* is offset within tailend and is tailend packed into a fragment? */ + if (index + 1 == file_end && + squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { + if (whence == SEEK_DATA) + return offset; + + /* there is an implicit hole at the end of any file */ + return i_size_read(inode); + } + + length = read_blocklist_ptrs(inode, index, &start, &s_offset, NULL); + if (length < 0) + return length; + + /* nothing more to do if offset matches desired whence value */ + if ((length == 0 && whence == SEEK_HOLE) || + (length && whence == SEEK_DATA)) + return offset; + + /* skip scanning forwards if we're at file end */ + if (++ index == file_end) + goto not_found; + + blist = kmalloc(SQUASHFS_SCAN_INDEXES << 2, GFP_KERNEL); + if (blist == NULL) { + ERROR("%s: Failed to allocate block_list\n", __func__); + return -ENOMEM; + } + + while (index < file_end) { + int i, indexes = min(file_end - index, SQUASHFS_SCAN_INDEXES); + + offset = squashfs_read_metadata(sb, blist, &start, &s_offset, indexes << 2); + if (offset < 0) + goto finished; + + for (i = 0; i < indexes; i++) { + length = squashfs_block_size(blist[i]); + if (length < 0) { + offset = length; + goto finished; + } + + /* does this block match desired whence value? */ + if ((length == 0 && whence == SEEK_HOLE) || + (length && whence == SEEK_DATA)) { + offset = (index + i) << msblk->block_log; + goto finished; + } + } + + index += indexes; + } + +not_found: + /* whence value determines what happens */ + if (whence == SEEK_DATA) + offset = -ENXIO; + else + /* there is an implicit hole at the end of any file */ + offset = i_size_read(inode); + +finished: + kfree(blist); + return offset; +} + +static loff_t squashfs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + + switch (whence) { + default: + return generic_file_llseek(file, offset, whence); + case SEEK_DATA: + case SEEK_HOLE: + offset = seek_hole_data(file, offset, whence); + break; + } + + if (offset < 0) + return offset; + + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +} + const struct address_space_operations squashfs_aops = { .read_folio = squashfs_read_folio, .readahead = squashfs_readahead }; + +const struct file_operations squashfs_file_operations = { + .llseek = squashfs_llseek, + .read_iter = generic_file_read_iter, + .mmap_prepare = generic_file_readonly_mmap_prepare, + .splice_read = filemap_splice_read +}; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 8f425dbf9052..ddc65d006063 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -168,7 +168,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) } set_nlink(inode, 1); - inode->i_fop = &generic_ro_fops; + inode->i_fop = &squashfs_file_operations; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; squashfs_i(inode)->fragment_block = frag_blk; @@ -222,7 +222,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_inode_ops; - inode->i_fop = &generic_ro_fops; + inode->i_fop = &squashfs_file_operations; inode->i_mode |= S_IFREG; inode->i_blocks = (inode->i_size - le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 218868b20f16..4851bd964502 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -107,6 +107,7 @@ extern const struct address_space_operations squashfs_aops; /* inode.c */ extern const struct inode_operations squashfs_inode_ops; +extern const struct file_operations squashfs_file_operations; /* namei.c */ extern const struct inode_operations squashfs_dir_inode_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 95f8e8901768..a955d9369749 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -208,6 +208,7 @@ static inline int squashfs_block_size(__le32 raw) #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) #define SQUASHFS_META_ENTRIES 127 #define SQUASHFS_META_SLOTS 8 +#define SQUASHFS_SCAN_INDEXES 1024 struct meta_entry { u64 data_block; From cf1bb6b2d0f24c138ddaf3e8b8ecbf90273ed558 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Fri, 26 Sep 2025 15:04:26 +0530 Subject: [PATCH 73/75] MAINTAINERS: update Sibi Sankar's email address Switch to using oss.qualcomm.com email ID in the MAINTAINERS file. Also update mailmap to ensure proper attribution across historical commits. Link: https://lkml.kernel.org/r/20250926093426.1735334-1-sibi.sankar@oss.qualcomm.com Signed-off-by: Sibi Sankar Cc: Bjorn Andersson Cc: Carlos Bilbao Cc: Jarkko Sakkinen Cc: Shannon Nelson Signed-off-by: Andrew Morton --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index a124aeed52a2..02615c76ab5f 100644 --- a/.mailmap +++ b/.mailmap @@ -715,7 +715,8 @@ Shuah Khan Shuah Khan Shuah Khan Shuah Khan -Sibi Sankar +Sibi Sankar +Sibi Sankar Sid Manning Simon Arlott Simona Vetter diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef..0a1fda9c8113 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20710,7 +20710,7 @@ F: Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml F: drivers/pmdomain/qcom/cpr.c QUALCOMM CPUCP MAILBOX DRIVER -M: Sibi Sankar +M: Sibi Sankar L: linux-arm-msm@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/mailbox/qcom,cpucp-mbox.yaml From 94b3f02fb33f56c896d855ccbac270766d1aa48b Mon Sep 17 00:00:00 2001 From: Sahil Chandna Date: Fri, 26 Sep 2025 13:20:53 +0530 Subject: [PATCH 74/75] kallsyms: use kmalloc_array() instead of kmalloc() Replace kmalloc(sizeof(*stat) * 2, GFP_KERNEL) with kmalloc_array(2, sizeof(*stat), GFP_KERNEL) to prevent potential overflow, as recommended in Documentation/process/deprecated.rst. Link: https://lkml.kernel.org/r/20250926075053.25615-1-chandna.linuxkernel@gmail.com Signed-off-by: Sahil Chandna Cc: Shuah Khan Cc: David Hunter Signed-off-by: Andrew Morton --- kernel/kallsyms_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index cf4af5728307..2b082a7e24a2 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void) char namebuf[KSYM_NAME_LEN]; struct test_stat *stat, *stat2; - stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL); if (!stat) return -ENOMEM; stat2 = stat + 1; From 9f1c14c1de1bdde395f6cc893efa4f80a2ae3b2b Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 26 Sep 2025 22:59:35 +0100 Subject: [PATCH 75/75] Squashfs: reject negative file sizes in squashfs_read_inode() Syskaller reports a "WARNING in ovl_copy_up_file" in overlayfs. This warning is ultimately caused because the underlying Squashfs file system returns a file with a negative file size. This commit checks for a negative file size and returns EINVAL. [phillip@squashfs.org.uk: only need to check 64 bit quantity] Link: https://lkml.kernel.org/r/20250926222305.110103-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20250926215935.107233-1-phillip@squashfs.org.uk Fixes: 6545b246a2c8 ("Squashfs: inode operations") Signed-off-by: Phillip Lougher Reported-by: syzbot+f754e01116421e9754b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68d580e5.a00a0220.303701.0019.GAE@google.com/ Cc: Amir Goldstein Cc: Signed-off-by: Andrew Morton --- fs/squashfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index ddc65d006063..cceae3b78698 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -197,6 +197,10 @@ int squashfs_read_inode(struct inode *inode, long long ino) goto failed_read; inode->i_size = le64_to_cpu(sqsh_ino->file_size); + if (inode->i_size < 0) { + err = -EINVAL; + goto failed_read; + } frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { /*