From adbf61cc47cb72b102682e690ad323e1eda652c2 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 11 Nov 2025 14:53:57 +0000 Subject: [PATCH 01/30] x86/acpi/boot: Correct acpi_is_processor_usable() check again ACPI v6.3 defined a new "Online Capable" MADT LAPIC flag. This bit is used in conjunction with the "Enabled" MADT LAPIC flag to determine if a CPU can be enabled/hotplugged by the OS after boot. Before the new bit was defined, the "Enabled" bit was explicitly described like this (ACPI v6.0 wording provided): "If zero, this processor is unusable, and the operating system support will not attempt to use it" This means that CPU hotplug (based on MADT) is not possible. Many BIOS implementations follow this guidance. They may include LAPIC entries in MADT for unavailable CPUs, but since these entries are marked with "Enabled=0" it is expected that the OS will completely ignore these entries. However, QEMU will do the same (include entries with "Enabled=0") for the purpose of allowing CPU hotplug within the guest. Comment from QEMU function pc_madt_cpu_entry(): /* ACPI spec says that LAPIC entry for non present * CPU may be omitted from MADT or it must be marked * as disabled. However omitting non present CPU from * MADT breaks hotplug on linux. So possible CPUs * should be put in MADT but kept disabled. */ Recent Linux topology changes broke the QEMU use case. A following fix for the QEMU use case broke bare metal topology enumeration. Rework the Linux MADT LAPIC flags check to allow the QEMU use case only for guests and to maintain the ACPI spec behavior for bare metal. Remove an unnecessary check added to fix a bare metal case introduced by the QEMU "fix". [ bp: Change logic as Michal suggested. ] [ mingo: Removed misapplied -stable tag. 
] Fixes: fed8d8773b8e ("x86/acpi/boot: Correct acpi_is_processor_usable() check") Fixes: f0551af02130 ("x86/topology: Ignore non-present APIC IDs in a present package") Closes: https://lore.kernel.org/r/20251024204658.3da9bf3f.michal.pecio@gmail.com Reported-by: Michal Pecio Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Tested-by: Michal Pecio Tested-by: Ricardo Neri Link: https://lore.kernel.org/20251111145357.4031846-1-yazen.ghannam@amd.com Cc: stable@vger.kernel.org --- arch/x86/kernel/acpi/boot.c | 12 ++++++++---- arch/x86/kernel/cpu/topology.c | 15 --------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9fa321a95eb3..d6138b2b633a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; @@ -164,11 +165,14 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags) if (lapic_flags & ACPI_MADT_ENABLED) return true; - if (!acpi_support_online_capable || - (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) - return true; + if (acpi_support_online_capable) + return lapic_flags & ACPI_MADT_ONLINE_CAPABLE; - return false; + /* + * QEMU expects legacy "Enabled=0" LAPIC entries to be counted as usable + * in order to support CPU hotplug in guests. 
+ */ + return !hypervisor_is_type(X86_HYPER_NATIVE); } static int __init diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index f55ea3cdbf88..23190a786d31 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include @@ -236,20 +235,6 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) cpuid_to_apicid[cpu] = apic_id; topo_set_cpuids(cpu, apic_id, acpi_id); } else { - u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN); - - /* - * Check for present APICs in the same package when running - * on bare metal. Allow the bogosity in a guest. - */ - if (hypervisor_is_type(X86_HYPER_NATIVE) && - topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) { - pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n", - apic_id); - topo_info.nr_rejected_cpus++; - return; - } - topo_info.nr_disabled_cpus++; } From db0d69c5700ba4749217b83b475606d864d46226 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:17 +0200 Subject: [PATCH 02/30] x86/boot/e820: Remove inverted boolean logic from the e820_nomerge() function name, rename it to e820_type_mergeable() It's a bad practice to put inverted logic into function names, flip it back and rename it to e820_type_mergeable(). Add/update a few comments about this function while at it. 
Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Link: https://patch.msgid.link/20250515120549.2820541-2-mingo@kernel.org --- arch/x86/kernel/e820.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b15b97d3cb52..4c3159d07252 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -305,18 +305,22 @@ static int __init cpcompare(const void *a, const void *b) return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } -static bool e820_nomerge(enum e820_type type) +/* + * Can two consecutive E820 entries of this same E820 type be merged? + */ +static bool e820_type_mergeable(enum e820_type type) { /* * These types may indicate distinct platform ranges aligned to - * numa node, protection domain, performance domain, or other + * NUMA node, protection domain, performance domain, or other * boundaries. Do not merge them. */ if (type == E820_TYPE_PRAM) - return true; + return false; if (type == E820_TYPE_SOFT_RESERVED) - return true; - return false; + return false; + + return true; } int __init e820__update_table(struct e820_table *table) @@ -394,7 +398,7 @@ int __init e820__update_table(struct e820_table *table) } /* Continue building up new map based on this information: */ - if (current_type != last_type || e820_nomerge(current_type)) { + if (current_type != last_type || !e820_type_mergeable(current_type)) { if (last_type) { new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; /* Move forward only if the new size was non-zero: */ From 0bb4a8bdbd22fda17660fdd4c086adaf4970239b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:18 +0200 Subject: [PATCH 03/30] x86/boot/e820: Simplify e820__print_table() a bit Introduce 'entry' for the current table entry and shorten repetitious use of e820_table->entries[i]. Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-3-mingo@kernel.org --- arch/x86/kernel/e820.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4c3159d07252..cf2eb39b5ed6 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -205,12 +205,14 @@ void __init e820__print_table(char *who) int i; for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = e820_table->entries + i; + pr_info("%s: [mem %#018Lx-%#018Lx] ", who, - e820_table->entries[i].addr, - e820_table->entries[i].addr + e820_table->entries[i].size - 1); + entry->addr, + entry->addr + entry->size-1); - e820_print_type(e820_table->entries[i].type); + e820_print_type(entry->type); pr_cont("\n"); } } From 3814bf08452ecfc6db0de53a2e4f977e9661c1f4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:19 +0200 Subject: [PATCH 04/30] x86/boot/e820: Simplify the PPro Erratum #50 workaround No need to print out the table - users won't really be able to tell much from it anyway and the messages around this erratum are unnecessarily obtuse. Instead clearly inform the user that a 256 kB hole is being punched in their memory map at the 1.75 GB physical address. Not that there are many PPro users left. :-) Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-4-mingo@kernel.org --- arch/x86/kernel/setup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1b2edd07a3e1..a231b249d23b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1015,11 +1015,9 @@ void __init setup_arch(char **cmdline_p) trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { - e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, - E820_TYPE_RESERVED); + pr_info("Applying PPro RAM bug workaround: punching 256 kB hole at 1.75 GB physical.\n"); + e820__range_update(0x70000000ULL, SZ_256K, E820_TYPE_RAM, E820_TYPE_RESERVED); e820__update_table(e820_table); - printk(KERN_INFO "fixed physical RAM map:\n"); - e820__print_table("bad_ppro"); } #else early_gart_iommu_check(); From 3e57abd4556b0fe727a755f6b9d573d324105ab0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:20 +0200 Subject: [PATCH 05/30] x86/boot/e820: Mark e820__print_table() static There are no external users of this function left. Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-5-mingo@kernel.org --- arch/x86/include/asm/e820/api.h | 1 - arch/x86/kernel/e820.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index c83645d5b2a8..54427b77bc19 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -19,7 +19,6 @@ extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enu extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); -extern void e820__print_table(char *who); extern int e820__update_table(struct e820_table *table); extern void e820__update_table_print(void); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cf2eb39b5ed6..09c712aec2d0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -200,7 +200,7 @@ static void __init e820_print_type(enum e820_type type) } } -void __init e820__print_table(char *who) +static void __init e820__print_table(const char *who) { int i; From 4d8e5a682be4136758a8beadd5aecc7f76276504 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:21 +0200 Subject: [PATCH 06/30] x86/boot/e820: Print gaps in the E820 table Gaps in the E820 table are not obvious at a glance and can easily be overlooked. 
Print out gaps in the E820 table: Before: BIOS-provided physical RAM map: BIOS-e820: [mem 0x0000000000000000-0x000000000009fbff] usable BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved BIOS-e820: [mem 0x00000000000f0000-0x00000000000fffff] reserved BIOS-e820: [mem 0x0000000000100000-0x000000007ffdbfff] usable BIOS-e820: [mem 0x000000007ffdc000-0x000000007fffffff] reserved BIOS-e820: [mem 0x00000000b0000000-0x00000000bfffffff] reserved BIOS-e820: [mem 0x00000000fed1c000-0x00000000fed1ffff] reserved BIOS-e820: [mem 0x00000000feffc000-0x00000000feffffff] reserved BIOS-e820: [mem 0x00000000fffc0000-0x00000000ffffffff] reserved BIOS-e820: [mem 0x000000fd00000000-0x000000ffffffffff] reserved After: BIOS-provided physical RAM map: BIOS-e820: [mem 0x0000000000000000-0x000000000009fbff] usable BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved BIOS-e820: [gap 0x00000000000a0000-0x00000000000effff] BIOS-e820: [mem 0x00000000000f0000-0x00000000000fffff] reserved BIOS-e820: [mem 0x0000000000100000-0x000000007ffdbfff] usable BIOS-e820: [mem 0x000000007ffdc000-0x000000007fffffff] reserved BIOS-e820: [gap 0x0000000080000000-0x00000000afffffff] BIOS-e820: [mem 0x00000000b0000000-0x00000000bfffffff] reserved BIOS-e820: [gap 0x00000000c0000000-0x00000000fed1bfff] BIOS-e820: [mem 0x00000000fed1c000-0x00000000fed1ffff] reserved BIOS-e820: [gap 0x00000000fed20000-0x00000000feffbfff] BIOS-e820: [mem 0x00000000feffc000-0x00000000feffffff] reserved BIOS-e820: [gap 0x00000000ff000000-0x00000000fffbffff] BIOS-e820: [mem 0x00000000fffc0000-0x00000000ffffffff] reserved BIOS-e820: [gap 0x0000000100000000-0x000000fcffffffff] BIOS-e820: [mem 0x000000fd00000000-0x000000ffffffffff] reserved Also warn about badly ordered E820 table entries: [Firmware Bug]: out of order E820 entry! ( this is printed before the entry is printed, so there's no need to print any additional data with the warning. ) Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-6-mingo@kernel.org --- arch/x86/kernel/e820.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 09c712aec2d0..460645d09c6c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -202,18 +202,32 @@ static void __init e820_print_type(enum e820_type type) static void __init e820__print_table(const char *who) { + u64 range_end_prev = 0; int i; for (i = 0; i < e820_table->nr_entries; i++) { struct e820_entry *entry = e820_table->entries + i; + u64 range_start, range_end; - pr_info("%s: [mem %#018Lx-%#018Lx] ", - who, - entry->addr, - entry->addr + entry->size-1); + range_start = entry->addr; + range_end = entry->addr + entry->size; + /* Out of order E820 maps should not happen: */ + if (range_start < range_end_prev) + pr_info(FW_BUG "out of order E820 entry!\n"); + + if (range_start > range_end_prev) { + pr_info("%s: [gap %#018Lx-%#018Lx]\n", + who, + range_end_prev, + range_start-1); + } + + pr_info("%s: [mem %#018Lx-%#018Lx] ", who, range_start, range_end-1); e820_print_type(entry->type); pr_cont("\n"); + + range_end_prev = range_end; } } From c87f94477740f35aafc208c85da784087c94a46e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:22 +0200 Subject: [PATCH 07/30] x86/boot/e820: Make the field separator space character part of e820_print_type() We are going to add more columns to the E820 table printout, so make e820_print_type()'s field separator (space character) part of the function itself. Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-7-mingo@kernel.org --- arch/x86/kernel/e820.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 460645d09c6c..58001536ba7c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -188,15 +188,15 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type) static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_TYPE_RAM: pr_cont("usable"); break; - case E820_TYPE_RESERVED: pr_cont("reserved"); break; - case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; - case E820_TYPE_ACPI: pr_cont("ACPI data"); break; - case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; - case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_RAM: pr_cont(" usable"); break; + case E820_TYPE_RESERVED: pr_cont(" reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont(" soft reserved"); break; + case E820_TYPE_ACPI: pr_cont(" ACPI data"); break; + case E820_TYPE_NVS: pr_cont(" ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont(" unusable"); break; case E820_TYPE_PMEM: /* Fall through: */ - case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; - default: pr_cont("type %u", type); break; + case E820_TYPE_PRAM: pr_cont(" persistent (type %u)", type); break; + default: pr_cont(" type %u", type); break; } } @@ -492,9 +492,9 @@ __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_ty size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx]", start, end - 1); e820_print_type(old_type); - pr_cont(" ==> "); + pr_cont(" ==>"); e820_print_type(new_type); 
pr_cont("\n"); @@ -569,7 +569,7 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx]", start, end - 1); if (check_type) e820_print_type(old_type); pr_cont("\n"); From fa06d58805c88f76f4454284c1e9e8334b559e30 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:24 +0200 Subject: [PATCH 08/30] x86/boot/e820: Print E820_TYPE_RAM entries as ... RAM entries So it is a bit weird that the actual RAM entries of the E820 table are not actually called RAM, but 'usable': BIOS-e820: [mem 0x0000000000100000-0x000000007ffdbfff] 1.9 GB usable 'usable' is pretty passive-aggressive in that context and ambiguous, most E820 entries denote 'usable' address ranges - reserved ranges may be used by devices, or the platform. Clarify and disambiguate this by making the boot log entry explicitly say 'System RAM', like in /proc/iomem: BIOS-e820: [mem 0x0000000000100000-0x000000007ffdbfff] 1.9 GB System RAM Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-9-mingo@kernel.org --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 58001536ba7c..b0efa4bf0632 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -188,7 +188,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type) static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_TYPE_RAM: pr_cont(" usable"); break; + case E820_TYPE_RAM: pr_cont(" System RAM"); break; case E820_TYPE_RESERVED: pr_cont(" reserved"); break; case E820_TYPE_SOFT_RESERVED: pr_cont(" soft reserved"); break; case E820_TYPE_ACPI: pr_cont(" ACPI data"); break; From 1d7bc219e2b6176eac361ed2eb11c7a70387644c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:25 +0200 Subject: [PATCH 09/30] x86/boot/e820: Call the PCI gap a 'gap' in the boot log printout It is a bit weird and inconsistent that the PCI gap is advertised during bootup as 'mem'ory: [mem 0xc0000000-0xfed1bfff] available for PCI devices ^^^ It's not really memory, it's a gap that PCI devices can decode and use and they often do not map it to any memory themselves. So advertise it for what it is, a gap: [gap 0xc0000000-0xfed1bfff] available for PCI devices Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-10-mingo@kernel.org --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b0efa4bf0632..96840fa2a086 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -699,7 +699,7 @@ __init void e820__setup_pci_gap(void) */ pci_mem_start = gapstart; - pr_info("[mem %#010lx-%#010lx] available for PCI devices\n", + pr_info("[gap %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); } From eea78dc546a95af343fd1463ecfbd250f0abbf22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:26 +0200 Subject: [PATCH 10/30] x86/boot/e820: Use 'u64' consistently instead of 'unsigned long long' There's a number of structure fields and local variables related to E820 entry physical addresses that are defined as 'unsigned long long', but then are compared to u64 fields. Make the types all consistently u64. Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-11-mingo@kernel.org --- arch/x86/kernel/e820.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 96840fa2a086..037864890183 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -296,7 +296,7 @@ struct change_member { /* Pointer to the original entry: */ struct e820_entry *entry; /* Address for this change point: */ - unsigned long long addr; + u64 addr; }; static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; @@ -344,7 +344,7 @@ int __init e820__update_table(struct e820_table *table) struct e820_entry *entries = table->entries; u32 max_nr_entries = ARRAY_SIZE(table->entries); enum e820_type current_type, last_type; - unsigned long long last_addr; + u64 last_addr; u32 new_nr_entries, overlap_entries; u32 i, chg_idx, chg_nr; @@ -641,13 +641,13 @@ static void __init e820__update_table_kexec(void) */ static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) { - unsigned long long last = MAX_GAP_END; + u64 last = MAX_GAP_END; int i = e820_table->nr_entries; int found = 0; while (--i >= 0) { - unsigned long long start = e820_table->entries[i].addr; - unsigned long long end = start + e820_table->entries[i].size; + u64 start = e820_table->entries[i].addr; + u64 end = start + e820_table->entries[i].size; /* * Since "last" is at most 4GB, we know we'll From d214484f50f4c5dbab932b943b824a4c2920cb6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:27 +0200 Subject: [PATCH 11/30] x86/boot/e820: Remove pointless early_panic() indirection early_panic() is a pointless wrapper around panic(): static void __init early_panic(char *msg) { early_printk(msg); panic(msg); } panic() will already do a printk() of 'msg', and an 
early_printk() if earlyprintk is enabled. There's no need to print it separately. Remove the function. Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Juergen Gross Cc: "H . Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-12-mingo@kernel.org --- arch/x86/kernel/e820.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 037864890183..6bc06860c6b0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -893,12 +893,6 @@ unsigned long __init e820__end_of_low_ram_pfn(void) return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT)); } -static void __init early_panic(char *msg) -{ - early_printk(msg); - panic(msg); -} - static int userdef __initdata; /* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ @@ -1018,7 +1012,7 @@ void __init e820__finish_early_params(void) { if (userdef) { if (e820__update_table(e820_table) < 0) - early_panic("Invalid user supplied memory map"); + panic("Invalid user supplied memory map"); pr_info("user-defined physical RAM map:\n"); e820__print_table("user"); From 44f732f3ec8273b99252fcd47f873206d556a69f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:28 +0200 Subject: [PATCH 12/30] x86/boot/e820: Clean up confusing and self-contradictory verbiage around E820 related resource allocations So the E820 code has a rather confusing area of code at around e820__reserve_resources(), which is, by its plain reading, rather self-contradictory. 
For example, the comment explaining e820__reserve_resources() claims: - '* Mark E820 reserved areas as busy for the resource manager' By 'E820 reserved areas' one can naively conclude that it's talking about E820_TYPE_RESERVED areas - while those areas are treated in exactly the opposite fashion by do_mark_busy(): switch (type) { case E820_TYPE_RESERVED: case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; I.e. E820_TYPE_RESERVED areas are *not* marked busy for the resource manager, because E820_TYPE_RESERVED areas are device regions that might eventually be claimed by a device driver. This type of confusion permeates this whole area of code, making it exceedingly difficult to read (for me at least). So untangle it bit by bit: - Instead of talking about ambiguous 'reserved areas', talk about 'E820 device address regions' instead, and 'register'/'lock' them. - The do_mark_busy() function is a misnomer as well, because despite its name it 'does' nothing - it only determines what type of resource handling an E820 type should receive from the kernel. Rename it to e820_device_region() and negate its meaning, to avoid the 'busy/reserved' confusion. Because that's what this code is really about: filtering out device regions such as E820_TYPE_RESERVED, E820_TYPE_PRAM, E820_TYPE_PMEM, etc., and allowing them to be claimed by device drivers later on. - All other E820 regions (system regions) are registered and locked early on, before the PCI resource manager does its search for device BAR addresses, etc. 
Also fix this somewhat misleading comment: /* * Try to bump up RAM regions to reasonable boundaries, to * avoid stolen RAM: */ and explain that here we register artificial 'gap' resources at the end of suspiciously sized RAM regions, as heuristics to try to avoid buggy firmware with undeclared 'stolen RAM' regions: /* * Create additional 'gaps' at the end of RAM regions, * rounding them up to 64k/1MB/64MB boundaries, should * they be weirdly sized, and register extra, locked * resource regions for them, to make sure drivers * won't claim those addresses. * * These are basically blind guesses and heuristics to * avoid resource conflicts with broken firmware that * doesn't properly list 'stolen RAM' as a system region * in the E820 map. */ Also improve the printout of this extra resource a bit: make the message more unambiguous, and upgrade it from pr_debug() (where very few people will see it), to pr_info() (where it will make it into the syslog on default distro configs). Also fix spelling and improve comment placement. No change in functionality intended. Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-13-mingo@kernel.org --- arch/x86/kernel/e820.c | 55 ++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6bc06860c6b0..0316a186b42b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1064,37 +1064,44 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) } } -static bool __init do_mark_busy(enum e820_type type, struct resource *res) +/* + * We assign one resource entry for each E820 map entry: + */ +static struct resource __initdata *e820_res; + +/* + * Is this a device address region that should not be marked busy? + * (Versus system address regions that we register & lock early.) + */ +static bool __init e820_device_region(enum e820_type type, struct resource *res) { - /* this is the legacy bios/dos rom-shadow + mmio region */ + /* This is the legacy BIOS/DOS ROM-shadow + MMIO region: */ if (res->start < (1ULL<<20)) - return true; + return false; /* * Treat persistent memory and other special memory ranges like - * device memory, i.e. reserve it for exclusive use of a driver + * device memory, i.e. 
keep it available for exclusive use of a + * driver: */ switch (type) { case E820_TYPE_RESERVED: case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: - return false; + return true; case E820_TYPE_RAM: case E820_TYPE_ACPI: case E820_TYPE_NVS: case E820_TYPE_UNUSABLE: default: - return true; + return false; } } /* - * Mark E820 reserved areas as busy for the resource manager: + * Mark E820 system regions as busy for the resource manager: */ - -static struct resource __initdata *e820_res; - void __init e820__reserve_resources(void) { int i; @@ -1120,18 +1127,18 @@ void __init e820__reserve_resources(void) res->desc = e820_type_to_iores_desc(entry); /* - * Don't register the region that could be conflicted with - * PCI device BAR resources and insert them later in - * pcibios_resource_survey(): + * Skip and don't register device regions that could be conflicted + * with PCI device BAR resources. They get inserted later in + * pcibios_resource_survey() -> e820__reserve_resources_late(): */ - if (do_mark_busy(entry->type, res)) { + if (!e820_device_region(entry->type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - /* Expose the kexec e820 table to the sysfs. 
*/ + /* Expose the kexec e820 table to sysfs: */ for (i = 0; i < e820_table_kexec->nr_entries; i++) { struct e820_entry *entry = e820_table_kexec->entries + i; @@ -1165,6 +1172,10 @@ void __init e820__reserve_resources_late(void) int i; struct resource *res; + /* + * Register device address regions listed in the E820 map, + * these can be claimed by device drivers later on: + */ res = e820_res; for (i = 0; i < e820_table->nr_entries; i++) { if (!res->parent && res->end) @@ -1173,8 +1184,16 @@ void __init e820__reserve_resources_late(void) } /* - * Try to bump up RAM regions to reasonable boundaries, to - * avoid stolen RAM: + * Create additional 'gaps' at the end of RAM regions, + * rounding them up to 64k/1MB/64MB boundaries, should + * they be weirdly sized, and register extra, locked + * resource regions for them, to make sure drivers + * won't claim those addresses. + * + * These are basically blind guesses and heuristics to + * avoid resource conflicts with broken firmware that + * doesn't properly list 'stolen RAM' as a system region + * in the E820 map. */ for (i = 0; i < e820_table->nr_entries; i++) { struct e820_entry *entry = &e820_table->entries[i]; @@ -1190,7 +1209,7 @@ void __init e820__reserve_resources_late(void) if (start >= end) continue; - printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + pr_info("e820: register RAM buffer resource [mem %#010llx-%#010llx]\n", start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } } From 4a7a13e04c0528771e5006cd781934f7bc4f8fa0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:29 +0200 Subject: [PATCH 13/30] x86/boot/e820: Improve e820_print_type() messages For E820_TYPE_RESERVED, print: 'reserved' -> 'device reserved' For E820_TYPE_PRAM and E820_TYPE_PMEM: 'persistent' -> 'persistent RAM' Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-14-mingo@kernel.org --- arch/x86/kernel/e820.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0316a186b42b..0c3f12fcf2b1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -188,15 +188,15 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type) static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_TYPE_RAM: pr_cont(" System RAM"); break; - case E820_TYPE_RESERVED: pr_cont(" reserved"); break; - case E820_TYPE_SOFT_RESERVED: pr_cont(" soft reserved"); break; - case E820_TYPE_ACPI: pr_cont(" ACPI data"); break; - case E820_TYPE_NVS: pr_cont(" ACPI NVS"); break; - case E820_TYPE_UNUSABLE: pr_cont(" unusable"); break; + case E820_TYPE_RAM: pr_cont(" System RAM"); break; + case E820_TYPE_RESERVED: pr_cont(" device reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont(" soft reserved"); break; + case E820_TYPE_ACPI: pr_cont(" ACPI data"); break; + case E820_TYPE_NVS: pr_cont(" ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont(" unusable"); break; case E820_TYPE_PMEM: /* Fall through: */ - case E820_TYPE_PRAM: pr_cont(" persistent (type %u)", type); break; - default: pr_cont(" type %u", type); break; + case E820_TYPE_PRAM: pr_cont(" persistent RAM (type %u)", type); break; + default: pr_cont(" type %u", type); break; } } From a4803df3a2b145fd17bc3d4c23c4c12c74951299 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:30 +0200 Subject: [PATCH 14/30] x86/boot/e820: Clean up __e820__range_add() a bit - Use 'idx' index variable instead of a weird 'x' - Make the error message E820-specific - Group the code a bit better Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: 
Juergen Gross Cc: "H . Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-15-mingo@kernel.org --- arch/x86/kernel/e820.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c3f12fcf2b1..434075174909 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -166,17 +166,18 @@ int e820__get_entry_type(u64 start, u64 end) */ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { - int x = table->nr_entries; + int idx = table->nr_entries; - if (x >= ARRAY_SIZE(table->entries)) { - pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n", - start, start + size - 1); + if (idx >= ARRAY_SIZE(table->entries)) { + pr_err("too many E820 table entries; ignoring [mem %#010llx-%#010llx]\n", + start, start + size-1); return; } - table->entries[x].addr = start; - table->entries[x].size = size; - table->entries[x].type = type; + table->entries[idx].addr = start; + table->entries[idx].size = size; + table->entries[idx].type = type; + table->nr_entries++; } From 2774ae1046fb1504908f8387351485cd0fc71108 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:31 +0200 Subject: [PATCH 15/30] x86/boot/e820: Clean up __refdata use a bit So __refdata, like __init, is more of a storage class specifier, so move the attribute in front of the type, not after the variable name. This also aligns it vertically. Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Juergen Gross Cc: "H . 
Peter Anvin" Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: David Woodhouse Link: https://patch.msgid.link/20250515120549.2820541-16-mingo@kernel.org --- arch/x86/kernel/e820.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 434075174909..2ce8ca5ab5c8 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -61,9 +61,9 @@ static struct e820_table e820_table_init __initdata; static struct e820_table e820_table_kexec_init __initdata; static struct e820_table e820_table_firmware_init __initdata; -struct e820_table *e820_table __refdata = &e820_table_init; -struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; -struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; +__refdata struct e820_table *e820_table = &e820_table_init; +__refdata struct e820_table *e820_table_kexec = &e820_table_kexec_init; +__refdata struct e820_table *e820_table_firmware = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; From a515ca9664fba4733a95231e5b3e570762b39ced Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:32 +0200 Subject: [PATCH 16/30] x86/boot/e820: Remove unnecessary header inclusions Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-17-mingo@kernel.org --- arch/x86/kernel/e820.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2ce8ca5ab5c8..d8fd7c1d1a9b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -9,13 +9,11 @@ * quirks and other tweaks, and feeds that into the generic Linux memory * allocation code routines via a platform independent interface (memblock, etc.). */ -#include #include #include #include #include #include -#include #include #include From dc043d6463bf5bb732fe4e29ca5db21ba114871e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:33 +0200 Subject: [PATCH 17/30] x86/boot/e820: Standardize e820 table index variable names under 'idx' Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: H . 
Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-18-mingo@kernel.org --- arch/x86/kernel/e820.c | 114 ++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d8fd7c1d1a9b..a7dabf809b2b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -76,10 +76,10 @@ EXPORT_SYMBOL(pci_mem_start); static bool _e820__mapped_any(struct e820_table *table, u64 start, u64 end, enum e820_type type) { - int i; + int idx; - for (i = 0; i < table->nr_entries; i++) { - struct e820_entry *entry = &table->entries[i]; + for (idx = 0; idx < table->nr_entries; idx++) { + struct e820_entry *entry = &table->entries[idx]; if (type && entry->type != type) continue; @@ -111,10 +111,10 @@ EXPORT_SYMBOL_GPL(e820__mapped_any); static struct e820_entry *__e820__mapped_all(u64 start, u64 end, enum e820_type type) { - int i; + int idx; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; if (type && entry->type != type) continue; @@ -202,10 +202,10 @@ static void __init e820_print_type(enum e820_type type) static void __init e820__print_table(const char *who) { u64 range_end_prev = 0; - int i; + int idx; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = e820_table->entries + i; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = e820_table->entries + idx; u64 range_start, range_end; range_start = entry->addr; @@ -345,7 +345,7 @@ int __init e820__update_table(struct e820_table *table) enum e820_type current_type, last_type; u64 last_addr; u32 new_nr_entries, overlap_entries; - u32 i, chg_idx, chg_nr; + u32 idx, chg_idx, chg_nr; /* If there's only one 
memory region, don't bother: */ if (table->nr_entries < 2) @@ -354,26 +354,26 @@ int __init e820__update_table(struct e820_table *table) BUG_ON(table->nr_entries > max_nr_entries); /* Bail out if we find any unreasonable addresses in the map: */ - for (i = 0; i < table->nr_entries; i++) { - if (entries[i].addr + entries[i].size < entries[i].addr) + for (idx = 0; idx < table->nr_entries; idx++) { + if (entries[idx].addr + entries[idx].size < entries[idx].addr) return -1; } /* Create pointers for initial change-point information (for sorting): */ - for (i = 0; i < 2 * table->nr_entries; i++) - change_point[i] = &change_point_list[i]; + for (idx = 0; idx < 2 * table->nr_entries; idx++) + change_point[idx] = &change_point_list[idx]; /* * Record all known change-points (starting and ending addresses), * omitting empty memory regions: */ chg_idx = 0; - for (i = 0; i < table->nr_entries; i++) { - if (entries[i].size != 0) { - change_point[chg_idx]->addr = entries[i].addr; - change_point[chg_idx++]->entry = &entries[i]; - change_point[chg_idx]->addr = entries[i].addr + entries[i].size; - change_point[chg_idx++]->entry = &entries[i]; + for (idx = 0; idx < table->nr_entries; idx++) { + if (entries[idx].size != 0) { + change_point[chg_idx]->addr = entries[idx].addr; + change_point[chg_idx++]->entry = &entries[idx]; + change_point[chg_idx]->addr = entries[idx].addr + entries[idx].size; + change_point[chg_idx++]->entry = &entries[idx]; } } chg_nr = chg_idx; @@ -395,9 +395,9 @@ int __init e820__update_table(struct e820_table *table) overlap_list[overlap_entries++] = change_point[chg_idx]->entry; } else { /* Remove entry from list (order independent, so swap with last): */ - for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == change_point[chg_idx]->entry) - overlap_list[i] = overlap_list[overlap_entries-1]; + for (idx = 0; idx < overlap_entries; idx++) { + if (overlap_list[idx] == change_point[chg_idx]->entry) + overlap_list[idx] = overlap_list[overlap_entries-1]; } 
overlap_entries--; } @@ -407,9 +407,9 @@ int __init e820__update_table(struct e820_table *table) * 1=usable, 2,3,4,4+=unusable) */ current_type = 0; - for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; + for (idx = 0; idx < overlap_entries; idx++) { + if (overlap_list[idx]->type > current_type) + current_type = overlap_list[idx]->type; } /* Continue building up new map based on this information: */ @@ -482,7 +482,7 @@ static u64 __init __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; - unsigned int i; + unsigned int idx; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -497,8 +497,8 @@ __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_ty e820_print_type(new_type); pr_cont("\n"); - for (i = 0; i < table->nr_entries; i++) { - struct e820_entry *entry = &table->entries[i]; + for (idx = 0; idx < table->nr_entries; idx++) { + struct e820_entry *entry = &table->entries[idx]; u64 final_start, final_end; u64 entry_end; @@ -560,7 +560,7 @@ u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, /* Remove a range of memory from the E820 table: */ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { - int i; + int idx; u64 end; u64 real_removed_size = 0; @@ -573,8 +573,8 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool e820_print_type(old_type); pr_cont("\n"); - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; u64 final_start, final_end; u64 entry_end; @@ -641,12 +641,12 @@ static void __init e820__update_table_kexec(void) static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) { u64 last = MAX_GAP_END; 
- int i = e820_table->nr_entries; + int idx = e820_table->nr_entries; int found = 0; - while (--i >= 0) { - u64 start = e820_table->entries[i].addr; - u64 end = start + e820_table->entries[i].size; + while (--idx >= 0) { + u64 start = e820_table->entries[idx].addr; + u64 end = start + e820_table->entries[idx].size; /* * Since "last" is at most 4GB, we know we'll @@ -772,11 +772,11 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) */ void __init e820__register_nosave_regions(unsigned long limit_pfn) { - int i; + int idx; u64 last_addr = 0; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; if (entry->type != E820_TYPE_RAM) continue; @@ -797,10 +797,10 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn) */ static int __init e820__register_nvs_regions(void) { - int i; + int idx; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; if (entry->type == E820_TYPE_NVS) acpi_nvs_register(entry->addr, entry->size); @@ -848,12 +848,12 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) */ static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) { - int i; + int idx; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; unsigned long start_pfn; unsigned long end_pfn; @@ -1103,7 +1103,7 @@ static bool __init e820_device_region(enum e820_type type, struct resource *res) */ void __init e820__reserve_resources(void) { - int i; + int idx; struct resource *res; u64 end; @@ 
-1111,8 +1111,8 @@ void __init e820__reserve_resources(void) SMP_CACHE_BYTES); e820_res = res; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = e820_table->entries + i; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = e820_table->entries + idx; end = entry->addr + entry->size - 1; if (end != (resource_size_t)end) { @@ -1138,8 +1138,8 @@ void __init e820__reserve_resources(void) } /* Expose the kexec e820 table to sysfs: */ - for (i = 0; i < e820_table_kexec->nr_entries; i++) { - struct e820_entry *entry = e820_table_kexec->entries + i; + for (idx = 0; idx < e820_table_kexec->nr_entries; idx++) { + struct e820_entry *entry = e820_table_kexec->entries + idx; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } @@ -1168,7 +1168,7 @@ static unsigned long __init ram_alignment(resource_size_t pos) void __init e820__reserve_resources_late(void) { - int i; + int idx; struct resource *res; /* @@ -1176,7 +1176,7 @@ void __init e820__reserve_resources_late(void) * these can be claimed by device drivers later on: */ res = e820_res; - for (i = 0; i < e820_table->nr_entries; i++) { + for (idx = 0; idx < e820_table->nr_entries; idx++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; @@ -1194,8 +1194,8 @@ void __init e820__reserve_resources_late(void) * doesn't properly list 'stolen RAM' as a system region * in the E820 map. 
*/ - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; u64 start, end; if (entry->type != E820_TYPE_RAM) @@ -1272,7 +1272,7 @@ void __init e820__memory_setup(void) void __init e820__memblock_setup(void) { - int i; + int idx; u64 end; #ifdef CONFIG_MEMORY_HOTPLUG @@ -1316,8 +1316,8 @@ void __init e820__memblock_setup(void) */ memblock_allow_resize(); - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; end = entry->addr + entry->size; if (end != (resource_size_t)end) From 58dcd82d2e2543e0aba4915613debec3c309849b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:34 +0200 Subject: [PATCH 18/30] x86/boot/e820: Standardize e820 table index variable types under 'u32' So we have 'idx' types of 'int' and 'unsigned int', and sometimes we assign 'u32' fields such as e820_table::nr_entries to these 'int' values. While there's no real risk of overflow with these tables, make it all cleaner by standardizing on a single type: u32. This also happens to shrink the code a bit: text data bss dec hex filename 7745 44072 0 51817 ca69 e820.o.before 7613 44072 0 51685 c9e5 e820.o.after Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-19-mingo@kernel.org --- arch/x86/kernel/e820.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a7dabf809b2b..39f29bf29b7c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -76,7 +76,7 @@ EXPORT_SYMBOL(pci_mem_start); static bool _e820__mapped_any(struct e820_table *table, u64 start, u64 end, enum e820_type type) { - int idx; + u32 idx; for (idx = 0; idx < table->nr_entries; idx++) { struct e820_entry *entry = &table->entries[idx]; @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(e820__mapped_any); static struct e820_entry *__e820__mapped_all(u64 start, u64 end, enum e820_type type) { - int idx; + u32 idx; for (idx = 0; idx < e820_table->nr_entries; idx++) { struct e820_entry *entry = &e820_table->entries[idx]; @@ -164,7 +164,7 @@ int e820__get_entry_type(u64 start, u64 end) */ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { - int idx = table->nr_entries; + u32 idx = table->nr_entries; if (idx >= ARRAY_SIZE(table->entries)) { pr_err("too many E820 table entries; ignoring [mem %#010llx-%#010llx]\n", @@ -202,7 +202,7 @@ static void __init e820_print_type(enum e820_type type) static void __init e820__print_table(const char *who) { u64 range_end_prev = 0; - int idx; + u32 idx; for (idx = 0; idx < e820_table->nr_entries; idx++) { struct e820_entry *entry = e820_table->entries + idx; @@ -482,7 +482,7 @@ static u64 __init __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; - unsigned int idx; + u32 idx; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -560,7 +560,7 @@ u64 __init 
e820__range_update_table(struct e820_table *t, u64 start, u64 size, /* Remove a range of memory from the E820 table: */ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { - int idx; + u32 idx; u64 end; u64 real_removed_size = 0; @@ -772,7 +772,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) */ void __init e820__register_nosave_regions(unsigned long limit_pfn) { - int idx; + u32 idx; u64 last_addr = 0; for (idx = 0; idx < e820_table->nr_entries; idx++) { @@ -797,7 +797,7 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn) */ static int __init e820__register_nvs_regions(void) { - int idx; + u32 idx; for (idx = 0; idx < e820_table->nr_entries; idx++) { struct e820_entry *entry = &e820_table->entries[idx]; @@ -848,7 +848,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) */ static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) { - int idx; + u32 idx; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; @@ -1103,7 +1103,7 @@ static bool __init e820_device_region(enum e820_type type, struct resource *res) */ void __init e820__reserve_resources(void) { - int idx; + u32 idx; struct resource *res; u64 end; @@ -1168,7 +1168,7 @@ static unsigned long __init ram_alignment(resource_size_t pos) void __init e820__reserve_resources_late(void) { - int idx; + u32 idx; struct resource *res; /* @@ -1272,7 +1272,7 @@ void __init e820__memory_setup(void) void __init e820__memblock_setup(void) { - int idx; + u32 idx; u64 end; #ifdef CONFIG_MEMORY_HOTPLUG From 46f3e7d394b23e93d274590d7bede5d62d80440b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:35 +0200 Subject: [PATCH 19/30] x86/boot/e820: Change struct e820_table::nr_entries type from __u32 to u32 __u32 is for UAPI headers, and this definition is the only place in the kernel-internal E820 code that uses __u32. Change it to u32. Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-20-mingo@kernel.org --- arch/x86/include/asm/e820/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index 80c4a7266629..df12f7ee75d3 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -83,7 +83,7 @@ struct e820_entry { * The whole array of E820 entries: */ struct e820_table { - __u32 nr_entries; + u32 nr_entries; struct e820_entry entries[E820_MAX_ENTRIES]; }; From cff02bff04f237b361fdc7066f043d00f0e3c872 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:36 +0200 Subject: [PATCH 20/30] x86/boot/e820: Clean up e820__setup_pci_gap()/e820_search_gap() a bit Apply misc cleanups: - Use a bit more readable variable names, we haven't run out of underscore characters in the kernel yet. - s/0x400000/SZ_4M - s/1024*1024/SZ_1M Suggested-by: Andy Shevchenko Signed-off-by: Ingo Molnar Cc: Arnd Bergmann Cc: David Woodhouse Cc: H . Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-21-mingo@kernel.org --- arch/x86/kernel/e820.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 39f29bf29b7c..b8edc5e32d87 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -638,7 +638,7 @@ static void __init e820__update_table_kexec(void) /* * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). 
*/ -static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) +static int __init e820_search_gap(unsigned long *gap_start, unsigned long *gap_size) { u64 last = MAX_GAP_END; int idx = e820_table->nr_entries; @@ -655,9 +655,9 @@ static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsiz if (last > end) { unsigned long gap = last - end; - if (gap >= *gapsize) { - *gapsize = gap; - *gapstart = end; + if (gap >= *gap_size) { + *gap_size = gap; + *gap_start = end; found = 1; } } @@ -677,29 +677,29 @@ static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsiz */ __init void e820__setup_pci_gap(void) { - unsigned long gapstart, gapsize; + unsigned long gap_start, gap_size; int found; - gapsize = 0x400000; - found = e820_search_gap(&gapstart, &gapsize); + gap_size = SZ_4M; + found = e820_search_gap(&gap_start, &gap_size); if (!found) { #ifdef CONFIG_X86_64 - gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; + gap_start = (max_pfn << PAGE_SHIFT) + SZ_1M; pr_err("Cannot find an available gap in the 32-bit address range\n"); pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); #else - gapstart = 0x10000000; + gap_start = 0x10000000; #endif } /* * e820__reserve_resources_late() protects stolen RAM already: */ - pci_mem_start = gapstart; + pci_mem_start = gap_start; pr_info("[gap %#010lx-%#010lx] available for PCI devices\n", - gapstart, gapstart + gapsize - 1); + gap_start, gap_start + gap_size - 1); } /* From f40f3f32b34562672364f02f1b7f7929b8467768 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:37 +0200 Subject: [PATCH 21/30] x86/boot/e820: Change e820_search_gap() to search for the highest-address PCI gap Right now the main x86 function that determines the position and size of the 'PCI gap', e820_search_gap(), has this curious property: while (--idx >= 0) { ... if (gap >= *gap_size) { I.e. 
it will iterate the E820 table backwards, from its end to the beginning, and will search for larger and larger gaps in the memory map below 4GB, until it finishes with the table. This logic will, should there be two gaps with the same size, pick the one with the lower physical address - which is contrary to usual practice that the PCI gap is just below 4GB. Furthermore, the commit that introduced this weird logic 16 years ago: 3381959da5a0 ("x86: cleanup e820_setup_gap(), add e820_search_gap(), v2") - if (gap > gapsize) { + if (gap >= *gapsize) { didn't even declare this change, the title says it's a cleanup, and the changelog declares it as a preparatory refactoring for a later bugfix: 809d9a8f93bd ("x86/PCI: ACPI based PCI gap calculation") which bugfix was reverted only 1 day later without much of an explanation, and was never reintroduced: 58b6e5538460 ("Revert "x86/PCI: ACPI based PCI gap calculation"") So based on the Git archeology and by the plain reading of the code I declare this '>=' change an unintended bug and side effect. Change it to '>' again. It should not make much of a difference in practice, as the likelihood of having *two* largest gaps with exactly the same size is very low outside of weird user-provided memory maps. Signed-off-by: Ingo Molnar Cc: H .
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-22-mingo@kernel.org --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b8edc5e32d87..3fba5406502a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -655,7 +655,7 @@ static int __init e820_search_gap(unsigned long *gap_start, unsigned long *gap_s if (last > end) { unsigned long gap = last - end; - if (gap >= *gap_size) { + if (gap > *gap_size) { *gap_size = gap; *gap_start = end; found = 1; From 95060e411ffd1be5db641d469b759912abad3332 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:38 +0200 Subject: [PATCH 22/30] x86/boot/e820: Rename gap_start/gap_size to max_gap_start/max_gap_size in e820_search_gap() et al The PCI gap searching functions pass around pointers to the gap_start/gap_size variables, which refer to the maximum size gap found so far. Rename the variables to say so, and disambiguate their namespace from 'current gap' variables. Signed-off-by: Ingo Molnar Cc: H . Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-23-mingo@kernel.org --- arch/x86/kernel/e820.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3fba5406502a..f5828029829f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -638,7 +638,7 @@ static void __init e820__update_table_kexec(void) /* * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
*/ -static int __init e820_search_gap(unsigned long *gap_start, unsigned long *gap_size) +static int __init e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size) { u64 last = MAX_GAP_END; int idx = e820_table->nr_entries; @@ -655,9 +655,9 @@ static int __init e820_search_gap(unsigned long *gap_start, unsigned long *gap_s if (last > end) { unsigned long gap = last - end; - if (gap > *gap_size) { - *gap_size = gap; - *gap_start = end; + if (gap > *max_gap_size) { + *max_gap_size = gap; + *max_gap_start = end; found = 1; } } @@ -677,29 +677,29 @@ static int __init e820_search_gap(unsigned long *gap_start, unsigned long *gap_s */ __init void e820__setup_pci_gap(void) { - unsigned long gap_start, gap_size; + unsigned long max_gap_start, max_gap_size; int found; - gap_size = SZ_4M; - found = e820_search_gap(&gap_start, &gap_size); + max_gap_size = SZ_4M; + found = e820_search_gap(&max_gap_start, &max_gap_size); if (!found) { #ifdef CONFIG_X86_64 - gap_start = (max_pfn << PAGE_SHIFT) + SZ_1M; + max_gap_start = (max_pfn << PAGE_SHIFT) + SZ_1M; pr_err("Cannot find an available gap in the 32-bit address range\n"); pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); #else - gap_start = 0x10000000; + max_gap_start = 0x10000000; #endif } /* * e820__reserve_resources_late() protects stolen RAM already: */ - pci_mem_start = gap_start; + pci_mem_start = max_gap_start; pr_info("[gap %#010lx-%#010lx] available for PCI devices\n", - gap_start, gap_start + gap_size - 1); + max_gap_start, max_gap_start + max_gap_size - 1); } /* From 7df2f811b275e6067f7e15a966a2b6ff22a4edfc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:39 +0200 Subject: [PATCH 23/30] x86/boot/e820: Simplify & clarify __e820__range_add() a bit Use 'entry_new' to make clear we are allocating a new entry. Change the table-full message to say that the table is full. Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-24-mingo@kernel.org --- arch/x86/kernel/e820.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f5828029829f..4758099a96bc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -165,16 +165,19 @@ int e820__get_entry_type(u64 start, u64 end) static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { u32 idx = table->nr_entries; + struct e820_entry *entry_new; if (idx >= ARRAY_SIZE(table->entries)) { - pr_err("too many E820 table entries; ignoring [mem %#010llx-%#010llx]\n", + pr_err("E820 table full; ignoring [mem %#010llx-%#010llx]\n", start, start + size-1); return; } - table->entries[idx].addr = start; - table->entries[idx].size = size; - table->entries[idx].type = type; + entry_new = table->entries + idx; + + entry_new->addr = start; + entry_new->size = size; + entry_new->type = type; table->nr_entries++; } From af0cf1646d9de812465c3fa134c8c5bcf85de118 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:40 +0200 Subject: [PATCH 24/30] x86/boot/e820: Standardize __init/__initdata tag placement So the e820.c file has a hodgepodge of __init and __initdata tag placements: static int __init e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size) __init void e820__setup_pci_gap(void) __init void e820__reallocate_tables(void) void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) void __init e820__register_nosave_regions(unsigned long limit_pfn) static int __init e820__register_nvs_regions(void) u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) Standardize on the style used by e820__setup_pci_gap() and place them before the storage class. 
In addition to the consistency, as a bonus this makes the grep output rather clean looking: __init void e820__range_remove(u64 start, u64 size, enum e820_type filter_type) __init void e820__update_table_print(void) __init static void e820__update_table_kexec(void) __init static int e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size) __init void e820__setup_pci_gap(void) __init void e820__reallocate_tables(void) __init void e820__memory_setup_extended(u64 phys_addr, u32 data_len) __init void e820__register_nosave_regions(unsigned long limit_pfn) __init static int e820__register_nvs_regions(void) ... and if one learns to just ignore the leftmost '__init' noise then the rest of the line looks just like a regular C function definition. With the 'mixed' tag placement style the __init tag breaks up the function's prototype for no good reason. Do the same for __initdata. Signed-off-by: Ingo Molnar Cc: H . Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-25-mingo@kernel.org --- arch/x86/kernel/e820.c | 92 +++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4758099a96bc..46960801c580 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -55,9 +55,9 @@ * re-propagated. So its main role is a temporary bootstrap storage of firmware * specific memory layout data during early bootup. 
*/ -static struct e820_table e820_table_init __initdata; -static struct e820_table e820_table_kexec_init __initdata; -static struct e820_table e820_table_firmware_init __initdata; +__initdata static struct e820_table e820_table_init; +__initdata static struct e820_table e820_table_kexec_init; +__initdata static struct e820_table e820_table_firmware_init; __refdata struct e820_table *e820_table = &e820_table_init; __refdata struct e820_table *e820_table_kexec = &e820_table_kexec_init; @@ -144,7 +144,7 @@ static struct e820_entry *__e820__mapped_all(u64 start, u64 end, /* * This function checks if the entire range is mapped with type. */ -bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +__init bool e820__mapped_all(u64 start, u64 end, enum e820_type type) { return __e820__mapped_all(start, end, type); } @@ -162,7 +162,7 @@ int e820__get_entry_type(u64 start, u64 end) /* * Add a memory region to the kernel E820 map. */ -static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) +__init static void __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { u32 idx = table->nr_entries; struct e820_entry *entry_new; @@ -182,12 +182,12 @@ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 si table->nr_entries++; } -void __init e820__range_add(u64 start, u64 size, enum e820_type type) +__init void e820__range_add(u64 start, u64 size, enum e820_type type) { __e820__range_add(e820_table, start, size, type); } -static void __init e820_print_type(enum e820_type type) +__init static void e820_print_type(enum e820_type type) { switch (type) { case E820_TYPE_RAM: pr_cont(" System RAM"); break; @@ -202,7 +202,7 @@ static void __init e820_print_type(enum e820_type type) } } -static void __init e820__print_table(const char *who) +__init static void e820__print_table(const char *who) { u64 range_end_prev = 0; u32 idx; @@ -301,12 +301,12 @@ struct change_member { 
u64 addr; }; -static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; -static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; -static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; -static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata; +__initdata static struct change_member change_point_list[2*E820_MAX_ENTRIES]; +__initdata static struct change_member *change_point[2*E820_MAX_ENTRIES]; +__initdata static struct e820_entry *overlap_list[E820_MAX_ENTRIES]; +__initdata static struct e820_entry new_entries[E820_MAX_ENTRIES]; -static int __init cpcompare(const void *a, const void *b) +__init static int cpcompare(const void *a, const void *b) { struct change_member * const *app = a, * const *bpp = b; const struct change_member *ap = *app, *bp = *bpp; @@ -341,7 +341,7 @@ static bool e820_type_mergeable(enum e820_type type) return true; } -int __init e820__update_table(struct e820_table *table) +__init int e820__update_table(struct e820_table *table) { struct e820_entry *entries = table->entries; u32 max_nr_entries = ARRAY_SIZE(table->entries); @@ -441,7 +441,7 @@ int __init e820__update_table(struct e820_table *table) return 0; } -static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +__init static int __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { struct boot_e820_entry *entry = entries; @@ -472,7 +472,7 @@ static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_en * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +__init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { /* Only one memory region (or negative)? 
Ignore it */ if (nr_entries < 2) @@ -481,7 +481,7 @@ static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entr return __append_e820_table(entries, nr_entries); } -static u64 __init +__init static u64 __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; @@ -549,19 +549,19 @@ __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_ty return real_updated_size; } -u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +__init u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { return __e820__range_update(e820_table, start, size, old_type, new_type); } -u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, +__init u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ -u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) +__init u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { u32 idx; u64 end; @@ -622,7 +622,7 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool return real_removed_size; } -void __init e820__update_table_print(void) +__init void e820__update_table_print(void) { if (e820__update_table(e820_table)) return; @@ -631,7 +631,7 @@ void __init e820__update_table_print(void) e820__print_table("modified"); } -static void __init e820__update_table_kexec(void) +__init static void e820__update_table_kexec(void) { e820__update_table(e820_table_kexec); } @@ -641,7 +641,7 @@ static void __init e820__update_table_kexec(void) /* * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). 
*/ -static int __init e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size) +__init static int e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size) { u64 last = MAX_GAP_END; int idx = e820_table->nr_entries; @@ -744,7 +744,7 @@ __init void e820__reallocate_tables(void) * the remaining (if any) entries are passed via the SETUP_E820_EXT node of * struct setup_data, which is parsed here. */ -void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) +__init void e820__memory_setup_extended(u64 phys_addr, u32 data_len) { int entries; struct boot_e820_entry *extmap; @@ -773,7 +773,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) * This function requires the E820 map to be sorted and without any * overlapping entries. */ -void __init e820__register_nosave_regions(unsigned long limit_pfn) +__init void e820__register_nosave_regions(unsigned long limit_pfn) { u32 idx; u64 last_addr = 0; @@ -798,7 +798,7 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn) * Register ACPI NVS memory regions, so that we can save/restore them during * hibernation and the subsequent resume: */ -static int __init e820__register_nvs_regions(void) +__init static int e820__register_nvs_regions(void) { u32 idx; @@ -822,7 +822,7 @@ core_initcall(e820__register_nvs_regions); * This allows kexec to fake a new mptable, as if it came from the real * system. 
*/ -u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) +__init u64 e820__memblock_alloc_reserved(u64 size, u64 align) { u64 addr; @@ -849,7 +849,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) +__init static unsigned long e820__end_ram_pfn(unsigned long limit_pfn) { u32 idx; unsigned long last_pfn = 0; @@ -885,20 +885,20 @@ static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) return last_pfn; } -unsigned long __init e820__end_of_ram_pfn(void) +__init unsigned long e820__end_of_ram_pfn(void) { return e820__end_ram_pfn(MAX_ARCH_PFN); } -unsigned long __init e820__end_of_low_ram_pfn(void) +__init unsigned long e820__end_of_low_ram_pfn(void) { return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT)); } -static int userdef __initdata; +__initdata static int userdef; /* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ -static int __init parse_memopt(char *p) +__init static int parse_memopt(char *p) { u64 mem_size; @@ -932,7 +932,7 @@ static int __init parse_memopt(char *p) } early_param("mem", parse_memopt); -static int __init parse_memmap_one(char *p) +__init static int parse_memmap_one(char *p) { char *oldp; u64 start_at, mem_size; @@ -989,7 +989,7 @@ static int __init parse_memmap_one(char *p) return *p == '\0' ? 
0 : -EINVAL; } -static int __init parse_memmap_opt(char *str) +__init static int parse_memmap_opt(char *str) { while (str) { char *k = strchr(str, ','); @@ -1010,7 +1010,7 @@ early_param("memmap", parse_memmap_opt); * have been processed, in which case we already have an E820 table filled in * via the parameter callback function(s), but it's not sorted and printed yet: */ -void __init e820__finish_early_params(void) +__init void e820__finish_early_params(void) { if (userdef) { if (e820__update_table(e820_table) < 0) @@ -1021,7 +1021,7 @@ void __init e820__finish_early_params(void) } } -static const char *__init e820_type_to_string(struct e820_entry *entry) +__init static const char * e820_type_to_string(struct e820_entry *entry) { switch (entry->type) { case E820_TYPE_RAM: return "System RAM"; @@ -1036,7 +1036,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) } } -static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) +__init static unsigned long e820_type_to_iomem_type(struct e820_entry *entry) { switch (entry->type) { case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; @@ -1051,7 +1051,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) } } -static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) +__init static unsigned long e820_type_to_iores_desc(struct e820_entry *entry) { switch (entry->type) { case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; @@ -1069,13 +1069,13 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) /* * We assign one resource entry for each E820 map entry: */ -static struct resource __initdata *e820_res; +__initdata static struct resource *e820_res; /* * Is this a device address region that should not be marked busy? * (Versus system address regions that we register & lock early.) 
*/ -static bool __init e820_device_region(enum e820_type type, struct resource *res) +__init static bool e820_device_region(enum e820_type type, struct resource *res) { /* This is the legacy BIOS/DOS ROM-shadow + MMIO region: */ if (res->start < (1ULL<<20)) @@ -1104,7 +1104,7 @@ static bool __init e820_device_region(enum e820_type type, struct resource *res) /* * Mark E820 system regions as busy for the resource manager: */ -void __init e820__reserve_resources(void) +__init void e820__reserve_resources(void) { u32 idx; struct resource *res; @@ -1151,7 +1151,7 @@ void __init e820__reserve_resources(void) /* * How much should we pad the end of RAM, depending on where it is? */ -static unsigned long __init ram_alignment(resource_size_t pos) +__init static unsigned long ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1169,7 +1169,7 @@ static unsigned long __init ram_alignment(resource_size_t pos) #define MAX_RESOURCE_SIZE ((resource_size_t)-1) -void __init e820__reserve_resources_late(void) +__init void e820__reserve_resources_late(void) { u32 idx; struct resource *res; @@ -1219,7 +1219,7 @@ void __init e820__reserve_resources_late(void) /* * Pass the firmware (bootloader) E820 map to the kernel and process it: */ -char *__init e820__memory_setup_default(void) +__init char * e820__memory_setup_default(void) { char *who = "BIOS-e820"; @@ -1257,7 +1257,7 @@ char *__init e820__memory_setup_default(void) * E820 map - with an optional platform quirk available for virtual platforms * to override this method of boot environment processing: */ -void __init e820__memory_setup(void) +__init void e820__memory_setup(void) { char *who; @@ -1273,7 +1273,7 @@ void __init e820__memory_setup(void) e820__print_table(who); } -void __init e820__memblock_setup(void) +__init void e820__memblock_setup(void) { u32 idx; u64 end; From 157266edcc56715323de1bd60e49194b3b66a174 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:41 +0200 Subject: [PATCH 
25/30] x86/boot/e820: Simplify append_e820_table() and remove restriction on single-entry tables So append_e820_table() begins with this weird condition that checks 'nr_entries': static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { /* Only one memory region (or negative)? Ignore it */ if (nr_entries < 2) return -1; Firstly, 'nr_entries' has been a u32 since 2017 and cannot be negative. Secondly, there's nothing inherently wrong with single-entry E820 maps, especially in virtualized environments. So remove this restriction and remove the __append_e820_table() indirection. Also: - fix/update comments - remove obsolete comments This shrinks the generated code a bit as well: text data bss dec hex filename 7549 44072 0 51621 c9a5 e820.o.before 7533 44072 0 51605 c995 e820.o.after Signed-off-by: Ingo Molnar Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: H . Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-26-mingo@kernel.org --- arch/x86/kernel/e820.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 46960801c580..806fd92c226a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -441,17 +441,22 @@ __init int e820__update_table(struct e820_table *table) return 0; } -__init static int __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +/* + * Copy the BIOS E820 map into the kernel's e820_table. + * + * Sanity-check it while we're at it.. 
+ */ +__init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { struct boot_e820_entry *entry = entries; while (nr_entries) { u64 start = entry->addr; - u64 size = entry->size; - u64 end = start + size - 1; - u32 type = entry->type; + u64 size = entry->size; + u64 end = start + size-1; + u32 type = entry->type; - /* Ignore the entry on 64-bit overflow: */ + /* Ignore the remaining entries on 64-bit overflow: */ if (start > end && likely(size)) return -1; @@ -463,24 +468,6 @@ __init static int __append_e820_table(struct boot_e820_entry *entries, u32 nr_en return 0; } -/* - * Copy the BIOS E820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - */ -__init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) -{ - /* Only one memory region (or negative)? 
Ignore it */ - if (nr_entries < 2) - return -1; - - return __append_e820_table(entries, nr_entries); -} - __init static u64 __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { @@ -754,7 +741,7 @@ __init void e820__memory_setup_extended(u64 phys_addr, u32 data_len) entries = sdata->len / sizeof(*extmap); extmap = (struct boot_e820_entry *)(sdata->data); - __append_e820_table(extmap, entries); + append_e820_table(extmap, entries); e820__update_table(e820_table); memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); From 8b886d8a4db9a75c22cf7d0939f63ca811486efd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:42 +0200 Subject: [PATCH 26/30] x86/boot/e820: Remove e820__range_remove()'s unused return parameter None of the usage sites make use of the 'real_removed_size' return parameter of e820__range_remove(), and it's hard to contemplate much constructive use: E820 maps can have holes, and removing a fixed range may result in removal of any number of bytes from 0 to the requested size. So remove this pointless calculation. This simplifies the function a bit: text data bss dec hex filename 7645 44072 0 51717 ca05 e820.o.before 7597 44072 0 51669 c9d5 e820.o.after Signed-off-by: Ingo Molnar Cc: H . 
Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-27-mingo@kernel.org --- arch/x86/include/asm/e820/api.h | 2 +- arch/x86/kernel/e820.c | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 54427b77bc19..9cf416f7a84f 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -16,7 +16,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); -extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); +extern void e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern int e820__update_table(struct e820_table *table); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 806fd92c226a..dfbc6e1f3290 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -548,11 +548,10 @@ __init u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, } /* Remove a range of memory from the E820 table: */ -__init u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) +__init void e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) { u32 idx; u64 end; - u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; @@ -575,7 +574,6 @@ __init u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool /* Completely covered? 
*/ if (entry->addr >= start && entry_end <= end) { - real_removed_size += entry->size; memset(entry, 0, sizeof(*entry)); continue; } @@ -584,7 +582,6 @@ __init u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool if (entry->addr < start && entry_end > end) { e820__range_add(end, entry_end - end, entry->type); entry->size = start - entry->addr; - real_removed_size += size; continue; } @@ -594,8 +591,6 @@ __init u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool if (final_start >= final_end) continue; - real_removed_size += final_end - final_start; - /* * Left range could be head or tail, so need to update * the size first: @@ -606,7 +601,6 @@ __init u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool entry->addr = final_end; } - return real_removed_size; } __init void e820__update_table_print(void) From 4ad03f133c9e509099907df56717a01468aedfbc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:43 +0200 Subject: [PATCH 27/30] x86/boot/e820: Simplify the e820__range_remove() API Right now e820__range_remove() has two parameters to control the E820 type of the range removed: extern void e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); Since E820 types start at 1, zero has a natural meaning of 'no type'. Consolidate the (old_type,check_type) parameters into a single (filter_type) parameter: extern void e820__range_remove(u64 start, u64 size, enum e820_type filter_type); Note that both e820__mapped_raw_any() and e820__mapped_any() already have such semantics for their 'type' parameter, although it's currently not used with '0' by in-kernel code. Also, the __e820__mapped_all() internal helper already has such semantics implemented as well, and the e820__get_entry_type() API uses the '0' type to such effect. 
This simplifies not just e820__range_remove(), and synchronizes its use of type filters with other E820 API functions, but simplifies usage sites as well, such as parse_memmap_one(), beyond the reduction of the number of parameters: - else if (from) - e820__range_remove(start_at, mem_size, from, 1); else - e820__range_remove(start_at, mem_size, 0, 0); + e820__range_remove(start_at, mem_size, from); The generated code gets smaller as well: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-66 (-66) Function old new delta parse_memopt 112 107 -5 efi_init 1048 1039 -9 setup_arch 2719 2709 -10 e820__range_remove 283 273 -10 parse_memmap_opt 559 527 -32 Total: Before=22,675,600, After=22,675,534, chg -0.00% Signed-off-by: Ingo Molnar Cc: H . Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-28-mingo@kernel.org --- arch/x86/include/asm/e820/api.h | 2 +- arch/x86/kernel/e820.c | 16 +++++++--------- arch/x86/kernel/setup.c | 4 ++-- arch/x86/platform/efi/efi.c | 3 +-- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 9cf416f7a84f..bbe0c8de976c 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -16,7 +16,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); -extern void e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); +extern void e820__range_remove(u64 start, u64 size, enum e820_type filter_type); extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern int e820__update_table(struct 
e820_table *table); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index dfbc6e1f3290..c4b9a24aeaa2 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -548,7 +548,7 @@ __init u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, } /* Remove a range of memory from the E820 table: */ -__init void e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) +__init void e820__range_remove(u64 start, u64 size, enum e820_type filter_type) { u32 idx; u64 end; @@ -558,8 +558,8 @@ __init void e820__range_remove(u64 start, u64 size, enum e820_type old_type, boo end = start + size; printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx]", start, end - 1); - if (check_type) - e820_print_type(old_type); + if (filter_type) + e820_print_type(filter_type); pr_cont("\n"); for (idx = 0; idx < e820_table->nr_entries; idx++) { @@ -567,7 +567,7 @@ __init void e820__range_remove(u64 start, u64 size, enum e820_type old_type, boo u64 final_start, final_end; u64 entry_end; - if (check_type && entry->type != old_type) + if (filter_type && entry->type != filter_type) continue; entry_end = entry->addr + entry->size; @@ -903,7 +903,7 @@ __init static int parse_memopt(char *p) if (mem_size == 0) return -EINVAL; - e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM); #ifdef CONFIG_MEMORY_HOTPLUG max_mem_size = mem_size; @@ -959,12 +959,10 @@ __init static int parse_memmap_one(char *p) e820__range_update(start_at, mem_size, from, to); else if (to) e820__range_add(start_at, mem_size, to); - else if (from) - e820__range_remove(start_at, mem_size, from, 1); else - e820__range_remove(start_at, mem_size, 0, 0); + e820__range_remove(start_at, mem_size, from); } else { - e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM); } return *p == '\0' ? 
0 : -EINVAL; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a231b249d23b..ffbd04ee0f68 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -763,7 +763,7 @@ static void __init trim_bios_range(void) * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. */ - e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM); e820__update_table(e820_table); } @@ -785,7 +785,7 @@ static void __init e820_add_kernel_range(void) return; pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); - e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_remove(start, size, 0); e820__range_add(start, size, E820_TYPE_RAM); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 463b784499a8..d00c6de7f3b7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -333,8 +333,7 @@ static void __init efi_remove_e820_mmio(void) if (size >= 256*1024) { pr_info("Remove mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluMB) from e820 map\n", i, start, end, size >> 20); - e820__range_remove(start, size, - E820_TYPE_RESERVED, 1); + e820__range_remove(start, size, E820_TYPE_RESERVED); } else { pr_info("Not removing mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluKB) from e820 map\n", i, start, end, size >> 10); From 0d9daff41418cbc762e4b6ec683e0a5ec4cdb5f3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:05:44 +0200 Subject: [PATCH 28/30] x86/boot/e820: Make sure e820_search_gap() finds all gaps The current implementation of e820_search_gap() searches gaps in a reverse search from MAX_GAP_END back to 0, contrary to what its main comment claims: * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). But gaps can not only be beyond E820 RAM ranges, they can be below them as well. 
For example this function will not find the proper PCI gap for simplified memory map layouts that have a single RAM range that crosses the 4GB boundary. Rework the function to have a proper forward search of E820 table entries. This makes the code somewhat bigger: text data bss dec hex filename 7613 44072 0 51685 c9e5 e820.o.before 7645 44072 0 51717 ca05 e820.o.after but it now both implements what it claims to do, and is more straightforward to read. ( This also allows 'idx' to be the regular u32 again, not an 'int' underflowing to -1. ) Signed-off-by: Ingo Molnar Cc: H . Peter Anvin Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Woodhouse Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Mike Rapoport Cc: Paul Menzel Cc: Peter Zijlstra Link: https://patch.msgid.link/20250515120549.2820541-29-mingo@kernel.org --- arch/x86/kernel/e820.c | 59 +++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c4b9a24aeaa2..d1b1786d006b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -624,30 +624,52 @@ __init static void e820__update_table_kexec(void) */ __init static int e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size) { - u64 last = MAX_GAP_END; - int idx = e820_table->nr_entries; + struct e820_entry *entry; + u64 range_end_prev = 0; int found = 0; + u32 idx; - while (--idx >= 0) { - u64 start = e820_table->entries[idx].addr; - u64 end = start + e820_table->entries[idx].size; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + u64 range_start, range_end; - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true: - */ - if (last > end) { - unsigned long gap = last - end; + entry = e820_table->entries + idx; + range_start = entry->addr; + range_end = entry->addr + entry->size; - if (gap > *max_gap_size) { - *max_gap_size = gap; - *max_gap_start = end; - found = 1; + /* Process any 
gap before this entry: */ + if (range_start > range_end_prev) { + u64 gap_start = range_end_prev; + u64 gap_end = range_start; + u64 gap_size; + + if (gap_start < MAX_GAP_END) { + /* Make sure the entirety of the gap is below MAX_GAP_END: */ + gap_end = min(gap_end, MAX_GAP_END); + gap_size = gap_end-gap_start; + + if (gap_size >= *max_gap_size) { + *max_gap_start = gap_start; + *max_gap_size = gap_size; + found = 1; + } } } - if (start < last) - last = start; + + range_end_prev = range_end; } + + /* Is there a usable gap beyond the last entry: */ + if (entry->addr + entry->size < MAX_GAP_END) { + u64 gap_start = entry->addr + entry->size; + u64 gap_size = MAX_GAP_END-gap_start; + + if (gap_size >= *max_gap_size) { + *max_gap_start = gap_start; + *max_gap_size = gap_size; + found = 1; + } + } + return found; } @@ -664,6 +686,7 @@ __init void e820__setup_pci_gap(void) unsigned long max_gap_start, max_gap_size; int found; + /* The minimum eligible gap size is 4MB: */ max_gap_size = SZ_4M; found = e820_search_gap(&max_gap_start, &max_gap_size); @@ -683,7 +706,7 @@ __init void e820__setup_pci_gap(void) pci_mem_start = max_gap_start; pr_info("[gap %#010lx-%#010lx] available for PCI devices\n", - max_gap_start, max_gap_start + max_gap_size - 1); + max_gap_start, max_gap_start + max_gap_size-1); } /* From 6c08d768a528ad22016850a481d67bfc8cdb9d4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2025 09:22:37 +0100 Subject: [PATCH 29/30] x86/boot/e820: Use symbols for literals Use the human-readable SZ_* constants. 
Suggested-by: Nikolay Borisov Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/92a15c2d-055c-4f4e-b232-32030a8e5e54@suse.com --- arch/x86/kernel/e820.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d1b1786d006b..97b54bd0f482 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init static void e820__update_table_kexec(void) e820__update_table(e820_table_kexec); } -#define MAX_GAP_END 0x100000000ull +#define MAX_GAP_END SZ_4G /* * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). @@ -696,7 +696,7 @@ __init void e820__setup_pci_gap(void) pr_err("Cannot find an available gap in the 32-bit address range\n"); pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); #else - max_gap_start = 0x10000000; + max_gap_start = SZ_256M; #endif } @@ -1080,7 +1080,7 @@ __initdata static struct resource *e820_res; __init static bool e820_device_region(enum e820_type type, struct resource *res) { /* This is the legacy BIOS/DOS ROM-shadow + MMIO region: */ - if (res->start < (1ULL<<20)) + if (res->start < SZ_1M) return false; /* From 2a11e1479ef07519bfd6b64ee276905ca84cf817 Mon Sep 17 00:00:00 2001 From: Shenghao Yang Date: Sat, 17 Jan 2026 15:28:27 +0800 Subject: [PATCH 30/30] x86/acpi: Add acpi=spcr to use SPCR-provided default console MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SPCR provided console on x86 is only available as a boot console when earlycon is provided on the kernel command line, and will not be present in /proc/consoles. While it's possible to retain the boot console with the keep_bootcon parameter, that leaves the console using the less efficient 8250_early driver. 
Users wanting to use the firmware suggested console (to avoid maintaining unique serial console parameters for different server models in large fleets) with the conventional driver have to parse the kernel log for the console parameters and reinsert them. [ 0.005091] ACPI: SPCR 0x000000007FFB5000 000059 (v04 ALASKA A M I 01072009 INTL 20250404) [ 0.073387] ACPI: SPCR: console: uart,io,0x3f8,115200 In commit 0231d00082f6 ("ACPI: SPCR: Make SPCR available to x86")¹ the SPCR console was only added as an option for earlycon but not as an ordinary console so users don't see console output changes. So users can opt in to an automatic SPCR console, make ACPI init add it if acpi=spcr is set. ¹https://lore.kernel.org/lkml/20180118150951.28964-1-prarit@redhat.com/ [ bp: Touchups. ] Signed-off-by: Shenghao Yang Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260117072827.355360-1-me@shenghaoyang.info --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ arch/x86/kernel/acpi/boot.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..4d2f0bf1f8ab 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -125,6 +125,8 @@ Kernel parameters may result in duplicate corrected error reports. 
nospcr -- disable console in ACPI SPCR table as default _serial_ console on ARM64 + spcr -- enable console in ACPI SPCR table as + default _serial_ console on x86 For ARM64, ONLY "acpi=off", "acpi=on", "acpi=force" or "acpi=nospcr" are available For RISCV64, ONLY "acpi=off", "acpi=on" or "acpi=force" diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d6138b2b633a..a3f2fb1fea1b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -48,7 +48,8 @@ EXPORT_SYMBOL(acpi_disabled); int acpi_noirq; /* skip ACPI IRQ initialization */ static int acpi_nobgrt; /* skip ACPI BGRT */ -int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ +static int acpi_spcr_add __initdata; /* add SPCR-provided console */ +int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); int acpi_lapic; @@ -1669,8 +1670,8 @@ int __init acpi_boot_init(void) if (!acpi_noirq) x86_init.pci.init = pci_acpi_init; - /* Do not enable ACPI SPCR console by default */ - acpi_parse_spcr(earlycon_acpi_spcr_enable, false); + acpi_parse_spcr(earlycon_acpi_spcr_enable, acpi_spcr_add); + return 0; } @@ -1707,6 +1708,10 @@ static int __init parse_acpi(char *arg) /* "acpi=nocmcff" disables FF mode for corrected errors */ else if (strcmp(arg, "nocmcff") == 0) { acpi_disable_cmcff = 1; + } + /* "acpi=spcr" adds the SPCR-provided console as a preferred one */ + else if (strcmp(arg, "spcr") == 0) { + acpi_spcr_add = 1; } else { /* Core will printk when we return error. */ return -EINVAL;