From b0510ac74e189442dde8799c1b212bd106f2300c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 31 Mar 2025 16:13:25 +0800 Subject: x86/mm: Remove the arch-specific pgd_leaf() definition PGD huge pages are not supported yet, let's use the generic definition in . [ mingo: Cleaned up the changelog. ] Signed-off-by: Baoquan He Signed-off-by: Ingo Molnar Reviewed-by: Oscar Salvador Link: https://lore.kernel.org/r/20250331081327.256412-6-bhe@redhat.com --- arch/x86/include/asm/pgtable.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7bd6bd6df4a1..5f4fcc0eea17 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1472,9 +1472,6 @@ static inline bool pgdp_maps_userspace(void *__ptr) return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } -#define pgd_leaf pgd_leaf -static inline bool pgd_leaf(pgd_t pgd) { return false; } - #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages -- cgit v1.2.3 From c083eff324edd73eb23f4bd3f40f388a3e7c2cd2 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 31 Mar 2025 16:13:26 +0800 Subject: x86/mm: Remove the arch-specific p4d_leaf() definition P4D huge pages are not supported yet, let's use the generic definition in . [ mingo: Cleaned up the changelog. ] Signed-off-by: Baoquan He Signed-off-by: Ingo Molnar Reviewed-by: Oscar Salvador Link: https://lore.kernel.org/r/20250331081327.256412-7-bhe@redhat.com --- arch/x86/include/asm/pgtable.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5f4fcc0eea17..5ddba366d3b4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -292,13 +292,6 @@ static inline unsigned long pgd_pfn(pgd_t pgd) return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } -#define p4d_leaf p4d_leaf -static inline bool p4d_leaf(p4d_t p4d) -{ - /* No 512 GiB pages yet */ - return 0; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define pmd_leaf pmd_leaf -- cgit v1.2.3 From 2b00d9031e42eabc8d32847d231ef48b8be0373d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 31 Mar 2025 16:13:25 +0800 Subject: x86/mm: Simplify the pgd_leaf() and p4d_leaf() checks a bit The functions return bool, simplify the checks. [ mingo: Split off from two other patches. 
] Signed-off-by: Baoquan He Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250331081327.256412-6-bhe@redhat.com --- arch/x86/mm/pti.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 5f0d579932c6..190299834011 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -185,7 +185,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } - BUILD_BUG_ON(pgd_leaf(*pgd) != 0); + BUILD_BUG_ON(pgd_leaf(*pgd)); return p4d_offset(pgd, address); } @@ -206,7 +206,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!p4d) return NULL; - BUILD_BUG_ON(p4d_leaf(*p4d) != 0); + BUILD_BUG_ON(p4d_leaf(*p4d)); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) -- cgit v1.2.3 From 1701771d3069fbee154ca48e882e227fdcfbb583 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 1 Apr 2025 16:35:20 +0200 Subject: x86/mm: Stop prefetching current->mm->mmap_lock on page faults The prefetchw() dates back decades and the fundamental notion of doing something like this on a lock is shady. Moreover, for a few years now in the fast path faults are handled with RCU + per-vma locking, hopefully not even looking at the lock to begin with. As such just remove it. I did not see a point benchmarking this. Given that it is not expected to be looked at by default justifies not doing the prefetch. Signed-off-by: Mateusz Guzik Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250401143520.1113572-1-mjguzik@gmail.com --- arch/x86/mm/fault.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 296d294142c8..697432f63c59 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -13,7 +13,6 @@ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ #include /* hstate_index_to_shift */ -#include /* prefetchw */ #include /* exception_enter(), ... */ #include /* faulthandler_disabled() */ #include /* efi_crash_gracefully_on_page_fault()*/ @@ -1496,8 +1495,6 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); - prefetchw(¤t->mm->mmap_lock); - /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a -- cgit v1.2.3 From 1f13c60d84e880df6698441026e64f84c7110c49 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Wed, 2 Apr 2025 18:24:58 +0100 Subject: x86/idle: Remove MFENCEs for X86_BUG_CLFLUSH_MONITOR in mwait_idle_with_hints() and prefer_mwait_c1_over_halt() The following commit, 12 years ago: 7e98b7192046 ("x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers") added barriers around the CLFLUSH in mwait_idle_with_hints(), justified with: ... and add memory barriers around it since the documentation is explicit that CLFLUSH is only ordered with respect to MFENCE. This also triggered, 11 years ago, the same adjustment in: f8e617f45829 ("sched/idle/x86: Optimize unnecessary mwait_idle() resched IPIs") during development, although it failed to get the static_cpu_has_bug() treatment. 
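The pattern those commits added, reduced to a minimal user-space sketch (compiler intrinsics _mm_mfence()/_mm_clflush() standing in for the kernel's mb()/clflush(); an illustration, not the kernel code):

  #include <stdio.h>
  #include <x86intrin.h>

  /* Stand-in for current_thread_info()->flags, the line being monitored. */
  static long flags;

  int main(void)
  {
  	_mm_mfence();			/* barrier the old code added */
  	_mm_clflush(&flags);		/* flush the to-be-monitored line */
  	_mm_mfence();			/* barrier the old code added */
  	printf("flags = %ld\n", flags);
  	return 0;
  }
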
X86_BUG_CLFLUSH_MONITOR (a.k.a the AAI65 errata) is specific to Intel CPUs, and the SDM currently states: Executions of the CLFLUSH instruction are ordered with respect to each other and with respect to writes, locked read-modify-write instructions, and fence instructions[1]. With footnote 1 reading: Earlier versions of this manual specified that executions of the CLFLUSH instruction were ordered only by the MFENCE instruction. All processors implementing the CLFLUSH instruction also order it relative to the other operations enumerated above. i.e. The SDM was incorrect at the time, and barriers should not have been inserted. Double checking the original AAI65 errata (not available from intel.com any more) shows no mention of barriers either. Note: If this were a general codepath, the MFENCEs would be needed, because AMD CPUs of the same vintage do sport otherwise-unordered CLFLUSHs. Remove the unnecessary barriers. Furthermore, use a plain alternative(), rather than static_cpu_has_bug() and/or no optimisation. The workaround is a single instruction. Use an explicit %rax pointer rather than a general memory operand, because MONITOR takes the pointer implicitly in the same way. [ mingo: Cleaned up the commit a bit. ] Fixes: 7e98b7192046 ("x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers") Signed-off-by: Andrew Cooper Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Acked-by: Borislav Petkov (AMD) Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Rik van Riel Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Brian Gerst Cc: Juergen Gross Cc: Rafael J. Wysocki Link: https://lore.kernel.org/r/20250402172458.1378112-1-andrew.cooper3@citrix.com --- arch/x86/include/asm/mwait.h | 9 +++------ arch/x86/kernel/process.c | 9 +++------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index ce857ef54cf1..54dc313bcdf0 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -116,13 +116,10 @@ static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) { - if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) { - mb(); - clflush((void *)¤t_thread_info()->flags); - mb(); - } + const void *addr = ¤t_thread_info()->flags; - __monitor((void *)¤t_thread_info()->flags, 0, 0); + alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); + __monitor(addr, 0, 0); if (!need_resched()) { if (ecx & 1) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 91f6ff618852..bda47d93c8f9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -907,13 +907,10 @@ static __init bool prefer_mwait_c1_over_halt(void) static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - mb(); /* quirk */ - clflush((void *)¤t_thread_info()->flags); - mb(); /* quirk */ - } + const void *addr = ¤t_thread_info()->flags; - __monitor((void *)¤t_thread_info()->flags, 0, 0); + alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); + __monitor(addr, 0, 0); if (!need_resched()) { __sti_mwait(0, 0); raw_local_irq_disable(); -- cgit v1.2.3 From 1ae899e413105aa81068d0282ab6e22974891d74 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 2 Apr 2025 20:08:05 +0200 Subject: x86/idle: Standardize argument types 
for MONITOR{,X} and MWAIT{,X} instruction wrappers on 'u32' MONITOR and MONITORX expect 32-bit unsigned integer arguments in the %ecx and %edx registers. MWAIT and MWAITX expect 32-bit unsigned int arguments in the %eax and %ecx registers. Some of the helpers around these instructions in <asm/mwait.h> are using too-wide types (long); standardize on u32 instead, which makes it clear that this is a hardware ABI. [ mingo: Cleaned up the changelog. ] Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Juergen Gross Cc: Andrew Cooper Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250402180827.3762-1-ubizjak@gmail.com --- arch/x86/include/asm/mwait.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 54dc313bcdf0..3377869ff2e8 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -25,23 +25,21 @@ #define TPAUSE_C01_STATE 1 #define TPAUSE_C02_STATE 0 -static __always_inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) +static __always_inline void __monitor(const void *eax, u32 ecx, u32 edx) { /* "monitor %eax, %ecx, %edx;" */ asm volatile(".byte 0x0f, 0x01, 0xc8;" :: "a" (eax), "c" (ecx), "d"(edx)); } -static __always_inline void __monitorx(const void *eax, unsigned long ecx, - unsigned long edx) +static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx) { /* "monitorx %eax, %ecx, %edx;" */ asm volatile(".byte 0x0f, 0x01, 0xfa;" :: "a" (eax), "c" (ecx), "d"(edx)); } -static __always_inline void __mwait(unsigned long eax, unsigned long ecx) +static __always_inline void __mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); @@ -76,8 +74,7 @@ static __always_inline void __mwait(unsigned long eax, unsigned long ecx) * EAX (logical) address to monitor * ECX #GP if not zero */ -static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, - unsigned long ecx) +static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx) { /* No MDS buffer clear as this is AMD/HYGON only */ @@ -95,7 +92,7 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, * executing mwait, it would otherwise go unnoticed and the next tick * would not be reprogrammed accordingly before mwait ever wakes up. */ -static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) +static __always_inline void __sti_mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); /* "mwait %eax, %ecx;" */ -- cgit v1.2.3 From 19c3dcd953bc8961ab486cf05f5dd45455393c42 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 2 Apr 2025 20:08:07 +0200 Subject: x86/idle: Remove .s output beautifying delimiters from simpler asm() templates Delimiters in asm() templates such as ';', '\t' or '\n' are not required syntactically; they were used historically in the Linux kernel to prettify the compiler's .s output for people who were looking at compiler-generated .s output. Most x86 developers these days are primarily looking at: 1) objdump --disassemble-all .o 2) perf top's live kernel function annotation and disassembler feature that uses /dev/mem. ... 
because: - this kind of assembler output is standardized regardless of compiler used, - it's generally less messy looking, - it gives ground-truth instead of being some intermediate layer in the toolchain that might or might not be the real deal, - and on a live kernel it also sees through the kernel's various layers of runtime patching code obfuscation facilities, also known as: alternative-instructions, tracepoints and jump labels. There are some cases where the .s output is the most useful tool, such as alternatives() code generation, but other than that these delimiters used in simple asm() statements mostly add noise to the source code side, which isn't desirable for assembly code that is fragile enough already. Remove the delimiters for <asm/mwait.h>, which also happens to make the GCC inliner's asm() instruction length heuristics more accurate... [ mingo: Wrote a new changelog to give historic context and to give people a chance to object. :-) ] Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Juergen Gross Cc: Andrew Cooper Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250402180827.3762-3-ubizjak@gmail.com --- arch/x86/include/asm/mwait.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 3377869ff2e8..0e020a69b431 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -34,8 +34,8 @@ static __always_inline void __monitor(const void *eax, u32 ecx, u32 edx) static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx) { - /* "monitorx %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xfa;" + /* "monitorx %eax, %ecx, %edx" */ + asm volatile(".byte 0x0f, 0x01, 0xfa" :: "a" (eax), "c" (ecx), "d"(edx)); } @@ -78,8 +78,8 @@ static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx) { /* No MDS buffer clear as this is AMD/HYGON only */ - /* "mwaitx %eax, %ebx, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xfb;" + /* "mwaitx %eax, %ebx, %ecx" */ + asm volatile(".byte 0x0f, 0x01, 0xfb" :: "a" (eax), "b" (ebx), "c" (ecx)); } @@ -138,13 +138,13 @@ static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned lo */ static inline void __tpause(u32 ecx, u32 edx, u32 eax) { - /* "tpause %ecx, %edx, %eax;" */ + /* "tpause %ecx, %edx, %eax" */ #ifdef CONFIG_AS_TPAUSE - asm volatile("tpause %%ecx\n" + asm volatile("tpause %%ecx" : : "c"(ecx), "d"(edx), "a"(eax)); #else - asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1" : : "c"(ecx), "d"(edx), "a"(eax)); #endif -- cgit v1.2.3 From a72d55dc3bd6555cc1f97459b42b7f62ae480f13 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 2 Apr 2025 20:08:08 +0200 Subject: x86/idle: Remove CONFIG_AS_TPAUSE There is not much point in CONFIG_AS_TPAUSE at all when the emitted assembly is always the same - it only obfuscates the __tpause() code in essence. Remove the TPAUSE insn mnemonic from __tpause() and leave only the equivalent byte-wise definition. This can then be changed back to the insn mnemonic once binutils 2.31.1 is the minimum version to build the kernel. (Right now it's 2.25.) Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Juergen Gross Cc: Andrew Cooper Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250402180827.3762-4-ubizjak@gmail.com --- arch/x86/Kconfig.assembler | 4 ---- arch/x86/include/asm/mwait.h | 11 ++--------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index 6d20a6ce0507..fa8858546d5e 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -15,10 +15,6 @@ config AS_SHA256_NI def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1) help Supported by binutils >= 2.24 and LLVM integrated assembler -config AS_TPAUSE - def_bool $(as-instr,tpause %ecx) - help - Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7 config AS_GFNI def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 0e020a69b431..6a2ec2083043 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -138,16 +138,9 @@ static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned lo */ static inline void __tpause(u32 ecx, u32 edx, u32 eax) { - /* "tpause %ecx, %edx, %eax" */ - #ifdef CONFIG_AS_TPAUSE - asm volatile("tpause %%ecx" - : - : "c"(ecx), "d"(edx), "a"(eax)); - #else + /* "tpause %ecx" */ asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1" - : - : "c"(ecx), "d"(edx), "a"(eax)); - #endif + :: "c" (ecx), "d" (edx), "a" (eax)); } #endif /* _ASM_X86_MWAIT_H */ -- cgit v1.2.3 From 2fb34b1566a386913b291d04f91ba6f6e6a5bb99 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Thu, 3 Apr 2025 10:56:23 +0200 Subject: x86/tlb: Simplify choose_new_asid() and generate better code Have it return the two things it does return: - a new ASID and - the need to flush the TLB or not, in a struct which fits in a single 32-bit register and whack the IO parameters. Beyond being easier to read, this also helps the compiler generate better, more compact code: # arch/x86/mm/tlb.o: text data bss dec hex filename 9341 753 516 10610 2972 tlb.o.before 9213 753 516 10482 28f2 tlb.o.after No functional changes. Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Juergen Gross Cc: Andrew Cooper Cc: Uros Bizjak Cc: Rik van Riel Cc: "H. 
Peter Anvin" Cc: Peter Zijlstra Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250403085623.20824-1-bp@kernel.org --- arch/x86/mm/tlb.c | 63 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e459d97ef397..d00ae21d0ee2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -215,16 +215,20 @@ static void clear_asid_other(void) atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +struct new_asid { + unsigned int asid : 16; + unsigned int need_flush : 1; +}; -static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, - u16 *new_asid, bool *need_flush) +static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen) { + struct new_asid ns; u16 asid; if (!static_cpu_has(X86_FEATURE_PCID)) { - *new_asid = 0; - *need_flush = true; - return; + ns.asid = 0; + ns.need_flush = 1; + return ns; } /* @@ -235,9 +239,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 global_asid = mm_global_asid(next); if (global_asid) { - *new_asid = global_asid; - *need_flush = false; - return; + ns.asid = global_asid; + ns.need_flush = 0; + return ns; } } @@ -249,22 +253,23 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, next->context.ctx_id) continue; - *new_asid = asid; - *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < - next_tlb_gen); - return; + ns.asid = asid; + ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen); + return ns; } /* * We don't currently own an ASID slot on this CPU. * Allocate a slot. */ - *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; - if (*new_asid >= TLB_NR_DYN_ASIDS) { - *new_asid = 0; + ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (ns.asid >= TLB_NR_DYN_ASIDS) { + ns.asid = 0; this_cpu_write(cpu_tlbstate.next_asid, 1); } - *need_flush = true; + ns.need_flush = true; + + return ns; } /* @@ -781,9 +786,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); unsigned long new_lam; + struct new_asid ns; u64 next_tlb_gen; - bool need_flush; - u16 new_asid; + /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) @@ -854,7 +859,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, /* Check if the current mm is transitioning to a global ASID */ if (mm_needs_global_asid(next, prev_asid)) { next_tlb_gen = atomic64_read(&next->context.tlb_gen); - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + ns = choose_new_asid(next, next_tlb_gen); goto reload_tlb; } @@ -889,8 +894,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * TLB contents went out of date while we were in lazy * mode. Fall through to the TLB switching code below. 
*/ - new_asid = prev_asid; - need_flush = true; + ns.asid = prev_asid; + ns.need_flush = true; } else { /* * Apply process to process speculation vulnerability @@ -918,21 +923,21 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + ns = choose_new_asid(next, next_tlb_gen); } reload_tlb: new_lam = mm_lam_cr3_mask(next); - if (need_flush) { - VM_WARN_ON_ONCE(is_global_asid(new_asid)); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, new_lam, true); + if (ns.need_flush) { + VM_WARN_ON_ONCE(is_global_asid(ns.asid)); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, true); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, new_lam, false); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, false); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -941,7 +946,7 @@ reload_tlb: barrier(); this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid); cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); if (next != prev) { -- cgit v1.2.3 From a17b37a3f416c9e385bbd2b5fc603d337eab76eb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 3 Apr 2025 09:30:49 +0200 Subject: x86/idle: Change arguments of mwait_idle_with_hints() to u32 All functions in mwait_idle_with_hints() cast eax and ecx arguments to u32. Propagate argument type to the enclosing function. Suggested-by: Andrew Cooper Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250403073105.245987-1-ubizjak@gmail.com --- arch/x86/include/asm/mwait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 6a2ec2083043..44d3bb2a259d 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -110,7 +110,7 @@ static __always_inline void __sti_mwait(u32 eax, u32 ecx) * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +static __always_inline void mwait_idle_with_hints(u32 eax, u32 ecx) { if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) { const void *addr = ¤t_thread_info()->flags; -- cgit v1.2.3 From fc1cd60042b3df1d162278461c7a87f0362502b8 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 3 Apr 2025 14:50:45 +0200 Subject: x86/idle: Use MONITOR and MWAIT mnemonics in Current minimum required version of binutils is 2.25, which supports MONITOR and MWAIT instruction mnemonics. Replace the byte-wise specification of MONITOR and MWAIT with these proper mnemonics. No functional change intended. 
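For reference, the implicit-operand form as a compile-only user-space sketch (MONITOR is privileged, so the helper is deliberately never called; the names here are illustrative, not kernel API):

  #include <stdio.h>

  /* The operands are implied by the ISA (%rax/%ecx/%edx) and conveyed
   * to the compiler purely through the asm constraints: */
  __attribute__((unused))
  static void monitor_sketch(const void *addr)
  {
  	asm volatile("monitor" :: "a" (addr), "c" (0), "d" (0));
  }

  int main(void)
  {
  	puts("assembled OK");	/* monitor_sketch() is never executed */
  	return 0;
  }
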
Note: LLVM assembler is not able to assemble correct forms of MONITOR and MWAIT instructions with explicit operands and reports: error: invalid operand for instruction monitor %rax,%ecx,%edx ^~~~ # https://lore.kernel.org/oe-kbuild-all/202504030802.2lEVBSpN-lkp@intel.com/ Use instruction mnemonics with implicit operands to work around this issue. Reported-by: kernel test robot Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250403125111.429805-1-ubizjak@gmail.com --- arch/x86/include/asm/mwait.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 44d3bb2a259d..dd2b129b0418 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -27,9 +27,11 @@ static __always_inline void __monitor(const void *eax, u32 ecx, u32 edx) { - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); + /* + * Use the instruction mnemonic with implicit operands, as the LLVM + * assembler fails to assemble the mnemonic with explicit operands: + */ + asm volatile("monitor" :: "a" (eax), "c" (ecx), "d" (edx)); } static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx) @@ -43,9 +45,11 @@ static __always_inline void __mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); + /* + * Use the instruction mnemonic with implicit operands, as the LLVM + * assembler fails to assemble the mnemonic with explicit operands: + */ + asm volatile("mwait" :: "a" (eax), "c" (ecx)); } /* @@ -95,9 +99,8 @@ static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx) static __always_inline void __sti_mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); + + asm volatile("sti; mwait" :: "a" (eax), "c" (ecx)); } /* -- cgit v1.2.3 From 60567e93c05d7064c93830cf4bf0d2c58f11b2f2 Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Wed, 9 Apr 2025 19:23:37 +0530 Subject: selftests/x86/lam: Fix clean up fds in do_uring() and allocate_dsa_pasid() Resolve minor fd leaks reported by cppcheck in lam.c. Specifically, the 'file_fd' and 'fd' were not closed in do_uring() and allocate_dsa_pasid() functions, respectively. cppcheck output before this patch: tools/testing/selftests/x86/lam.c:685:3: error: Resource leak: file_fd [resourceLeak] tools/testing/selftests/x86/lam.c:693:3: error: Resource leak: file_fd [resourceLeak] tools/testing/selftests/x86/lam.c:1195:2: error: Resource leak: fd [resourceLeak] cppcheck output after this patch: No resource leaks found While this is a standalone test tool that doesn't really leak anything in practice, as exit() cleans it up all, clean up resources nevertheless. [ mingo: Updated the changelog. 
] Signed-off-by: Malaya Kumar Rout Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250409135341.28987-1-malayarout91@gmail.com --- tools/testing/selftests/x86/lam.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 18d736640ece..0873b0e5f48b 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -682,7 +682,7 @@ int do_uring(unsigned long lam) return 1; if (fstat(file_fd, &st) < 0) - return 1; + goto cleanup; off_t file_sz = st.st_size; @@ -690,7 +690,7 @@ int do_uring(unsigned long lam) fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks); if (!fi) - return 1; + goto cleanup; fi->file_sz = file_sz; fi->file_fd = file_fd; @@ -698,7 +698,7 @@ int do_uring(unsigned long lam) ring = malloc(sizeof(*ring)); if (!ring) { free(fi); - return 1; + goto cleanup; } memset(ring, 0, sizeof(struct io_ring)); @@ -729,6 +729,8 @@ out: } free(fi); +cleanup: + close(file_fd); return ret; } @@ -1189,6 +1191,7 @@ void *allocate_dsa_pasid(void) wq = mmap(NULL, 0x1000, PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); + close(fd); if (wq == MAP_FAILED) perror("mmap"); -- cgit v1.2.3 From 35c3151a98a6e6f56552cff8dc7d59e8ef7aca50 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 9 Apr 2025 15:28:15 +0300 Subject: x86/mm: Consolidate initmem_init() There are 4 variants of initmem_init(): for 32 and 64 bits, and for CONFIG_NUMA enabled and disabled. After commit bbeb69ce3013 ("x86/mm: Remove CONFIG_HIGHMEM64G support") NUMA is not supported on 32-bit kernels anymore, so arch/x86/mm/numa_32.c can just be deleted, and setup_bootmem_allocator(), with its completely misleading name, can be folded into initmem_init(). For 64 bits, the NUMA variant calls x86_numa_init() and the !NUMA variant sets all memory to node 0. The latter can be split out into an inline helper called x86_numa_init(), and then both initmem_init() functions become the same. Split out memblock_set_node() from initmem_init() for !NUMA on 64 bit into x86_numa_init() helper and remove arch/x86/mm/numa_*.c that only contained initmem_init() variants for NUMA configs. Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Ingo Molnar Cc: Ard Biesheuvel Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: David Woodhouse Cc: Rafael J. 
Wysocki Cc: Len Brown Link: https://lore.kernel.org/r/20250409122815.420041-1-rppt@kernel.org --- arch/x86/include/asm/page_32_types.h | 1 - arch/x86/mm/Makefile | 2 +- arch/x86/mm/init_32.c | 7 ----- arch/x86/mm/init_64.c | 7 ++++- arch/x86/mm/mm_internal.h | 4 +++ arch/x86/mm/numa.c | 3 +- arch/x86/mm/numa_32.c | 61 ------------------------------------ arch/x86/mm/numa_64.c | 13 -------- arch/x86/mm/numa_internal.h | 10 ------ 9 files changed, 13 insertions(+), 95 deletions(-) delete mode 100644 arch/x86/mm/numa_32.c delete mode 100644 arch/x86/mm/numa_64.c delete mode 100644 arch/x86/mm/numa_internal.h diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index a9b62e0e6f79..623f1e9f493e 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -73,7 +73,6 @@ extern unsigned int __VMALLOC_RESERVE; extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); -extern void setup_bootmem_allocator(void); #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 32035d5be5a0..1e72f06b6ba5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o -obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad662cc4605c..d467f89191cd 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -612,7 +612,6 @@ void __init find_low_pfn_range(void) highmem_pfn_init(); } -#ifndef CONFIG_NUMA void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM @@ -633,12 +632,6 @@ void __init initmem_init(void) printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); - setup_bootmem_allocator(); -} -#endif /* !CONFIG_NUMA */ - -void __init setup_bootmem_allocator(void) -{ printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped< #include #include +#include -#include "numa_internal.h" +#include "mm_internal.h" int numa_off; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c deleted file mode 100644 index 65fda406e6f2..000000000000 --- a/arch/x86/mm/numa_32.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Written by: Patricia Gaughen , IBM Corporation - * August 2002: added remote node KVA remap - Martin J. Bligh - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include -#include -#include -#include - -#include "numa_internal.h" - -extern unsigned long highend_pfn, highstart_pfn; - -void __init initmem_init(void) -{ - x86_numa_init(); - -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) - highstart_pfn = max_low_pfn; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", - max_low_pfn, highstart_pfn); - - printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", - (ulong) pfn_to_kaddr(max_low_pfn)); - - printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", - (ulong) pfn_to_kaddr(highstart_pfn)); - - __vmalloc_start_set = true; - setup_bootmem_allocator(); -} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c deleted file mode 100644 index 59d80160fa5a..000000000000 --- a/arch/x86/mm/numa_64.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Generic VM initialization for x86-64 NUMA setups. - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include - -#include "numa_internal.h" - -void __init initmem_init(void) -{ - x86_numa_init(); -} diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h deleted file mode 100644 index 11e1ff370c10..000000000000 --- a/arch/x86/mm/numa_internal.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __X86_MM_NUMA_INTERNAL_H -#define __X86_MM_NUMA_INTERNAL_H - -#include -#include - -void __init x86_numa_init(void); - -#endif /* __X86_MM_NUMA_INTERNAL_H */ -- cgit v1.2.3 From 780f97e309302fdee05b31c91a4dc81ded4c3702 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:34 -0700 Subject: x86/mm: Always allocate a whole page for PAE PGDs A hardware PAE PGD is only 32 bytes. A PGD is PAGE_SIZE in the other paging modes. But for reasons*, the kernel _sometimes_ allocates a whole page even though it only ever uses 32 bytes. Make PAE less weird. Just allocate a page like the other paging modes. This was already being done for PTI (and Xen in the past) and nobody screamed that loudly about it so it can't be that bad. * The original reason for PAGE_SIZE allocations for the PAE PGDs was Xen's need to detect page table writes. But 32-bit PTI forced it too for reasons I'm unclear about. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173234.D34F0C3E%40davehans-spike.ostc.intel.com --- arch/x86/mm/pgtable.c | 62 ++++----------------------------------------------- 1 file changed, 4 insertions(+), 58 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a05fcddfc811..ea01b5572442 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -318,68 +318,15 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, { } #endif -/* - * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also - * assumes that pgd should be in one page. - * - * But kernel with PAE paging that is not running as a Xen domain - * only needs to allocate 32 bytes for pgd instead of one page. 
- */ -#ifdef CONFIG_X86_PAE - -#include - -#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) -#define PGD_ALIGN 32 - -static struct kmem_cache *pgd_cache; - -void __init pgtable_cache_init(void) -{ - /* - * When PAE kernel is running as a Xen domain, it does not use - * shared kernel pmd. And this requires a whole page for pgd. - */ - if (!SHARED_KERNEL_PMD) - return; - - /* - * when PAE kernel is not running as a Xen domain, it uses - * shared kernel pmd. Shared kernel pmd does not require a whole - * page for pgd. We are able to just allocate a 32-byte for pgd. - * During boot time, we create a 32-byte slab for pgd table allocation. - */ - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, - SLAB_PANIC, NULL); -} static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* - * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. - * We allocate one page for pgd. - */ - if (!SHARED_KERNEL_PMD) - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); - - /* - * Now PAE kernel is not running as a Xen domain. We can allocate - * a 32-byte slab for pgd to save memory space. + * PTI and Xen need a whole page for the PAE PGD + * even though the hardware only needs 32 bytes. + * + * For simplicity, allocate a page for all users. */ - return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); -} - -static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - if (!SHARED_KERNEL_PMD) - __pgd_free(mm, pgd); - else - kmem_cache_free(pgd_cache, pgd); -} -#else - -static inline pgd_t *_pgd_alloc(struct mm_struct *mm) -{ return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); } @@ -387,7 +334,6 @@ static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } -#endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { -- cgit v1.2.3 From b0cc4d19f198cdfd1b58c8f5536670d1dc68cbbd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:35 -0700 Subject: x86/mm: Always "broadcast" PMD setting operations Kernel PMDs can either be shared across processes or private to a process. On 64-bit, they are always shared. 32-bit non-PAE hardware does not have PMDs, but the kernel logically squishes them into the PGD and treats them as private. Here are the four cases: 64-bit: Shared 32-bit: non-PAE: Private 32-bit: PAE+ PTI: Private 32-bit: PAE+noPTI: Shared Note that 32-bit is all "Private" except for PAE+noPTI being an oddball. The 32-bit+PAE+noPTI case will be made like the rest of 32-bit shortly. But until that can be done, temporarily treat the 32-bit+PAE+noPTI case as Private. This will do unnecessary walks across pgd_list and unnecessary PTE setting but should be otherwise harmless. 
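The "broadcast" idea itself fits in a small user-space sketch (a toy list of per-process top-level tables; the names are made up, this is not the kernel's pgd_list machinery):

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define TOY_PTRS 4	/* toy PAE-like top level: 4 entries */

  struct toy_pgd {
  	uint64_t entry[TOY_PTRS];
  	struct toy_pgd *next;	/* analogue of the kernel's pgd_list */
  };

  static struct toy_pgd *pgd_list;

  static struct toy_pgd *toy_pgd_alloc(void)
  {
  	struct toy_pgd *pgd = calloc(1, sizeof(*pgd));

  	if (!pgd)
  		exit(1);
  	pgd->next = pgd_list;	/* what pgd_ctor() does: join the list */
  	pgd_list = pgd;
  	return pgd;
  }

  /* A kernel-half update is "broadcast" to every process's copy: */
  static void broadcast_set(int idx, uint64_t val)
  {
  	struct toy_pgd *pgd;

  	for (pgd = pgd_list; pgd; pgd = pgd->next)
  		pgd->entry[idx] = val;
  }

  int main(void)
  {
  	struct toy_pgd *a = toy_pgd_alloc();
  	struct toy_pgd *b = toy_pgd_alloc();

  	broadcast_set(3, 0x1000);	/* "kernel" slot, updated everywhere */
  	printf("a: %#llx  b: %#llx\n",
  	       (unsigned long long)a->entry[3],
  	       (unsigned long long)b->entry[3]);
  	return 0;
  }
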
Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173235.F63F50D1%40davehans-spike.ostc.intel.com --- arch/x86/mm/pat/set_memory.c | 4 ++-- arch/x86/mm/pgtable.c | 11 +++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index def3d9284254..30ab4aced761 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -889,7 +889,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 - if (!SHARED_KERNEL_PMD) { + { struct page *page; list_for_each_entry(page, &pgd_list, lru) { @@ -1293,7 +1293,7 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, /* Queue the page table to be freed after TLB flush */ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); - if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { + if (IS_ENABLED(CONFIG_X86_32)) { struct page *page; /* Update all PGD tables to use the same large page */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ea01b5572442..f1c58860b926 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -97,18 +97,13 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) KERNEL_PGD_PTRS); } - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) { - pgd_set_mm(pgd, mm); - pgd_list_add(pgd); - } + /* List used to sync kernel mapping updates */ + pgd_set_mm(pgd, mm); + pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { - if (SHARED_KERNEL_PMD) - return; - spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); -- cgit v1.2.3 From eb9c7f00f22d6ea2a94e00eb4f33a79064681564 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:37 -0700 Subject: x86/mm: Always tell core mm to sync kernel mappings Each mm_struct has its own copy of the page tables. When core mm code makes changes to a copy of the page tables those changes sometimes need to be synchronized with other mms' copies of the page tables. But when this synchronization actually needs to happen is highly architecture and configuration specific. In cases where kernel PMDs are shared across processes (SHARED_KERNEL_PMD) the core mm does not itself need to do that synchronization for kernel PMD changes. The x86 code communicates this by clearing the PGTBL_PMD_MODIFIED bit cleared in those configs to avoid expensive synchronization. The kernel is moving toward never sharing kernel PMDs on 32-bit. Prepare for that and make 32-bit PAE always set PGTBL_PMD_MODIFIED, even if there is no modification to synchronize. This obviously adds some synchronization overhead in cases where the kernel page tables are being changed. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173237.EC790E95%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/pgtable-3level_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 9d5b257d44e3..9759fa0eb6a3 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -29,7 +29,7 @@ typedef union { #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) -#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 
0 : PGTBL_PMD_MODIFIED) +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* * PGDIR_SHIFT determines what a top-level page table entry can map -- cgit v1.2.3 From 45fb940563f80b8138f465f18d71c2d3e4a0724e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:38 -0700 Subject: x86/mm: Simplify PAE PGD sharing macros There are a few too many levels of abstraction here. First, just expand the PREALLOCATED_PMDS macro in place to make it clear that it is only conditional on PTI. Second, MAX_PREALLOCATED_PMDS is only used in one spot for an on-stack allocation. It has a *maximum* value of 4. Do not bother with the macro MAX() magic. Just set it to 4. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173238.6E3CDA56%40davehans-spike.ostc.intel.com --- arch/x86/mm/pgtable.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f1c58860b926..027e1d32925c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -68,12 +68,6 @@ static inline void pgd_list_del(pgd_t *pgd) list_del(&ptdesc->pt_list); } -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -#define MAX_UNSHARED_PTRS_PER_PGD \ - MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) - - static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; @@ -132,8 +126,9 @@ static void pgd_dtor(pgd_t *pgd) * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate * and initialize the kernel pmds here. */ -#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD -#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD +#define PREALLOCATED_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ + PTRS_PER_PGD : KERNEL_PGD_BOUNDARY) +#define MAX_PREALLOCATED_PMDS PTRS_PER_PGD /* * We allocate separate PMDs for the kernel part of the user page-table -- cgit v1.2.3 From 82f120010f3b86c2f9c1279452c1ecab7bc117d2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:40 -0700 Subject: x86/mm: Fix up comments around PMD preallocation The "paravirt environment" is no longer in the tree. Axe that part of the comment. Also add a blurb to remind readers that "USER_PMDS" refer to the PTI user *copy* of the page tables, not the user *portion*. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173240.5B1AB322%40davehans-spike.ostc.intel.com --- arch/x86/mm/pgtable.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 027e1d32925c..ca07db510d26 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -121,16 +121,17 @@ static void pgd_dtor(pgd_t *pgd) * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. */ #define PREALLOCATED_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ PTRS_PER_PGD : KERNEL_PGD_BOUNDARY) #define MAX_PREALLOCATED_PMDS PTRS_PER_PGD /* + * "USER_PMDS" are the PMDs for the user copy of the page tables when + * PTI is enabled. They do not exist when PTI is disabled. Note that + * this is distinct from the user _portion_ of the kernel page tables + * which always exists. 
+ * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. -- cgit v1.2.3 From 454e65b4fb38ddeea62472649ef16b5e8d285015 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:41 -0700 Subject: x86/mm: Preallocate all PAE page tables Finally, move away from having PAE kernels share any PMDs across processes. This was already the default on PTI kernels which are the common case. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173241.1288CAB4%40davehans-spike.ostc.intel.com --- arch/x86/mm/pgtable.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ca07db510d26..f4fa8fabf326 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -80,16 +80,11 @@ struct mm_struct *pgd_page_get_mm(struct page *page) static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (CONFIG_PGTABLE_LEVELS == 2 || - (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - CONFIG_PGTABLE_LEVELS >= 4) { + /* PAE preallocates all its PMDs. No cloning needed. */ + if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - } /* List used to sync kernel mapping updates */ pgd_set_mm(pgd, mm); @@ -122,8 +117,7 @@ static void pgd_dtor(pgd_t *pgd) * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. */ -#define PREALLOCATED_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ - PTRS_PER_PGD : KERNEL_PGD_BOUNDARY) +#define PREALLOCATED_PMDS PTRS_PER_PGD #define MAX_PREALLOCATED_PMDS PTRS_PER_PGD /* -- cgit v1.2.3 From 99b8f0c54f571616d7bf4a776a2863a321c38cb1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:42 -0700 Subject: x86/mm: Remove duplicated PMD preallocation macro MAX_PREALLOCATED_PMDS and PREALLOCATED_PMDS are now identical. Just use PREALLOCATED_PMDS and remove "MAX". Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173242.5ED13A5B%40davehans-spike.ostc.intel.com --- arch/x86/mm/pgtable.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index f4fa8fabf326..c1144e2f24e2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -118,7 +118,6 @@ static void pgd_dtor(pgd_t *pgd) * new process's life, we just pre-populate them here. */ #define PREALLOCATED_PMDS PTRS_PER_PGD -#define MAX_PREALLOCATED_PMDS PTRS_PER_PGD /* * "USER_PMDS" are the PMDs for the user copy of the page tables when @@ -154,7 +153,6 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. 
*/ #define PREALLOCATED_PMDS 0 -#define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ @@ -324,7 +322,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; - pmd_t *pmds[MAX_PREALLOCATED_PMDS]; + pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); -- cgit v1.2.3 From eaa607deb29e0b6fd24b9adf230fbc765f342521 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 14 Apr 2025 10:32:44 -0700 Subject: x86/mm: Remove now unused SHARED_KERNEL_PMD All the users of SHARED_KERNEL_PMD are gone. Zap it. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250414173244.1125BEC3%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/pgtable-2level_types.h | 2 -- arch/x86/include/asm/pgtable-3level_types.h | 2 -- arch/x86/include/asm/pgtable_64_types.h | 2 -- 3 files changed, 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 66425424ce91..54690bd4ddbe 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -18,8 +18,6 @@ typedef union { } pte_t; #endif /* !__ASSEMBLER__ */ -#define SHARED_KERNEL_PMD 0 - #define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 9759fa0eb6a3..580b09bf6a45 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -27,8 +27,6 @@ typedef union { } pmd_t; #endif /* !__ASSEMBLER__ */ -#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) - #define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 5bb782d856f2..e83721db18c9 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -46,8 +46,6 @@ extern unsigned int ptrs_per_p4d; #endif /* !__ASSEMBLER__ */ -#define SHARED_KERNEL_PMD 0 - #ifdef CONFIG_X86_5LEVEL /* -- cgit v1.2.3 From 1b3f2bd04d90f61e1f291b5e365b9bc4ce0ea7c7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 29 Apr 2025 19:46:21 -0700 Subject: x86/devmem: Remove duplicate range_is_allowed() definition 17 years ago, Venki suggested [1] "A future improvement would be to avoid the range_is_allowed duplication". The only thing preventing a common implementation is that phys_mem_access_prot_allowed() expects the range check to exit immediately when PAT is disabled [2]. I.e. there is no cache conflict to manage in that case. This cleanup was noticed on the path to considering changing range_is_allowed() policy to blanket deny /dev/mem for private (confidential computing) memory. Note, however that phys_mem_access_prot_allowed() has long since stopped being relevant for managing cache-type validation due to [3], and [4]. 
Commit 0124cecfc85a ("x86, PAT: disable /dev/mem mmap RAM with PAT") [1] Commit 9e41bff2708e ("x86: fix /dev/mem mmap breakage when PAT is disabled") [2] Commit 1886297ce0c8 ("x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386") [3] Commit 0c3c8a18361a ("x86, PAT: Remove duplicate memtype reserve in devmem mmap") [4] Signed-off-by: Dan Williams Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20250430024622.1134277-2-dan.j.williams%40intel.com --- arch/x86/mm/pat/memtype.c | 31 ++++--------------------------- drivers/char/mem.c | 18 ------------------ include/linux/io.h | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 45 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 72d8cbc61158..c97b6598f187 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -773,38 +774,14 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, return vma_prot; } -#ifdef CONFIG_STRICT_DEVMEM -/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - return 1; -} -#else -/* This check is needed to avoid cache aliasing when PAT is enabled */ -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - u64 from = ((u64)pfn) << PAGE_SHIFT; - u64 to = from + size; - u64 cursor = from; - - if (!pat_enabled()) - return 1; - - while (cursor < to) { - if (!devmem_is_allowed(pfn)) - return 0; - cursor += PAGE_SIZE; - pfn++; - } - return 1; -} -#endif /* CONFIG_STRICT_DEVMEM */ - int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; + if (!pat_enabled()) + return 1; + if (!range_is_allowed(pfn, size)) return 0; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 169eed162a7f..48839958b0b1 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -61,29 +61,11 @@ static inline int page_is_allowed(unsigned long pfn) { return devmem_is_allowed(pfn); } -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - u64 from = ((u64)pfn) << PAGE_SHIFT; - u64 to = from + size; - u64 cursor = from; - - while (cursor < to) { - if (!devmem_is_allowed(pfn)) - return 0; - cursor += PAGE_SIZE; - pfn++; - } - return 1; -} #else static inline int page_is_allowed(unsigned long pfn) { return 1; } -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - return 1; -} #endif static inline bool should_stop_iteration(void) diff --git a/include/linux/io.h b/include/linux/io.h index 6a6bc4d46d0a..0642c7ee41db 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -183,4 +183,25 @@ static inline void arch_io_free_memtype_wc(resource_size_t base, int devm_arch_io_reserve_memtype_wc(struct device *dev, resource_size_t start, resource_size_t size); +#ifdef CONFIG_STRICT_DEVMEM +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) + return 0; + cursor += PAGE_SIZE; + pfn++; + } + return 1; +} +#else +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + return 1; +} +#endif #endif /* _LINUX_IO_H */ -- cgit v1.2.3
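The consolidated range_is_allowed() walk is simple enough to exercise in user space; here is a sketch with a made-up devmem_is_allowed() policy standing in for the kernel's (illustrative only):

  #include <stdint.h>
  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)

  /* Made-up policy: pretend pfns below 0x100 are off limits. */
  static int devmem_is_allowed(unsigned long pfn)
  {
  	return pfn >= 0x100;
  }

  /* Same walk as the helper now in <linux/io.h>: a range is allowed
   * only if every page frame inside it is allowed. */
  static int range_is_allowed(unsigned long pfn, unsigned long size)
  {
  	uint64_t from = ((uint64_t)pfn) << PAGE_SHIFT;
  	uint64_t to = from + size;
  	uint64_t cursor = from;

  	while (cursor < to) {
  		if (!devmem_is_allowed(pfn))
  			return 0;
  		cursor += PAGE_SIZE;
  		pfn++;
  	}
  	return 1;
  }

  int main(void)
  {
  	printf("pfn 0x080, 2 pages: %d\n", range_is_allowed(0x080, 2 * PAGE_SIZE));
  	printf("pfn 0x200, 2 pages: %d\n", range_is_allowed(0x200, 2 * PAGE_SIZE));
  	return 0;
  }
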