From c7fa848ff01dad9ed3146a6b1a7d3622131bcedd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 3 Mar 2022 15:33:10 +1000 Subject: KVM: PPC: Book3S HV P9: Fix "lost kick" race When new work is created that requires attention from the hypervisor (e.g., to inject an interrupt into the guest), fast_vcpu_kick is used to pull the target vcpu out of the guest if it may have been running. Therefore the work creation side looks like this: vcpu->arch.doorbell_request = 1; kvmppc_fast_vcpu_kick_hv(vcpu) { smp_mb(); cpu = vcpu->cpu; if (cpu != -1) send_ipi(cpu); } And the guest entry side *should* look like this: vcpu->cpu = smp_processor_id(); smp_mb(); if (vcpu->arch.doorbell_request) { // do something (abort entry or inject doorbell etc) } But currently the store and load are flipped, so it is possible for the entry to see no doorbell pending, and the doorbell creation misses the store to set cpu, resulting lost work (or at least delayed until the next guest exit). Fix this by reordering the entry operations and adding a smp_mb between them. The P8 path appears to have a similar race which is commented but not addressed yet. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220303053315.1056880-2-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c886557638a1..6fa518f6501d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -225,6 +225,13 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) int cpu; struct rcuwait *waitp; + /* + * rcuwait_wake_up contains smp_mb() which orders prior stores that + * create pending work vs below loads of cpu fields. The other side + * is the barrier in vcpu run that orders setting the cpu fields vs + * testing for pending work. + */ + waitp = kvm_arch_vcpu_get_wait(vcpu); if (rcuwait_wake_up(waitp)) ++vcpu->stat.generic.halt_wakeup; @@ -1089,7 +1096,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; } tvcpu->arch.prodded = 1; - smp_mb(); + smp_mb(); /* This orders prodded store vs ceded load */ if (tvcpu->arch.ceded) kvmppc_fast_vcpu_kick_hv(tvcpu); break; @@ -3766,6 +3773,14 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) pvc = core_info.vc[sub]; pvc->pcpu = pcpu + thr; for_each_runnable_thread(i, vcpu, pvc) { + /* + * XXX: is kvmppc_start_thread called too late here? + * It updates vcpu->cpu and vcpu->arch.thread_cpu + * which are used by kvmppc_fast_vcpu_kick_hv(), but + * kick is called after new exceptions become available + * and exceptions are checked earlier than here, by + * kvmppc_core_prepare_to_enter. + */ kvmppc_start_thread(vcpu, pvc); kvmppc_create_dtl_entry(vcpu, pvc); trace_kvm_guest_enter(vcpu); @@ -4487,6 +4502,21 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, if (need_resched() || !kvm->arch.mmu_ready) goto out; + vcpu->cpu = pcpu; + vcpu->arch.thread_cpu = pcpu; + vc->pcpu = pcpu; + local_paca->kvm_hstate.kvm_vcpu = vcpu; + local_paca->kvm_hstate.ptid = 0; + local_paca->kvm_hstate.fake_suspend = 0; + + /* + * Orders set cpu/thread_cpu vs testing for pending interrupts and + * doorbells below. The other side is when these fields are set vs + * kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to + * kick a vCPU to notice the pending interrupt. + */ + smp_mb(); + if (!nested) { kvmppc_core_prepare_to_enter(vcpu); if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, @@ -4506,13 +4536,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, tb = mftb(); - vcpu->cpu = pcpu; - vcpu->arch.thread_cpu = pcpu; - vc->pcpu = pcpu; - local_paca->kvm_hstate.kvm_vcpu = vcpu; - local_paca->kvm_hstate.ptid = 0; - local_paca->kvm_hstate.fake_suspend = 0; - __kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0); trace_kvm_guest_enter(vcpu); @@ -4614,6 +4637,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, run->exit_reason = KVM_EXIT_INTR; vcpu->arch.ret = -EINTR; out: + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; powerpc_local_irq_pmu_restore(flags); preempt_enable(); goto done; -- cgit v1.2.3 From b5149e229218118c9cd44a4d256f970ddcbf745b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Feb 2022 16:47:25 +1000 Subject: KVM: PPC: Book3S PR: Disable SCV when AIL could be disabled PR KVM does not support running with AIL enabled, and SCV does is not supported with AIL disabled. Fix this by ensuring the SCV facility is disabled with FSCR while a CPU could be running with AIL=0. The PowerNV host supports disabling AIL on a per-CPU basis, so SCV just needs to be disabled when a vCPU is being run. The pSeries machine can only switch AIL on a system-wide basis, so it must disable SCV support at boot if the configuration can potentially run a PR KVM guest. Also ensure a the FSCR[SCV] bit can not be enabled when emulating mtFSCR for the guest. SCV is not emulated for the PR guest at the moment, this just fixes the host crashes. Alternatives considered and rejected: - SCV support can not be disabled by PR KVM after boot, because it is advertised to userspace with HWCAP. - AIL can not be disabled on a per-CPU basis. At least when running on pseries it is a per-LPAR setting. - Support for real-mode SCV vectors will not be added because they are at 0x17000 so making such a large fixed head space causes immediate value limits to be exceeded, requiring a lot rework and more code. - Disabling SCV for any PR KVM possible kernel will cause a slowdown when not using PR KVM. - A boot time option to disable SCV to use PR KVM is user-hostile. - System call instruction emulation for SCV facility unavailable instructions is too complex and old emulation code was subtly broken and removed. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20220222064727.2314380-2-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 4 ++++ arch/powerpc/kernel/setup_64.c | 28 ++++++++++++++++++++++++++++ arch/powerpc/kvm/Kconfig | 9 +++++++++ arch/powerpc/kvm/book3s_pr.c | 26 +++++++++++++++++--------- 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 55caeee37c08..b66dd6f775a4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -809,6 +809,10 @@ __start_interrupts: * - MSR_EE|MSR_RI is clear (no reentrant exceptions) * - Standard kernel environment is set up (stack, paca, etc) * + * KVM: + * These interrupts do not elevate HV 0->1, so HV is not involved. PR KVM + * ensures that FSCR[SCV] is disabled whenever it has to force AIL off. + * * Call convention: * * syscall register convention is in Documentation/powerpc/syscall64-abi.rst diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index be8577ac9397..d973ae7558e3 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -197,6 +197,34 @@ static void __init configure_exceptions(void) /* Under a PAPR hypervisor, we need hypercalls */ if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + /* + * - PR KVM does not support AIL mode interrupts in the host + * while a PR guest is running. + * + * - SCV system call interrupt vectors are only implemented for + * AIL mode interrupts. + * + * - On pseries, AIL mode can only be enabled and disabled + * system-wide so when a PR VM is created on a pseries host, + * all CPUs of the host are set to AIL=0 mode. + * + * - Therefore host CPUs must not execute scv while a PR VM + * exists. + * + * - SCV support can not be disabled dynamically because the + * feature is advertised to host userspace. Disabling the + * facility and emulating it would be possible but is not + * implemented. + * + * - So SCV support is blanket disabled if PR KVM could possibly + * run. That is, PR support compiled in, booting on pseries + * with hash MMU. + */ + if (IS_ENABLED(CONFIG_KVM_BOOK3S_PR_POSSIBLE) && !radix_enabled()) { + init_task.thread.fscr &= ~FSCR_SCV; + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV; + } + /* Enable AIL if possible */ if (!pseries_enable_reloc_on_exc()) { init_task.thread.fscr &= ~FSCR_SCV; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 18e58085447c..ddd88179110a 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -112,12 +112,21 @@ config KVM_BOOK3S_64_PR guest in user mode (problem state) and emulating all privileged instructions and registers. + This is only available for hash MMU mode and only supports + guests that use hash MMU mode. + This is not as fast as using hypervisor mode, but works on machines where hypervisor mode is not available or not usable, and can emulate processors that are different from the host processor, including emulating 32-bit processors on a 64-bit host. + Selecting this option will cause the SCV facility to be + disabled when the kernel is booted on the pseries platform in + hash MMU mode (regardless of PR VMs running). When any PR VMs + are running, "AIL" mode is disabled which may slow interrupts + and system calls on the host. + config KVM_BOOK3S_HV_EXIT_TIMING bool "Detailed timing for hypervisor real-mode code" depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 34a801c3604a..7bf9e6ca5c2d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -137,12 +137,15 @@ static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; svcpu->in_use = 0; svcpu_put(svcpu); -#endif /* Disable AIL if supported */ - if (cpu_has_feature(CPU_FTR_HVMODE) && - cpu_has_feature(CPU_FTR_ARCH_207S)) - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL); + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL); + if (cpu_has_feature(CPU_FTR_ARCH_300) && (current->thread.fscr & FSCR_SCV)) + mtspr(SPRN_FSCR, mfspr(SPRN_FSCR) & ~FSCR_SCV); + } +#endif vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 @@ -165,6 +168,14 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; svcpu_put(svcpu); + + /* Enable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3); + if (cpu_has_feature(CPU_FTR_ARCH_300) && (current->thread.fscr & FSCR_SCV)) + mtspr(SPRN_FSCR, mfspr(SPRN_FSCR) | FSCR_SCV); + } #endif if (kvmppc_is_split_real(vcpu)) @@ -174,11 +185,6 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); kvmppc_save_tm_pr(vcpu); - /* Enable AIL if supported */ - if (cpu_has_feature(CPU_FTR_HVMODE) && - cpu_has_feature(CPU_FTR_ARCH_207S)) - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3); - vcpu->cpu = -1; } @@ -1037,6 +1043,8 @@ static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac) void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr) { + if (fscr & FSCR_SCV) + fscr &= ~FSCR_SCV; /* SCV must not be enabled */ if ((vcpu->arch.fscr & FSCR_TAR) && !(fscr & FSCR_TAR)) { /* TAR got dropped, drop it in shadow too */ kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); -- cgit v1.2.3 From 839d893b4067da14b9c46fda2dfd88b80aeed551 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Feb 2022 16:47:26 +1000 Subject: KVM: PPC: Book3S PR: Disallow AIL != 0 KVM PR does not implement address translation modes on interrupt, so it must not allow H_SET_MODE to succeed. The behaviour change caused by this mode is architected and not advisory (interrupts *must* behave differently). QEMU does not deal with differences in AIL support in the host. The solution to that is a spapr capability and corresponding KVM CAP, but this patch does not break things more than before (the host behaviour already differs, this change just disallows some modes that are not implemented properly). By happy coincidence, this allows PR Linux guests that are using the SCV facility to boot and run, because Linux disables the use of SCV if AIL can not be set to 3. This does not fix the underlying problem of missing SCV support (an OS could implement real-mode SCV vectors and try to enable the facility). The true fix for that is for KVM PR to emulate scv interrupts from the facility unavailable interrupt. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20220222064727.2314380-3-npiggin@gmail.com --- arch/powerpc/kvm/book3s_pr_papr.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 1f10e7dfcdd0..dc4f51ac84bc 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -281,6 +281,22 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_set_mode(struct kvm_vcpu *vcpu) +{ + unsigned long mflags = kvmppc_get_gpr(vcpu, 4); + unsigned long resource = kvmppc_get_gpr(vcpu, 5); + + if (resource == H_SET_MODE_RESOURCE_ADDR_TRANS_MODE) { + /* KVM PR does not provide AIL!=0 to guests */ + if (mflags == 0) + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + else + kvmppc_set_gpr(vcpu, 3, H_UNSUPPORTED_FLAG_START - 63); + return EMULATE_DONE; + } + return EMULATE_FAIL; +} + #ifdef CONFIG_SPAPR_TCE_IOMMU static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) { @@ -384,6 +400,8 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) return kvmppc_h_pr_logical_ci_load(vcpu); case H_LOGICAL_CI_STORE: return kvmppc_h_pr_logical_ci_store(vcpu); + case H_SET_MODE: + return kvmppc_h_pr_set_mode(vcpu); case H_XIRR: case H_CPPR: case H_EOI: @@ -421,6 +439,7 @@ int kvmppc_hcall_impl_pr(unsigned long cmd) case H_CEDE: case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: + case H_SET_MODE: #ifdef CONFIG_KVM_XICS case H_XIRR: case H_CPPR: @@ -447,6 +466,7 @@ static unsigned int default_hcall_list[] = { H_BULK_REMOVE, H_PUT_TCE, H_CEDE, + H_SET_MODE, #ifdef CONFIG_KVM_XICS H_XIRR, H_CPPR, -- cgit v1.2.3 From f771b55731fc82b1e8e9ef123f6f1b8d8c92bc63 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 7 Mar 2022 13:26:25 +1100 Subject: KVM: PPC: Use KVM_CAP_PPC_AIL_MODE_3 Use KVM_CAP_PPC_AIL_MODE_3 to advertise the capability to set the AIL resource mode to 3 with the H_SET_MODE hypercall. This capability differs between processor types and KVM types (PR, HV, Nested HV), and affects guest-visible behaviour. QEMU will implement a cap-ail-mode-3 to control this behaviour[1], and use the KVM CAP if available to determine KVM support[2]. [1] https://lists.nongnu.org/archive/html/qemu-ppc/2022-02/msg00437.html [2] https://lists.nongnu.org/archive/html/qemu-ppc/2022-02/msg00439.html Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas [mpe: Rebase onto 93b71801a827 from kvm-ppc-cap-210 branch, add EXPORT_SYMBOL] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220222064727.2314380-4-npiggin@gmail.com --- arch/powerpc/include/asm/setup.h | 2 ++ arch/powerpc/kvm/powerpc.c | 17 +++++++++++++++++ arch/powerpc/platforms/pseries/setup.c | 13 ++++++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index d0d3dd531c7f..a555fb77258a 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -28,11 +28,13 @@ void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 #ifdef CONFIG_PPC_PSERIES +extern bool pseries_reloc_on_exception(void); extern bool pseries_enable_reloc_on_exc(void); extern void pseries_disable_reloc_on_exc(void); extern void pseries_big_endian_exceptions(void); void __init pseries_little_endian_exceptions(void); #else +static inline bool pseries_reloc_on_exception(void) { return false; } static inline bool pseries_enable_reloc_on_exc(void) { return false; } static inline void pseries_disable_reloc_on_exc(void) {} static inline void pseries_big_endian_exceptions(void) {} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9772b176e406..875c30c12db0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -705,6 +705,23 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; #endif + case KVM_CAP_PPC_AIL_MODE_3: + r = 0; + /* + * KVM PR, POWER7, and some POWER9s don't support AIL=3 mode. + * The POWER9s can support it if the guest runs in hash mode, + * but QEMU doesn't necessarily query the capability in time. + */ + if (hv_enabled) { + if (kvmhv_on_pseries()) { + if (pseries_reloc_on_exception()) + r = 1; + } else if (cpu_has_feature(CPU_FTR_ARCH_207S) && + !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { + r = 1; + } + } + break; default: r = 0; break; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 83a04d967a59..5bdbbe2151b1 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -353,6 +353,14 @@ static void pseries_lpar_idle(void) pseries_idle_epilog(); } +static bool pseries_reloc_on_exception_enabled; + +bool pseries_reloc_on_exception(void) +{ + return pseries_reloc_on_exception_enabled; +} +EXPORT_SYMBOL_GPL(pseries_reloc_on_exception); + /* * Enable relocation on during exceptions. This has partition wide scope and * may take a while to complete, if it takes longer than one second we will @@ -377,6 +385,7 @@ bool pseries_enable_reloc_on_exc(void) " on exceptions: %ld\n", rc); return false; } + pseries_reloc_on_exception_enabled = true; return true; } @@ -404,7 +413,9 @@ void pseries_disable_reloc_on_exc(void) break; mdelay(get_longbusy_msecs(rc)); } - if (rc != H_SUCCESS) + if (rc == H_SUCCESS) + pseries_reloc_on_exception_enabled = false; + else pr_warn("Warning: Failed to disable relocation on exceptions: %ld\n", rc); } -- cgit v1.2.3 From af41d2866f7d75bbb38d487f6ec7770425d70e45 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 27 Mar 2022 09:32:26 +0200 Subject: powerpc/64: Fix build failure with allyesconfig in book3s_64_entry.S Using conditional branches between two files is hasardous, they may get linked too far from each other. arch/powerpc/kvm/book3s_64_entry.o:(.text+0x3ec): relocation truncated to fit: R_PPC64_REL14 (stub) against symbol `system_reset_common' defined in .text section in arch/powerpc/kernel/head_64.o Reorganise the code to use non conditional branches. Fixes: 89d35b239101 ("KVM: PPC: Book3S HV P9: Implement the rest of the P9 path in C") Signed-off-by: Christophe Leroy [mpe: Avoid odd-looking bne ., use named local labels] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/89cf27bf43ee07a0b2879b9e8e2f5cd6386a3645.1648366338.git.christophe.leroy@csgroup.eu --- arch/powerpc/kvm/book3s_64_entry.S | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S index 05e003eb5d90..e42d1c609e47 100644 --- a/arch/powerpc/kvm/book3s_64_entry.S +++ b/arch/powerpc/kvm/book3s_64_entry.S @@ -414,10 +414,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_DAWR1) */ ld r10,HSTATE_SCRATCH0(r13) cmpwi r10,BOOK3S_INTERRUPT_MACHINE_CHECK - beq machine_check_common + beq .Lcall_machine_check_common cmpwi r10,BOOK3S_INTERRUPT_SYSTEM_RESET - beq system_reset_common + beq .Lcall_system_reset_common b . + +.Lcall_machine_check_common: + b machine_check_common + +.Lcall_system_reset_common: + b system_reset_common #endif -- cgit v1.2.3 From e4ff77598a109bd36789ad5e80aba66fc53d0ffb Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 30 Mar 2022 19:21:23 +0530 Subject: powerpc/numa: Handle partially initialized numa nodes With commit 09f49dca570a ("mm: handle uninitialized numa nodes gracefully") NODE_DATA for even a memoryless/cpuless node is partially initialized at boot time. Before onlining the node, current Powerpc code checks for NODE_DATA to be NULL. However since NODE_DATA is partially initialized, this check will end up always being false. This causes hotplugging a CPU to a memoryless/cpuless node to fail. Before adding CPUs: $ numactl -H available: 1 nodes (4) node 4 cpus: 0 1 2 3 4 5 6 7 node 4 size: 97372 MB node 4 free: 95545 MB node distances: node 4 4: 10 $ lparstat System Configuration type=Dedicated mode=Capped smt=8 lcpu=1 mem=99709440 kB cpus=0 ent=1.00 %user %sys %wait %idle physc %entc lbusy app vcsw phint ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 2.66 2.67 0.16 94.51 0.00 0.00 5.33 0.00 67749 0 After hotplugging 32 cores: $ numactl -H node 4 cpus: 0 1 2 3 4 5 6 7 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 node 4 size: 97372 MB node 4 free: 93636 MB node distances: node 4 4: 10 $ lparstat System Configuration type=Dedicated mode=Capped smt=8 lcpu=33 mem=99709440 kB cpus=0 ent=33.00 %user %sys %wait %idle physc %entc lbusy app vcsw phint ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 0.04 0.02 0.00 99.94 0.00 0.00 0.06 0.00 1128751 3 As we can see numactl is listing only 8 cores while lparstat is showing 33 cores. Also dmesg is showing messages like: [ 2261.318350 ] BUG: arch topology borken [ 2261.318357 ] the DIE domain not a subset of the NODE domain Fixes: 09f49dca570a ("mm: handle uninitialized numa nodes gracefully") Reported-by: Geetika Moolchandani Signed-off-by: Srikar Dronamraju Acked-by: Michal Hocko Acked-by: Oscar Salvador Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220330135123.1868197-1-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b9b7fefbb64b..13022d734951 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1436,7 +1436,7 @@ int find_and_online_cpu_nid(int cpu) if (new_nid < 0 || !node_possible(new_nid)) new_nid = first_online_node; - if (NODE_DATA(new_nid) == NULL) { + if (!node_online(new_nid)) { #ifdef CONFIG_MEMORY_HOTPLUG /* * Need to ensure that NODE_DATA is initialized for a node from -- cgit v1.2.3 From 7f921a2d6c93051b6002dbb7c1781f1fa5b88cce Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 3 Apr 2022 22:12:52 +1000 Subject: KVM: PPC: Move kvmhv_on_pseries() into kvm_ppc.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We recently introduced a usage of kvmhv_on_pseries() in powerpc.c, which causes a build error for ppc64_book3e_allmodconfig: arch/powerpc/kvm/powerpc.c:716:8: error: implicit declaration of function ‘kvmhv_on_pseries’ 716 | if (kvmhv_on_pseries()) { | ^~~~~~~~~~~~~~~~ Fix it by moving kvmhv_on_pseries() into kvm_ppc.h so that the stub version is available for book3e builds. Fixes: f771b55731fc ("KVM: PPC: Use KVM_CAP_PPC_AIL_MODE_3") Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s_64.h | 12 ------------ arch/powerpc/include/asm/kvm_ppc.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 827038a33064..4def2bd17b9b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -16,18 +16,6 @@ #include #include -#ifdef CONFIG_PPC_PSERIES -static inline bool kvmhv_on_pseries(void) -{ - return !cpu_has_feature(CPU_FTR_HVMODE); -} -#else -static inline bool kvmhv_on_pseries(void) -{ - return false; -} -#endif - /* * Structure for a nested guest, that is, for a guest that is managed by * one of our guests. diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c583d0c37f31..838d4cb460b7 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -586,6 +586,18 @@ static inline bool kvm_hv_mode_active(void) { return false; } #endif +#ifdef CONFIG_PPC_PSERIES +static inline bool kvmhv_on_pseries(void) +{ + return !cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool kvmhv_on_pseries(void) +{ + return false; +} +#endif + #ifdef CONFIG_KVM_XICS static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From ffa0b64e3be58519ae472ea29a1a1ad681e32f48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 7 Apr 2022 00:57:57 +1000 Subject: powerpc: Fix virt_addr_valid() for 64-bit Book3E & 32-bit mpe: On 64-bit Book3E vmalloc space starts at 0x8000000000000000. Because of the way __pa() works we have: __pa(0x8000000000000000) == 0, and therefore virt_to_pfn(0x8000000000000000) == 0, and therefore virt_addr_valid(0x8000000000000000) == true Which is wrong, virt_addr_valid() should be false for vmalloc space. In fact all vmalloc addresses that alias with a valid PFN will return true from virt_addr_valid(). That can cause bugs with hardened usercopy as described below by Kefeng Wang: When running ethtool eth0 on 64-bit Book3E, a BUG occurred: usercopy: Kernel memory exposure attempt detected from SLUB object not in SLUB page?! (offset 0, size 1048)! kernel BUG at mm/usercopy.c:99 ... usercopy_abort+0x64/0xa0 (unreliable) __check_heap_object+0x168/0x190 __check_object_size+0x1a0/0x200 dev_ethtool+0x2494/0x2b20 dev_ioctl+0x5d0/0x770 sock_do_ioctl+0xf0/0x1d0 sock_ioctl+0x3ec/0x5a0 __se_sys_ioctl+0xf0/0x160 system_call_exception+0xfc/0x1f0 system_call_common+0xf8/0x200 The code shows below, data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) The data is alloced by vmalloc(), virt_addr_valid(ptr) will return true on 64-bit Book3E, which leads to the panic. As commit 4dd7554a6456 ("powerpc/64: Add VIRTUAL_BUG_ON checks for __va and __pa addresses") does, make sure the virt addr above PAGE_OFFSET in the virt_addr_valid() for 64-bit, also add upper limit check to make sure the virt is below high_memory. Meanwhile, for 32-bit PAGE_OFFSET is the virtual address of the start of lowmem, high_memory is the upper low virtual address, the check is suitable for 32-bit, this will fix the issue mentioned in commit 602946ec2f90 ("powerpc: Set max_mapnr correctly") too. On 32-bit there is a similar problem with high memory, that was fixed in commit 602946ec2f90 ("powerpc: Set max_mapnr correctly"), but that commit breaks highmem and needs to be reverted. We can't easily fix __pa(), we have code that relies on its current behaviour. So for now add extra checks to virt_addr_valid(). For 64-bit Book3S the extra checks are not necessary, the combination of virt_to_pfn() and pfn_valid() should yield the correct result, but they are harmless. Signed-off-by: Kefeng Wang Reviewed-by: Christophe Leroy [mpe: Add additional change log detail] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220406145802.538416-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/page.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 254687258f42..f2c5c26869f1 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -132,7 +132,11 @@ static inline bool pfn_valid(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) +#define virt_addr_valid(vaddr) ({ \ + unsigned long _addr = (unsigned long)vaddr; \ + _addr >= PAGE_OFFSET && _addr < (unsigned long)high_memory && \ + pfn_valid(virt_to_pfn(_addr)); \ +}) /* * On Book-E parts we need __va to parse the device tree and we can't -- cgit v1.2.3 From 1ff5c8e8c835e8a81c0868e3050c76563dd56a2c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 7 Apr 2022 00:57:58 +1000 Subject: Revert "powerpc: Set max_mapnr correctly" This reverts commit 602946ec2f90d5bd965857753880db29d2d9a1e9. If CONFIG_HIGHMEM is enabled, no highmem will be added with max_mapnr set to max_low_pfn, see mem_init(): for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { ... free_highmem_page(); } Now that virt_addr_valid() has been fixed in the previous commit, we can revert the change to max_mapnr. Fixes: 602946ec2f90 ("powerpc: Set max_mapnr correctly") Signed-off-by: Kefeng Wang Reviewed-by: Christophe Leroy Reported-by: Erhard F. [mpe: Update change log to reflect series reordering] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220406145802.538416-2-mpe@ellerman.id.au --- arch/powerpc/mm/mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8e301cd8925b..4d221d033804 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -255,7 +255,7 @@ void __init mem_init(void) #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - set_max_mapnr(max_low_pfn); + set_max_mapnr(max_pfn); kasan_late_init(); -- cgit v1.2.3