summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst20
-rw-r--r--arch/arm64/include/asm/kvm_host.h2
-rw-r--r--arch/arm64/kvm/mmu.c19
-rw-r--r--arch/mips/include/asm/kvm_host.h2
-rw-r--r--arch/mips/kvm/mips.c2
-rw-r--r--arch/mips/kvm/mmu.c3
-rw-r--r--arch/powerpc/include/asm/kvm_host.h3
-rw-r--r--arch/powerpc/kvm/book3s.c3
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c3
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/kernel/kvm.c4
-rw-r--r--arch/x86/kvm/emulate.c22
-rw-r--r--arch/x86/kvm/mmu/mmu.c5
-rw-r--r--arch/x86/kvm/svm/nested.c7
-rw-r--r--arch/x86/kvm/svm/sev.c1
-rw-r--r--arch/x86/kvm/svm/svm.c44
-rw-r--r--arch/x86/kvm/vmx/nested.c10
-rw-r--r--arch/x86/kvm/vmx/vmx.c42
-rw-r--r--arch/x86/kvm/vmx/vmx.h6
-rw-r--r--arch/x86/kvm/x86.c30
-rw-r--r--include/uapi/linux/kvm.h5
-rw-r--r--tools/testing/selftests/kvm/x86_64/debug_regs.c6
-rw-r--r--virt/kvm/kvm_main.c24
23 files changed, 186 insertions, 80 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index d2b733dc7892..51191b56e61c 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6173,3 +6173,23 @@ specific interfaces must be consistent, i.e. if one says the feature
is supported, than the other should as well and vice versa. For arm64
see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL".
For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME".
+
+8.25 KVM_CAP_S390_DIAG318
+-------------------------
+
+:Architectures: s390
+
+This capability enables a guest to set information about its control program
+(i.e. guest kernel type and version). The information is helpful during
+system/firmware service events, providing additional data about the guest
+environments running on the machine.
+
+The information is associated with the DIAGNOSE 0x318 instruction, which sets
+an 8-byte value consisting of a one-byte Control Program Name Code (CPNC) and
+a 7-byte Control Program Version Code (CPVC). The CPNC determines what
+environment the control program is running in (e.g. Linux, z/VM...), and the
+CPVC is used for information specific to OS (e.g. Linux version, Linux
+distribution...)
+
+If this capability is available, then the CPNC and CPVC can be synchronized
+between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318).
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index af4989a25bb7..905c2b87e05a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -472,7 +472,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, unsigned flags);
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 9aec1ce491d2..3d26b47a1343 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -343,7 +343,8 @@ static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
* destroying the VM), otherwise another faulting VCPU may come in and mess
* with things behind our backs.
*/
-static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
+ bool may_block)
{
struct kvm *kvm = mmu->kvm;
pgd_t *pgd;
@@ -369,11 +370,16 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
* If the range is too large, release the kvm->mmu_lock
* to prevent starvation and lockup detector warnings.
*/
- if (next != end)
+ if (may_block && next != end)
cond_resched_lock(&kvm->mmu_lock);
} while (pgd++, addr = next, addr != end);
}
+static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+{
+ __unmap_stage2_range(mmu, start, size, true);
+}
+
static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
phys_addr_t addr, phys_addr_t end)
{
@@ -2214,18 +2220,21 @@ static int handle_hva_to_gpa(struct kvm *kvm,
static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+ unsigned flags = *(unsigned *)data;
+ bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
+
+ __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
return 0;
}
int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end, unsigned flags)
{
if (!kvm->arch.mmu.pgd)
return 0;
trace_kvm_unmap_hva_range(start, end);
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
+ handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
return 0;
}
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index d35eaed1668f..825d337a505a 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -969,7 +969,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, unsigned flags);
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 7de85d2253ff..0c50ac444222 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -137,6 +137,8 @@ extern void kvm_init_loongson_ipi(struct kvm *kvm);
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
switch (type) {
+ case KVM_VM_MIPS_AUTO:
+ break;
#ifdef CONFIG_KVM_MIPS_VZ
case KVM_VM_MIPS_VZ:
#else
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 87fa8d8a1031..28c366d307e7 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -486,7 +486,8 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
return 1;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ unsigned flags)
{
handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e020d269416d..10ded83414de 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -58,7 +58,8 @@
#define KVM_ARCH_WANT_MMU_NOTIFIER
extern int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end,
+ unsigned flags);
extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 41fedec69ac3..49db50d1db04 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -834,7 +834,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ unsigned flags)
{
return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
}
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index d6c1069e9954..ed0c9c43d0cf 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -734,7 +734,8 @@ static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return 0;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ unsigned flags)
{
/* kvm_unmap_hva flushes everything anyways */
kvm_unmap_hva(kvm, start);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5ab3af7275d8..5303dbc5c9bc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1596,7 +1596,8 @@ asmlinkage void kvm_spurious_fault(void);
_ASM_EXTABLE(666b, 667b)
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ unsigned flags);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 08320b0b2b27..9663ba31347c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -270,9 +270,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
u32 token;
- irqentry_state_t state;
- state = irqentry_enter(regs);
+ ack_APIC_irq();
inc_irq_stat(irq_hv_callback_count);
@@ -283,7 +282,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
}
- irqentry_exit(regs, state);
set_irq_regs(old_regs);
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d0e2825ae617..1d450d7710d6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2505,9 +2505,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
*reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
val = GET_SMSTATE(u32, smstate, 0x7fcc);
- ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
+
val = GET_SMSTATE(u32, smstate, 0x7fc8);
- ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7fc4);
set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
@@ -2560,16 +2565,23 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
val = GET_SMSTATE(u32, smstate, 0x7f68);
- ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
+
val = GET_SMSTATE(u32, smstate, 0x7f60);
- ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ return X86EMUL_UNHANDLEABLE;
cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
val = GET_SMSTATE(u64, smstate, 0x7ed0);
- ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7e90);
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e03841f053d..76c5826e29a2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1916,7 +1916,8 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ unsigned flags)
{
return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
}
@@ -2468,7 +2469,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
}
if (sp->unsync_children)
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
__clear_sp_write_flooding_count(sp);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fb68467e6049..e90bc436f584 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -586,7 +586,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
/* Give the current vmcb to the guest */
- svm_set_gif(svm, false);
nested_vmcb->save.es = vmcb->save.es;
nested_vmcb->save.cs = vmcb->save.cs;
@@ -632,6 +631,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
/* Restore the original control entries */
copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ /* On vmexit the GIF is set to false */
+ svm_set_gif(svm, false);
+
svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
svm->vcpu.arch.l1_tsc_offset;
@@ -1132,6 +1134,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
load_nested_vmcb_control(svm, &ctl);
nested_prepare_vmcb_control(svm);
+ if (!nested_svm_vmrun_msrpm(svm))
+ return -EINVAL;
+
out_set_gif:
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
return 0;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 402dc4234e39..7bf7bf734979 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1106,6 +1106,7 @@ void sev_vm_destroy(struct kvm *kvm)
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
+ cond_resched();
}
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 03dd7bac8034..d4e18bda19c7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2183,6 +2183,12 @@ static int iret_interception(struct vcpu_svm *svm)
return 1;
}
+static int invd_interception(struct vcpu_svm *svm)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
static int invlpg_interception(struct vcpu_svm *svm)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
@@ -2774,7 +2780,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_RDPMC] = rdpmc_interception,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_INVD] = invd_interception,
[SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
@@ -2938,8 +2944,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- svm_complete_interrupts(svm);
-
if (is_guest_mode(vcpu)) {
int vmexit;
@@ -3504,7 +3508,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
stgi();
/* Any pending NMI will happen here */
- exit_fastpath = svm_exit_handlers_fastpath(vcpu);
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_after_interrupt(&svm->vcpu);
@@ -3518,6 +3521,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
}
svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb_mark_all_clean(svm->vmcb);
/* if exit due to PF check for async PF */
if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
@@ -3537,7 +3541,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
- vmcb_mark_all_clean(svm->vmcb);
+ svm_complete_interrupts(svm);
+ exit_fastpath = svm_exit_handlers_fastpath(vcpu);
return exit_fastpath;
}
@@ -3900,21 +3905,28 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *nested_vmcb;
struct kvm_host_map map;
- u64 guest;
- u64 vmcb;
int ret = 0;
- guest = GET_SMSTATE(u64, smstate, 0x7ed8);
- vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
+ u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
+ u64 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (guest) {
- if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
- return 1;
- nested_vmcb = map.hva;
- ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ if (guest) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return 1;
+
+ if (!(saved_efer & EFER_SVME))
+ return 1;
+
+ if (kvm_vcpu_map(&svm->vcpu,
+ gpa_to_gfn(vmcb), &map) == -EINVAL)
+ return 1;
+
+ ret = enter_svm_guest_mode(svm, vmcb, map.hva);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ }
}
return ret;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 23b58c28a1c9..1bb6b31eb646 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4404,6 +4404,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
+ /*
+ * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+ * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
+ * up-to-date before switching to L1.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ vmx_ept_load_pdptrs(vcpu);
+
leave_guest_mode(vcpu);
if (nested_cpu_has_preemption_timer(vmcs12))
@@ -4668,7 +4676,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
vmx->nested.msrs.entry_ctls_high &=
~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high &=
- ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
}
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 46ba2e03a892..f4e9c310032a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -129,6 +129,9 @@ static bool __read_mostly enable_preemption_timer = 1;
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
+extern bool __read_mostly allow_smaller_maxphyaddr;
+module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
+
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
@@ -791,6 +794,18 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
+ else {
+ /*
+ * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+ * between guest and host. In that case we only care about present
+ * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
+ * prepare_vmcs02_rare.
+ */
+ bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR));
+ int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0;
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask);
+ }
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -2971,7 +2986,7 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
vpid_sync_context(to_vmx(vcpu)->vpid);
}
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -3114,7 +3129,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
guest_cr3 = vcpu->arch.cr3;
else /* vmcs01.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
- ept_load_pdptrs(vcpu);
+ vmx_ept_load_pdptrs(vcpu);
} else {
guest_cr3 = pgd;
}
@@ -4352,16 +4367,6 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
-
- /*
- * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
- * between guest and host. In that case we only care about present
- * faults.
- */
- if (enable_ept) {
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
- }
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -4803,6 +4808,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
* EPT will cause page fault only if we need to
* detect illegal GPAs.
*/
+ WARN_ON_ONCE(!allow_smaller_maxphyaddr);
kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
return 1;
} else
@@ -5331,7 +5337,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* would also use advanced VM-exit information for EPT violations to
* reconstruct the page fault error code.
*/
- if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+ if (unlikely(allow_smaller_maxphyaddr && kvm_mmu_is_illegal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -6054,6 +6060,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
exit_reason != EXIT_REASON_EPT_VIOLATION &&
exit_reason != EXIT_REASON_PML_FULL &&
+ exit_reason != EXIT_REASON_APIC_ACCESS &&
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@ -8304,11 +8311,12 @@ static int __init vmx_init(void)
vmx_check_vmcs12_offsets();
/*
- * Intel processors don't have problems with
- * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
- * it for VMX by default
+ * Shadow paging doesn't have a (further) performance penalty
+ * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
+ * by default
*/
- allow_smaller_maxphyaddr = true;
+ if (!enable_ept)
+ allow_smaller_maxphyaddr = true;
return 0;
}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 26175a4759fa..a0e47720f60c 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -356,6 +356,7 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
int vmx_find_msr_index(struct vmx_msrs *m, u32 msr);
int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
@@ -551,7 +552,10 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
{
- return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+ if (!enable_ept)
+ return true;
+
+ return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
}
void dump_vmcs(void);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c44d3a73b8eb..58fa354b1446 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -188,7 +188,7 @@ static struct kvm_shared_msrs __percpu *shared_msrs;
u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);
-bool __read_mostly allow_smaller_maxphyaddr;
+bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
static u64 __read_mostly host_xss;
@@ -975,7 +975,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+ X86_CR4_SMEP;
+ unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
if (kvm_valid_cr4(vcpu, cr4))
return 1;
@@ -1003,7 +1004,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (kvm_x86_ops.set_cr4(vcpu, cr4))
return 1;
- if (((cr4 ^ old_cr4) & pdptr_bits) ||
+ if (((cr4 ^ old_cr4) & mmu_role_bits) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
@@ -2735,7 +2736,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 1;
if (!lapic_in_kernel(vcpu))
- return 1;
+ return data ? 1 : 0;
vcpu->arch.apf.msr_en_val = data;
@@ -3224,9 +3225,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_POWER_CTL:
msr_info->data = vcpu->arch.msr_ia32_power_ctl;
break;
- case MSR_IA32_TSC:
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. AMD manual doesn't explicitly
+ * state this but appears to behave the same.
+ *
+ * On userspace reads and writes, however, we unconditionally
+ * operate L1's TSC value to ensure backwards-compatible
+ * behavior for migration.
+ */
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+ vcpu->arch.tsc_offset;
+
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
break;
+ }
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -10754,9 +10768,11 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
struct x86_exception fault;
+ u32 access = error_code &
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
if (!(error_code & PFERR_PRESENT_MASK) ||
- vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) {
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
/*
* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
* tables probably do not match the TLB. Just proceed
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3d8023474f2a..7d8eced6f459 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -790,9 +790,10 @@ struct kvm_ppc_resize_hpt {
#define KVM_VM_PPC_HV 1
#define KVM_VM_PPC_PR 2
-/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
-#define KVM_VM_MIPS_TE 0
+/* on MIPS, 0 indicates auto, 1 forces VZ ASE, 2 forces trap & emulate */
+#define KVM_VM_MIPS_AUTO 0
#define KVM_VM_MIPS_VZ 1
+#define KVM_VM_MIPS_TE 2
#define KVM_S390_SIE_PAGE_OFFSET 1
diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
index 8162c58a1234..2fc6b3af81a1 100644
--- a/tools/testing/selftests/kvm/x86_64/debug_regs.c
+++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c
@@ -40,11 +40,11 @@ static void guest_code(void)
/* Single step test, covers 2 basic instructions and 2 emulated */
asm volatile("ss_start: "
- "xor %%rax,%%rax\n\t"
+ "xor %%eax,%%eax\n\t"
"cpuid\n\t"
"movl $0x1a0,%%ecx\n\t"
"rdmsr\n\t"
- : : : "rax", "ecx");
+ : : : "eax", "ebx", "ecx", "edx");
/* DR6.BD test */
asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax");
@@ -73,7 +73,7 @@ int main(void)
int i;
/* Instruction lengths starting at ss_start */
int ss_size[4] = {
- 3, /* xor */
+ 2, /* xor */
2, /* cpuid */
5, /* mov */
2, /* rdmsr */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 737666db02de..cf88233b819a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -482,7 +482,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
+ range->flags);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -4331,7 +4332,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev)
{
- int i;
+ int i, j;
struct kvm_io_bus *new_bus, *bus;
bus = kvm_get_bus(kvm, bus_idx);
@@ -4348,17 +4349,20 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
GFP_KERNEL_ACCOUNT);
- if (!new_bus) {
+ if (new_bus) {
+ memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+ new_bus->dev_count--;
+ memcpy(new_bus->range + i, bus->range + i + 1,
+ (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
+ } else {
pr_err("kvm: failed to shrink bus, removing it completely\n");
- goto broken;
+ for (j = 0; j < bus->dev_count; j++) {
+ if (j == i)
+ continue;
+ kvm_iodevice_destructor(bus->range[j].dev);
+ }
}
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
- new_bus->dev_count--;
- memcpy(new_bus->range + i, bus->range + i + 1,
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
-
-broken:
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
kfree(bus);