summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/locking.txt130
-rw-r--r--arch/x86/include/asm/hypervisor.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/include/asm/processor-flags.h2
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c1
-rw-r--r--arch/x86/kernel/kvm.c14
-rw-r--r--arch/x86/kvm/cpuid.c45
-rw-r--r--arch/x86/kvm/cpuid.h9
-rw-r--r--arch/x86/kvm/emulate.c273
-rw-r--r--arch/x86/kvm/mmu.c300
-rw-r--r--arch/x86/kvm/mmutrace.h45
-rw-r--r--arch/x86/kvm/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/svm.c6
-rw-r--r--arch/x86/kvm/vmx.c81
-rw-r--r--arch/x86/kvm/x86.c43
17 files changed, 775 insertions, 190 deletions
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 3b4cd3bf5631..41b7ac9884b5 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -6,7 +6,129 @@ KVM Lock Overview
(to be written)
-2. Reference
+2: Exception
+------------
+
+Fast page fault:
+
+Fast page fault is the fast path which fixes the guest page fault out of
+the mmu-lock on x86. Currently, the page fault can be fast only if the
+shadow page table is present and it is caused by write-protect, that means
+we just need change the W bit of the spte.
+
+What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
+SPTE_MMU_WRITEABLE bit on the spte:
+- SPTE_HOST_WRITEABLE means the gfn is writable on host.
+- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
+ the gfn is writable on guest mmu and it is not write-protected by shadow
+ page write-protection.
+
+On fast page fault path, we will use cmpxchg to atomically set the spte W
+bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this
+is safe because whenever changing these bits can be detected by cmpxchg.
+
+But we need carefully check these cases:
+1): The mapping from gfn to pfn
+The mapping from gfn to pfn may be changed since we can only ensure the pfn
+is not changed during cmpxchg. This is a ABA problem, for example, below case
+will happen:
+
+At the beginning:
+gpte = gfn1
+gfn1 is mapped to pfn1 on host
+spte is the shadow page table entry corresponding with gpte and
+spte = pfn1
+
+ VCPU 0 VCPU0
+on fast page fault path:
+
+ old_spte = *spte;
+ pfn1 is swapped out:
+ spte = 0;
+
+ pfn1 is re-alloced for gfn2.
+
+ gpte is changed to point to
+ gfn2 by the guest:
+ spte = pfn1;
+
+ if (cmpxchg(spte, old_spte, old_spte+W)
+ mark_page_dirty(vcpu->kvm, gfn1)
+ OOPS!!!
+
+We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap.
+
+For direct sp, we can easily avoid it since the spte of direct sp is fixed
+to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic()
+to pin gfn to pfn, because after gfn_to_pfn_atomic():
+- We have held the refcount of pfn that means the pfn can not be freed and
+ be reused for another gfn.
+- The pfn is writable that means it can not be shared between different gfns
+ by KSM.
+
+Then, we can ensure the dirty bitmaps is correctly set for a gfn.
+
+Currently, to simplify the whole things, we disable fast page fault for
+indirect shadow page.
+
+2): Dirty bit tracking
+In the origin code, the spte can be fast updated (non-atomically) if the
+spte is read-only and the Accessed bit has already been set since the
+Accessed bit and Dirty bit can not be lost.
+
+But it is not true after fast page fault since the spte can be marked
+writable between reading spte and updating spte. Like below case:
+
+At the beginning:
+spte.W = 0
+spte.Accessed = 1
+
+ VCPU 0 VCPU0
+In mmu_spte_clear_track_bits():
+
+ old_spte = *spte;
+
+ /* 'if' condition is satisfied. */
+ if (old_spte.Accssed == 1 &&
+ old_spte.W == 0)
+ spte = 0ull;
+ on fast page fault path:
+ spte.W = 1
+ memory write on the spte:
+ spte.Dirty = 1
+
+
+ else
+ old_spte = xchg(spte, 0ull)
+
+
+ if (old_spte.Accssed == 1)
+ kvm_set_pfn_accessed(spte.pfn);
+ if (old_spte.Dirty == 1)
+ kvm_set_pfn_dirty(spte.pfn);
+ OOPS!!!
+
+The Dirty bit is lost in this case.
+
+In order to avoid this kind of issue, we always treat the spte as "volatile"
+if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means,
+the spte is always atomicly updated in this case.
+
+3): flush tlbs due to spte updated
+If the spte is updated from writable to readonly, we should flush all TLBs,
+otherwise rmap_write_protect will find a read-only spte, even though the
+writable spte might be cached on a CPU's TLB.
+
+As mentioned before, the spte can be updated to writable out of mmu-lock on
+fast page fault path, in order to easily audit the path, we see if TLBs need
+be flushed caused by this reason in mmu_spte_update() since this is a common
+function to update spte (present -> present).
+
+Since the spte is "volatile" if it can be updated out of mmu-lock, we always
+atomicly update the spte, the race caused by fast page fault can be avoided,
+See the comments in spte_has_volatile_bits() and mmu_spte_update().
+
+3. Reference
------------
Name: kvm_lock
@@ -23,3 +145,9 @@ Arch: x86
Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
- tsc offset in vmcb
Comment: 'raw' because updating the tsc offsets must not be preempted.
+
+Name: kvm->mmu_lock
+Type: spinlock_t
+Arch: any
+Protects: -shadow page/shadow tlb entry
+Comment: it is a spinlock since it is used in mmu notifier.
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 7a15153c675d..b518c7509933 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
static inline bool hypervisor_x2apic_available(void)
{
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1ac46c22dd50..c764f43b71c5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -192,8 +192,8 @@ struct x86_emulate_ops {
struct x86_instruction_info *info,
enum x86_intercept_stage stage);
- bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+ void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
@@ -280,9 +280,9 @@ struct x86_emulate_ctxt {
u8 modrm_seg;
bool rip_relative;
unsigned long _eip;
+ struct operand memop;
/* Fields above regs are cleared together. */
unsigned long regs[NR_VCPU_REGS];
- struct operand memop;
struct operand *memopp;
struct fetch_cache fetch;
struct read_cache io_read;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 24b76474d9de..a3e9409e90b6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,12 +48,13 @@
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
@@ -673,6 +674,7 @@ struct kvm_x86_ops {
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
+ bool (*invpcid_supported)(void);
void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index f8ab3eaad128..aea1d1d848c7 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -44,6 +44,7 @@
*/
#define X86_CR3_PWT 0x00000008 /* Page Write Through */
#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
+#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
/*
* Intel CPU features in CR4
@@ -61,6 +62,7 @@
#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
+#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */
#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index de007c272730..74fcb963595b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -60,6 +60,7 @@
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
+#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -281,6 +282,7 @@ enum vmcs_field {
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_INVPCID 58
/*
* Interruption-information format
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 755f64fb0743..6d6dd7afb229 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,6 +37,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#endif
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
+ &x86_hyper_kvm,
};
const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 75ab94c75c7a..299cf1470923 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -41,6 +41,7 @@
#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
static int kvmapf = 1;
@@ -483,6 +484,19 @@ void __init kvm_guest_init(void)
#endif
}
+static bool __init kvm_detect(void)
+{
+ if (!kvm_para_available())
+ return false;
+ return true;
+}
+
+const struct hypervisor_x86 x86_hyper_kvm __refconst = {
+ .name = "KVM",
+ .detect = kvm_detect,
+};
+EXPORT_SYMBOL_GPL(x86_hyper_kvm);
+
static __init int activate_jump_labels(void)
{
if (has_steal_clock) {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 61ccbdf3d0ac..0595f1397b7c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_lm = 0;
#endif
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+ unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
- 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
F(F16C) | F(RDRAND);
@@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.ebx */
const u32 kvm_supported_word9_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
- F(BMI2) | F(ERMS) | F(RTM);
+ F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -640,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
}
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
- u32 function, index;
+ u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
- function = kvm_register_read(vcpu, VCPU_REGS_RAX);
- index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
best = kvm_find_cpuid_entry(vcpu, function, index);
if (!best)
best = check_cpuid_limit(vcpu, function, index);
if (best) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
- }
+ *eax = best->eax;
+ *ebx = best->ebx;
+ *ecx = best->ecx;
+ *edx = best->edx;
+ } else
+ *eax = *ebx = *ecx = *edx = 0;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 function, eax, ebx, ecx, edx;
+
+ function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
kvm_x86_ops->skip_emulated_instruction(vcpu);
- trace_kvm_cpuid(function,
- kvm_register_read(vcpu, VCPU_REGS_RAX),
- kvm_register_read(vcpu, VCPU_REGS_RBX),
- kvm_register_read(vcpu, VCPU_REGS_RCX),
- kvm_register_read(vcpu, VCPU_REGS_RDX));
+ trace_kvm_cpuid(function, eax, ebx, ecx, edx);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb5..a10e46016851 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_OSVW));
}
+static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & bit(X86_FEATURE_PCID));
+}
+
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f95d242ee9f7..97d9a9914ba8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
return ctxt->ops->intercept(ctxt, &info, stage);
}
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+ *dest = (*dest & ~mask) | (src & mask);
+}
+
static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
{
return (1UL << (ctxt->ad_bytes << 3)) - 1;
}
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel;
+ struct desc_struct ss;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return ~0UL;
+ ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+ return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+ return (__fls(stack_mask(ctxt)) + 1) >> 3;
+}
+
/* Access/update address held in a register, based on addressing mode. */
static inline unsigned long
address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
@@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->orig_val = op->val;
}
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+ if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+ ctxt->modrm_seg = VCPU_SREG_SS;
+}
+
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
@@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
modrm_ea += insn_fetch(s32, ctxt);
- else
+ else {
modrm_ea += ctxt->regs[base_reg];
+ adjust_modrm_seg(ctxt, base_reg);
+ }
if (index_reg != 4)
modrm_ea += ctxt->regs[index_reg] << scale;
} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
if (ctxt->mode == X86EMUL_MODE_PROT64)
ctxt->rip_relative = 1;
- } else
- modrm_ea += ctxt->regs[ctxt->modrm_rm];
+ } else {
+ base_reg = ctxt->modrm_rm;
+ modrm_ea += ctxt->regs[base_reg];
+ adjust_modrm_seg(ctxt, base_reg);
+ }
switch (ctxt->modrm_mod) {
case 0:
if (ctxt->modrm_rm == 5)
@@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
/* allowed just for 8 bytes segments */
static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_struct *desc)
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
{
struct desc_ptr dt;
u16 index = selector >> 3;
@@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
- addr = dt.address + index * 8;
+ *desc_addr_p = addr = dt.address + index * 8;
return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
&ctxt->exception);
}
@@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg)
{
- struct desc_struct seg_desc;
+ struct desc_struct seg_desc, old_desc;
u8 dpl, rpl, cpl;
unsigned err_vec = GP_VECTOR;
u32 err_code = 0;
bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ ulong desc_addr;
int ret;
memset(&seg_desc, 0, sizeof seg_desc);
@@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto load;
}
- /* NULL selector is not valid for TR, CS and SS */
- if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ rpl = selector & 3;
+ cpl = ctxt->ops->cpl(ctxt);
+
+ /* NULL selector is not valid for TR, CS and SS (except for long mode) */
+ if ((seg == VCPU_SREG_CS
+ || (seg == VCPU_SREG_SS
+ && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
+ || seg == VCPU_SREG_TR)
&& null_selector)
goto exception;
@@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (null_selector) /* for NULL selector skip all following checks */
goto load;
- ret = read_segment_descriptor(ctxt, selector, &seg_desc);
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
}
- rpl = selector & 3;
dpl = seg_desc.dpl;
- cpl = ctxt->ops->cpl(ctxt);
switch (seg) {
case VCPU_SREG_SS:
@@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
break;
case VCPU_SREG_LDTR:
if (seg_desc.s || seg_desc.type != 2)
@@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_push(struct x86_emulate_ctxt *ctxt)
+static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
{
struct segmented_address addr;
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
addr.seg = VCPU_SREG_SS;
+ return segmented_write(ctxt, addr, data, bytes);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
- return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
+ return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
}
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned frame_size = ctxt->src.val;
+ unsigned nesting_level = ctxt->src2.val & 31;
+
+ if (nesting_level)
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
+ stack_mask(ctxt));
+ assign_masked(&ctxt->regs[VCPU_REGS_RSP],
+ ctxt->regs[VCPU_REGS_RSP] - frame_size,
+ stack_mask(ctxt));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+ assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
+ stack_mask(ctxt));
+ return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
+}
+
static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
{
int seg = ctxt->src2.val;
@@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
u32 eax, ebx, ecx, edx;
eax = ecx = 0;
- return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
- && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
}
@@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
eax = 0x00000000;
ecx = 0x00000000;
- if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) {
- /*
- * Intel ("GenuineIntel")
- * remark: Intel CPUs only support "syscall" in 64bit
- * longmode. Also an 64bit guest with a
- * 32bit compat-app running will #UD !! While this
- * behaviour can be fixed (by emulating) into AMD
- * response - CPUs of AMD can't behave like Intel.
- */
- if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
- edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
- return false;
+ ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ /*
+ * Intel ("GenuineIntel")
+ * remark: Intel CPUs only support "syscall" in 64bit
+ * longmode. Also an 64bit guest with a
+ * 32bit compat-app running will #UD !! While this
+ * behaviour can be fixed (by emulating) into AMD
+ * response - CPUs of AMD can't behave like Intel.
+ */
+ if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
+ return false;
- /* AMD ("AuthenticAMD") */
- if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
- edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
- return true;
-
- /* AMD ("AMDisbetter!") */
- if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
- edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
- return true;
- }
+ /* AMD ("AuthenticAMD") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+ return true;
+
+ /* AMD ("AMDisbetter!") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
+ return true;
/* default: (not Intel, not AMD), apply Intel's stricter rules... */
return false;
@@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ulong old_tss_base =
ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
+ ulong desc_addr;
/* FIXME: old_tss_base == ~0 ? */
- ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
}
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
+}
+
static int em_invlpg(struct x86_emulate_ctxt *ctxt)
{
int rc;
@@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+ void (*get)(struct x86_emulate_ctxt *ctxt,
+ struct desc_ptr *ptr))
+{
+ struct desc_ptr desc_ptr;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ get(ctxt, &desc_ptr);
+ if (ctxt->op_bytes == 2) {
+ ctxt->op_bytes = 4;
+ desc_ptr.address &= 0x00ffffff;
+ }
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
+}
+
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
+
static int em_lgdt(struct x86_emulate_ctxt *ctxt)
{
struct desc_ptr desc_ptr;
int rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
rc = read_descriptor(ctxt, ctxt->src.addr.mem,
&desc_ptr.size, &desc_ptr.address,
ctxt->op_bytes);
@@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
struct desc_ptr desc_ptr;
int rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
rc = read_descriptor(ctxt, ctxt->src.addr.mem,
&desc_ptr.size, &desc_ptr.address,
ctxt->op_bytes);
@@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = ctxt->regs[VCPU_REGS_RAX];
+ ecx = ctxt->regs[VCPU_REGS_RCX];
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->regs[VCPU_REGS_RAX] = eax;
+ ctxt->regs[VCPU_REGS_RBX] = ebx;
+ ctxt->regs[VCPU_REGS_RCX] = ecx;
+ ctxt->regs[VCPU_REGS_RDX] = edx;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
+ ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+ case 8:
+ asm("bswap %0" : "+r"(ctxt->dst.val));
+ break;
+#endif
+ default:
+ asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -3424,14 +3587,14 @@ static struct opcode group5[] = {
static struct opcode group6[] = {
DI(Prot, sldt),
DI(Prot, str),
- DI(Prot | Priv, lldt),
- DI(Prot | Priv, ltr),
+ II(Prot | Priv | SrcMem16, em_lldt, lldt),
+ II(Prot | Priv | SrcMem16, em_ltr, ltr),
N, N, N, N,
};
static struct group_dual group7 = { {
- DI(Mov | DstMem | Priv, sgdt),
- DI(Mov | DstMem | Priv, sidt),
+ II(Mov | DstMem | Priv, em_sgdt, sgdt),
+ II(Mov | DstMem | Priv, em_sidt, sidt),
II(SrcMem | Priv, em_lgdt, lgdt),
II(SrcMem | Priv, em_lidt, lidt),
II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = {
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
I(SrcImmFAddr | No64, em_call_far), N,
II(ImplicitOps | Stack, em_pushf, pushf),
- II(ImplicitOps | Stack, em_popf, popf), N, N,
+ II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = {
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
- N, N, N, I(ImplicitOps | Stack, em_ret_far),
+ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
+ N, I(ImplicitOps | Stack, em_ret_far),
D(ImplicitOps), DI(SrcImmByte, intn),
D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
@@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = {
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
- DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
+ II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM), N, N,
/* 0xA8 - 0xAF */
@@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = {
I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
- /* 0xC0 - 0xCF */
+ /* 0xC0 - 0xC7 */
D2bv(DstMem | SrcReg | ModRM | Lock),
N, D(DstMem | SrcReg | ModRM | Mov),
N, N, N, GD(0, &group9),
- N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ X8(I(DstReg, em_bswap)),
/* 0xD0 - 0xDF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0xE0 - 0xEF */
@@ -4426,12 +4591,12 @@ twobyte_insn:
break;
case 0xb6 ... 0xb7: /* movzx */
ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
: (u16) ctxt->src.val;
break;
case 0xbe ... 0xbf: /* movsx */
ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
(s16) ctxt->src.val;
break;
case 0xc0 ... 0xc1: /* xadd */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3b53d9e08bfc..28c8fbcc6763 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static void mmu_spte_set(u64 *sptep, u64 spte);
+static void mmu_free_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
}
#endif
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+ return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
static bool spte_has_volatile_bits(u64 spte)
{
+ /*
+ * Always atomicly update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (spte_is_locklessly_modifiable(spte))
+ return true;
+
if (!shadow_accessed_mask)
return false;
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changged.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
*/
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
- u64 mask, old_spte = *sptep;
+ u64 old_spte = *sptep;
+ bool ret = false;
WARN_ON(!is_rmap_spte(new_spte));
- if (!is_shadow_present_pte(old_spte))
- return mmu_spte_set(sptep, new_spte);
-
- new_spte |= old_spte & shadow_dirty_mask;
-
- mask = shadow_accessed_mask;
- if (is_writable_pte(old_spte))
- mask |= shadow_dirty_mask;
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return ret;
+ }
- if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+ if (!spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
+ /*
+ * For the spte updated out of mmu-lock is safe, since
+ * we always atomicly update it, see the comments in
+ * spte_has_volatile_bits().
+ */
+ if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ ret = true;
+
if (!shadow_accessed_mask)
- return;
+ return ret;
if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+ return ret;
}
/*
@@ -1049,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ WARN_ON(page_header(__pa(sptep))->role.level ==
+ PT_PAGE_TABLE_LEVEL);
+ drop_spte(kvm, sptep);
+ --kvm->stat.lpages;
+ return true;
+ }
+
+ return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (__drop_large_spte(vcpu->kvm, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect on the specified @sptep, @pt_protect indicates whether
+ * spte writ-protection is caused by protecting shadow page table.
+ * @flush indicates whether tlb need be flushed.
+ *
+ * Note: write protection is difference between drity logging and spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at anytime if
+ * its dirty bitmap is properly set.
+ * - for spte protection, the spte can be writable only after unsync-ing
+ * shadow page.
+ *
+ * Return true if the spte is dropped.
+ */
+static bool
+spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && spte_is_locklessly_modifiable(spte)))
+ return false;
+
+ rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+ if (__drop_large_spte(kvm, sptep)) {
+ *flush |= true;
+ return true;
+ }
+
+ if (pt_protect)
+ spte &= ~SPTE_MMU_WRITEABLE;
+ spte = spte & ~PT_WRITABLE_MASK;
+
+ *flush |= mmu_spte_update(sptep, spte);
+ return false;
+}
+
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+ int level, bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
- int write_protected = 0;
+ bool flush = false;
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
- if (!is_writable_pte(*sptep)) {
- sptep = rmap_get_next(&iter);
- continue;
- }
-
- if (level == PT_PAGE_TABLE_LEVEL) {
- mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
- sptep = rmap_get_next(&iter);
- } else {
- BUG_ON(!is_large_pte(*sptep));
- drop_spte(kvm, sptep);
- --kvm->stat.lpages;
+ if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
sptep = rmap_get_first(*rmapp, &iter);
+ continue;
}
- write_protected = 1;
+ sptep = rmap_get_next(&iter);
}
- return write_protected;
+ return flush;
}
/**
@@ -1098,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
- __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+ __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
/* clear the first set bit */
mask &= mask - 1;
}
}
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
{
struct kvm_memory_slot *slot;
unsigned long *rmapp;
int i;
- int write_protected = 0;
+ bool write_protected = false;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_PAGE_TABLE_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmapp, i);
+ write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
}
return write_protected;
@@ -1699,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
+ bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1864,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
mmu_spte_set(sptep, spte);
}
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep);
- --vcpu->kvm->stat.lpages;
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned direct_access)
{
@@ -2241,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
- u64 spte, entry = *sptep;
+ u64 spte;
int ret = 0;
if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2255,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= shadow_x_mask;
else
spte |= shadow_nx_mask;
+
if (pte_access & ACC_USER_MASK)
spte |= shadow_user_mask;
+
if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
@@ -2281,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto done;
}
- spte |= PT_WRITABLE_MASK;
+ spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
if (!vcpu->arch.mmu.direct_map
&& !(pte_access & ACC_WRITE_MASK)) {
@@ -2310,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writable_pte(spte))
- spte &= ~PT_WRITABLE_MASK;
+ spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
}
}
@@ -2319,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- mmu_spte_update(sptep, spte);
- /*
- * If we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
@@ -2401,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
+ mmu_free_roots(vcpu);
}
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2623,18 +2685,116 @@ exit:
return ret;
}
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ /*
+ * #PF can be fast only if the shadow page table is present and it
+ * is caused by write-protect, that means we just need change the
+ * W bit of the spte which can be done out of mmu-lock.
+ */
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ gfn_t gfn;
+
+ WARN_ON(!sp->role.direct);
+
+ /*
+ * The gfn of direct spte is stable since it is calculated
+ * by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+ mark_page_dirty(vcpu->kvm, gfn);
+
+ return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+ u32 error_code)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ bool ret = false;
+ u64 spte = 0ull;
+
+ if (!page_fault_can_be_fast(vcpu, error_code))
+ return false;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+ if (!is_shadow_present_pte(spte) || iterator.level < level)
+ break;
+
+ /*
+ * If the mapping has been changed, let the vcpu fault on the
+ * same address again.
+ */
+ if (!is_rmap_spte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ if (!is_last_spte(spte, level))
+ goto exit;
+
+ /*
+ * Check if it is a spurious fault caused by TLB lazily flushed.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_writable_pte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ /*
+ * Currently, to simplify the code, only the spte write-protected
+ * by dirty-log can be fast fixed.
+ */
+ if (!spte_is_locklessly_modifiable(spte))
+ goto exit;
+
+ /*
+ * Currently, fast page fault only works for direct mapping since
+ * the gfn is not stable for indirect shadow page.
+ * See Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+ trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+ spte, ret);
+ walk_shadow_page_lockless_end(vcpu);
+
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
- bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+ gfn_t gfn, bool prefault)
{
int r;
int level;
int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
- bool map_writable;
+ bool map_writable, write = error_code & PFERR_WRITE_MASK;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
if (likely(!force_pt_level)) {
@@ -2651,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, v, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -3039,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn, prefault);
+ error_code, gfn, prefault);
}
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3119,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -3883,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
+ bool flush = false;
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
@@ -3897,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
!is_last_spte(pt[i], sp->role.level))
continue;
- if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i]);
- --kvm->stat.lpages;
- continue;
- }
-
- /* avoid RMW */
- if (is_writable_pte(pt[i]))
- mmu_spte_update(&pt[i],
- pt[i] & ~PT_WRITABLE_MASK);
+ spte_write_protect(kvm, &pt[i], &flush, false);
}
}
kvm_flush_remote_tlbs(kvm);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e81322a..cd6e98333ba3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
*/
TRACE_EVENT(
kvm_mmu_pagetable_walk,
- TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
- TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
TP_STRUCT__entry(
__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
TP_fast_assign(
__entry->addr = addr;
- __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
- | (!!fetch_fault << 4);
+ __entry->pferr = pferr;
),
TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
@@ -243,6 +242,44 @@ TRACE_EVENT(
TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
__entry->access)
);
+
+#define __spte_satisfied(__spte) \
+ (__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ u64 *sptep, u64 old_spte, bool retry),
+ TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gva_t, gva)
+ __field(u32, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(bool, retry)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gva = gva;
+ __entry->error_code = error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->retry = retry;
+ ),
+
+ TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->gva, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ )
+);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f970937ef1..bb7cf01cae76 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
const int fetch_fault = access & PFERR_FETCH_MASK;
u16 errcode = 0;
- trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
- fetch_fault);
+ trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
eperm = false;
walker->level = mmu->root_level;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7a418783259d..baead950d6c8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_invpcid_supported(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.cpuid_update = svm_cpuid_update,
.rdtscp_supported = svm_rdtscp_supported,
+ .invpcid_supported = svm_invpcid_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e10ec0e4d1c6..c39b60707e02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -74,7 +74,7 @@ module_param_named(unrestricted_guest,
static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
-static bool __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly vmm_exclusive = 1;
@@ -861,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
SECONDARY_EXEC_RDTSCP;
}
+static inline bool cpu_has_vmx_invpcid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1751,6 +1757,11 @@ static bool vmx_rdtscp_supported(void)
return cpu_has_vmx_rdtscp();
}
+static bool vmx_invpcid_supported(void)
+{
+ return cpu_has_vmx_invpcid() && enable_ept;
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -2470,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_RDTSCP;
+ SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_INVPCID;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -3175,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
+ * fail; use the cache instead.
+ */
+ if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
+ return vmx->cpl;
+ }
+
if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+ vmx->cpl = __vmx_get_cpl(vcpu);
}
- return to_vmx(vcpu)->cpl;
+
+ return vmx->cpl;
}
@@ -3187,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable)
+ if (var->unusable || !var->present)
ar = 1 << 16;
else {
ar = var->type & 15;
@@ -3199,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
ar |= (var->db & 1) << 14;
ar |= (var->g & 1) << 15;
}
- if (ar == 0) /* a 0 value means unusable */
- ar = AR_UNUSABLE_MASK;
return ar;
}
@@ -3791,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
enable_unrestricted_guest = 0;
+ /* Enable INVPCID for non-ept guests may cause performance regression. */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
}
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4829,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
gpa_t gpa;
+ u32 error_code;
int gla_validity;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4853,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+ /* It is a write fault? */
+ error_code = exit_qualification & (1U << 1);
+ /* ept page table is present? */
+ error_code |= (exit_qualification >> 3) & 0x1;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
@@ -4968,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
int ret = 1;
u32 cpu_exec_ctrl;
bool intr_window_requested;
+ unsigned count = 130;
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
- while (!guest_state_valid(vcpu)) {
- if (intr_window_requested
- && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+ while (!guest_state_valid(vcpu) && count-- != 0) {
+ if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
+ if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+ return 1;
+
err = emulate_instruction(vcpu, 0);
if (err == EMULATE_DO_MMIO) {
@@ -4984,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
goto out;
}
- if (err != EMULATE_DONE)
+ if (err != EMULATE_DONE) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
+ }
if (signal_pending(current))
goto out;
@@ -4993,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
schedule();
}
- vmx->emulation_required = 0;
+ vmx->emulation_required = !guest_state_valid(vcpu);
out:
return ret;
}
@@ -6527,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
}
}
}
+
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ /* Exposing INVPCID only when PCID is exposed */
+ best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ if (vmx_invpcid_supported() &&
+ best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+ guest_cpuid_has_pcid(vcpu)) {
+ exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ } else {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ if (best)
+ best->ecx &= ~bit(X86_FEATURE_INVPCID);
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7261,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cpuid_update = vmx_cpuid_update,
.rdtscp_supported = vmx_rdtscp_supported,
+ .invpcid_supported = vmx_invpcid_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8eacb2e64560..59b59508ff07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
return 1;
}
+ if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return 1;
+
kvm_x86_ops->set_cr0(vcpu, cr0);
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
kvm_read_cr3(vcpu)))
return 1;
+ if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+ if (!guest_cpuid_has_pcid(vcpu))
+ return 1;
+
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ return 1;
+ }
+
if (kvm_x86_ops->set_cr4(vcpu, cr4))
return 1;
- if ((cr4 ^ old_cr4) & pdptr_bits)
+ if (((cr4 ^ old_cr4) & pdptr_bits) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
+ if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+ if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
+ return 1;
+ } else
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
} else {
if (is_pae(vcpu)) {
if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -4302,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}
-static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
- struct kvm_cpuid_entry2 *cpuid = NULL;
-
- if (eax && ecx)
- cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
- *eax, *ecx);
-
- if (cpuid) {
- *eax = cpuid->eax;
- *ecx = cpuid->ecx;
- if (ebx)
- *ebx = cpuid->ebx;
- if (edx)
- *edx = cpuid->edx;
- return true;
- }
-
- return false;
+ kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
}
static struct x86_emulate_ops emulate_ops = {