summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt184
-rw-r--r--Documentation/virtual/kvm/mmu.txt14
-rw-r--r--arch/x86/include/asm/kvm_host.h11
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/mmu.c49
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h19
-rw-r--r--arch/x86/kvm/svm.c24
-rw-r--r--arch/x86/kvm/x86.c26
-rw-r--r--arch/x86/kvm/x86.h20
-rw-r--r--virt/kvm/kvm_main.c38
11 files changed, 265 insertions, 133 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6485750ae08..f7735c72c12 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2565,6 +2565,120 @@ associated with the service will be forgotten, and subsequent RTAS
calls by the guest for that service will be passed to userspace to be
handled.
+4.87 KVM_SET_GUEST_DEBUG
+
+Capability: KVM_CAP_SET_GUEST_DEBUG
+Architectures: x86, s390, ppc
+Type: vcpu ioctl
+Parameters: struct kvm_guest_debug (in)
+Returns: 0 on success; -1 on error
+
+struct kvm_guest_debug {
+ __u32 control;
+ __u32 pad;
+ struct kvm_guest_debug_arch arch;
+};
+
+Set up the processor specific debug registers and configure vcpu for
+handling guest debug events. There are two parts to the structure, the
+first a control bitfield indicates the type of debug events to handle
+when running. Common control bits are:
+
+ - KVM_GUESTDBG_ENABLE: guest debugging is enabled
+ - KVM_GUESTDBG_SINGLESTEP: the next run should single-step
+
+The top 16 bits of the control field are architecture specific control
+flags which can include the following:
+
+ - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86]
+ - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390]
+ - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
+ - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
+ - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390]
+
+For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
+are enabled in memory so we need to ensure breakpoint exceptions are
+correctly trapped and the KVM run loop exits at the breakpoint and not
+running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP
+we need to ensure the guest vCPUs architecture specific registers are
+updated to the correct (supplied) values.
+
+The second part of the structure is architecture specific and
+typically contains a set of debug registers.
+
+When debug events exit the main run loop with the reason
+KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
+structure containing architecture specific debug information.
+
+4.88 KVM_GET_EMULATED_CPUID
+
+Capability: KVM_CAP_EXT_EMUL_CPUID
+Architectures: x86
+Type: system ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 flags;
+ struct kvm_cpuid_entry2 entries[0];
+};
+
+The member 'flags' is used for passing flags from userspace.
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2)
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+This ioctl returns x86 cpuid features which are emulated by
+kvm.Userspace can use the information returned by this ioctl to query
+which features are emulated by kvm instead of being present natively.
+
+Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2
+structure with the 'nent' field indicating the number of entries in
+the variable-size array 'entries'. If the number of entries is too low
+to describe the cpu capabilities, an error (E2BIG) is returned. If the
+number is too high, the 'nent' field is adjusted and an error (ENOMEM)
+is returned. If the number is just right, the 'nent' field is adjusted
+to the number of valid entries in the 'entries' array, which is then
+filled.
+
+The entries returned are the set CPUID bits of the respective features
+which kvm emulates, as returned by the CPUID instruction, with unknown
+or unsupported feature bits cleared.
+
+Features like x2apic, for example, may not be present in the host cpu
+but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be
+emulated efficiently and thus not included here.
+
+The fields in each entry are defined as follows:
+
+ function: the eax value used to obtain the entry
+ index: the ecx value used to obtain the entry (for entries that are
+ affected by ecx)
+ flags: an OR of zero or more of the following:
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
+ if the index field is valid
+ KVM_CPUID_FLAG_STATEFUL_FUNC:
+ if cpuid for this function returns different values for successive
+ invocations; there will be several entries with the same function,
+ all with this flag set
+ KVM_CPUID_FLAG_STATE_READ_NEXT:
+ for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
+ the first entry to be read by a cpu
+ eax, ebx, ecx, edx: the values returned by the cpuid instruction for
+ this function/index combination
5. The kvm_run structure
------------------------
@@ -2868,76 +2982,6 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
};
-4.81 KVM_GET_EMULATED_CPUID
-
-Capability: KVM_CAP_EXT_EMUL_CPUID
-Architectures: x86
-Type: system ioctl
-Parameters: struct kvm_cpuid2 (in/out)
-Returns: 0 on success, -1 on error
-
-struct kvm_cpuid2 {
- __u32 nent;
- __u32 flags;
- struct kvm_cpuid_entry2 entries[0];
-};
-
-The member 'flags' is used for passing flags from userspace.
-
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0)
-#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1)
-#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2)
-
-struct kvm_cpuid_entry2 {
- __u32 function;
- __u32 index;
- __u32 flags;
- __u32 eax;
- __u32 ebx;
- __u32 ecx;
- __u32 edx;
- __u32 padding[3];
-};
-
-This ioctl returns x86 cpuid features which are emulated by
-kvm.Userspace can use the information returned by this ioctl to query
-which features are emulated by kvm instead of being present natively.
-
-Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2
-structure with the 'nent' field indicating the number of entries in
-the variable-size array 'entries'. If the number of entries is too low
-to describe the cpu capabilities, an error (E2BIG) is returned. If the
-number is too high, the 'nent' field is adjusted and an error (ENOMEM)
-is returned. If the number is just right, the 'nent' field is adjusted
-to the number of valid entries in the 'entries' array, which is then
-filled.
-
-The entries returned are the set CPUID bits of the respective features
-which kvm emulates, as returned by the CPUID instruction, with unknown
-or unsupported feature bits cleared.
-
-Features like x2apic, for example, may not be present in the host cpu
-but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be
-emulated efficiently and thus not included here.
-
-The fields in each entry are defined as follows:
-
- function: the eax value used to obtain the entry
- index: the ecx value used to obtain the entry (for entries that are
- affected by ecx)
- flags: an OR of zero or more of the following:
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
- if the index field is valid
- KVM_CPUID_FLAG_STATEFUL_FUNC:
- if cpuid for this function returns different values for successive
- invocations; there will be several entries with the same function,
- all with this flag set
- KVM_CPUID_FLAG_STATE_READ_NEXT:
- for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
- the first entry to be read by a cpu
- eax, ebx, ecx, edx: the values returned by the cpuid instruction for
- this function/index combination
-
6. Capabilities that can be enabled on vCPUs
--------------------------------------------
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 29089417614..53838d9c629 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -425,6 +425,20 @@ fault through the slow path.
Since only 19 bits are used to store generation-number on mmio spte, all
pages are zapped when there is an overflow.
+Unfortunately, a single memory access might access kvm_memslots(kvm) multiple
+times, the last one happening when the generation number is retrieved and
+stored into the MMIO spte. Thus, the MMIO spte might be created based on
+out-of-date information, but with an up-to-date generation number.
+
+To avoid this, the generation number is incremented again after synchronize_srcu
+returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a
+memslot update, while some SRCU readers might be using the old copy. We do not
+want to use an MMIO sptes created with an odd generation number, and we can do
+this without losing a bit in the MMIO spte. The low bit of the generation
+is not stored in MMIO spte, and presumed zero when it is extracted out of the
+spte. If KVM is unlucky and creates an MMIO spte while the low bit is 1,
+the next access to the spte will always be a cache miss.
+
Further reading
===============
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 73e4149eda3..2dbde3b7446 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -262,7 +262,8 @@ struct kvm_mmu {
struct x86_exception *fault);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
struct x86_exception *exception);
- gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
+ gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -477,6 +478,7 @@ struct kvm_vcpu_arch {
u64 mmio_gva;
unsigned access;
gfn_t mmio_gfn;
+ u64 mmio_gen;
struct kvm_pmu pmu;
@@ -892,7 +894,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
static inline int __kvm_irq_line_state(unsigned long *irq_state,
@@ -923,7 +924,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -943,7 +945,8 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
void kvm_enable_tdp(void);
void kvm_disable_tdp(void);
-static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
{
return gpa;
}
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a5380590ab0..43b33e301e6 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_X2APIC));
}
+static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx;
+}
+
static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 931467881da..76398fe15df 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
/*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52
-#define MMIO_GEN_SHIFT 19
-#define MMIO_GEN_LOW_SHIFT 9
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
@@ -236,12 +240,7 @@ static unsigned int get_mmio_spte_generation(u64 spte)
static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
{
- /*
- * Init kvm generation close to MMIO_MAX_GEN to easily test the
- * code of handling generation number wrap-around.
- */
- return (kvm_memslots(kvm)->generation +
- MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+ return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
}
static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
@@ -296,11 +295,6 @@ static bool check_mmio_spte(struct kvm *kvm, u64 spte)
return likely(kvm_gen == spte_gen);
}
-static inline u64 rsvd_bits(int s, int e)
-{
- return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -3163,7 +3157,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- vcpu_clear_mmio_info(vcpu, ~0ul);
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -3206,7 +3200,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
{
if (exception)
exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
}
static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -3518,6 +3512,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
context->bad_mt_xwr = 0;
@@ -3525,6 +3520,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
exb_bit_rsvd = rsvd_bits(63, 63);
if (!guest_cpuid_has_gbpages(vcpu))
gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (guest_cpuid_is_amd(vcpu))
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
@@ -3559,9 +3562,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
+ nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -4433,7 +4436,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
+ if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b982112d2ca..bde8ee72575 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,6 +56,11 @@
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 41077652826..0ab6c65a282 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -321,9 +321,22 @@ retry_walk:
walker->pte_gpa[walker->level - 1] = pte_gpa;
real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
+ PFERR_USER_MASK|PFERR_WRITE_MASK,
+ &walker->fault);
+
+ /*
+ * FIXME: This can happen if emulation (for of an INS/OUTS
+ * instruction) triggers a nested page fault. The exit
+ * qualification / exit info field will incorrectly have
+ * "guest page access" as the nested page fault's cause,
+ * instead of "guest page structure access". To fix this,
+ * the x86_exception struct should be augmented with enough
+ * information to fix the exit_qualification or exit_info_1
+ * fields.
+ */
if (unlikely(real_gfn == UNMAPPED_GVA))
- goto error;
+ return 0;
+
real_gfn = gpa_to_gfn(real_gfn);
host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
@@ -364,7 +377,7 @@ retry_walk:
if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
if (real_gpa == UNMAPPED_GVA)
return 0;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ef70403737..d0a61a05a89 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1974,10 +1974,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
+ if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = (1ULL << 32);
+ svm->vmcb->control.exit_info_2 = fault->address;
+ }
+
+ svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ svm->vmcb->control.exit_info_1 |= fault->error_code;
+
+ /*
+ * The present bit is always zero for page structure faults on real
+ * hardware.
+ */
+ if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
+ svm->vmcb->control.exit_info_1 &= ~1;
nested_svm_vmexit(svm);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 916e8951521..7b25aa2725f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -408,12 +408,14 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+
+ return fault->nested_page_fault;
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -457,11 +459,12 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t ngfn, void *data, int offset, int len,
u32 access)
{
+ struct x86_exception exception;
gfn_t real_gfn;
gpa_t ngpa;
ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
+ real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
if (real_gfn == UNMAPPED_GVA)
return -EFAULT;
@@ -4063,16 +4066,16 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
kvm_x86_ops->get_segment(vcpu, var, seg);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
{
gpa_t t_gpa;
- struct x86_exception exception;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
return t_gpa;
}
@@ -4929,16 +4932,18 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
}
}
-static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
if (ctxt->exception.vector == PF_VECTOR)
- kvm_propagate_fault(vcpu, &ctxt->exception);
- else if (ctxt->exception.error_code_valid)
+ return kvm_propagate_fault(vcpu, &ctxt->exception);
+
+ if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
ctxt->exception.error_code);
else
kvm_queue_exception(vcpu, ctxt->exception.vector);
+ return false;
}
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -5300,8 +5305,9 @@ restart:
}
if (ctxt->have_exception) {
- inject_emulated_exception(vcpu);
r = EMULATE_DONE;
+ if (inject_emulated_exception(vcpu))
+ return r;
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in) {
/* FIXME: return into emulator if single-stepping. */
@@ -5569,7 +5575,7 @@ static void kvm_set_mmio_spte_mask(void)
* entry to generate page fault with PFER.RSV = 1.
*/
/* Mask the reserved physical address bits. */
- mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+ mask = rsvd_bits(maxphyaddr, 51);
/* Bit 62 is always reserved for 32bit host. */
mask |= 0x3ull << 62;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 306a1b77581..985fb2c006f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -88,15 +88,23 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
vcpu->arch.mmio_gva = gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
}
/*
- * Clear the mmio cache info for the given gva,
- * specially, if gva is ~0ul, we clear all mmio cache info.
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
*/
+#define MMIO_GVA_ANY (~(gva_t)0)
+
static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
{
- if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
return;
vcpu->arch.mmio_gva = 0;
@@ -104,7 +112,8 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
{
- if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
return true;
return false;
@@ -112,7 +121,8 @@ static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
- if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
return true;
return false;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7176929a4cd..c338599804e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -95,8 +95,6 @@ static int hardware_enable_all(void);
static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-static void update_memslots(struct kvm_memslots *slots,
- struct kvm_memory_slot *new, u64 last_generation);
static void kvm_release_pfn_dirty(pfn_t pfn);
static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -477,6 +475,13 @@ static struct kvm *kvm_create_vm(unsigned long type)
kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!kvm->memslots)
goto out_err_no_srcu;
+
+ /*
+ * Init kvm generation close to the maximum to easily test the
+ * code of handling generation number wrap-around.
+ */
+ kvm->memslots->generation = -150;
+
kvm_init_memslots_id(kvm);
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
@@ -688,8 +693,7 @@ static void sort_memslots(struct kvm_memslots *slots)
}
static void update_memslots(struct kvm_memslots *slots,
- struct kvm_memory_slot *new,
- u64 last_generation)
+ struct kvm_memory_slot *new)
{
if (new) {
int id = new->id;
@@ -700,8 +704,6 @@ static void update_memslots(struct kvm_memslots *slots,
if (new->npages != npages)
sort_memslots(slots);
}
-
- slots->generation = last_generation + 1;
}
static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
@@ -723,10 +725,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
{
struct kvm_memslots *old_memslots = kvm->memslots;
- update_memslots(slots, new, kvm->memslots->generation);
+ /*
+ * Set the low bit in the generation, which disables SPTE caching
+ * until the end of synchronize_srcu_expedited.
+ */
+ WARN_ON(old_memslots->generation & 1);
+ slots->generation = old_memslots->generation + 1;
+
+ update_memslots(slots, new);
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
+ /*
+ * Increment the new memslot generation a second time. This prevents
+ * vm exits that race with memslot updates from caching a memslot
+ * generation that will (potentially) be valid forever.
+ */
+ slots->generation++;
+
kvm_arch_memslots_updated(kvm);
return old_memslots;
@@ -777,7 +793,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
- r = -EINVAL;
if (npages > KVM_MEM_MAX_NR_PAGES)
goto out;
@@ -791,7 +806,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
new.npages = npages;
new.flags = mem->flags;
- r = -EINVAL;
if (npages) {
if (!old.npages)
change = KVM_MR_CREATE;
@@ -847,7 +861,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
}
if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
- r = -ENOMEM;
slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
GFP_KERNEL);
if (!slots)
@@ -1769,8 +1782,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
bool eligible;
eligible = !vcpu->spin_loop.in_spin_loop ||
- (vcpu->spin_loop.in_spin_loop &&
- vcpu->spin_loop.dy_eligible);
+ vcpu->spin_loop.dy_eligible;
if (vcpu->spin_loop.in_spin_loop)
kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
@@ -2612,7 +2624,6 @@ static long kvm_dev_ioctl(struct file *filp,
switch (ioctl) {
case KVM_GET_API_VERSION:
- r = -EINVAL;
if (arg)
goto out;
r = KVM_API_VERSION;
@@ -2624,7 +2635,6 @@ static long kvm_dev_ioctl(struct file *filp,
r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
break;
case KVM_GET_VCPU_MMAP_SIZE:
- r = -EINVAL;
if (arg)
goto out;
r = PAGE_SIZE; /* struct kvm_run */