author     Linus Torvalds <torvalds@linux-foundation.org>   2013-02-24 13:07:18 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-02-24 13:07:18 -0800
commit     89f883372fa60f604d136924baf3e89ff1870e9e (patch)
tree       cb69b0a14957945ba00d3d392bf9ccbbef56f3b8 /arch
parent     9e2d59ad580d590134285f361a0e80f0e98c0207 (diff)
parent     6b73a96065e89dc9fa75ba4f78b1aa3a3bbd0470 (diff)
Merge tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti:
 "KVM updates for the 3.9 merge window, including x86 real mode emulation
  fixes, stronger memory slot interface restrictions, mmu_lock spinlock
  hold time reduction, improved handling of large page faults on shadow,
  initial APICv HW acceleration support, s390 channel IO based virtio,
  amongst others"

* tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
  Revert "KVM: MMU: lazily drop large spte"
  x86: pvclock kvm: align allocation size to page size
  KVM: nVMX: Remove redundant get_vmcs12 from nested_vmx_exit_handled_msr
  x86 emulator: fix parity calculation for AAD instruction
  KVM: PPC: BookE: Handle alignment interrupts
  booke: Added DBCR4 SPR number
  KVM: PPC: booke: Allow multiple exception types
  KVM: PPC: booke: use vcpu reference from thread_struct
  KVM: Remove user_alloc from struct kvm_memory_slot
  KVM: VMX: disable apicv by default
  KVM: s390: Fix handling of iscs.
  KVM: MMU: cleanup __direct_map
  KVM: MMU: remove pt_access in mmu_set_spte
  KVM: MMU: cleanup mapping-level
  KVM: MMU: lazily drop large spte
  KVM: VMX: cleanup vmx_set_cr0().
  KVM: VMX: add missing exit names to VMX_EXIT_REASONS array
  KVM: VMX: disable SMEP feature when guest is in non-paging mode
  KVM: Remove duplicate text in api.txt
  Revert "KVM: MMU: split kvm_mmu_free_page"
  ...
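One user-visible piece of this pull is the new KVM_REG_PPC_EPR ONE_REG id (defined in the arch/powerpc/include/uapi/asm/kvm.h hunk below, with get/set handlers added in booke.c). As a minimal illustrative sketch only, not part of this commit, userspace could set the guest EPR through the generic KVM_SET_ONE_REG ioctl roughly as follows; vcpu_fd is an assumed, already-open vCPU file descriptor, and kernel headers new enough to define KVM_REG_PPC_EPR are assumed:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>   /* struct kvm_one_reg, KVM_SET_ONE_REG, KVM_REG_PPC_EPR */

    /* Sketch: write the guest external-proxy (EPR) register via ONE_REG.
     * vcpu_fd is assumed to come from a prior KVM_CREATE_VCPU ioctl. */
    static int set_guest_epr(int vcpu_fd, uint32_t epr)
    {
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_PPC_EPR,   /* 32-bit register id added by this merge */
                    .addr = (uintptr_t)&epr,   /* userspace buffer holding the value */
            };

            return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
    }

Reading the register back is symmetric, using KVM_GET_ONE_REG with the same id.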
Diffstat (limited to 'arch')
-rw-r--r--  arch/ia64/include/asm/kvm_host.h | 4
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 8
-rw-r--r--  arch/ia64/kvm/lapic.h | 6
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 8
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 12
-rw-r--r--  arch/powerpc/include/asm/reg.h | 2
-rw-r--r--  arch/powerpc/include/asm/reg_booke.h | 1
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 6
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 2
-rw-r--r--  arch/powerpc/kvm/Makefile | 9
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 30
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 5
-rw-r--r--  arch/powerpc/kvm/booke.c | 70
-rw-r--r--  arch/powerpc/kvm/booke.h | 1
-rw-r--r--  arch/powerpc/kvm/booke_emulate.c | 3
-rw-r--r--  arch/powerpc/kvm/booke_interrupts.S | 49
-rw-r--r--  arch/powerpc/kvm/e500.c | 16
-rw-r--r--  arch/powerpc/kvm/e500.h | 1
-rw-r--r--  arch/powerpc/kvm/e500_mmu.c (renamed from arch/powerpc/kvm/e500_tlb.c) | 659
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.c | 699
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.h | 18
-rw-r--r--  arch/powerpc/kvm/emulate.c | 5
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 17
-rw-r--r--  arch/s390/include/asm/irq.h | 1
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 15
-rw-r--r--  arch/s390/kernel/irq.c | 1
-rw-r--r--  arch/s390/kvm/intercept.c | 45
-rw-r--r--  arch/s390/kvm/interrupt.c | 264
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 50
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 46
-rw-r--r--  arch/s390/kvm/priv.c | 316
-rw-r--r--  arch/s390/kvm/sigp.c | 10
-rw-r--r--  arch/s390/kvm/trace-s390.h | 26
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 26
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 18
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 9
-rw-r--r--  arch/x86/kernel/kvmclock.c | 11
-rw-r--r--  arch/x86/kvm/emulate.c | 673
-rw-r--r--  arch/x86/kvm/i8254.c | 1
-rw-r--r--  arch/x86/kvm/i8259.c | 2
-rw-r--r--  arch/x86/kvm/irq.c | 74
-rw-r--r--  arch/x86/kvm/lapic.c | 140
-rw-r--r--  arch/x86/kvm/lapic.h | 34
-rw-r--r--  arch/x86/kvm/mmu.c | 168
-rw-r--r--  arch/x86/kvm/mmutrace.h | 6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 106
-rw-r--r--  arch/x86/kvm/svm.c | 24
-rw-r--r--  arch/x86/kvm/vmx.c | 714
-rw-r--r--  arch/x86/kvm/x86.c | 168
51 files changed, 3044 insertions, 1539 deletions
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 6d6a5ac48d8..cfa74983c67 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -23,9 +23,7 @@
#ifndef __ASM_KVM_HOST_H
#define __ASM_KVM_HOST_H
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_USER_MEM_SLOTS 32
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd1c5155503..ad3126a5864 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -955,7 +955,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
kvm_mem.guest_phys_addr;
kvm_userspace_mem.memory_size = kvm_mem.memory_size;
r = kvm_vm_ioctl_set_memory_region(kvm,
- &kvm_userspace_mem, 0);
+ &kvm_userspace_mem, false);
if (r)
goto out;
break;
@@ -1580,7 +1580,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
struct kvm_userspace_memory_region *mem,
- int user_alloc)
+ bool user_alloc)
{
unsigned long i;
unsigned long pfn;
@@ -1611,7 +1611,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot old,
- int user_alloc)
+ bool user_alloc)
{
return;
}
@@ -1834,7 +1834,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
mutex_lock(&kvm->slots_lock);
r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
+ if (log->slot >= KVM_USER_MEM_SLOTS)
goto out;
memslot = id_to_memslot(kvm->memslots, log->slot);
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c5f92a926a9..c3e2935b6db 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,4 +27,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
#define kvm_apic_present(x) (true)
#define kvm_lapic_enabled(x) (true)
+static inline bool kvm_apic_vid_enabled(void)
+{
+ /* IA64 has no apicv supporting, do nothing here */
+ return false;
+}
+
#endif
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 03d7beae89a..d1bb8607472 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -37,10 +37,8 @@
#define KVM_MAX_VCPUS NR_CPUS
#define KVM_MAX_VCORES NR_CPUS
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+#define KVM_USER_MEM_SLOTS 32
+#define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS
#ifdef CONFIG_KVM_MMIO
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -523,6 +521,8 @@ struct kvm_vcpu_arch {
u8 sane;
u8 cpu_type;
u8 hcall_needed;
+ u8 epr_enabled;
+ u8 epr_needed;
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 572aa753061..44a657adf41 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -44,12 +44,11 @@ enum emulation_result {
EMULATE_DO_DCR, /* kvm_run filled with DCR request */
EMULATE_FAIL, /* can't emulate this instruction */
EMULATE_AGAIN, /* something went wrong. go again */
+ EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */
};
extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-extern char kvmppc_handlers_start[];
-extern unsigned long kvmppc_handler_len;
extern void kvmppc_handler_highmem(void);
extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
@@ -263,6 +262,15 @@ static inline void kvm_linear_init(void)
{}
#endif
+static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+ mtspr(SPRN_GEPR, epr);
+#elif defined(CONFIG_BOOKE)
+ vcpu->arch.epr = epr;
+#endif
+}
+
int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
struct kvm_config_tlb *cfg);
int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 7035e608f3f..e6658612203 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -956,8 +956,6 @@
#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG9
#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG9
#endif
-#define SPRN_SPRG_RVCPU SPRN_SPRG1
-#define SPRN_SPRG_WVCPU SPRN_SPRG1
#endif
#ifdef CONFIG_8xx
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index e07e6af5e1f..b417de3cc2c 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -56,6 +56,7 @@
#define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */
#define SPRN_EPCR 0x133 /* Embedded Processor Control Register */
#define SPRN_DBCR2 0x136 /* Debug Control Register 2 */
+#define SPRN_DBCR4 0x233 /* Debug Control Register 4 */
#define SPRN_MSRP 0x137 /* MSR Protect Register */
#define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */
#define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 2fba8a66fb1..16064d00adb 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -114,7 +114,10 @@ struct kvm_regs {
/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */
#define KVM_SREGS_E_SPE (1 << 9)
-/* External Proxy (EXP) -- EPR */
+/*
+ * DEPRECATED! USE ONE_REG FOR THIS ONE!
+ * External Proxy (EXP) -- EPR
+ */
#define KVM_SREGS_EXP (1 << 10)
/* External PID (E.PD) -- EPSC/EPLC */
@@ -412,5 +415,6 @@ struct kvm_get_htab_header {
#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
+#define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 78119036729..b6c17ec9b16 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -118,7 +118,7 @@ int main(void)
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
#endif
-#ifdef CONFIG_KVM_BOOKE_HV
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
#endif
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1e473d46322..b772eded8c2 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -10,7 +10,8 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
eventfd.o)
CFLAGS_44x_tlb.o := -I.
-CFLAGS_e500_tlb.o := -I.
+CFLAGS_e500_mmu.o := -I.
+CFLAGS_e500_mmu_host.o := -I.
CFLAGS_emulate.o := -I.
common-objs-y += powerpc.o emulate.o
@@ -35,7 +36,8 @@ kvm-e500-objs := \
booke_emulate.o \
booke_interrupts.o \
e500.o \
- e500_tlb.o \
+ e500_mmu.o \
+ e500_mmu_host.o \
e500_emulate.o
kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
@@ -45,7 +47,8 @@ kvm-e500mc-objs := \
booke_emulate.o \
bookehv_interrupts.o \
e500mc.o \
- e500_tlb.o \
+ e500_mmu.o \
+ e500_mmu_host.o \
e500_emulate.o
kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index d31a716f7f2..836c56975e2 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -34,6 +34,8 @@
#define OP_31_XOP_MTSRIN 242
#define OP_31_XOP_TLBIEL 274
#define OP_31_XOP_TLBIE 306
+/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
+#define OP_31_XOP_FAKE_SC1 308
#define OP_31_XOP_SLBMTE 402
#define OP_31_XOP_SLBIE 434
#define OP_31_XOP_SLBIA 498
@@ -170,6 +172,32 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
vcpu->arch.mmu.tlbie(vcpu, addr, large);
break;
}
+#ifdef CONFIG_KVM_BOOK3S_64_PR
+ case OP_31_XOP_FAKE_SC1:
+ {
+ /* SC 1 papr hypercalls */
+ ulong cmd = kvmppc_get_gpr(vcpu, 3);
+ int i;
+
+ if ((vcpu->arch.shared->msr & MSR_PR) ||
+ !vcpu->arch.papr_enabled) {
+ emulated = EMULATE_FAIL;
+ break;
+ }
+
+ if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE)
+ break;
+
+ run->papr_hcall.nr = cmd;
+ for (i = 0; i < 9; ++i) {
+ ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+ run->papr_hcall.args[i] = gpr;
+ }
+
+ emulated = EMULATE_DO_PAPR;
+ break;
+ }
+#endif
case OP_31_XOP_EIOIO:
break;
case OP_31_XOP_SLBMTE:
@@ -427,6 +455,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
case SPRN_PMC3_GEKKO:
case SPRN_PMC4_GEKKO:
case SPRN_WPAR_GEKKO:
+ case SPRN_MSSSR0:
break;
unprivileged:
default:
@@ -523,6 +552,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
case SPRN_PMC3_GEKKO:
case SPRN_PMC4_GEKKO:
case SPRN_WPAR_GEKKO:
+ case SPRN_MSSSR0:
*spr_val = 0;
break;
default:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 71d0c90b62b..80dcc53a1ab 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1549,7 +1549,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
mutex_lock(&kvm->slots_lock);
r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
+ if (log->slot >= KVM_USER_MEM_SLOTS)
goto out;
memslot = id_to_memslot(kvm->memslots, log->slot);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 6702442ca81..5e93438afb0 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -762,6 +762,11 @@ program_interrupt:
run->exit_reason = KVM_EXIT_MMIO;
r = RESUME_HOST_NV;
break;
+ case EMULATE_DO_PAPR:
+ run->exit_reason = KVM_EXIT_PAPR_HCALL;
+ vcpu->arch.hcall_needed = 1;
+ r = RESUME_HOST_NV;
+ break;
default:
BUG();
}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 69f11401578..020923e4313 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -182,6 +182,14 @@ static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu,
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
}
+static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags,
+ ulong esr_flags)
+{
+ vcpu->arch.queued_dear = dear_flags;
+ vcpu->arch.queued_esr = esr_flags;
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT);
+}
+
void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
{
vcpu->arch.queued_esr = esr_flags;
@@ -300,13 +308,22 @@ static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr)
#endif
}
+static unsigned long get_guest_epr(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+ return mfspr(SPRN_GEPR);
+#else
+ return vcpu->arch.epr;
+#endif
+}
+
/* Deliver the interrupt of the corresponding priority, if possible. */
static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
unsigned int priority)
{
int allowed = 0;
ulong msr_mask = 0;
- bool update_esr = false, update_dear = false;
+ bool update_esr = false, update_dear = false, update_epr = false;
ulong crit_raw = vcpu->arch.shared->critical;
ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
bool crit;
@@ -330,9 +347,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
keep_irq = true;
}
+ if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled)
+ update_epr = true;
+
switch (priority) {
case BOOKE_IRQPRIO_DTLB_MISS:
case BOOKE_IRQPRIO_DATA_STORAGE:
+ case BOOKE_IRQPRIO_ALIGNMENT:
update_dear = true;
/* fall through */
case BOOKE_IRQPRIO_INST_STORAGE:
@@ -346,7 +367,6 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
case BOOKE_IRQPRIO_SPE_FP_DATA:
case BOOKE_IRQPRIO_SPE_FP_ROUND:
case BOOKE_IRQPRIO_AP_UNAVAIL:
- case BOOKE_IRQPRIO_ALIGNMENT:
allowed = 1;
msr_mask = MSR_CE | MSR_ME | MSR_DE;
int_class = INT_CLASS_NONCRIT;
@@ -408,6 +428,8 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
set_guest_esr(vcpu, vcpu->arch.queued_esr);
if (update_dear == true)
set_guest_dear(vcpu, vcpu->arch.queued_dear);
+ if (update_epr == true)
+ kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
new_msr &= msr_mask;
#if defined(CONFIG_64BIT)
@@ -581,6 +603,11 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
kvmppc_core_check_exceptions(vcpu);
+ if (vcpu->requests) {
+ /* Exception delivery raised request; start over */
+ return 1;
+ }
+
if (vcpu->arch.shared->msr & MSR_WE) {
local_irq_enable();
kvm_vcpu_block(vcpu);
@@ -610,6 +637,13 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
r = 0;
}
+ if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) {
+ vcpu->run->epr.epr = 0;
+ vcpu->arch.epr_needed = true;
+ vcpu->run->exit_reason = KVM_EXIT_EPR;
+ r = 0;
+ }
+
return r;
}
@@ -945,6 +979,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
+ case BOOKE_INTERRUPT_ALIGNMENT:
+ kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear,
+ vcpu->arch.fault_esr);
+ r = RESUME_GUEST;
+ break;
+
#ifdef CONFIG_KVM_BOOKE_HV
case BOOKE_INTERRUPT_HV_SYSCALL:
if (!(vcpu->arch.shared->msr & MSR_PR)) {
@@ -1388,6 +1428,11 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
&vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
break;
}
+ case KVM_REG_PPC_EPR: {
+ u32 epr = get_guest_epr(vcpu);
+ r = put_user(epr, (u32 __user *)(long)reg->addr);
+ break;
+ }
#if defined(CONFIG_64BIT)
case KVM_REG_PPC_EPCR:
r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
@@ -1420,6 +1465,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
(u64 __user *)(long)reg->addr, sizeof(u64));
break;
}
+ case KVM_REG_PPC_EPR: {
+ u32 new_epr;
+ r = get_user(new_epr, (u32 __user *)(long)reg->addr);
+ if (!r)
+ kvmppc_set_epr(vcpu, new_epr);
+ break;
+ }
#if defined(CONFIG_64BIT)
case KVM_REG_PPC_EPCR: {
u32 new_epcr;
@@ -1556,7 +1608,9 @@ int __init kvmppc_booke_init(void)
{
#ifndef CONFIG_KVM_BOOKE_HV
unsigned long ivor[16];
+ unsigned long *handler = kvmppc_booke_handler_addr;
unsigned long max_ivor = 0;
+ unsigned long handler_len;
int i;
/* We install our own exception handlers by hijacking IVPR. IVPR must
@@ -1589,14 +1643,16 @@ int __init kvmppc_booke_init(void)
for (i = 0; i < 16; i++) {
if (ivor[i] > max_ivor)
- max_ivor = ivor[i];
+ max_ivor = i;
+ handler_len = handler[i + 1] - handler[i];
memcpy((void *)kvmppc_booke_handlers + ivor[i],
- kvmppc_handlers_start + i * kvmppc_handler_len,
- kvmppc_handler_len);
+ (void *)handler[i], handler_len);
}
- flush_icache_range(kvmppc_booke_handlers,
- kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
+
+ handler_len = handler[max_ivor + 1] - handler[max_ivor];
+ flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
+ ivor[max_ivor] + handler_len);
#endif /* !BOOKE_HV */
return 0;
}
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index e9b88e433f6..5fd1ba69357 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -65,6 +65,7 @@
(1 << BOOKE_IRQPRIO_CRITICAL))
extern unsigned long kvmppc_booke_handlers;
+extern unsigned long kvmppc_booke_handler_addr[];
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 4685b8cf224..27a4b2877c1 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
case SPRN_ESR:
*spr_val = vcpu->arch.shared->esr;
break;
+ case SPRN_EPR:
+ *spr_val = vcpu->arch.epr;
+ break;
case SPRN_CSRR0:
*spr_val = vcpu->arch.csrr0;
break;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index bb46b32f981..f4bb55c9651 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -45,18 +45,21 @@
(1<<BOOKE_INTERRUPT_DEBUG))
#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
- (1<<BOOKE_INTERRUPT_DTLB_MISS))
+ (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+ (1<<BOOKE_INTERRUPT_ALIGNMENT))
#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
(1<<BOOKE_INTERRUPT_INST_STORAGE) | \
(1<<BOOKE_INTERRUPT_PROGRAM) | \
- (1<<BOOKE_INTERRUPT_DTLB_MISS))
+ (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+ (1<<BOOKE_INTERRUPT_ALIGNMENT))
.macro KVM_HANDLER ivor_nr scratch srr0
_GLOBAL(kvmppc_handler_\ivor_nr)
/* Get pointer to vcpu and record exit number. */
mtspr \scratch , r4
- mfspr r4, SPRN_SPRG_RVCPU
+ mfspr r4, SPRN_SPRG_THREAD
+ lwz r4, THREAD_KVM_VCPU(r4)
stw r3, VCPU_GPR(R3)(r4)
stw r5, VCPU_GPR(R5)(r4)
stw r6, VCPU_GPR(R6)(r4)
@@ -73,6 +76,14 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
bctr
.endm
+.macro KVM_HANDLER_ADDR ivor_nr
+ .long kvmppc_handler_\ivor_nr
+.endm
+
+.macro KVM_HANDLER_END
+ .long kvmppc_handlers_end
+.endm
+
_GLOBAL(kvmppc_handlers_start)
KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
@@ -93,9 +104,7 @@ KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
-
-_GLOBAL(kvmppc_handler_len)
- .long kvmppc_handler_1 - kvmppc_handler_0
+_GLOBAL(kvmppc_handlers_end)
/* Registers:
* SPRG_SCRATCH0: guest r4
@@ -402,9 +411,6 @@ lightweight_exit:
lwz r8, kvmppc_booke_handlers@l(r8)
mtspr SPRN_IVPR, r8
- /* Save vcpu pointer for the exception handlers. */
- mtspr SPRN_SPRG_WVCPU, r4
-
lwz r5, VCPU_SHARED(r4)
/* Can't switch the stack pointer until after IVPR is switched,
@@ -463,6 +469,31 @@ lightweight_exit:
lwz r4, VCPU_GPR(R4)(r4)
rfi
+ .data
+ .align 4
+ .globl kvmppc_booke_handler_addr
+kvmppc_booke_handler_addr:
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND
+KVM_HANDLER_END /*Always keep this in end*/
+
#ifdef CONFIG_SPE
_GLOBAL(kvmppc_save_guest_spe)
cmpi 0,r3,0
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index b479ed77c51..6dd4de7802b 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -491,6 +491,9 @@ static int __init kvmppc_e500_init(void)
{
int r, i;
unsigned long ivor[3];
+ /* Process remaining handlers above the generic first 16 */
+ unsigned long *handler = &kvmppc_booke_handler_addr[16];
+ unsigned long handler_len;
unsigned long max_ivor = 0;
r = kvmppc_core_check_processor_compat();
@@ -506,15 +509,16 @@ static int __init kvmppc_e500_init(void)
ivor[1] = mfspr(SPRN_IVOR33);
ivor[2] = mfspr(SPRN_IVOR34);
for (i = 0; i < 3; i++) {
- if (ivor[i] > max_ivor)
- max_ivor = ivor[i];
+ if (ivor[i] > ivor[max_ivor])
+ max_ivor = i;
+ handler_len = handler[i + 1] - handler[i];
memcpy((void *)kvmppc_booke_handlers + ivor[i],
- kvmppc_handlers_start + (i + 16) * kvmppc_handler_len,
- kvmppc_handler_len);
+ (void *)handler[i], handler_len);
}
- flush_icache_range(kvmppc_booke_handlers,
- kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
+ handler_len = handler[max_ivor + 1] - handler[max_ivor];
+ flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
+ ivor[max_ivor] + handler_len);
return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
}
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index c70d37ed770..41cefd43655 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -28,6 +28,7 @@
#define E500_TLB_VALID 1
#define E500_TLB_BITMAP 2
+#define E500_TLB_TLB0 (1 << 2)
struct tlbe_ref {
pfn_t pfn;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_mmu.c
index cf3f1801237..5c4475983f7 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -1,10 +1,11 @@
/*
- * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
*
* Author: Yu Liu, yu.liu@freescale.com
* Scott Wood, scottwood@freescale.com
* Ashish Kalra, ashish.kalra@freescale.com
* Varun Sethi, varun.sethi@freescale.com
+ * Alexander Graf, agraf@suse.de
*
* Description:
* This file is based on arch/powerpc/kvm/44x_tlb.c,
@@ -33,10 +34,7 @@
#include "e500.h"
#include "trace.h"
#include "timing.h"
-
-#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
-
-static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
+#include "e500_mmu_host.h"
static inline unsigned int gtlb0_get_next_victim(
struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -50,174 +48,6 @@ static inline unsigned int gtlb0_get_next_victim(
return victim;
}
-static inline unsigned int tlb1_max_shadow_size(void)
-{
- /* reserve one entry for magic page */
- return host_tlb_params[1].entries - tlbcam_index - 1;
-}
-
-static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
-{
- return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
-}
-
-static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
-{
- /* Mask off reserved bits. */
- mas3 &= MAS3_ATTRIB_MASK;
-
-#ifndef CONFIG_KVM_BOOKE_HV
- if (!usermode) {
- /* Guest is in supervisor mode,
- * so we need to translate guest
- * supervisor permissions into user permissions. */
- mas3 &= ~E500_TLB_USER_PERM_MASK;
- mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
- }
- mas3 |= E500_TLB_SUPER_PERM_MASK;
-#endif
- return mas3;
-}
-
-static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
-{
-#ifdef CONFIG_SMP
- return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
-#else
- return mas2 & MAS2_ATTRIB_MASK;
-#endif
-}
-
-/*
- * writing shadow tlb entry to host TLB
- */
-static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
- uint32_t mas0)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- mtspr(SPRN_MAS0, mas0);
- mtspr(SPRN_MAS1, stlbe->mas1);
- mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
- mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
- mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
-#ifdef CONFIG_KVM_BOOKE_HV
- mtspr(SPRN_MAS8, stlbe->mas8);
-#endif
- asm volatile("isync; tlbwe" : : : "memory");
-
-#ifdef CONFIG_KVM_BOOKE_HV
- /* Must clear mas8 for other host tlbwe's */
- mtspr(SPRN_MAS8, 0);
- isync();
-#endif
- local_irq_restore(flags);
-
- trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
- stlbe->mas2, stlbe->mas7_3);
-}
-
-/*
- * Acquire a mas0 with victim hint, as if we just took a TLB miss.
- *
- * We don't care about the address we're searching for, other than that it's
- * in the right set and is not present in the TLB. Using a zero PID and a
- * userspace address means we don't have to set and then restore MAS5, or
- * calculate a proper MAS6 value.
- */
-static u32 get_host_mas0(unsigned long eaddr)
-{
- unsigned long flags;
- u32 mas0;
-
- local_irq_save(flags);
- mtspr(SPRN_MAS6, 0);
- asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
- mas0 = mfspr(SPRN_MAS0);
- local_irq_restore(flags);
-
- return mas0;
-}
-
-/* sesel is for tlb1 only */
-static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
- int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
-{
- u32 mas0;
-
- if (tlbsel == 0) {
- mas0 = get_host_mas0(stlbe->mas2);
- __write_host_tlbe(stlbe, mas0);
- } else {
- __write_host_tlbe(stlbe,
- MAS0_TLBSEL(1) |
- MAS0_ESEL(to_htlb1_esel(sesel)));
- }
-}
-
-#ifdef CONFIG_KVM_E500V2
-void kvmppc_map_magic(struct kvm_vcpu *vcpu)
-{
- struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
- struct kvm_book3e_206_tlb_entry magic;
- ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
- unsigned int stid;
- pfn_t pfn;
-
- pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
- get_page(pfn_to_page(pfn));
-
- preempt_disable();
- stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
-
- magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
- MAS1_TSIZE(BOOK3E_PAGESZ_4K);
- magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
- magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
- MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
- magic.mas8 = 0;
-
- __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
- preempt_enable();
-}
-#endif
-
-static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
- int tlbsel, int esel)
-{
- struct kvm_book3e_206_tlb_entry *gtlbe =
- get_entry(vcpu_e500, tlbsel, esel);
-
- if (tlbsel == 1 &&
- vcpu_e500->gtlb_priv[1][esel].ref.flags & E500_TLB_BITMAP) {
- u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
- int hw_tlb_indx;
- unsigned long flags;
-
- local_irq_save(flags);
- while (tmp) {
- hw_tlb_indx = __ilog2_u64(tmp & -tmp);
- mtspr(SPRN_MAS0,
- MAS0_TLBSEL(1) |
- MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
- mtspr(SPRN_MAS1, 0);
- asm volatile("tlbwe");
- vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
- tmp &= tmp - 1;
- }
- mb();
- vcpu_e500->g2h_tlb1_map[esel] = 0;
- vcpu_e500->gtlb_priv[1][esel].ref.flags &= ~E500_TLB_BITMAP;
- local_irq_restore(flags);
-
- return;
- }
-
- /* Guest tlbe is backed by at most one host tlbe per shadow pid. */
- kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
-}
-
static int tlb0_set_base(gva_t addr, int sets, int ways)
{
int set_base;
@@ -296,70 +126,6 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
return -1;
}
-static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
- struct kvm_book3e_206_tlb_entry *gtlbe,
- pfn_t pfn)
-{
- ref->pfn = pfn;
- ref->flags = E500_TLB_VALID;
-
- if (tlbe_is_writable(gtlbe))
- kvm_set_pfn_dirty(pfn);
-}
-
-static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
-{
- if (ref->flags & E500_TLB_VALID) {
- trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
- ref->flags = 0;
- }
-}
-
-static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
- if (vcpu_e500->g2h_tlb1_map)
- memset(vcpu_e500->g2h_tlb1_map, 0,
- sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
- if (vcpu_e500->h2g_tlb1_rmap)
- memset(vcpu_e500->h2g_tlb1_rmap, 0,
- sizeof(unsigned int) * host_tlb_params[1].entries);
-}
-
-static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
- int tlbsel = 0;
- int i;
-
- for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
- struct tlbe_ref *ref =
- &vcpu_e500->gtlb_priv[tlbsel][i].ref;
- kvmppc_e500_ref_release(ref);
- }
-}
-
-static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
- int stlbsel = 1;
- int i;
-
- kvmppc_e500_tlbil_all(vcpu_e500);
-
- for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
- struct tlbe_ref *ref =
- &vcpu_e500->tlb_refs[stlbsel][i];
- kvmppc_e500_ref_release(ref);
- }
-
- clear_tlb_privs(vcpu_e500);
-}
-
-void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
-{
- struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
- clear_tlb_refs(vcpu_e500);
- clear_tlb1_bitmap(vcpu_e500);
-}
-
static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
unsigned int eaddr, int as)
{
@@ -385,216 +151,6 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
| (as ? MAS6_SAS : 0);
}
-/* TID must be supplied by the caller */
-static inline void kvmppc_e500_setup_stlbe(
- struct kvm_vcpu *vcpu,
- struct kvm_book3e_206_tlb_entry *gtlbe,
- int tsize, struct tlbe_ref *ref, u64 gvaddr,
- struct kvm_book3e_206_tlb_entry *stlbe)
-{
- pfn_t pfn = ref->pfn;
- u32 pr = vcpu->arch.shared->msr & MSR_PR;
-
- BUG_ON(!(ref->flags & E500_TLB_VALID));
-
- /* Force IPROT=0 for all guest mappings. */
- stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
- stlbe->mas2 = (gvaddr & MAS2_EPN) |
- e500_shadow_mas2_attrib(gtlbe->mas2, pr);
- stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
- e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
-
-#ifdef CONFIG_KVM_BOOKE_HV
- stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
-#endif
-}
-
-static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
- u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
- int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
- struct tlbe_ref *ref)
-{
- struct kvm_memory_slot *slot;
- unsigned long pfn = 0; /* silence GCC warning */
- unsigned long hva;
- int pfnmap = 0;
- int tsize = BOOK3E_PAGESZ_4K;
-
- /*
- * Translate guest physical to true physical, acquiring
- * a page reference if it is normal, non-reserved memory.
- *
- * gfn_to_memslot() must succeed because otherwise we wouldn't
- * have gotten this far. Eventually we should just pass the slot
- * pointer through from the first lookup.
- */
- slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
- hva = gfn_to_hva_memslot(slot, gfn);
-
- if (tlbsel == 1) {
- struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
-
- vma = find_vma(current->mm, hva);
- if (vma && hva >= vma->vm_start &&
- (vma->vm_flags & VM_PFNMAP)) {
- /*
- * This VMA is a physically contiguous region (e.g.
- * /dev/mem) that bypasses normal Linux page
- * management. Find the overlap between the
- * vma and the memslot.
- */
-
- unsigned long start, end;
- unsigned long slot_start, slot_end;
-
- pfnmap = 1;
-
- start = vma->vm_pgoff;
- end = start +
- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
-
- pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
-
- slot_start = pfn - (gfn - slot->base_gfn);
- slot_end = slot_start + slot->npages;
-
- if (start < slot_start)
- start = slot_start;
- if (end > slot_end)
- end = slot_end;
-
- tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
- MAS1_TSIZE_SHIFT;
-
- /*
- * e500 doesn't implement the lowest tsize bit,
- * or 1K pages.
- */
- tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
-
- /*
- * Now find the largest tsize (up to what the guest
- * requested) that will cover gfn, stay within the
- * range, and for which gfn and pfn are mutually
- * aligned.
- */
-
- for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
- unsigned long gfn_start, gfn_end, tsize_pages;
- tsize_pages = 1 << (tsize - 2);
-
- gfn_start = gfn & ~(tsize_pages - 1);
- gfn_end = gfn_start + tsize_pages;
-
- if (gfn_start + pfn - gfn < start)
- continue;
- if (gfn_end + pfn - gfn > end)
- continue;
- if ((gfn & (tsize_pages - 1)) !=
- (pfn & (tsize_pages - 1)))
- continue;
-
- gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
- pfn &= ~(tsize_pages - 1);
- break;
- }
- } else if (vma && hva >= vma->vm_start &&
- (vma->vm_flags & VM_HUGETLB)) {
- unsigned long psize = vma_kernel_pagesize(vma);
-
- tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
- MAS1_TSIZE_SHIFT;
-
- /*
- * Take the largest page size that satisfies both host
- * and guest mapping
- */
- tsize = min(__ilog2(psize) - 10, tsize);
-
- /*
- * e500 doesn't implement the lowest tsize bit,
- * or 1K pages.
- */
- tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
- }
-
- up_read(&current->mm->mmap_sem);
- }
-
- if (likely(!pfnmap)) {
- unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
- pfn = gfn_to_pfn_memslot(slot, gfn);
- if (is_error_noslot_pfn(pfn)) {
- printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
- (long)gfn);
- return;
- }
-
- /* Align guest and physical address to page map boundaries */
- pfn &= ~(tsize_pages - 1);
- gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
- }
-
- /* Drop old ref and setup new one. */
- kvmppc_e500_ref_release(ref);
- kvmppc_e500_ref_setup(ref, gtlbe, pfn);
-
- kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
- ref, gvaddr, stlbe);
-
- /* Clear i-cache for new pages */
- kvmppc_mmu_flush_icache(pfn);
-
- /* Drop refcount on page, so that mmu notifiers can clear it */
- kvm_release_pfn_clean(pfn);
-}
-
-/* XXX only map the one-one case, for now use TLB0 */
-static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
- int esel,
- struct kvm_book3e_206_tlb_entry *stlbe)
-{
- struct kvm_book3e_206_tlb_entry *gtlbe;
- struct tlbe_ref *ref;
-
- gtlbe = get_entry(vcpu_e500, 0, esel);
- ref = &vcpu_e500->gtlb_priv[0][esel].ref;
-
- kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
- get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
- gtlbe, 0, stlbe, ref);
-}
-
-/* Caller must ensure that the specified guest TLB entry is safe to insert into
- * the shadow TLB. */
-/* XXX for both one-one and one-to-many , for now use TLB1 */
-static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
- u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
- struct kvm_book3e_206_tlb_entry *stlbe, int esel)
-{
- struct tlbe_ref *ref;
- unsigned int victim;
-
- victim = vcpu_e500->host_tlb1_nv++;
-
- if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
- vcpu_e500->host_tlb1_nv = 0;
-
- ref = &vcpu_e500->tlb_refs[1][victim];
- kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref);
-
- vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << victim;
- vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
- if (vcpu_e500->h2g_tlb1_rmap[victim]) {
- unsigned int idx = vcpu_e500->h2g_tlb1_rmap[victim];
- vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << victim);
- }
- vcpu_e500->h2g_tlb1_rmap[victim] = esel;
-
- return victim;
-}
-
static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500)
{
int size = vcpu_e500->gtlb_params[1].entries;
@@ -683,8 +239,8 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++)
kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
- /* Invalidate all vcpu id mappings */
- kvmppc_e500_tlbil_all(vcpu_e500);
+ /* Invalidate all host shadow mappings */
+ kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
return EMULATE_DONE;
}
@@ -713,8 +269,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
}
- /* Invalidate all vcpu id mappings */
- kvmppc_e500_tlbil_all(vcpu_e500);
+ /* Invalidate all host shadow mappings */
+ kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
return EMULATE_DONE;
}
@@ -834,27 +390,11 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
return EMULATE_DONE;
}
-/* sesel is for tlb1 only */
-static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
- struct kvm_book3e_206_tlb_entry *gtlbe,
- struct kvm_book3e_206_tlb_entry *stlbe,
- int stlbsel, int sesel)
-{
- int stid;
-
- preempt_disable();
- stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
-
- stlbe->mas1 |= MAS1_TID(stid);
- write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
- preempt_enable();
-}
-
int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
{
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
- struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
- int tlbsel, esel, stlbsel, sesel;
+ struct kvm_book3e_206_tlb_entry *gtlbe;
+ int tlbsel, esel;
int recal = 0;
tlbsel = get_tlb_tlbsel(vcpu);
@@ -892,40 +432,16 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
if (tlbe_is_host_safe(vcpu, gtlbe)) {
- u64 eaddr;
- u64 raddr;
+ u64 eaddr = get_tlb_eaddr(gtlbe);
+ u64 raddr = get_tlb_raddr(gtlbe);
- switch (tlbsel) {
- case 0:
- /* TLB0 */
+ if (tlbsel == 0) {
gtlbe->mas1 &= ~MAS1_TSIZE(~0);
gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
-
- stlbsel = 0;
- kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
- sesel = 0; /* unused */
-
- break;
-
- case 1:
- /* TLB1 */
- eaddr = get_tlb_eaddr(gtlbe);
- raddr = get_tlb_raddr(gtlbe);
-
- /* Create a 4KB mapping on the host.
- * If the guest wanted a large page,
- * only the first 4KB is mapped here and the rest
- * are mapped on the fly. */
- stlbsel = 1;
- sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
- raddr >> PAGE_SHIFT, gtlbe, &stlbe, esel);
- break;
-
- default:
- BUG();
}
- write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
+ /* Premap the faulting page */
+ kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel));
}
kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -1019,100 +535,14 @@ void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
{
}
-void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
- unsigned int index)
-{
- struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
- struct tlbe_priv *priv;
- struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
- int tlbsel = tlbsel_of(index);
- int esel = esel_of(index);
- int stlbsel, sesel;
-
- gtlbe = get_entry(vcpu_e500, tlbsel, esel);
-
- switch (tlbsel) {
- case 0:
- stlbsel = 0;
- sesel = 0; /* unused */
- priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
-
- /* Only triggers after clear_tlb_refs */
- if (unlikely(!(priv->ref.flags & E500_TLB_VALID)))
- kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
- else
- kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
- &priv->ref, eaddr, &stlbe);
- break;
-
- case 1: {
- gfn_t gfn = gpaddr >> PAGE_SHIFT;
-
- stlbsel = 1;
- sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
- gtlbe, &stlbe, esel);
- break;
- }
-
- default:
- BUG();
- break;
- }
-
- write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
-}
-
-/************* MMU Notifiers *************/
-
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
-{
- trace_kvm_unmap_hva(hva);
-
- /*
- * Flush all shadow tlb entries everywhere. This is slow, but
- * we are 100% sure that we catch the to be unmapped page
- */
- kvm_flush_remote_tlbs(kvm);
-
- return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
-{
- /* kvm_unmap_hva flushes everything anyways */
- kvm_unmap_hva(kvm, start);
-
- return 0;
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
-{
- /* XXX could be more clever ;) */
- return 0;
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- /* XXX could be more clever ;) */
- return 0;
-}
-
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
- /* The page will get remapped properly on its next fault */
- kvm_unmap_hva(kvm, hva);
-}
-
/*****************************************/
static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
{
int i;
- clear_tlb1_bitmap(vcpu_e500);
+ kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
kfree(vcpu_e500->g2h_tlb1_map);
-
- clear_tlb_refs(vcpu_e500);
kfree(vcpu_e500->gtlb_priv[0]);
kfree(vcpu_e500->gtlb_priv[1]);
@@ -1303,7 +733,7 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
{
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
kvmppc_recalc_tlb1map_range(vcpu_e500);
- clear_tlb_refs(vcpu_e500);
+ kvmppc_core_flush_tlb(vcpu);
return 0;
}
@@ -1313,37 +743,8 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
- host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
- host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
-
- /*
- * This should never happen on real e500 hardware, but is
- * architecturally possible -- e.g. in some weird nested
- * virtualization case.
- */
- if (host_tlb_params[0].entries == 0 ||
- host_tlb_params[1].entries == 0) {
- pr_err("%s: need to know host tlb size\n", __func__);
- return -ENODEV;
- }
-
- host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
- TLBnCFG_ASSOC_SHIFT;
- host_tlb_params[1].ways = host_tlb_params[1].entries;
-
- if (!is_power_of_2(host_tlb_params[0].entries) ||
- !is_power_of_2(host_tlb_params[0].ways) ||
- host_tlb_params[0].entries < host_tlb_params[0].ways ||
- host_tlb_params[0].ways == 0) {
- pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
- __func__, host_tlb_params[0].entries,
- host_tlb_params[0].ways);
- return -ENODEV;
- }
-
- host_tlb_params[0].sets =
- host_tlb_params[0].entries / host_tlb_params[0].ways;
- host_tlb_params[1].sets = 1;
+ if (e500_mmu_host_init(vcpu_e500))
+ goto err;
vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -1362,18 +763,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
vcpu_e500->gtlb_offset[0] = 0;
vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
- vcpu_e500->tlb_refs[0] =
- kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
- GFP_KERNEL);
- if (!vcpu_e500->tlb_refs[0])
- goto err;
-
- vcpu_e500->tlb_refs[1] =
- kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
- GFP_KERNEL);
- if (!vcpu_e500->tlb_refs[1])
- goto err;
-
vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
vcpu_e500->gtlb_params[0].entries,
GFP_KERNEL);
@@ -1392,12 +781,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
if (!vcpu_e500->g2h_tlb1_map)
goto err;
- vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
- host_tlb_params[1].entries,
- GFP_KERNEL);
- if (!vcpu_e500->h2g_tlb1_rmap)
- goto err;
-
/* Init TLB configuration register */
vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
@@ -1416,15 +799,11 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
err:
free_gtlb(vcpu_e500);
- kfree(vcpu_e500->tlb_refs[0]);
- kfree(vcpu_e500->tlb_refs[1]);
return -1;
}
void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
{
free_gtlb(vcpu_e500);
- kfree(vcpu_e500->h2g_tlb1_rmap);
- kfree(vcpu_e500->tlb_refs[0]);
- kfree(vcpu_e500->tlb_refs[1]);
+ e500_mmu_host_uninit(vcpu_e500);
}
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
new file mode 100644
index 00000000000..a222edfb9a9
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -0,0 +1,699 @@
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ * Scott Wood, scottwood@freescale.com
+ * Ashish Kalra, ashish.kalra@freescale.com
+ * Varun Sethi, varun.sethi@freescale.com
+ * Alexander Graf, agraf@suse.de
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
+#include <asm/kvm_ppc.h>
+
+#include "e500.h"
+#include "trace.h"
+#include "timing.h"
+#include "e500_mmu_host.h"
+
+#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
+
+static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
+
+static inline unsigned int tlb1_max_shadow_size(void)
+{
+ /* reserve one entry for magic page */
+ return host_tlb_params[1].entries - tlbcam_index - 1;
+}
+
+static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
+{
+ /* Mask off reserved bits. */
+ mas3 &= MAS3_ATTRIB_MASK;
+
+#ifndef CONFIG_KVM_BOOKE_HV
+ if (!usermode) {
+ /* Guest is in supervisor mode,
+ * so we need to translate guest
+ * supervisor permissions into user permissions. */
+ mas3 &= ~E500_TLB_USER_PERM_MASK;
+ mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
+ }
+ mas3 |= E500_TLB_SUPER_PERM_MASK;
+#endif
+ return mas3;
+}
+
+static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
+{
+#ifdef CONFIG_SMP
+ return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
+#else
+ return mas2 & MAS2_ATTRIB_MASK;
+#endif
+}
+
+/*
+ * writing shadow tlb entry to host TLB
+ */
+static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
+ uint32_t mas0)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ mtspr(SPRN_MAS0, mas0);
+ mtspr(SPRN_MAS1, stlbe->mas1);
+ mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
+ mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
+ mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
+#ifdef CONFIG_KVM_BOOKE_HV
+ mtspr(SPRN_MAS8, stlbe->mas8);
+#endif
+ asm volatile("isync; tlbwe" : : : "memory");
+
+#ifdef CONFIG_KVM_BOOKE_HV
+ /* Must clear mas8 for other host tlbwe's */
+ mtspr(SPRN_MAS8, 0);
+ isync();
+#endif
+ local_irq_restore(flags);
+
+ trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
+ stlbe->mas2, stlbe->mas7_3);
+}
+
+/*
+ * Acquire a mas0 with victim hint, as if we just took a TLB miss.
+ *
+ * We don't care about the address we're searching for, other than that it's
+ * in the right set and is not present in the TLB. Using a zero PID and a
+ * userspace address means we don't have to set and then restore MAS5, or
+ * calculate a proper MAS6 value.
+ */
+static u32 get_host_mas0(unsigned long eaddr)
+{
+ unsigned long flags;
+ u32 mas0;
+
+ local_irq_save(flags);
+ mtspr(SPRN_MAS6, 0);
+ asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
+ mas0 = mfspr(SPRN_MAS0);
+ local_irq_restore(flags);
+
+ return mas0;
+}
+
+/* sesel is for tlb1 only */
+static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+ int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
+{
+ u32 mas0;
+
+ if (tlbsel == 0) {
+ mas0 = get_host_mas0(stlbe->mas2);
+ __write_host_tlbe(stlbe, mas0);
+ } else {
+ __write_host_tlbe(stlbe,
+ MAS0_TLBSEL(1) |
+ MAS0_ESEL(to_htlb1_esel(sesel)));
+ }
+}
+
+/* sesel is for tlb1 only */
+static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+ struct kvm_book3e_206_tlb_entry *gtlbe,
+ struct kvm_book3e_206_tlb_entry *stlbe,
+ int stlbsel, int sesel)
+{
+ int stid;
+
+ preempt_disable();
+ stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
+
+ stlbe->mas1 |= MAS1_TID(stid);
+ write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
+ preempt_enable();
+}
+
+#ifdef CONFIG_KVM_E500V2
+/* XXX should be a hook in the gva2hpa translation */
+void kvmppc_map_magic(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ struct kvm_book3e_206_tlb_entry magic;
+ ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+ unsigned int stid;
+ pfn_t pfn;
+
+ pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
+ get_page(pfn_to_page(pfn));
+
+ preempt_disable();
+ stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
+
+ magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
+ MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+ magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
+ magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+ MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
+ magic.mas8 = 0;
+
+ __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
+ preempt_enable();
+}
+#endif
+
+void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+ int esel)
+{
+ struct kvm_book3e_206_tlb_entry *gtlbe =
+ get_entry(vcpu_e500, tlbsel, esel);
+ struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref;
+
+ /* Don't bother with unmapped entries */
+ if (!(ref->flags & E500_TLB_VALID))
+ return;
+
+ if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) {
+ u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
+ int hw_tlb_indx;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ while (tmp) {
+ hw_tlb_indx = __ilog2_u64(tmp & -tmp);
+ mtspr(SPRN_MAS0,
+ MAS0_TLBSEL(1) |
+ MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
+ mtspr(SPRN_MAS1, 0);
+ asm volatile("tlbwe");
+ vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
+ tmp &= tmp - 1;
+ }
+ mb();
+ vcpu_e500->g2h_tlb1_map[esel] = 0;
+ ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID);
+ local_irq_restore(flags);
+ }
+
+ if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) {
+ /*
+ * TLB1 entry is backed by 4k pages. This should happen
+ * rarely and is not worth optimizing. Invalidate everything.
+ */
+ kvmppc_e500_tlbil_all(vcpu_e500);
+ ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID);
+ }
+
+ /* Already invalidated in between */
+ if (!(ref->flags & E500_TLB_VALID))
+ return;
+
+ /* Guest tlbe is backed by at most one host tlbe per shadow pid. */
+ kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
+
+ /* Mark the TLB as not backed by the host anymore */
+ ref->flags &= ~E500_TLB_VALID;
+}
+
+static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
+{
+ return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
+}
+
+static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
+ struct kvm_book3e_206_tlb_entry *gtlbe,
+ pfn_t pfn)
+{
+ ref->pfn = pfn;
+ ref->flags = E500_TLB_VALID;
+
+ if (tlbe_is_writable(gtlbe))
+ kvm_set_pfn_dirty(pfn);
+}
+
+static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
+{
+ if (ref->flags & E500_TLB_VALID) {
+ trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
+ ref->flags = 0;
+ }
+}
+
+static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ if (vcpu_e500->g2h_tlb1_map)
+ memset(vcpu_e500->g2h_tlb1_map, 0,
+ sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
+ if (vcpu_e500->h2g_tlb1_rmap)
+ memset(vcpu_e500->h2g_tlb1_rmap, 0,
+ sizeof(unsigned int) * host_tlb_params[1].entries);
+}
+
+static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ int tlbsel = 0;
+ int i;
+
+ for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
+ struct tlbe_ref *ref =
+ &vcpu_e500->gtlb_priv[tlbsel][i].ref;
+ kvmppc_e500_ref_release(ref);
+ }
+}
+
+static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ int stlbsel = 1;
+ int i;
+
+ kvmppc_e500_tlbil_all(vcpu_e500);
+
+ for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
+ struct tlbe_ref *ref =
+ &vcpu_e500->tlb_refs[stlbsel][i];
+ kvmppc_e500_ref_release(ref);
+ }
+
+ clear_tlb_privs(vcpu_e500);
+}
+
+void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ clear_tlb_refs(vcpu_e500);
+ clear_tlb1_bitmap(vcpu_e500);
+}
+
+/* TID must be supplied by the caller */
+static void kvmppc_e500_setup_stlbe(
+ struct kvm_vcpu *vcpu,
+ struct kvm_book3e_206_tlb_entry *gtlbe,
+ int tsize, struct tlbe_ref *ref, u64 gvaddr,
+ struct kvm_book3e_206_tlb_entry *stlbe)
+{
+ pfn_t pfn = ref->pfn;
+ u32 pr = vcpu->arch.shared->msr & MSR_PR;
+
+ BUG_ON(!(ref->flags & E500_TLB_VALID));
+
+ /* Force IPROT=0 for all guest mappings. */
+ stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
+ stlbe->mas2 = (gvaddr & MAS2_EPN) |
+ e500_shadow_mas2_attrib(gtlbe->mas2, pr);
+ stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+ e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+ stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
+#endif
+}
+
+static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+ u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+ int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
+ struct tlbe_ref *ref)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long pfn = 0; /* silence GCC warning */
+ unsigned long hva;
+ int pfnmap = 0;
+ int tsize = BOOK3E_PAGESZ_4K;
+
+ /*
+ * Translate guest physical to true physical, acquiring
+ * a page reference if it is normal, non-reserved memory.
+ *
+ * gfn_to_memslot() must succeed because otherwise we wouldn't
+ * have gotten this far. Eventually we should just pass the slot
+ * pointer through from the first lookup.
+ */
+ slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
+ hva = gfn_to_hva_memslot(slot, gfn);
+
+ if (tlbsel == 1) {
+ struct vm_area_struct *vma;
+ down_read(&current->mm->mmap_sem);
+
+ vma = find_vma(current->mm, hva);
+ if (vma && hva >= vma->vm_start &&
+ (vma->vm_flags & VM_PFNMAP)) {
+ /*
+ * This VMA is a physically contiguous region (e.g.
+ * /dev/mem) that bypasses normal Linux page
+ * management. Find the overlap between the
+ * vma and the memslot.
+ */
+
+ unsigned long start, end;
+ unsigned long slot_start, slot_end;
+
+ pfnmap = 1;
+
+ start = vma->vm_pgoff;
+ end = start +
+ ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+
+ pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
+
+ slot_start = pfn - (gfn - slot->base_gfn);
+ slot_end = slot_start + slot->npages;
+
+ if (start < slot_start)
+ start = slot_start;
+ if (end > slot_end)
+ end = slot_end;
+
+ tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+ MAS1_TSIZE_SHIFT;
+
+ /*
+ * e500 doesn't implement the lowest tsize bit,
+ * or 1K pages.
+ */
+ tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+
+ /*
+ * Now find the largest tsize (up to what the guest
+ * requested) that will cover gfn, stay within the
+ * range, and for which gfn and pfn are mutually
+ * aligned.
+ */
+
+ for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
+ unsigned long gfn_start, gfn_end, tsize_pages;
+ tsize_pages = 1 << (tsize - 2);
+
+ gfn_start = gfn & ~(tsize_pages - 1);
+ gfn_end = gfn_start + tsize_pages;
+
+ if (gfn_start + pfn - gfn < start)
+ continue;
+ if (gfn_end + pfn - gfn > end)
+ continue;
+ if ((gfn & (tsize_pages - 1)) !=
+ (pfn & (tsize_pages - 1)))
+ continue;
+
+ gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+ pfn &= ~(tsize_pages - 1);
+ break;
+ }
+ } else if (vma && hva >= vma->vm_start &&
+ (vma->vm_flags & VM_HUGETLB)) {
+ unsigned long psize = vma_kernel_pagesize(vma);
+
+ tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+ MAS1_TSIZE_SHIFT;
+
+ /*
+	 * Take the largest page size that satisfies both the host
+	 * and the guest mapping.
+ */
+ tsize = min(__ilog2(psize) - 10, tsize);
+
+ /*
+ * e500 doesn't implement the lowest tsize bit,
+ * or 1K pages.
+ */
+ tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+ }
+
+ up_read(&current->mm->mmap_sem);
+ }
+
+ if (likely(!pfnmap)) {
+ unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
+ if (is_error_noslot_pfn(pfn)) {
+ printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
+ (long)gfn);
+ return -EINVAL;
+ }
+
+ /* Align guest and physical address to page map boundaries */
+ pfn &= ~(tsize_pages - 1);
+ gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+ }
+
+ /* Drop old ref and setup new one. */
+ kvmppc_e500_ref_release(ref);
+ kvmppc_e500_ref_setup(ref, gtlbe, pfn);
+
+ kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
+ ref, gvaddr, stlbe);
+
+ /* Clear i-cache for new pages */
+ kvmppc_mmu_flush_icache(pfn);
+
+ /* Drop refcount on page, so that mmu notifiers can clear it */
+ kvm_release_pfn_clean(pfn);
+
+ return 0;
+}
+
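
For reference, the tsize loop in kvmppc_e500_shadow_map() above boils down to a mutual-alignment test; a minimal standalone sketch follows (the helper name is hypothetical, not part of this patch):

/*
 * Illustrative only: an e500 TLB entry of size tsize spans
 * 1 << (tsize - 2) 4K pages, so it can back a guest mapping only when
 * gfn and pfn share the same offset within that naturally aligned block.
 */
static int tsize_is_usable(unsigned long gfn, unsigned long pfn, int tsize)
{
	unsigned long tsize_pages = 1UL << (tsize - 2);

	return (gfn & (tsize_pages - 1)) == (pfn & (tsize_pages - 1));
}

For example, gfn 0x1005 backed by pfn 0x2315 passes for tsize 6 (16 pages; both have offset 5 in the block) but fails for tsize 8 (64 pages; offsets 0x05 vs 0x15).
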
+/* XXX only map the one-one case, for now use TLB0 */
+static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel,
+ struct kvm_book3e_206_tlb_entry *stlbe)
+{
+ struct kvm_book3e_206_tlb_entry *gtlbe;
+ struct tlbe_ref *ref;
+ int stlbsel = 0;
+ int sesel = 0;
+ int r;
+
+ gtlbe = get_entry(vcpu_e500, 0, esel);
+ ref = &vcpu_e500->gtlb_priv[0][esel].ref;
+
+ r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
+ get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
+ gtlbe, 0, stlbe, ref);
+ if (r)
+ return r;
+
+ write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel);
+
+ return 0;
+}
+
+static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500,
+ struct tlbe_ref *ref,
+ int esel)
+{
+ unsigned int sesel = vcpu_e500->host_tlb1_nv++;
+
+ if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
+ vcpu_e500->host_tlb1_nv = 0;
+
+ vcpu_e500->tlb_refs[1][sesel] = *ref;
+ vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel;
+ vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
+ if (vcpu_e500->h2g_tlb1_rmap[sesel]) {
+ unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel];
+ vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel);
+ }
+ vcpu_e500->h2g_tlb1_rmap[sesel] = esel;
+
+ return sesel;
+}
+
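
An illustration of the bookkeeping above (not part of the patch): suppose guest TLB1 entry 5 is shadowed into host TLB1 slot 2.

/*
 * After kvmppc_e500_tlb1_map_tlb1(vcpu_e500, &ref, 5) returns sesel == 2:
 *
 *   g2h_tlb1_map[5] |= 1ULL << 2;    guest entry 5 -> host slot 2
 *   h2g_tlb1_rmap[2] = 5;            host slot 2   -> guest entry 5
 *
 * If slot 2 is later recycled for another guest entry, the rmap lets the
 * code clear bit 2 in the old entry's g2h mask first, keeping both maps
 * consistent.
 */
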
+/*
+ * Caller must ensure that the specified guest TLB entry is safe to insert
+ * into the shadow TLB.  Handles both the one-to-one and one-to-many cases.
+ */
+static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+ u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+ struct kvm_book3e_206_tlb_entry *stlbe, int esel)
+{
+ struct tlbe_ref ref;
+ int sesel;
+ int r;
+
+ ref.flags = 0;
+ r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe,
+ &ref);
+ if (r)
+ return r;
+
+ /* Use TLB0 when we can only map a page with 4k */
+ if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) {
+ vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0;
+ write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0);
+ return 0;
+ }
+
+ /* Otherwise map into TLB1 */
+ sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, &ref, esel);
+ write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel);
+
+ return 0;
+}
+
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
+ unsigned int index)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ struct tlbe_priv *priv;
+ struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
+ int tlbsel = tlbsel_of(index);
+ int esel = esel_of(index);
+
+ gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+
+ switch (tlbsel) {
+ case 0:
+ priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
+
+ /* Triggers after clear_tlb_refs or on initial mapping */
+ if (!(priv->ref.flags & E500_TLB_VALID)) {
+ kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
+ } else {
+ kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
+ &priv->ref, eaddr, &stlbe);
+ write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0);
+ }
+ break;
+
+ case 1: {
+ gfn_t gfn = gpaddr >> PAGE_SHIFT;
+ kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe,
+ esel);
+ break;
+ }
+
+ default:
+ BUG();
+ break;
+ }
+}
+
+/************* MMU Notifiers *************/
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ trace_kvm_unmap_hva(hva);
+
+ /*
+ * Flush all shadow tlb entries everywhere. This is slow, but
+	 * we are 100% sure that we catch the page that is to be unmapped.
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	/* kvm_unmap_hva flushes everything anyway */
+ kvm_unmap_hva(kvm, start);
+
+ return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ /* XXX could be more clever ;) */
+ return 0;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ /* XXX could be more clever ;) */
+ return 0;
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ /* The page will get remapped properly on its next fault */
+ kvm_unmap_hva(kvm, hva);
+}
+
+/*****************************************/
+
+int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
+ host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ /*
+ * This should never happen on real e500 hardware, but is
+ * architecturally possible -- e.g. in some weird nested
+ * virtualization case.
+ */
+ if (host_tlb_params[0].entries == 0 ||
+ host_tlb_params[1].entries == 0) {
+ pr_err("%s: need to know host tlb size\n", __func__);
+ return -ENODEV;
+ }
+
+ host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
+ TLBnCFG_ASSOC_SHIFT;
+ host_tlb_params[1].ways = host_tlb_params[1].entries;
+
+ if (!is_power_of_2(host_tlb_params[0].entries) ||
+ !is_power_of_2(host_tlb_params[0].ways) ||
+ host_tlb_params[0].entries < host_tlb_params[0].ways ||
+ host_tlb_params[0].ways == 0) {
+ pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
+ __func__, host_tlb_params[0].entries,
+ host_tlb_params[0].ways);
+ return -ENODEV;
+ }
+
+ host_tlb_params[0].sets =
+ host_tlb_params[0].entries / host_tlb_params[0].ways;
+ host_tlb_params[1].sets = 1;
+
+ vcpu_e500->tlb_refs[0] =
+ kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
+ GFP_KERNEL);
+ if (!vcpu_e500->tlb_refs[0])
+ goto err;
+
+ vcpu_e500->tlb_refs[1] =
+ kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
+ GFP_KERNEL);
+ if (!vcpu_e500->tlb_refs[1])
+ goto err;
+
+ vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
+ host_tlb_params[1].entries,
+ GFP_KERNEL);
+ if (!vcpu_e500->h2g_tlb1_rmap)
+ goto err;
+
+ return 0;
+
+err:
+ kfree(vcpu_e500->tlb_refs[0]);
+ kfree(vcpu_e500->tlb_refs[1]);
+ return -EINVAL;
+}
+
+void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ kfree(vcpu_e500->h2g_tlb1_rmap);
+ kfree(vcpu_e500->tlb_refs[0]);
+ kfree(vcpu_e500->tlb_refs[1]);
+}
diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h
new file mode 100644
index 00000000000..7624835b76c
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef KVM_E500_MMU_HOST_H
+#define KVM_E500_MMU_HOST_H
+
+void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+ int esel);
+
+int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500);
+void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+#endif /* KVM_E500_MMU_HOST_H */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 9d9cddc5b34..7a73b6f72a8 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -150,8 +150,6 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
case SPRN_TBWL: break;
case SPRN_TBWU: break;
- case SPRN_MSSSR0: break;
-
case SPRN_DEC:
vcpu->arch.dec = spr_val;
kvmppc_emulate_dec(vcpu);
@@ -202,9 +200,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
case SPRN_PIR:
spr_val = vcpu->vcpu_id;
break;
- case SPRN_MSSSR0:
- spr_val = 0;
- break;
/* Note: mftb and TBRL/TBWL are user-accessible, so
* the guest can always access the real TB anyways.
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 70739a08956..934413cd3a1 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -237,7 +237,8 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
r = RESUME_HOST;
break;
default:
- BUG();
+ WARN_ON(1);
+ r = RESUME_GUEST;
}
return r;
@@ -305,6 +306,7 @@ int kvm_dev_ioctl_check_extension(long ext)
#ifdef CONFIG_BOOKE
case KVM_CAP_PPC_BOOKE_SREGS:
case KVM_CAP_PPC_BOOKE_WATCHDOG:
+ case KVM_CAP_PPC_EPR:
#else
case KVM_CAP_PPC_SEGSTATE:
case KVM_CAP_PPC_HIOR:
@@ -412,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
struct kvm_userspace_memory_region *mem,
- int user_alloc)
+ bool user_alloc)
{
return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
}
@@ -420,7 +422,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot old,
- int user_alloc)
+ bool user_alloc)
{
kvmppc_core_commit_memory_region(kvm, mem, old);
}
@@ -720,6 +722,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
for (i = 0; i < 9; ++i)
kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
vcpu->arch.hcall_needed = 0;
+#ifdef CONFIG_BOOKE
+ } else if (vcpu->arch.epr_needed) {
+ kvmppc_set_epr(vcpu, run->epr.epr);
+ vcpu->arch.epr_needed = 0;
+#endif
}
r = kvmppc_vcpu_run(run, vcpu);
@@ -761,6 +768,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
r = 0;
vcpu->arch.papr_enabled = true;
break;
+ case KVM_CAP_PPC_EPR:
+ r = 0;
+ vcpu->arch.epr_enabled = cap->args[0];
+ break;
#ifdef CONFIG_BOOKE
case KVM_CAP_PPC_BOOKE_WATCHDOG:
r = 0;
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index 7def77302d6..87c17bfb296 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -41,6 +41,7 @@ enum interruption_class {
IRQIO_CSC,
IRQIO_PCI,
IRQIO_MSI,
+ IRQIO_VIR,
NMI_NMI,
CPU_RST,
NR_ARCH_IRQS
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index b7841546991..16bd5d169cd 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -20,9 +20,7 @@
#include <asm/cpu.h>
#define KVM_MAX_VCPUS 64
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_USER_MEM_SLOTS 32
struct sca_entry {
atomic_t scn;
@@ -76,8 +74,11 @@ struct kvm_s390_sie_block {
__u64 epoch; /* 0x0038 */
__u8 reserved40[4]; /* 0x0040 */
#define LCTL_CR0 0x8000
+#define LCTL_CR6 0x0200
+#define LCTL_CR14 0x0002
__u16 lctl; /* 0x0044 */
__s16 icpua; /* 0x0046 */
+#define ICTL_LPSW 0x00400000
__u32 ictl; /* 0x0048 */
__u32 eca; /* 0x004c */
__u8 icptcode; /* 0x0050 */
@@ -127,6 +128,7 @@ struct kvm_vcpu_stat {
u32 deliver_prefix_signal;
u32 deliver_restart_signal;
u32 deliver_program_int;
+ u32 deliver_io_int;
u32 exit_wait_state;
u32 instruction_stidp;
u32 instruction_spx;
@@ -187,6 +189,11 @@ struct kvm_s390_emerg_info {
__u16 code;
};
+struct kvm_s390_mchk_info {
+ __u64 cr14;
+ __u64 mcic;
+};
+
struct kvm_s390_interrupt_info {
struct list_head list;
u64 type;
@@ -197,6 +204,7 @@ struct kvm_s390_interrupt_info {
struct kvm_s390_emerg_info emerg;
struct kvm_s390_extcall_info extcall;
struct kvm_s390_prefix_info prefix;
+ struct kvm_s390_mchk_info mchk;
};
};
@@ -254,6 +262,7 @@ struct kvm_arch{
debug_info_t *dbf;
struct kvm_s390_float_interrupt float_int;
struct gmap *gmap;
+ int css_support;
};
extern int sie64a(struct kvm_s390_sie_block *, u64 *);
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 9df824ea166..1630f439cd2 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -81,6 +81,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
[IRQIO_CSC] = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"},
[IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
[IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
+ [IRQIO_VIR] = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
[NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"},
[CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"},
};
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 22798ec33fd..f26ff1e31bd 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -26,27 +26,20 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
{
int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
- ((vcpu->arch.sie_block->ipb & 0xff00) << 4);
u64 useraddr;
int reg, rc;
vcpu->stat.instruction_lctlg++;
- if ((vcpu->arch.sie_block->ipb & 0xff) != 0x2f)
- return -EOPNOTSUPP;
- useraddr = disp2;
- if (base2)
- useraddr += vcpu->run->s.regs.gprs[base2];
+ useraddr = kvm_s390_get_base_disp_rsy(vcpu);
if (useraddr & 7)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
reg = reg1;
- VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
- disp2);
+ VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
+ useraddr);
trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
do {
@@ -68,23 +61,19 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
{
int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
u64 useraddr;
u32 val = 0;
int reg, rc;
vcpu->stat.instruction_lctl++;
- useraddr = disp2;
- if (base2)
- useraddr += vcpu->run->s.regs.gprs[base2];
+ useraddr = kvm_s390_get_base_disp_rs(vcpu);
if (useraddr & 3)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
- disp2);
+ VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
+ useraddr);
trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
reg = reg1;
@@ -104,14 +93,31 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
return 0;
}
-static intercept_handler_t instruction_handlers[256] = {
+static const intercept_handler_t eb_handlers[256] = {
+ [0x2f] = handle_lctlg,
+ [0x8a] = kvm_s390_handle_priv_eb,
+};
+
+static int handle_eb(struct kvm_vcpu *vcpu)
+{
+ intercept_handler_t handler;
+
+ handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
+ if (handler)
+ return handler(vcpu);
+ return -EOPNOTSUPP;
+}
+
+static const intercept_handler_t instruction_handlers[256] = {
[0x01] = kvm_s390_handle_01,
+ [0x82] = kvm_s390_handle_lpsw,
[0x83] = kvm_s390_handle_diag,
[0xae] = kvm_s390_handle_sigp,
[0xb2] = kvm_s390_handle_b2,
[0xb7] = handle_lctl,
+ [0xb9] = kvm_s390_handle_b9,
[0xe5] = kvm_s390_handle_e5,
- [0xeb] = handle_lctlg,
+ [0xeb] = handle_eb,
};
static int handle_noop(struct kvm_vcpu *vcpu)
@@ -258,6 +264,7 @@ static const intercept_handler_t intercept_funcs[] = {
[0x0C >> 2] = handle_instruction_and_prog,
[0x10 >> 2] = handle_noop,
[0x14 >> 2] = handle_noop,
+ [0x18 >> 2] = handle_noop,
[0x1C >> 2] = kvm_s390_handle_wait,
[0x20 >> 2] = handle_validity,
[0x28 >> 2] = handle_stop,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 87418b50f21..37116a77cb4 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -21,11 +21,31 @@
#include "gaccess.h"
#include "trace-s390.h"
+#define IOINT_SCHID_MASK 0x0000ffff
+#define IOINT_SSID_MASK 0x00030000
+#define IOINT_CSSID_MASK 0x03fc0000
+#define IOINT_AI_MASK 0x04000000
+
+static int is_ioint(u64 type)
+{
+ return ((type & 0xfffe0000u) != 0xfffe0000u);
+}
+
static int psw_extint_disabled(struct kvm_vcpu *vcpu)
{
return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
}
+static int psw_ioint_disabled(struct kvm_vcpu *vcpu)
+{
+ return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO);
+}
+
+static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
+{
+ return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK);
+}
+
static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
{
if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
@@ -35,6 +55,13 @@ static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
return 1;
}
+static u64 int_word_to_isc_bits(u32 int_word)
+{
+ u8 isc = (int_word & 0x38000000) >> 27;
+
+ return (0x80 >> isc) << 24;
+}
+
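
A worked example of the mapping just defined (illustrative, not part of the patch):

/* An I/O interruption word carrying ISC 3 in its 0x38000000 field: */
u32 int_word = 0x18000000;
u8  isc      = (int_word & 0x38000000) >> 27;   /* = 3 */
u64 cr6_bits = (u64)(0x80 >> isc) << 24;        /* = 0x10000000 */
/* 0x10000000 is the subclass-mask bit later tested against gcr[6]. */
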
static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
struct kvm_s390_interrupt_info *inti)
{
@@ -67,7 +94,22 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
case KVM_S390_SIGP_SET_PREFIX:
case KVM_S390_RESTART:
return 1;
+ case KVM_S390_MCHK:
+ if (psw_mchk_disabled(vcpu))
+ return 0;
+ if (vcpu->arch.sie_block->gcr[14] & inti->mchk.cr14)
+ return 1;
+ return 0;
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+ if (psw_ioint_disabled(vcpu))
+ return 0;
+ if (vcpu->arch.sie_block->gcr[6] &
+ int_word_to_isc_bits(inti->io.io_int_word))
+ return 1;
+ return 0;
default:
+ printk(KERN_WARNING "illegal interrupt type %llx\n",
+ inti->type);
BUG();
}
return 0;
@@ -93,6 +135,7 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
&vcpu->arch.sie_block->cpuflags);
vcpu->arch.sie_block->lctl = 0x0000;
+ vcpu->arch.sie_block->ictl &= ~ICTL_LPSW;
}
static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@ -116,6 +159,18 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
case KVM_S390_SIGP_STOP:
__set_cpuflag(vcpu, CPUSTAT_STOP_INT);
break;
+ case KVM_S390_MCHK:
+ if (psw_mchk_disabled(vcpu))
+ vcpu->arch.sie_block->ictl |= ICTL_LPSW;
+ else
+ vcpu->arch.sie_block->lctl |= LCTL_CR14;
+ break;
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+ if (psw_ioint_disabled(vcpu))
+ __set_cpuflag(vcpu, CPUSTAT_IO_INT);
+ else
+ vcpu->arch.sie_block->lctl |= LCTL_CR6;
+ break;
default:
BUG();
}
@@ -297,6 +352,73 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
exception = 1;
break;
+ case KVM_S390_MCHK:
+ VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
+ inti->mchk.mcic);
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->mchk.cr14,
+ inti->mchk.mcic);
+ rc = kvm_s390_vcpu_store_status(vcpu,
+ KVM_S390_STORE_STATUS_PREFIXED);
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic);
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+ __LC_MCK_NEW_PSW, sizeof(psw_t));
+ if (rc == -EFAULT)
+ exception = 1;
+ break;
+
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+ {
+ __u32 param0 = ((__u32)inti->io.subchannel_id << 16) |
+ inti->io.subchannel_nr;
+ __u64 param1 = ((__u64)inti->io.io_int_parm << 32) |
+ inti->io.io_int_word;
+ VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
+ vcpu->stat.deliver_io_int++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ param0, param1);
+ rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID,
+ inti->io.subchannel_id);
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR,
+ inti->io.subchannel_nr);
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = put_guest_u32(vcpu, __LC_IO_INT_PARM,
+ inti->io.io_int_parm);
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = put_guest_u32(vcpu, __LC_IO_INT_WORD,
+ inti->io.io_int_word);
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ if (rc == -EFAULT)
+ exception = 1;
+
+ rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+ __LC_IO_NEW_PSW, sizeof(psw_t));
+ if (rc == -EFAULT)
+ exception = 1;
+ break;
+ }
default:
BUG();
}
@@ -518,6 +640,61 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
}
}
+void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
+ struct kvm_s390_interrupt_info *n, *inti = NULL;
+ int deliver;
+
+ __reset_intercept_indicators(vcpu);
+ if (atomic_read(&li->active)) {
+ do {
+ deliver = 0;
+ spin_lock_bh(&li->lock);
+ list_for_each_entry_safe(inti, n, &li->list, list) {
+ if ((inti->type == KVM_S390_MCHK) &&
+ __interrupt_is_deliverable(vcpu, inti)) {
+ list_del(&inti->list);
+ deliver = 1;
+ break;
+ }
+ __set_intercept_indicator(vcpu, inti);
+ }
+ if (list_empty(&li->list))
+ atomic_set(&li->active, 0);
+ spin_unlock_bh(&li->lock);
+ if (deliver) {
+ __do_deliver_interrupt(vcpu, inti);
+ kfree(inti);
+ }
+ } while (deliver);
+ }
+
+ if (atomic_read(&fi->active)) {
+ do {
+ deliver = 0;
+ spin_lock(&fi->lock);
+ list_for_each_entry_safe(inti, n, &fi->list, list) {
+ if ((inti->type == KVM_S390_MCHK) &&
+ __interrupt_is_deliverable(vcpu, inti)) {
+ list_del(&inti->list);
+ deliver = 1;
+ break;
+ }
+ __set_intercept_indicator(vcpu, inti);
+ }
+ if (list_empty(&fi->list))
+ atomic_set(&fi->active, 0);
+ spin_unlock(&fi->lock);
+ if (deliver) {
+ __do_deliver_interrupt(vcpu, inti);
+ kfree(inti);
+ }
+ } while (deliver);
+ }
+}
+
int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -540,12 +717,50 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
return 0;
}
+struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
+ u64 cr6, u64 schid)
+{
+ struct kvm_s390_float_interrupt *fi;
+ struct kvm_s390_interrupt_info *inti, *iter;
+
+ if ((!schid && !cr6) || (schid && cr6))
+ return NULL;
+ mutex_lock(&kvm->lock);
+ fi = &kvm->arch.float_int;
+ spin_lock(&fi->lock);
+ inti = NULL;
+ list_for_each_entry(iter, &fi->list, list) {
+ if (!is_ioint(iter->type))
+ continue;
+ if (cr6 &&
+ ((cr6 & int_word_to_isc_bits(iter->io.io_int_word)) == 0))
+ continue;
+ if (schid) {
+ if (((schid & 0x00000000ffff0000) >> 16) !=
+ iter->io.subchannel_id)
+ continue;
+ if ((schid & 0x000000000000ffff) !=
+ iter->io.subchannel_nr)
+ continue;
+ }
+ inti = iter;
+ break;
+ }
+ if (inti)
+ list_del_init(&inti->list);
+ if (list_empty(&fi->list))
+ atomic_set(&fi->active, 0);
+ spin_unlock(&fi->lock);
+ mutex_unlock(&kvm->lock);
+ return inti;
+}
+
int kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt *s390int)
{
struct kvm_s390_local_interrupt *li;
struct kvm_s390_float_interrupt *fi;
- struct kvm_s390_interrupt_info *inti;
+ struct kvm_s390_interrupt_info *inti, *iter;
int sigcpu;
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -569,6 +784,29 @@ int kvm_s390_inject_vm(struct kvm *kvm,
case KVM_S390_SIGP_STOP:
case KVM_S390_INT_EXTERNAL_CALL:
case KVM_S390_INT_EMERGENCY:
+ kfree(inti);
+ return -EINVAL;
+ case KVM_S390_MCHK:
+ VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
+ s390int->parm64);
+ inti->type = s390int->type;
+ inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
+ inti->mchk.mcic = s390int->parm64;
+ break;
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+ if (s390int->type & IOINT_AI_MASK)
+ VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
+ else
+ VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
+ s390int->type & IOINT_CSSID_MASK,
+ s390int->type & IOINT_SSID_MASK,
+ s390int->type & IOINT_SCHID_MASK);
+ inti->type = s390int->type;
+ inti->io.subchannel_id = s390int->parm >> 16;
+ inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
+ inti->io.io_int_parm = s390int->parm64 >> 32;
+ inti->io.io_int_word = s390int->parm64 & 0x00000000ffffffffull;
+ break;
default:
kfree(inti);
return -EINVAL;
@@ -579,7 +817,22 @@ int kvm_s390_inject_vm(struct kvm *kvm,
mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
spin_lock(&fi->lock);
- list_add_tail(&inti->list, &fi->list);
+ if (!is_ioint(inti->type))
+ list_add_tail(&inti->list, &fi->list);
+ else {
+ u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+
+ /* Keep I/O interrupts sorted in isc order. */
+ list_for_each_entry(iter, &fi->list, list) {
+ if (!is_ioint(iter->type))
+ continue;
+ if (int_word_to_isc_bits(iter->io.io_int_word)
+ <= isc_bits)
+ continue;
+ break;
+ }
+ list_add_tail(&inti->list, &iter->list);
+ }
atomic_set(&fi->active, 1);
sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
if (sigcpu == KVM_MAX_VCPUS) {
@@ -651,8 +904,15 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
inti->type = s390int->type;
inti->emerg.code = s390int->parm;
break;
+ case KVM_S390_MCHK:
+ VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
+ s390int->parm64);
+ inti->type = s390int->type;
+ inti->mchk.mcic = s390int->parm64;
+ break;
case KVM_S390_INT_VIRTIO:
case KVM_S390_INT_SERVICE:
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
default:
kfree(inti);
return -EINVAL;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2923781590a..4cf35a0a79e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -140,6 +140,8 @@ int kvm_dev_ioctl_check_extension(long ext)
#endif
case KVM_CAP_SYNC_REGS:
case KVM_CAP_ONE_REG:
+ case KVM_CAP_ENABLE_CAP:
+ case KVM_CAP_S390_CSS_SUPPORT:
r = 1;
break;
case KVM_CAP_NR_VCPUS:
@@ -234,6 +236,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (!kvm->arch.gmap)
goto out_nogmap;
}
+
+ kvm->arch.css_support = 0;
+
return 0;
out_nogmap:
debug_unregister(kvm->arch.dbf);
@@ -659,6 +664,7 @@ rerun_vcpu:
case KVM_EXIT_INTR:
case KVM_EXIT_S390_RESET:
case KVM_EXIT_S390_UCONTROL:
+ case KVM_EXIT_S390_TSCH:
break;
default:
BUG();
@@ -766,6 +772,14 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
} else
prefix = 0;
+ /*
+ * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
+	 * copying in vcpu load/put. Let's update our copies before we save
+	 * them into the save area.
+ */
+ save_fp_regs(&vcpu->arch.guest_fpregs);
+ save_access_regs(vcpu->run->s.regs.acrs);
+
if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
vcpu->arch.guest_fpregs.fprs, 128, prefix))
return -EFAULT;
@@ -810,6 +824,29 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
return 0;
}
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_S390_CSS_SUPPORT:
+ if (!vcpu->kvm->arch.css_support) {
+ vcpu->kvm->arch.css_support = 1;
+ trace_kvm_s390_enable_css(vcpu->kvm);
+ }
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -896,6 +933,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_ENABLE_CAP:
+ {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ break;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -930,7 +976,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
struct kvm_userspace_memory_region *mem,
- int user_alloc)
+ bool user_alloc)
{
/* A few sanity checks. We can have exactly one memory slot which has
to start at guest virtual zero and which has to be located at a
@@ -960,7 +1006,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot old,
- int user_alloc)
+ bool user_alloc)
{
int rc;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index d75bc5e92c5..4d89d64a816 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -65,21 +65,67 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
vcpu->arch.sie_block->ihcpu = 0xffff;
}
+static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
+{
+ u32 base2 = vcpu->arch.sie_block->ipb >> 28;
+ u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+
+ return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
+}
+
+static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
+ u64 *address1, u64 *address2)
+{
+ u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
+ u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
+ u32 base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
+ u32 disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
+
+ *address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1;
+ *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
+}
+
+static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
+{
+ u32 base2 = vcpu->arch.sie_block->ipb >> 28;
+ u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
+ ((vcpu->arch.sie_block->ipb & 0xff00) << 4);
+	/* The displacement is a 20-bit _SIGNED_ value */
+	if (disp2 & 0x80000)
+		disp2 += 0xfff00000;
+
+ return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
+}
+
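
A worked example of the sign extension in kvm_s390_get_base_disp_rsy() above (illustrative only):

/*
 * DH = 0xff and DL = 0xfff encode a displacement of -1:
 *
 *   disp2 = 0x00fff + (0xff00 << 4) = 0x000fffff
 *   bit 0x80000 is set, so disp2 += 0xfff00000  ->  0xffffffff
 *   (long)(int)0xffffffff == -1, which is added to the base register.
 */
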
+static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
+{
+ u32 base2 = vcpu->arch.sie_block->ipb >> 28;
+ u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
+
+ return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
+}
+
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
void kvm_s390_tasklet(unsigned long parm);
void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
+void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
int kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt *s390int);
int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
struct kvm_s390_interrupt *s390int);
int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
+struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
+ u64 cr6, u64 schid);
/* implemented in priv.c */
int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu);
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d768906f15c..0ef9894606e 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -18,23 +18,21 @@
#include <asm/debug.h>
#include <asm/ebcdic.h>
#include <asm/sysinfo.h>
+#include <asm/ptrace.h>
+#include <asm/compat.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
static int handle_set_prefix(struct kvm_vcpu *vcpu)
{
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
u64 operand2;
u32 address = 0;
u8 tmp;
vcpu->stat.instruction_spx++;
- operand2 = disp2;
- if (base2)
- operand2 += vcpu->run->s.regs.gprs[base2];
+ operand2 = kvm_s390_get_base_disp_s(vcpu);
/* must be word boundary */
if (operand2 & 3) {
@@ -67,15 +65,12 @@ out:
static int handle_store_prefix(struct kvm_vcpu *vcpu)
{
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
u64 operand2;
u32 address;
vcpu->stat.instruction_stpx++;
- operand2 = disp2;
- if (base2)
- operand2 += vcpu->run->s.regs.gprs[base2];
+
+ operand2 = kvm_s390_get_base_disp_s(vcpu);
/* must be word boundary */
if (operand2 & 3) {
@@ -100,15 +95,12 @@ out:
static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
{
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
u64 useraddr;
int rc;
vcpu->stat.instruction_stap++;
- useraddr = disp2;
- if (base2)
- useraddr += vcpu->run->s.regs.gprs[base2];
+
+ useraddr = kvm_s390_get_base_disp_s(vcpu);
if (useraddr & 1) {
kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -135,24 +127,96 @@ static int handle_skey(struct kvm_vcpu *vcpu)
return 0;
}
-static int handle_stsch(struct kvm_vcpu *vcpu)
+static int handle_tpi(struct kvm_vcpu *vcpu)
{
- vcpu->stat.instruction_stsch++;
- VCPU_EVENT(vcpu, 4, "%s", "store subchannel - CC3");
- /* condition code 3 */
+ u64 addr;
+ struct kvm_s390_interrupt_info *inti;
+ int cc;
+
+ addr = kvm_s390_get_base_disp_s(vcpu);
+
+ inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0);
+ if (inti) {
+ if (addr) {
+ /*
+ * Store the two-word I/O interruption code into the
+ * provided area.
+ */
+ put_guest_u16(vcpu, addr, inti->io.subchannel_id);
+ put_guest_u16(vcpu, addr + 2, inti->io.subchannel_nr);
+ put_guest_u32(vcpu, addr + 4, inti->io.io_int_parm);
+ } else {
+ /*
+ * Store the three-word I/O interruption code into
+ * the appropriate lowcore area.
+ */
+ put_guest_u16(vcpu, 184, inti->io.subchannel_id);
+ put_guest_u16(vcpu, 186, inti->io.subchannel_nr);
+ put_guest_u32(vcpu, 188, inti->io.io_int_parm);
+ put_guest_u32(vcpu, 192, inti->io.io_int_word);
+ }
+ cc = 1;
+ } else
+ cc = 0;
+ kfree(inti);
+ /* Set condition code and we're done. */
vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
- vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
+ vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
return 0;
}
-static int handle_chsc(struct kvm_vcpu *vcpu)
+static int handle_tsch(struct kvm_vcpu *vcpu)
{
- vcpu->stat.instruction_chsc++;
- VCPU_EVENT(vcpu, 4, "%s", "channel subsystem call - CC3");
- /* condition code 3 */
- vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
- vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
- return 0;
+ struct kvm_s390_interrupt_info *inti;
+
+ inti = kvm_s390_get_io_int(vcpu->kvm, 0,
+ vcpu->run->s.regs.gprs[1]);
+
+ /*
+ * Prepare exit to userspace.
+ * We indicate whether we dequeued a pending I/O interrupt
+ * so that userspace can re-inject it if the instruction gets
+ * a program check. While this may re-order the pending I/O
+ * interrupts, this is no problem since the priority is kept
+ * intact.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_S390_TSCH;
+ vcpu->run->s390_tsch.dequeued = !!inti;
+ if (inti) {
+ vcpu->run->s390_tsch.subchannel_id = inti->io.subchannel_id;
+ vcpu->run->s390_tsch.subchannel_nr = inti->io.subchannel_nr;
+ vcpu->run->s390_tsch.io_int_parm = inti->io.io_int_parm;
+ vcpu->run->s390_tsch.io_int_word = inti->io.io_int_word;
+ }
+ vcpu->run->s390_tsch.ipb = vcpu->arch.sie_block->ipb;
+ kfree(inti);
+ return -EREMOTE;
+}
+
+static int handle_io_inst(struct kvm_vcpu *vcpu)
+{
+ VCPU_EVENT(vcpu, 4, "%s", "I/O instruction");
+
+ if (vcpu->kvm->arch.css_support) {
+ /*
+ * Most I/O instructions will be handled by userspace.
+ * Exceptions are tpi and the interrupt portion of tsch.
+ */
+ if (vcpu->arch.sie_block->ipa == 0xb236)
+ return handle_tpi(vcpu);
+ if (vcpu->arch.sie_block->ipa == 0xb235)
+ return handle_tsch(vcpu);
+ /* Handle in userspace. */
+ return -EOPNOTSUPP;
+ } else {
+ /*
+		 * Set condition code 3 to stop the guest from issuing channel
+ * I/O instructions.
+ */
+ vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+ vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
+ return 0;
+ }
}
static int handle_stfl(struct kvm_vcpu *vcpu)
@@ -176,17 +240,107 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
return 0;
}
+static void handle_new_psw(struct kvm_vcpu *vcpu)
+{
+ /* Check whether the new psw is enabled for machine checks. */
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK)
+ kvm_s390_deliver_pending_machine_checks(vcpu);
+}
+
+#define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA)
+#define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL
+#define PSW_ADDR_24 0x00000000000fffffUL
+#define PSW_ADDR_31 0x000000007fffffffUL
+
+int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
+{
+ u64 addr;
+ psw_compat_t new_psw;
+
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu,
+ PGM_PRIVILEGED_OPERATION);
+
+ addr = kvm_s390_get_base_disp_s(vcpu);
+
+ if (addr & 7) {
+ kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ goto out;
+ }
+
+ if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
+ kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ goto out;
+ }
+
+ if (!(new_psw.mask & PSW32_MASK_BASE)) {
+ kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ goto out;
+ }
+
+ vcpu->arch.sie_block->gpsw.mask =
+ (new_psw.mask & ~PSW32_MASK_BASE) << 32;
+ vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
+
+ if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
+ (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
+ (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
+ ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
+ PSW_MASK_EA)) {
+ kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ goto out;
+ }
+
+ handle_new_psw(vcpu);
+out:
+ return 0;
+}
+
+static int handle_lpswe(struct kvm_vcpu *vcpu)
+{
+ u64 addr;
+ psw_t new_psw;
+
+ addr = kvm_s390_get_base_disp_s(vcpu);
+
+ if (addr & 7) {
+ kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ goto out;
+ }
+
+ if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
+ kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ goto out;
+ }
+
+ vcpu->arch.sie_block->gpsw.mask = new_psw.mask;
+ vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
+
+ if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
+ (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
+ PSW_MASK_BA) &&
+ (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) ||
+ (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
+ (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
+ ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
+ PSW_MASK_EA)) {
+ kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ goto out;
+ }
+
+ handle_new_psw(vcpu);
+out:
+ return 0;
+}
+
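
The combined validity check in handle_lpswe() above covers four independent cases; here is a hedged restatement as a standalone helper (the function name is hypothetical and not introduced by this patch):

/* Illustrative only: true if a freshly loaded 64-bit PSW is malformed. */
static bool psw_is_invalid(u64 mask, u64 addr)
{
	if (mask & PSW_MASK_UNASSIGNED)         /* reserved mask bits set */
		return true;
	if ((mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA &&
	    (addr & ~PSW_ADDR_31))              /* 31-bit mode, address too wide */
		return true;
	if (!(mask & PSW_MASK_ADDR_MODE) &&
	    (addr & ~PSW_ADDR_24))              /* 24-bit mode, address too wide */
		return true;
	if ((mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA)
		return true;                    /* EA set without BA is invalid */
	return false;
}
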
static int handle_stidp(struct kvm_vcpu *vcpu)
{
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
u64 operand2;
int rc;
vcpu->stat.instruction_stidp++;
- operand2 = disp2;
- if (base2)
- operand2 += vcpu->run->s.regs.gprs[base2];
+
+ operand2 = kvm_s390_get_base_disp_s(vcpu);
if (operand2 & 7) {
kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -240,17 +394,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
u64 operand2;
unsigned long mem;
vcpu->stat.instruction_stsi++;
VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
- operand2 = disp2;
- if (base2)
- operand2 += vcpu->run->s.regs.gprs[base2];
+ operand2 = kvm_s390_get_base_disp_s(vcpu);
if (operand2 & 0xfff && fc > 0)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -297,7 +447,7 @@ out_fail:
return 0;
}
-static intercept_handler_t priv_handlers[256] = {
+static const intercept_handler_t b2_handlers[256] = {
[0x02] = handle_stidp,
[0x10] = handle_set_prefix,
[0x11] = handle_store_prefix,
@@ -305,10 +455,25 @@ static intercept_handler_t priv_handlers[256] = {
[0x29] = handle_skey,
[0x2a] = handle_skey,
[0x2b] = handle_skey,
- [0x34] = handle_stsch,
- [0x5f] = handle_chsc,
+ [0x30] = handle_io_inst,
+ [0x31] = handle_io_inst,
+ [0x32] = handle_io_inst,
+ [0x33] = handle_io_inst,
+ [0x34] = handle_io_inst,
+ [0x35] = handle_io_inst,
+ [0x36] = handle_io_inst,
+ [0x37] = handle_io_inst,
+ [0x38] = handle_io_inst,
+ [0x39] = handle_io_inst,
+ [0x3a] = handle_io_inst,
+ [0x3b] = handle_io_inst,
+ [0x3c] = handle_io_inst,
+ [0x5f] = handle_io_inst,
+ [0x74] = handle_io_inst,
+ [0x76] = handle_io_inst,
[0x7d] = handle_stsi,
[0xb1] = handle_stfl,
+ [0xb2] = handle_lpswe,
};
int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
@@ -322,7 +487,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
* state bit and (a) handle the instruction or (b) send a code 2
* program check.
* Anything else goes to userspace.*/
- handler = priv_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
+ handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
if (handler) {
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu,
@@ -333,19 +498,74 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
+static int handle_epsw(struct kvm_vcpu *vcpu)
+{
+ int reg1, reg2;
+
+ reg1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 24;
+ reg2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
+
+ /* This basically extracts the mask half of the psw. */
+ vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
+ vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
+ if (reg2) {
+ vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000;
+ vcpu->run->s.regs.gprs[reg2] |=
+ vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff;
+ }
+ return 0;
+}
+
+static const intercept_handler_t b9_handlers[256] = {
+ [0x8d] = handle_epsw,
+ [0x9c] = handle_io_inst,
+};
+
+int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
+{
+ intercept_handler_t handler;
+
+ /* This is handled just as for the B2 instructions. */
+ handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
+ if (handler) {
+ if ((handler != handle_epsw) &&
+ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE))
+ return kvm_s390_inject_program_int(vcpu,
+ PGM_PRIVILEGED_OPERATION);
+ else
+ return handler(vcpu);
+ }
+ return -EOPNOTSUPP;
+}
+
+static const intercept_handler_t eb_handlers[256] = {
+ [0x8a] = handle_io_inst,
+};
+
+int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu)
+{
+ intercept_handler_t handler;
+
+ /* All eb instructions that end up here are privileged. */
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu,
+ PGM_PRIVILEGED_OPERATION);
+ handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
+ if (handler)
+ return handler(vcpu);
+ return -EOPNOTSUPP;
+}
+
static int handle_tprot(struct kvm_vcpu *vcpu)
{
- int base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
- int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
- int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
- int disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
- u64 address1 = disp1 + base1 ? vcpu->run->s.regs.gprs[base1] : 0;
- u64 address2 = disp2 + base2 ? vcpu->run->s.regs.gprs[base2] : 0;
+ u64 address1, address2;
struct vm_area_struct *vma;
unsigned long user_address;
vcpu->stat.instruction_tprot++;
+ kvm_s390_get_base_disp_sse(vcpu, &address1, &address2);
+
/* we only handle the Linux memory detection case:
* access key == 0
* guest DAT == off
@@ -405,7 +625,7 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
return 0;
}
-static intercept_handler_t x01_handlers[256] = {
+static const intercept_handler_t x01_handlers[256] = {
[0x07] = handle_sckpf,
};
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 566ddf6e8df..1c48ab2845e 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -137,8 +137,10 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
inti->type = KVM_S390_SIGP_STOP;
spin_lock_bh(&li->lock);
- if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED))
+ if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
+ kfree(inti);
goto out;
+ }
list_add_tail(&inti->list, &li->list);
atomic_set(&li->active, 1);
atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
@@ -324,8 +326,6 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
{
int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
int r3 = vcpu->arch.sie_block->ipa & 0x000f;
- int base2 = vcpu->arch.sie_block->ipb >> 28;
- int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
u32 parameter;
u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
u8 order_code;
@@ -336,9 +336,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu,
PGM_PRIVILEGED_OPERATION);
- order_code = disp2;
- if (base2)
- order_code += vcpu->run->s.regs.gprs[base2];
+ order_code = kvm_s390_get_base_disp_rs(vcpu);
if (r1 % 2)
parameter = vcpu->run->s.regs.gprs[r1];
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 90fdf85b5ff..13f30f58a2d 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -141,13 +141,13 @@ TRACE_EVENT(kvm_s390_inject_vcpu,
* Trace point for the actual delivery of interrupts.
*/
TRACE_EVENT(kvm_s390_deliver_interrupt,
- TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1),
+ TP_PROTO(unsigned int id, __u64 type, __u64 data0, __u64 data1),
TP_ARGS(id, type, data0, data1),
TP_STRUCT__entry(
__field(int, id)
__field(__u32, inttype)
- __field(__u32, data0)
+ __field(__u64, data0)
__field(__u64, data1)
),
@@ -159,7 +159,7 @@ TRACE_EVENT(kvm_s390_deliver_interrupt,
),
TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \
- "data:%08x %016llx",
+ "data:%08llx %016llx",
__entry->id, __entry->inttype,
__print_symbolic(__entry->inttype, kvm_s390_int_type),
__entry->data0, __entry->data1)
@@ -204,6 +204,26 @@ TRACE_EVENT(kvm_s390_stop_request,
);
+/*
+ * Trace point for enabling channel I/O instruction support.
+ */
+TRACE_EVENT(kvm_s390_enable_css,
+ TP_PROTO(void *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(void *, kvm)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm = kvm;
+ ),
+
+ TP_printk("enabling channel I/O support (kvm @ %p)\n",
+ __entry->kvm)
+ );
+
+
#endif /* _TRACE_KVMS390_H */
/* This part must be outside protection */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dc87b65e9c3..635a74d2240 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,10 +33,10 @@
#define KVM_MAX_VCPUS 254
#define KVM_SOFT_MAX_VCPUS 160
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+#define KVM_USER_MEM_SLOTS 125
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 3
+#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
#define KVM_MMIO_SIZE 16
@@ -219,11 +219,6 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- /*
- * One bit set per slot which has memory
- * in this shadow page.
- */
- DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
@@ -502,6 +497,13 @@ struct kvm_vcpu_arch {
u64 msr_val;
struct gfn_to_hva_cache data;
} pv_eoi;
+
+ /*
+	 * Indicates whether the access faulted on its page table in the
+	 * guest; set while fixing a page fault and used to detect
+	 * unhandleable instructions.
+ */
+ bool write_fault_to_shadow_pgtable;
};
struct kvm_lpage_info {
@@ -697,6 +699,11 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+ int (*vm_has_apicv)(struct kvm *kvm);
+ void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+ void (*hwapic_isr_update)(struct kvm *kvm, int isr);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -991,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65231e173ba..695399f2d5e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -27,7 +27,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
*
* Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
* The hypercall number should be placed in rax and the return value will be
- * placed in rax. No other registers will be clobbered unless explicited
+ * placed in rax. No other registers will be clobbered unless explicitly
* noted by the particular hypercall.
*/
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 235b49fa554..b6fbf860e39 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -57,9 +57,12 @@
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
#define SECONDARY_EXEC_RDTSCP 0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
@@ -97,6 +100,7 @@ enum vmcs_field {
GUEST_GS_SELECTOR = 0x0000080a,
GUEST_LDTR_SELECTOR = 0x0000080c,
GUEST_TR_SELECTOR = 0x0000080e,
+ GUEST_INTR_STATUS = 0x00000810,
HOST_ES_SELECTOR = 0x00000c00,
HOST_CS_SELECTOR = 0x00000c02,
HOST_SS_SELECTOR = 0x00000c04,
@@ -124,6 +128,14 @@ enum vmcs_field {
APIC_ACCESS_ADDR_HIGH = 0x00002015,
EPT_POINTER = 0x0000201a,
EPT_POINTER_HIGH = 0x0000201b,
+ EOI_EXIT_BITMAP0 = 0x0000201c,
+ EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ EOI_EXIT_BITMAP1 = 0x0000201e,
+ EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ EOI_EXIT_BITMAP2 = 0x00002020,
+ EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ EOI_EXIT_BITMAP3 = 0x00002022,
+ EOI_EXIT_BITMAP3_HIGH = 0x00002023,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
@@ -346,9 +358,9 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00
-#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
+#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
#define VMX_NR_VPIDS (1 << 16)
#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 979d03bce13..2871fccfee6 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -62,10 +62,12 @@
#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_INVPCID 58
#define VMX_EXIT_REASONS \
@@ -103,7 +105,12 @@
{ EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
{ EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
{ EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
- { EXIT_REASON_WBINVD, "WBINVD" }
+ { EXIT_REASON_WBINVD, "WBINVD" }, \
+ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
+ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
+ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_INVD, "INVD" }, \
+ { EXIT_REASON_INVPCID, "INVPCID" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 9f966dc0b9e..0732f0089a3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -218,6 +218,9 @@ static void kvm_shutdown(void)
void __init kvmclock_init(void)
{
unsigned long mem;
+ int size;
+
+ size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
if (!kvm_para_available())
return;
@@ -231,16 +234,14 @@ void __init kvmclock_init(void)
printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
- mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
- PAGE_SIZE);
+ mem = memblock_alloc(size, PAGE_SIZE);
if (!mem)
return;
hv_clock = __va(mem);
if (kvm_register_clock("boot clock")) {
hv_clock = NULL;
- memblock_free(mem,
- sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+ memblock_free(mem, size);
return;
}
pv_time_ops.sched_clock = kvm_clock_read;
@@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
struct pvclock_vcpu_time_info *vcpu_time;
unsigned int size;
- size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+ size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
preempt_disable();
cpu = smp_processor_id();
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a27e7637110..a335cc6cde7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
#include "kvm_cache_regs.h"
#include <linux/module.h>
#include <asm/kvm_emulate.h>
+#include <linux/stringify.h>
#include "x86.h"
#include "tss.h"
@@ -43,7 +44,7 @@
#define OpCL 9ull /* CL register (for shifts) */
#define OpImmByte 10ull /* 8-bit sign extended immediate */
#define OpOne 11ull /* Implied 1 */
-#define OpImm 12ull /* Sign extended immediate */
+#define OpImm 12ull /* Sign extended up to 32-bit immediate */
#define OpMem16 13ull /* Memory operand (16-bit). */
#define OpMem32 14ull /* Memory operand (32-bit). */
#define OpImmU 15ull /* Immediate operand, zero extended */
@@ -58,6 +59,7 @@
#define OpFS 24ull /* FS */
#define OpGS 25ull /* GS */
#define OpMem8 26ull /* 8-bit zero extended memory operand */
+#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
#define OpBits 5 /* Width of operand field */
#define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +103,7 @@
#define SrcMemFAddr (OpMemFAddr << SrcShift)
#define SrcAcc (OpAcc << SrcShift)
#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcImm64 (OpImm64 << SrcShift)
#define SrcDX (OpDX << SrcShift)
#define SrcMem8 (OpMem8 << SrcShift)
#define SrcMask (OpMask << SrcShift)
@@ -113,6 +116,7 @@
#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape (5<<15) /* Escape to coprocessor instruction */
#define Sse (1<<18) /* SSE Vector instruction */
/* Generic ModRM decode. */
#define ModRM (1<<19)
@@ -146,6 +150,8 @@
#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
+#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
+#define NoWrite ((u64)1 << 45) /* No writeback */
#define X2(x...) x, x
#define X3(x...) X2(x), x
@@ -156,6 +162,27 @@
#define X8(x...) X4(x), X4(x)
#define X16(x...) X8(x), X8(x)
+#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
+#define FASTOP_SIZE 8
+
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst: [rdx]:rax (in/out)
+ * src: rbx (in/out)
+ * src2: rcx (in)
+ * flags: rflags (in/out)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ *
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+
+struct fastop;
+
struct opcode {
u64 flags : 56;
u64 intercept : 8;
@@ -164,6 +191,8 @@ struct opcode {
const struct opcode *group;
const struct group_dual *gdual;
const struct gprefix *gprefix;
+ const struct escape *esc;
+ void (*fastop)(struct fastop *fake);
} u;
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
};
@@ -180,6 +209,11 @@ struct gprefix {
struct opcode pfx_f3;
};
+struct escape {
+ struct opcode op[8];
+ struct opcode high[64];
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
} \
} while (0)
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
+
+#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
+#define FOP_RET "ret \n\t"
+
+#define FOP_START(op) \
+ extern void em_##op(struct fastop *fake); \
+ asm(".pushsection .text, \"ax\" \n\t" \
+ ".global em_" #op " \n\t" \
+ FOP_ALIGN \
+ "em_" #op ": \n\t"
+
+#define FOP_END \
+ ".popsection")
+
+#define FOPNOP() FOP_ALIGN FOP_RET
+
+#define FOP1E(op, dst) \
+ FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+
+#define FASTOP1(op) \
+ FOP_START(op) \
+ FOP1E(op##b, al) \
+ FOP1E(op##w, ax) \
+ FOP1E(op##l, eax) \
+ ON64(FOP1E(op##q, rax)) \
+ FOP_END
+
+#define FOP2E(op, dst, src) \
+ FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
+
+#define FASTOP2(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, bl) \
+ FOP2E(op##w, ax, bx) \
+ FOP2E(op##l, eax, ebx) \
+ ON64(FOP2E(op##q, rax, rbx)) \
+ FOP_END
+
+/* 2 operand, word only */
+#define FASTOP2W(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP2E(op##w, ax, bx) \
+ FOP2E(op##l, eax, ebx) \
+ ON64(FOP2E(op##q, rax, rbx)) \
+ FOP_END
+
+/* 2 operand, src is CL */
+#define FASTOP2CL(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, cl) \
+ FOP2E(op##w, ax, cl) \
+ FOP2E(op##l, eax, cl) \
+ ON64(FOP2E(op##q, rax, cl)) \
+ FOP_END
+
+#define FOP3E(op, dst, src, src2) \
+ FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
+
+/* 3-operand, word-only, src2=cl */
+#define FASTOP3WCL(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP3E(op##w, ax, bx, cl) \
+ FOP3E(op##l, eax, ebx, cl) \
+ ON64(FOP3E(op##q, rax, rbx, cl)) \
+ FOP_END
+
+/* Special case for SETcc - 1 instruction per cc */
+#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
+
+FOP_START(setcc)
+FOP_SETCC(seto)
+FOP_SETCC(setno)
+FOP_SETCC(setc)
+FOP_SETCC(setnc)
+FOP_SETCC(setz)
+FOP_SETCC(setnz)
+FOP_SETCC(setbe)
+FOP_SETCC(setnbe)
+FOP_SETCC(sets)
+FOP_SETCC(setns)
+FOP_SETCC(setp)
+FOP_SETCC(setnp)
+FOP_SETCC(setl)
+FOP_SETCC(setnl)
+FOP_SETCC(setle)
+FOP_SETCC(setnle)
+FOP_END;
+
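Because every stub emitted by the FASTOP* macros above is padded to FASTOP_SIZE bytes, the byte/word/long/quad variant of an operation can be reached by plain pointer arithmetic on the em_<op> label; that is what the fastop() helper added further down relies on. A minimal sketch (illustrative only, assuming dst.bytes is 1, 2, 4 or 8):

	void (*entry)(struct fastop *);
	entry = (void *)em_add + __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
	/* __ffs() yields 0, 1, 2, 3 -> the addb, addw, addl, addq stub */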
#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
do { \
unsigned long _tmp; \
@@ -663,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
ulong la;
u32 lim;
u16 sel;
- unsigned cpl, rpl;
+ unsigned cpl;
la = seg_base(ctxt, addr.seg) + addr.ea;
switch (ctxt->mode) {
@@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
goto bad;
}
cpl = ctxt->ops->cpl(ctxt);
- if (ctxt->mode == X86EMUL_MODE_REAL)
- rpl = 0;
- else
- rpl = sel & 3;
- cpl = max(cpl, rpl);
if (!(desc.type & 8)) {
/* data segment */
if (cpl > desc.dpl)
@@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static int test_cc(unsigned int condition, unsigned int flags)
-{
- int rc = 0;
-
- switch ((condition & 15) >> 1) {
- case 0: /* o */
- rc |= (flags & EFLG_OF);
- break;
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- break;
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- break;
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
- break;
- case 4: /* s */
- rc |= (flags & EFLG_SF);
- break;
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
- break;
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
- break;
- }
-
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
+FASTOP2(add);
+FASTOP2(or);
+FASTOP2(adc);
+FASTOP2(sbb);
+FASTOP2(and);
+FASTOP2(sub);
+FASTOP2(xor);
+FASTOP2(cmp);
+FASTOP2(test);
+
+FASTOP3WCL(shld);
+FASTOP3WCL(shrd);
+
+FASTOP2W(imul);
+
+FASTOP1(not);
+FASTOP1(neg);
+FASTOP1(inc);
+FASTOP1(dec);
+
+FASTOP2CL(rol);
+FASTOP2CL(ror);
+FASTOP2CL(rcl);
+FASTOP2CL(rcr);
+FASTOP2CL(shl);
+FASTOP2CL(shr);
+FASTOP2CL(sar);
+
+FASTOP2W(bsf);
+FASTOP2W(bsr);
+FASTOP2W(bt);
+FASTOP2W(bts);
+FASTOP2W(btr);
+FASTOP2W(btc);
+
+static u8 test_cc(unsigned int condition, unsigned long flags)
+{
+ u8 rc;
+ void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+
+ flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ asm("push %[flags]; popf; call *%[fastop]"
+ : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
+ return rc;
}
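The SETcc block uses the same trick with a 4-byte stride: each entry is a 3-byte setcc plus a 1-byte ret, aligned to 4, so test_cc() above indexes the table directly by the condition-code nibble. For example (illustrative, standard x86 condition-code encoding assumed):

	/* cc 0x4 is "e"/"z", so this points at the setz stub: */
	void (*setz_stub)(void) = (void *)em_setcc + 4 * 0x4;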
static void fetch_register_operand(struct operand *op)
@@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
ctxt->ops->put_fpu(ctxt);
}
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ ctxt->ops->get_fpu(ctxt);
+ asm volatile("fninit");
+ ctxt->ops->put_fpu(ctxt);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fcw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ ctxt->ops->get_fpu(ctxt);
+ asm volatile("fnstcw %0": "+m"(fcw));
+ ctxt->ops->put_fpu(ctxt);
+
+ /* force 2 byte destination */
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = fcw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fsw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ ctxt->ops->get_fpu(ctxt);
+ asm volatile("fnstsw %0": "+m"(fsw));
+ ctxt->ops->put_fpu(ctxt);
+
+ /* force 2 byte destination */
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = fsw;
+
+ return X86EMUL_CONTINUE;
+}
+
static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
@@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
{
int rc;
+ if (ctxt->d & NoWrite)
+ return X86EMUL_CONTINUE;
+
switch (ctxt->dst.type) {
case OP_REG:
write_register_operand(&ctxt->dst);
@@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_grp2(struct x86_emulate_ctxt *ctxt)
-{
- switch (ctxt->modrm_reg) {
- case 0: /* rol */
- emulate_2op_SrcB(ctxt, "rol");
- break;
- case 1: /* ror */
- emulate_2op_SrcB(ctxt, "ror");
- break;
- case 2: /* rcl */
- emulate_2op_SrcB(ctxt, "rcl");
- break;
- case 3: /* rcr */
- emulate_2op_SrcB(ctxt, "rcr");
- break;
- case 4: /* sal/shl */
- case 6: /* sal/shl */
- emulate_2op_SrcB(ctxt, "sal");
- break;
- case 5: /* shr */
- emulate_2op_SrcB(ctxt, "shr");
- break;
- case 7: /* sar */
- emulate_2op_SrcB(ctxt, "sar");
- break;
- }
- return X86EMUL_CONTINUE;
-}
-
-static int em_not(struct x86_emulate_ctxt *ctxt)
-{
- ctxt->dst.val = ~ctxt->dst.val;
- return X86EMUL_CONTINUE;
-}
-
-static int em_neg(struct x86_emulate_ctxt *ctxt)
-{
- emulate_1op(ctxt, "neg");
- return X86EMUL_CONTINUE;
-}
-
static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
{
u8 ex = 0;
@@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
int rc = X86EMUL_CONTINUE;
switch (ctxt->modrm_reg) {
- case 0: /* inc */
- emulate_1op(ctxt, "inc");
- break;
- case 1: /* dec */
- emulate_1op(ctxt, "dec");
- break;
case 2: /* call near abs */ {
long int old_eip;
old_eip = ctxt->_eip;
@@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
/* Save real source value, then compare EAX against destination. */
ctxt->src.orig_val = ctxt->src.val;
ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
- emulate_2op_SrcV(ctxt, "cmp");
+ fastop(ctxt, em_cmp);
if (ctxt->eflags & EFLG_ZF) {
/* Success: write back to memory. */
@@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
ctxt->src.type = OP_IMM;
ctxt->src.val = 0;
ctxt->src.bytes = 1;
- emulate_2op_SrcV(ctxt, "or");
+ fastop(ctxt, em_or);
ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
if (cf)
ctxt->eflags |= X86_EFLAGS_CF;
@@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al = ctxt->dst.val & 0xff;
+ u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+ al = (al + (ah * ctxt->src.val)) & 0xff;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
+
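A worked example of the new AAD helper (base 10, i.e. the usual D5 0A encoding), to make the masking explicit; the values are illustrative only:

	/* dst.val = 0x0207 (AH=2, AL=7), src.val = 10            */
	/* al = (0x07 + 0x02 * 10) & 0xff = 0x1b                  */
	/* dst.val = (0x0207 & 0xffff0000) | 0x1b = 0x0000001b    */
	/* i.e. AH is cleared, and the OR-with-zero fastop then   */
	/* sets PF/ZF/SF from the 8-bit result.                   */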
static int em_call(struct x86_emulate_ctxt *ctxt)
{
long rel = ctxt->src.val;
@@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_add(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "add");
- return X86EMUL_CONTINUE;
-}
-
-static int em_or(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "or");
- return X86EMUL_CONTINUE;
-}
-
-static int em_adc(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "adc");
- return X86EMUL_CONTINUE;
-}
-
-static int em_sbb(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "sbb");
- return X86EMUL_CONTINUE;
-}
-
-static int em_and(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "and");
- return X86EMUL_CONTINUE;
-}
-
-static int em_sub(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "sub");
- return X86EMUL_CONTINUE;
-}
-
-static int em_xor(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "xor");
- return X86EMUL_CONTINUE;
-}
-
-static int em_cmp(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "cmp");
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
-static int em_test(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV(ctxt, "test");
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
-}
-
static int em_xchg(struct x86_emulate_ctxt *ctxt)
{
/* Write back the register source. */
@@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_imul(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "imul");
- return X86EMUL_CONTINUE;
-}
-
static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.val = ctxt->src2.val;
- return em_imul(ctxt);
+ return fastop(ctxt, em_imul);
}
static int em_cwd(struct x86_emulate_ctxt *ctxt)
@@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_bt(struct x86_emulate_ctxt *ctxt)
-{
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- /* only subword offset */
- ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-
- emulate_2op_SrcV_nobyte(ctxt, "bt");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bts(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "bts");
- return X86EMUL_CONTINUE;
-}
-
-static int em_btr(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "btr");
- return X86EMUL_CONTINUE;
-}
-
-static int em_btc(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "btc");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bsf(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "bsf");
- return X86EMUL_CONTINUE;
-}
-
-static int em_bsr(struct x86_emulate_ctxt *ctxt)
-{
- emulate_2op_SrcV_nobyte(ctxt, "bsr");
- return X86EMUL_CONTINUE;
-}
-
static int em_cpuid(struct x86_emulate_ctxt *ctxt)
{
u32 eax, ebx, ecx, edx;
@@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
#define II(_f, _e, _i) \
{ .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
#define IIP(_f, _e, _i, _p) \
@@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define D2bv(_f) D((_f) | ByteOp), D(_f)
#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
#define I2bvIP(_f, _e, _i, _p) \
IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
-#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
- I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
- I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
static const struct opcode group7_rm1[] = {
DI(SrcNone | Priv, monitor),
@@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = {
};
static const struct opcode group1[] = {
- I(Lock, em_add),
- I(Lock | PageTable, em_or),
- I(Lock, em_adc),
- I(Lock, em_sbb),
- I(Lock | PageTable, em_and),
- I(Lock, em_sub),
- I(Lock, em_xor),
- I(0, em_cmp),
+ F(Lock, em_add),
+ F(Lock | PageTable, em_or),
+ F(Lock, em_adc),
+ F(Lock, em_sbb),
+ F(Lock | PageTable, em_and),
+ F(Lock, em_sub),
+ F(Lock, em_xor),
+ F(NoWrite, em_cmp),
};
static const struct opcode group1A[] = {
I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
};
+static const struct opcode group2[] = {
+ F(DstMem | ModRM, em_rol),
+ F(DstMem | ModRM, em_ror),
+ F(DstMem | ModRM, em_rcl),
+ F(DstMem | ModRM, em_rcr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_shr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_sar),
+};
+
static const struct opcode group3[] = {
- I(DstMem | SrcImm, em_test),
- I(DstMem | SrcImm, em_test),
- I(DstMem | SrcNone | Lock, em_not),
- I(DstMem | SrcNone | Lock, em_neg),
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcNone | Lock, em_not),
+ F(DstMem | SrcNone | Lock, em_neg),
I(SrcMem, em_mul_ex),
I(SrcMem, em_imul_ex),
I(SrcMem, em_div_ex),
@@ -3640,14 +3701,14 @@ static const struct opcode group3[] = {
};
static const struct opcode group4[] = {
- I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
- I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
+ F(ByteOp | DstMem | SrcNone | Lock, em_inc),
+ F(ByteOp | DstMem | SrcNone | Lock, em_dec),
N, N, N, N, N, N,
};
static const struct opcode group5[] = {
- I(DstMem | SrcNone | Lock, em_grp45),
- I(DstMem | SrcNone | Lock, em_grp45),
+ F(DstMem | SrcNone | Lock, em_inc),
+ F(DstMem | SrcNone | Lock, em_dec),
I(SrcMem | Stack, em_grp45),
I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
I(SrcMem | Stack, em_grp45),
@@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { {
static const struct opcode group8[] = {
N, N, N, N,
- I(DstMem | SrcImmByte, em_bt),
- I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
- I(DstMem | SrcImmByte | Lock, em_btr),
- I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+ F(DstMem | SrcImmByte | NoWrite, em_bt),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ F(DstMem | SrcImmByte | Lock, em_btr),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
};
static const struct group_dual group9 = { {
@@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = {
I(0, em_mov), N, N, N,
};
+static const struct escape escape_d9 = { {
+ N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+ N, N, N, N, N, N, N, N,
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+ N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
static const struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
- I6ALU(Lock, em_add),
+ F6ALU(Lock, em_add),
I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
/* 0x08 - 0x0F */
- I6ALU(Lock | PageTable, em_or),
+ F6ALU(Lock | PageTable, em_or),
I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
N,
/* 0x10 - 0x17 */
- I6ALU(Lock, em_adc),
+ F6ALU(Lock, em_adc),
I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
/* 0x18 - 0x1F */
- I6ALU(Lock, em_sbb),
+ F6ALU(Lock, em_sbb),
I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
/* 0x20 - 0x27 */
- I6ALU(Lock | PageTable, em_and), N, N,
+ F6ALU(Lock | PageTable, em_and), N, N,
/* 0x28 - 0x2F */
- I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+ F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
/* 0x30 - 0x37 */
- I6ALU(Lock, em_xor), N, N,
+ F6ALU(Lock, em_xor), N, N,
/* 0x38 - 0x3F */
- I6ALU(0, em_cmp), N, N,
+ F6ALU(NoWrite, em_cmp), N, N,
/* 0x40 - 0x4F */
- X16(D(DstReg)),
+ X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
/* 0x50 - 0x57 */
X8(I(SrcReg | Stack, em_push)),
/* 0x58 - 0x5F */
@@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = {
G(DstMem | SrcImm, group1),
G(ByteOp | DstMem | SrcImm | No64, group1),
G(DstMem | SrcImmByte, group1),
- I2bv(DstMem | SrcReg | ModRM, em_test),
+ F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
/* 0x88 - 0x8F */
I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
@@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = {
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
I2bv(SrcSI | DstDI | Mov | String, em_mov),
- I2bv(SrcSI | DstDI | String, em_cmp),
+ F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
/* 0xA8 - 0xAF */
- I2bv(DstAcc | SrcImm, em_test),
+ F2bv(DstAcc | SrcImm | NoWrite, em_test),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- I2bv(SrcAcc | DstDI | String, em_cmp),
+ F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
/* 0xB0 - 0xB7 */
X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
/* 0xB8 - 0xBF */
- X8(I(DstReg | SrcImm | Mov, em_mov)),
+ X8(I(DstReg | SrcImm64 | Mov, em_mov)),
/* 0xC0 - 0xC7 */
- D2bv(DstMem | SrcImmByte | ModRM),
+ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
I(ImplicitOps | Stack, em_ret),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
@@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = {
D(ImplicitOps), DI(SrcImmByte, intn),
D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
- D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
- N, N, N, N,
+ G(Src2One | ByteOp, group2), G(Src2One, group2),
+ G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+ N, I(DstAcc | SrcImmByte | No64, em_aad), N, N,
/* 0xD8 - 0xDF */
- N, N, N, N, N, N, N, N,
+ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
/* 0xE0 - 0xE7 */
X3(I(SrcImmByte, em_loop)),
I(SrcImmByte, em_jcxz),
@@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = {
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
- II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM), N, N,
+ II(ImplicitOps, em_cpuid, cpuid),
+ F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
DI(ImplicitOps, rsm),
- I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
- D(DstMem | SrcReg | Src2ImmByte | ModRM),
- D(DstMem | SrcReg | Src2CL | ModRM),
- D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
+ D(ModRM), F(DstReg | SrcMem | ModRM, em_imul),
/* 0xB0 - 0xB7 */
I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
- I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xB8 - 0xBF */
N, N,
G(BitOp, group8),
- I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
- I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xC7 */
D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
case 4:
op->val = insn_fetch(s32, ctxt);
break;
+ case 8:
+ op->val = insn_fetch(s64, ctxt);
+ break;
}
if (!sign_extension) {
switch (op->bytes) {
@@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
case OpImm:
rc = decode_imm(ctxt, op, imm_size(ctxt), true);
break;
+ case OpImm64:
+ rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
+ break;
case OpMem8:
ctxt->memop.bytes = 1;
goto mem_common;
@@ -4222,6 +4354,12 @@ done_prefixes:
case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
}
break;
+ case Escape:
+ if (ctxt->modrm > 0xbf)
+ opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
+ else
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ break;
default:
return EMULATION_FAILED;
}
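Two concrete decodes through the new Escape tables, for illustration (the variable name mirrors the decoder above):

	/* DB E3: modrm = 0xe3 >= 0xc0, so the high[] table is used        */
	opcode = escape_db.high[0xe3 - 0xc0];	/* -> I(ImplicitOps, em_fninit) */
	/* D9 /7 with a memory operand, e.g. modrm = 0x3d (mod=0, reg=7)   */
	opcode = escape_d9.op[(0x3d >> 3) & 7];	/* -> I(DstMem, em_fnstcw)      */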
@@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
}
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
+{
+ ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+ asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
+ : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
+ : "c"(ctxt->src2.val), [fastop]"S"(fop));
+ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ return X86EMUL_CONTINUE;
+}
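A minimal sketch of what a Fastop dispatch amounts to at a call site (illustrative values; the real hook is the ctxt->d & Fastop branch added further down): the destination travels in rax, the source in rbx, src2 in rcx, and the arithmetic flags are round-tripped through pushf/popf as described in the calling-convention comment above.

	ctxt->dst.bytes = 4;
	ctxt->dst.val = 1;
	ctxt->src.val = 2;
	rc = fastop(ctxt, em_add);	/* reaches the addl stub             */
	/* afterwards ctxt->dst.val == 3 and ctxt->eflags is updated in    */
	/* the bits covered by EFLAGS_MASK                                 */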
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
@@ -4483,6 +4631,13 @@ special_insn:
}
if (ctxt->execute) {
+ if (ctxt->d & Fastop) {
+ void (*fop)(struct fastop *) = (void *)ctxt->execute;
+ rc = fastop(ctxt, fop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
rc = ctxt->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -4493,12 +4648,6 @@ special_insn:
goto twobyte_insn;
switch (ctxt->b) {
- case 0x40 ... 0x47: /* inc r16/r32 */
- emulate_1op(ctxt, "inc");
- break;
- case 0x48 ... 0x4f: /* dec r16/r32 */
- emulate_1op(ctxt, "dec");
- break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
@@ -4523,9 +4672,6 @@ special_insn:
case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
}
break;
- case 0xc0 ... 0xc1:
- rc = em_grp2(ctxt);
- break;
case 0xcc: /* int3 */
rc = emulate_int(ctxt, 3);
break;
@@ -4536,13 +4682,6 @@ special_insn:
if (ctxt->eflags & EFLG_OF)
rc = emulate_int(ctxt, 4);
break;
- case 0xd0 ... 0xd1: /* Grp2 */
- rc = em_grp2(ctxt);
- break;
- case 0xd2 ... 0xd3: /* Grp2 */
- ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
- rc = em_grp2(ctxt);
- break;
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
jmp_rel(ctxt, ctxt->src.val);
@@ -4661,14 +4800,6 @@ twobyte_insn:
case 0x90 ... 0x9f: /* setcc r/m8 */
ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
break;
- case 0xa4: /* shld imm8, r, r/m */
- case 0xa5: /* shld cl, r, r/m */
- emulate_2op_cl(ctxt, "shld");
- break;
- case 0xac: /* shrd imm8, r, r/m */
- case 0xad: /* shrd cl, r, r/m */
- emulate_2op_cl(ctxt, "shrd");
- break;
case 0xae: /* clflush */
break;
case 0xb6 ... 0xb7: /* movzx */
@@ -4682,7 +4813,7 @@ twobyte_insn:
(s16) ctxt->src.val;
break;
case 0xc0 ... 0xc1: /* xadd */
- emulate_2op_SrcV(ctxt, "add");
+ fastop(ctxt, em_add);
/* Write back the register source. */
ctxt->src.val = ctxt->dst.orig_val;
write_register_operand(&ctxt->src);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 11300d2fa71..c1d30b2fc9b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm)
*/
remaining = hrtimer_get_remaining(&ps->timer);
elapsed = ps->period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->period);
return elapsed;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 848206df096..cc31f7c06d3 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
+ s->output = 0;
+
pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1618b..484bc874688 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
+ * check if there is pending interrupt from
+ * non-APIC source without intack.
+ */
+static int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+ if (kvm_apic_accept_pic_intr(v))
+ return pic_irqchip(v->kvm)->output; /* PIC */
+ else
+ return 0;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery is enabled,
+ * interrupts from the apic are handled by hardware,
+ * so we don't need to check them here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+ if (!irqchip_in_kernel(v->kvm))
+ return v->arch.interrupt.pending;
+
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ if (kvm_apic_vid_enabled(v->kvm))
+ return 0;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+
+/*
* check if there is pending interrupt without
* intack.
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
-
if (!irqchip_in_kernel(v->kvm))
return v->arch.interrupt.pending;
- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm); /* PIC */
- return s->output;
- } else
- return 0;
- }
- return 1;
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
}
EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
/*
+ * Read the pending interrupt vector (from a non-APIC
+ * source) and acknowledge it.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+ return -1;
+}
+
+/*
* Read pending interrupt vector and intack.
*/
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
int vector;
if (!irqchip_in_kernel(v->kvm))
return v->arch.interrupt.nr;
- vector = kvm_get_apic_interrupt(v); /* APIC */
- if (vector == -1) {
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm);
- s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(v->kvm);
- }
- }
- return vector;
+ vector = kvm_cpu_get_extint(v);
+
+ if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+ return vector; /* PIC */
+
+ return kvm_get_apic_interrupt(v); /* APIC */
}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
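In short, the refactored helpers keep the previous priority order while carving out the case where virtual interrupt delivery handles the LAPIC in hardware; roughly:

/*
 * kvm_cpu_get_interrupt(), sketched:
 *   1. userspace irqchip   -> return the vector queued by userspace
 *   2. PIC extint pending  -> kvm_pic_read_irq()
 *   3. APICv (vid) enabled -> LAPIC interrupts are injected by hardware,
 *                             so only a PIC vector is ever returned here
 *   4. otherwise           -> kvm_get_apic_interrupt()
 */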
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f527f10..02b51dd4e4a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
- return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
static inline int kvm_apic_id(struct kvm_lapic *apic)
{
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_irq *irq,
+ u64 *eoi_exit_bitmap)
{
- u16 cid;
- ldr >>= 32 - map->ldr_bits;
- cid = (ldr >> map->cid_shift) & map->cid_mask;
+ struct kvm_lapic **dst;
+ struct kvm_apic_map *map;
+ unsigned long bitmap = 1;
+ int i;
- BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+ rcu_read_lock();
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
- return cid;
-}
+ if (unlikely(!map)) {
+ __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
+ goto out;
+ }
-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
- ldr >>= (32 - map->ldr_bits);
- return ldr & map->lid_mask;
+ if (irq->dest_mode == 0) { /* physical mode */
+ if (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->dest_id == 0xff) {
+ __set_bit(irq->vector,
+ (unsigned long *)eoi_exit_bitmap);
+ goto out;
+ }
+ dst = &map->phys_map[irq->dest_id & 0xff];
+ } else {
+ u32 mda = irq->dest_id << (32 - map->ldr_bits);
+
+ dst = map->logical_map[apic_cluster_id(map, mda)];
+
+ bitmap = apic_logical_id(map, mda);
+ }
+
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (dst[i]->vcpu == vcpu) {
+ __set_bit(irq->vector,
+ (unsigned long *)eoi_exit_bitmap);
+ break;
+ }
+ }
+
+out:
+ rcu_read_unlock();
}
static void recalculate_apic_map(struct kvm *kvm)
@@ -230,6 +255,8 @@ out:
if (old)
kfree_rcu(old, rcu);
+
+ kvm_ioapic_make_eoibitmap_request(kvm);
}
static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
+ /*
+ * Note that irr_pending is just a hint. It will always be
+ * true with virtual interrupt delivery enabled.
+ */
if (!apic->irr_pending)
return -1;
@@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
{
int result;
+
+ /* Note that isr_count is always 1 with vid enabled */
if (!apic->isr_count)
return -1;
if (likely(apic->highest_isr_cache != -1))
@@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+ kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+ int trigger_mode;
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ }
+}
+
static int apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
@@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
- kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
- int trigger_mode;
- if (apic_test_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
- }
+ kvm_ioapic_send_eoi(apic, vector);
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
return vector;
}
+/*
+ * This interface assumes a trap-like exit, which has already finished
+ * the desired side effects, including the vISR and vPPR updates.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_eoi(apic, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
static void apic_send_ipi(struct kvm_lapic *apic)
{
u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
@@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+ u32 val = 0;
+
+ /* hw has done the conditional check and inst decode */
+ offset &= 0xff0;
+
+ apic_reg_read(vcpu->arch.apic, offset, 4, &val);
+
+ /* TODO: optimize to just emulate side effect w/o one more write */
+ apic_reg_write(vcpu->arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
+ u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic) {
@@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
value &= ~MSR_IA32_APICBASE_BSP;
vcpu->arch.apic_base = value;
- if (apic_x2apic_mode(apic)) {
- u32 id = kvm_apic_id(apic);
- u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
- kvm_apic_set_ldr(apic, ldr);
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE) {
+ u32 id = kvm_apic_id(apic);
+ u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+ kvm_apic_set_ldr(apic, ldr);
+ kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
+ } else
+ kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
}
+
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
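When the x2APIC enable bit is toggled on, the logical destination register is derived from the APIC ID as a cluster/bit pair; a worked example of the formula above:

	u32 id  = 0x23;						/* cluster 2, bit 3 */
	u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));	/* = 0x00020008     */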
@@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = false;
- apic->isr_count = 0;
+ apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+ apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
- apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+ apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
+ 1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f3571..1676d34ddb4 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
@@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
}
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+{
+ return kvm_x86_ops->vm_has_apicv(kvm);
+}
+
+static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+{
+ u16 cid;
+ ldr >>= 32 - map->ldr_bits;
+ cid = (ldr >> map->cid_shift) & map->cid_mask;
+
+ BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+
+ return cid;
+}
+
+static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
+{
+ ldr >>= (32 - map->ldr_bits);
+ return ldr & map->lid_mask;
+}
+
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_irq *irq,
+ u64 *eoi_bitmap);
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2ad05f..4ed3edbe06b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
static bool spte_is_locklessly_modifiable(u64 spte)
{
- return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+ return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+ (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}
static bool spte_has_volatile_bits(u64 spte)
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;
- max_level = kvm_x86_ops->get_lpage_level() < host_level ?
- kvm_x86_ops->get_lpage_level() : host_level;
+ max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
}
static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
- int level, bool pt_protect)
+ bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+ __rmap_write_protect(kvm, rmapp, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
for (i = PT_PAGE_TABLE_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+ write_protected |= __rmap_write_protect(kvm, rmapp, true);
}
return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
- if (!sp->role.direct)
- free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
list_del(&sp->link);
free_page((unsigned long)sp->spt);
+ if (!sp->role.direct)
+ free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
sp->parent_ptes = 0;
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1973,9 +1958,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
u64 spte;
- spte = __pa(sp->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
+ spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
mmu_spte_set(sptep, spte);
}
@@ -2126,7 +2111,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_isolate_page(sp);
kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
}
@@ -2144,6 +2128,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
* change the value
*/
+ spin_lock(&kvm->mmu_lock);
+
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
!list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
}
kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+ spin_unlock(&kvm->mmu_lock);
}
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2183,14 +2171,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
- int slot = memslot_id(kvm, gfn);
- struct kvm_mmu_page *sp = page_header(__pa(pte));
-
- __set_bit(slot, sp->slot_bitmap);
-}
-
/*
* The function is based on mtrr_type_lookup() in
* arch/x86/kernel/cpu/mtrr/generic.c
@@ -2332,9 +2312,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
if (s->role.level != PT_PAGE_TABLE_LEVEL)
return 1;
- if (!need_unsync && !s->unsync) {
+ if (!s->unsync)
need_unsync = true;
- }
}
if (need_unsync)
kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2321,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
}
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int user_fault,
- int write_fault, int level,
+ unsigned pte_access, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -2378,20 +2356,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
- if ((pte_access & ACC_WRITE_MASK)
- || (!vcpu->arch.mmu.direct_map && write_fault
- && !is_write_protection(vcpu) && !user_fault)) {
+ if (pte_access & ACC_WRITE_MASK) {
/*
- * There are two cases:
- * - the one is other vcpu creates new sp in the window
- * between mapping_level() and acquiring mmu-lock.
- * - the another case is the new sp is created by itself
- * (page-fault path) when guest uses the target gfn as
- * its page table.
- * Both of these cases can be fixed by allowing guest to
- * retry the access, it will refault, then we can establish
- * the mapping by using small page.
+ * Another vcpu may create a new sp in the window between
+ * mapping_level() and acquiring the mmu-lock. We can
+ * allow the guest to retry the access; the mapping can
+ * be fixed when the guest refaults.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2370,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
- if (!vcpu->arch.mmu.direct_map
- && !(pte_access & ACC_WRITE_MASK)) {
- spte &= ~PT_USER_MASK;
- /*
- * If we converted a user page to a kernel page,
- * so that the kernel can write to it when cr0.wp=0,
- * then we should prevent the kernel from executing it
- * if SMEP is enabled.
- */
- if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
- spte |= PT64_NX_MASK;
- }
-
/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2399,15 @@ done:
}
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault,
- int *emulate, int level, gfn_t gfn,
- pfn_t pfn, bool speculative,
+ unsigned pte_access, int write_fault, int *emulate,
+ int level, gfn_t gfn, pfn_t pfn, bool speculative,
bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %llx\n",
- __func__, *sptep, pt_access,
- write_fault, user_fault, gfn);
+ pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+ *sptep, write_fault, gfn);
if (is_rmap_spte(*sptep)) {
/*
@@ -2477,9 +2431,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
was_rmapped = 1;
}
- if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- level, gfn, pfn, speculative, true,
- host_writable)) {
+ if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+ true, host_writable)) {
if (write_fault)
*emulate = 1;
kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2450,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
++vcpu->kvm->stat.lpages;
if (is_shadow_present_pte(*sptep)) {
- page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2523,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, NULL,
- sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
+ mmu_set_spte(vcpu, start, access, 0, NULL,
+ sp->role.level, gfn, page_to_pfn(pages[i]),
+ true, true);
return 0;
}
@@ -2633,11 +2584,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- unsigned pte_access = ACC_ALL;
-
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, &emulate,
- level, gfn, pfn, prefault, map_writable);
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+ write, &emulate, level, gfn, pfn,
+ prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2652,11 +2601,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
- mmu_spte_set(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask
- | shadow_accessed_mask);
+ link_shadow_page(iterator.sptep, sp);
}
}
return emulate;
@@ -3719,6 +3664,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
else
r = paging32_init_context(vcpu, context);
+ vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3831,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
- r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+ r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
if (r)
gentry = 0;
new = (const u8 *)&gentry;
@@ -4039,7 +3985,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
& mask.word) && rmap_can_add(vcpu))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- if (!remote_flush && need_remote_flush(entry, *spte))
+ if (need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
}
@@ -4198,26 +4144,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
- struct kvm_mmu_page *sp;
- bool flush = false;
+ struct kvm_memory_slot *memslot;
+ gfn_t last_gfn;
+ int i;
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
- int i;
- u64 *pt;
+ memslot = id_to_memslot(kvm->memslots, slot);
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
- if (!test_bit(slot, sp->slot_bitmap))
- continue;
+ spin_lock(&kvm->mmu_lock);
- pt = sp->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_shadow_present_pte(pt[i]) ||
- !is_last_spte(pt[i], sp->role.level))
- continue;
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ unsigned long *rmapp;
+ unsigned long last_index, index;
- spte_write_protect(kvm, &pt[i], &flush, false);
+ rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ __rmap_write_protect(kvm, rmapp, false);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_flush_remote_tlbs(kvm);
+ cond_resched_lock(&kvm->mmu_lock);
+ }
}
}
+
kvm_flush_remote_tlbs(kvm);
+ spin_unlock(&kvm->mmu_lock);
}
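The rewritten slot write-protect walk holds mmu_lock only for bounded stretches: pending changes are flushed to remote TLBs before cond_resched_lock() may drop the lock. Reduced to a sketch:

	spin_lock(&kvm->mmu_lock);
	for (index = 0; index <= last_index; ++index) {
		/* ... write-protect one rmap bucket ... */
		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
			kvm_flush_remote_tlbs(kvm);	   /* publish before dropping */
			cond_resched_lock(&kvm->mmu_lock); /* drop, resched, re-take  */
		}
	}
	kvm_flush_remote_tlbs(kvm);
	spin_unlock(&kvm->mmu_lock);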
void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index cd6e98333ba..b8f6172f417 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_ARGS(sp)
);
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
TRACE_EVENT(
mark_mmio_spte,
TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 891eb6d93b8..105dd5bd550 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, pte_access, accessed_dirty, shift;
+ unsigned index, pt_access, pte_access, accessed_dirty;
gpa_t pte_gpa;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
@@ -249,16 +249,12 @@ retry_walk:
if (!write_fault)
protect_clean_gpte(&pte_access, pte);
-
- /*
- * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
- * place right.
- *
- * On a read fault, do nothing.
- */
- shift = write_fault >> ilog2(PFERR_WRITE_MASK);
- shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
- accessed_dirty &= pte >> shift;
+ else
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty by
+ * shifting it one place right.
+ */
+ accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
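	/*
	 * (Illustration, not part of the patch: the accessed bit is PTE
	 *  bit 5 and the dirty bit is bit 6, so the shift is by one.
	 *  E.g. with pte = 0x60 (A and D set), (0x60 >> 1) & 0x20 == 0x20,
	 *  so the A bit survives only when the dirty bit was set too.)
	 */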
if (unlikely(!accessed_dirty)) {
ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -330,8 +326,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+ mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
+ gfn, pfn, true, true);
return true;
}
@@ -405,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
*/
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
- int user_fault, int write_fault, int hlevel,
+ int write_fault, int hlevel,
pfn_t pfn, bool map_writable, bool prefault)
{
struct kvm_mmu_page *sp = NULL;
@@ -413,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
unsigned direct_access, access = gw->pt_access;
int top_level, emulate = 0;
- if (!is_present_gpte(gw->ptes[gw->level - 1]))
- return 0;
-
direct_access = gw->pte_access;
top_level = vcpu->arch.mmu.root_level;
@@ -477,9 +470,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
clear_sp_write_flooding_count(it.sptep);
- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, &emulate, it.level,
- gw->gfn, pfn, prefault, map_writable);
+ mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return emulate;
@@ -491,6 +483,46 @@ out_gpte_changed:
return 0;
}
+ /*
+ * Check whether the mapped gfn can write its own page table through the
+ * current mapping.
+ *
+ * It is a helper for FNAME(page_fault). When the guest uses a large page
+ * to map a writable gfn that is currently in use as a page table, kvm must
+ * be forced to map it with a small page, because the shadow page created
+ * when kvm builds the shadow page table stops the large mapping anyway.
+ * Doing this early avoids an unnecessary #PF and emulation.
+ *
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
+ * Note: the PDPT page table is not checked for a PAE 32-bit guest. That is
+ * fine because the PDPT is always shadowed, which means a large page can
+ * never be used to map the gfn that holds the PDPT.
+ */
+static bool
+FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
+ struct guest_walker *walker, int user_fault,
+ bool *write_fault_to_shadow_pgtable)
+{
+ int level;
+ gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+ bool self_changed = false;
+
+ if (!(walker->pte_access & ACC_WRITE_MASK ||
+ (!is_write_protection(vcpu) && !user_fault)))
+ return false;
+
+ for (level = walker->level; level <= walker->max_level; level++) {
+ gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+ self_changed |= !(gfn & mask);
+ *write_fault_to_shadow_pgtable |= !gfn;
+ }
+
+ return self_changed;
+}
+
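To make the mask test above concrete (a sketch; with 2 MiB guest pages KVM_PAGES_PER_HPAGE() is 512, so the mask clears the low 9 gfn bits):

	gfn_t mask = ~(gfn_t)(512 - 1);		/* ~0x1ff */
	gfn_t diff = walker->gfn ^ walker->table_gfn[level - 1];
	/* !(diff & mask): the faulting gfn and this page-table gfn share   */
	/* the same 2 MiB frame, so the mapping would cover its own tables. */
	/* !diff: the fault hits the very gfn that holds the page table,    */
	/* which is what write_fault_to_shadow_pgtable records.             */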
/*
* Page fault handler. There are several causes for a page fault:
* - there is no shadow pte for the guest pte
@@ -516,7 +548,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int level = PT_PAGE_TABLE_LEVEL;
int force_pt_level;
unsigned long mmu_seq;
- bool map_writable;
+ bool map_writable, is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -544,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
}
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+ is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+ &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
if (walker.level >= PT_DIRECTORY_LEVEL)
- force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
+ || is_self_change_mapping;
else
force_pt_level = 1;
if (!force_pt_level) {
@@ -564,6 +602,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
walker.gfn, pfn, walker.pte_access, &r))
return r;
+ /*
+ * Do not change pte_access if the pfn is an mmio page, otherwise
+ * we will cache the incorrect access into the mmio spte.
+ */
+ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_write_protection(vcpu) && !user_fault &&
+ !is_noslot_pfn(pfn)) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -572,7 +630,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
- r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
level, pfn, map_writable, prefault);
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -747,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
- set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+ set_spte(vcpu, &sp->spt[i], pte_access,
PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd1c15..e1b1ce21bc0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
+static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+ return;
+}
+
+static int svm_vm_has_apicv(struct kvm *kvm)
+{
+ return 0;
+}
+
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ return;
+}
+
+static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+ return;
+}
+
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+ .vm_has_apicv = svm_vm_has_apicv,
+ .load_eoi_exitmap = svm_load_eoi_exitmap,
+ .hwapic_isr_update = svm_hwapic_isr_update,
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1901e..6667042714c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
+static bool __read_mostly enable_apicv_reg_vid;
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -92,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO);
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);
-#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
-#define KVM_GUEST_CR0_MASK \
- (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE)
+#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
@@ -624,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
+static bool guest_state_valid(struct kvm_vcpu *vcpu);
+static u32 vmx_segment_access_rights(struct kvm_segment *var);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -638,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic;
static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;
@@ -762,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
static inline bool cpu_has_vmx_flexpriority(void)
{
return cpu_has_vmx_tpr_shadow() &&
@@ -1694,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
@@ -1820,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
vmx->guest_msrs[from] = tmp;
}
+static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+ unsigned long *msr_bitmap;
+
+ if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ } else {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy;
+ }
+
+ vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+}
+
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -1828,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs, index;
- unsigned long *msr_bitmap;
save_nmsrs = 0;
#ifdef CONFIG_X86_64
@@ -1860,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->save_nmsrs = save_nmsrs;
- if (cpu_has_vmx_msr_bitmap()) {
- if (is_long_mode(&vmx->vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode;
- else
- msr_bitmap = vmx_msr_bitmap_legacy;
-
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
- }
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_set_msr_bitmap(&vmx->vcpu);
}
/*
@@ -2533,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min2 = 0;
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_RDTSCP |
- SECONDARY_EXEC_ENABLE_INVPCID;
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -2550,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
+
+ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_2nd_exec_control &= ~(
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
@@ -2747,6 +2788,15 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ple())
ple_gap = 0;
+ if (!cpu_has_vmx_apic_register_virt() ||
+ !cpu_has_vmx_virtual_intr_delivery())
+ enable_apicv_reg_vid = 0;
+
+ if (enable_apicv_reg_vid)
+ kvm_x86_ops->update_cr8_intercept = NULL;
+ else
+ kvm_x86_ops->hwapic_irr_update = NULL;
+
if (nested)
nested_vmx_setup_ctls_msrs();
@@ -2758,18 +2808,28 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
}
-static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
+static bool emulation_required(struct kvm_vcpu *vcpu)
{
- const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- struct kvm_segment tmp = *save;
+ return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}
- if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
- tmp.base = vmcs_readl(sf->base);
- tmp.selector = vmcs_read16(sf->selector);
- tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
- tmp.s = 1;
+static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
+ struct kvm_segment *save)
+{
+ if (!emulate_invalid_guest_state) {
+ /*
+	 * CS and SS RPL should be equal during guest entry according
+	 * to the VMX spec, but in reality it is not always so. Since the
+	 * vcpu is in the middle of the transition from real mode to
+	 * protected mode, it is safe to assume that RPL 0 is a good
+	 * default value.
+ */
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
+ save->selector &= ~SELECTOR_RPL_MASK;
+ save->dpl = save->selector & SELECTOR_RPL_MASK;
+ save->s = 1;
}
- vmx_set_segment(vcpu, &tmp, seg);
+ vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2777,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->emulation_required = 1;
+ /*
+	 * Update the real mode segment cache. It may not be up to date if a
+	 * segment register was written while the vcpu was in guest mode.
+ */
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
vmx->rmode.vm86_active = 0;
vmx_segment_cache_clear(vmx);
@@ -2794,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
update_exception_bitmap(vcpu);
- if (emulate_invalid_guest_state)
- return;
-
- fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
- fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
- fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
- fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-
- vmx_segment_cache_clear(vmx);
+ fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
- vmcs_write16(GUEST_SS_SELECTOR, 0);
- vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
- vmcs_write16(GUEST_CS_SELECTOR,
- vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+ /* CPL is always 0 when CPU enters protected mode */
+ __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ vmx->cpl = 0;
}
static gva_t rmode_tss_base(struct kvm *kvm)
@@ -2831,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm)
static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xffff0);
- vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
- if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not paragraph"
- " aligned when entering protected mode (seg=%d)",
- seg);
+ struct kvm_segment var = *save;
+
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+
+ if (!emulate_invalid_guest_state) {
+ var.selector = var.base >> 4;
+ var.base = var.base & 0xffff0;
+ var.limit = 0xffff;
+ var.g = 0;
+ var.db = 0;
+ var.present = 1;
+ var.s = 1;
+ var.l = 0;
+ var.unusable = 0;
+ var.type = 0x3;
+ var.avl = 0;
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not "
+ "paragraph aligned when entering "
+ "protected mode (seg=%d)", seg);
+ }
+
+ vmcs_write16(sf->selector, var.selector);
+ vmcs_write32(sf->base, var.base);
+ vmcs_write32(sf->limit, var.limit);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_segment var;
-
- if (enable_unrestricted_guest)
- return;
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
- vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1;
-
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2888,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu);
- if (emulate_invalid_guest_state)
- goto continue_rmode;
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
- vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
+ fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
- vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
-
- vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
- vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
-
-continue_rmode:
kvm_mmu_reset_context(vcpu);
}
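
For reference, the fix_rmode_seg() path above enforces the usual vm86 constraints on a protected-mode segment: the selector becomes base >> 4, the base is truncated to a paragraph boundary, the limit becomes 64 KiB, and the access rights collapse to the 0xf3 pattern that rmode_segment_valid() later expects. The standalone sketch below (made-up struct, not KVM's kvm_segment) just shows that arithmetic.

#include <stdio.h>

struct seg { unsigned selector, base, limit, ar; };

/* Rough sketch of the vm86-compatible fixup applied above. */
static void fixup_rmode_seg(struct seg *s)
{
	s->selector = s->base >> 4;	/* selector addresses base / 16 */
	s->base    &= 0xffff0;		/* paragraph-aligned base */
	s->limit    = 0xffff;		/* 64 KiB real-mode limit */
	s->ar       = 0xf3;		/* present, DPL 3, data r/w */
}

int main(void)
{
	struct seg cs = { .base = 0xf0000 };
	fixup_rmode_seg(&cs);
	printf("selector=%#x base=%#x limit=%#x ar=%#x\n",
	       cs.selector, cs.base, cs.limit, cs.ar);
	return 0;
}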
@@ -3068,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long hw_cr0;
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
if (enable_unrestricted_guest)
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
- | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
- else
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else {
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
- if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
- enter_pmode(vcpu);
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
+ enter_pmode(vcpu);
- if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
- enter_rmode(vcpu);
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ enter_rmode(vcpu);
+ }
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
@@ -3098,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+
+	/* depends on vcpu->arch.cr0 being set to its new value */
+ vmx->emulation_required = emulation_required(vcpu);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -3155,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
+ /*
+		 * SMEP is disabled if the CPU is in non-paging mode in
+		 * hardware. However, KVM always uses paging mode to
+		 * emulate guest non-paging mode with TDP.
+		 * To emulate this behavior, SMEP needs to be manually
+		 * disabled when the guest switches to non-paging mode.
+ */
+ hw_cr4 &= ~X86_CR4_SMEP;
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
@@ -3171,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 ar;
- if (vmx->rmode.vm86_active
- && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
- || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
- || seg == VCPU_SREG_GS)) {
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
*var = vmx->rmode.segs[seg];
if (seg == VCPU_SREG_TR
|| var->selector == vmx_read_guest_seg_selector(vmx, seg))
@@ -3187,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
- if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
- ar = 0;
var->type = ar & 15;
var->s = (ar >> 4) & 1;
var->dpl = (ar >> 5) & 3;
@@ -3211,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
if (!is_protmode(vcpu))
return 0;
@@ -3220,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
&& (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
return 3;
- return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
-}
-
-static int vmx_get_cpl(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /*
- * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
- * fail; use the cache instead.
- */
- if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
- return vmx->cpl;
- }
-
if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- vmx->cpl = __vmx_get_cpl(vcpu);
+ vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
}
return vmx->cpl;
@@ -3269,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- u32 ar;
vmx_segment_cache_clear(vmx);
+ if (seg == VCPU_SREG_CS)
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
- vmcs_write16(sf->selector, var->selector);
- vmx->rmode.segs[VCPU_SREG_TR] = *var;
- return;
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ vmx->rmode.segs[seg] = *var;
+ if (seg == VCPU_SREG_TR)
+ vmcs_write16(sf->selector, var->selector);
+ else if (var->s)
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
+ goto out;
}
+
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
- if (vmx->rmode.vm86_active && var->s) {
- vmx->rmode.segs[seg] = *var;
- /*
- * Hack real-mode segments into vm86 compatibility.
- */
- if (var->base == 0xffff0000 && var->selector == 0xf000)
- vmcs_writel(sf->base, 0xf0000);
- ar = 0xf3;
- } else
- ar = vmx_segment_access_rights(var);
/*
* Fix the "Accessed" bit in AR field of segment registers for older
@@ -3304,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* kvm hack.
*/
if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
- ar |= 0x1; /* Accessed */
+ var->type |= 0x1; /* Accessed */
- vmcs_write32(sf->ar_bytes, ar);
- __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
- /*
- * Fix segments for real mode guest in hosts that don't have
- * "unrestricted_mode" or it was disabled.
- * This is done to allow migration of the guests from hosts with
- * unrestricted guest like Westmere to older host that don't have
- * unrestricted guest like Nehelem.
- */
- if (vmx->rmode.vm86_active) {
- switch (seg) {
- case VCPU_SREG_CS:
- vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
- vmcs_writel(GUEST_CS_BASE, 0xf0000);
- vmcs_write16(GUEST_CS_SELECTOR,
- vmcs_readl(GUEST_CS_BASE) >> 4);
- break;
- case VCPU_SREG_ES:
- case VCPU_SREG_DS:
- case VCPU_SREG_GS:
- case VCPU_SREG_FS:
- fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
- break;
- case VCPU_SREG_SS:
- vmcs_write16(GUEST_SS_SELECTOR,
- vmcs_readl(GUEST_SS_BASE) >> 4);
- vmcs_write32(GUEST_SS_LIMIT, 0xffff);
- vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
- break;
- }
- }
+out:
+ vmx->emulation_required |= emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3380,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
u32 ar;
vmx_get_segment(vcpu, &var, seg);
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
ar = vmx_segment_access_rights(&var);
if (var.base != (var.selector << 4))
return false;
- if (var.limit < 0xffff)
+ if (var.limit != 0xffff)
return false;
- if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
+ if (ar != 0xf3)
return false;
return true;
@@ -3521,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
*/
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
+ if (enable_unrestricted_guest)
+ return true;
+
/* real mode guest state checks */
if (!is_protmode(vcpu)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3644,12 +3672,9 @@ static void seg_setup(int seg)
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
- if (enable_unrestricted_guest) {
- ar = 0x93;
- if (seg == VCPU_SREG_CS)
- ar |= 0x08; /* code segment */
- } else
- ar = 0xf3;
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
vmcs_write32(sf->ar_bytes, ar);
}
@@ -3667,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
if (r)
goto out;
@@ -3697,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
kvm_userspace_mem.guest_phys_addr =
kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+ r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
if (r)
goto out;
@@ -3739,7 +3764,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
spin_unlock(&vmx_vpid_lock);
}
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
{
int f = sizeof(unsigned long);
@@ -3752,20 +3780,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
*/
if (msr <= 0x1fff) {
- __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
- __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __clear_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __set_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __set_bit(msr, msr_bitmap + 0xc00 / f);
+
}
}
static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
{
if (!longmode_only)
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
+ msr, MSR_TYPE_R | MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
+ msr, MSR_TYPE_R | MSR_TYPE_W);
+}
+
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+{
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+{
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+}
+
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+{
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_W);
}
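
The offsets used by the intercept helpers above reflect the layout of the 4 KiB VMX MSR bitmap: read-low at 0x000, read-high at 0x400, write-low at 0x800 and write-high at 0xc00, with the low sections covering MSRs 0x00000000-0x00001fff and the high sections covering 0xc0000000-0xc0001fff. The sketch below (plain byte array instead of the kernel's unsigned long bitmap, hypothetical helper name) shows the same addressing.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MSR_TYPE_R 1
#define MSR_TYPE_W 2

/* Clear the intercept bit(s) for one MSR in a 4 KiB VMX MSR bitmap. */
static void clear_intercept(uint8_t bitmap[4096], uint32_t msr, int type)
{
	uint32_t base;

	if (msr <= 0x1fff)
		base = 0x000;			/* low MSR range */
	else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		msr &= 0x1fff;
		base = 0x400;			/* high MSR range */
	} else
		return;				/* not covered by the bitmap */

	if (type & MSR_TYPE_R)
		bitmap[base + msr / 8] &= ~(1u << (msr % 8));		/* read */
	if (type & MSR_TYPE_W)
		bitmap[base + 0x800 + msr / 8] &= ~(1u << (msr % 8));	/* write */
}

int main(void)
{
	uint8_t bitmap[4096];

	memset(bitmap, 0xff, sizeof(bitmap));		/* intercept everything */
	clear_intercept(bitmap, 0x808, MSR_TYPE_W);	/* e.g. x2apic TPR writes */
	printf("byte at write-low+0x101: %#x\n", bitmap[0x800 + 0x808 / 8]);
	return 0;
}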
/*
@@ -3844,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+ return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
+}
+
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3861,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
return exec_control;
}
@@ -3905,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmx_secondary_exec_control(vmx));
}
+ if (enable_apicv_reg_vid) {
+ vmcs_write64(EOI_EXIT_BITMAP0, 0);
+ vmcs_write64(EOI_EXIT_BITMAP1, 0);
+ vmcs_write64(EOI_EXIT_BITMAP2, 0);
+ vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+ vmcs_write16(GUEST_INTR_STATUS, 0);
+ }
+
if (ple_gap) {
vmcs_write32(PLE_GAP, ple_gap);
vmcs_write32(PLE_WINDOW, ple_window);
@@ -3990,14 +4109,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx_segment_cache_clear(vmx);
seg_setup(VCPU_SREG_CS);
- /*
- * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
- * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
- */
- if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
+ if (kvm_vcpu_is_bsp(&vmx->vcpu))
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_writel(GUEST_CS_BASE, 0x000f0000);
- } else {
+ else {
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
}
@@ -4073,9 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
ret = 0;
- /* HACK: Don't enable emulation on guest boot/reset */
- vmx->emulation_required = 0;
-
return ret;
}
@@ -4251,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
.flags = 0,
};
- ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+ ret = kvm_set_memory_region(kvm, &tss_mem, false);
if (ret)
return ret;
kvm->arch.tss_addr = addr;
@@ -4261,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return 0;
}
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
- int vec, u32 err_code)
+static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
- /*
- * Instruction with address size override prefix opcode 0x67
- * Cause the #SS fault with 0 error code in VM86 mode.
- */
- if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
- return 1;
- /*
- * Forward all other exceptions that are valid in real mode.
- * FIXME: Breaks guest debugging in real mode, needs to be fixed with
- * the required debugging infrastructure rework.
- */
switch (vec) {
- case DB_VECTOR:
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return 0;
- kvm_queue_exception(vcpu, vec);
- return 1;
case BP_VECTOR:
/*
* Update instruction length as we may reinject the exception
@@ -4291,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- return 0;
+ return false;
+ /* fall through */
+ case DB_VECTOR:
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
/* fall through */
case DE_VECTOR:
case OF_VECTOR:
@@ -4301,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
case SS_VECTOR:
case GP_VECTOR:
case MF_VECTOR:
- kvm_queue_exception(vcpu, vec);
- return 1;
+ return true;
+ break;
}
- return 0;
+ return false;
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+ int vec, u32 err_code)
+{
+ /*
+	 * An instruction with the address size override prefix (opcode 0x67)
+	 * causes the #SS fault with error code 0 in VM86 mode.
+ */
+ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
+ if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt(vcpu);
+ }
+ return 1;
+ }
+ return 0;
+ }
+
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ kvm_queue_exception(vcpu, vec);
+ return 1;
}
/*
@@ -4392,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
}
- if (vmx->rmode.vm86_active &&
- handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
- error_code)) {
- if (vcpu->arch.halt_request) {
- vcpu->arch.halt_request = 0;
- return kvm_emulate_halt(vcpu);
- }
- return 1;
- }
-
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+
+ if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
+ return handle_rmode_exception(vcpu, ex_no, error_code);
+
switch (ex_no) {
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
@@ -4820,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int vector = exit_qualification & 0xff;
+
+ /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_set_eoi_accelerated(vcpu, vector);
+ return 1;
+}
+
+static int handle_apic_write(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 offset = exit_qualification & 0xfff;
+
+ /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5065,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
schedule();
}
- vmx->emulation_required = !guest_state_valid(vcpu);
+ vmx->emulation_required = emulation_required(vcpu);
out:
return ret;
}
@@ -5754,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_VMON] = handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_APIC_WRITE] = handle_apic_write,
+ [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
@@ -5780,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
gpa_t bitmap;
- if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return 1;
/*
@@ -6008,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 vectoring_info = vmx->idt_vectoring_info;
/* If guest state is invalid, start emulating */
- if (vmx->emulation_required && emulate_invalid_guest_state)
+ if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
/*
@@ -6103,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
vmcs_write32(TPR_THRESHOLD, irr);
}
+static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+ u32 sec_exec_control;
+
+ /*
+	 * There is no point in enabling virtualized x2apic mode
+	 * without enabling apicv.
+ */
+ if (!cpu_has_vmx_virtualize_x2apic_mode() ||
+ !vmx_vm_has_apicv(vcpu->kvm))
+ return;
+
+ if (!vm_need_tpr_shadow(vcpu->kvm))
+ return;
+
+ sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ if (set) {
+ sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ } else {
+ sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ }
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+ vmx_set_msr_bitmap(vcpu);
+}
+
+static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+ u16 status;
+ u8 old;
+
+ if (!vmx_vm_has_apicv(kvm))
+ return;
+
+ if (isr == -1)
+ isr = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = status >> 8;
+ if (isr != old) {
+ status &= 0xff;
+ status |= isr << 8;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_set_rvi(int vector)
+{
+ u16 status;
+ u8 old;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = (u8)status & 0xff;
+ if ((u8)vector != old) {
+ status &= ~0xff;
+ status |= (u8)vector;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+ if (max_irr == -1)
+ return;
+
+ vmx_set_rvi(max_irr);
+}
+
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+ vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+ vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
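
vmx_set_rvi() and vmx_hwapic_isr_update() above both treat the 16-bit GUEST_INTR_STATUS field as two packed bytes: the low byte holds RVI (the highest requesting virtual interrupt) and the high byte holds SVI (the highest in-service one). A small sketch of that packing, detached from the VMCS accessors:

#include <stdint.h>
#include <stdio.h>

/* Pack RVI (low byte) and SVI (high byte) into a GUEST_INTR_STATUS value. */
static uint16_t set_rvi(uint16_t status, uint8_t vector)
{
	return (uint16_t)((status & 0xff00) | vector);
}

static uint16_t set_svi(uint16_t status, uint8_t isr)
{
	return (uint16_t)((status & 0x00ff) | ((uint16_t)isr << 8));
}

int main(void)
{
	uint16_t status = 0;

	status = set_rvi(status, 0x30);	/* highest pending vector */
	status = set_svi(status, 0x20);	/* highest in-service vector */
	printf("GUEST_INTR_STATUS = %#06x\n", status);	/* 0x2030 */
	return 0;
}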
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -6291,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
- if (vmx->emulation_required && emulate_invalid_guest_state)
+ if (vmx->emulation_required)
return;
if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -7366,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+ .vm_has_apicv = vmx_vm_has_apicv,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
@@ -7398,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
static int __init vmx_init(void)
{
- int r, i;
+ int r, i, msr;
rdmsrl_safe(MSR_EFER, &host_efer);
@@ -7419,11 +7643,19 @@ static int __init vmx_init(void)
if (!vmx_msr_bitmap_legacy)
goto out1;
+ vmx_msr_bitmap_legacy_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy_x2apic)
+ goto out2;
vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode)
- goto out2;
+ goto out3;
+ vmx_msr_bitmap_longmode_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode_x2apic)
+ goto out4;
/*
* Allow direct access to the PC debug port (it is often used for I/O
@@ -7455,6 +7687,28 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ memcpy(vmx_msr_bitmap_legacy_x2apic,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic,
+ vmx_msr_bitmap_longmode, PAGE_SIZE);
+
+ if (enable_apicv_reg_vid) {
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ vmx_disable_intercept_msr_read_x2apic(msr);
+
+		/* According to the SDM, in x2apic mode the whole id reg is
+		 * used. But KVM only uses the highest eight bits, so the
+		 * register needs to be intercepted. */
+ vmx_enable_intercept_msr_read_x2apic(0x802);
+ /* TMCCT */
+ vmx_enable_intercept_msr_read_x2apic(0x839);
+ /* TPR */
+ vmx_disable_intercept_msr_write_x2apic(0x808);
+ /* EOI */
+ vmx_disable_intercept_msr_write_x2apic(0x80b);
+ /* SELF-IPI */
+ vmx_disable_intercept_msr_write_x2apic(0x83f);
+ }
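
The raw MSR numbers in the hunk above follow the x2APIC convention that APIC register offset X (in the legacy MMIO page) maps to MSR 0x800 + (X >> 4), which is how 0x802 (ID), 0x808 (TPR), 0x80b (EOI), 0x839 (TMCCT) and 0x83f (SELF_IPI) are derived. A one-liner sketch of the mapping:

#include <stdint.h>
#include <stdio.h>

/* x2APIC: MMIO register offset -> MSR index. */
static uint32_t x2apic_msr(uint32_t mmio_offset)
{
	return 0x800 + (mmio_offset >> 4);
}

int main(void)
{
	printf("ID       -> %#x\n", x2apic_msr(0x020));	/* 0x802 */
	printf("TPR      -> %#x\n", x2apic_msr(0x080));	/* 0x808 */
	printf("EOI      -> %#x\n", x2apic_msr(0x0b0));	/* 0x80b */
	printf("TMCCT    -> %#x\n", x2apic_msr(0x390));	/* 0x839 */
	printf("SELF_IPI -> %#x\n", x2apic_msr(0x3f0));	/* 0x83f */
	return 0;
}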
if (enable_ept) {
kvm_mmu_set_mask_ptes(0ull,
@@ -7468,8 +7722,10 @@ static int __init vmx_init(void)
return 0;
-out3:
+out4:
free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
out2:
free_page((unsigned long)vmx_msr_bitmap_legacy);
out1:
@@ -7481,6 +7737,8 @@ out:
static void __exit vmx_exit(void)
{
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
free_page((unsigned long)vmx_msr_bitmap_legacy);
free_page((unsigned long)vmx_msr_bitmap_longmode);
free_page((unsigned long)vmx_io_bitmap_b);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 37040079cd6..f71500af1f8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -872,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
- vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
kvm_mmu_reset_context(vcpu);
@@ -2522,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_NR_MEMSLOTS:
- r = KVM_MEMORY_SLOTS;
+ r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
@@ -3274,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return -EINVAL;
mutex_lock(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- spin_unlock(&kvm->mmu_lock);
mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -3439,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
mutex_lock(&kvm->slots_lock);
r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
+ if (log->slot >= KVM_USER_MEM_SLOTS)
goto out;
memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -4495,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
*selector = var.selector;
- if (var.unusable)
+ if (var.unusable) {
+ memset(desc, 0, sizeof(*desc));
return false;
+ }
if (var.g)
var.limit >>= 12;
@@ -4757,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
return r;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+ bool write_fault_to_shadow_pgtable)
{
- gpa_t gpa;
+ gpa_t gpa = cr2;
pfn_t pfn;
- if (tdp_enabled)
- return false;
-
- /*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
- */
- if (kvm_mmu_unprotect_page_virt(vcpu, gva))
- return true;
-
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+ if (!vcpu->arch.mmu.direct_map) {
+ /*
+		 * Write permission should be allowed since only
+		 * write accesses need to be emulated.
+ */
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- if (gpa == UNMAPPED_GVA)
- return true; /* let cpu generate fault */
+ /*
+		 * If the mapping is invalid in the guest, let the cpu retry
+		 * it to generate a fault.
+ */
+ if (gpa == UNMAPPED_GVA)
+ return true;
+ }
/*
* Do not retry the unhandleable instruction if it faults on the
@@ -4785,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
* instruction -> ...
*/
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
- if (!is_error_noslot_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+
+ /*
+	 * If the instruction failed on the error pfn, it cannot be fixed;
+	 * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu.direct_map) {
+ unsigned int indirect_shadow_pages;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
return true;
}
- return false;
+ /*
+	 * If emulation was due to access to a shadowed page table
+	 * and it failed, try to unshadow the page and re-enter the
+	 * guest to let the CPU execute the instruction.
+ */
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+	 * If the access faults on its page table, it cannot
+	 * be fixed by unprotecting the shadow page, and it should
+	 * be reported to userspace.
+ */
+ return !write_fault_to_shadow_pgtable;
}
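
The rewritten reexecute_instruction() above makes its retry decision from a handful of conditions; the simplified predicate below (invented name, plain booleans instead of KVM state) is only meant to make that control flow easier to follow.

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the retry decision above:
 *  - an unmapped GVA under shadow paging gets retried so the CPU
 *    regenerates the fault,
 *  - an error pfn is never retried,
 *  - the direct mmu always retries (after unprotecting if needed),
 *  - a write fault onto the guest's own page tables is reported instead.
 */
static bool should_reexecute(bool direct_map, bool gpa_unmapped,
			     bool error_pfn,
			     bool write_fault_to_shadow_pgtable)
{
	if (!direct_map && gpa_unmapped)
		return true;	/* let the CPU regenerate the fault */
	if (error_pfn)
		return false;	/* unfixable, report to userspace */
	if (direct_map)
		return true;	/* well emulated; unprotect and retry */
	return !write_fault_to_shadow_pgtable;
}

int main(void)
{
	/* write fault on the guest page tables -> do not retry */
	printf("%d\n", should_reexecute(false, false, false, true));
	return 0;
}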
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4832,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!vcpu->arch.mmu.direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
- kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
return true;
}
@@ -4849,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
bool writeback = true;
+ bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ /*
+ * Clear write_fault_to_shadow_pgtable here to ensure it is
+ * never reused.
+ */
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4868,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (r != EMULATION_OK) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2,
+ write_fault_to_spt))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
@@ -4898,7 +4934,7 @@ restart:
return EMULATE_DONE;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
return EMULATE_DONE;
return handle_emulation_failure(vcpu);
@@ -5541,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
}
- } else if (kvm_cpu_has_interrupt(vcpu)) {
+ } else if (kvm_cpu_has_injectable_intr(vcpu)) {
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
@@ -5609,6 +5645,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
#endif
}
+static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ memset(eoi_exit_bitmap, 0, 32);
+
+ kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
@@ -5662,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_handle_pmu_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
+ update_eoi_exitmap(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5670,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* enable NMI/IRQ window open exits if needed */
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -6853,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
struct kvm_userspace_memory_region *mem,
- int user_alloc)
+ bool user_alloc)
{
int npages = memslot->npages;
- int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
- /* Prevent internal slot pages from being moved by fork()/COW. */
- if (memslot->id >= KVM_MEMORY_SLOTS)
- map_flags = MAP_SHARED | MAP_ANONYMOUS;
-
- /*To keep backward compatibility with older userspace,
- *x86 needs to handle !user_alloc case.
+ /*
+	 * Only private memory slots need to be mapped here, since the
+	 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
*/
- if (!user_alloc) {
- if (npages && !old.npages) {
- unsigned long userspace_addr;
+ if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+ unsigned long userspace_addr;
- userspace_addr = vm_mmap(NULL, 0,
- npages * PAGE_SIZE,
- PROT_READ | PROT_WRITE,
- map_flags,
- 0);
+ /*
+ * MAP_SHARED to prevent internal slot pages from being moved
+ * by fork()/COW.
+ */
+ userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0);
- if (IS_ERR((void *)userspace_addr))
- return PTR_ERR((void *)userspace_addr);
+ if (IS_ERR((void *)userspace_addr))
+ return PTR_ERR((void *)userspace_addr);
- memslot->userspace_addr = userspace_addr;
- }
+ memslot->userspace_addr = userspace_addr;
}
-
return 0;
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot old,
- int user_alloc)
+ bool user_alloc)
{
int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
- if (!user_alloc && !old.user_alloc && old.npages && !npages) {
+ if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
int ret;
ret = vm_munmap(old.userspace_addr,
@@ -6908,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (!kvm->arch.n_requested_mmu_pages)
nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
- spin_lock(&kvm->mmu_lock);
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- spin_unlock(&kvm->mmu_lock);
+ /*
+ * Write protect all pages for dirty logging.
+ * Existing largepage mappings are destroyed here and new ones will
+ * not be created until the end of the logging.
+ */
+ if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ kvm_mmu_slot_remove_write_access(kvm, mem->slot);
/*
* If memory slot is created, or moved, we need to clear all
* mmio sptes.