diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 13 | ||||
-rw-r--r-- | arch/x86/kvm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 124 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 25 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_svm.h | 16 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_timer.h | 18 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 251 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 430 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 7 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 233 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 1269 | ||||
-rw-r--r-- | arch/x86/kvm/timer.c | 46 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 1086 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 847 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/x86_emulate.c | 195 |
20 files changed, 2943 insertions, 1666 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b81125f0bde..8600a09e0c6 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -4,6 +4,10 @@ config HAVE_KVM bool +config HAVE_KVM_IRQCHIP + bool + default y + menuconfig VIRTUALIZATION bool "Virtualization" depends on HAVE_KVM || X86 @@ -46,6 +50,9 @@ config KVM_INTEL Provides support for KVM on Intel processors equipped with the VT extensions. + To compile this as a module, choose M here: the module + will be called kvm-intel. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM @@ -53,9 +60,13 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. + To compile this as a module, choose M here: the module + will be called kvm-amd. + config KVM_TRACE bool "KVM trace support" - depends on KVM && MARKERS && SYSFS + depends on KVM && SYSFS + select MARKERS select RELAY select DEBUG_FS default n diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d3ec292f00f..b43c4efafe8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -14,7 +14,7 @@ endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ - i8254.o + i8254.o timer.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 72bd275a9b5..4d6f0d293ee 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel) return kvm->arch.vpit->pit_state.channels[channel].gate; } +static s64 __kpit_elapsed(struct kvm *kvm) +{ + s64 elapsed; + ktime_t remaining; + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + + /* + * The Counter does not stop when it reaches zero. In + * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to + * the highest count, either FFFF hex for binary counting + * or 9999 for BCD counting, and continues counting. + * Modes 2 and 3 are periodic; the Counter reloads + * itself with the initial count and continues counting + * from there. + */ + remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); + elapsed = ps->pit_timer.period - ktime_to_ns(remaining); + elapsed = mod_64(elapsed, ps->pit_timer.period); + + return elapsed; +} + +static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, + int channel) +{ + if (channel == 0) + return __kpit_elapsed(kvm); + + return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); +} + static int pit_get_count(struct kvm *kvm, int channel) { struct kvm_kpit_channel_state *c = @@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel) WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel) WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + t = kpit_elapsed(kvm, c, channel); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { @@ -193,25 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -static int __pit_timer_fn(struct kvm_kpit_state *ps) -{ - struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; - struct kvm_kpit_timer *pt = &ps->pit_timer; - - if (!atomic_inc_and_test(&pt->pending)) - set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - - if (vcpu0 && waitqueue_active(&vcpu0->wq)) - wake_up_interruptible(&vcpu0->wq); - - hrtimer_add_expires_ns(&pt->timer, pt->period); - pt->scheduled = hrtimer_get_expires_ns(&pt->timer); - if (pt->period) - ps->channels[0].count_load_time = ktime_get(); - - return (pt->period == 0 ? 0 : 1); -} - int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; @@ -232,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) spin_unlock(&ps->inject_lock); } -static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) -{ - struct kvm_kpit_state *ps; - int restart_timer = 0; - - ps = container_of(data, struct kvm_kpit_state, pit_timer.timer); - - restart_timer = __pit_timer_fn(ps); - - if (restart_timer) - return HRTIMER_RESTART; - else - return HRTIMER_NORESTART; -} - void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; @@ -260,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_kpit_timer *pt) +static void destroy_pit_timer(struct kvm_timer *pt) { pr_debug("pit: execute del timer!\n"); hrtimer_cancel(&pt->timer); } +static bool kpit_is_periodic(struct kvm_timer *ktimer) +{ + struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, + pit_timer); + return ps->is_periodic; +} + +static struct kvm_timer_ops kpit_ops = { + .is_periodic = kpit_is_periodic, +}; + static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { - struct kvm_kpit_timer *pt = &ps->pit_timer; + struct kvm_timer *pt = &ps->pit_timer; s64 interval; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -277,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); - pt->period = (is_period == 0) ? 0 : interval; - pt->timer.function = pit_timer_fn; + pt->period = interval; + ps->is_periodic = is_period; + + pt->timer.function = kvm_timer_fn; + pt->t_ops = &kpit_ops; + pt->kvm = ps->pit->kvm; + pt->vcpu_id = 0; + atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -295,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); /* - * Though spec said the state of 8254 is undefined after power-up, - * seems some tricky OS like Windows XP depends on IRQ0 interrupt - * when booting up. - * So here setting initialize rate for it, and not a specific number + * The largest possible initial count is 0; this is equivalent + * to 216 for binary counting and 104 for BCD counting. */ if (val == 0) val = 0x10000; - ps->channels[channel].count_load_time = ktime_get(); ps->channels[channel].count = val; - if (channel != 0) + if (channel != 0) { + ps->channels[channel].count_load_time = ktime_get(); return; + } /* Two types of timer * mode 1 is one shot, mode 2 is period, otherwise del timer */ switch (ps->channels[0].mode) { + case 0: case 1: /* FIXME: enhance mode 4 precision */ case 4: @@ -536,6 +550,16 @@ void kvm_pit_reset(struct kvm_pit *pit) pit->pit_state.irq_ack = 1; } +static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) +{ + struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); + + if (!mask) { + atomic_set(&pit->pit_state.pit_timer.pending, 0); + pit->pit_state.irq_ack = 1; + } +} + struct kvm_pit *kvm_create_pit(struct kvm *kvm) { struct kvm_pit *pit; @@ -545,9 +569,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) if (!pit) return NULL; - mutex_lock(&kvm->lock); pit->irq_source_id = kvm_request_irq_source_id(kvm); - mutex_unlock(&kvm->lock); if (pit->irq_source_id < 0) { kfree(pit); return NULL; @@ -580,10 +602,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + pit_state->pit_timer.reinject = true; mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); + pit->mask_notifier.func = pit_mask_notifer; + kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + return pit; } @@ -592,6 +618,8 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_unregister_irq_mask_notifier(kvm, 0, + &kvm->arch.vpit->mask_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 4178022b97a..bbd863ff60b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -3,14 +3,6 @@ #include "iodev.h" -struct kvm_kpit_timer { - struct hrtimer timer; - int irq; - s64 period; /* unit: ns */ - s64 scheduled; - atomic_t pending; -}; - struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ u16 latched_count; @@ -29,7 +21,8 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; - struct kvm_kpit_timer pit_timer; + struct kvm_timer pit_timer; + bool is_periodic; u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; @@ -45,6 +38,7 @@ struct kvm_pit { struct kvm *kvm; struct kvm_kpit_state pit_state; int irq_source_id; + struct kvm_irq_mask_notifier mask_notifier; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 179dcb0103f..1ccb50c74f1 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,11 +32,13 @@ #include <linux/kvm_host.h> static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) { spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) { struct kvm *kvm = s->kvm; unsigned acks = s->pending_acks; @@ -49,7 +51,8 @@ static void pic_unlock(struct kvm_pic *s) spin_unlock(&s->lock); while (acks) { - kvm_notify_acked_irq(kvm, __ffs(acks)); + kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), + __ffs(acks)); acks &= acks - 1; } @@ -76,12 +79,13 @@ void kvm_pic_clear_isr_ack(struct kvm *kvm) /* * set irq level. If an edge is detected, then the IRR is set to 1 */ -static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) +static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) { - int mask; + int mask, ret = 1; mask = 1 << irq; if (s->elcr & mask) /* level triggered */ if (level) { + ret = !(s->irr & mask); s->irr |= mask; s->last_irr |= mask; } else { @@ -90,11 +94,15 @@ static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) } else /* edge triggered */ if (level) { - if ((s->last_irr & mask) == 0) + if ((s->last_irr & mask) == 0) { + ret = !(s->irr & mask); s->irr |= mask; + } s->last_irr |= mask; } else s->last_irr &= ~mask; + + return (s->imr & mask) ? -1 : ret; } /* @@ -171,16 +179,19 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -void kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); } pic_unlock(s); + + return ret; } /* @@ -232,7 +243,7 @@ int kvm_pic_read_irq(struct kvm *kvm) } pic_update_irq(s); pic_unlock(s); - kvm_notify_acked_irq(kvm, irq); + kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); return intno; } diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index cf17ed52f6f..96dfbb6ad2a 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -24,6 +24,7 @@ #include "irq.h" #include "i8254.h" +#include "x86.h" /* * check if there are pending timer events @@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { struct kvm_pic *s; + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); /* PIC */ @@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) struct kvm_pic *s; int vector; + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.nr; + vector = kvm_get_apic_interrupt(v); /* APIC */ if (vector == -1) { if (kvm_apic_accept_pic_intr(v)) { diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 82579ee538d..9f593188129 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -32,6 +32,8 @@ #include "lapic.h" #define PIC_NUM_PINS 16 +#define SELECT_PIC(irq) \ + ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 8e5ee99551f..ed66e4c078d 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -18,7 +18,6 @@ static const u32 host_save_user_msrs[] = { }; #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) -#define NUM_DB_REGS 4 struct kvm_vcpu; @@ -29,18 +28,23 @@ struct vcpu_svm { struct svm_cpu_data *svm_data; uint64_t asid_generation; - unsigned long db_regs[NUM_DB_REGS]; - u64 next_rip; u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; u64 host_gs_base; unsigned long host_cr2; - unsigned long host_db_regs[NUM_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; u32 *msrpm; + struct vmcb *hsave; + u64 hsave_msr; + + u64 nested_vmcb; + + /* These are the merged vectors */ + u32 *nested_msrpm; + + /* gpa pointers to the real vectors */ + u64 nested_vmcb_msrpm; }; #endif diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h new file mode 100644 index 00000000000..26bd6ba74e1 --- /dev/null +++ b/arch/x86/kvm/kvm_timer.h @@ -0,0 +1,18 @@ + +struct kvm_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_timer_ops *t_ops; + struct kvm *kvm; + int vcpu_id; +}; + +struct kvm_timer_ops { + bool (*is_periodic)(struct kvm_timer *); +}; + + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); + diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f0b67f2cdd6..ae99d83f81a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode); + +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic_test_and_set_irr(vec, apic)) { - /* a new pending irq is set in IRR */ - if (trig) - apic_set_vector(vec, apic->regs + APIC_TMR); - else - apic_clear_vector(vec, apic->regs + APIC_TMR); - kvm_vcpu_kick(apic->vcpu); - return 1; - } - return 0; + return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + irq->level, irq->trig_mode); } static inline int apic_find_highest_isr(struct kvm_lapic *apic) @@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) { - return kvm_apic_id(apic) == dest; + return dest == 0xff || kvm_apic_id(apic) == dest; } int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) @@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) return result; } -static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x", + "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); ASSERT(!target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) { + if (dest_mode == 0) /* Physical mode. */ - if ((dest == 0xFF) || (dest == kvm_apic_id(target))) - result = 1; - } else + result = kvm_apic_match_physical_addr(target, dest); + else /* Logical mode. */ result = kvm_apic_match_logical_addr(target, dest); break; case APIC_DEST_SELF: - if (target == source) - result = 1; + result = (target == source); break; case APIC_DEST_ALLINC: result = 1; break; case APIC_DEST_ALLBUT: - if (target != source) - result = 1; + result = (target != source); break; default: printk(KERN_WARNING "Bad dest shorthand value %x\n", @@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode) { - int orig_irr, result = 0; + int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; switch (delivery_mode) { - case APIC_DM_FIXED: case APIC_DM_LOWEST: + vcpu->arch.apic_arb_prio++; + case APIC_DM_FIXED: /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; - orig_irr = apic_test_and_set_irr(vector, apic); - if (orig_irr && trig_mode) { - apic_debug("level trig mode repeatedly for vector %d", - vector); + result = !apic_test_and_set_irr(vector, apic); + if (!result) { + if (trig_mode) + apic_debug("level trig mode repeatedly for " + "vector %d", vector); break; } @@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_set_vector(vector, apic->regs + APIC_TMR); } else apic_clear_vector(vector, apic->regs + APIC_TMR); - kvm_vcpu_kick(vcpu); - - result = (orig_irr == 0); break; case APIC_DM_REMRD: @@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_NMI: + result = 1; kvm_inject_nmi(vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: if (level) { + result = 1; if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) printk(KERN_DEBUG "INIT on a runnable vcpu %d\n", @@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; kvm_vcpu_kick(vcpu); @@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } -static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, - unsigned long bitmap) -{ - int last; - int next; - struct kvm_lapic *apic = NULL; - - last = kvm->arch.round_robin_prev_vcpu; - next = last; - - do { - if (++next == KVM_MAX_VCPUS) - next = 0; - if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) - continue; - apic = kvm->vcpus[next]->arch.apic; - if (apic && apic_enabled(apic)) - break; - apic = NULL; - } while (next != last); - kvm->arch.round_robin_prev_vcpu = next; - - if (!apic) - printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); - - return apic; -} - -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { - struct kvm_lapic *apic; - - apic = kvm_apic_round_robin(kvm, vector, bitmap); - if (apic) - return apic->vcpu; - return NULL; + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } static void apic_set_eoi(struct kvm_lapic *apic) @@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic) { u32 icr_low = apic_get_reg(apic, APIC_ICR); u32 icr_high = apic_get_reg(apic, APIC_ICR2); + struct kvm_lapic_irq irq; - unsigned int dest = GET_APIC_DEST_FIELD(icr_high); - unsigned int short_hand = icr_low & APIC_SHORT_MASK; - unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; - unsigned int level = icr_low & APIC_INT_ASSERT; - unsigned int dest_mode = icr_low & APIC_DEST_MASK; - unsigned int delivery_mode = icr_low & APIC_MODE_MASK; - unsigned int vector = icr_low & APIC_VECTOR_MASK; - - struct kvm_vcpu *target; - struct kvm_vcpu *vcpu; - unsigned long lpr_map = 0; - int i; + irq.vector = icr_low & APIC_VECTOR_MASK; + irq.delivery_mode = icr_low & APIC_MODE_MASK; + irq.dest_mode = icr_low & APIC_DEST_MASK; + irq.level = icr_low & APIC_INT_ASSERT; + irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", - icr_high, icr_low, short_hand, dest, - trig_mode, level, dest_mode, delivery_mode, vector); - - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = apic->vcpu->kvm->vcpus[i]; - if (!vcpu) - continue; - - if (vcpu->arch.apic && - apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { - if (delivery_mode == APIC_DM_LOWEST) - set_bit(vcpu->vcpu_id, &lpr_map); - else - __apic_accept_irq(vcpu->arch.apic, delivery_mode, - vector, level, trig_mode); - } - } + icr_high, icr_low, irq.shorthand, irq.dest_id, + irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, + irq.vector); - if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); - if (target != NULL) - __apic_accept_irq(target->arch.apic, delivery_mode, - vector, level, trig_mode); - } + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) if (apic_get_reg(apic, APIC_TMICT) == 0) return 0; - remaining = hrtimer_expires_remaining(&apic->timer.dev); + remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); - ns = mod_64(ktime_to_ns(remaining), apic->timer.period); - tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); + ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); + tmcct = div64_u64(ns, + (APIC_BUS_CYCLE_NS * apic->divide_count)); return tmcct; } @@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic) tdcr = apic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; - apic->timer.divide_count = 0x1 << (tmp2 & 0x7); + apic->divide_count = 0x1 << (tmp2 & 0x7); apic_debug("timer divide count is 0x%x\n", - apic->timer.divide_count); + apic->divide_count); } static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic->timer.dev.base->get_time(); + ktime_t now = apic->lapic_timer.timer.base->get_time(); - apic->timer.period = apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->timer.divide_count; - atomic_set(&apic->timer.pending, 0); + apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * + APIC_BUS_CYCLE_NS * apic->divide_count; + atomic_set(&apic->lapic_timer.pending, 0); - if (!apic->timer.period) + if (!apic->lapic_timer.period) return; - hrtimer_start(&apic->timer.dev, - ktime_add_ns(now, apic->timer.period), + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), HRTIMER_MODE_ABS); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" @@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic) "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), apic_get_reg(apic, APIC_TMICT), - apic->timer.period, + apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, - apic->timer.period))); + apic->lapic_timer.period))); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } - atomic_set(&apic->timer.pending, 0); + atomic_set(&apic->lapic_timer.pending, 0); } break; @@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_TMICT: - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); return; @@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!vcpu->arch.apic) return; - hrtimer_cancel(&vcpu->arch.apic->timer.dev); + hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); if (vcpu->arch.apic->regs_page) __free_page(vcpu->arch.apic->regs_page); @@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) ASSERT(apic != NULL); /* Stop the timer in case it's a reset to an active apic */ - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); apic_set_reg(apic, APIC_LVR, APIC_VERSION); @@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } update_divide_count(apic); - atomic_set(&apic->timer.pending, 0); + atomic_set(&apic->lapic_timer.pending, 0); if (vcpu->vcpu_id == 0) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); + vcpu->arch.apic_arb_prio = 0; + apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), @@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_reset); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +bool kvm_apic_present(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - int ret = 0; - - if (!apic) - return 0; - ret = apic_enabled(apic); + return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); +} - return ret; +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_enabled); @@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled); *---------------------------------------------------------------------- */ -/* TODO: make sure __apic_timer_fn runs in current pCPU */ -static int __apic_timer_fn(struct kvm_lapic *apic) +static bool lapic_is_periodic(struct kvm_timer *ktimer) { - int result = 0; - wait_queue_head_t *q = &apic->vcpu->wq; - - if(!atomic_inc_and_test(&apic->timer.pending)) - set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); - if (waitqueue_active(q)) - wake_up_interruptible(q); - - if (apic_lvtt_period(apic)) { - result = 1; - hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); - } - return result; + struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, + lapic_timer); + return apic_lvtt_period(apic); } int apic_has_pending_timer(struct kvm_vcpu *vcpu) @@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *lapic = vcpu->arch.apic; if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) - return atomic_read(&lapic->timer.pending); + return atomic_read(&lapic->lapic_timer.pending); return 0; } @@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVT0); } -static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) -{ - struct kvm_lapic *apic; - int restart_timer = 0; - - apic = container_of(data, struct kvm_lapic, timer.dev); - - restart_timer = __apic_timer_fn(apic); - - if (restart_timer) - return HRTIMER_RESTART; - else - return HRTIMER_NORESTART; -} +static struct kvm_timer_ops lapic_timer_ops = { + .is_periodic = lapic_is_periodic, +}; int kvm_create_lapic(struct kvm_vcpu *vcpu) { @@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; - hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - apic->timer.dev.function = apic_timer_fn; + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + apic->lapic_timer.timer.function = kvm_timer_fn; + apic->lapic_timer.t_ops = &lapic_timer_ops; + apic->lapic_timer.kvm = vcpu->kvm; + apic->lapic_timer.vcpu_id = vcpu->vcpu_id; + apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; @@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && atomic_read(&apic->timer.pending) > 0) { + if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->timer.pending); + atomic_dec(&apic->lapic_timer.pending); } } @@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) MSR_IA32_APICBASE_BASE; apic_set_reg(apic, APIC_LVR, APIC_VERSION); apic_update_ppr(apic); - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); } @@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) if (!apic) return; - timer = &apic->timer.dev; + timer = &apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 45ab6ee7120..a587f8349c4 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -2,18 +2,15 @@ #define __KVM_X86_LAPIC_H #include "iodev.h" +#include "kvm_timer.h" #include <linux/kvm_host.h> struct kvm_lapic { unsigned long base_address; struct kvm_io_device dev; - struct { - atomic_t pending; - s64 period; /* unit: ns */ - u32 divide_count; - struct hrtimer dev; - } timer; + struct kvm_timer lapic_timer; + u32 divide_count; struct kvm_vcpu *vcpu; struct page *regs_page; void *regs; @@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2d4477c7147..5c3d6e81a7d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644); #define PFERR_PRESENT_MASK (1U << 0) #define PFERR_WRITE_MASK (1U << 1) #define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) #define PT_DIRECTORY_LEVEL 2 @@ -145,11 +146,20 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; -struct kvm_shadow_walk { - int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, - u64 addr, u64 *spte, int level); +struct kvm_shadow_walk_iterator { + u64 addr; + hpa_t shadow_addr; + int level; + u64 *sptep; + unsigned index; }; +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ + for (shadow_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + + struct kvm_unsync_walk { int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); }; @@ -168,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mt_mask; + +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL << (e - s + 1)) - 1) << s; +} void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -184,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte) EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; - shadow_mt_mask = mt_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -210,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.shadow_efer & EFER_NX; } -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - static int is_shadow_present_pte(u64 pte) { return pte != shadow_trap_nonpresent_pte @@ -343,7 +351,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, BUG_ON(!mc->nobjs); p = mc->objects[--mc->nobjs]; - memset(p, 0, size); return p; } @@ -794,10 +801,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&sp->oos_link); - ASSERT(is_empty_shadow_page(sp->spt)); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; - sp->global = 1; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; return sp; @@ -983,8 +988,8 @@ struct kvm_mmu_pages { idx < 512; \ idx = find_next_bit(bitmap, 512, idx+1)) -int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, - int idx) +static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) { int i; @@ -1059,7 +1064,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: found role %x\n", __func__, sp->role.word); @@ -1068,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } -static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - list_del(&sp->oos_link); - --kvm->stat.mmu_unsync_global; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); sp->unsync = 0; - if (sp->global) - kvm_unlink_unsync_global(kvm, sp); --kvm->stat.mmu_unsync; } @@ -1115,8 +1112,9 @@ struct mmu_page_path { i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) -int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, - int i) +static int mmu_pages_next(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents, + int i) { int n; @@ -1135,7 +1133,7 @@ int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, return n; } -void mmu_pages_clear_parents(struct mmu_page_path *parents) +static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; unsigned int level = 0; @@ -1193,7 +1191,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, unsigned level, - int metaphysical, + int direct, unsigned access, u64 *parent_pte) { @@ -1204,10 +1202,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; + role = vcpu->arch.mmu.base_role; role.level = level; - role.metaphysical = metaphysical; + role.direct = direct; role.access = access; if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); @@ -1243,7 +1240,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) { + if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); account_shadowed(vcpu->kvm, gfn); @@ -1255,35 +1252,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } -static int walk_shadow(struct kvm_shadow_walk *walker, - struct kvm_vcpu *vcpu, u64 addr) +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) { - hpa_t shadow_addr; - int level; - int r; - u64 *sptep; - unsigned index; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - if (!shadow_addr) - return 1; - --level; + iterator->addr = addr; + iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->level = vcpu->arch.mmu.shadow_root_level; + if (iterator->level == PT32E_ROOT_LEVEL) { + iterator->shadow_addr + = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + --iterator->level; + if (!iterator->shadow_addr) + iterator->level = 0; } +} - while (level >= PT_PAGE_TABLE_LEVEL) { - index = SHADOW_PT_INDEX(addr, level); - sptep = ((u64 *)__va(shadow_addr)) + index; - r = walker->entry(walker, vcpu, addr, sptep, level); - if (r) - return r; - shadow_addr = *sptep & PT64_BASE_ADDR_MASK; - --level; - } - return 0; +static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) +{ + if (iterator->level < PT_PAGE_TABLE_LEVEL) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; + return true; +} + +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; + --iterator->level; } static void kvm_mmu_page_unlink_children(struct kvm *kvm, @@ -1388,7 +1385,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); kvm_flush_remote_tlbs(kvm); - if (!sp->role.invalid && !sp->role.metaphysical) + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); @@ -1451,7 +1448,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { + if (sp->gfn == gfn && !sp->role.direct) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; @@ -1463,11 +1460,20 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { + unsigned index; + struct hlist_head *bucket; struct kvm_mmu_page *sp; + struct hlist_node *node, *nn; - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { + if (sp->gfn == gfn && !sp->role.direct + && !sp->role.invalid) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } } } @@ -1600,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state, return mtrr_state->def_type; } -static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { u8 mtrr; @@ -1610,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) mtrr = MTRR_TYPE_WRBACK; return mtrr; } +EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1622,7 +1629,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.metaphysical) + if (s->gfn != sp->gfn || s->role.direct) continue; if (s->role.word != sp->role.word) return 1; @@ -1630,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; - if (sp->global) { - list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); - ++vcpu->kvm->stat.mmu_unsync_global; - } else - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); mmu_convert_notrap(sp); return 0; @@ -1661,23 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - int global, gfn_t gfn, pfn_t pfn, bool speculative, + gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { u64 spte; int ret = 0; - u64 mt_mask = shadow_mt_mask; - struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); - - if (!(vcpu->arch.cr4 & X86_CR4_PGE)) - global = 0; - if (!global && sp->global) { - sp->global = 0; - if (sp->unsync) { - kvm_unlink_unsync_global(vcpu->kvm, sp); - kvm_mmu_mark_parents_unsync(vcpu, sp); - } - } /* * We don't set the accessed bit, since we sometimes want to see @@ -1697,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; - if (mt_mask) { - if (!kvm_is_mmio_pfn(pfn)) { - mt_mask = get_memory_type(vcpu, gfn) << - kvm_x86_ops->get_mt_mask_shift(); - mt_mask |= VMX_EPT_IGMT_BIT; - } else - mt_mask = MTRR_TYPE_UNCACHABLE << - kvm_x86_ops->get_mt_mask_shift(); - spte |= mt_mask; - } + if (tdp_enabled) + spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); spte |= (u64)pfn << PAGE_SHIFT; @@ -1751,8 +1735,8 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, int global, - gfn_t gfn, pfn_t pfn, bool speculative) + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); @@ -1777,15 +1761,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*shadow_pte), pfn); rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } + } else + was_rmapped = 1; } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, global, gfn, pfn, speculative, true)) { + dirty, largepage, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1820,67 +1800,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -struct direct_shadow_walk { - struct kvm_shadow_walk walker; - pfn_t pfn; - int write; - int largepage; - int pt_write; -}; - -static int direct_map_entry(struct kvm_shadow_walk *_walk, - struct kvm_vcpu *vcpu, - u64 addr, u64 *sptep, int level) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) { - struct direct_shadow_walk *walk = - container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + int pt_write = 0; gfn_t pseudo_gfn; - gfn_t gfn = addr >> PAGE_SHIFT; - - if (level == PT_PAGE_TABLE_LEVEL - || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, - 0, walk->write, 1, &walk->pt_write, - walk->largepage, 0, gfn, walk->pfn, false); - ++vcpu->stat.pf_fixed; - return 1; - } - if (*sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, - 1, ACC_ALL, sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(walk->pfn); - return -ENOMEM; + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { + if (iterator.level == PT_PAGE_TABLE_LEVEL + || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, + largepage, gfn, pfn, false); + ++vcpu->stat.pf_fixed; + break; } - set_shadow_pte(sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - return 0; -} + if (*iterator.sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, + iterator.level - 1, + 1, ACC_ALL, iterator.sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(pfn); + return -ENOMEM; + } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) -{ - int r; - struct direct_shadow_walk walker = { - .walker = { .entry = direct_map_entry, }, - .pfn = pfn, - .largepage = largepage, - .write = write, - .pt_write = 0, - }; - - r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); - if (r < 0) - return r; - return walker.pt_write; + set_shadow_pte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); + } + } + return pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) @@ -1957,12 +1912,24 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; } -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) +{ + int ret = 0; + + if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + ret = 1; + } + + return ret; +} + +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; struct kvm_mmu_page *sp; - int metaphysical = 0; + int direct = 0; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1971,18 +1938,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (tdp_enabled) - metaphysical = 1; + direct = 1; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, metaphysical, + PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; - return; + return 0; } - metaphysical = !is_paging(vcpu); + direct = !is_paging(vcpu); if (tdp_enabled) - metaphysical = 1; + direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1995,14 +1964,17 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; + if (mmu_check_root(vcpu, root_gfn)) + return 1; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, metaphysical, + PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + return 0; } static void mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -2021,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - if (root) { + if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -2029,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) } } -static void mmu_sync_global(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_page *sp, *n; - - list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) - kvm_sync_page(vcpu, sp); -} - void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->kvm->mmu_lock); @@ -2045,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_global(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); -} - static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -2166,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } +static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -2174,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu) #include "paging_tmpl.h" #undef PTTYPE +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + u64 exb_bit_rsvd = 0; + + if (!is_nx(vcpu)) + exb_bit_rsvd = rsvd_bits(63, 63); + switch (level) { + case PT32_ROOT_LEVEL: + /* no rsvd bits for 2 level 4K page table entries */ + context->rsvd_bits_mask[0][1] = 0; + context->rsvd_bits_mask[0][0] = 0; + if (is_cpuid_PSE36()) + /* 36bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + else + /* 32 bits PSE 4MB page */ + context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + context->rsvd_bits_mask[1][0] = ~0ull; + break; + case PT32E_ROOT_LEVEL: + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 63) | + rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PDE */ + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62); /* PTE */ + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 62) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = ~0ull; + break; + case PT64_ROOT_LEVEL: + context->rsvd_bits_mask[0][3] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); + context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; + context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 20); /* large page */ + context->rsvd_bits_mask[1][0] = ~0ull; + break; + } +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -2194,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) static int paging64_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); } @@ -2201,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -2216,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) static int paging32E_init_context(struct kvm_vcpu *vcpu) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } @@ -2236,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { + reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -2251,17 +2274,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { + int r; + ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); + r = paging64_init_context(vcpu); else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu); else - return paging32_init_context(vcpu); + r = paging32_init_context(vcpu); + + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + + return r; } static int init_kvm_mmu(struct kvm_vcpu *vcpu) @@ -2299,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) goto out; spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); + r = mmu_alloc_roots(vcpu); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); + if (r) + goto out; kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: @@ -2492,7 +2523,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) + if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -2647,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { - struct kvm_mmu_page *sp; - - while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - cond_resched(); - } free_page((unsigned long)vcpu->arch.mmu.pae_root); } @@ -2719,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; - spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; @@ -2734,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) @@ -2906,8 +2927,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_x86_ops->tlb_flush(vcpu); - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } @@ -3017,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, " in nonleaf level: levels %d gva %lx" " level %d pte %llx\n", audit_msg, vcpu->arch.mmu.root_level, va, level, ent); - - audit_mappings_page(vcpu, ent, va, level - 1); + else + audit_mappings_page(vcpu, ent, va, level - 1); } else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; + gfn_t gfn = gpa >> PAGE_SHIFT; + pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); + hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) @@ -3130,7 +3152,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.metaphysical) + if (sp->role.direct) continue; gfn = unalias_gfn(vcpu->kvm, sp->gfn); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 258e5d56298..3494a2fb136 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -54,7 +54,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LME; + return vcpu->arch.shadow_efer & EFER_LMA; #else return 0; #endif @@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return vcpu->arch.cr0 & X86_CR0_PG; } +static inline int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9fd78b6e17a..258e4591e1c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,7 +25,6 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 - #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK @@ -42,7 +41,6 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 - #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK @@ -73,18 +71,6 @@ struct guest_walker { u32 error_code; }; -struct shadow_walker { - struct kvm_shadow_walk walker; - struct guest_walker *guest_walker; - int user_fault; - int write_fault; - int largepage; - int *ptwrite; - pfn_t pfn; - u64 *sptep; - gpa_t pte_gpa; -}; - static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -137,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; + int rsvd_fault = 0; pgprintk("%s: addr %lx\n", __func__, addr); walk: @@ -171,6 +158,10 @@ walk: if (!is_present_pte(pte)) goto not_present; + rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); + if (rsvd_fault) + goto access_error; + if (write_fault && !is_writeble_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -223,7 +214,6 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); walker->ptes[walker->level - 1] = pte; } @@ -247,6 +237,8 @@ err: walker->error_code |= PFERR_USER_MASK; if (fetch_fault) walker->error_code |= PFERR_FETCH_MASK; + if (rsvd_fault) + walker->error_code |= PFERR_RSVD_MASK; return 0; } @@ -276,98 +268,84 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, largepage, - gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), - pfn, true); + gpte_to_gfn(gpte), pfn, true); } /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ -static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *gw, + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); - struct guest_walker *gw = sw->guest_walker; unsigned access = gw->pt_access; struct kvm_mmu_page *shadow_page; - u64 spte; - int metaphysical; + u64 spte, *sptep; + int direct; gfn_t table_gfn; int r; + int level; pt_element_t curr_pte; + struct kvm_shadow_walk_iterator iterator; - if (level == PT_PAGE_TABLE_LEVEL - || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, - sw->user_fault, sw->write_fault, - gw->ptes[gw->level-1] & PT_DIRTY_MASK, - sw->ptwrite, sw->largepage, - gw->ptes[gw->level-1] & PT_GLOBAL_MASK, - gw->gfn, sw->pfn, false); - sw->sptep = sptep; - return 1; - } + if (!is_present_pte(gw->ptes[gw->level - 1])) + return NULL; - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - return 0; + for_each_shadow_entry(vcpu, addr, iterator) { + level = iterator.level; + sptep = iterator.sptep; + if (level == PT_PAGE_TABLE_LEVEL + || (largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, + gw->pte_access & access, + user_fault, write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + ptwrite, largepage, + gw->gfn, pfn, false); + break; + } - if (is_large_pte(*sptep)) { - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - rmap_remove(vcpu->kvm, sptep); - } + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + continue; - if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(gw->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = gw->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, - metaphysical, access, sptep); - if (!metaphysical) { - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw->ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); - kvm_release_pfn_clean(sw->pfn); - sw->sptep = NULL; - return 1; + if (is_large_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); } - } - spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; - return 0; -} - -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *guest_walker, - int user_fault, int write_fault, int largepage, - int *ptwrite, pfn_t pfn) -{ - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_walk_entry), }, - .guest_walker = guest_walker, - .user_fault = user_fault, - .write_fault = write_fault, - .largepage = largepage, - .ptwrite = ptwrite, - .pfn = pfn, - }; - - if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) - return NULL; + if (level == PT_DIRECTORY_LEVEL + && gw->level == PT_DIRECTORY_LEVEL) { + direct = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + } else { + direct = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + direct, access, sptep); + if (!direct) { + r = kvm_read_guest_atomic(vcpu->kvm, + gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_mmu_put_page(shadow_page, sptep); + kvm_release_pfn_clean(pfn); + sptep = NULL; + break; + } + } - walk_shadow(&walker.walker, vcpu, addr); + spte = __pa(shadow_page->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + } - return walker.sptep; + return sptep; } /* @@ -406,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return r; /* - * Look up the shadow pte for the faulting address. + * Look up the guest pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, fetch_fault); @@ -465,54 +443,56 @@ out_unlock: return 0; } -static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); - - /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_shadow_walk_iterator iterator; + pt_element_t gpte; + gpa_t pte_gpa = -1; + int level; + u64 *sptep; + int need_flush = 0; - sw->pte_gpa = (sp->gfn << PAGE_SHIFT); - sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + spin_lock(&vcpu->kvm->mmu_lock); - if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); - if (is_large_pte(*sptep)) - --vcpu->kvm->stat.lpages; + for_each_shadow_entry(vcpu, gva, iterator) { + level = iterator.level; + sptep = iterator.sptep; + + /* FIXME: properly handle invlpg on large guest pages */ + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + pte_gpa = (sp->gfn << PAGE_SHIFT); + pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + need_flush = 1; + } + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + break; } - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - return 1; - } - if (!is_shadow_present_pte(*sptep)) - return 1; - return 0; -} -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) -{ - pt_element_t gpte; - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_invlpg_entry), }, - .pte_gpa = -1, - }; + if (!is_shadow_present_pte(*sptep)) + break; + } - spin_lock(&vcpu->kvm->mmu_lock); - walk_shadow(&walker.walker, vcpu, gva); + if (need_flush) + kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); - if (walker.pte_gpa == -1) + + if (pte_gpa == -1) return; - if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) return; if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { if (mmu_topup_memory_caches(vcpu)) return; - kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, + kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, sizeof(pt_element_t), 0); } } @@ -540,7 +520,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, pt_element_t pt[256 / sizeof(pt_element_t)]; gpa_t pte_gpa; - if (sp->role.metaphysical + if (sp->role.direct || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { nonpaging_prefetch_page(vcpu, sp); return; @@ -610,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, + is_dirty_pte(gpte), 0, gfn, spte_to_pfn(sp->spt[i]), true, false); } @@ -619,7 +599,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef pt_element_t #undef guest_walker -#undef shadow_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a9e769e4e25..71510e07e69 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -19,6 +19,7 @@ #include "irq.h" #include "mmu.h" #include "kvm_cache_regs.h" +#include "x86.h" #include <linux/module.h> #include <linux/kernel.h> @@ -38,9 +39,6 @@ MODULE_LICENSE("GPL"); #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -#define DR7_GD_MASK (1 << 13) -#define DR6_BD_MASK (1 << 13) - #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 @@ -50,6 +48,15 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +/* Turn on to get debugging output*/ +/* #define NESTED_DEBUG */ + +#ifdef NESTED_DEBUG +#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args) +#else +#define nsvm_printk(fmt, args...) do {} while(0) +#endif + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -60,14 +67,28 @@ static int npt = 1; module_param(npt, int, S_IRUGO); -static void kvm_reput_irq(struct vcpu_svm *svm); +static int nested = 0; +module_param(nested, int, S_IRUGO); + static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_vmexit(struct vcpu_svm *svm); +static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque); +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code); + static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline bool is_nested(struct vcpu_svm *svm) +{ + return svm->nested_vmcb; +} + static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -111,24 +132,6 @@ static inline u32 svm_has(u32 feat) return svm_features & feat; } -static inline u8 pop_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - return irq; -} - -static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) -{ - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); -} - static inline void clgi(void) { asm volatile (__ex(SVM_CLGI)); @@ -157,32 +160,6 @@ static inline void kvm_write_cr2(unsigned long val) asm volatile ("mov %0, %%cr2" :: "r" (val)); } -static inline unsigned long read_dr6(void) -{ - unsigned long dr6; - - asm volatile ("mov %%dr6, %0" : "=r" (dr6)); - return dr6; -} - -static inline void write_dr6(unsigned long val) -{ - asm volatile ("mov %0, %%dr6" :: "r" (val)); -} - -static inline unsigned long read_dr7(void) -{ - unsigned long dr7; - - asm volatile ("mov %%dr7, %0" : "=r" (dr7)); - return dr7; -} - -static inline void write_dr7(unsigned long val) -{ - asm volatile ("mov %0, %%dr7" :: "r" (val)); -} - static inline void force_new_asid(struct kvm_vcpu *vcpu) { to_svm(vcpu)->asid_generation--; @@ -198,7 +175,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; - to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; vcpu->arch.shadow_efer = efer; } @@ -207,6 +184,11 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, { struct vcpu_svm *svm = to_svm(vcpu); + /* If we are within a nested VM we'd better #VMEXIT and let the + guest handle the exception */ + if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) @@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, svm->vmcb->control.event_inj_err = error_code; } -static bool svm_exception_injected(struct kvm_vcpu *vcpu) +static int is_external_interrupt(u32 info) +{ + info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +} + +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ret = 0; - return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); + if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) + ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; + return ret & mask; } -static int is_external_interrupt(u32 info) +static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { - info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); + struct vcpu_svm *svm = to_svm(vcpu); + + if (mask == 0) + svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + else + svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; + } static void skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!svm->next_rip) { - printk(KERN_DEBUG "%s: NOP\n", __func__); + if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != + EMULATE_DONE) + printk(KERN_DEBUG "%s: NOP\n", __func__); return; } if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) @@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) __func__, kvm_rip_read(vcpu), svm->next_rip); kvm_rip_write(vcpu, svm->next_rip); - svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - - vcpu->arch.interrupt_window_open = 1; + svm_set_interrupt_shadow(vcpu, 0); } static int has_svm(void) @@ -250,7 +246,7 @@ static int has_svm(void) const char *msg; if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svn: %s\n", msg); + printk(KERN_INFO "has_svm: %s\n", msg); return 0; } @@ -292,7 +288,7 @@ static void svm_hardware_enable(void *garbage) svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); + wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(svm_data->save_area) << PAGE_SHIFT); @@ -411,12 +407,19 @@ static __init int svm_hardware_setup(void) iopm_va = page_address(iopm_pages); memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); - clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME); + } + for_each_online_cpu(cpu) { r = svm_cpu_init(cpu); if (r) @@ -559,7 +562,7 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = MSR_EFER_SVME_MASK; + save->efer = EFER_SVME; save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; @@ -591,6 +594,9 @@ static void init_vmcb(struct vcpu_svm *svm) save->cr4 = 0; } force_new_asid(&svm->vcpu); + + svm->nested_vmcb = 0; + svm->vcpu.arch.hflags = HF_GIF_MASK; } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -615,6 +621,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct vcpu_svm *svm; struct page *page; struct page *msrpm_pages; + struct page *hsave_page; + struct page *nested_msrpm_pages; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -637,14 +645,25 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) goto uninit; + + nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!nested_msrpm_pages) + goto uninit; + svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); + hsave_page = alloc_page(GFP_KERNEL); + if (!hsave_page) + goto uninit; + svm->hsave = page_address(hsave_page); + + svm->nested_msrpm = page_address(nested_msrpm_pages); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - memset(svm->db_regs, 0, sizeof(svm->db_regs)); init_vmcb(svm); fx_init(&svm->vcpu); @@ -669,6 +688,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->hsave)); + __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -718,6 +739,16 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static void svm_set_vintr(struct vcpu_svm *svm) +{ + svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; +} + +static void svm_clear_vintr(struct vcpu_svm *svm) +{ + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); +} + static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) { struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; @@ -760,22 +791,51 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. + /* AMD's VMCB does not have an explicit unusable field, so emulate it + * for cross vendor migration purposes by "not present" */ - if (seg == VCPU_SREG_CS) - var->g = s->limit > 0xfffff; + var->unusable = !var->present || (var->type == 0); - /* - * Work around a bug where the busy flag in the tr selector - * isn't exposed - */ - if (seg == VCPU_SREG_TR) + switch (seg) { + case VCPU_SREG_CS: + /* + * SVM always stores 0 for the 'G' bit in the CS selector in + * the VMCB on a VMEXIT. This hurts cross-vendor migration: + * Intel's VMENTRY has a check on the 'G' bit. + */ + var->g = s->limit > 0xfffff; + break; + case VCPU_SREG_TR: + /* + * Work around a bug where the busy flag in the tr selector + * isn't exposed + */ var->type |= 0x2; - - var->unusable = !var->present; + break; + case VCPU_SREG_DS: + case VCPU_SREG_ES: + case VCPU_SREG_FS: + case VCPU_SREG_GS: + /* + * The accessed bit must always be set in the segment + * descriptor cache, although it can be cleared in the + * descriptor, the cached bit always remains at 1. Since + * Intel has a check on this, set it here to support + * cross-vendor migration. + */ + if (!var->unusable) + var->type |= 0x1; + break; + case VCPU_SREG_SS: + /* On AMD CPUs sometimes the DB bit in the segment + * descriptor is left as 1, although the whole segment has + * been made unusable. Clear it here to pass an Intel VMX + * entry check when cross vendor migrating. + */ + if (var->unusable) + var->db = 0; + break; + } } static int svm_get_cpl(struct kvm_vcpu *vcpu) @@ -905,19 +965,48 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static void update_db_intercept(struct kvm_vcpu *vcpu) { - return -EOPNOTSUPP; + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.intercept_exceptions &= + ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); + + if (vcpu->arch.singlestep) + svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); + + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + svm->vmcb->control.intercept_exceptions |= + 1 << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + svm->vmcb->control.intercept_exceptions |= + 1 << BP_VECTOR; + } else + vcpu->guest_debug = 0; } -static int svm_get_irq(struct kvm_vcpu *vcpu) +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { + int old_debug = vcpu->guest_debug; struct vcpu_svm *svm = to_svm(vcpu); - u32 exit_int_info = svm->vmcb->control.exit_int_info; - if (is_external_interrupt(exit_int_info)) - return exit_int_info & SVM_EVTINJ_VEC_MASK; - return -1; + vcpu->guest_debug = dbg->control; + + update_db_intercept(vcpu); + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; + else + svm->vmcb->save.dr7 = vcpu->arch.dr7; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + return 0; } static void load_host_msrs(struct kvm_vcpu *vcpu) @@ -949,7 +1038,29 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) { - unsigned long val = to_svm(vcpu)->db_regs[dr]; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long val; + + switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; + case 6: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + val = vcpu->arch.dr6; + else + val = svm->vmcb->save.dr6; + break; + case 7: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + val = vcpu->arch.dr7; + else + val = svm->vmcb->save.dr7; + break; + default: + val = 0; + } + KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); return val; } @@ -959,33 +1070,40 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; + KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); - if (svm->vmcb->save.dr7 & DR7_GD_MASK) { - svm->vmcb->save.dr7 &= ~DR7_GD_MASK; - svm->vmcb->save.dr6 |= DR6_BD_MASK; - *exception = DB_VECTOR; - return; - } + *exception = 0; switch (dr) { case 0 ... 3: - svm->db_regs[dr] = value; + vcpu->arch.db[dr] = value; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = value; return; case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) { + if (vcpu->arch.cr4 & X86_CR4_DE) *exception = UD_VECTOR; + return; + case 6: + if (value & 0xffffffff00000000ULL) { + *exception = GP_VECTOR; return; } - case 7: { - if (value & ~((1ULL << 32) - 1)) { + vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; + return; + case 7: + if (value & 0xffffffff00000000ULL) { *exception = GP_VECTOR; return; } - svm->vmcb->save.dr7 = value; + vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + svm->vmcb->save.dr7 = vcpu->arch.dr7; + vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); + } return; - } default: + /* FIXME: Possible case? */ printk(KERN_DEBUG "%s: unexpected dr %u\n", __func__, dr); *exception = UD_VECTOR; @@ -995,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 exit_int_info = svm->vmcb->control.exit_int_info; - struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; - bool event_injection = false; - - if (!irqchip_in_kernel(kvm) && - is_external_interrupt(exit_int_info)) { - event_injection = true; - push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); - } fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; @@ -1025,12 +1134,50 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) */ if (npt_enabled) svm_flush_tlb(&svm->vcpu); - - if (!npt_enabled && event_injection) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + else { + if (kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); + } return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } +static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (!(svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && + !svm->vcpu.arch.singlestep) { + kvm_queue_exception(&svm->vcpu, DB_VECTOR); + return 1; + } + + if (svm->vcpu.arch.singlestep) { + svm->vcpu.arch.singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) + svm->vmcb->save.rflags &= + ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(&svm->vcpu); + } + + if (svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = + svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = DB_VECTOR; + return 0; + } + + return 1; +} + +static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = BP_VECTOR; + return 0; +} + static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { int er; @@ -1080,7 +1227,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ - int size, down, in, string, rep; + int size, in, string; unsigned port; ++svm->vcpu.stat.io_exits; @@ -1099,8 +1246,6 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - rep = (io_info & SVM_IOIO_REP_MASK) != 0; - down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); @@ -1139,6 +1284,567 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int nested_svm_check_permissions(struct vcpu_svm *svm) +{ + if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + || !is_paging(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + + if (svm->vmcb->save.cpl) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } + + return 0; +} + +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code) +{ + if (is_nested(svm)) { + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + if (nested_svm_exit_handled(svm, false)) { + nsvm_printk("VMexit -> EXCP 0x%x\n", nr); + + nested_svm_vmexit(svm); + return 1; + } + } + + return 0; +} + +static inline int nested_svm_intr(struct vcpu_svm *svm) +{ + if (is_nested(svm)) { + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return 0; + + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return 0; + + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + + if (nested_svm_exit_handled(svm, false)) { + nsvm_printk("VMexit -> INTR\n"); + nested_svm_vmexit(svm); + return 1; + } + } + + return 0; +} + +static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) +{ + struct page *page; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); + + if (is_error_page(page)) { + printk(KERN_INFO "%s: could not find page at 0x%llx\n", + __func__, gpa); + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); + return NULL; + } + return page; +} + +static int nested_svm_do(struct vcpu_svm *svm, + u64 arg1_gpa, u64 arg2_gpa, void *opaque, + int (*handler)(struct vcpu_svm *svm, + void *arg1, + void *arg2, + void *opaque)) +{ + struct page *arg1_page; + struct page *arg2_page = NULL; + void *arg1; + void *arg2 = NULL; + int retval; + + arg1_page = nested_svm_get_page(svm, arg1_gpa); + if(arg1_page == NULL) + return 1; + + if (arg2_gpa) { + arg2_page = nested_svm_get_page(svm, arg2_gpa); + if(arg2_page == NULL) { + kvm_release_page_clean(arg1_page); + return 1; + } + } + + arg1 = kmap_atomic(arg1_page, KM_USER0); + if (arg2_gpa) + arg2 = kmap_atomic(arg2_page, KM_USER1); + + retval = handler(svm, arg1, arg2, opaque); + + kunmap_atomic(arg1, KM_USER0); + if (arg2_gpa) + kunmap_atomic(arg2, KM_USER1); + + kvm_release_page_dirty(arg1_page); + if (arg2_gpa) + kvm_release_page_dirty(arg2_page); + + return retval; +} + +static int nested_svm_exit_handled_real(struct vcpu_svm *svm, + void *arg1, + void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + bool kvm_overrides = *(bool *)opaque; + u32 exit_code = svm->vmcb->control.exit_code; + + if (kvm_overrides) { + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return 0; + /* For now we are always handling NPFs when using them */ + case SVM_EXIT_NPF: + if (npt_enabled) + return 0; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return 0; + break; + default: + break; + } + } + + switch (exit_code) { + case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); + if (nested_vmcb->control.intercept_cr_read & cr_bits) + return 1; + break; + } + case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); + if (nested_vmcb->control.intercept_cr_write & cr_bits) + return 1; + break; + } + case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); + if (nested_vmcb->control.intercept_dr_read & dr_bits) + return 1; + break; + } + case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); + if (nested_vmcb->control.intercept_dr_write & dr_bits) + return 1; + break; + } + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + if (nested_vmcb->control.intercept_exceptions & excp_bits) + return 1; + break; + } + default: { + u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); + nsvm_printk("exit code: 0x%x\n", exit_code); + if (nested_vmcb->control.intercept & exit_bits) + return 1; + } + } + + return 0; +} + +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, + void *arg1, void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + u8 *msrpm = (u8 *)arg2; + u32 t0, t1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 param = svm->vmcb->control.exit_info_1 & 1; + + if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return 0; + + switch(msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 0xc0011fff: + t0 = (16384 + msr - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + return 1; + break; + } + if (msrpm[t1] & ((1 << param) << t0)) + return 1; + + return 0; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +{ + bool k = kvm_override; + + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + return nested_svm_do(svm, svm->nested_vmcb, + svm->nested_vmcb_msrpm, NULL, + nested_svm_exit_handled_msr); + default: break; + } + + return nested_svm_do(svm, svm->nested_vmcb, 0, &k, + nested_svm_exit_handled_real); +} + +static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *hsave = svm->hsave; + u64 nested_save[] = { nested_vmcb->save.cr0, + nested_vmcb->save.cr3, + nested_vmcb->save.cr4, + nested_vmcb->save.efer, + nested_vmcb->control.intercept_cr_read, + nested_vmcb->control.intercept_cr_write, + nested_vmcb->control.intercept_dr_read, + nested_vmcb->control.intercept_dr_write, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept, + nested_vmcb->control.msrpm_base_pa, + nested_vmcb->control.iopm_base_pa, + nested_vmcb->control.tsc_offset }; + + /* Give the current vmcb to the guest */ + memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); + nested_vmcb->save.cr0 = nested_save[0]; + if (!npt_enabled) + nested_vmcb->save.cr3 = nested_save[1]; + nested_vmcb->save.cr4 = nested_save[2]; + nested_vmcb->save.efer = nested_save[3]; + nested_vmcb->control.intercept_cr_read = nested_save[4]; + nested_vmcb->control.intercept_cr_write = nested_save[5]; + nested_vmcb->control.intercept_dr_read = nested_save[6]; + nested_vmcb->control.intercept_dr_write = nested_save[7]; + nested_vmcb->control.intercept_exceptions = nested_save[8]; + nested_vmcb->control.intercept = nested_save[9]; + nested_vmcb->control.msrpm_base_pa = nested_save[10]; + nested_vmcb->control.iopm_base_pa = nested_save[11]; + nested_vmcb->control.tsc_offset = nested_save[12]; + + /* We always set V_INTR_MASKING and remember the old value in hflags */ + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && + (nested_vmcb->control.int_vector)) { + nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", + nested_vmcb->control.int_vector); + } + + /* Restore the original control entries */ + svm->vmcb->control = hsave->control; + + /* Kill any pending exceptions */ + if (svm->vcpu.arch.exception.pending == true) + nsvm_printk("WARNING: Pending Exception\n"); + svm->vcpu.arch.exception.pending = false; + + /* Restore selected save entries */ + svm->vmcb->save.es = hsave->save.es; + svm->vmcb->save.cs = hsave->save.cs; + svm->vmcb->save.ss = hsave->save.ss; + svm->vmcb->save.ds = hsave->save.ds; + svm->vmcb->save.gdtr = hsave->save.gdtr; + svm->vmcb->save.idtr = hsave->save.idtr; + svm->vmcb->save.rflags = hsave->save.rflags; + svm_set_efer(&svm->vcpu, hsave->save.efer); + svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); + svm_set_cr4(&svm->vcpu, hsave->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = hsave->save.cr3; + svm->vcpu.arch.cr3 = hsave->save.cr3; + } else { + kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + } + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); + svm->vmcb->save.dr7 = 0; + svm->vmcb->save.cpl = 0; + svm->vmcb->control.exit_int_info = 0; + + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + /* Exit nested SVM mode */ + svm->nested_vmcb = 0; + + return 0; +} + +static int nested_svm_vmexit(struct vcpu_svm *svm) +{ + nsvm_printk("VMexit\n"); + if (nested_svm_do(svm, svm->nested_vmcb, 0, + NULL, nested_svm_vmexit_real)) + return 1; + + kvm_mmu_reset_context(&svm->vcpu); + kvm_mmu_load(&svm->vcpu); + + return 0; +} + +static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + int i; + u32 *nested_msrpm = (u32*)arg1; + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) + svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); + + return 0; +} + +static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *hsave = svm->hsave; + + /* nested_vmcb is our indicator if nested SVM is activated */ + svm->nested_vmcb = svm->vmcb->save.rax; + + /* Clear internal status */ + svm->vcpu.arch.exception.pending = false; + + /* Save the old vmcb, so we don't need to pick what we save, but + can restore everything when a VMEXIT occurs */ + memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); + /* We need to remember the original CR3 in the SPT case */ + if (!npt_enabled) + hsave->save.cr3 = svm->vcpu.arch.cr3; + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rip = svm->next_rip; + + if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + svm->vcpu.arch.hflags |= HF_HIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + + /* Load the nested guest state */ + svm->vmcb->save.es = nested_vmcb->save.es; + svm->vmcb->save.cs = nested_vmcb->save.cs; + svm->vmcb->save.ss = nested_vmcb->save.ss; + svm->vmcb->save.ds = nested_vmcb->save.ds; + svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; + svm->vmcb->save.idtr = nested_vmcb->save.idtr; + svm->vmcb->save.rflags = nested_vmcb->save.rflags; + svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); + svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); + svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = nested_vmcb->save.cr3; + svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; + } else { + kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + kvm_mmu_reset_context(&svm->vcpu); + } + svm->vmcb->save.cr2 = nested_vmcb->save.cr2; + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + /* In case we don't even reach vcpu_run, the fields are not updated */ + svm->vmcb->save.rax = nested_vmcb->save.rax; + svm->vmcb->save.rsp = nested_vmcb->save.rsp; + svm->vmcb->save.rip = nested_vmcb->save.rip; + svm->vmcb->save.dr7 = nested_vmcb->save.dr7; + svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vmcb->save.cpl = nested_vmcb->save.cpl; + + /* We don't want a nested guest to be more powerful than the guest, + so all intercepts are ORed */ + svm->vmcb->control.intercept_cr_read |= + nested_vmcb->control.intercept_cr_read; + svm->vmcb->control.intercept_cr_write |= + nested_vmcb->control.intercept_cr_write; + svm->vmcb->control.intercept_dr_read |= + nested_vmcb->control.intercept_dr_read; + svm->vmcb->control.intercept_dr_write |= + nested_vmcb->control.intercept_dr_write; + svm->vmcb->control.intercept_exceptions |= + nested_vmcb->control.intercept_exceptions; + + svm->vmcb->control.intercept |= nested_vmcb->control.intercept; + + svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + + force_new_asid(&svm->vcpu); + svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; + svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err; + svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; + if (nested_vmcb->control.int_ctl & V_IRQ_MASK) { + nsvm_printk("nSVM Injecting Interrupt: 0x%x\n", + nested_vmcb->control.int_ctl); + } + if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) + svm->vcpu.arch.hflags |= HF_VINTR_MASK; + else + svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + + nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n", + nested_vmcb->control.exit_int_info, + nested_vmcb->control.int_state); + + svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; + svm->vmcb->control.int_state = nested_vmcb->control.int_state; + svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; + if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID) + nsvm_printk("Injecting Event: 0x%x\n", + nested_vmcb->control.event_inj); + svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; + svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + + svm->vcpu.arch.hflags |= HF_GIF_MASK; + + return 0; +} + +static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + to_vmcb->save.fs = from_vmcb->save.fs; + to_vmcb->save.gs = from_vmcb->save.gs; + to_vmcb->save.tr = from_vmcb->save.tr; + to_vmcb->save.ldtr = from_vmcb->save.ldtr; + to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; + to_vmcb->save.star = from_vmcb->save.star; + to_vmcb->save.lstar = from_vmcb->save.lstar; + to_vmcb->save.cstar = from_vmcb->save.cstar; + to_vmcb->save.sfmask = from_vmcb->save.sfmask; + to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; + to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; + to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; + + return 1; +} + +static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque) +{ + return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); +} + +static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque) +{ + return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); +} + +static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); + + return 1; +} + +static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); + + return 1; +} + +static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + nsvm_printk("VMrun\n"); + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + if (nested_svm_do(svm, svm->vmcb->save.rax, 0, + NULL, nested_svm_vmrun)) + return 1; + + if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, + NULL, nested_svm_vmrun_msrpm)) + return 1; + + return 1; +} + +static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + svm->vcpu.arch.hflags |= HF_GIF_MASK; + + return 1; +} + +static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + + /* After a CLGI no interrupts should come */ + svm_clear_vintr(svm); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + + return 1; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1150,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u16 tss_selector; + int reason; + int int_type = svm->vmcb->control.exit_int_info & + SVM_EXITINTINFO_TYPE_MASK; + int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; + uint32_t type = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; + uint32_t idt_v = + svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; tss_selector = (u16)svm->vmcb->control.exit_info_1; + if (svm->vmcb->control.exit_info_2 & (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) - return kvm_task_switch(&svm->vcpu, tss_selector, - TASK_SWITCH_IRET); - if (svm->vmcb->control.exit_info_2 & - (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) - return kvm_task_switch(&svm->vcpu, tss_selector, - TASK_SWITCH_JMP); - return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); + reason = TASK_SWITCH_IRET; + else if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) + reason = TASK_SWITCH_JMP; + else if (idt_v) + reason = TASK_SWITCH_GATE; + else + reason = TASK_SWITCH_CALL; + + if (reason == TASK_SWITCH_GATE) { + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = false; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + kvm_clear_exception_queue(&svm->vcpu); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_clear_interrupt_queue(&svm->vcpu); + break; + default: + break; + } + } + + if (reason != TASK_SWITCH_GATE || + int_type == SVM_EXITINTINFO_TYPE_SOFT || + (int_type == SVM_EXITINTINFO_TYPE_EXEPT && + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) + skip_emulated_instruction(&svm->vcpu); + + return kvm_task_switch(&svm->vcpu, tss_selector, reason); } static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1170,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + ++svm->vcpu.stat.nmi_window_exits; + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vcpu.arch.hflags |= HF_IRET_MASK; + return 1; +} + static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) @@ -1187,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm, static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + u8 cr8_prev = kvm_get_cr8(&svm->vcpu); + /* instruction emulation calls kvm_set_cr8() */ emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (irqchip_in_kernel(svm->vcpu.kvm)) { + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + return 1; + } + if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return 1; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; @@ -1250,6 +2004,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_LASTINTTOIP: *data = svm->vmcb->save.last_excp_to; break; + case MSR_VM_HSAVE_PA: + *data = svm->hsave_msr; + break; + case MSR_VM_CR: + *data = 0; + break; + case MSR_IA32_UCODE_REV: + *data = 0x01000065; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } @@ -1344,6 +2107,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); break; + case MSR_VM_HSAVE_PA: + svm->hsave_msr = data; + break; default: return kvm_set_msr_common(vcpu, ecx, data); } @@ -1380,14 +2146,15 @@ static int interrupt_window_interception(struct vcpu_svm *svm, { KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); - svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); + svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* * If the user space waits to inject interrupts, exit as soon as * possible */ - if (kvm_run->request_interrupt_window && - !svm->vcpu.arch.irq_summary) { + if (!irqchip_in_kernel(svm->vcpu.kvm) && + kvm_run->request_interrupt_window && + !kvm_cpu_has_interrupt(&svm->vcpu)) { ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; @@ -1417,6 +2184,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, + [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, @@ -1428,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VINTR] = interrupt_window_interception, /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, @@ -1436,12 +2206,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, - [SVM_EXIT_VMRUN] = invalid_op_interception, + [SVM_EXIT_VMRUN] = vmrun_interception, [SVM_EXIT_VMMCALL] = vmmcall_interception, - [SVM_EXIT_VMLOAD] = invalid_op_interception, - [SVM_EXIT_VMSAVE] = invalid_op_interception, - [SVM_EXIT_STGI] = invalid_op_interception, - [SVM_EXIT_CLGI] = invalid_op_interception, + [SVM_EXIT_VMLOAD] = vmload_interception, + [SVM_EXIT_VMSAVE] = vmsave_interception, + [SVM_EXIT_STGI] = stgi_interception, + [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = invalid_op_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, @@ -1457,6 +2227,17 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + if (is_nested(svm)) { + nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", + exit_code, svm->vmcb->control.exit_info_1, + svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); + if (nested_svm_exit_handled(svm, true)) { + nested_svm_vmexit(svm); + nsvm_printk("-> #VMEXIT\n"); + return 1; + } + } + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -1477,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } } - kvm_reput_irq(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -1488,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, @@ -1525,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm) new_asid(svm, svm_data); } +static void svm_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + vcpu->arch.hflags |= HF_NMI_MASK; + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + ++vcpu->stat.nmi_injections; +} static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { @@ -1540,121 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) +static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) { struct vcpu_svm *svm = to_svm(vcpu); - svm_inject_irq(svm, irq); + svm->vmcb->control.event_inj = nr | + SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static void update_cr8_intercept(struct kvm_vcpu *vcpu) +static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - int max_irr, tpr; - if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) - return; + nested_svm_intr(svm); - vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); +} - max_irr = kvm_lapic_find_highest_irr(vcpu); - if (max_irr == -1) - return; +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + struct vcpu_svm *svm = to_svm(vcpu); - tpr = kvm_lapic_get_cr8(vcpu) << 4; + if (irr == -1) + return; - if (tpr >= (max_irr & 0xf0)) - vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; + if (tpr >= irr) + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; } -static void svm_intr_assist(struct kvm_vcpu *vcpu) +static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int intr_vector = -1; - - if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && - ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { - intr_vector = vmcb->control.exit_int_info & - SVM_EVTINJ_VEC_MASK; - vmcb->control.exit_int_info = 0; - svm_inject_irq(svm, intr_vector); - goto out; - } - - if (vmcb->control.int_ctl & V_IRQ_MASK) - goto out; - - if (!kvm_cpu_has_interrupt(vcpu)) - goto out; - - if (!(vmcb->save.rflags & X86_EFLAGS_IF) || - (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { - /* unable to deliver irq, set pending irq */ - vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); - svm_inject_irq(svm, 0x0); - goto out; - } - /* Okay, we can deliver the interrupt: grab it and update PIC state. */ - intr_vector = kvm_cpu_get_interrupt(vcpu); - svm_inject_irq(svm, intr_vector); -out: - update_cr8_intercept(vcpu); + return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); } -static void kvm_reput_irq(struct vcpu_svm *svm) +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { - struct vmcb_control_area *control = &svm->vmcb->control; - - if ((control->int_ctl & V_IRQ_MASK) - && !irqchip_in_kernel(svm->vcpu.kvm)) { - control->int_ctl &= ~V_IRQ_MASK; - push_irq(&svm->vcpu, control->int_vector); - } - - svm->vcpu.arch.interrupt_window_open = - !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + return (vmcb->save.rflags & X86_EFLAGS_IF) && + !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + (svm->vcpu.arch.hflags & HF_GIF_MASK); } -static void svm_do_inject_vector(struct vcpu_svm *svm) +static void enable_irq_window(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - svm_inject_irq(svm, irq); + svm_set_vintr(to_svm(vcpu)); + svm_inject_irq(to_svm(vcpu), 0x0); } -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - svm->vcpu.arch.interrupt_window_open = - (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vmcb->save.rflags & X86_EFLAGS_IF)); + if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) + == HF_NMI_MASK) + return; /* IRET will cause a vm exit */ - if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ - svm_do_inject_vector(svm); - - /* - * Interrupts blocked. Wait for unblock. - */ - if (!svm->vcpu.arch.interrupt_window_open && - (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) - control->intercept |= 1ULL << INTERCEPT_VINTR; - else - control->intercept &= ~(1ULL << INTERCEPT_VINTR); + /* Something prevents NMI from been injected. Single step over + possible problem (IRET or exception injection or interrupt + shadow) */ + vcpu->arch.singlestep = true; + svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + update_db_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -1662,22 +2401,6 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void save_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); - asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1])); - asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2])); - asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3])); -} - -static void load_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0])); - asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1])); - asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2])); - asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); -} - static void svm_flush_tlb(struct kvm_vcpu *vcpu) { force_new_asid(vcpu); @@ -1693,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; - kvm_lapic_set_tpr(vcpu, cr8); + kvm_set_cr8(vcpu, cr8); } } @@ -1702,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (!irqchip_in_kernel(vcpu->kvm)) - return; - cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_interrupts(struct vcpu_svm *svm) +{ + u8 vector; + int type; + u32 exitintinfo = svm->vmcb->control.exit_int_info; + + if (svm->vcpu.arch.hflags & HF_IRET_MASK) + svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + + svm->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + if (!(exitintinfo & SVM_EXITINTINFO_VALID)) + return; + + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; + type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + + switch (type) { + case SVM_EXITINTINFO_TYPE_NMI: + svm->vcpu.arch.nmi_injected = true; + break; + case SVM_EXITINTINFO_TYPE_EXEPT: + /* In case of software exception do not reinject an exception + vector, but re-execute and instruction instead */ + if (kvm_exception_is_soft(vector)) + break; + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { + u32 err = svm->vmcb->control.exit_int_info_err; + kvm_queue_exception_e(&svm->vcpu, vector, err); + + } else + kvm_queue_exception(&svm->vcpu, vector); + break; + case SVM_EXITINTINFO_TYPE_INTR: + kvm_queue_interrupt(&svm->vcpu, vector, false); + break; + default: + break; + } +} + #ifdef CONFIG_X86_64 #define R "r" #else @@ -1736,19 +2499,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); svm->host_cr2 = kvm_read_cr2(); - svm->host_dr6 = read_dr6(); - svm->host_dr7 = read_dr7(); - svm->vmcb->save.cr2 = vcpu->arch.cr2; + if (!is_nested(svm)) + svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; - if (svm->vmcb->save.dr7 & 0xff) { - write_dr7(0); - save_db_regs(svm->host_db_regs); - load_db_regs(svm->db_regs); - } - clgi(); local_irq_enable(); @@ -1824,16 +2580,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); - if ((svm->vmcb->save.dr7 & 0xff)) - load_db_regs(svm->host_db_regs); - vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - write_dr6(svm->host_dr6); - write_dr7(svm->host_dr7); kvm_write_cr2(svm->host_cr2); kvm_load_fs(fs_selector); @@ -1850,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sync_cr8_to_lapic(vcpu); svm->next_rip = 0; + + svm_complete_interrupts(svm); } #undef R @@ -1915,7 +2668,7 @@ static int get_npt_level(void) #endif } -static int svm_get_mt_mask_shift(void) +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; } @@ -1965,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = { .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = svm_set_interrupt_shadow, + .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .get_irq = svm_get_irq, .set_irq = svm_set_irq, + .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, - .exception_injected = svm_exception_injected, - .inject_pending_irq = svm_intr_assist, - .inject_pending_vectors = do_interrupt_requests, + .interrupt_allowed = svm_interrupt_allowed, + .nmi_allowed = svm_nmi_allowed, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, - .get_mt_mask_shift = svm_get_mt_mask_shift, + .get_mt_mask = svm_get_mt_mask, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c new file mode 100644 index 00000000000..86dbac072d0 --- /dev/null +++ b/arch/x86/kvm/timer.c @@ -0,0 +1,46 @@ +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/hrtimer.h> +#include <asm/atomic.h> +#include "kvm_timer.h" + +static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) +{ + int restart_timer = 0; + wait_queue_head_t *q = &vcpu->wq; + + /* FIXME: this code should not know anything about vcpus */ + if (!atomic_inc_and_test(&ktimer->pending)) + set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + + if (!ktimer->reinject) + atomic_set(&ktimer->pending, 1); + + if (waitqueue_active(q)) + wake_up_interruptible(q); + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + restart_timer = 1; + } + + return restart_timer; +} + +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) +{ + int restart_timer; + struct kvm_vcpu *vcpu; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + + vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; + if (!vcpu) + return HRTIMER_NORESTART; + + restart_timer = __kvm_timer_fn(vcpu, ktimer); + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7611af57682..e770bf349ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -32,26 +32,27 @@ #include <asm/desc.h> #include <asm/vmx.h> #include <asm/virtext.h> +#include <asm/mce.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int bypass_guest_pf = 1; -module_param(bypass_guest_pf, bool, 0); +static int __read_mostly bypass_guest_pf = 1; +module_param(bypass_guest_pf, bool, S_IRUGO); -static int enable_vpid = 1; -module_param(enable_vpid, bool, 0); +static int __read_mostly enable_vpid = 1; +module_param_named(vpid, enable_vpid, bool, 0444); -static int flexpriority_enabled = 1; -module_param(flexpriority_enabled, bool, 0); +static int __read_mostly flexpriority_enabled = 1; +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); -static int enable_ept = 1; -module_param(enable_ept, bool, 0); +static int __read_mostly enable_ept = 1; +module_param_named(ept, enable_ept, bool, S_IRUGO); -static int emulate_invalid_guest_state = 0; -module_param(emulate_invalid_guest_state, bool, 0); +static int __read_mostly emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, S_IRUGO); struct vmcs { u32 revision_id; @@ -91,11 +92,13 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; + enum emulation_result invalid_state_emulation_result; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; + u32 exit_reason; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -110,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); -static struct page *vmx_io_bitmap_a; -static struct page *vmx_io_bitmap_b; -static struct page *vmx_msr_bitmap; +static unsigned long *vmx_io_bitmap_a; +static unsigned long *vmx_io_bitmap_b; +static unsigned long *vmx_msr_bitmap_legacy; +static unsigned long *vmx_msr_bitmap_longmode; static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -189,21 +193,21 @@ static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_no_device(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_invalid_opcode(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_external_interrupt(u32 intr_info) @@ -212,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static inline int is_machine_check(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + static inline int cpu_has_vmx_msr_bitmap(void) { - return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; } static inline int cpu_has_vmx_tpr_shadow(void) { - return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } static inline int vm_need_tpr_shadow(struct kvm *kvm) { - return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); + return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); } static inline int cpu_has_secondary_exec_ctrls(void) { - return (vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { - return flexpriority_enabled - && (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); } static inline int cpu_has_vmx_invept_individual_addr(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); } static inline int cpu_has_vmx_invept_context(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); } static inline int cpu_has_vmx_invept_global(void) { - return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); + return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); } static inline int cpu_has_vmx_ept(void) { - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_EPT); -} - -static inline int vm_need_ept(void) -{ - return (cpu_has_vmx_ept() && enable_ept); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT; } static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return ((cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm))); + return flexpriority_enabled && + (cpu_has_vmx_virtualize_apic_accesses()) && + (irqchip_in_kernel(kvm)); } static inline int cpu_has_vmx_vpid(void) { - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VPID); + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; } static inline int cpu_has_virtual_nmis(void) @@ -283,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void) return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } +static inline bool report_flexpriority(void) +{ + return flexpriority_enabled; +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -380,7 +397,7 @@ static inline void ept_sync_global(void) static inline void ept_sync_context(u64 eptp) { - if (vm_need_ept()) { + if (enable_ept) { if (cpu_has_vmx_invept_context()) __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); else @@ -390,7 +407,7 @@ static inline void ept_sync_context(u64 eptp) static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) { - if (vm_need_ept()) { + if (enable_ept) { if (cpu_has_vmx_invept_individual_addr()) __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, eptp, gpa); @@ -477,14 +494,19 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; - if (vcpu->guest_debug.enabled) - eb |= 1u << DB_VECTOR; - if (vcpu->arch.rmode.active) + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + eb |= 1u << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + eb |= 1u << BP_VECTOR; + } + if (vcpu->arch.rmode.vm86_active) eb = ~0; - if (vm_need_ept()) + if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -718,63 +740,84 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.active) + if (vcpu->arch.rmode.vm86_active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + int ret = 0; + + if (interruptibility & GUEST_INTR_STATE_STI) + ret |= X86_SHADOW_INT_STI; + if (interruptibility & GUEST_INTR_STATE_MOV_SS) + ret |= X86_SHADOW_INT_MOV_SS; + + return ret & mask; +} + +static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = interruptibility_old; + + interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); + + if (mask & X86_SHADOW_INT_MOV_SS) + interruptibility |= GUEST_INTR_STATE_MOV_SS; + if (mask & X86_SHADOW_INT_STI) + interruptibility |= GUEST_INTR_STATE_STI; + + if ((interruptibility != interruptibility_old)) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); +} + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; - u32 interruptibility; rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_rip_write(vcpu, rip); - /* - * We emulated an instruction, so temporary interrupt blocking - * should be removed, if set. - */ - interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - if (interruptibility & 3) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - interruptibility & ~3); - vcpu->arch.interrupt_window_open = 1; + /* skipping an emulated instruction also counts */ + vmx_set_interrupt_shadow(vcpu, 0); } static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (has_error_code) + if (has_error_code) { vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (nr == BP_VECTOR) + if (nr == BP_VECTOR || nr == OF_VECTOR) vmx->rmode.irq.rip++; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_SOFT_INTR - | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); + intr_info |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); -} + if (kvm_exception_is_soft(nr)) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + } else + intr_info |= INTR_TYPE_HARD_EXCEPTION; -static bool vmx_exception_injected(struct kvm_vcpu *vcpu) -{ - return false; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } /* @@ -802,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs; + unsigned long *msr_bitmap; vmx_load_host_state(vmx); save_nmsrs = 0; @@ -837,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) __find_msr_index(vmx, MSR_KERNEL_GS_BASE); #endif vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); + + if (cpu_has_vmx_msr_bitmap()) { + if (is_long_mode(&vmx->vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); + } } /* @@ -856,11 +909,8 @@ static u64 guest_read_tsc(void) * writes 'guest_tsc' into guest's timestamp counter "register" * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc */ -static void guest_write_tsc(u64 guest_tsc) +static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) { - u64 host_tsc; - - rdtscll(host_tsc); vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); } @@ -925,14 +975,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr; + u64 host_tsc; int ret = 0; switch (msr_index) { -#ifdef CONFIG_X86_64 case MSR_EFER: vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); break; +#ifdef CONFIG_X86_64 case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); break; @@ -950,7 +1001,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TIME_STAMP_COUNTER: - guest_write_tsc(data); + rdtscll(host_tsc); + guest_write_tsc(data, host_tsc); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -999,51 +1051,32 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - unsigned long dr7 = 0x400; - int old_singlestep; - - old_singlestep = vcpu->guest_debug.singlestep; - - vcpu->guest_debug.enabled = dbg->enabled; - if (vcpu->guest_debug.enabled) { - int i; + int old_debug = vcpu->guest_debug; + unsigned long flags; - dr7 |= 0x200; /* exact */ - for (i = 0; i < 4; ++i) { - if (!dbg->breakpoints[i].enabled) - continue; - vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; - dr7 |= 2 << (i*2); /* global enable */ - dr7 |= 0 << (i*4+16); /* execution breakpoint */ - } + vcpu->guest_debug = dbg->control; + if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + vcpu->guest_debug = 0; - vcpu->guest_debug.singlestep = dbg->singlestep; - } else - vcpu->guest_debug.singlestep = 0; - - if (old_singlestep && !vcpu->guest_debug.singlestep) { - unsigned long flags; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); + else + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(GUEST_RFLAGS); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - } + vmcs_writel(GUEST_RFLAGS, flags); update_exception_bitmap(vcpu); - vmcs_writel(GUEST_DR7, dr7); return 0; } -static int vmx_get_irq(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.interrupt.pending) - return -1; - return vcpu->arch.interrupt.nr; -} - static __init int cpu_has_kvm_support(void) { return cpu_has_vmx(); @@ -1244,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -1297,6 +1330,18 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + + if (!cpu_has_vmx_ept()) + enable_ept = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + return alloc_kvm_area(); } @@ -1327,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.active = 0; + vcpu->arch.rmode.vm86_active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); @@ -1389,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.active = 1; + vcpu->arch.rmode.vm86_active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1433,6 +1478,29 @@ continue_rmode: init_rmode(vcpu->kvm); } +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + vcpu->arch.shadow_efer = efer; + if (!msr) + return; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_IA32E_MODE); + msr->data = efer; + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + #ifdef CONFIG_X86_64 static void enter_lmode(struct kvm_vcpu *vcpu) @@ -1447,13 +1515,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - - find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - | VM_ENTRY_IA32E_MODE); + vmx_set_efer(vcpu, vcpu->arch.shadow_efer); } static void exit_lmode(struct kvm_vcpu *vcpu) @@ -1470,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_vcpu_all(to_vmx(vcpu)); - if (vm_need_ept()) + if (enable_ept) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } @@ -1540,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) + if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) + if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 @@ -1555,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (vm_need_ept()) + if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -1584,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) u64 eptp; guest_cr3 = cr3; - if (vm_need_ept()) { + if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); ept_sync_context(eptp); @@ -1601,41 +1664,17 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? + unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (vm_need_ept()) + if (enable_ept) ept_update_paging_mode_cr4(&hw_cr4, vcpu); vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); } -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - vcpu->arch.shadow_efer = efer; - if (!msr) - return; - if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); - msr->data = efer; - - } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); - - msr->data = efer & ~EFER_LME; - } - setup_msrs(vmx); -} - static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1653,7 +1692,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = vmcs_read32(sf->limit); var->selector = vmcs_read16(sf->selector); ar = vmcs_read32(sf->ar_bytes); - if (ar & AR_UNUSABLE_MASK) + if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; var->s = (ar >> 4) & 1; @@ -1708,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { + if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { vcpu->arch.rmode.tr.selector = var->selector; vcpu->arch.rmode.tr.base = var->base; vcpu->arch.rmode.tr.limit = var->limit; @@ -1718,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.active && var->s) { + if (vcpu->arch.rmode.vm86_active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1788,14 +1827,16 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); cs_rpl = cs.selector & SELECTOR_RPL_MASK; + if (cs.unusable) + return false; if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; - if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.type & AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; - } else if (cs.type & AR_TYPE_CODE_MASK) { + } else { if (cs.dpl != cs_rpl) return false; } @@ -1814,7 +1855,9 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); ss_rpl = ss.selector & SELECTOR_RPL_MASK; - if ((ss.type != 3) || (ss.type != 7)) + if (ss.unusable) + return true; + if (ss.type != 3 && ss.type != 7) return false; if (!ss.s) return false; @@ -1834,6 +1877,8 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) vmx_get_segment(vcpu, &var, seg); rpl = var.selector & SELECTOR_RPL_MASK; + if (var.unusable) + return true; if (!var.s) return false; if (!var.present) @@ -1855,9 +1900,11 @@ static bool tr_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + if (tr.unusable) + return false; if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ return false; - if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; if (!tr.present) return false; @@ -1871,6 +1918,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + if (ldtr.unusable) + return true; if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ return false; if (ldtr.type != 2) @@ -1981,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm) pfn_t identity_map_pfn; u32 tmp; - if (!vm_need_ept()) + if (!enable_ept) return 1; if (unlikely(!kvm->arch.ept_identity_pagetable)) { printk(KERN_ERR "EPT: identity-mapping pagetable " @@ -2070,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx) int vpid; vmx->vpid = 0; - if (!enable_vpid || !cpu_has_vmx_vpid()) + if (!enable_vpid) return; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); @@ -2081,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) { - void *va; + int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) return; @@ -2093,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) * have the write-low and read-high bitmap offsets the wrong way round. * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ - va = kmap(msr_bitmap); if (msr <= 0x1fff) { - __clear_bit(msr, va + 0x000); /* read-low */ - __clear_bit(msr, va + 0x800); /* write-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, va + 0x400); /* read-high */ - __clear_bit(msr, va + 0xc00); /* write-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ } - kunmap(msr_bitmap); +} + +static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) +{ + if (!longmode_only) + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); } /* @@ -2112,7 +2166,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat; + u64 host_pat, tsc_this, tsc_base; unsigned long a; struct descriptor_table dt; int i; @@ -2120,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) u32 exec_control; /* I/O */ - vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); + vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ @@ -2140,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING; #endif } - if (!vm_need_ept()) + if (!enable_ept) exec_control |= CPU_BASED_CR3_STORE_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_INVLPG_EXITING; @@ -2153,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!vm_need_ept()) + if (!enable_ept) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -2240,6 +2294,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; + rdtscll(tsc_this); + if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) + tsc_base = tsc_this; + + guest_write_tsc(0, tsc_base); return 0; } @@ -2266,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) goto out; } - vmx->vcpu.arch.rmode.active = 0; + vmx->vcpu.arch.rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -2319,7 +2379,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0); kvm_register_write(vcpu, VCPU_REGS_RSP, 0); - /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); vmcs_writel(GUEST_GDTR_BASE, 0); @@ -2332,8 +2391,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - guest_write_tsc(0); - /* Special registers */ vmcs_write64(GUEST_IA32_DEBUGCTL, 0); @@ -2398,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) +static void vmx_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t intr; + int irq = vcpu->arch.interrupt.nr; KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); ++vcpu->stat.irq_injections; - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2415,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); + intr = irq | INTR_INFO_VALID_MASK; + if (vcpu->arch.interrupt.soft) { + intr |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + } else + intr |= INTR_TYPE_EXT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -2437,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; - if (vcpu->arch.rmode.active) { + if (vcpu->arch.rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = NMI_VECTOR; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2452,71 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } -static void vmx_update_window_states(struct kvm_vcpu *vcpu) +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - - vcpu->arch.nmi_window_open = - !(guest_intr & (GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_NMI)); if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - vcpu->arch.nmi_window_open = 0; - - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(guest_intr & (GUEST_INTR_STATE_STI | - GUEST_INTR_STATE_MOV_SS))); -} - -static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) -{ - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; + return 0; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); - kvm_queue_interrupt(vcpu, irq); + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_NMI)); } -static void do_interrupt_requests(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - vmx_update_window_states(vcpu); - - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vcpu->arch.nmi_window_open) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_nmi_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (vcpu->arch.irq_summary - || kvm_run->request_interrupt_window) - enable_irq_window(vcpu); - return; - } - - if (vcpu->arch.interrupt_window_open) { - if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) - kvm_do_inject_irq(vcpu); - - if (vcpu->arch.interrupt.pending) - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - } - if (!vcpu->arch.interrupt_window_open && - (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - enable_irq_window(vcpu); + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -2536,24 +2551,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) -{ - struct kvm_guest_debug *dbg = &vcpu->guest_debug; - - set_debugreg(dbg->bp[0], 0); - set_debugreg(dbg->bp[1], 1); - set_debugreg(dbg->bp[2], 2); - set_debugreg(dbg->bp[3], 3); - - if (dbg->singlestep) { - unsigned long flags; - - flags = vmcs_readl(GUEST_RFLAGS); - flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - vmcs_writel(GUEST_RFLAGS, flags); - } -} - static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { @@ -2570,9 +2567,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * the required debugging infrastructure rework. */ switch (vec) { - case DE_VECTOR: case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return 0; + kvm_queue_exception(vcpu, vec); + return 1; case BP_VECTOR: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return 0; + /* fall through */ + case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: case UD_VECTOR: @@ -2586,28 +2591,50 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, return 0; } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s, 0); +#endif +} + +static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + /* already handled by vcpu_run */ + return 1; +} + static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info, error_code; - unsigned long cr2, rip; + u32 intr_info, ex_no, error_code; + unsigned long cr2, rip, dr6; u32 vect_info; enum emulation_result er; vect_info = vmx->idt_vectoring_info; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (is_machine_check(intr_info)) + return handle_machine_check(vcpu, kvm_run); + if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " "intr info 0x%x\n", __func__, vect_info, intr_info); - if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { - int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -2629,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ - if (vm_need_ept()) + if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); - if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->arch.rmode.active && + if (vcpu->arch.rmode.vm86_active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { if (vcpu->arch.halt_request) { @@ -2649,14 +2676,30 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_EXCEPTION | 1)) { + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + switch (ex_no) { + case DB_VECTOR: + dr6 = vmcs_readl(EXIT_QUALIFICATION); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + /* fall through */ + case BP_VECTOR: kvm_run->exit_reason = KVM_EXIT_DEBUG; - return 0; + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; + break; + default: + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; + break; } - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; - kvm_run->ex.error_code = error_code; return 0; } @@ -2677,7 +2720,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { unsigned long exit_qualification; - int size, down, in, string, rep; + int size, in, string; unsigned port; ++vcpu->stat.io_exits; @@ -2693,8 +2736,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) size = (exit_qualification & 7) + 1; in = (exit_qualification & 8) != 0; - down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; skip_emulated_instruction(vcpu); @@ -2740,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; - case 8: - kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); - skip_emulated_instruction(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; + case 8: { + u8 cr8_prev = kvm_get_cr8(vcpu); + u8 cr8 = kvm_register_read(vcpu, reg); + kvm_set_cr8(vcpu, cr8); + skip_emulated_instruction(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + return 1; + if (cr8_prev <= cr8) + return 1; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } }; break; case 2: /* clts */ @@ -2795,21 +2841,44 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long val; int dr, reg; - /* - * FIXME: this code assumes the host is debugging the guest. - * need to deal with guest debugging itself too. - */ + dr = vmcs_readl(GUEST_DR7); + if (dr & DR7_GD) { + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + kvm_run->debug.arch.dr6 = vcpu->arch.dr6; + kvm_run->debug.arch.dr7 = dr; + kvm_run->debug.arch.pc = + vmcs_readl(GUEST_CS_BASE) + + vmcs_readl(GUEST_RIP); + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 |= DR6_BD; + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & 7; - reg = (exit_qualification >> 8) & 15; - if (exit_qualification & 16) { - /* mov from dr */ + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; case 6: - val = 0xffff0ff0; + val = vcpu->arch.dr6; break; case 7: - val = 0x400; + val = vcpu->arch.dr7; break; default: val = 0; @@ -2817,7 +2886,38 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_register_write(vcpu, reg, val); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { - /* mov to dr */ + val = vcpu->arch.regs[reg]; + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4 ... 5: + if (vcpu->arch.cr4 & X86_CR4_DE) + kvm_queue_exception(vcpu, UD_VECTOR); + break; + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_queue_exception(vcpu, GP_VECTOR); + break; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 7: + if (val & 0xffffffff00000000ULL) { + kvm_queue_exception(vcpu, GP_VECTOR); + break; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = + (val & DR7_BP_EN_MASK); + } + break; + } + KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; @@ -2890,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * If the user space waits to inject interrupts, exit as soon as * possible */ - if (kvm_run->request_interrupt_window && - !vcpu->arch.irq_summary) { + if (!irqchip_in_kernel(vcpu->kvm) && + kvm_run->request_interrupt_window && + !kvm_cpu_has_interrupt(vcpu)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } @@ -2913,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); skip_emulated_instruction(vcpu); @@ -2929,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; enum emulation_result er; unsigned long offset; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); @@ -2952,35 +3053,62 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u16 tss_selector; - int reason; + int reason, type, idt_v; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; - if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && - (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_NMI_INTR) { - vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: + kvm_clear_interrupt_queue(vcpu); + break; + case INTR_TYPE_HARD_EXCEPTION: + case INTR_TYPE_SOFT_EXCEPTION: + kvm_clear_exception_queue(vcpu); + break; + default: + break; + } } tss_selector = exit_qualification; - return kvm_task_switch(vcpu, tss_selector, reason); + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + skip_emulated_instruction(vcpu); + + if (!kvm_task_switch(vcpu, tss_selector, reason)) + return 0; + + /* clear all local breakpoint enable flags */ + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + + /* + * TODO: What about debug traps on tss switch? + * Are we supposed to inject them and update dr6? + */ + + return 1; } static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; - enum emulation_result er; + unsigned long exit_qualification; gpa_t gpa; - unsigned long hva; int gla_validity; - int r; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); if (exit_qualification & (1 << 6)) { printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); @@ -2992,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); + vmcs_readl(GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; @@ -3001,32 +3129,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!kvm_is_error_hva(hva)) { - r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); - if (r < 0) { - printk(KERN_ERR "EPT: Not enough memory!\n"); - return -ENOMEM; - } - return 1; - } else { - /* must be MMIO */ - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - - if (er == EMULATE_FAIL) { - printk(KERN_ERR - "EPT: Fail to handle EPT violation vmexit!er is %d\n", - er); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - return -ENOTSUPP; - } else if (er == EMULATE_DO_MMIO) - return 0; - } - return 1; + return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -3046,7 +3149,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int err; + enum emulation_result err = EMULATE_DONE; preempt_enable(); local_irq_enable(); @@ -3071,10 +3174,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); - /* Guest state should be valid now except if we need to - * emulate an MMIO */ - if (guest_state_valid(vcpu)) - vmx->emulation_required = 0; + vmx->invalid_state_emulation_result = err; } /* @@ -3103,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, }; static const int kvm_vmx_max_exit_handlers = @@ -3112,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers = * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), @@ -3123,12 +3224,15 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ - if (vmx->emulation_required && emulate_invalid_guest_state) - return 0; + if (vmx->emulation_required && emulate_invalid_guest_state) { + if (guest_state_valid(vcpu)) + vmx->emulation_required = 0; + return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO; + } /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ - if (vm_need_ept() && is_paging(vcpu)) { + if (enable_ept && is_paging(vcpu)) { vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); ept_load_pdptrs(vcpu); } @@ -3149,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) __func__, vectoring_info, exit_reason); if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { - if (vcpu->arch.interrupt_window_open) { + if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; - vcpu->arch.nmi_window_open = 1; } else if (vmx->vnmi_blocked_time > 1000000000LL && vcpu->arch.nmi_pending) { /* @@ -3164,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) "state on VCPU %d after 1 s timeout\n", __func__, vcpu->vcpu_id); vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.nmi_window_open = 1; } } @@ -3178,116 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -static void update_tpr_threshold(struct kvm_vcpu *vcpu) +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { - int max_irr, tpr; - - if (!vm_need_tpr_shadow(vcpu->kvm)) - return; - - if (!kvm_lapic_enabled(vcpu) || - ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { + if (irr == -1 || tpr < irr) { vmcs_write32(TPR_THRESHOLD, 0); return; } - tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; - vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); + vmcs_write32(TPR_THRESHOLD, irr); } static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; - u32 idt_vectoring_info; + u32 idt_vectoring_info = vmx->idt_vectoring_info; bool unblock_nmi; u8 vector; int type; bool idtv_info_valid; - u32 error; exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + + /* Handle machine checks before interrupts are enabled */ + if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI + && is_machine_check(exit_intr_info))) + kvm_machine_check(); + + /* We need to handle NMIs before interrupts are enabled */ + if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && + (exit_intr_info & INTR_INFO_VALID_MASK)) { + KVMTRACE_0D(NMI, &vmx->vcpu, handler); + asm("int $2"); + } + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* - * SDM 3: 25.7.1.2 + * SDM 3: 27.7.1.2 (September 2008) * Re-set bit "block by NMI" before VM entry if vmexit caused by * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. */ - if (unblock_nmi && vector != DF_VECTOR) + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); - idt_vectoring_info = vmx->idt_vectoring_info; - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vmx->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&vmx->vcpu); + kvm_clear_interrupt_queue(&vmx->vcpu); + + if (!idtv_info_valid) + return; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; - if (vmx->vcpu.arch.nmi_injected) { + + switch (type) { + case INTR_TYPE_NMI_INTR: + vmx->vcpu.arch.nmi_injected = true; /* - * SDM 3: 25.7.1.2 - * Clear bit "block by NMI" before VM entry if a NMI delivery - * faulted. + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. */ - if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->vcpu.arch.nmi_injected = false; - } - kvm_clear_exception_queue(&vmx->vcpu); - if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - error = vmcs_read32(IDT_VECTORING_ERROR_CODE); - kvm_queue_exception_e(&vmx->vcpu, vector, error); + u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); - vmx->idt_vectoring_info = 0; - } - kvm_clear_interrupt_queue(&vmx->vcpu); - if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { - kvm_queue_interrupt(&vmx->vcpu, vector); - vmx->idt_vectoring_info = 0; - } -} - -static void vmx_intr_assist(struct kvm_vcpu *vcpu) -{ - update_tpr_threshold(vcpu); - - vmx_update_window_states(vcpu); - - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vcpu->arch.nmi_window_open) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_nmi_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); - return; - } - if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { - if (vcpu->arch.interrupt_window_open) - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); - else - enable_irq_window(vcpu); - } - if (vcpu->arch.interrupt.pending) { - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); + break; + case INTR_TYPE_SOFT_INTR: + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + /* fall through */ + case INTR_TYPE_EXT_INTR: + kvm_queue_interrupt(&vmx->vcpu, vector, + type == INTR_TYPE_SOFT_INTR); + break; + default: + break; } } @@ -3325,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -3347,6 +3439,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); + set_debugreg(vcpu->arch.dr6, 6); + asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3441,24 +3535,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); vcpu->arch.regs_dirty = 0; + get_debugreg(vcpu->arch.dr6, 6); + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); - vmx_update_window_states(vcpu); - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, vcpu, handler); - asm("int $2"); - } - vmx_complete_interrupts(vmx); } @@ -3533,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (vm_need_ept()) + if (enable_ept) if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; @@ -3571,9 +3656,32 @@ static int get_ept_level(void) return VMX_EPT_DEFAULT_GAW + 1; } -static int vmx_get_mt_mask_shift(void) +static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - return VMX_EPT_MT_EPTE_SHIFT; + u64 ret; + + /* For VT-d and EPT combination + * 1. MMIO: always map as UC + * 2. EPT with VT-d: + * a. VT-d without snooping control feature: can't guarantee the + * result, try to trust guest. + * b. VT-d with snooping control feature: snooping control feature of + * VT-d engine can guarantee the cache correctness. Just set it + * to WB to keep consistent with host. So the same as item 3. + * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * consistent with host MTRR + */ + if (is_mmio) + ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; + else if (vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + ret = kvm_get_guest_memory_type(vcpu, gfn) << + VMX_EPT_MT_EPTE_SHIFT; + else + ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) + | VMX_EPT_IGMT_BIT; + + return ret; } static struct kvm_x86_ops vmx_x86_ops = { @@ -3584,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, + .cpu_has_accelerated_tpr = report_flexpriority, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -3595,7 +3703,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_put = vmx_vcpu_put, .set_guest_debug = set_guest_debug, - .guest_debug_pre = kvm_guest_debug_pre, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -3619,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = { .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, - .handle_exit = kvm_handle_exit, + .handle_exit = vmx_handle_exit, .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .get_irq = vmx_get_irq, .set_irq = vmx_inject_irq, + .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, - .exception_injected = vmx_exception_injected, - .inject_pending_irq = vmx_intr_assist, - .inject_pending_vectors = do_interrupt_requests, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, - .get_mt_mask_shift = vmx_get_mt_mask_shift, + .get_mt_mask = vmx_get_mt_mask, }; static int __init vmx_init(void) { - void *va; int r; - vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) return -ENOMEM; - vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_b) { r = -ENOMEM; goto out; } - vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!vmx_msr_bitmap) { + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) { r = -ENOMEM; goto out1; } + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) { + r = -ENOMEM; + goto out2; + } + /* * Allow direct access to the PC debug port (it is often used for I/O * delays, but the vmexits simply slow things down). */ - va = kmap(vmx_io_bitmap_a); - memset(va, 0xff, PAGE_SIZE); - clear_bit(0x80, va); - kunmap(vmx_io_bitmap_a); + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); - va = kmap(vmx_io_bitmap_b); - memset(va, 0xff, PAGE_SIZE); - kunmap(vmx_io_bitmap_b); + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - va = kmap(vmx_msr_bitmap); - memset(va, 0xff, PAGE_SIZE); - kunmap(vmx_msr_bitmap); + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) - goto out2; + goto out3; - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); - vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - if (vm_need_ept()) { + if (enable_ept) { bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK, - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); } else kvm_disable_tdp(); @@ -3702,20 +3813,23 @@ static int __init vmx_init(void) return 0; +out3: + free_page((unsigned long)vmx_msr_bitmap_longmode); out2: - __free_page(vmx_msr_bitmap); + free_page((unsigned long)vmx_msr_bitmap_legacy); out1: - __free_page(vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_b); out: - __free_page(vmx_io_bitmap_a); + free_page((unsigned long)vmx_io_bitmap_a); return r; } static void __exit vmx_exit(void) { - __free_page(vmx_msr_bitmap); - __free_page(vmx_io_bitmap_b); - __free_page(vmx_io_bitmap_a); + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); kvm_exit(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 758b7a155ae..249540f9851 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -36,6 +36,7 @@ #include <linux/highmem.h> #include <linux/iommu.h> #include <linux/intel-iommu.h> +#include <linux/cpufreq.h> #include <asm/uaccess.h> #include <asm/msr.h> @@ -69,6 +70,8 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -88,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, - { "request_nmi", VCPU_STAT(request_nmi_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -105,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -173,6 +174,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; + if (vcpu->arch.exception.pending) { if (vcpu->arch.exception.nr == PF_VECTOR) { printk(KERN_DEBUG "kvm: inject_page_fault:" @@ -230,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { + if (is_present_pte(pdpte[i]) && + (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -317,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); return; } @@ -334,6 +336,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { + unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); @@ -347,7 +352,8 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_inject_gp(vcpu, 0); return; } - } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) + } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) + && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); @@ -361,7 +367,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; - kvm_mmu_sync_global(vcpu); + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -442,6 +448,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. @@ -456,7 +467,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT + MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; @@ -481,12 +492,37 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) return; } + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; efer |= vcpu->arch.shadow_efer & EFER_LMA; vcpu->arch.shadow_efer = efer; + + vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; + kvm_mmu_reset_context(vcpu); } void kvm_enable_efer_bits(u64 mask) @@ -586,20 +622,25 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul); } +static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; + unsigned long this_tsc_khz; if ((!vcpu->time_page)) return; - if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { - kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = tsc_khz; + this_tsc_khz = get_cpu_var(cpu_tsc_khz); + if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; } + put_cpu_var(cpu_tsc_khz); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -629,6 +670,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static int kvm_request_guest_time_update(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + + if (!vcpu->time_page) + return 0; + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + return 1; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -722,6 +773,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); @@ -758,7 +810,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.time_page = NULL; } - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); break; } default: @@ -843,6 +895,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: + case MSR_VM_HSAVE_PA: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: data = 0; break; case MSR_MTRRcap: @@ -967,10 +1022,14 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: + case KVM_CAP_REINJECT_CONTROL: + case KVM_CAP_IRQ_INJECT_STATUS: + case KVM_CAP_ASSIGN_DEV_IRQ: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -991,9 +1050,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; - case KVM_CAP_CLOCKSOURCE: - r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC); - break; default: r = 0; break; @@ -1044,7 +1100,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; @@ -1064,7 +1120,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -1075,9 +1131,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int is_efer_nx(void) { - u64 efer; + unsigned long long efer = 0; - rdmsrl(MSR_EFER, efer); + rdmsrl_safe(MSR_EFER, &efer); return efer & EFER_NX; } @@ -1142,8 +1198,8 @@ out: } static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1162,8 +1218,8 @@ out: } static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1172,7 +1228,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, goto out; r = -EFAULT; if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; return 0; @@ -1181,56 +1237,65 @@ out: return r; } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) + u32 index) { entry->function = function; entry->index = index; cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); entry->flags = 0; } +#define F(x) bit(X86_FEATURE_##x) + static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { - const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | - bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | - bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); - const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | - bit(X86_FEATURE_SYSCALL) | - (bit(X86_FEATURE_NX) && is_efer_nx()) | + unsigned f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 - bit(X86_FEATURE_LM) | + unsigned f_lm = F(LM); +#else + unsigned f_lm = 0; #endif - bit(X86_FEATURE_MMXEXT) | - bit(X86_FEATURE_3DNOWEXT) | - bit(X86_FEATURE_3DNOW); - const u32 kvm_supported_word3_x86_features = - bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); + + /* cpuid 1.edx */ + const u32 kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const u32 kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const u32 kvm_supported_word4_x86_features = + F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | + 0 /* Reserved, XSAVE, OSXSAVE */; + /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); + F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + 0 /* SKINIT */ | 0 /* WDT */; - /* all func 2 cpuid_count() should be called on the same cpu */ + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); ++*nent; @@ -1241,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 1: entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word3_x86_features; + entry->ecx &= kvm_supported_word4_x86_features; break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before @@ -1303,8 +1368,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, put_cpu(); } +#undef F + static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG; @@ -1321,7 +1388,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[0].eax; for (func = 1; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ -1330,10 +1397,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -EFAULT; if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) + nent * sizeof(struct kvm_cpuid_entry2))) goto out_free; cpuid->nent = nent; r = 0; @@ -1374,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; vcpu_load(vcpu); - set_bit(irq->irq, vcpu->arch.irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + kvm_queue_interrupt(vcpu, irq->irq, false); vcpu_put(vcpu); @@ -1477,7 +1543,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; break; @@ -1490,7 +1556,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; r = -EFAULT; @@ -1537,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: - if (lapic) - kfree(lapic); + kfree(lapic); return r; } @@ -1559,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return -EINVAL; down_write(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; + spin_unlock(&kvm->mmu_lock); up_write(&kvm->slots_lock); return 0; } @@ -1710,6 +1777,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) return r; } +static int kvm_vm_ioctl_reinject(struct kvm *kvm, + struct kvm_reinject_control *control) +{ + if (!kvm->arch.vpit) + return -ENXIO; + kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + return 0; +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -1729,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); memslot = &kvm->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; @@ -1807,13 +1885,26 @@ long kvm_arch_vm_ioctl(struct file *filp, } } else goto out; + r = kvm_setup_default_irq_routing(kvm); + if (r) { + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + goto out; + } break; case KVM_CREATE_PIT: + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpit) + goto create_pit_unlock; r = -ENOMEM; kvm->arch.vpit = kvm_create_pit(kvm); if (kvm->arch.vpit) r = 0; + create_pit_unlock: + mutex_unlock(&kvm->lock); break; + case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -1821,10 +1912,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { + __s32 status; mutex_lock(&kvm->lock); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); + if (ioctl == KVM_IRQ_LINE_STATUS) { + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } r = 0; } break; @@ -1907,6 +2005,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_REINJECT_CONTROL: { + struct kvm_reinject_control control; + r = -EFAULT; + if (copy_from_user(&control, argp, sizeof(control))) + goto out; + r = kvm_vm_ioctl_reinject(kvm, &control); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -1960,10 +2069,38 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= toread; + data += toread; + addr += toread; + } +out: + return r; +} + +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; @@ -1971,27 +2108,27 @@ int emulator_read_std(unsigned long addr, while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; if (gpa == UNMAPPED_GVA) { r = X86EMUL_PROPAGATE_FAULT; goto out; } - ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); + ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { r = X86EMUL_UNHANDLEABLE; goto out; } - bytes -= tocopy; - data += tocopy; - addr += tocopy; + bytes -= towrite; + data += towrite; + addr += towrite; } out: return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); + static int emulator_read_emulated(unsigned long addr, void *val, @@ -2013,8 +2150,8 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -2217,7 +2354,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -2225,7 +2362,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = emulator_read_std, + .read_std = kvm_read_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -2245,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, u16 error_code, int emulation_type) { - int r; + int r, shadow_mask; struct decode_cache *c; kvm_clear_exception_queue(vcpu); @@ -2293,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } } + if (emulation_type & EMULTYPE_SKIP) { + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + return EMULATE_DONE; + } + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + + if (r == 0) + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); if (vcpu->arch.pio.string) return EMULATE_DO_MMIO; @@ -2327,40 +2473,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } -} - static int pio_copy_data(struct kvm_vcpu *vcpu) { void *p = vcpu->arch.pio_data; - void *q; + gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + int ret; - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, - PAGE_KERNEL); - if (!q) { - free_pio_guest_pages(vcpu); - return -ENOMEM; - } - q += vcpu->arch.pio.guest_page_offset; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - memcpy(q, p, bytes); + ret = kvm_write_guest_virt(q, p, bytes, vcpu); else - memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; - vunmap(q); - free_pio_guest_pages(vcpu); - return 0; + ret = kvm_read_guest_virt(q, p, bytes, vcpu); + return ret; } int complete_pio(struct kvm_vcpu *vcpu) @@ -2471,7 +2596,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 0; vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; vcpu->arch.pio.rep = 0; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2499,9 +2623,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i, ret = 0; - int nr_pages = 1; - struct page *page; + int ret = 0; struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2513,7 +2635,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 1; vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); vcpu->arch.pio.rep = rep; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2533,15 +2654,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, else in_page = offset_in_page(address) + size; now = min(count, (unsigned long)in_page / size); - if (!now) { - /* - * String I/O straddles page boundary. Pin two guest pages - * so that we satisfy atomicity constraints. Do just one - * transaction to avoid complexity. - */ - nr_pages = 2; + if (!now) now = 1; - } if (down) { /* * String I/O in reverse. Yuck. Kill the guest, fix later. @@ -2556,15 +2670,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); - for (i = 0; i < nr_pages; ++i) { - page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - if (!page) { - kvm_inject_gp(vcpu, 0); - free_pio_guest_pages(vcpu); - return 1; - } - } + vcpu->arch.pio.guest_gva = address; pio_dev = vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, @@ -2572,7 +2678,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret >= 0 && pio_dev) { + if (ret == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (ret == 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -2587,9 +2697,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +static void bounce_off(void *info) +{ + /* nothing */ +} + +static unsigned int ref_freq; +static unsigned long tsc_khz_ref; + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, send_ipi = 0; + + if (!ref_freq) + ref_freq = freq->old; + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (vcpu->cpu != freq->cpu) + continue; + if (!kvm_request_guest_time_update(vcpu)) + continue; + if (vcpu->cpu != smp_processor_id()) + send_ipi++; + } + } + spin_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* + * We upscale the frequency. Must make the guest + * doesn't see old kvmclock values while running with + * the new frequency, otherwise we risk the guest sees + * time go backwards. + * + * In case we update the frequency for another cpu + * (which might be in guest context) send an interrupt + * to kick the cpu out of guest context. Next time + * guest context is entered kvmclock will be updated, + * so the guest will not see stale values. + */ + smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + } + return 0; +} + +static struct notifier_block kvmclock_cpufreq_notifier_block = { + .notifier_call = kvmclock_cpufreq_notifier +}; + int kvm_arch_init(void *opaque) { - int r; + int r, cpu; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -2619,7 +2792,16 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); + PT_DIRTY_MASK, PT64_NX_MASK, 0); + + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + tsc_khz_ref = tsc_khz; + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + return 0; out: @@ -2628,6 +2810,9 @@ out: void kvm_arch_exit(void) { + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -2827,25 +3012,20 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) return 0; if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) return 0; return 1; } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index) { int i; - u32 function, index; - struct kvm_cpuid_entry2 *e, *best; + struct kvm_cpuid_entry2 *best = NULL; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + e = &vcpu->arch.cpuid_entries[i]; if (is_matching_cpuid_entry(e, function, index)) { if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) @@ -2860,6 +3040,31 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) if (!best || e->function > best->function) best = e; } + return best; +} + +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; + return 36; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); if (best) { kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); @@ -2884,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - return (!vcpu->arch.irq_summary && + return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && kvm_run->request_interrupt_window && - vcpu->arch.interrupt_window_open && - (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); + kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu, @@ -2900,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, kvm_run->ready_for_interrupt_injection = 1; else kvm_run->ready_for_interrupt_injection = - (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary == 0); + kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu); } static void vapic_enter(struct kvm_vcpu *vcpu) @@ -2930,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu) up_read(&vcpu->kvm->slots_lock); } +static void update_cr8_intercept(struct kvm_vcpu *vcpu) +{ + int max_irr, tpr; + + if (!kvm_x86_ops->update_cr8_intercept) + return; + + if (!vcpu->arch.apic->vapic_addr) + max_irr = kvm_lapic_find_highest_irr(vcpu); + else + max_irr = -1; + + if (max_irr != -1) + max_irr >>= 4; + + tpr = kvm_lapic_get_cr8(vcpu); + + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); +} + +static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + kvm_x86_ops->set_interrupt_shadow(vcpu, 0); + + /* try to reinject previous events if any */ + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return; + } + + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return; + } + + /* try to inject new event if pending */ + if (vcpu->arch.nmi_pending) { + if (kvm_x86_ops->nmi_allowed(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + kvm_x86_ops->set_nmi(vcpu); + } + } else if (kvm_cpu_has_interrupt(vcpu)) { + if (kvm_x86_ops->interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), + false); + kvm_x86_ops->set_irq(vcpu); + } + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + kvm_run->request_interrupt_window; if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) @@ -2945,6 +3204,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) @@ -2962,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - kvm_inject_pending_timer_irqs(vcpu); - preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -2972,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_disable(); + clear_bit(KVM_REQ_KICK, &vcpu->requests); + smp_mb__after_clear_bit(); + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -2979,34 +3240,55 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - - vcpu->guest_mode = 1; - /* - * Make sure that guest_mode assignment won't happen after - * testing the pending IRQ vector bitmap. - */ - smp_wmb(); - if (vcpu->arch.exception.pending) __queue_exception(vcpu); - else if (irqchip_in_kernel(vcpu->kvm)) - kvm_x86_ops->inject_pending_irq(vcpu); else - kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); + inject_pending_irq(vcpu, kvm_run); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); - kvm_lapic_sync_to_vapic(vcpu); + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } up_read(&vcpu->kvm->slots_lock); kvm_guest_enter(); + get_debugreg(vcpu->arch.host_dr6, 6); + get_debugreg(vcpu->arch.host_dr7, 7); + if (unlikely(vcpu->arch.switch_db_regs)) { + get_debugreg(vcpu->arch.host_db[0], 0); + get_debugreg(vcpu->arch.host_db[1], 1); + get_debugreg(vcpu->arch.host_db[2], 2); + get_debugreg(vcpu->arch.host_db[3], 3); + + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + } KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); - vcpu->guest_mode = 0; + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.host_db[0], 0); + set_debugreg(vcpu->arch.host_db[1], 1); + set_debugreg(vcpu->arch.host_db[2], 2); + set_debugreg(vcpu->arch.host_db[3], 3); + } + set_debugreg(vcpu->arch.host_dr6, 6); + set_debugreg(vcpu->arch.host_dr7, 7); + + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); ++vcpu->stat.exits; @@ -3033,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) profile_hit(KVM_PROFILING, (void *)rip); } - if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) - vcpu->arch.exception.pending = false; kvm_lapic_sync_from_vapic(vcpu); @@ -3043,6 +3323,7 @@ out: return r; } + static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -3069,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_vcpu_block(vcpu); down_read(&vcpu->kvm->slots_lock); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + { + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - r = -EINTR; + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + break; + case KVM_MP_STATE_SIPI_RECEIVED: + default: + r = -EINTR; + break; + } + } } - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - } - if (signal_pending(current)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - } - if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - } + if (r <= 0) + break; + + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + if (kvm_cpu_has_pending_timer(vcpu)) + kvm_inject_pending_timer_irqs(vcpu); + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); } } @@ -3192,7 +3486,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * Don't leak debug flags in case they were set for guest debugging */ - if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); vcpu_put(vcpu); @@ -3255,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct descriptor_table dt; - int pending_vec; vcpu_load(vcpu); @@ -3285,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) { - memset(sregs->interrupt_bitmap, 0, - sizeof sregs->interrupt_bitmap); - pending_vec = kvm_x86_ops->get_irq(vcpu); - if (pending_vec >= 0) - set_bit(pending_vec, - (unsigned long *)sregs->interrupt_bitmap); - } else - memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, - sizeof sregs->interrupt_bitmap); + memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + + if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -3501,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); - tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); } static int load_state_from_tss32(struct kvm_vcpu *vcpu, @@ -3598,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, } static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - u32 old_tss_base, - struct desc_struct *nseg_desc) + u16 old_tss_sel, u32 old_tss_base, + struct desc_struct *nseg_desc) { struct tss_segment_16 tss_segment_16; int ret = 0; @@ -3618,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, &tss_segment_16, sizeof tss_segment_16)) goto out; + if (old_tss_sel != 0xffff) { + tss_segment_16.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_16.prev_task_link, + sizeof tss_segment_16.prev_task_link)) + goto out; + } + if (load_state_from_tss16(vcpu, &tss_segment_16)) goto out; @@ -3627,7 +3924,7 @@ out: } static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - u32 old_tss_base, + u16 old_tss_sel, u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_32 tss_segment_32; @@ -3647,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, &tss_segment_32, sizeof tss_segment_32)) goto out; + if (old_tss_sel != 0xffff) { + tss_segment_32.prev_task_link = old_tss_sel; + + if (kvm_write_guest(vcpu->kvm, + get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_32.prev_task_link, + sizeof tss_segment_32.prev_task_link)) + goto out; + } + if (load_state_from_tss32(vcpu, &tss_segment_32)) goto out; @@ -3700,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); } - kvm_x86_ops->skip_emulated_instruction(vcpu); + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, - &nseg_desc); + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); else - ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, - &nseg_desc); + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, + old_tss_base, &nseg_desc); if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { u32 eflags = kvm_x86_ops->get_rflags(vcpu); @@ -3733,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int i, pending_vec, max_bits; + int pending_vec, max_bits; struct descriptor_table dt; vcpu_load(vcpu); @@ -3747,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; + + down_read(&vcpu->kvm->slots_lock); + if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) + vcpu->arch.cr3 = sregs->cr3; + else + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + up_read(&vcpu->kvm->slots_lock); kvm_set_cr8(vcpu, sregs->cr8); @@ -3769,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - if (!irqchip_in_kernel(vcpu->kvm)) { - memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->arch.irq_pending); - vcpu->arch.irq_summary = 0; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) - if (vcpu->arch.irq_pending[i]) - __set_bit(i, &vcpu->arch.irq_summary); - } else { - max_bits = (sizeof sregs->interrupt_bitmap) << 3; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, - max_bits); - /* Only pending external irq is handled here */ - if (pending_vec < max_bits) { - kvm_x86_ops->set_irq(vcpu, pending_vec); - pr_debug("Set back pending irq %d\n", - pending_vec); - } - kvm_pic_clear_isr_ack(vcpu->kvm); + max_bits = (sizeof sregs->interrupt_bitmap) << 3; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -3811,15 +4121,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { - int r; + int i, r; vcpu_load(vcpu); + if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < KVM_NR_DB_REGS; ++i) + vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; + vcpu->arch.switch_db_regs = + (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + } else { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); + } + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_queue_exception(vcpu, DB_VECTOR); + else if (dbg->control & KVM_GUESTDBG_INJECT_BP) + kvm_queue_exception(vcpu, BP_VECTOR); + vcpu_put(vcpu); return r; @@ -3962,6 +4289,11 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + kvm_x86_ops->vcpu_free(vcpu); } @@ -4007,6 +4339,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.nmi_pending = false; vcpu->arch.nmi_injected = false; + vcpu->arch.switch_db_regs = 0; + memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr7 = DR7_FIXED_1; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -4094,12 +4431,13 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.oos_global_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + rdtscll(kvm->arch.vm_init_tsc); + return kvm; } @@ -4195,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, } } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); } kvm_mmu_slot_remove_write_access(kvm, mem->slot); + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); return 0; @@ -4209,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_mmu_zap_all(kvm); + kvm_reload_remote_mmus(kvm); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) @@ -4218,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) || vcpu->arch.nmi_pending; } -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif -} - void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - int ipi_pcpu = vcpu->cpu; - int cpu = get_cpu(); + int me; + int cpu = vcpu->cpu; if (waitqueue_active(&vcpu->wq)) { wake_up_interruptible(&vcpu->wq); ++vcpu->stat.halt_wakeup; } - /* - * We may be called synchronously with irqs disabled in guest mode, - * So need not to call smp_call_function_single() in that case. - */ - if (vcpu->guest_mode && vcpu->cpu != cpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); + + me = get_cpu(); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_send_reschedule(cpu); put_cpu(); } + +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->interrupt_allowed(vcpu); +} diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6a4be78a738..4c8e10af78e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) vcpu->arch.exception.pending = false; } -static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, + bool soft) { vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.soft = soft; vcpu->arch.interrupt.nr = vector; } @@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) vcpu->arch.interrupt.pending = false; } +static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || + vcpu->arch.nmi_injected; +} + +static inline bool kvm_exception_is_soft(unsigned int nr) +{ + return (nr == BP_VECTOR) || (nr == OF_VECTOR); +} #endif diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d174db7a337..c1b6c232e02 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -59,13 +59,14 @@ #define SrcImm (5<<4) /* Immediate operand. */ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ #define SrcOne (7<<4) /* Implied '1' */ -#define SrcMask (7<<4) +#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcMask (0xf<<4) /* Generic ModRM decode. */ -#define ModRM (1<<7) +#define ModRM (1<<8) /* Destination is only written; never read. */ -#define Mov (1<<8) -#define BitOp (1<<9) -#define MemAbs (1<<10) /* Memory operand is absolute displacement */ +#define Mov (1<<9) +#define BitOp (1<<10) +#define MemAbs (1<<11) /* Memory operand is absolute displacement */ #define String (1<<12) /* String instruction (rep capable) */ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ @@ -76,6 +77,7 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) +#define Src2Imm16 (4<<29) #define Src2Mask (7<<29) enum { @@ -135,11 +137,11 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, /* 0x80 - 0x87 */ Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, @@ -153,7 +155,8 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + 0, 0, SrcImm | Src2Imm16, 0, + ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, @@ -178,7 +181,8 @@ static u32 opcode_table[256] = { 0, ImplicitOps | Stack, 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, ImplicitOps | Stack, + ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -187,11 +191,11 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ 0, 0, 0, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm | ImplicitOps, - ImplicitOps, SrcImmByte | ImplicitOps, + SrcImm | Stack, SrcImm | ImplicitOps, + SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ @@ -230,10 +234,8 @@ static u32 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ @@ -1044,10 +1046,14 @@ done_prefixes: } break; case SrcImmByte: + case SrcImmUByte: c->src.type = OP_IMM; c->src.ptr = (unsigned long *)c->eip; c->src.bytes = 1; - c->src.val = insn_fetch(s8, 1, c->eip); + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); break; case SrcOne: c->src.bytes = 1; @@ -1072,6 +1078,12 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = insn_fetch(u8, 1, c->eip); break; + case Src2Imm16: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 2; + c->src2.val = insn_fetch(u16, 2, c->eip); + break; case Src2One: c->src2.bytes = 1; c->src2.val = 1; @@ -1136,18 +1148,19 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) } static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + void *dest, int len) { struct decode_cache *c = &ctxt->decode; int rc; rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), - &c->src.val, c->src.bytes, ctxt->vcpu); + dest, len, ctxt->vcpu); if (rc != 0) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } @@ -1157,11 +1170,9 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - c->src.bytes = c->dst.bytes; - rc = emulate_pop(ctxt, ops); + rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); if (rc != 0) return rc; - c->dst.val = c->src.val; return 0; } @@ -1279,6 +1290,25 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, return 0; } +static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned long cs; + + rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + if (rc) + return rc; + if (c->op_bytes == 4) + c->eip = (u32)c->eip; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + if (rc) + return rc; + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + return rc; +} + static inline int writeback(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1331,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return 0; } +void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + ctxt->interruptibility = mask; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1342,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int io_dir_in; int rc = 0; + ctxt->interruptibility = 0; + /* Shadow copy of register state. Committed on successful emulation. * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't * modify them. @@ -1467,11 +1513,9 @@ special_insn: break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - c->src.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ops); + rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); if (rc != 0) goto done; - c->dst.val = c->src.val; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -1515,13 +1559,10 @@ special_insn: return -1; } return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c->eip); - + case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, rel); + jmp_rel(c, c->src.val); break; - } case 0x80 ... 0x83: /* Grp1 */ switch (c->modrm_reg) { case 0: @@ -1593,6 +1634,9 @@ special_insn: int err; sel = c->src.val; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + if (c->modrm_reg <= 5) { type_bits = (c->modrm_reg == 1) ? 9 : 1; err = kvm_load_segment_descriptor(ctxt->vcpu, sel, @@ -1738,6 +1782,11 @@ special_insn: mov: c->dst.val = c->src.val; break; + case 0xcb: /* ret far */ + rc = emulate_ret_far(ctxt, ops); + if (rc) + goto done; + break; case 0xd0 ... 0xd1: /* Grp2 */ c->src.val = 1; emulate_grp2(ctxt); @@ -1748,59 +1797,32 @@ special_insn: break; case 0xe4: /* inb */ case 0xe5: /* in */ - port = insn_fetch(u8, 1, c->eip); + port = c->src.val; io_dir_in = 1; goto do_io; case 0xe6: /* outb */ case 0xe7: /* out */ - port = insn_fetch(u8, 1, c->eip); + port = c->src.val; io_dir_in = 0; goto do_io; case 0xe8: /* call (near) */ { - long int rel; - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } + long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - c->op_bytes = c->ad_bytes; emulate_push(ctxt); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: /* jmp far */ { - uint32_t eip; - uint16_t sel; - - switch (c->op_bytes) { - case 2: - eip = insn_fetch(u16, 2, c->eip); - break; - case 4: - eip = insn_fetch(u32, 4, c->eip); - break; - default: - DPRINTF("jmp far: Invalid op_bytes\n"); - goto cannot_emulate; - } - sel = insn_fetch(u16, 2, c->eip); - if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) { + case 0xea: /* jmp far */ + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, + VCPU_SREG_CS) < 0) { DPRINTF("jmp far: Failed to load CS descriptor\n"); goto cannot_emulate; } - c->eip = eip; + c->eip = c->src.val; break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -1844,6 +1866,7 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfb: /* sti */ + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -1908,11 +1931,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3 && c->modrm_rm == 1) { - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - kvm_emulate_hypercall(ctxt->vcpu); + if (c->modrm_mod == 3) { + switch (c->modrm_rm) { + case 1: + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + break; + default: + goto cannot_emulate; + } } else { rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, @@ -2013,28 +2041,11 @@ twobyte_insn: if (!test_cc(c->b, ctxt->eflags)) c->dst.type = OP_NONE; /* no writeback */ break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - case 8: - rel = insn_fetch(s64, 8, c->eip); - break; - default: - DPRINTF("jnz: Invalid op_bytes\n"); - goto cannot_emulate; - } + case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, rel); + jmp_rel(c, c->src.val); c->dst.type = OP_NONE; break; - } case 0xa3: bt: /* bt */ c->dst.type = OP_NONE; |