From a5d36f82c4f3e852b61fdf1fee13463c8aa91b90 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Dec 2009 12:42:16 +0200 Subject: KVM: Fix race between APIC TMR and IRR When we queue an interrupt to the local apic, we set the IRR before the TMR. The vcpu can pick up the IRR and inject the interrupt before setting the TMR, and perhaps even EOI it, causing incorrect behaviour. The race is really insignificant since it can only occur on the first interrupt (usually following interrupts will not change TMR), but it's better closed than open. Fixed by reordering setting the TMR vs IRR. Cc: stable@kernel.org Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3063a0c4858..ba8c045da78 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -373,6 +373,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); @@ -383,11 +389,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); kvm_vcpu_kick(vcpu); break; -- cgit v1.2.3-70-g09d2 From e1f829b6f453c7985cfdfab616994b4051c3d788 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 20 Dec 2009 22:24:06 +0100 Subject: KVM: powerpc: Show timing option only on embedded Embedded PowerPC KVM has an exit timing implementation to track and evaluate how much time was spent in which exit path. For Book3S, we don't implement it. So let's not expose it as a config option either. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 07703f72330..6fb6e8aa389 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -53,7 +53,7 @@ config KVM_440 config KVM_EXIT_TIMING bool "Detailed exit timing" - depends on KVM + depends on KVM_440 || KVM_E500 ---help--- Calculate elapsed time for every exit/enter cycle. A per-vcpu report is available in debugfs kvm/vm#_vcpu#_timing. -- cgit v1.2.3-70-g09d2 From 82b7005f0e72d8d1a8226e4c192cbb0850d10b3f Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 5 Jan 2010 19:02:28 +0800 Subject: KVM: x86: Fix host_mapping_level() When found a error hva, should not return PAGE_SIZE but the level... Also clean up the coding style of the following loop. Cc: stable@kernel.org Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4c3e5b2314c..89a49fb46a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -477,7 +477,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return page_size; + return PT_PAGE_TABLE_LEVEL; down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); @@ -515,11 +515,9 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { - + for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; - } return level - 1; } -- cgit v1.2.3-70-g09d2 From f1d1c309f35e9b0fb961cffd70fbd04f450ec47c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 13 Jan 2010 18:58:09 +0200 Subject: KVM: only allow one gsi per fd Looks like repeatedly binding same fd to multiple gsi's with irqfd can use up a ton of kernel memory for irqfd structures. A simple fix is to allow each fd to only trigger one gsi: triggering a storm of interrupts in guest is likely useless anyway, and we can do it by binding a single gsi to many interrupts if we really want to. Cc: stable@kernel.org Signed-off-by: Michael S. Tsirkin Acked-by: Acked-by: Gregory Haskins Signed-off-by: Avi Kivity --- virt/kvm/eventfd.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 30f70fd511c..62e4cd947a9 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -166,7 +166,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, static int kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) { - struct _irqfd *irqfd; + struct _irqfd *irqfd, *tmp; struct file *file = NULL; struct eventfd_ctx *eventfd = NULL; int ret; @@ -203,9 +203,20 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); + spin_lock_irq(&kvm->irqfds.lock); + + ret = 0; + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&kvm->irqfds.lock); + goto fail; + } + events = file->f_op->poll(file, &irqfd->pt); - spin_lock_irq(&kvm->irqfds.lock); list_add_tail(&irqfd->list, &kvm->irqfds.items); spin_unlock_irq(&kvm->irqfds.lock); -- cgit v1.2.3-70-g09d2 From d72118cecabbb76b96b77107a50c74d1bb36c0c1 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 12 Jan 2010 16:42:09 -0200 Subject: KVM: properly check max PIC pin in irq route setup Otherwise memory beyond irq_states[16] might be accessed. Noticed by Juan Quintela. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti Acked-by: Juan Quintela Signed-off-by: Avi Kivity --- virt/kvm/irq_comm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 9b077342ab5..9fd5b3ebc51 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -302,6 +302,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, { int r = -EINVAL; int delta; + unsigned max_pin; struct kvm_kernel_irq_routing_entry *ei; struct hlist_node *n; @@ -322,12 +323,15 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_MASTER: e->set = kvm_set_pic_irq; + max_pin = 16; break; case KVM_IRQCHIP_PIC_SLAVE: e->set = kvm_set_pic_irq; + max_pin = 16; delta = 8; break; case KVM_IRQCHIP_IOAPIC: + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; default: @@ -335,7 +339,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, } e->irqchip.irqchip = ue->u.irqchip.irqchip; e->irqchip.pin = ue->u.irqchip.pin + delta; - if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS) + if (e->irqchip.pin >= max_pin) goto out; rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; break; -- cgit v1.2.3-70-g09d2 From a6085fbaf65ab09bfb5ec8d902d6d21680fe1895 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 14 Jan 2010 17:41:27 -0200 Subject: KVM: MMU: bail out pagewalk on kvm_read_guest error Exit the guest pagetable walk loop if reading gpte failed. Otherwise its possible to enter an endless loop processing the previous present pte. Cc: stable@kernel.org Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 58a0f1e8859..ede2131a922 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -150,7 +150,9 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) + goto not_present; + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) -- cgit v1.2.3-70-g09d2 From cb289d6244a37cf932c571d6deb0daa8030f931b Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 13 Jan 2010 09:34:36 -0800 Subject: eventfd - allow atomic read and waitqueue remove KVM needs a wait to atomically remove themselves from the eventfd ->poll() wait queue head, in order to handle correctly their IRQfd deassign operation. This patch introduces such API, plus a way to read an eventfd from its context. Signed-off-by: Davide Libenzi Signed-off-by: Avi Kivity --- fs/eventfd.c | 89 ++++++++++++++++++++++++++++++++++++++++--------- include/linux/eventfd.h | 16 +++++++++ 2 files changed, 90 insertions(+), 15 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index d26402ff06e..7758cc382ef 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) return events; } -static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was nonzero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. + */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) { - struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); - if (count < sizeof(ucnt)) - return -EINVAL; spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; res = -EAGAIN; if (ctx->count > 0) - res = sizeof(ucnt); - else if (!(file->f_flags & O_NONBLOCK)) { + res = 0; + else if (!no_wait) { __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { + for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - res = sizeof(ucnt); + res = 0; break; } if (signal_pending(current)) { @@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (likely(res > 0)) { - ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; - ctx->count -= ucnt; + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); - if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) - return -EFAULT; return res; } +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); +} static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 94dd10366a7..91bb4f27238 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -10,6 +10,7 @@ #include #include +#include /* * CAREFUL: Check include/asm-generic/fcntl.h when defining @@ -34,6 +35,9 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); int eventfd_signal(struct eventfd_ctx *ctx, int n); +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt); +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt); #else /* CONFIG_EVENTFD */ @@ -61,6 +65,18 @@ static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) } +static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, + __u64 *cnt) +{ + return -ENOSYS; +} + +static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, + wait_queue_t *wait, __u64 *cnt) +{ + return -ENOSYS; +} + #endif #endif /* _LINUX_EVENTFD_H */ -- cgit v1.2.3-70-g09d2 From b6a114d27273c37cd0107b0f49af208168498f05 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 13 Jan 2010 19:12:30 +0200 Subject: KVM: fix spurious interrupt with irqfd kvm didn't clear irqfd counter on deassign, as a result we could get a spurious interrupt when irqfd is assigned back. this leads to poor performance and, in theory, guest crash. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- virt/kvm/eventfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 62e4cd947a9..a9d3fc6c681 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -72,12 +72,13 @@ static void irqfd_shutdown(struct work_struct *work) { struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + u64 cnt; /* * Synchronize with the wait-queue and unhook ourselves to prevent * further events. */ - remove_wait_queue(irqfd->wqh, &irqfd->wait); + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); /* * We know no new events will be scheduled at this point, so block -- cgit v1.2.3-70-g09d2 From 062d5e9b0d714f449b261bb522eadaaf6f00f438 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 21 Jan 2010 12:19:07 +0100 Subject: KVM: S390: fix potential array overrun in intercept handling kvm_handle_sie_intercept uses a jump table to get the intercept handler for a SIE intercept. Static code analysis revealed a potential problem: the intercept_funcs jump table was defined to contain (0x48 >> 2) entries, but we only checked for code > 0x48 which would cause an off-by-one array overflow if code == 0x48. Use the compiler and ARRAY_SIZE to automatically set the limits. Cc: stable@kernel.org Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/intercept.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index ba9d8a7bc1a..b40096494e4 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -213,7 +213,7 @@ static int handle_instruction_and_prog(struct kvm_vcpu *vcpu) return rc2; } -static const intercept_handler_t intercept_funcs[0x48 >> 2] = { +static const intercept_handler_t intercept_funcs[] = { [0x00 >> 2] = handle_noop, [0x04 >> 2] = handle_instruction, [0x08 >> 2] = handle_prog, @@ -230,7 +230,7 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) intercept_handler_t func; u8 code = vcpu->arch.sie_block->icptcode; - if (code & 3 || code > 0x48) + if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) return -ENOTSUPP; func = intercept_funcs[code >> 2]; if (func) -- cgit v1.2.3-70-g09d2 From 36cb93fd6b6bf7e9163a69a8bf20207aed5fea44 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Jan 2010 14:18:47 +0800 Subject: KVM: x86: Fix probable memory leak of vcpu->arch.mce_banks vcpu->arch.mce_banks is malloc in kvm_arch_vcpu_init(), but never free in any place, this may cause memory leak. So this patch fixed to free it in kvm_arch_vcpu_uninit(). Cc: stable@kernel.org Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6651dbf5867..b265eecc741 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5088,6 +5088,7 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); down_read(&vcpu->kvm->slots_lock); kvm_mmu_destroy(vcpu); -- cgit v1.2.3-70-g09d2 From 443c39bc9ef7d8f648408d74c97e943f3bb3f48a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Jan 2010 14:21:29 +0800 Subject: KVM: x86: Fix leak of free lapic date in kvm_arch_vcpu_init() In function kvm_arch_vcpu_init(), if the memory malloc for vcpu->arch.mce_banks is fail, it does not free the memory of lapic date. This patch fixed it. Cc: stable@kernel.org Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b265eecc741..1ddcad452ad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5072,12 +5072,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) GFP_KERNEL); if (!vcpu->arch.mce_banks) { r = -ENOMEM; - goto fail_mmu_destroy; + goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; return 0; - +fail_free_lapic: + kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); fail_free_pio_data: -- cgit v1.2.3-70-g09d2