diff options
Diffstat (limited to 'drivers/kvm')
-rw-r--r-- | drivers/kvm/kvm.h | 13 | ||||
-rw-r--r-- | drivers/kvm/kvm_main.c | 780 | ||||
-rw-r--r-- | drivers/kvm/kvm_svm.h | 3 | ||||
-rw-r--r-- | drivers/kvm/mmu.c | 89 | ||||
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 18 | ||||
-rw-r--r-- | drivers/kvm/svm.c | 42 | ||||
-rw-r--r-- | drivers/kvm/vmx.c | 80 |
7 files changed, 714 insertions, 311 deletions
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 04574a9d443..0d122bf889d 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -14,6 +14,7 @@ #include "vmx.h" #include <linux/kvm.h> +#include <linux/kvm_para.h> #define CR0_PE_MASK (1ULL << 0) #define CR0_TS_MASK (1ULL << 3) @@ -237,6 +238,9 @@ struct kvm_vcpu { unsigned long cr0; unsigned long cr2; unsigned long cr3; + gpa_t para_state_gpa; + struct page *para_state_page; + gpa_t hypercall_gpa; unsigned long cr4; unsigned long cr8; u64 pdptrs[4]; /* pae */ @@ -305,6 +309,7 @@ struct kvm { int busy; unsigned long rmap_overflow; struct list_head vm_list; + struct file *filp; }; struct kvm_stat { @@ -339,7 +344,7 @@ struct kvm_arch_ops { int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); - struct kvm_vcpu *(*vcpu_load)(struct kvm_vcpu *vcpu); + void (*vcpu_load)(struct kvm_vcpu *vcpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); void (*vcpu_decache)(struct kvm_vcpu *vcpu); @@ -382,6 +387,8 @@ struct kvm_arch_ops { int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*vcpu_setup)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*patch_hypercall)(struct kvm_vcpu *vcpu, + unsigned char *hypercall_addr); }; extern struct kvm_stat kvm_stat; @@ -476,6 +483,8 @@ void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); + static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { @@ -523,7 +532,7 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - return (struct kvm_mmu_page *)page->private; + return (struct kvm_mmu_page *)page_private(page); } static inline u16 read_fs(void) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index af866147ff2..dc7a8c78cbf 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -20,6 +20,7 @@ #include <linux/kvm.h> #include <linux/module.h> #include <linux/errno.h> +#include <linux/magic.h> #include <asm/processor.h> #include <linux/percpu.h> #include <linux/gfp.h> @@ -36,6 +37,9 @@ #include <asm/desc.h> #include <linux/sysdev.h> #include <linux/cpu.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/mount.h> #include "x86_emulate.h" #include "segment_descriptor.h" @@ -72,6 +76,8 @@ static struct kvm_stats_debugfs_item { static struct dentry *debugfs_dir; +struct vfsmount *kvmfs_mnt; + #define MAX_IO_MSRS 256 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL @@ -90,6 +96,58 @@ struct segment_descriptor_64 { #endif +static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); + +static struct inode *kvmfs_inode(struct file_operations *fops) +{ + int error = -ENOMEM; + struct inode *inode = new_inode(kvmfs_mnt->mnt_sb); + + if (!inode) + goto eexit_1; + + inode->i_fop = fops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because mark_inode_dirty() will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + return inode; + +eexit_1: + return ERR_PTR(error); +} + +static struct file *kvmfs_file(struct inode *inode, void *private_data) +{ + struct file *file = get_empty_filp(); + + if (!file) + return ERR_PTR(-ENFILE); + + file->f_path.mnt = mntget(kvmfs_mnt); + file->f_path.dentry = d_alloc_anon(inode); + if (!file->f_path.dentry) + return ERR_PTR(-ENOMEM); + file->f_mapping = inode->i_mapping; + + file->f_pos = 0; + file->f_flags = O_RDWR; + file->f_op = inode->i_fop; + file->f_mode = FMODE_READ | FMODE_WRITE; + file->f_version = 0; + file->private_data = private_data; + return file; +} + unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -126,10 +184,8 @@ static inline int valid_vcpu(int n) return likely(n >= 0 && n < KVM_MAX_VCPUS); } -int kvm_read_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *dest) +int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, + void *dest) { unsigned char *host_buf = dest; unsigned long req_size = size; @@ -161,10 +217,8 @@ int kvm_read_guest(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_read_guest); -int kvm_write_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *data) +int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, + void *data) { unsigned char *host_buf = data; unsigned long req_size = size; @@ -174,12 +228,15 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, unsigned now; unsigned offset; hva_t guest_buf; + gfn_t gfn; paddr = gva_to_hpa(vcpu, addr); if (is_error_hpa(paddr)) break; + gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT; + mark_page_dirty(vcpu->kvm, gfn); guest_buf = (hva_t)kmap_atomic( pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); offset = addr & ~PAGE_MASK; @@ -195,24 +252,30 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_write_guest); -static int vcpu_slot(struct kvm_vcpu *vcpu) +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +static void vcpu_load(struct kvm_vcpu *vcpu) { - return vcpu - vcpu->kvm->vcpus; + mutex_lock(&vcpu->mutex); + kvm_arch_ops->vcpu_load(vcpu); } /* - * Switches to specified vcpu, until a matching vcpu_put() + * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL + * if the slot is not populated. */ -static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot) +static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot) { - struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot]; + struct kvm_vcpu *vcpu = &kvm->vcpus[slot]; mutex_lock(&vcpu->mutex); - if (unlikely(!vcpu->vmcs)) { + if (!vcpu->vmcs) { mutex_unlock(&vcpu->mutex); return NULL; } - return kvm_arch_ops->vcpu_load(vcpu); + kvm_arch_ops->vcpu_load(vcpu); + return vcpu; } static void vcpu_put(struct kvm_vcpu *vcpu) @@ -221,13 +284,13 @@ static void vcpu_put(struct kvm_vcpu *vcpu) mutex_unlock(&vcpu->mutex); } -static int kvm_dev_open(struct inode *inode, struct file *filp) +static struct kvm *kvm_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); int i; if (!kvm) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); @@ -243,7 +306,11 @@ static int kvm_dev_open(struct inode *inode, struct file *filp) list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); } - filp->private_data = kvm; + return kvm; +} + +static int kvm_dev_open(struct inode *inode, struct file *filp) +{ return 0; } @@ -281,9 +348,10 @@ static void kvm_free_physmem(struct kvm *kvm) static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { - if (!vcpu_load(vcpu->kvm, vcpu_slot(vcpu))) + if (!vcpu->vmcs) return; + vcpu_load(vcpu); kvm_mmu_destroy(vcpu); vcpu_put(vcpu); kvm_arch_ops->vcpu_free(vcpu); @@ -299,14 +367,24 @@ static void kvm_free_vcpus(struct kvm *kvm) static int kvm_dev_release(struct inode *inode, struct file *filp) { - struct kvm *kvm = filp->private_data; + return 0; +} +static void kvm_destroy_vm(struct kvm *kvm) +{ spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_vcpus(kvm); kvm_free_physmem(kvm); kfree(kvm); +} + +static int kvm_vm_release(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = filp->private_data; + + kvm_destroy_vm(kvm); return 0; } @@ -457,7 +535,7 @@ EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (is_long_mode(vcpu)) { - if ( cr3 & CR3_L_MODE_RESEVED_BITS) { + if (cr3 & CR3_L_MODE_RESEVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); inject_gp(vcpu); return; @@ -533,55 +611,11 @@ void fx_init(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(fx_init); -/* - * Creates some virtual cpus. Good luck creating more than one. - */ -static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n) +static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot) { - int r; - struct kvm_vcpu *vcpu; - - r = -EINVAL; - if (!valid_vcpu(n)) - goto out; - - vcpu = &kvm->vcpus[n]; - - mutex_lock(&vcpu->mutex); - - if (vcpu->vmcs) { - mutex_unlock(&vcpu->mutex); - return -EEXIST; - } - - vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, - FX_IMAGE_ALIGN); - vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; - - r = kvm_arch_ops->vcpu_create(vcpu); - if (r < 0) - goto out_free_vcpus; - - r = kvm_mmu_create(vcpu); - if (r < 0) - goto out_free_vcpus; - - kvm_arch_ops->vcpu_load(vcpu); - r = kvm_mmu_setup(vcpu); - if (r >= 0) - r = kvm_arch_ops->vcpu_setup(vcpu); - vcpu_put(vcpu); - - if (r < 0) - goto out_free_vcpus; - - return 0; - -out_free_vcpus: - kvm_free_vcpu(vcpu); - mutex_unlock(&vcpu->mutex); -out: - return r; + spin_lock(&vcpu->kvm->lock); + kvm_mmu_slot_remove_write_access(vcpu, slot); + spin_unlock(&vcpu->kvm->lock); } /* @@ -590,8 +624,8 @@ out: * * Discontiguous memory is allowed, mostly for framebuffers. */ -static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_memory_region *mem) +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_memory_region *mem) { int r; gfn_t base_gfn; @@ -674,7 +708,7 @@ raced: | __GFP_ZERO); if (!new.phys_mem[i]) goto out_free; - new.phys_mem[i]->private = 0; + set_page_private(new.phys_mem[i],0); } } @@ -711,9 +745,11 @@ raced: for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu; - vcpu = vcpu_load(kvm, i); + vcpu = vcpu_load_slot(kvm, i); if (!vcpu) continue; + if (new.flags & KVM_MEM_LOG_DIRTY_PAGES) + do_remove_write_access(vcpu, mem->slot); kvm_mmu_reset_context(vcpu); vcpu_put(vcpu); } @@ -729,18 +765,11 @@ out: return r; } -static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot) -{ - spin_lock(&vcpu->kvm->lock); - kvm_mmu_slot_remove_write_access(vcpu, slot); - spin_unlock(&vcpu->kvm->lock); -} - /* * Get (and clear) the dirty memory log for a memory slot. */ -static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) { struct kvm_memory_slot *memslot; int r, i; @@ -765,21 +794,21 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, 8) / 8; + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - for (i = 0; !any && i < n; ++i) + for (i = 0; !any && i < n/sizeof(long); ++i) any = memslot->dirty_bitmap[i]; r = -EFAULT; if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) goto out; - if (any) { cleared = 0; for (i = 0; i < KVM_MAX_VCPUS; ++i) { - struct kvm_vcpu *vcpu = vcpu_load(kvm, i); + struct kvm_vcpu *vcpu; + vcpu = vcpu_load_slot(kvm, i); if (!vcpu) continue; if (!cleared) { @@ -903,8 +932,9 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; else { gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) - return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT; + return X86EMUL_PROPAGATE_FAULT; vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; @@ -928,6 +958,7 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, return 0; page = gfn_to_page(m, gpa >> PAGE_SHIFT); kvm_mmu_pre_write(vcpu, gpa, bytes); + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); memcpy(virt + offset_in_page(gpa), &val, bytes); kunmap_atomic(virt, KM_USER0); @@ -1142,6 +1173,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long nr, a0, a1, a2, a3, a4, a5, ret; + + kvm_arch_ops->decache_regs(vcpu); + ret = -KVM_EINVAL; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + nr = vcpu->regs[VCPU_REGS_RAX]; + a0 = vcpu->regs[VCPU_REGS_RDI]; + a1 = vcpu->regs[VCPU_REGS_RSI]; + a2 = vcpu->regs[VCPU_REGS_RDX]; + a3 = vcpu->regs[VCPU_REGS_RCX]; + a4 = vcpu->regs[VCPU_REGS_R8]; + a5 = vcpu->regs[VCPU_REGS_R9]; + } else +#endif + { + nr = vcpu->regs[VCPU_REGS_RBX] & -1u; + a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; + a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; + a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; + a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; + a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; + a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; + } + switch (nr) { + default: + ; + } + vcpu->regs[VCPU_REGS_RAX] = ret; + kvm_arch_ops->cache_regs(vcpu); + return 1; +} +EXPORT_SYMBOL_GPL(kvm_hypercall); + static u64 mk_cr_64(u64 curr_cr, u32 new_val) { return (curr_cr & ~((1ULL << 32) - 1)) | new_val; @@ -1208,6 +1275,75 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, } } +/* + * Register the para guest with the host: + */ +static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) +{ + struct kvm_vcpu_para_state *para_state; + hpa_t para_state_hpa, hypercall_hpa; + struct page *para_state_page; + unsigned char *hypercall; + gpa_t hypercall_gpa; + + printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); + printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); + + /* + * Needs to be page aligned: + */ + if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) + goto err_gp; + + para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); + printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); + if (is_error_hpa(para_state_hpa)) + goto err_gp; + + mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); + para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); + para_state = kmap_atomic(para_state_page, KM_USER0); + + printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); + printk(KERN_DEBUG ".... size: %d\n", para_state->size); + + para_state->host_version = KVM_PARA_API_VERSION; + /* + * We cannot support guests that try to register themselves + * with a newer API version than the host supports: + */ + if (para_state->guest_version > KVM_PARA_API_VERSION) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } + + hypercall_gpa = para_state->hypercall_gpa; + hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); + printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); + if (is_error_hpa(hypercall_hpa)) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } + + printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); + vcpu->para_state_page = para_state_page; + vcpu->para_state_gpa = para_state_gpa; + vcpu->hypercall_gpa = hypercall_gpa; + + mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); + hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), + KM_USER1) + (hypercall_hpa & ~PAGE_MASK); + kvm_arch_ops->patch_hypercall(vcpu, hypercall); + kunmap_atomic(hypercall, KM_USER1); + + para_state->ret = 0; +err_kunmap_skip: + kunmap_atomic(para_state, KM_USER0); + return 0; +err_gp: + return 1; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1316,6 +1452,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->ia32_misc_enable_msr = data; break; + /* + * This is the 'probe whether the host is KVM' logic: + */ + case MSR_KVM_API_MAGIC: + return vcpu_register_para(vcpu, data); + default: printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr); return 1; @@ -1338,8 +1480,7 @@ void kvm_resched(struct kvm_vcpu *vcpu) { vcpu_put(vcpu); cond_resched(); - /* Cannot fail - no vcpu unplug yet. */ - vcpu_load(vcpu->kvm, vcpu_slot(vcpu)); + vcpu_load(vcpu); } EXPORT_SYMBOL_GPL(kvm_resched); @@ -1361,17 +1502,11 @@ void save_msrs(struct vmx_msr_entry *e, int n) } EXPORT_SYMBOL_GPL(save_msrs); -static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run) +static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct kvm_vcpu *vcpu; int r; - if (!valid_vcpu(kvm_run->vcpu)) - return -EINVAL; - - vcpu = vcpu_load(kvm, kvm_run->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); /* re-sync apic's tpr */ vcpu->cr8 = kvm_run->cr8; @@ -1394,16 +1529,10 @@ static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run) return r; } -static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs) +static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) { - struct kvm_vcpu *vcpu; - - if (!valid_vcpu(regs->vcpu)) - return -EINVAL; - - vcpu = vcpu_load(kvm, regs->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); kvm_arch_ops->cache_regs(vcpu); @@ -1440,16 +1569,10 @@ static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs) return 0; } -static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs) +static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) { - struct kvm_vcpu *vcpu; - - if (!valid_vcpu(regs->vcpu)) - return -EINVAL; - - vcpu = vcpu_load(kvm, regs->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); vcpu->regs[VCPU_REGS_RAX] = regs->rax; vcpu->regs[VCPU_REGS_RBX] = regs->rbx; @@ -1486,16 +1609,12 @@ static void get_segment(struct kvm_vcpu *vcpu, return kvm_arch_ops->get_segment(vcpu, var, seg); } -static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs) +static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { - struct kvm_vcpu *vcpu; struct descriptor_table dt; - if (!valid_vcpu(sregs->vcpu)) - return -EINVAL; - vcpu = vcpu_load(kvm, sregs->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -1537,18 +1656,14 @@ static void set_segment(struct kvm_vcpu *vcpu, return kvm_arch_ops->set_segment(vcpu, var, seg); } -static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs) +static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { - struct kvm_vcpu *vcpu; int mmu_reset_needed = 0; int i; struct descriptor_table dt; - if (!valid_vcpu(sregs->vcpu)) - return -EINVAL; - vcpu = vcpu_load(kvm, sregs->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -1654,20 +1769,14 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) * * @return number of msrs set successfully. */ -static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs, +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - struct kvm_vcpu *vcpu; int i; - if (!valid_vcpu(msrs->vcpu)) - return -EINVAL; - - vcpu = vcpu_load(kvm, msrs->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) @@ -1683,7 +1792,7 @@ static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs, * * @return number of msrs set successfully. */ -static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs, +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data), int writeback) @@ -1711,7 +1820,7 @@ static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs, if (copy_from_user(entries, user_msrs->entries, size)) goto out_free; - r = n = __msr_io(kvm, &msrs, entries, do_msr); + r = n = __msr_io(vcpu, &msrs, entries, do_msr); if (r < 0) goto out_free; @@ -1730,38 +1839,31 @@ out: /* * Translate a guest virtual address to a guest physical address. */ -static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr) +static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) { unsigned long vaddr = tr->linear_address; - struct kvm_vcpu *vcpu; gpa_t gpa; - vcpu = vcpu_load(kvm, tr->vcpu); - if (!vcpu) - return -ENOENT; - spin_lock(&kvm->lock); + vcpu_load(vcpu); + spin_lock(&vcpu->kvm->lock); gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - spin_unlock(&kvm->lock); + spin_unlock(&vcpu->kvm->lock); vcpu_put(vcpu); return 0; } -static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq) +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) { - struct kvm_vcpu *vcpu; - - if (!valid_vcpu(irq->vcpu)) - return -EINVAL; if (irq->irq < 0 || irq->irq >= 256) return -EINVAL; - vcpu = vcpu_load(kvm, irq->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); set_bit(irq->irq, vcpu->irq_pending); set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); @@ -1771,17 +1873,12 @@ static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq) return 0; } -static int kvm_dev_ioctl_debug_guest(struct kvm *kvm, - struct kvm_debug_guest *dbg) +static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg) { - struct kvm_vcpu *vcpu; int r; - if (!valid_vcpu(dbg->vcpu)) - return -EINVAL; - vcpu = vcpu_load(kvm, dbg->vcpu); - if (!vcpu) - return -ENOENT; + vcpu_load(vcpu); r = kvm_arch_ops->set_guest_debug(vcpu, dbg); @@ -1790,30 +1887,129 @@ static int kvm_dev_ioctl_debug_guest(struct kvm *kvm, return r; } -static long kvm_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_vcpu_release(struct inode *inode, struct file *filp) { - struct kvm *kvm = filp->private_data; + struct kvm_vcpu *vcpu = filp->private_data; + + fput(vcpu->kvm->filp); + return 0; +} + +static struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, + .compat_ioctl = kvm_vcpu_ioctl, +}; + +/* + * Allocates an inode for the vcpu. + */ +static int create_vcpu_fd(struct kvm_vcpu *vcpu) +{ + int fd, r; + struct inode *inode; + struct file *file; + + atomic_inc(&vcpu->kvm->filp->f_count); + inode = kvmfs_inode(&kvm_vcpu_fops); + if (IS_ERR(inode)) { + r = PTR_ERR(inode); + goto out1; + } + + file = kvmfs_file(inode, vcpu); + if (IS_ERR(file)) { + r = PTR_ERR(file); + goto out2; + } + + r = get_unused_fd(); + if (r < 0) + goto out3; + fd = r; + fd_install(fd, file); + + return fd; + +out3: + fput(file); +out2: + iput(inode); +out1: + fput(vcpu->kvm->filp); + return r; +} + +/* + * Creates some virtual cpus. Good luck creating more than one. + */ +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +{ + int r; + struct kvm_vcpu *vcpu; + + r = -EINVAL; + if (!valid_vcpu(n)) + goto out; + + vcpu = &kvm->vcpus[n]; + + mutex_lock(&vcpu->mutex); + + if (vcpu->vmcs) { + mutex_unlock(&vcpu->mutex); + return -EEXIST; + } + + vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, + FX_IMAGE_ALIGN); + vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; + + r = kvm_arch_ops->vcpu_create(vcpu); + if (r < 0) + goto out_free_vcpus; + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto out_free_vcpus; + + kvm_arch_ops->vcpu_load(vcpu); + r = kvm_mmu_setup(vcpu); + if (r >= 0) + r = kvm_arch_ops->vcpu_setup(vcpu); + vcpu_put(vcpu); + + if (r < 0) + goto out_free_vcpus; + + r = create_vcpu_fd(vcpu); + if (r < 0) + goto out_free_vcpus; + + return r; + +out_free_vcpus: + kvm_free_vcpu(vcpu); + mutex_unlock(&vcpu->mutex); +out: + return r; +} + +static long kvm_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r = -EINVAL; switch (ioctl) { - case KVM_GET_API_VERSION: - r = KVM_API_VERSION; - break; - case KVM_CREATE_VCPU: { - r = kvm_dev_ioctl_create_vcpu(kvm, arg); - if (r) - goto out; - break; - } case KVM_RUN: { struct kvm_run kvm_run; r = -EFAULT; if (copy_from_user(&kvm_run, argp, sizeof kvm_run)) goto out; - r = kvm_dev_ioctl_run(kvm, &kvm_run); + r = kvm_vcpu_ioctl_run(vcpu, &kvm_run); if (r < 0 && r != -EINTR) goto out; if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) { @@ -1825,10 +2021,8 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_GET_REGS: { struct kvm_regs kvm_regs; - r = -EFAULT; - if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) - goto out; - r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs); + memset(&kvm_regs, 0, sizeof kvm_regs); + r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); if (r) goto out; r = -EFAULT; @@ -1843,7 +2037,7 @@ static long kvm_dev_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) goto out; - r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs); + r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); if (r) goto out; r = 0; @@ -1852,10 +2046,8 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_GET_SREGS: { struct kvm_sregs kvm_sregs; - r = -EFAULT; - if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) - goto out; - r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs); + memset(&kvm_sregs, 0, sizeof kvm_sregs); + r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); if (r) goto out; r = -EFAULT; @@ -1870,7 +2062,7 @@ static long kvm_dev_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) goto out; - r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs); + r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); if (r) goto out; r = 0; @@ -1882,7 +2074,7 @@ static long kvm_dev_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&tr, argp, sizeof tr)) goto out; - r = kvm_dev_ioctl_translate(kvm, &tr); + r = kvm_vcpu_ioctl_translate(vcpu, &tr); if (r) goto out; r = -EFAULT; @@ -1897,7 +2089,7 @@ static long kvm_dev_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&irq, argp, sizeof irq)) goto out; - r = kvm_dev_ioctl_interrupt(kvm, &irq); + r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); if (r) goto out; r = 0; @@ -1909,19 +2101,45 @@ static long kvm_dev_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof dbg)) goto out; - r = kvm_dev_ioctl_debug_guest(kvm, &dbg); + r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); if (r) goto out; r = 0; break; } + case KVM_GET_MSRS: + r = msr_io(vcpu, argp, get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(vcpu, argp, do_set_msr, 0); + break; + default: + ; + } +out: + return r; +} + +static long kvm_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r = -EINVAL; + + switch (ioctl) { + case KVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(kvm, arg); + if (r < 0) + goto out; + break; case KVM_SET_MEMORY_REGION: { struct kvm_memory_region kvm_mem; r = -EFAULT; if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) goto out; - r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem); + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); if (r) goto out; break; @@ -1932,16 +2150,112 @@ static long kvm_dev_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&log, argp, sizeof log)) goto out; - r = kvm_dev_ioctl_get_dirty_log(kvm, &log); + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); if (r) goto out; break; } - case KVM_GET_MSRS: - r = msr_io(kvm, argp, get_msr, 1); + default: + ; + } +out: + return r; +} + +static struct page *kvm_vm_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm *kvm = vma->vm_file->private_data; + unsigned long pgoff; + struct kvm_memory_slot *slot; + struct page *page; + + *type = VM_FAULT_MINOR; + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + slot = gfn_to_memslot(kvm, pgoff); + if (!slot) + return NOPAGE_SIGBUS; + page = gfn_to_page(slot, pgoff); + if (!page) + return NOPAGE_SIGBUS; + get_page(page); + return page; +} + +static struct vm_operations_struct kvm_vm_vm_ops = { + .nopage = kvm_vm_nopage, +}; + +static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vm_vm_ops; + return 0; +} + +static struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, + .compat_ioctl = kvm_vm_ioctl, + .mmap = kvm_vm_mmap, +}; + +static int kvm_dev_ioctl_create_vm(void) +{ + int fd, r; + struct inode *inode; + struct file *file; + struct kvm *kvm; + + inode = kvmfs_inode(&kvm_vm_fops); + if (IS_ERR(inode)) { + r = PTR_ERR(inode); + goto out1; + } + + kvm = kvm_create_vm(); + if (IS_ERR(kvm)) { + r = PTR_ERR(kvm); + goto out2; + } + + file = kvmfs_file(inode, kvm); + if (IS_ERR(file)) { + r = PTR_ERR(file); + goto out3; + } + kvm->filp = file; + + r = get_unused_fd(); + if (r < 0) + goto out4; + fd = r; + fd_install(fd, file); + + return fd; + +out4: + fput(file); +out3: + kvm_destroy_vm(kvm); +out2: + iput(inode); +out1: + return r; +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + int r = -EINVAL; + + switch (ioctl) { + case KVM_GET_API_VERSION: + r = KVM_API_VERSION; break; - case KVM_SET_MSRS: - r = msr_io(kvm, argp, do_set_msr, 0); + case KVM_CREATE_VM: + r = kvm_dev_ioctl_create_vm(); break; case KVM_GET_MSR_INDEX_LIST: { struct kvm_msr_list __user *user_msr_list = argp; @@ -1977,43 +2291,11 @@ out: return r; } -static struct page *kvm_dev_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) -{ - struct kvm *kvm = vma->vm_file->private_data; - unsigned long pgoff; - struct kvm_memory_slot *slot; - struct page *page; - - *type = VM_FAULT_MINOR; - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - slot = gfn_to_memslot(kvm, pgoff); - if (!slot) - return NOPAGE_SIGBUS; - page = gfn_to_page(slot, pgoff); - if (!page) - return NOPAGE_SIGBUS; - get_page(page); - return page; -} - -static struct vm_operations_struct kvm_dev_vm_ops = { - .nopage = kvm_dev_nopage, -}; - -static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_dev_vm_ops; - return 0; -} - static struct file_operations kvm_chardev_ops = { .open = kvm_dev_open, .release = kvm_dev_release, .unlocked_ioctl = kvm_dev_ioctl, .compat_ioctl = kvm_dev_ioctl, - .mmap = kvm_dev_mmap, }; static struct miscdevice kvm_dev = { @@ -2080,13 +2362,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, int cpu = (long)v; switch (val) { - case CPU_DEAD: + case CPU_DOWN_PREPARE: case CPU_UP_CANCELED: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); decache_vcpus_on_cpu(cpu); smp_call_function_single(cpu, kvm_arch_ops->hardware_disable, NULL, 0, 1); break; - case CPU_UP_PREPARE: + case CPU_ONLINE: + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); smp_call_function_single(cpu, kvm_arch_ops->hardware_enable, NULL, 0, 1); break; @@ -2121,13 +2407,13 @@ static void kvm_exit_debug(void) static int kvm_suspend(struct sys_device *dev, pm_message_t state) { decache_vcpus_on_cpu(raw_smp_processor_id()); - on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); + on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); return 0; } static int kvm_resume(struct sys_device *dev) { - on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1); + on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); return 0; } @@ -2144,6 +2430,18 @@ static struct sys_device kvm_sysdev = { hpa_t bad_page_address; +static int kvmfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt); +} + +static struct file_system_type kvm_fs_type = { + .name = "kvmfs", + .get_sb = kvmfs_get_sb, + .kill_sb = kill_anon_super, +}; + int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) { int r; @@ -2166,7 +2464,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) r = kvm_arch_ops->hardware_setup(); if (r < 0) - return r; + goto out; on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); r = register_cpu_notifier(&kvm_cpu_notifier); @@ -2202,6 +2500,8 @@ out_free_2: out_free_1: on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); kvm_arch_ops->hardware_unsetup(); +out: + kvm_arch_ops = NULL; return r; } @@ -2220,8 +2520,16 @@ void kvm_exit_arch(void) static __init int kvm_init(void) { static struct page *bad_page; - int r = 0; + int r; + + r = register_filesystem(&kvm_fs_type); + if (r) + goto out3; + kvmfs_mnt = kern_mount(&kvm_fs_type); + r = PTR_ERR(kvmfs_mnt); + if (IS_ERR(kvmfs_mnt)) + goto out2; kvm_init_debug(); kvm_init_msr_list(); @@ -2234,10 +2542,14 @@ static __init int kvm_init(void) bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; memset(__va(bad_page_address), 0, PAGE_SIZE); - return r; + return 0; out: kvm_exit_debug(); + mntput(kvmfs_mnt); +out2: + unregister_filesystem(&kvm_fs_type); +out3: return r; } @@ -2245,6 +2557,8 @@ static __exit void kvm_exit(void) { kvm_exit_debug(); __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); + mntput(kvmfs_mnt); + unregister_filesystem(&kvm_fs_type); } module_init(kvm_init) diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h index 74cc862f493..624f1ca4865 100644 --- a/drivers/kvm/kvm_svm.h +++ b/drivers/kvm/kvm_svm.h @@ -1,6 +1,7 @@ #ifndef __KVM_SVM_H #define __KVM_SVM_H +#include <linux/kernel.h> #include <linux/types.h> #include <linux/list.h> #include <asm/msr.h> @@ -18,7 +19,7 @@ static const u32 host_save_msrs[] = { MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/ }; -#define NR_HOST_SAVE_MSRS (sizeof(host_save_msrs) / sizeof(*host_save_msrs)) +#define NR_HOST_SAVE_MSRS ARRAY_SIZE(host_save_msrs) #define NUM_DB_REGS 4 struct vcpu_svm { diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index be793770f31..cab26f301ea 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -131,7 +131,7 @@ static int dbg = 1; (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK) +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) @@ -298,18 +298,18 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) if (!is_rmap_pte(*spte)) return; page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - if (!page->private) { + if (!page_private(page)) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); - page->private = (unsigned long)spte; - } else if (!(page->private & 1)) { + set_page_private(page,(unsigned long)spte); + } else if (!(page_private(page) & 1)) { rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)page->private; + desc->shadow_ptes[0] = (u64 *)page_private(page); desc->shadow_ptes[1] = spte; - page->private = (unsigned long)desc | 1; + set_page_private(page,(unsigned long)desc | 1); } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) desc = desc->more; if (desc->shadow_ptes[RMAP_EXT-1]) { @@ -337,12 +337,12 @@ static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu, if (j != 0) return; if (!prev_desc && !desc->more) - page->private = (unsigned long)desc->shadow_ptes[0]; + set_page_private(page,(unsigned long)desc->shadow_ptes[0]); else if (prev_desc) prev_desc->more = desc->more; else - page->private = (unsigned long)desc->more | 1; + set_page_private(page,(unsigned long)desc->more | 1); mmu_free_rmap_desc(vcpu, desc); } @@ -356,20 +356,20 @@ static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte) if (!is_rmap_pte(*spte)) return; page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - if (!page->private) { + if (!page_private(page)) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); - } else if (!(page->private & 1)) { + } else if (!(page_private(page) & 1)) { rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); - if ((u64 *)page->private != spte) { + if ((u64 *)page_private(page) != spte) { printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", spte, *spte); BUG(); } - page->private = 0; + set_page_private(page,0); } else { rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); prev_desc = NULL; while (desc) { for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) @@ -398,16 +398,16 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) BUG_ON(!slot); page = gfn_to_page(slot, gfn); - while (page->private) { - if (!(page->private & 1)) - spte = (u64 *)page->private; + while (page_private(page)) { + if (!(page_private(page) & 1)) + spte = (u64 *)page_private(page); else { - desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); spte = desc->shadow_ptes[0]; } BUG_ON(!spte); - BUG_ON((*spte & PT64_BASE_ADDR_MASK) != - page_to_pfn(page) << PAGE_SHIFT); + BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT + != page_to_pfn(page)); BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); @@ -1093,22 +1093,40 @@ out: return r; } +static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_present_pte(pte)) { + if (page->role.level == PT_PAGE_TABLE_LEVEL) + rmap_remove(vcpu, spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(vcpu, child, spte); + } + } + *spte = 0; +} + void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; - struct kvm_mmu_page *child; struct hlist_node *node, *n; struct hlist_head *bucket; unsigned index; u64 *spte; - u64 pte; unsigned offset = offset_in_page(gpa); unsigned pte_size; unsigned page_offset; unsigned misaligned; int level; int flooded = 0; + int npte; pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); if (gfn == vcpu->last_pt_write_gfn) { @@ -1144,22 +1162,27 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) } page_offset = offset; level = page->role.level; + npte = 1; if (page->role.glevels == PT32_ROOT_LEVEL) { - page_offset <<= 1; /* 32->64 */ + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + npte = 2; + } page_offset &= ~PAGE_MASK; } spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); - pte = *spte; - if (is_present_pte(pte)) { - if (level == PT_PAGE_TABLE_LEVEL) - rmap_remove(vcpu, spte); - else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(vcpu, child, spte); - } + while (npte--) { + mmu_pre_write_zap_pte(vcpu, page, spte); + ++spte; } - *spte = 0; } } @@ -1218,7 +1241,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&page_header->link); if ((page = alloc_page(GFP_KERNEL)) == NULL) goto error_1; - page->private = (unsigned long)page_header; + set_page_private(page, (unsigned long)page_header); page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; memset(__va(page_header->page_hpa), 0, PAGE_SIZE); list_add(&page_header->link, &vcpu->free_pages); diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index b6b90e9e130..f3bcee90465 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -128,8 +128,10 @@ static int FNAME(walk_addr)(struct guest_walker *walker, goto access_error; #endif - if (!(*ptep & PT_ACCESSED_MASK)) - *ptep |= PT_ACCESSED_MASK; /* avoid rmw */ + if (!(*ptep & PT_ACCESSED_MASK)) { + mark_page_dirty(vcpu->kvm, table_gfn); + *ptep |= PT_ACCESSED_MASK; + } if (walker->level == PT_PAGE_TABLE_LEVEL) { walker->gfn = (*ptep & PT_BASE_ADDR_MASK) @@ -185,6 +187,12 @@ static void FNAME(release_walker)(struct guest_walker *walker) kunmap_atomic(walker->table, KM_USER0); } +static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, + struct guest_walker *walker) +{ + mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); +} + static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { @@ -348,12 +356,15 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, } else if (kvm_mmu_lookup_page(vcpu, gfn)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __FUNCTION__, gfn); + mark_page_dirty(vcpu->kvm, gfn); + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); *guest_ent |= PT_DIRTY_MASK; *write_pt = 1; return 0; } mark_page_dirty(vcpu->kvm, gfn); *shadow_ent |= PT_WRITABLE_MASK; + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); *guest_ent |= PT_DIRTY_MASK; rmap_add(vcpu, shadow_ent); @@ -430,9 +441,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * mmio: emulate if accessible, otherwise its a guest fault. */ - if (is_io_pte(*shadow_pte)) { + if (is_io_pte(*shadow_pte)) return 1; - } ++kvm_stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 83da4ea150a..3d8ea7ac2ec 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -15,6 +15,7 @@ */ #include <linux/module.h> +#include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/profile.h> @@ -75,7 +76,7 @@ struct svm_init_data { static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; -#define NUM_MSR_MAPS (sizeof(msrpm_ranges) / sizeof(*msrpm_ranges)) +#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) #define MSRS_RANGE_SIZE 2048 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) @@ -485,6 +486,7 @@ static void init_vmcb(struct vmcb *vmcb) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | + (1ULL << INTERCEPT_SMI) | /* * selective cr0 intercept bug? * 0: 0f 22 d8 mov %eax,%cr3 @@ -553,7 +555,7 @@ static void init_vmcb(struct vmcb *vmcb) * cr0 val on cpu init should be 0x60000010, we enable cpu * cache by default. the orderly way is to enable cache in bios. */ - save->cr0 = 0x00000010 | CR0_PG_MASK; + save->cr0 = 0x00000010 | CR0_PG_MASK | CR0_WP_MASK; save->cr4 = CR4_PAE_MASK; /* rdx = ?? */ } @@ -598,10 +600,9 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) kfree(vcpu->svm); } -static struct kvm_vcpu *svm_vcpu_load(struct kvm_vcpu *vcpu) +static void svm_vcpu_load(struct kvm_vcpu *vcpu) { get_cpu(); - return vcpu; } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1042,22 +1043,22 @@ static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); if (!addr_mask) { - printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); + printk(KERN_DEBUG "%s: get io address failed\n", + __FUNCTION__); return 1; } if (kvm_run->io.rep) { - kvm_run->io.count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + kvm_run->io.count + = vcpu->regs[VCPU_REGS_RCX] & addr_mask; kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; } - } else { + } else kvm_run->io.value = vcpu->svm->vmcb->save.rax; - } return 0; } - static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { return 1; @@ -1075,6 +1076,12 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } +static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->svm->vmcb->save.rip += 3; + return kvm_hypercall(vcpu, kvm_run); +} + static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { inject_ud(vcpu); @@ -1275,7 +1282,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = invalid_op_interception, - [SVM_EXIT_VMMCALL] = invalid_op_interception, + [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = invalid_op_interception, [SVM_EXIT_VMSAVE] = invalid_op_interception, [SVM_EXIT_STGI] = invalid_op_interception, @@ -1297,7 +1304,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, exit_code); - if (exit_code >= sizeof(svm_exit_handlers) / sizeof(*svm_exit_handlers) + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || svm_exit_handlers[exit_code] == 0) { kvm_run->exit_reason = KVM_EXIT_UNKNOWN; printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n", @@ -1668,6 +1675,18 @@ static int is_disabled(void) return 0; } +static void +svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xd9; + hypercall[3] = 0xc3; +} + static struct kvm_arch_ops svm_arch_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1716,6 +1735,7 @@ static struct kvm_arch_ops svm_arch_ops = { .run = svm_vcpu_run, .skip_emulated_instruction = skip_emulated_instruction, .vcpu_setup = svm_vcpu_setup, + .patch_hypercall = svm_patch_hypercall, }; static int __init svm_init(void) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index fd4e9173438..fbbf9d6b299 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -19,6 +19,7 @@ #include "vmx.h" #include "kvm_vmx.h" #include <linux/module.h> +#include <linux/kernel.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/profile.h> @@ -27,7 +28,6 @@ #include "segment_descriptor.h" - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -76,7 +76,7 @@ static const u32 vmx_msr_index[] = { #endif MSR_EFER, MSR_K6_STAR, }; -#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index)) +#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) static inline int is_page_fault(u32 intr_info) { @@ -204,7 +204,7 @@ static void vmcs_write64(unsigned long field, u64 value) * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ -static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu) +static void vmx_vcpu_load(struct kvm_vcpu *vcpu) { u64 phys_addr = __pa(vcpu->vmcs); int cpu; @@ -242,7 +242,6 @@ static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ } - return vcpu; } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) @@ -372,10 +371,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - data = vmcs_read32(GUEST_SYSENTER_EIP); + data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - data = vmcs_read32(GUEST_SYSENTER_ESP); + data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: msr = find_msr_entry(vcpu, msr_index); @@ -413,15 +412,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: - vmcs_write32(GUEST_SYSENTER_EIP, data); + vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: - vmcs_write32(GUEST_SYSENTER_ESP, data); + vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_TIME_STAMP_COUNTER: { + case MSR_IA32_TIME_STAMP_COUNTER: guest_write_tsc(data); break; - } default: msr = find_msr_entry(vcpu, msr_index); if (msr) { @@ -620,7 +618,7 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - if (vmcs_readl(sf->base) == save->base) { + if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { vmcs_write16(sf->selector, save->selector); vmcs_writel(sf->base, save->base); vmcs_write32(sf->limit, save->limit); @@ -793,6 +791,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) */ static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) { + if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) + enter_rmode(vcpu); + vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); update_exception_bitmap(vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -1467,6 +1468,18 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } +static void +vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xc1; + hypercall[3] = 0xc3; +} + static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; @@ -1643,6 +1656,12 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } +static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3); + return kvm_hypercall(vcpu, kvm_run); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -1661,6 +1680,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_VMCALL] = handle_vmcall, }; static const int kvm_vmx_max_exit_handlers = @@ -1868,6 +1888,27 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); + /* + * Reload segment selectors ASAP. (it's needed for a functional + * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 + * relies on having 0 in %gs for the CPU PDA to work.) + */ + if (fs_gs_ldt_reload_needed) { + load_ldt(ldt_sel); + load_fs(fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } ++kvm_stat.exits; save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); @@ -1885,22 +1926,6 @@ again: kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; } else { - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); - - reload_tss(); - } /* * Profile KVM exit RIPs: */ @@ -2062,6 +2087,7 @@ static struct kvm_arch_ops vmx_arch_ops = { .run = vmx_vcpu_run, .skip_emulated_instruction = skip_emulated_instruction, .vcpu_setup = vmx_vcpu_setup, + .patch_hypercall = vmx_patch_hypercall, }; static int __init vmx_init(void) |