182 files changed, 14842 insertions, 9360 deletions
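The bulk of this series moves KVM from drivers/kvm/ to arch/x86/kvm/ and splits struct kvm_vcpu into a generic part plus an x86-specific part reached through vcpu->arch: throughout the lapic.c and mmu.c hunks below, vcpu->apic, vcpu->apic_base and vcpu->mp_state become vcpu->arch.apic, vcpu->arch.apic_base and vcpu->arch.mp_state. A minimal sketch of the resulting layout, abbreviated to the field names visible in these hunks (illustrative only, not the full kvm_host.h definition):

struct kvm_vcpu_arch {
	struct kvm_lapic *apic;		/* in-kernel local APIC model */
	u64 apic_base;			/* MSR_IA32_APICBASE contents */
	int mp_state;			/* VCPU_MP_STATE_* run state */
	unsigned long cr3;		/* guest CR3, consumed by the MMU code */
	/* ... remaining x86-specific state, abbreviated ... */
};

struct kvm_vcpu {
	struct kvm *kvm;		/* architecture-neutral fields ... */
	int vcpu_id;
	struct kvm_vcpu_arch arch;	/* everything x86-specific */
};

Code that used to say vcpu->apic now says vcpu->arch.apic, which is why so many hunks below are one-line substitutions.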
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index aa38cc5692a..77436d73501 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -419,7 +419,13 @@ X!Edrivers/pnp/system.c <chapter id="blkdev"> <title>Block Devices</title> -!Eblock/ll_rw_blk.c +!Eblock/blk-core.c +!Eblock/blk-map.c +!Iblock/blk-sysfs.c +!Eblock/blk-settings.c +!Eblock/blk-exec.c +!Eblock/blk-barrier.c +!Eblock/blk-tag.c </chapter> <chapter id="chrdev"> diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 9b0e322118b..6c8a2386cd5 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -79,6 +79,9 @@ static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; +/* a per-cpu variable indicating whose vcpu is currently running */ +static unsigned int __thread cpu_id; + /* This is our list of devices. */ struct device_list { @@ -153,6 +156,9 @@ struct virtqueue void (*handle_output)(int fd, struct virtqueue *me); }; +/* Remember the arguments to the program so we can "reboot" */ +static char **main_args; + /* Since guest is UP and we don't run at the same time, we don't need barriers. * But I include them in the code in case others copy it. */ #define wmb() @@ -554,7 +560,7 @@ static void wake_parent(int pipefd, int lguest_fd) else FD_CLR(-fd - 1, &devices.infds); } else /* Send LHREQ_BREAK command. */ - write(lguest_fd, args, sizeof(args)); + pwrite(lguest_fd, args, sizeof(args), cpu_id); } } @@ -1489,7 +1495,9 @@ static void setup_block_file(const char *filename) /* Create stack for thread and run it */ stack = malloc(32768); - if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) + /* SIGCHLD - We don't "wait" for our cloned thread, so prevent it from + * becoming a zombie. */ + if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) err(1, "Creating clone"); /* We don't need to keep the I/O thread's end of the pipes open. */ @@ -1499,7 +1507,21 @@ static void setup_block_file(const char *filename) verbose("device %u: virtblock %llu sectors\n", devices.device_num, cap); } -/* That's the end of device setup. */ +/* That's the end of device setup. :*/ + +/* Reboot */ +static void __attribute__((noreturn)) restart_guest(void) +{ + unsigned int i; + + /* Closing pipes causes the waker thread and io_threads to die, and + * closing /dev/lguest cleans up the Guest. Since we don't track all + * open fds, we simply close everything beyond stderr. */ + for (i = 3; i < FD_SETSIZE; i++) + close(i); + execv(main_args[0], main_args); + err(1, "Could not exec %s", main_args[0]); +} /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves * its input and output, and finally, lays it to rest. */ @@ -1511,7 +1533,8 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) int readval; /* We read from the /dev/lguest device to run the Guest. */ - readval = read(lguest_fd, &notify_addr, sizeof(notify_addr)); + readval = pread(lguest_fd, &notify_addr, + sizeof(notify_addr), cpu_id); /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { @@ -1521,16 +1544,23 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* ENOENT means the Guest died. Reading tells us why.
*/ } else if (errno == ENOENT) { char reason[1024] = { 0 }; - read(lguest_fd, reason, sizeof(reason)-1); + pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); errx(1, "%s", reason); + /* ERESTART means that we need to reboot the guest */ + } else if (errno == ERESTART) { + restart_guest(); /* EAGAIN means the Waker wanted us to look at some input. * Anything else means a bug or incompatible change. */ } else if (errno != EAGAIN) err(1, "Running guest failed"); + /* Only service input on thread for CPU 0. */ + if (cpu_id != 0) + continue; + /* Service input, then unset the BREAK to release the Waker. */ handle_input(lguest_fd); - if (write(lguest_fd, args, sizeof(args)) < 0) + if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) err(1, "Resetting break"); } } @@ -1571,6 +1601,12 @@ int main(int argc, char *argv[]) /* If they specify an initrd file to load. */ const char *initrd_name = NULL; + /* Save the args: we "reboot" by execing ourselves again. */ + main_args = argv; + /* We don't "wait" for the children, so prevent them from becoming + * zombies. */ + signal(SIGCHLD, SIG_IGN); + /* First we initialize the device list. Since console and network * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the @@ -1582,6 +1618,7 @@ int main(int argc, char *argv[]) devices.lastdev = &devices.dev; devices.next_irq = 1; + cpu_id = 0; /* We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount diff --git a/arch/ia64/hp/sim/simscsi.c b/arch/ia64/hp/sim/simscsi.c index 6ef9b521993..7661bb065fa 100644 --- a/arch/ia64/hp/sim/simscsi.c +++ b/arch/ia64/hp/sim/simscsi.c @@ -360,7 +360,6 @@ static struct scsi_host_template driver_template = { .max_sectors = 1024, .cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN, .use_clustering = DISABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static int __init diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 19a5656001c..f0bad7070fb 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -37,8 +37,6 @@ #include <asm/iseries/hv_call_xm.h> #include <asm/iseries/iommu.h> -extern struct kset devices_subsys; /* needed for vio_find_name() */ - static struct bus_type vio_bus_type; static struct vio_dev vio_bus_device = { /* fake "parent" device */ @@ -361,19 +359,16 @@ EXPORT_SYMBOL(vio_get_attribute); #ifdef CONFIG_PPC_PSERIES /* vio_find_name() - internal because only vio.c knows how we formatted the * kobject name - * XXX once vio_bus_type.devices is actually used as a kset in - * drivers/base/bus.c, this function should be removed in favor of - * "device_find(kobj_name, &vio_bus_type)" */ -static struct vio_dev *vio_find_name(const char *kobj_name) +static struct vio_dev *vio_find_name(const char *name) { - struct kobject *found; + struct device *found; - found = kset_find_obj(&devices_subsys, kobj_name); + found = bus_find_device_by_name(&vio_bus_type, NULL, name); if (!found) return NULL; - return to_vio_dev(container_of(found, struct device, kobj)); + return to_vio_dev(found); } /** diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fb3eea3e38e..65b449134cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE bool default y +select HAVE_KVM config ZONE_DMA32 bool @@ -1598,4 +1599,6 @@ source "security/Kconfig" source "crypto/Kconfig" +source 
"arch/x86/kvm/Kconfig" + source "lib/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b08f18261df..da8f4129780 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -7,6 +7,8 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +core-$(CONFIG_KVM) += arch/x86/kvm/ + # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles. # e.g.: obj-y += foo_$(BITS).o diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig index 656920636cb..c83e1c9b512 100644 --- a/drivers/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -1,9 +1,12 @@ # # KVM configuration # +config HAVE_KVM + bool + menuconfig VIRTUALIZATION bool "Virtualization" - depends on X86 + depends on HAVE_KVM || X86 default y ---help--- Say Y here to get to see options for using your Linux host to run other @@ -16,7 +19,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on X86 && EXPERIMENTAL + depends on HAVE_KVM && EXPERIMENTAL select PREEMPT_NOTIFIERS select ANON_INODES ---help--- diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile index e5a8f4d3e97..ffdd0b31078 100644 --- a/drivers/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -2,7 +2,11 @@ # Makefile for Kernel-based Virtual Machine module # -kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o +common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) + +EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm + +kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c index a679157bc59..ab29cf2def4 100644 --- a/drivers/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -28,6 +28,8 @@ #include <linux/mm.h> #include "irq.h" +#include <linux/kvm_host.h> + /* * set irq level. 
If an edge is detected, then the IRR is set to 1 */ @@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s) return intno; } -static void pic_reset(void *opaque) +void kvm_pic_reset(struct kvm_kpic_state *s) { - struct kvm_kpic_state *s = opaque; - s->last_irr = 0; s->irr = 0; s->imr = 0; @@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - pic_reset(s); /* init */ + kvm_pic_reset(s); /* init */ /* * deassert a pending interrupt */ diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c index 7628c7ff628..e5714759e97 100644 --- a/drivers/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -20,8 +20,8 @@ */ #include <linux/module.h> +#include <linux/kvm_host.h> -#include "kvm.h" #include "irq.h" /* @@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) } EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); -static void vcpu_kick_intr(void *info) -{ -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif -} - -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int ipi_pcpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - if (vcpu->guest_mode) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); -} - void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h new file mode 100644 index 00000000000..fa5ed5d59b5 --- /dev/null +++ b/arch/x86/kvm/irq.h @@ -0,0 +1,88 @@ +/* + * irq.h: in kernel interrupt controller related definitions + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#ifndef __IRQ_H +#define __IRQ_H + +#include <linux/mm_types.h> +#include <linux/hrtimer.h> +#include <linux/kvm_host.h> + +#include "iodev.h" +#include "ioapic.h" +#include "lapic.h" + +struct kvm; +struct kvm_vcpu; + +typedef void irq_request_func(void *opaque, int level); + +struct kvm_kpic_state { + u8 last_irr; /* edge detection */ + u8 irr; /* interrupt request register */ + u8 imr; /* interrupt mask register */ + u8 isr; /* interrupt service register */ + u8 priority_add; /* highest irq priority */ + u8 irq_base; + u8 read_reg_select; + u8 poll; + u8 special_mask; + u8 init_state; + u8 auto_eoi; + u8 rotate_on_auto_eoi; + u8 special_fully_nested_mode; + u8 init4; /* true if 4 byte init */ + u8 elcr; /* PIIX edge/trigger selection */ + u8 elcr_mask; + struct kvm_pic *pics_state; +}; + +struct kvm_pic { + struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ + irq_request_func *irq_request; + void *irq_request_opaque; + int output; /* intr from master PIC */ + struct kvm_io_device dev; +}; + +struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_read_irq(struct kvm_pic *s); +void kvm_pic_update_irq(struct kvm_pic *s); + +static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vpic; +} + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return pic_irqchip(kvm) != NULL; +} + +void kvm_pic_reset(struct kvm_kpic_state *s); + +void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); + +#endif diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index a0e415daef5..ecdfe97e463 100644 --- a/drivers/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -4,10 +4,10 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/list.h> +#include <linux/kvm_host.h> #include <asm/msr.h> #include "svm.h" -#include "kvm.h" static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c index 238fcad3cec..2cbee9479ce 100644 --- a/drivers/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -17,7 +17,7 @@ * the COPYING file in the top-level directory. 
*/ -#include "kvm.h" +#include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/mm.h> #include <linux/highmem.h> @@ -56,6 +56,7 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) + static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((u32 *) (apic->regs + reg_off)); @@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap) static inline int apic_hw_enabled(struct kvm_lapic *apic) { - return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; + return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; } static inline int apic_sw_enabled(struct kvm_lapic *apic) @@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; if (!apic) @@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); -int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) { + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic_test_and_set_irr(vec, apic)) { /* a new pending irq is set in IRR */ if (trig) @@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { int result = 0; - struct kvm_lapic *target = vcpu->apic; + struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x", @@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) kvm_vcpu_kick(vcpu); - else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { - vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { + vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { - if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) printk(KERN_DEBUG "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); - vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; + vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { printk(KERN_DEBUG @@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_STARTUP: printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); - if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { - vcpu->sipi_vector = vector; - vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; + if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { + vcpu->arch.sipi_vector = vector; + vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } -struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, +static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, unsigned long bitmap) { - int vcpu_id; int last; int next; - struct kvm_lapic *apic; + struct kvm_lapic *apic = NULL; - last = 
kvm->round_robin_prev_vcpu; + last = kvm->arch.round_robin_prev_vcpu; next = last; do { @@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, next = 0; if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) continue; - apic = kvm->vcpus[next]->apic; + apic = kvm->vcpus[next]->arch.apic; if (apic && apic_enabled(apic)) break; apic = NULL; } while (next != last); - kvm->round_robin_prev_vcpu = next; + kvm->arch.round_robin_prev_vcpu = next; - if (!apic) { - vcpu_id = ffs(bitmap) - 1; - if (vcpu_id < 0) { - vcpu_id = 0; - printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); - } - apic = kvm->vcpus[vcpu_id]->apic; - } + if (!apic) + printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); return apic; } +struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, + unsigned long bitmap) +{ + struct kvm_lapic *apic; + + apic = kvm_apic_round_robin(kvm, vector, bitmap); + if (apic) + return apic->vcpu; + return NULL; +} + static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); @@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) unsigned int delivery_mode = icr_low & APIC_MODE_MASK; unsigned int vector = icr_low & APIC_VECTOR_MASK; - struct kvm_lapic *target; + struct kvm_vcpu *target; struct kvm_vcpu *vcpu; unsigned long lpr_map = 0; int i; @@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic) if (!vcpu) continue; - if (vcpu->apic && + if (vcpu->arch.apic && apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { if (delivery_mode == APIC_DM_LOWEST) set_bit(vcpu->vcpu_id, &lpr_map); else - __apic_accept_irq(vcpu->apic, delivery_mode, + __apic_accept_irq(vcpu->arch.apic, delivery_mode, vector, level, trig_mode); } } if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map); + target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); if (target != NULL) - __apic_accept_irq(target, delivery_mode, + __apic_accept_irq(target->arch.apic, delivery_mode, vector, level, trig_mode); } } @@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) return tmcct; } +static void __report_tpr_access(struct kvm_lapic *apic, bool write) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct kvm_run *run = vcpu->run; + + set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); + kvm_x86_ops->cache_regs(vcpu); + run->tpr_access.rip = vcpu->arch.rip; + run->tpr_access.is_write = write; +} + +static inline void report_tpr_access(struct kvm_lapic *apic, bool write) +{ + if (apic->vcpu->arch.tpr_access_reporting) + __report_tpr_access(apic, write); +} + static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; @@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) val = apic_get_tmcct(apic); break; + case APIC_TASKPRI: + report_tpr_access(apic, false); + /* fall thru */ default: apic_update_ppr(apic); val = apic_get_reg(apic, offset); @@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_TASKPRI: + report_tpr_access(apic, true); apic_set_tpr(apic, val & 0xff); break; @@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) return ret; } -void kvm_free_apic(struct kvm_lapic *apic) +void kvm_free_lapic(struct kvm_vcpu *vcpu) { - if (!apic) + if (!vcpu->arch.apic) return; - hrtimer_cancel(&apic->timer.dev); + hrtimer_cancel(&vcpu->arch.apic->timer.dev); - if (apic->regs_page) { - __free_page(apic->regs_page); - 
apic->regs_page = 0; - } + if (vcpu->arch.apic->regs_page) + __free_page(vcpu->arch.apic->regs_page); - kfree(apic); + kfree(vcpu->arch.apic); } /* @@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic) void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { - struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; if (!apic) return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); + apic_set_tpr(apic, ((cr8 & 0x0f) << 4) + | (apic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; u64 tpr; if (!apic) @@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { - struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; if (!apic) { value |= MSR_IA32_APICBASE_BSP; - vcpu->apic_base = value; + vcpu->arch.apic_base = value; return; } if (apic->vcpu->vcpu_id) value &= ~MSR_IA32_APICBASE_BSP; - vcpu->apic_base = value; - apic->base_address = apic->vcpu->apic_base & + vcpu->arch.apic_base = value; + apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; /* with FSB delivery interrupt, we can restart APIC functionality */ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " - "0x%lx.\n", apic->apic_base, apic->base_address); + "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); } u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) { - return vcpu->apic_base; + return vcpu->arch.apic_base; } EXPORT_SYMBOL_GPL(kvm_lapic_get_base); @@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_debug("%s\n", __FUNCTION__); ASSERT(vcpu); - apic = vcpu->apic; + apic = vcpu->arch.apic; ASSERT(apic != NULL); /* Stop the timer in case it's a reset to an active apic */ @@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) update_divide_count(apic); atomic_set(&apic->timer.pending, 0); if (vcpu->vcpu_id == 0) - vcpu->apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, vcpu, kvm_apic_id(apic), - vcpu->apic_base, apic->base_address); + vcpu->arch.apic_base, apic->base_address); } EXPORT_SYMBOL_GPL(kvm_lapic_reset); int kvm_lapic_enabled(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; int ret = 0; if (!apic) @@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); - if (waitqueue_active(q)) - { - apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + if (waitqueue_active(q)) { + apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; wake_up_interruptible(q); } if (apic_lvtt_period(apic)) { @@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) if (!apic) goto nomem; - vcpu->apic = apic; + vcpu->arch.apic = apic; apic->regs_page = alloc_page(GFP_KERNEL); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); - goto nomem; + goto nomem_free_apic; } apic->regs = page_address(apic->regs_page); memset(apic->regs, 0, PAGE_SIZE); @@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); apic->timer.dev.function = 
apic_timer_fn; apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; kvm_lapic_reset(vcpu); apic->dev.read = apic_mmio_read; @@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->dev.private = apic; return 0; +nomem_free_apic: + kfree(apic); nomem: - kvm_free_apic(apic); return -ENOMEM; } EXPORT_SYMBOL_GPL(kvm_create_lapic); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; if (!apic || !apic_enabled(apic)) @@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); + u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; if (vcpu->vcpu_id == 0) { - if (!apic_hw_enabled(vcpu->apic)) + if (!apic_hw_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) @@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; if (apic && apic_lvt_enabled(apic, APIC_LVTT) && atomic_read(&apic->timer.pending) > 0) { @@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { - struct kvm_lapic *apic = vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) apic->timer.last_update = ktime_add_ns( @@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); - struct kvm_lapic *apic = vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; if (vector == -1) return -1; @@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; - apic->base_address = vcpu->apic_base & + apic->base_address = vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; apic_set_reg(apic, APIC_LVR, APIC_VERSION); apic_update_ppr(apic); @@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) start_apic_timer(apic); } -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->apic; + struct kvm_lapic *apic = vcpu->arch.apic; struct hrtimer *timer; if (!apic) @@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) if (hrtimer_cancel(timer)) hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); } -EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); + +void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) +{ + u32 data; + void *vapic; + + if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + return; + + vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); + kunmap_atomic(vapic, KM_USER0); + + apic_set_tpr(vcpu->arch.apic, data & 0xff); +} + +void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) +{ + u32 data, tpr; + int max_irr, max_isr; + struct kvm_lapic *apic; + void *vapic; + + if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + return; + + apic = vcpu->arch.apic; + tpr = 
apic_get_reg(apic, APIC_TASKPRI) & 0xff; + max_irr = apic_find_highest_irr(apic); + if (max_irr < 0) + max_irr = 0; + max_isr = apic_find_highest_isr(apic); + if (max_isr < 0) + max_isr = 0; + data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); + + vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; + kunmap_atomic(vapic, KM_USER0); +} + +void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) +{ + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + vcpu->arch.apic->vapic_addr = vapic_addr; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h new file mode 100644 index 00000000000..676c396c9ce --- /dev/null +++ b/arch/x86/kvm/lapic.h @@ -0,0 +1,50 @@ +#ifndef __KVM_X86_LAPIC_H +#define __KVM_X86_LAPIC_H + +#include "iodev.h" + +#include <linux/kvm_host.h> + +struct kvm_lapic { + unsigned long base_address; + struct kvm_io_device dev; + struct { + atomic_t pending; + s64 period; /* unit: ns */ + u32 divide_count; + ktime_t last_update; + struct hrtimer dev; + } timer; + struct kvm_vcpu *vcpu; + struct page *regs_page; + void *regs; + gpa_t vapic_addr; + struct page *vapic_page; +}; +int kvm_create_lapic(struct kvm_vcpu *vcpu); +void kvm_free_lapic(struct kvm_vcpu *vcpu); + +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu); +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); + +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); + +void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); +void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); +void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c new file mode 100644 index 00000000000..8efdcdbebb0 --- /dev/null +++ b/arch/x86/kvm/mmu.c @@ -0,0 +1,1885 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include "vmx.h" +#include "mmu.h" + +#include <linux/kvm_host.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/swap.h> + +#include <asm/page.h> +#include <asm/cmpxchg.h> +#include <asm/io.h> + +#undef MMU_DEBUG + +#undef AUDIT + +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) + +#endif + +#if defined(MMU_DEBUG) || defined(AUDIT) +static int dbg = 1; +#endif + +#ifndef MMU_DEBUG +#define ASSERT(x) do { } while (0) +#else +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } +#endif + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ + | PT64_NX_MASK) + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_FETCH_MASK (1U << 4) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define RMAP_EXT 4 + +#define ACC_EXEC_MASK 1 +#define 
ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +struct kvm_rmap_desc { + u64 *shadow_ptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + +static struct kmem_cache *pte_chain_cache; +static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_header_cache; + +static u64 __read_mostly shadow_trap_nonpresent_pte; +static u64 __read_mostly shadow_notrap_nonpresent_pte; + +void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) +{ + shadow_trap_nonpresent_pte = trap_pte; + shadow_notrap_nonpresent_pte = notrap_pte; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); + +static int is_write_protection(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr0 & X86_CR0_WP; +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_nx(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shadow_efer & EFER_NX; +} + +static int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +static int is_shadow_present_pte(u64 pte) +{ + pte &= ~PT_SHADOW_IO_MARK; + return pte != shadow_trap_nonpresent_pte + && pte != shadow_notrap_nonpresent_pte; +} + +static int is_writeble_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_dirty_pte(unsigned long pte) +{ + return pte & PT_DIRTY_MASK; +} + +static int is_io_pte(unsigned long pte) +{ + return pte & PT_SHADOW_IO_MARK; +} + +static int is_rmap_pte(u64 pte) +{ + return pte != shadow_trap_nonpresent_pte + && pte != shadow_notrap_nonpresent_pte; +} + +static gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} + +static void set_shadow_pte(u64 *sptep, u64 spte) +{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + kfree(mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, + int min) +{ + struct page *page; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_page_private(page, 0); + cache->objects[cache->nobjs++] = page_address(page); + } + return 0; +} + +static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, + pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, + rmap_desc_cache, 1); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); + 
mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + memset(p, 0, size); + return p; +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_pte_chain *pc) +{ + kfree(pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) +{ + kfree(rd); +} + +/* + * Take gfn and return the reverse mapping to it. + * Note: gfn must be unaliased before this function gets called + */ + +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(kvm, gfn); + return &slot->rmap[gfn - slot->base_gfn]; +} + +/* + * Reverse mapping data structures: + * + * If rmapp bit zero is zero, then rmapp points to the shadow page table entry + * that points to page_address(page). + * + * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc + * containing more mappings. + */ +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + struct kvm_rmap_desc *desc; + unsigned long *rmapp; + int i; + + if (!is_rmap_pte(*spte)) + return; + gfn = unalias_gfn(vcpu->kvm, gfn); + sp = page_header(__pa(spte)); + sp->gfns[spte - sp->spt] = gfn; + rmapp = gfn_to_rmap(vcpu->kvm, gfn); + if (!*rmapp) { + rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); + *rmapp = (unsigned long)spte; + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_rmap_desc(vcpu); + desc->shadow_ptes[0] = (u64 *)*rmapp; + desc->shadow_ptes[1] = spte; + *rmapp = (unsigned long)desc | 1; + } else { + rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + desc = desc->more; + if (desc->shadow_ptes[RMAP_EXT-1]) { + desc->more = mmu_alloc_rmap_desc(vcpu); + desc = desc->more; + } + for (i = 0; desc->shadow_ptes[i]; ++i) + ; + desc->shadow_ptes[i] = spte; + } +} + +static void rmap_desc_remove_entry(unsigned long *rmapp, + struct kvm_rmap_desc *desc, + int i, + struct kvm_rmap_desc *prev_desc) +{ + int j; + + for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + ; + desc->shadow_ptes[i] = desc->shadow_ptes[j]; + desc->shadow_ptes[j] = NULL; + if (j != 0) + return; + if (!prev_desc && !desc->more) + *rmapp = (unsigned long)desc->shadow_ptes[0]; + else + if (prev_desc) + prev_desc->more = desc->more; + else + *rmapp = (unsigned long)desc->more | 1; + mmu_free_rmap_desc(desc); +} + +static void rmap_remove(struct kvm *kvm, u64 *spte) +{ + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + struct kvm_mmu_page *sp; + struct page *page; + unsigned long *rmapp; + int i; + + if (!is_rmap_pte(*spte)) + return; + sp = page_header(__pa(spte)); + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + mark_page_accessed(page); + if (is_writeble_pte(*spte)) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); +
rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); + if (!*rmapp) { + printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + BUG(); + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + if ((u64 *)*rmapp != spte) { + printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", + spte, *spte); + BUG(); + } + *rmapp = 0; + } else { + rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) + if (desc->shadow_ptes[i] == spte) { + rmap_desc_remove_entry(rmapp, + desc, i, + prev_desc); + return; + } + prev_desc = desc; + desc = desc->more; + } + BUG(); + } +} + +static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +{ + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + u64 *prev_spte; + int i; + + if (!*rmapp) + return NULL; + else if (!(*rmapp & 1)) { + if (!spte) + return (u64 *)*rmapp; + return NULL; + } + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; + prev_spte = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { + if (prev_spte == spte) + return desc->shadow_ptes[i]; + prev_spte = desc->shadow_ptes[i]; + } + desc = desc->more; + } + return NULL; +} + +static void rmap_write_protect(struct kvm *kvm, u64 gfn) +{ + unsigned long *rmapp; + u64 *spte; + int write_protected = 0; + + gfn = unalias_gfn(kvm, gfn); + rmapp = gfn_to_rmap(kvm, gfn); + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + if (is_writeble_pte(*spte)) { + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); + } + if (write_protected) + kvm_flush_remote_tlbs(kvm); +} + +#ifdef MMU_DEBUG +static int is_empty_shadow_page(u64 *spt) +{ + u64 *pos; + u64 *end; + + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { + printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + pos, *pos); + return 0; + } + return 1; +} +#endif + +static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + ASSERT(is_empty_shadow_page(sp->spt)); + list_del(&sp->link); + __free_page(virt_to_page(sp->spt)); + __free_page(virt_to_page(sp->gfns)); + kfree(sp); + ++kvm->arch.n_free_mmu_pages; +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte) +{ + struct kvm_mmu_page *sp; + + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + ASSERT(is_empty_shadow_page(sp->spt)); + sp->slot_bitmap = 0; + sp->multimapped = 0; + sp->parent_pte = parent_pte; + --vcpu->kvm->arch.n_free_mmu_pages; + return sp; +} + +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!parent_pte) + return; + if (!sp->multimapped) { + u64 *old = sp->parent_pte; + + if (!old) { + sp->parent_pte = parent_pte; + 
return; + } + sp->multimapped = 1; + pte_chain = mmu_alloc_pte_chain(vcpu); + INIT_HLIST_HEAD(&sp->parent_ptes); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = old; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { + if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) + continue; + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) + if (!pte_chain->parent_ptes[i]) { + pte_chain->parent_ptes[i] = parent_pte; + return; + } + } + pte_chain = mmu_alloc_pte_chain(vcpu); + BUG_ON(!pte_chain); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = parent_pte; +} + +static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, + u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->multimapped) { + BUG_ON(sp->parent_pte != parent_pte); + sp->parent_pte = NULL; + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + if (pte_chain->parent_ptes[i] != parent_pte) + continue; + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { + pte_chain->parent_ptes[i] + = pte_chain->parent_ptes[i + 1]; + ++i; + } + pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(pte_chain); + if (hlist_empty(&sp->parent_ptes)) { + sp->multimapped = 0; + sp->parent_pte = NULL; + } + } + return; + } + BUG(); +} + +static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry(sp, node, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.metaphysical) { + pgprintk("%s: found role %x\n", + __FUNCTION__, sp->role.word); + return sp; + } + return NULL; +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int metaphysical, + unsigned access, + u64 *parent_pte, + bool *new_page) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node; + + role.word = 0; + role.glevels = vcpu->arch.mmu.root_level; + role.level = level; + role.metaphysical = metaphysical; + role.access = access; + if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, + gfn, role.word); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry(sp, node, bucket, hash_link) + if (sp->gfn == gfn && sp->role.word == role.word) { + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + pgprintk("%s: found\n", __FUNCTION__); + return sp; + } + ++vcpu->kvm->stat.mmu_cache_miss; + sp = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!sp) + return sp; + pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); + sp->gfn = gfn; + sp->role = role; + hlist_add_head(&sp->hash_link, bucket); + vcpu->arch.mmu.prefetch_page(vcpu, sp); + if (!metaphysical) + rmap_write_protect(vcpu->kvm, gfn); + if (new_page) + *new_page = 1; 
+ return sp; +} + +static void kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = sp->spt; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (is_shadow_present_pte(pt[i])) + rmap_remove(kvm, &pt[i]); + pt[i] = shadow_trap_nonpresent_pte; + } + kvm_flush_remote_tlbs(kvm); + return; + } + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + pt[i] = shadow_trap_nonpresent_pte; + if (!is_shadow_present_pte(ent)) + continue; + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(page_header(ent), &pt[i]); + } + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) +{ + mmu_page_remove_parent_pte(sp, parent_pte); +} + +static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i]) + kvm->vcpus[i]->arch.last_pte_updated = NULL; +} + +static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + u64 *parent_pte; + + ++kvm->stat.mmu_shadow_zapped; + while (sp->multimapped || sp->parent_pte) { + if (!sp->multimapped) + parent_pte = sp->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(sp->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(sp, parent_pte); + set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); + } + kvm_mmu_page_unlink_children(kvm, sp); + if (!sp->root_count) { + hlist_del(&sp->hash_link); + kvm_mmu_free_page(kvm, sp); + } else + list_move(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mmu_reset_last_pte_updated(kvm); +} + +/* + * Changing the number of mmu pages allocated to the vm + * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock + */ +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +{ + /* + * If we set the number of mmu pages to be smaller than the + * number of active pages, we must free some mmu pages before we + * change the value + */ + + if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > + kvm_nr_mmu_pages) { + int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages + - kvm->arch.n_free_mmu_pages; + + while (n_used_mmu_pages > kvm_nr_mmu_pages) { + struct kvm_mmu_page *page; + + page = container_of(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(kvm, page); + n_used_mmu_pages--; + } + kvm->arch.n_free_mmu_pages = 0; + } + else + kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages + - kvm->arch.n_alloc_mmu_pages; + + kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; +} + +static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node, *n; + int r; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + r = 0; + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.metaphysical) { + pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + sp->role.word); + kvm_mmu_zap_page(kvm, sp); + r = 1; + } + return r; +} + +static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + + while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { + pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } +} + +static void 
page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + struct kvm_mmu_page *sp = page_header(__pa(pte)); + + __set_bit(slot, &sp->slot_bitmap); +} + +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, gfn_t gfn, struct page *page) +{ + u64 spte; + int was_rmapped = is_rmap_pte(*shadow_pte); + int was_writeble = is_writeble_pte(*shadow_pte); + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + /* + * We don't set the accessed bit, since we sometimes want to see + * whether the guest actually used the pte (in order to detect + * demand paging). + */ + spte = PT_PRESENT_MASK | PT_DIRTY_MASK; + if (!dirty) + pte_access &= ~ACC_WRITE_MASK; + if (!(pte_access & ACC_EXEC_MASK)) + spte |= PT64_NX_MASK; + + spte |= PT_PRESENT_MASK; + if (pte_access & ACC_USER_MASK) + spte |= PT_USER_MASK; + + if (is_error_page(page)) { + set_shadow_pte(shadow_pte, + shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); + kvm_release_page_clean(page); + return; + } + + spte |= page_to_phys(page); + + if ((pte_access & ACC_WRITE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + struct kvm_mmu_page *shadow; + + spte |= PT_WRITABLE_MASK; + if (user_fault) { + mmu_unshadow(vcpu->kvm, gfn); + goto unshadowed; + } + + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + pte_access &= ~ACC_WRITE_MASK; + if (is_writeble_pte(spte)) { + spte &= ~PT_WRITABLE_MASK; + kvm_x86_ops->tlb_flush(vcpu); + } + if (write_fault) + *ptwrite = 1; + } + } + +unshadowed: + + if (pte_access & ACC_WRITE_MASK) + mark_page_dirty(vcpu->kvm, gfn); + + pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); + set_shadow_pte(shadow_pte, spte); + page_header_update_slot(vcpu->kvm, shadow_pte, gfn); + if (!was_rmapped) { + rmap_add(vcpu, shadow_pte, gfn); + if (!is_rmap_pte(*shadow_pte)) + kvm_release_page_clean(page); + } else { + if (was_writeble) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); + } + if (!ptwrite || !*ptwrite) + vcpu->arch.last_pte_updated = shadow_pte; +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, + gfn_t gfn, struct page *page) +{ + int level = PT32E_ROOT_LEVEL; + hpa_t table_addr = vcpu->arch.mmu.root_hpa; + int pt_write = 0; + + for (; ; level--) { + u32 index = PT64_INDEX(v, level); + u64 *table; + + ASSERT(VALID_PAGE(table_addr)); + table = __va(table_addr); + + if (level == 1) { + mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, gfn, page); + return pt_write || is_io_pte(table[index]); + } + + if (table[index] == shadow_trap_nonpresent_pte) { + struct kvm_mmu_page *new_table; + gfn_t pseudo_gfn; + + pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, + v, level - 1, + 1, ACC_ALL, &table[index], + NULL); + if (!new_table) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_page_clean(page); + return -ENOMEM; + } + 
+ table[index] = __pa(new_table->spt) | PT_PRESENT_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + } + table_addr = table[index] & PT64_BASE_ADDR_MASK; + } +} + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) +{ + int r; + + struct page *page; + + down_read(&current->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gfn); + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = __nonpaging_map(vcpu, v, write, gfn, page); + spin_unlock(&vcpu->kvm->mmu_lock); + + up_read(&current->mm->mmap_sem); + + return r; +} + + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + spin_lock(&vcpu->kvm->mmu_lock); +#ifdef CONFIG_X86_64 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + --sp->root_count; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + spin_unlock(&vcpu->kvm->mmu_lock); + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + --sp->root_count; + } + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + } + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; +} + +static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *sp; + + root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; + +#ifdef CONFIG_X86_64 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.root_hpa = root; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + if (!is_present_pte(vcpu->arch.pdptrs[i])) { + vcpu->arch.mmu.pae_root[i] = 0; + continue; + } + root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + } else if (vcpu->arch.mmu.root_level == 0) + root_gfn = 0; + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, !is_paging(vcpu), + ACC_ALL, NULL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + return vaddr; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + gfn_t gfn; + int r; + + pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + gfn = gva >> PAGE_SHIFT; + + return nonpaging_map(vcpu, gva & PAGE_MASK, + error_code & PFERR_WRITE_MASK, gfn); +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->prefetch_page = 
nonpaging_prefetch_page; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + u64 addr, + u32 err_code) +{ + kvm_inject_page_fault(vcpu, addr, err_code); +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->prefetch_page = paging64_prefetch_page; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->prefetch_page = paging32_prefetch_page; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + if (!is_paging(vcpu)) + return nonpaging_init_context(vcpu); + else if (is_long_mode(vcpu)) + return paging64_init_context(vcpu); + else if (is_pae(vcpu)) + return paging32E_init_context(vcpu); + else + return paging32_init_context(vcpu); +} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + vcpu->arch.mmu.free(vcpu); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + } +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + mmu_alloc_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_shadow_present_pte(pte)) { + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + rmap_remove(vcpu->kvm, spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, spte); + } + } + set_shadow_pte(spte, shadow_trap_nonpresent_pte); +} + 
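The pte-write handlers that follow must map a guest-physical write offset onto a shadow pte slot. For 32-bit guests the arithmetic is subtle: guest ptes are 4 bytes while shadow ptes are 8, so kvm_mmu_pte_write() below doubles the byte offset and treats the overflow past 4 KiB as the shadow-page quadrant. A minimal standalone userspace sketch of that calculation (not kernel code; it models only the leaf-table case, with 4 KiB pages and 8-byte sptes assumed):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK (~((1UL << PAGE_SHIFT) - 1))

int main(void)
{
	/* Byte offsets of two hypothetical guest pte writes within a page. */
	unsigned long offsets[] = { 0x404, 0x900 };
	unsigned i;

	for (i = 0; i < 2; i++) {
		unsigned long page_offset = offsets[i] << 1;	/* 32->64 */
		unsigned long quadrant = page_offset >> PAGE_SHIFT;

		page_offset &= ~PAGE_MASK;	/* offset within shadow page */
		printf("guest offset %#lx -> quadrant %lu, spte index %lu\n",
		       offsets[i], quadrant, page_offset / 8);
	}
	return 0;
}

The write at 0x404 lands in quadrant 0 at spte index 257; the write at 0x900 lands in quadrant 1 at index 64. A 4 KiB guest table holds 1024 4-byte ptes, which shadow into 8 KiB of 8-byte sptes, hence two shadow pages told apart by role.quadrant.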
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte, + const void *new, int bytes, + int offset_in_pte) +{ + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + + ++vcpu->kvm->stat.mmu_pte_updated; + if (sp->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + else + paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); +} + +static bool need_remote_flush(u64 old, u64 new) +{ + if (!is_shadow_present_pte(old)) + return false; + if (!is_shadow_present_pte(new)) + return true; + if ((old ^ new) & PT64_BASE_ADDR_MASK) + return true; + old ^= PT64_NX_MASK; + new ^= PT64_NX_MASK; + return (old & ~new & PT64_PERM_MASK) != 0; +} + +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +{ + if (need_remote_flush(old, new)) + kvm_flush_remote_tlbs(vcpu->kvm); + else + kvm_mmu_flush_tlb(vcpu); +} + +static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + return !!(spte && (*spte & PT_ACCESSED_MASK)); +} + +static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn; + int r; + u64 gpte = 0; + + if (bytes != 4 && bytes != 8) + return; + + /* + * Assume that the pte write is on a page table of the same type + * as the current vcpu paging mode. This is nearly always true + * (might be false while changing modes). Note it is verified later + * by update_pte(). + */ + if (is_pae(vcpu)) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + if ((bytes == 4) && (gpa % 4 == 0)) { + r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); + if (r) + return; + memcpy((void *)&gpte + (gpa % 8), new, 4); + } else if ((bytes == 8) && (gpa % 8 == 0)) { + memcpy((void *)&gpte, new, 8); + } + } else { + if ((bytes == 4) && (gpa % 4 == 0)) + memcpy((void *)&gpte, new, 4); + } + if (!is_present_pte(gpte)) + return; + gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + vcpu->arch.update_pte.gfn = gfn; + vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); +} + +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_mmu_page *sp; + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; + u64 entry; + u64 *spte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + unsigned quadrant; + int level; + int flooded = 0; + int npte; + + pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + ++vcpu->kvm->stat.mmu_pte_write; + kvm_mmu_audit(vcpu, "pre pte write"); + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { + if (sp->gfn != gfn || sp->role.metaphysical) + continue; + pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 
4 : 8; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, sp->role.word); + kvm_mmu_zap_page(vcpu->kvm, sp); + ++vcpu->kvm->stat.mmu_flooded; + continue; + } + page_offset = offset; + level = sp->role.level; + npte = 1; + if (sp->role.glevels == PT32_ROOT_LEVEL) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + npte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != sp->role.quadrant) + continue; + } + spte = &sp->spt[page_offset / sizeof(*spte)]; + while (npte--) { + entry = *spte; + mmu_pte_write_zap_pte(vcpu, sp, spte); + mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, + page_offset & (pte_size - 1)); + mmu_pte_write_flush_tlb(vcpu, entry, *spte); + ++spte; + } + } + kvm_mmu_audit(vcpu, "post pte write"); + spin_unlock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.update_pte.page) { + kvm_release_page_clean(vcpu->arch.update_pte.page); + vcpu->arch.update_pte.page = NULL; + } +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + int r; + + down_read(&current->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + up_read(&current->mm->mmap_sem); + + spin_lock(&vcpu->kvm->mmu_lock); + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + spin_unlock(&vcpu->kvm->mmu_lock); + return r; +} + +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { + struct kvm_mmu_page *sp; + + sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, sp); + ++vcpu->kvm->stat.mmu_recycled; + } +} + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) +{ + int r; + enum emulation_result er; + + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); + if (r < 0) + goto out; + + if (!r) { + r = 1; + goto out; + } + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + + er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++vcpu->stat.mmio_exits; + return 0; + case EMULATE_FAIL: + kvm_report_emulation_failure(vcpu, "pagetable"); + return 1; + default: + BUG(); + } +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + + while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { + sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, sp); + } + free_page((unsigned long)vcpu->arch.mmu.pae_root); +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct page *page; + int i; + + ASSERT(vcpu); + + if (vcpu->kvm->arch.n_requested_mmu_pages) + vcpu->kvm->arch.n_free_mmu_pages = + vcpu->kvm->arch.n_requested_mmu_pages; + else + vcpu->kvm->arch.n_free_mmu_pages = + 
vcpu->kvm->arch.n_alloc_mmu_pages; + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. + * Therefore we need to allocate shadow page tables in the first + * 4GB of memory, which happens to fit the DMA32 zone. + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32); + if (!page) + goto error_1; + vcpu->arch.mmu.pae_root = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + + return 0; + +error_1: + free_mmu_pages(vcpu); + return -ENOMEM; +} + +int kvm_mmu_create(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return alloc_mmu_pages(vcpu); +} + +int kvm_mmu_setup(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return init_kvm_mmu(vcpu); +} + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, &sp->slot_bitmap)) + continue; + + pt = sp->spt; + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (pt[i] & PT_WRITABLE_MASK) + pt[i] &= ~PT_WRITABLE_MASK; + } +} + +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + + spin_lock(&kvm->mmu_lock); + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + kvm_mmu_zap_page(kvm, sp); + spin_unlock(&kvm->mmu_lock); + + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_module_exit(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int kvm_mmu_module_init(void) +{ + pte_chain_cache = kmem_cache_create("kvm_pte_chain", + sizeof(struct kvm_pte_chain), + 0, 0, NULL); + if (!pte_chain_cache) + goto nomem; + rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", + sizeof(struct kvm_rmap_desc), + 0, 0, NULL); + if (!rmap_desc_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), + 0, 0, NULL); + if (!mmu_page_header_cache) + goto nomem; + + return 0; + +nomem: + kvm_mmu_module_exit(); + return -ENOMEM; +} + +/* + * Calculate mmu pages needed for kvm. 
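+ * For example, assuming KVM_PERMILLE_MMU_PAGES is 20 and
+ * KVM_MIN_ALLOC_MMU_PAGES is 64 (their usual values), a guest with
+ * 1 GiB of memory (262144 pages) is sized at 262144 * 20 / 1000 = 5242
+ * shadow pages, while the max() below keeps small guests at 64.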
+ */ +unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) +{ + int i; + unsigned int nr_mmu_pages; + unsigned int nr_pages = 0; + + for (i = 0; i < kvm->nmemslots; i++) + nr_pages += kvm->memslots[i].npages; + + nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; + nr_mmu_pages = max(nr_mmu_pages, + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + + return nr_mmu_pages; +} + +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) +{ +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (ent == shadow_trap_nonpresent_pte) + continue; + + va = canonicalize(va); + if (level > 1) { + if (ent == shadow_notrap_nonpresent_pte) + printk(KERN_ERR "audit: (%s) nontrapping pte" + " in nonleaf level: levels %d gva %lx" + " level %d pte %llx\n", audit_msg, + vcpu->arch.mmu.root_level, va, level, ent); + + audit_mappings_page(vcpu, ent, va, level - 1); + } else { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + struct page *page = gpa_to_page(vcpu, gpa); + hpa_t hpa = page_to_phys(page); + + if (is_shadow_present_pte(ent) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx %d\n", + audit_msg, vcpu->arch.mmu.root_level, + va, gpa, hpa, ent, + is_shadow_present_pte(ent)); + else if (ent == shadow_notrap_nonpresent_pte + && !is_error_hpa(hpa)) + printk(KERN_ERR "audit: (%s) notrap shadow," + " valid guest gva %lx\n", audit_msg, va); + kvm_release_page_clean(page); + + } + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->arch.mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->arch.mmu.pae_root[i], + i << 30, + 2); +} + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k; + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + unsigned long *rmapp = &m->rmap[j]; + + if (!*rmapp) + continue; + if (!(*rmapp & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->shadow_ptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + return nmaps; +} + +static int count_writable_mappings(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + struct kvm_mmu_page *sp; + int i; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + u64 *pt = sp->spt; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + ++nmaps; + } + } + return nmaps; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + int n_rmap = count_rmaps(vcpu); + int n_actual = count_writable_mappings(vcpu); + + if (n_rmap != n_actual) + printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", + __FUNCTION__, audit_msg, n_rmap, n_actual); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + struct 
kvm_memory_slot *slot; + unsigned long *rmapp; + gfn_t gfn; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + if (sp->role.metaphysical) + continue; + + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); + gfn = unalias_gfn(vcpu->kvm, sp->gfn); + rmapp = &slot->rmap[gfn - slot->base_gfn]; + if (*rmapp) + printk(KERN_ERR "%s: (%s) shadow page has writable" + " mappings: gfn %lx role %x\n", + __FUNCTION__, audit_msg, sp->gfn, + sp->role.word); + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + audit_mappings(vcpu); + dbg = olddbg; +} + +#endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h new file mode 100644 index 00000000000..1fce19ec7a2 --- /dev/null +++ b/arch/x86/kvm/mmu.h @@ -0,0 +1,44 @@ +#ifndef __KVM_X86_MMU_H +#define __KVM_X86_MMU_H + +#include <linux/kvm_host.h> + +static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + __kvm_mmu_free_some_pages(vcpu); +} + +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.shadow_efer & EFER_LME; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr4 & X86_CR4_PAE; +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr4 & X86_CR4_PSE; +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr0 & X86_CR0_PG; +} + +#endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h new file mode 100644 index 00000000000..03ba8608fe0 --- /dev/null +++ b/arch/x86/kvm/paging_tmpl.h @@ -0,0 +1,484 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +/* + * We need the mmu code to access both 32-bit and 64-bit guest ptes, + * so the code in this file is compiled twice, once per pte size. 
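+ * For example, when mmu.c includes this file with PTTYPE == 64,
+ * FNAME(page_fault) below expands to paging64_page_fault; the second
+ * inclusion with PTTYPE == 32 generates paging32_page_fault from the
+ * same source.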
+ */ + +#if PTTYPE == 64 + #define pt_element_t u64 + #define guest_walker guest_walker64 + #define FNAME(name) paging##64_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #define CMPXCHG cmpxchg + #else + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 2 + #endif +#elif PTTYPE == 32 + #define pt_element_t u32 + #define guest_walker guest_walker32 + #define FNAME(name) paging##32_##name + #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) + #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_MAX_FULL_LEVELS 2 + #define CMPXCHG cmpxchg +#else + #error Invalid PTTYPE value +#endif + +#define gpte_to_gfn FNAME(gpte_to_gfn) +#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) + +/* + * The guest_walker structure emulates the behavior of the hardware page + * table walker. + */ +struct guest_walker { + int level; + gfn_t table_gfn[PT_MAX_FULL_LEVELS]; + pt_element_t ptes[PT_MAX_FULL_LEVELS]; + gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + unsigned pt_access; + unsigned pte_access; + gfn_t gfn; + u32 error_code; +}; + +static gfn_t gpte_to_gfn(pt_element_t gpte) +{ + return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static gfn_t gpte_to_gfn_pde(pt_element_t gpte) +{ + return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, + gfn_t table_gfn, unsigned index, + pt_element_t orig_pte, pt_element_t new_pte) +{ + pt_element_t ret; + pt_element_t *table; + struct page *page; + + page = gfn_to_page(kvm, table_gfn); + table = kmap_atomic(page, KM_USER0); + + ret = CMPXCHG(&table[index], orig_pte, new_pte); + + kunmap_atomic(table, KM_USER0); + + kvm_release_page_dirty(page); + + return (ret != orig_pte); +} + +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) +{ + unsigned access; + + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; +#if PTTYPE == 64 + if (is_nx(vcpu)) + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + return access; +} + +/* + * Fetch a guest pte for a guest virtual address + */ +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, int fetch_fault) +{ + pt_element_t pte; + gfn_t table_gfn; + unsigned index, pt_access, pte_access; + gpa_t pte_gpa; + + pgprintk("%s: addr %lx\n", __FUNCTION__, addr); +walk: + walker->level = vcpu->arch.mmu.root_level; + pte = vcpu->arch.cr3; +#if PTTYPE == 64 + if (!is_long_mode(vcpu)) { + pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; + if (!is_present_pte(pte)) + goto not_present; + --walker->level; + } +#endif + ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || + (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + + pt_access = ACC_ALL; + + for (;;) { + index = PT_INDEX(addr, walker->level); + + table_gfn = gpte_to_gfn(pte); + pte_gpa = gfn_to_gpa(table_gfn); + pte_gpa += index * sizeof(pt_element_t); + walker->table_gfn[walker->level - 1] = table_gfn; + walker->pte_gpa[walker->level - 1] = pte_gpa; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + 
walker->level - 1, table_gfn); + + kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + + if (!is_present_pte(pte)) + goto not_present; + + if (write_fault && !is_writeble_pte(pte)) + if (user_fault || is_write_protection(vcpu)) + goto access_error; + + if (user_fault && !(pte & PT_USER_MASK)) + goto access_error; + +#if PTTYPE == 64 + if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) + goto access_error; +#endif + + if (!(pte & PT_ACCESSED_MASK)) { + mark_page_dirty(vcpu->kvm, table_gfn); + if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, + index, pte, pte|PT_ACCESSED_MASK)) + goto walk; + pte |= PT_ACCESSED_MASK; + } + + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); + + walker->ptes[walker->level - 1] = pte; + + if (walker->level == PT_PAGE_TABLE_LEVEL) { + walker->gfn = gpte_to_gfn(pte); + break; + } + + if (walker->level == PT_DIRECTORY_LEVEL + && (pte & PT_PAGE_SIZE_MASK) + && (PTTYPE == 64 || is_pse(vcpu))) { + walker->gfn = gpte_to_gfn_pde(pte); + walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); + if (PTTYPE == 32 && is_cpuid_PSE36()) + walker->gfn += pse36_gfn_delta(pte); + break; + } + + pt_access = pte_access; + --walker->level; + } + + if (write_fault && !is_dirty_pte(pte)) { + bool ret; + + mark_page_dirty(vcpu->kvm, table_gfn); + ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, + pte|PT_DIRTY_MASK); + if (ret) + goto walk; + pte |= PT_DIRTY_MASK; + kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); + walker->ptes[walker->level - 1] = pte; + } + + walker->pt_access = pt_access; + walker->pte_access = pte_access; + pgprintk("%s: pte %llx pte_access %x pt_access %x\n", + __FUNCTION__, (u64)pte, pt_access, pte_access); + return 1; + +not_present: + walker->error_code = 0; + goto err; + +access_error: + walker->error_code = PFERR_PRESENT_MASK; + +err: + if (write_fault) + walker->error_code |= PFERR_WRITE_MASK; + if (user_fault) + walker->error_code |= PFERR_USER_MASK; + if (fetch_fault) + walker->error_code |= PFERR_FETCH_MASK; + return 0; +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes, + int offset_in_pte) +{ + pt_element_t gpte; + unsigned pte_access; + struct page *npage; + + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { + if (!offset_in_pte && !is_present_pte(gpte)) + set_shadow_pte(spte, shadow_notrap_nonpresent_pte); + return; + } + if (bytes < sizeof(pt_element_t)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); + if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) + return; + npage = vcpu->arch.update_pte.page; + if (!npage) + return; + get_page(npage); + mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, + gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); +} + +/* + * Fetch a shadow pte for a specific level in the paging hierarchy. 
+ */ +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *walker, + int user_fault, int write_fault, int *ptwrite, + struct page *page) +{ + hpa_t shadow_addr; + int level; + u64 *shadow_ent; + unsigned access = walker->pt_access; + + if (!is_present_pte(walker->ptes[walker->level - 1])) + return NULL; + + shadow_addr = vcpu->arch.mmu.root_hpa; + level = vcpu->arch.mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + for (; ; level--) { + u32 index = SHADOW_PT_INDEX(addr, level); + struct kvm_mmu_page *shadow_page; + u64 shadow_pte; + int metaphysical; + gfn_t table_gfn; + bool new_page = 0; + + shadow_ent = ((u64 *)__va(shadow_addr)) + index; + if (level == PT_PAGE_TABLE_LEVEL) + break; + if (is_shadow_present_pte(*shadow_ent)) { + shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; + continue; + } + + if (level - 1 == PT_PAGE_TABLE_LEVEL + && walker->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(walker->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(walker->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = walker->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, access, + shadow_ent, &new_page); + if (new_page && !metaphysical) { + int r; + pt_element_t curr_pte; + r = kvm_read_guest_atomic(vcpu->kvm, + walker->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != walker->ptes[level - 2]) { + kvm_release_page_clean(page); + return NULL; + } + } + shadow_addr = __pa(shadow_page->spt); + shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *shadow_ent = shadow_pte; + } + + mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, + user_fault, write_fault, + walker->ptes[walker->level-1] & PT_DIRTY_MASK, + ptwrite, walker->gfn, page); + + return shadow_ent; +} + +/* + * Page fault handler. There are several causes for a page fault: + * - there is no shadow pte for the guest pte + * - write access through a shadow pte marked read only so that we can set + * the dirty bit + * - write access to a shadow pte marked read only so we can update the page + * dirty bitmap, when userspace requests it + * - mmio access; in this case we will never install a present shadow pte + * - normal guest page fault due to the guest pte marked not present, not + * writable, or not executable + * + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or + * a negative value on error. + */ +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, + u32 error_code) +{ + int write_fault = error_code & PFERR_WRITE_MASK; + int user_fault = error_code & PFERR_USER_MASK; + int fetch_fault = error_code & PFERR_FETCH_MASK; + struct guest_walker walker; + u64 *shadow_pte; + int write_pt = 0; + int r; + struct page *page; + + pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); + kvm_mmu_audit(vcpu, "pre page fault"); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + down_read(&current->mm->mmap_sem); + /* + * Look up the shadow pte for the faulting address. + */ + r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, + fetch_fault); + + /* + * The page is not mapped by the guest. Let the guest handle it. 
+ */ + if (!r) { + pgprintk("%s: guest page fault\n", __FUNCTION__); + inject_page_fault(vcpu, addr, walker.error_code); + vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ + up_read(&current->mm->mmap_sem); + return 0; + } + + page = gfn_to_page(vcpu->kvm, walker.gfn); + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + &write_pt, page); + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + shadow_pte, *shadow_pte, write_pt); + + if (!write_pt) + vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ + + /* + * mmio: emulate if accessible, otherwise it's a guest fault. + */ + if (shadow_pte && is_io_pte(*shadow_pte)) { + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(&current->mm->mmap_sem); + return 1; + } + + ++vcpu->stat.pf_fixed; + kvm_mmu_audit(vcpu, "post page fault (fixed)"); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(&current->mm->mmap_sem); + + return write_pt; +} + +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } + + return gpa; +} + +static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i, offset = 0, r = 0; + pt_element_t pt; + + if (sp->role.metaphysical + || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { + nonpaging_prefetch_page(vcpu, sp); + return; + } + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + gpa_t pte_gpa = gfn_to_gpa(sp->gfn); + pte_gpa += (i+offset) * sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, + sizeof(pt_element_t)); + if (r || is_present_pte(pt)) + sp->spt[i] = shadow_trap_nonpresent_pte; + else + sp->spt[i] = shadow_notrap_nonpresent_pte; + } +} + +#undef pt_element_t +#undef guest_walker +#undef FNAME +#undef PT_BASE_ADDR_MASK +#undef PT_INDEX +#undef SHADOW_PT_INDEX +#undef PT_LEVEL_MASK +#undef PT_DIR_BASE_ADDR_MASK +#undef PT_LEVEL_BITS +#undef PT_MAX_FULL_LEVELS +#undef gpte_to_gfn +#undef gpte_to_gfn_pde +#undef CMPXCHG diff --git a/drivers/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h index 71fdf458619..56fc4c87338 100644 --- a/drivers/kvm/segment_descriptor.h +++ b/arch/x86/kvm/segment_descriptor.h @@ -1,3 +1,6 @@ +#ifndef __SEGMENT_DESCRIPTOR_H +#define __SEGMENT_DESCRIPTOR_H + struct segment_descriptor { u16 limit_low; u16 base_low; @@ -14,4 +17,13 @@ struct segment_descriptor { u8 base_high; } __attribute__((packed)); +#ifdef CONFIG_X86_64 +/* LDT or TSS descriptor in the GDT. 16 bytes. */ +struct segment_descriptor_64 { + struct segment_descriptor s; + u32 base_higher; + u32 pad_zero; +}; +#endif +#endif diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c index ced4ac1955d..de755cb1431 100644 --- a/drivers/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -13,10 +13,11 @@ * the COPYING file in the top-level directory. 
* */ +#include <linux/kvm_host.h> #include "kvm_svm.h" -#include "x86_emulate.h" #include "irq.h" +#include "mmu.h" #include <linux/module.h> #include <linux/kernel.h> @@ -42,9 +43,6 @@ MODULE_LICENSE("GPL"); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define KVM_EFER_LMA (1 << 10) -#define KVM_EFER_LME (1 << 8) - #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) @@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat) static inline u8 pop_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->irq_summary); - int bit_index = __ffs(vcpu->irq_pending[word_index]); + int word_index = __ffs(vcpu->arch.irq_summary); + int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->irq_pending[word_index]); - if (!vcpu->irq_pending[word_index]) - clear_bit(word_index, &vcpu->irq_summary); + clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); + if (!vcpu->arch.irq_pending[word_index]) + clear_bit(word_index, &vcpu->arch.irq_summary); return irq; } static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) { - set_bit(irq, vcpu->irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + set_bit(irq, vcpu->arch.irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); } static inline void clgi(void) @@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (!(efer & KVM_EFER_LMA)) - efer &= ~KVM_EFER_LME; + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; - vcpu->shadow_efer = efer; + vcpu->arch.shadow_efer = efer; } -static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) +static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_VALID_ERR | - SVM_EVTINJ_TYPE_EXEPT | - GP_VECTOR; + svm->vmcb->control.event_inj = nr + | SVM_EVTINJ_VALID + | (has_error_code ? 
SVM_EVTINJ_VALID_ERR : 0) + | SVM_EVTINJ_TYPE_EXEPT; svm->vmcb->control.event_inj_err = error_code; } -static void inject_ud(struct kvm_vcpu *vcpu) +static bool svm_exception_injected(struct kvm_vcpu *vcpu) { - to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_TYPE_EXEPT | - UD_VECTOR; -} + struct vcpu_svm *svm = to_svm(vcpu); -static int is_page_fault(uint32_t info) -{ - info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); + return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); } static int is_external_interrupt(u32 info) @@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); return; } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { + if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", __FUNCTION__, svm->vmcb->save.rip, svm->next_rip); - } - vcpu->rip = svm->vmcb->save.rip = svm->next_rip; + vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - vcpu->interrupt_window_open = 1; + vcpu->arch.interrupt_window_open = 1; } static int has_svm(void) @@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage) svm_data->next_asid = svm_data->max_asid + 1; svm_features = cpuid_edx(SVM_CPUID_FUNC); - asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); + asm volatile ("sgdt %0" : "=m"(gdt_descr)); gdt = (struct desc_struct *)gdt_descr.address; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); @@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb) control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK; + INTERCEPT_CR4_MASK | + INTERCEPT_CR8_MASK; control->intercept_cr_write = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK; + INTERCEPT_CR4_MASK | + INTERCEPT_CR8_MASK; control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | @@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb) INTERCEPT_DR5_MASK | INTERCEPT_DR7_MASK; - control->intercept_exceptions = 1 << PF_VECTOR; + control->intercept_exceptions = (1 << PF_VECTOR) | + (1 << UD_VECTOR); control->intercept = (1ULL << INTERCEPT_INTR) | @@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb) init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); save->efer = MSR_EFER_SVME_MASK; - - save->dr6 = 0xffff0ff0; + save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; @@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb) /* rdx = ?? 
*/ } -static void svm_vcpu_reset(struct kvm_vcpu *vcpu) +static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu) if (vcpu->vcpu_id != 0) { svm->vmcb->save.rip = 0; - svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; - svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; + svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; + svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } + + return 0; } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; - if (irqchip_in_kernel(kvm)) { - err = kvm_create_lapic(&svm->vcpu); - if (err < 0) - goto free_svm; - } - page = alloc_page(GFP_KERNEL); if (!page) { err = -ENOMEM; @@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; - svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (svm->vcpu.vcpu_id == 0) - svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; + svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; @@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * increasing TSC. */ rdtscll(tsc_this); - delta = vcpu->host_tsc - tsc_this; + delta = vcpu->arch.host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_apic_timer(vcpu); @@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); int i; + ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - rdtscll(vcpu->host_tsc); - kvm_put_guest_fpu(vcpu); + rdtscll(vcpu->arch.host_tsc); } static void svm_vcpu_decache(struct kvm_vcpu *vcpu) @@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->rip = svm->vmcb->save.rip; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.rip = svm->vmcb->save.rip; } static void svm_decache_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->rip; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.rip; } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->shadow_efer & KVM_EFER_LME) { + if (vcpu->arch.shadow_efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->shadow_efer |= KVM_EFER_LMA; - svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; + vcpu->arch.shadow_efer |= EFER_LMA; + svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { - vcpu->shadow_efer &= ~KVM_EFER_LMA; - svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { + vcpu->arch.shadow_efer &= ~EFER_LMA; + svm->vmcb->save.efer &= ~(EFER_LMA | 
EFER_LME); } } #endif - if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; } - vcpu->cr0 = cr0; + vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; @@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - vcpu->cr4 = cr4; + vcpu->arch.cr4 = cr4; to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; } @@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, svm->db_regs[dr] = value; return; case 4 ... 5: - if (vcpu->cr4 & X86_CR4_DE) { + if (vcpu->arch.cr4 & X86_CR4_DE) { *exception = UD_VECTOR; return; } @@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; - enum emulation_result er; - int r; if (!irqchip_in_kernel(kvm) && is_external_interrupt(exit_int_info)) push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); - mutex_lock(&kvm->lock); - fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); - if (r < 0) { - mutex_unlock(&kvm->lock); - return r; - } - if (!r) { - mutex_unlock(&kvm->lock); - return 1; - } - er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, - error_code); - mutex_unlock(&kvm->lock); + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); +} - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_DO_MMIO: - ++svm->vcpu.stat.mmio_exits; - return 0; - case EMULATE_FAIL: - kvm_report_emulation_failure(&svm->vcpu, "pagetable"); - break; - default: - BUG(); - } +static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + int er; - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - return 0; + er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; } static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.cr0 & X86_CR0_TS)) + if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; @@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? + u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? 
*/ int size, down, in, string, rep; unsigned port; @@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) string = (io_info & SVM_IOIO_STR_MASK) != 0; if (string) { - if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(&svm->vcpu, + kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { svm->next_rip = svm->vmcb->save.rip + 3; skip_emulated_instruction(&svm->vcpu); - return kvm_hypercall(&svm->vcpu, kvm_run); + kvm_emulate_hypercall(&svm->vcpu); + return 1; } static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - inject_ud(&svm->vcpu); + kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); return 1; } +static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); + if (irqchip_in_kernel(svm->vcpu.kvm)) + return 1; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; +} + static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; if (svm_get_msr(&svm->vcpu, ecx, &data)) - svm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(&svm->vcpu, 0); else { svm->vmcb->save.rax = data & 0xffffffff; - svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; + svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; svm->next_rip = svm->vmcb->save.rip + 2; skip_emulated_instruction(&svm->vcpu); } @@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_IA32_SYSENTER_ESP: svm->vmcb->save.sysenter_esp = data; break; + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + /* + * only support writing 0 to the performance counters for now + * to make Windows happy. Should be replaced by a real + * performance counter emulation later. 
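+ * (MSR_K7_EVNTSEL0..3 are the AMD K7 performance event select
+ * registers; some Windows versions write them during boot.)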
+ */ + if (data != 0) + goto unhandled; + break; default: + unhandled: return kvm_set_msr_common(vcpu, ecx, data); } return 0; @@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vmcb->save.rax & -1u) - | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); + | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); svm->next_rip = svm->vmcb->save.rip + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) - svm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(&svm->vcpu, 0); else skip_emulated_instruction(&svm->vcpu); return 1; @@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, * possible */ if (kvm_run->request_interrupt_window && - !svm->vcpu.irq_summary) { + !svm->vcpu.arch.irq_summary) { ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; @@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, + [SVM_EXIT_READ_CR8] = emulate_on_interception, /* for now: */ [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_WRITE_CR8] = cr8_write_interception, [SVM_EXIT_READ_DR0] = emulate_on_interception, [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, @@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_INTR] = nop_on_interception, @@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) exit_code); if (exit_code >= ARRAY_SIZE(svm_exit_handlers) - || svm_exit_handlers[exit_code] == 0) { + || !svm_exit_handlers[exit_code]) { kvm_run->exit_reason = KVM_EXIT_UNKNOWN; kvm_run->hw.hardware_exit_reason = exit_code; return 0; @@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu) int cpu = raw_smp_processor_id(); struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm_data->tss_desc->type = 9; //available 32/64-bit TSS + svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); } @@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; int intr_vector = -1; - kvm_inject_pending_timer_irqs(vcpu); if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { intr_vector = vmcb->control.exit_int_info & @@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm) push_irq(&svm->vcpu, control->int_vector); } - svm->vcpu.interrupt_window_open = + svm->vcpu.arch.interrupt_window_open = !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); } static void svm_do_inject_vector(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - int word_index = __ffs(vcpu->irq_summary); - int bit_index = __ffs(vcpu->irq_pending[word_index]); + int word_index = __ffs(vcpu->arch.irq_summary); + int bit_index = 
__ffs(vcpu->arch.irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->irq_pending[word_index]); - if (!vcpu->irq_pending[word_index]) - clear_bit(word_index, &vcpu->irq_summary); + clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); + if (!vcpu->arch.irq_pending[word_index]) + clear_bit(word_index, &vcpu->arch.irq_summary); svm_inject_irq(svm, irq); } @@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - svm->vcpu.interrupt_window_open = + svm->vcpu.arch.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && (svm->vmcb->save.rflags & X86_EFLAGS_IF)); - if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) + if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. */ @@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, /* * Interrupts blocked. Wait for unblock. */ - if (!svm->vcpu.interrupt_window_open && - (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { + if (!svm->vcpu.arch.interrupt_window_open && + (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) control->intercept |= 1ULL << INTERCEPT_VINTR; - } else + else control->intercept &= ~(1ULL << INTERCEPT_VINTR); } +static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + return 0; +} + static void save_db_regs(unsigned long *db_regs) { asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); @@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_cr2 = kvm_read_cr2(); svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); - svm->vmcb->save.cr2 = vcpu->cr2; + svm->vmcb->save.cr2 = vcpu->arch.cr2; if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); @@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm volatile ( #ifdef CONFIG_X86_64 - "push %%rbx; push %%rcx; push %%rdx;" - "push %%rsi; push %%rdi; push %%rbp;" - "push %%r8; push %%r9; push %%r10; push %%r11;" - "push %%r12; push %%r13; push %%r14; push %%r15;" + "push %%rbp; \n\t" #else - "push %%ebx; push %%ecx; push %%edx;" - "push %%esi; push %%edi; push %%ebp;" + "push %%ebp; \n\t" #endif #ifdef CONFIG_X86_64 @@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" - "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" - "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" - "pop %%rbp; pop %%rdi; pop %%rsi;" - "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" + "pop %%rbp; \n\t" #else "mov %%ebx, %c[rbx](%[svm]) \n\t" "mov %%ecx, %c[rcx](%[svm]) \n\t" @@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%edi, %c[rdi](%[svm]) \n\t" "mov %%ebp, %c[rbp](%[svm]) \n\t" - "pop %%ebp; pop %%edi; pop %%esi;" - "pop %%edx; pop %%ecx; pop %%ebx; \n\t" + "pop %%ebp; \n\t" #endif : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) + 
[rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) #ifdef CONFIG_X86_64 - ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), - [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), - [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) + , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif - : "cc", "memory" ); + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rbx", "rcx", "rdx", "rsi", "rdi" + , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#else + , "ebx", "ecx", "edx" , "esi", "edi" +#endif + ); if ((svm->vmcb->save.dr7 & 0xff)) load_db_regs(svm->host_db_regs); - vcpu->cr2 = svm->vmcb->save.cr2; + vcpu->arch.cr2 = svm->vmcb->save.cr2; write_dr6(svm->host_dr6); write_dr7(svm->host_dr7); @@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) } } -static void svm_inject_page_fault(struct kvm_vcpu *vcpu, - unsigned long addr, - uint32_t err_code) -{ - struct vcpu_svm *svm = to_svm(vcpu); - uint32_t exit_int_info = svm->vmcb->control.exit_int_info; - - ++vcpu->stat.pf_guest; - - if (is_page_fault(exit_int_info)) { - - svm->vmcb->control.event_inj_err = 0; - svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_VALID_ERR | - SVM_EVTINJ_TYPE_EXEPT | - DF_VECTOR; - return; - } - vcpu->cr2 = addr; - svm->vmcb->save.cr2 = addr; - svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_VALID_ERR | - SVM_EVTINJ_TYPE_EXEPT | - PF_VECTOR; - svm->vmcb->control.event_inj_err = err_code; -} - - static int is_disabled(void) { u64 vm_cr; @@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xd9; - hypercall[3] = 0xc3; } static void svm_check_processor_compat(void *rtn) @@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn) *(int *)rtn = 0; } +static bool svm_cpu_has_accelerated_tpr(void) +{ + return false; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = { .check_processor_compatibility = svm_check_processor_compat, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, + .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, @@ 
-1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, - .inject_page_fault = svm_inject_page_fault, - - .inject_gp = svm_inject_gp, .run = svm_vcpu_run, .handle_exit = handle_exit, @@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = { .patch_hypercall = svm_patch_hypercall, .get_irq = svm_get_irq, .set_irq = svm_set_irq, + .queue_exception = svm_queue_exception, + .exception_injected = svm_exception_injected, .inject_pending_irq = svm_intr_assist, .inject_pending_vectors = do_interrupt_requests, + + .set_tss_addr = svm_set_tss_addr, }; static int __init svm_init(void) { - return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), + return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) { - kvm_exit_x86(); + kvm_exit(); } module_init(svm_init) diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h index 3b1b0f35b6c..5fd50491b55 100644 --- a/drivers/kvm/svm.h +++ b/arch/x86/kvm/svm.h @@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb { #define INTERCEPT_CR0_MASK 1 #define INTERCEPT_CR3_MASK (1 << 3) #define INTERCEPT_CR4_MASK (1 << 4) +#define INTERCEPT_CR8_MASK (1 << 8) #define INTERCEPT_DR0_MASK 1 #define INTERCEPT_DR1_MASK (1 << 1) @@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP +#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b397b6c9f9..ad36447e696 100644 --- a/drivers/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -15,17 +15,18 @@ * */ -#include "kvm.h" -#include "x86_emulate.h" #include "irq.h" #include "vmx.h" #include "segment_descriptor.h" +#include "mmu.h" +#include <linux/kvm_host.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/moduleparam.h> #include <asm/io.h> #include <asm/desc.h> @@ -33,6 +34,9 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static int bypass_guest_pf = 1; +module_param(bypass_guest_pf, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -43,6 +47,7 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; int launched; u8 fail; + u32 idt_vectoring_info; struct kvm_msr_entry *guest_msrs; struct kvm_msr_entry *host_msrs; int nmsrs; @@ -57,8 +62,15 @@ struct vcpu_vmx { u16 fs_sel, gs_sel, ldt_sel; int gs_ldt_reload_needed; int fs_reload_needed; - }host_state; - + int guest_efer_loaded; + } host_state; + struct { + struct { + bool pending; + u8 vector; + unsigned rip; + } irq; + } rmode; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; -#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) - static struct vmcs_config { int size; int order; u32 revision_id; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; + u32 cpu_based_2nd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; } vmcs_config; @@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n) rdmsrl(e[i].index, e[i].data); } -static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) -{ - return (u64)msr.data & EFER_SAVE_RESTORE_BITS; -} - -static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx) -{ - int efer_offset = vmx->msr_offset_efer; - return 
msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != - msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); -} - static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info) (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } +static inline int is_invalid_opcode(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); +} + static inline int is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm) return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); } +static inline int cpu_has_secondary_exec_ctrls(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); +} + +static inline bool cpu_has_vmx_virtualize_apic_accesses(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); +} + +static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) +{ + return ((cpu_has_vmx_virtualize_apic_accesses()) && + (irqchip_in_kernel(kvm))); +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.host_tsc); + rdtscll(vmx->vcpu.arch.host_tsc); } static void vcpu_clear(struct vcpu_vmx *vmx) { - if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, - vmx, 0, 1); - else - __vcpu_clear(vmx); + if (vmx->vcpu.cpu == -1) + return; + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); vmx->launched = 0; } @@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) u8 error; asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc" ); + : "=q"(error) : "a"(value), "d"(field) : "cc"); if (unlikely(error)) vmwrite_error(field, value); } @@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = 1u << PF_VECTOR; + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) eb |= 1u << 1; - if (vcpu->rmode.active) + if (vcpu->arch.rmode.active) eb = ~0; vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -344,16 +366,42 @@ static void reload_tss(void) static void load_transition_efer(struct vcpu_vmx *vmx) { - u64 trans_efer; int efer_offset = vmx->msr_offset_efer; + u64 host_efer = vmx->host_msrs[efer_offset].data; + u64 guest_efer = vmx->guest_msrs[efer_offset].data; + u64 ignore_bits; - trans_efer = vmx->host_msrs[efer_offset].data; - trans_efer &= ~EFER_SAVE_RESTORE_BITS; - trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); - wrmsrl(MSR_EFER, trans_efer); + if (efer_offset < 0) + return; + /* + * NX is emulated; LMA and LME handled by hardware; SCE meaningless + * outside long mode + */ + ignore_bits = EFER_NX | EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(u64)EFER_SCE; +#endif + if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) + return; + +
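The rewritten load_transition_efer() above skips the expensive EFER wrmsr when host and guest values agree on every bit that matters at this point. A standalone sketch of the ignore-bits test, for illustration only: the EFER_* values below match the architectural bit positions, the helper itself is not the kernel's code, and it assumes an x86-64 host as in the CONFIG_X86_64 branch.

#include <stdbool.h>
#include <stdint.h>

#define EFER_SCE (1ULL << 0)	/* syscall enable */
#define EFER_LME (1ULL << 8)	/* long mode enable */
#define EFER_LMA (1ULL << 10)	/* long mode active */
#define EFER_NX  (1ULL << 11)	/* no-execute */

static bool efer_switch_needed(uint64_t host_efer, uint64_t guest_efer)
{
	/* NX is emulated, and SCE is meaningless outside long mode */
	uint64_t ignore_bits = EFER_NX | EFER_SCE;

	/* LMA/LME are handled by hardware on the 64-bit entry path */
	ignore_bits |= EFER_LMA | EFER_LME;
	/* ...but SCE does matter once the guest runs in long mode */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~EFER_SCE;

	return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}

int main(void)
{
	/* values differ only in ignored bits: no MSR switch needed */
	return efer_switch_needed(EFER_NX | EFER_SCE, 0) ? 1 : 0;
}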
vmx->host_state.guest_efer_loaded = 1; + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + wrmsrl(MSR_EFER, guest_efer); vmx->vcpu.stat.efer_reload++; } +static void reload_host_efer(struct vcpu_vmx *vmx) +{ + if (vmx->host_state.guest_efer_loaded) { + vmx->host_state.guest_efer_loaded = 0; + load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); + } +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { + if (is_long_mode(&vmx->vcpu)) save_msrs(vmx->host_msrs + vmx->msr_offset_kernel_gs_base, 1); - } + #endif load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - if (msr_efer_need_save_restore(vmx)) - load_transition_efer(vmx); + load_transition_efer(vmx); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) if (!vmx->host_state.loaded) return; + ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) load_fs(vmx->host_state.fs_sel); @@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_tss(); save_msrs(vmx->guest_msrs, vmx->save_nmsrs); load_msrs(vmx->host_msrs, vmx->save_nmsrs); - if (msr_efer_need_save_restore(vmx)) - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); + reload_host_efer(vmx); } /* @@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Make sure the time stamp counter is monotonous. */ rdtscll(tsc_this); - delta = vcpu->host_tsc - tsc_this; + delta = vcpu->arch.host_tsc - tsc_this; vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); } } @@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_load_host_state(to_vmx(vcpu)); - kvm_put_guest_fpu(vcpu); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) return; vcpu->fpu_active = 1; vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->cr0 & X86_CR0_TS) + if (vcpu->arch.cr0 & X86_CR0_TS) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->rmode.active) + if (vcpu->arch.rmode.active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } @@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) if (interruptibility & 3) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3); - vcpu->interrupt_window_open = 1; + vcpu->arch.interrupt_window_open = 1; } -static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) +static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code) { - printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", - vmcs_readl(GUEST_RIP)); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - GP_VECTOR | - INTR_TYPE_EXCEPTION | - INTR_INFO_DELIEVER_CODE_MASK | - INTR_INFO_VALID_MASK); + nr | INTR_TYPE_EXCEPTION + | (has_error_code ? 
INTR_INFO_DELIEVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); +} + +static bool vmx_exception_injected(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); } /* @@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) #ifdef CONFIG_X86_64 case MSR_EFER: ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) + if (vmx->host_state.loaded) { + reload_host_efer(vmx); load_transition_efer(vmx); + } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) /* * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->regs. + * registers to be accessed by indexing vcpu->arch.regs. */ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) { - vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->rip = vmcs_readl(GUEST_RIP); + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->arch.rip = vmcs_readl(GUEST_RIP); } /* @@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) */ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) { - vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->rip); + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + vmcs_writel(GUEST_RIP, vcpu->arch.rip); } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 idtv_info_field; - idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); + idtv_info_field = vmx->idt_vectoring_info; if (idtv_info_field & INTR_INFO_VALID_MASK) { if (is_external_interrupt(idtv_info_field)) return idtv_info_field & VECTORING_INFO_VECTOR_MASK; else - printk("pending exception: not handled yet\n"); + printk(KERN_DEBUG "pending exception: not handled yet\n"); } return -1; } @@ -863,7 +918,7 @@ static void hardware_disable(void *garbage) } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32* result) + u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 min, opt; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; + u32 _cpu_based_2nd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING; -#ifdef CONFIG_X86_64 - opt = CPU_BASED_TPR_SHADOW; -#else - opt = 0; -#endif + opt = CPU_BASED_TPR_SHADOW | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= 
~CPU_BASED_CR8_LOAD_EXITING & ~CPU_BASED_CR8_STORE_EXITING; #endif + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { + min = 0; + opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_WBINVD_EXITING; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, + &_cpu_based_2nd_exec_control) < 0) + return -EIO; + } +#ifndef CONFIG_X86_64 + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +#endif min = 0; #ifdef CONFIG_X86_64 @@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; - vcpu->rmode.active = 0; + vcpu->arch.rmode.active = 0; - vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); + vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); + flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); } -static gva_t rmode_tss_base(struct kvm* kvm) +static gva_t rmode_tss_base(struct kvm *kvm) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; - return base_gfn << PAGE_SHIFT; + if (!kvm->arch.tss_addr) { + gfn_t base_gfn = kvm->memslots[0].base_gfn + + kvm->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; + } + return kvm->arch.tss_addr; } static void fix_rmode_seg(int seg, struct kvm_save_segment *save) @@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) save->base = vmcs_readl(sf->base); save->limit = vmcs_read32(sf->limit); save->ar = vmcs_read32(sf->ar_bytes); - vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); + vmcs_write16(sf->selector, save->base >> 4); + vmcs_write32(sf->base, save->base & 0xfffff); vmcs_write32(sf->limit, 0xffff); vmcs_write32(sf->ar_bytes, 0xf3); } @@ -1095,19 +1167,20 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; - vcpu->rmode.active = 1; + vcpu->arch.rmode.active = 1; - vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, 
rmode_tss_base(vcpu->kvm)); - vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vcpu->arch.rmode.save_iopl + = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); + fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); kvm_mmu_reset_context(vcpu); init_rmode_tss(vcpu->kvm); @@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | AR_TYPE_BUSY_64_TSS); } - vcpu->shadow_efer |= EFER_LMA; + vcpu->arch.shadow_efer |= EFER_LMA; find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; vmcs_write32(VM_ENTRY_CONTROLS, @@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->shadow_efer &= ~EFER_LMA; + vcpu->arch.shadow_efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->cr4 &= KVM_GUEST_CR4_MASK; - vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { vmx_fpu_deactivate(vcpu); - if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) + if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) + if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->shadow_efer & EFER_LME) { + if (vcpu->arch.shadow_efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->cr0 = cr0; + vcpu->arch.cr0 = cr0; if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) vmx_fpu_activate(vcpu); @@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - if (vcpu->cr0 & X86_CR0_PE) + if (vcpu->arch.cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? + vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? 
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); - vcpu->cr4 = cr4; + vcpu->arch.cr4 = cr4; } #ifdef CONFIG_X86_64 @@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - vcpu->shadow_efer = efer; + vcpu->arch.shadow_efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->rmode.active && seg == VCPU_SREG_TR) { - vcpu->rmode.tr.selector = var->selector; - vcpu->rmode.tr.base = var->base; - vcpu->rmode.tr.limit = var->limit; - vcpu->rmode.tr.ar = vmx_segment_access_rights(var); + if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { + vcpu->arch.rmode.tr.selector = var->selector; + vcpu->arch.rmode.tr.base = var->base; + vcpu->arch.rmode.tr.limit = var->limit; + vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->rmode.active && var->s) { + if (vcpu->arch.rmode.active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } -static int init_rmode_tss(struct kvm* kvm) +static int init_rmode_tss(struct kvm *kvm) { - struct page *p1, *p2, *p3; gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; - char *page; - - p1 = gfn_to_page(kvm, fn++); - p2 = gfn_to_page(kvm, fn++); - p3 = gfn_to_page(kvm, fn); - - if (!p1 || !p2 || !p3) { - kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); - return 0; - } - - page = kmap_atomic(p1, KM_USER0); - clear_page(page); - *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - kunmap_atomic(page, KM_USER0); - - page = kmap_atomic(p2, KM_USER0); - clear_page(page); - kunmap_atomic(page, KM_USER0); + u16 data = 0; + int ret = 0; + int r; - page = kmap_atomic(p3, KM_USER0); - clear_page(page); - *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; - kunmap_atomic(page, KM_USER0); + down_read(&current->mm->mmap_sem); + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = ~0; + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, + sizeof(u8)); + if (r < 0) + goto out; - return 1; + ret = 1; +out: + up_read(&current->mm->mmap_sem); + return ret; } static void seg_setup(int seg) @@ -1397,6 +1472,27 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, 0x93); } +static int alloc_apic_access_page(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; + + down_write(&current->mm->mmap_sem); + if (kvm->arch.apic_access_page) + goto out; + kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.memory_size = PAGE_SIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); +out: +
up_write(&current->mm->mmap_sem); + return r; +} + /* * Sets up the vmcs for emulated real mode. */ @@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) unsigned long a; struct descriptor_table dt; int i; - int ret = 0; unsigned long kvm_vmx_return; - u64 msr; u32 exec_control; - if (!init_rmode_tss(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } - - vmx->vcpu.rmode.active = 0; - - vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vmx->vcpu.vcpu_id == 0) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); - - fx_init(&vmx->vcpu); - - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. - */ - if (vmx->vcpu.vcpu_id == 0) { - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8); - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12); - } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); - - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); - - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - - vmcs_writel(GUEST_RFLAGS, 0x02); - if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); - else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); - - //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 - vmcs_writel(GUEST_DR7, 0x400); - - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); - - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - - vmcs_write32(GUEST_ACTIVITY_STATE, 0); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - /* I/O */ vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); - guest_write_tsc(0); - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - /* Control */ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmcs_config.pin_based_exec_ctrl); @@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + exec_control &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ @@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + asm("mov
$.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); @@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - setup_msrs(vmx); - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - -#ifdef CONFIG_X86_64 - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.apic->regs_page)); - vmcs_write32(TPR_THRESHOLD, 0); -#endif - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - vmx->vcpu.cr0 = 0x60000010; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode - vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 - vmx_set_efer(&vmx->vcpu, 0); -#endif - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); + if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) + return -ENOMEM; return 0; - -out: - return ret; } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 msr; + int ret; - vmx_vcpu_setup(vmx); -} - -static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) -{ - u16 ent[2]; - u16 cs; - u16 ip; - unsigned long flags; - unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); - u16 sp = vmcs_readl(GUEST_RSP); - u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); - - if (sp > ss_limit || sp < 6 ) { - vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", - __FUNCTION__, - vmcs_readl(GUEST_RSP), - vmcs_readl(GUEST_SS_BASE), - vmcs_read32(GUEST_SS_LIMIT)); - return; + if (!init_rmode_tss(vmx->vcpu.kvm)) { + ret = -ENOMEM; + goto out; } - if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != - X86EMUL_CONTINUE) { - vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); - return; + vmx->vcpu.arch.rmode.active = 0; + + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + set_cr8(&vmx->vcpu, 0); + msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vmx->vcpu.vcpu_id == 0) + msr |= MSR_IA32_APICBASE_BSP; + kvm_set_apic_base(&vmx->vcpu, msr); + + fx_init(&vmx->vcpu); + + /* + * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode + * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 
+ */ + if (vmx->vcpu.vcpu_id == 0) { + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + } else { + vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); + vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - flags = vmcs_readl(GUEST_RFLAGS); - cs = vmcs_readl(GUEST_CS_BASE) >> 4; - ip = vmcs_readl(GUEST_RIP); + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); - if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || - emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || - emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { - vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); - return; + vmcs_writel(GUEST_RFLAGS, 0x02); + if (vmx->vcpu.vcpu_id == 0) + vmcs_writel(GUEST_RIP, 0xfff0); + else + vmcs_writel(GUEST_RIP, 0); + vmcs_writel(GUEST_RSP, 0); + + /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ + vmcs_writel(GUEST_DR7, 0x400); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, 0); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + + guest_write_tsc(0); + + /* Special registers */ + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + setup_msrs(vmx); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->vcpu.arch.apic->regs_page)); + vmcs_write32(TPR_THRESHOLD, 0); } - vmcs_writel(GUEST_RFLAGS, flags & - ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); - vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; - vmcs_writel(GUEST_CS_BASE, ent[1] << 4); - vmcs_writel(GUEST_RIP, ent[0]); - vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); + if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + + vmx->vcpu.arch.cr0 = 0x60000010; + vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr4(&vmx->vcpu, 0); +#ifdef CONFIG_X86_64 + vmx_set_efer(&vmx->vcpu, 0); +#endif + vmx_fpu_activate(&vmx->vcpu); + update_exception_bitmap(&vmx->vcpu); + + return 0; + +out: + return ret; } static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { - if (vcpu->rmode.active) { - inject_rmode_irq(vcpu, irq); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = irq; + vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); return; } 
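The real-mode branch above leans on the VM-entry event-injection machinery instead of emulating the IVT push sequence that the deleted inject_rmode_irq() performed by hand: it records the pending vector, queues a software interrupt with instruction length 1, and rewinds GUEST_RIP by one byte so the CPU itself performs the delivery; fixup_rmode_irq() further down undoes the trick if the injection did not complete. The interruption-information word written here and in vmx_queue_exception() has a fixed layout: vector in bits 7:0, event type in bits 10:8, a deliver-error-code flag in bit 11, and a valid bit in bit 31. A standalone encoder for illustration only, with macro names shortened from the kernel's INTR_* definitions:

#include <stdint.h>
#include <stdio.h>

#define TYPE_EXT_INTR	(0u << 8)	/* external interrupt */
#define TYPE_EXCEPTION	(3u << 8)	/* processor exception */
#define TYPE_SOFT_INTR	(4u << 8)	/* software interrupt */
#define DELIVER_CODE	(1u << 11)	/* push an error code */
#define VALID		(1u << 31)

static uint32_t event_info(uint8_t vector, uint32_t type, int has_err)
{
	return vector | type | (has_err ? DELIVER_CODE : 0) | VALID;
}

int main(void)
{
	/* #GP with an error code, as vmx_queue_exception() builds it */
	printf("%#x\n", (unsigned)event_info(13, TYPE_EXCEPTION, 1));
	/* IRQ 8 injected on the real-mode path above */
	printf("%#x\n", (unsigned)event_info(8, TYPE_SOFT_INTR, 0));
	return 0;
}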
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->irq_summary); - int bit_index = __ffs(vcpu->irq_pending[word_index]); + int word_index = __ffs(vcpu->arch.irq_summary); + int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->irq_pending[word_index]); - if (!vcpu->irq_pending[word_index]) - clear_bit(word_index, &vcpu->irq_summary); + clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); + if (!vcpu->arch.irq_pending[word_index]) + clear_bit(word_index, &vcpu->arch.irq_summary); vmx_inject_irq(vcpu, irq); } @@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, { u32 cpu_based_vm_exec_control; - vcpu->interrupt_window_open = + vcpu->arch.interrupt_window_open = ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - if (vcpu->interrupt_window_open && - vcpu->irq_summary && + if (vcpu->arch.interrupt_window_open && + vcpu->arch.irq_summary && !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. @@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, kvm_do_inject_irq(vcpu); cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - if (!vcpu->interrupt_window_open && - (vcpu->irq_summary || kvm_run->request_interrupt_window)) + if (!vcpu->arch.interrupt_window_open && + (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) /* * Interrupts blocked. Wait for unblock. */ @@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + int ret; + struct kvm_userspace_memory_region tss_mem = { + .slot = 8, + .guest_phys_addr = addr, + .memory_size = PAGE_SIZE * 3, + .flags = 0, + }; + + ret = kvm_set_memory_region(kvm, &tss_mem, 0); + if (ret) + return ret; + kvm->arch.tss_addr = addr; + return 0; +} + static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) { struct kvm_guest_debug *dbg = &vcpu->guest_debug; @@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->rmode.active) + if (!vcpu->arch.rmode.active) return 0; /* @@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. 
*/ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) return 1; return 0; } static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info, error_code; unsigned long cr2, rip; u32 vect_info; enum emulation_result er; - int r; - vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + vect_info = vmx->idt_vectoring_info; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) { + !is_page_fault(intr_info)) printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); - } if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + set_bit(irq, vcpu->arch.irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ @@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } + if (is_invalid_opcode(intr_info)) { + er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + error_code = 0; rip = vmcs_readl(GUEST_RIP); if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); - - mutex_lock(&vcpu->kvm->lock); - r = kvm_mmu_page_fault(vcpu, cr2, error_code); - if (r < 0) { - mutex_unlock(&vcpu->kvm->lock); - return r; - } - if (!r) { - mutex_unlock(&vcpu->kvm->lock); - return 1; - } - - er = emulate_instruction(vcpu, kvm_run, cr2, error_code); - mutex_unlock(&vcpu->kvm->lock); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_DO_MMIO: - ++vcpu->stat.mmio_exits; - return 0; - case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - break; - default: - BUG(); - } + return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->rmode.active && + if (vcpu->arch.rmode.active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { - if (vcpu->halt_request) { - vcpu->halt_request = 0; + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; return kvm_emulate_halt(vcpu); } return 1; } - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { + if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == + (INTR_TYPE_EXCEPTION | 1)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; } @@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) string = (exit_qualification & 16) != 0; if (string) { - if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(vcpu, + kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xc1; - hypercall[3] = 0xc3; } static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) switch (cr) { case 0: vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->regs[reg]); + 
set_cr0(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; case 3: vcpu_load_rsp_rip(vcpu); - set_cr3(vcpu, vcpu->regs[reg]); + set_cr3(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; case 4: vcpu_load_rsp_rip(vcpu); - set_cr4(vcpu, vcpu->regs[reg]); + set_cr4(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - set_cr8(vcpu, vcpu->regs[reg]); + set_cr8(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + return 1; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; }; @@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 2: /* clts */ vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); - vcpu->cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); + vcpu->arch.cr0 &= ~X86_CR0_TS; + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) switch (cr) { case 3: vcpu_load_rsp_rip(vcpu); - vcpu->regs[reg] = vcpu->cr3; + vcpu->arch.regs[reg] = vcpu->arch.cr3; vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - vcpu->regs[reg] = get_cr8(vcpu); + vcpu->arch.regs[reg] = get_cr8(vcpu); vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->regs[reg] = val; + vcpu->arch.regs[reg] = val; } else { /* mov to dr */ } @@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { - vmx_inject_gp(vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->regs[VCPU_REGS_RAX] = data & -1u; - vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; skip_emulated_instruction(vcpu); return 1; } static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 ecx = vcpu->regs[VCPU_REGS_RCX]; - u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); if (vmx_set_msr(vcpu, ecx, data) != 0) { - vmx_inject_gp(vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } @@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * possible */ if (kvm_run->request_interrupt_window && - !vcpu->irq_summary) { + !vcpu->arch.irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.irq_window_exits; return 0; @@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); - return kvm_hypercall(vcpu, kvm_run); + kvm_emulate_hypercall(vcpu); + return 1; +} + +static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + /* TODO: Add support for VT-d/pass-through device */ + return 1; +} + +static int 
handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + enum emulation_result er; + unsigned long offset; + + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + offset = exit_qualification & 0xffful; + + er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + if (er != EMULATE_DONE) { + printk(KERN_ERR + "Fail to handle apic access vmexit! Offset is 0x%lx\n", + offset); + return -ENOTSUPP; + } + return 1; } /* @@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_WBINVD] = handle_wbinvd, }; static const int kvm_vmx_max_exit_handlers = @@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers = */ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); u32 exit_reason = vmcs_read32(VM_EXIT_REASON); struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vectoring_info = vmx->idt_vectoring_info; if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } - if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && - exit_reason != EXIT_REASON_EXCEPTION_NMI ) + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + exit_reason != EXIT_REASON_EXCEPTION_NMI) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __FUNCTION__, exit_reason); if (exit_reason < kvm_vmx_max_exit_handlers @@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void vmx_intr_assist(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 idtv_info_field, intr_info_field; int has_ext_irq, interrupt_window_open; int vector; - kvm_inject_pending_timer_irqs(vcpu); update_tpr_threshold(vcpu); has_ext_irq = kvm_cpu_has_interrupt(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); + idtv_info_field = vmx->idt_vectoring_info; if (intr_info_field & INTR_INFO_VALID_MASK) { if (idtv_info_field & INTR_INFO_VALID_MASK) { /* TODO: fault when IDT_Vectoring */ - printk(KERN_ERR "Fault when IDT_Vectoring\n"); + if (printk_ratelimit()) + printk(KERN_ERR "Fault when IDT_Vectoring\n"); } if (has_ext_irq) enable_irq_window(vcpu); return; } if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { + if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) + == INTR_TYPE_EXT_INTR + && vcpu->arch.rmode.active) { + u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; + + vmx_inject_irq(vcpu, vect); + if (unlikely(has_ext_irq)) + enable_irq_window(vcpu); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); @@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) enable_irq_window(vcpu); } +/* + * Failure to inject an interrupt should give us the information + * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs + * when fetching the interrupt redirection bitmap in the real-mode + * tss, this doesn't happen. So we do it ourselves. 
+ */ +static void fixup_rmode_irq(struct vcpu_vmx *vmx) +{ + vmx->rmode.irq.pending = 0; + if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) + return; + vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); + if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; + vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; + return; + } + vmx->idt_vectoring_info = + VECTORING_INFO_VALID_MASK + | INTR_TYPE_EXT_INTR + | vmx->rmode.irq.vector; +} + static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); - asm ( + asm( /* Store host registers */ #ifdef CONFIG_X86_64 - "push %%rax; push %%rbx; push %%rdx;" - "push %%rsi; push %%rdi; push %%rbp;" - "push %%r8; push %%r9; push %%r10; push %%r11;" - "push %%r12; push %%r13; push %%r14; push %%r15;" + "push %%rdx; push %%rbp;" "push %%rcx \n\t" - ASM_VMX_VMWRITE_RSP_RDX "\n\t" #else - "pusha; push %%ecx \n\t" - ASM_VMX_VMWRITE_RSP_RDX "\n\t" + "push %%edx; push %%ebp;" + "push %%ecx \n\t" #endif + ASM_VMX_VMWRITE_RSP_RDX "\n\t" /* Check if vmlaunch of vmresume is needed */ - "cmp $0, %1 \n\t" + "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ #ifdef CONFIG_X86_64 - "mov %c[cr2](%3), %%rax \n\t" + "mov %c[cr2](%0), %%rax \n\t" "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%3), %%rax \n\t" - "mov %c[rbx](%3), %%rbx \n\t" - "mov %c[rdx](%3), %%rdx \n\t" - "mov %c[rsi](%3), %%rsi \n\t" - "mov %c[rdi](%3), %%rdi \n\t" - "mov %c[rbp](%3), %%rbp \n\t" - "mov %c[r8](%3), %%r8 \n\t" - "mov %c[r9](%3), %%r9 \n\t" - "mov %c[r10](%3), %%r10 \n\t" - "mov %c[r11](%3), %%r11 \n\t" - "mov %c[r12](%3), %%r12 \n\t" - "mov %c[r13](%3), %%r13 \n\t" - "mov %c[r14](%3), %%r14 \n\t" - "mov %c[r15](%3), %%r15 \n\t" - "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ + "mov %c[rax](%0), %%rax \n\t" + "mov %c[rbx](%0), %%rbx \n\t" + "mov %c[rdx](%0), %%rdx \n\t" + "mov %c[rsi](%0), %%rsi \n\t" + "mov %c[rdi](%0), %%rdi \n\t" + "mov %c[rbp](%0), %%rbp \n\t" + "mov %c[r8](%0), %%r8 \n\t" + "mov %c[r9](%0), %%r9 \n\t" + "mov %c[r10](%0), %%r10 \n\t" + "mov %c[r11](%0), %%r11 \n\t" + "mov %c[r12](%0), %%r12 \n\t" + "mov %c[r13](%0), %%r13 \n\t" + "mov %c[r14](%0), %%r14 \n\t" + "mov %c[r15](%0), %%r15 \n\t" + "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ #else - "mov %c[cr2](%3), %%eax \n\t" + "mov %c[cr2](%0), %%eax \n\t" "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%3), %%eax \n\t" - "mov %c[rbx](%3), %%ebx \n\t" - "mov %c[rdx](%3), %%edx \n\t" - "mov %c[rsi](%3), %%esi \n\t" - "mov %c[rdi](%3), %%edi \n\t" - "mov %c[rbp](%3), %%ebp \n\t" - "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ + "mov %c[rax](%0), %%eax \n\t" + "mov %c[rbx](%0), %%ebx \n\t" + "mov %c[rdx](%0), %%edx \n\t" + "mov %c[rsi](%0), %%esi \n\t" + "mov %c[rdi](%0), %%edi \n\t" + "mov %c[rbp](%0), %%ebp \n\t" + "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ #endif /* Enter guest mode */ "jne .Llaunched \n\t" @@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ #ifdef CONFIG_X86_64 - "xchg %3, (%%rsp) \n\t" - "mov %%rax, %c[rax](%3) \n\t" - "mov %%rbx, %c[rbx](%3) \n\t" - "pushq (%%rsp); popq %c[rcx](%3) \n\t" - "mov %%rdx, %c[rdx](%3) \n\t" - "mov %%rsi, %c[rsi](%3) \n\t" - "mov %%rdi, %c[rdi](%3) \n\t" - "mov %%rbp, 
%c[rbp](%3) \n\t" - "mov %%r8, %c[r8](%3) \n\t" - "mov %%r9, %c[r9](%3) \n\t" - "mov %%r10, %c[r10](%3) \n\t" - "mov %%r11, %c[r11](%3) \n\t" - "mov %%r12, %c[r12](%3) \n\t" - "mov %%r13, %c[r13](%3) \n\t" - "mov %%r14, %c[r14](%3) \n\t" - "mov %%r15, %c[r15](%3) \n\t" + "xchg %0, (%%rsp) \n\t" + "mov %%rax, %c[rax](%0) \n\t" + "mov %%rbx, %c[rbx](%0) \n\t" + "pushq (%%rsp); popq %c[rcx](%0) \n\t" + "mov %%rdx, %c[rdx](%0) \n\t" + "mov %%rsi, %c[rsi](%0) \n\t" + "mov %%rdi, %c[rdi](%0) \n\t" + "mov %%rbp, %c[rbp](%0) \n\t" + "mov %%r8, %c[r8](%0) \n\t" + "mov %%r9, %c[r9](%0) \n\t" + "mov %%r10, %c[r10](%0) \n\t" + "mov %%r11, %c[r11](%0) \n\t" + "mov %%r12, %c[r12](%0) \n\t" + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%3) \n\t" - "mov (%%rsp), %3 \n\t" + "mov %%rax, %c[cr2](%0) \n\t" - "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" - "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" - "pop %%rbp; pop %%rdi; pop %%rsi;" - "pop %%rdx; pop %%rbx; pop %%rax \n\t" + "pop %%rbp; pop %%rbp; pop %%rdx \n\t" #else - "xchg %3, (%%esp) \n\t" - "mov %%eax, %c[rax](%3) \n\t" - "mov %%ebx, %c[rbx](%3) \n\t" - "pushl (%%esp); popl %c[rcx](%3) \n\t" - "mov %%edx, %c[rdx](%3) \n\t" - "mov %%esi, %c[rsi](%3) \n\t" - "mov %%edi, %c[rdi](%3) \n\t" - "mov %%ebp, %c[rbp](%3) \n\t" + "xchg %0, (%%esp) \n\t" + "mov %%eax, %c[rax](%0) \n\t" + "mov %%ebx, %c[rbx](%0) \n\t" + "pushl (%%esp); popl %c[rcx](%0) \n\t" + "mov %%edx, %c[rdx](%0) \n\t" + "mov %%esi, %c[rsi](%0) \n\t" + "mov %%edi, %c[rdi](%0) \n\t" + "mov %%ebp, %c[rbp](%0) \n\t" "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%3) \n\t" - "mov (%%esp), %3 \n\t" + "mov %%eax, %c[cr2](%0) \n\t" - "pop %%ecx; popa \n\t" + "pop %%ebp; pop %%ebp; pop %%edx \n\t" +#endif + "setbe %c[fail](%0) \n\t" + : : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), +#ifdef CONFIG_X86_64 + [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - "setbe %0 \n\t" - : "=q" (vmx->fail) - : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), - "c"(vcpu), - [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), - 
[rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + : "cc", "memory" #ifdef CONFIG_X86_64 - [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), - [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), - [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), + , "rbx", "rdi", "rsi" + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "ebx", "edi", "rsi" #endif - [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) - : "cc", "memory" ); + ); + + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); - vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + vcpu->arch.interrupt_window_open = + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; - asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm("int $2"); } -static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, - unsigned long addr, - u32 err_code) -{ - u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - ++vcpu->stat.pf_guest; - - if (is_page_fault(vect_info)) { - printk(KERN_DEBUG "inject_page_fault: " - "double fault 0x%lx @ 0x%lx\n", - addr, vmcs_readl(GUEST_RIP)); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - DF_VECTOR | - INTR_TYPE_EXCEPTION | - INTR_INFO_DELIEVER_CODE_MASK | - INTR_INFO_VALID_MASK); - return; - } - vcpu->cr2 = addr; - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - PF_VECTOR | - INTR_TYPE_EXCEPTION | - INTR_INFO_DELIEVER_CODE_MASK | - INTR_INFO_VALID_MASK); - -} - static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; - if (irqchip_in_kernel(kvm)) { - err = kvm_create_lapic(&vmx->vcpu); - if (err < 0) - goto free_vcpu; - } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!vmx->guest_msrs) { err = -ENOMEM; @@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, + .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, - .inject_page_fault = vmx_inject_page_fault, - - .inject_gp = vmx_inject_gp, .run = vmx_vcpu_run, .handle_exit = kvm_handle_exit, @@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = { .patch_hypercall = vmx_patch_hypercall, .get_irq = vmx_get_irq, .set_irq = vmx_inject_irq, + .queue_exception = vmx_queue_exception, + .exception_injected = vmx_exception_injected, .inject_pending_irq = vmx_intr_assist, .inject_pending_vectors = do_interrupt_requests, + + .set_tss_addr = vmx_set_tss_addr, }; static int __init vmx_init(void) 
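Both the svm.c and the vmx.c vcpu_run rewrites in this patch stop pinning one host register per guest register: guest state is now reached through a single struct pointer plus compile-time field offsets, and the remaining register bookkeeping is left to the compiler via the clobber list. The addressing pattern, reduced to a minimal standalone example (x86-64 GCC assumed; the %c operand modifier prints the constant without the usual $ prefix, which is what lets it serve as a displacement):

#include <stddef.h>
#include <stdio.h>

struct regs {
	unsigned long rax;
	unsigned long rbx;
};

int main(void)
{
	struct regs r = { .rax = 0, .rbx = 42 };
	unsigned long out;

	/* "i" makes offsetof() an immediate, so %c[rbx](%[base])
	 * assembles to a plain displacement off the base pointer */
	asm("mov %c[rbx](%[base]), %[out]"
	    : [out]"=r"(out)
	    : [base]"r"(&r),
	      [rbx]"i"(offsetof(struct regs, rbx)));

	printf("%lu\n", out);	/* prints 42 */
	return 0;
}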
@@ -2541,10 +2651,13 @@ static int __init vmx_init(void) memset(iova, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); - r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out1; + if (bypass_guest_pf) + kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); + return 0; out1: @@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void) __free_page(vmx_io_bitmap_b); __free_page(vmx_io_bitmap_a); - kvm_exit_x86(); + kvm_exit(); } module_init(vmx_init) diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h index fd4e1466608..d52ae8d7303 100644 --- a/drivers/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -25,6 +25,9 @@ * */ +/* + * Definitions of Primary Processor-Based VM-Execution Controls. + */ #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 @@ -42,6 +45,12 @@ #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 +/* + * Definitions of Secondary Processor-Based VM-Execution Controls. + */ +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 + #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -54,8 +63,6 @@ #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 -#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 - /* VMCS Encodings */ enum vmcs_field { GUEST_ES_SELECTOR = 0x00000800, @@ -89,6 +96,8 @@ enum vmcs_field { TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + APIC_ACCESS_ADDR = 0x00002014, + APIC_ACCESS_ADDR_HIGH = 0x00002015, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, @@ -214,6 +223,8 @@ enum vmcs_field { #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_WBINVD 54 /* * Interruption-information format @@ -230,13 +241,14 @@ enum vmcs_field { #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ /* * Exit Qualifications for MOV for Control Register Access */ -#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ -#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ #define LMSW_SOURCE_DATA_SHIFT 16 #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ #define REG_EAX (0 << 8) @@ -259,11 +271,11 @@ enum vmcs_field { /* * Exit Qualifications for MOV for Debug Register Access */ -#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ #define TYPE_MOV_TO_DR (0 << 4) #define TYPE_MOV_FROM_DR (1 << 4) -#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ +#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. 
*/ /* segment AR */ @@ -307,4 +319,6 @@ enum vmcs_field { #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 + #endif diff --git a/drivers/kvm/kvm_main.c b/arch/x86/kvm/x86.c index c0f372f1d76..8f94a0b89df 100644 --- a/drivers/kvm/kvm_main.c +++ b/arch/x86/kvm/x86.c @@ -1,8 +1,7 @@ /* * Kernel-based Virtual Machine driver for Linux * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. + * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. * @@ -15,80 +14,22 @@ * */ -#include "kvm.h" -#include "x86_emulate.h" +#include <linux/kvm_host.h> #include "segment_descriptor.h" #include "irq.h" +#include "mmu.h" #include <linux/kvm.h> -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/percpu.h> -#include <linux/gfp.h> -#include <linux/mm.h> -#include <linux/miscdevice.h> +#include <linux/fs.h> #include <linux/vmalloc.h> -#include <linux/reboot.h> -#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/mman.h> #include <linux/highmem.h> -#include <linux/file.h> -#include <linux/sysdev.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/cpumask.h> -#include <linux/smp.h> -#include <linux/anon_inodes.h> -#include <linux/profile.h> - -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/io.h> -#include <asm/uaccess.h> -#include <asm/desc.h> - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); -static DEFINE_SPINLOCK(kvm_lock); -static LIST_HEAD(vm_list); - -static cpumask_t cpus_hardware_enabled; - -struct kvm_x86_ops *kvm_x86_ops; -struct kmem_cache *kvm_vcpu_cache; -EXPORT_SYMBOL_GPL(kvm_vcpu_cache); - -static __read_mostly struct preempt_ops kvm_preempt_ops; - -#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) - -static struct kvm_stats_debugfs_item { - const char *name; - int offset; - struct dentry *dentry; -} debugfs_entries[] = { - { "pf_fixed", STAT_OFFSET(pf_fixed) }, - { "pf_guest", STAT_OFFSET(pf_guest) }, - { "tlb_flush", STAT_OFFSET(tlb_flush) }, - { "invlpg", STAT_OFFSET(invlpg) }, - { "exits", STAT_OFFSET(exits) }, - { "io_exits", STAT_OFFSET(io_exits) }, - { "mmio_exits", STAT_OFFSET(mmio_exits) }, - { "signal_exits", STAT_OFFSET(signal_exits) }, - { "irq_window", STAT_OFFSET(irq_window_exits) }, - { "halt_exits", STAT_OFFSET(halt_exits) }, - { "halt_wakeup", STAT_OFFSET(halt_wakeup) }, - { "request_irq", STAT_OFFSET(request_irq_exits) }, - { "irq_exits", STAT_OFFSET(irq_exits) }, - { "light_exits", STAT_OFFSET(light_exits) }, - { "efer_reload", STAT_OFFSET(efer_reload) }, - { NULL } -}; - -static struct dentry *debugfs_dir; +#include <asm/uaccess.h> +#include <asm/msr.h> #define MAX_IO_MSRS 256 - #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ @@ -102,317 +43,151 @@ static struct dentry *debugfs_dir; #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define EFER_RESERVED_BITS 0xfffffffffffff2fe -#ifdef CONFIG_X86_64 -// LDT or TSS descriptor in the GDT. 16 bytes. 
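CR0_RESERVED_BITS above is built by OR-ing together every architecturally defined CR0 bit and complementing the result; set_cr0() later injects #GP whenever a guest-supplied value intersects that complement. A hedged, standalone illustration of the same mask arithmetic (bit positions from the x86 manuals; this is not kernel code):

#include <stdio.h>

#define X86_CR0_PE (1UL << 0)
#define X86_CR0_MP (1UL << 1)
#define X86_CR0_EM (1UL << 2)
#define X86_CR0_TS (1UL << 3)
#define X86_CR0_ET (1UL << 4)
#define X86_CR0_NE (1UL << 5)
#define X86_CR0_WP (1UL << 16)
#define X86_CR0_AM (1UL << 18)
#define X86_CR0_NW (1UL << 29)
#define X86_CR0_CD (1UL << 30)
#define X86_CR0_PG (1UL << 31)

#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))

int main(void)
{
	unsigned long good = X86_CR0_PE | X86_CR0_PG;
	unsigned long bad  = good | (1UL << 7);	/* bit 7 is reserved */

	printf("good & reserved = %#lx\n", good & CR0_RESERVED_BITS); /* 0 */
	printf("bad  & reserved = %#lx\n", bad & CR0_RESERVED_BITS);  /* nonzero -> #GP */
	return 0;
}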
-struct segment_descriptor_64 { - struct segment_descriptor s; - u32 base_higher; - u32 pad_zero; -}; +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#endif +struct kvm_x86_ops *kvm_x86_ops; + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { "pf_fixed", VCPU_STAT(pf_fixed) }, + { "pf_guest", VCPU_STAT(pf_guest) }, + { "tlb_flush", VCPU_STAT(tlb_flush) }, + { "invlpg", VCPU_STAT(invlpg) }, + { "exits", VCPU_STAT(exits) }, + { "io_exits", VCPU_STAT(io_exits) }, + { "mmio_exits", VCPU_STAT(mmio_exits) }, + { "signal_exits", VCPU_STAT(signal_exits) }, + { "irq_window", VCPU_STAT(irq_window_exits) }, + { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "request_irq", VCPU_STAT(request_irq_exits) }, + { "irq_exits", VCPU_STAT(irq_exits) }, + { "host_state_reload", VCPU_STAT(host_state_reload) }, + { "efer_reload", VCPU_STAT(efer_reload) }, + { "fpu_reload", VCPU_STAT(fpu_reload) }, + { "insn_emulation", VCPU_STAT(insn_emulation) }, + { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, + { "mmu_pte_write", VM_STAT(mmu_pte_write) }, + { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, + { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, + { "mmu_flooded", VM_STAT(mmu_flooded) }, + { "mmu_recycled", VM_STAT(mmu_recycled) }, + { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { NULL } +}; -static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, - unsigned long arg); unsigned long segment_base(u16 selector) { struct descriptor_table gdt; struct segment_descriptor *d; unsigned long table_base; - typedef unsigned long ul; unsigned long v; if (selector == 0) return 0; - asm ("sgdt %0" : "=m"(gdt)); + asm("sgdt %0" : "=m"(gdt)); table_base = gdt.base; if (selector & 4) { /* from ldt */ u16 ldt_selector; - asm ("sldt %0" : "=g"(ldt_selector)); + asm("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } d = (struct segment_descriptor *)(table_base + (selector & ~7)); - v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); + v = d->base_low | ((unsigned long)d->base_mid << 16) | + ((unsigned long)d->base_high << 24); #ifdef CONFIG_X86_64 - if (d->system == 0 - && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; + if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long) \ + ((struct segment_descriptor_64 *)d)->base_higher) << 32; #endif return v; } EXPORT_SYMBOL_GPL(segment_base); -static inline int valid_vcpu(int n) -{ - return likely(n >= 0 && n < KVM_MAX_VCPUS); -} - -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 1; - fx_save(&vcpu->host_fx_image); - fx_restore(&vcpu->guest_fx_image); -} -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); - -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; - fx_save(&vcpu->guest_fx_image); - fx_restore(&vcpu->host_fx_image); -} -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); - -/* - * Switches to specified vcpu, until a matching vcpu_put() - */ -static void vcpu_load(struct kvm_vcpu *vcpu) -{ - int cpu; - - mutex_lock(&vcpu->mutex); - cpu = get_cpu(); - preempt_notifier_register(&vcpu->preempt_notifier); - kvm_x86_ops->vcpu_load(vcpu, cpu); 
- put_cpu(); -} - -static void vcpu_put(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - kvm_x86_ops->vcpu_put(vcpu); - preempt_notifier_unregister(&vcpu->preempt_notifier); - preempt_enable(); - mutex_unlock(&vcpu->mutex); -} - -static void ack_flush(void *_completed) -{ -} - -void kvm_flush_remote_tlbs(struct kvm *kvm) -{ - int i, cpu; - cpumask_t cpus; - struct kvm_vcpu *vcpu; - - cpus_clear(cpus); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) - continue; - cpu = vcpu->cpu; - if (cpu != -1 && cpu != raw_smp_processor_id()) - cpu_set(cpu, cpus); - } - smp_call_function_mask(cpus, ack_flush, NULL, 1); -} - -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { - struct page *page; - int r; - - mutex_init(&vcpu->mutex); - vcpu->cpu = -1; - vcpu->mmu.root_hpa = INVALID_PAGE; - vcpu->kvm = kvm; - vcpu->vcpu_id = id; - if (!irqchip_in_kernel(kvm) || id == 0) - vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + if (irqchip_in_kernel(vcpu->kvm)) + return vcpu->arch.apic_base; else - vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; - init_waitqueue_head(&vcpu->wq); - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->run = page_address(page); - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail_free_run; - } - vcpu->pio_data = page_address(page); - - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail_free_pio_data; - - return 0; - -fail_free_pio_data: - free_page((unsigned long)vcpu->pio_data); -fail_free_run: - free_page((unsigned long)vcpu->run); -fail: - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(kvm_vcpu_init); - -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - kvm_mmu_destroy(vcpu); - if (vcpu->apic) - hrtimer_cancel(&vcpu->apic->timer.dev); - kvm_free_apic(vcpu->apic); - free_page((unsigned long)vcpu->pio_data); - free_page((unsigned long)vcpu->run); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); - -static struct kvm *kvm_create_vm(void) -{ - struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - - if (!kvm) - return ERR_PTR(-ENOMEM); - - kvm_io_bus_init(&kvm->pio_bus); - mutex_init(&kvm->lock); - INIT_LIST_HEAD(&kvm->active_mmu_pages); - kvm_io_bus_init(&kvm->mmio_bus); - spin_lock(&kvm_lock); - list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); - return kvm; -} - -/* - * Free any memory in @free but not in @dont. 
- */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ - int i; - - if (!dont || free->phys_mem != dont->phys_mem) - if (free->phys_mem) { - for (i = 0; i < free->npages; ++i) - if (free->phys_mem[i]) - __free_page(free->phys_mem[i]); - vfree(free->phys_mem); - } - - if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - vfree(free->dirty_bitmap); - - free->phys_mem = NULL; - free->npages = 0; - free->dirty_bitmap = NULL; -} - -static void kvm_free_physmem(struct kvm *kvm) -{ - int i; - - for (i = 0; i < kvm->nmemslots; ++i) - kvm_free_physmem_slot(&kvm->memslots[i], NULL); + return vcpu->arch.apic_base; } +EXPORT_SYMBOL_GPL(kvm_get_apic_base); -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) { - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) - if (vcpu->pio.guest_pages[i]) { - __free_page(vcpu->pio.guest_pages[i]); - vcpu->pio.guest_pages[i] = NULL; - } + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->arch.apic_base = data; } +EXPORT_SYMBOL_GPL(kvm_set_apic_base); -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); + WARN_ON(vcpu->arch.exception.pending); + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.nr = nr; } +EXPORT_SYMBOL_GPL(kvm_queue_exception); -static void kvm_free_vcpus(struct kvm *kvm) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + u32 error_code) { - unsigned int i; - - /* - * Unpin any mmu pages first. - */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm_unload_vcpu_mmu(kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_x86_ops->vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } + ++vcpu->stat.pf_guest; + if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { + printk(KERN_DEBUG "kvm: inject_page_fault:" + " double fault 0x%lx\n", addr); + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + return; } - + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } -static void kvm_destroy_vm(struct kvm *kvm) +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - spin_lock(&kvm_lock); - list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); - kvm_io_bus_destroy(&kvm->pio_bus); - kvm_io_bus_destroy(&kvm->mmio_bus); - kfree(kvm->vpic); - kfree(kvm->vioapic); - kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); - kfree(kvm); + WARN_ON(vcpu->arch.exception.pending); + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; } +EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -static int kvm_vm_release(struct inode *inode, struct file *filp) +static void __queue_exception(struct kvm_vcpu *vcpu) { - struct kvm *kvm = filp->private_data; - - kvm_destroy_vm(kvm); - return 0; -} - -static void inject_gp(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->inject_gp(vcpu, 0); + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); } /* * Load the pae pdptrs. Return true if they are all valid.
*/ -static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; - u64 *pdpt; int ret; - struct page *page; - u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - mutex_lock(&vcpu->kvm->lock); - page = gfn_to_page(vcpu->kvm, pdpt_gfn); - if (!page) { + down_read(&current->mm->mmap_sem); + ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte)); + if (ret < 0) { ret = 0; goto out; } - - pdpt = kmap_atomic(page, KM_USER0); - memcpy(pdpte, pdpt+offset, sizeof(pdpte)); - kunmap_atomic(pdpt, KM_USER0); - for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { ret = 0; @@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); + memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); out: - mutex_unlock(&vcpu->kvm->lock); + up_read(&current->mm->mmap_sem); return ret; } +static bool pdptrs_changed(struct kvm_vcpu *vcpu) +{ + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + bool changed = true; + int r; + + if (is_long_mode(vcpu) || !is_pae(vcpu)) + return false; + + down_read(&current->mm->mmap_sem); + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + if (r < 0) + goto out; + changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; +out: + up_read(&current->mm->mmap_sem); + + return changed; +} + void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->cr0); - inject_gp(vcpu); + cr0, vcpu->arch.cr0); + kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { printk(KERN_DEBUG "set_cr0: #GP, set PG flag " "and a clear PE flag\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->shadow_efer & EFER_LME)) { + if ((vcpu->arch.shadow_efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while PAE is disabled\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while CS.L == 1\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { printk(KERN_DEBUG "set_cr0: #GP, pdptrs " "reserved bits\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } } kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->cr0 = cr0; + vcpu->arch.cr0 = cr0; - mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); + set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(lmsw); @@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); - inject_gp(vcpu); + 
kvm_inject_gp(vcpu, 0); return; } @@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!(cr4 & X86_CR4_PAE)) { printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " "in long mode\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) - && !load_pdptrs(vcpu, vcpu->cr3)) { + && !load_pdptrs(vcpu, vcpu->arch.cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } if (cr4 & X86_CR4_VMXE) { printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->cr4 = cr4; - mutex_lock(&vcpu->kvm->lock); + vcpu->arch.cr4 = cr4; kvm_mmu_reset_context(vcpu); - mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_flush_tlb(vcpu); + return; + } + if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } } else { @@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 & CR3_PAE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { printk(KERN_DEBUG "set_cr3: #GP, pdptrs " "reserved bits\n"); - inject_gp(vcpu); - return; - } - } else { - if (cr3 & CR3_NONPAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } } + /* + * We don't check reserved bits in nonpae mode, because + * this isn't enforced, and VMware depends on this. + */ } - mutex_lock(&vcpu->kvm->lock); + down_read(&current->mm->mmap_sem); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); else { - vcpu->cr3 = cr3; - vcpu->mmu.new_cr3(vcpu); + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); } - mutex_unlock(&vcpu->kvm->lock); + up_read(&current->mm->mmap_sem); } EXPORT_SYMBOL_GPL(set_cr3); @@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return; } if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else - vcpu->cr8 = cr8; + vcpu->arch.cr8 = cr8; } EXPORT_SYMBOL_GPL(set_cr8); @@ -602,210 +395,846 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) return kvm_lapic_get_cr8(vcpu); else - return vcpu->cr8; + return vcpu->arch.cr8; } EXPORT_SYMBOL_GPL(get_cr8); -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. 
+ */ +static u32 msrs_to_save[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TIME_STAMP_COUNTER, +}; + +static unsigned num_msrs_to_save; + +static u32 emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +#ifdef CONFIG_X86_64 + +static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->apic_base; - else - return vcpu->apic_base; + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_paging(vcpu) + && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + kvm_x86_ops->set_efer(vcpu, efer); + + efer &= ~EFER_LMA; + efer |= vcpu->arch.shadow_efer & EFER_LMA; + + vcpu->arch.shadow_efer = efer; } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +#endif + +/* + * Writes msr value into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->apic_base = data; + return kvm_x86_ops->set_msr(vcpu, msr_index, data); } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); -void fx_init(struct kvm_vcpu *vcpu) +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - unsigned after_mxcsr_mask; + return kvm_set_msr(vcpu, index, *data); +} - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - fx_save(&vcpu->host_fx_image); - fpu_init(); - fx_save(&vcpu->guest_fx_image); - fx_restore(&vcpu->host_fx_image); - preempt_enable(); - vcpu->cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_MCG_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + default: + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); + return 1; + } + return 0; } -EXPORT_SYMBOL_GPL(fx_init); +EXPORT_SYMBOL_GPL(kvm_set_msr_common); + + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data = 0; + break; + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; +#ifdef CONFIG_X86_64 + case MSR_EFER: + data = vcpu->arch.shadow_efer; + break; +#endif + default: + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_msr_common); /* - * Allocate some memory and give it an address in the guest physical address - * space. + * Read or write a bunch of msrs. All parameters are kernel addresses. * - * Discontiguous memory is allowed, mostly for framebuffers. + * @return number of msrs set successfully. */ -static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_memory_region *mem) +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) { - int r; - gfn_t base_gfn; - unsigned long npages; - unsigned long i; - struct kvm_memory_slot *memslot; - struct kvm_memory_slot old, new; + int i; - r = -EINVAL; - /* General sanity checks */ - if (mem->memory_size & (PAGE_SIZE - 1)) - goto out; - if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + vcpu_load(vcpu); + + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + + vcpu_put(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. 
+ */ +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) goto out; - if (mem->slot >= KVM_MEMORY_SLOTS) + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) goto out; - if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = vmalloc(size); + if (!entries) goto out; - memslot = &kvm->memslots[mem->slot]; - base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; - npages = mem->memory_size >> PAGE_SHIFT; + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; - if (!npages) - mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + r = n = __msr_io(vcpu, &msrs, entries, do_msr); + if (r < 0) + goto out_free; - mutex_lock(&kvm->lock); + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; - new = old = *memslot; + r = n; - new.base_gfn = base_gfn; - new.npages = npages; - new.flags = mem->flags; +out_free: + vfree(entries); +out: + return r; +} - /* Disallow changing a memory slot's size. */ - r = -EINVAL; - if (npages && old.npages && npages != old.npages) - goto out_unlock; +/* + * Make sure that a cpu that is being hot-unplugged does not have any vcpus + * cached on it. + */ +void decache_vcpus_on_cpu(int cpu) +{ + struct kvm *vm; + struct kvm_vcpu *vcpu; + int i; - /* Check for overlaps */ - r = -EEXIST; - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *s = &kvm->memslots[i]; + spin_lock(&kvm_lock); + list_for_each_entry(vm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = vm->vcpus[i]; + if (!vcpu) + continue; + /* + * If the vcpu is locked, then it is running on some + * other cpu and therefore it is not cached on the + * cpu in question. + * + * If it's not locked, check the last cpu it executed + * on. 
+ */ + if (mutex_trylock(&vcpu->mutex)) { + if (vcpu->cpu == cpu) { + kvm_x86_ops->vcpu_decache(vcpu); + vcpu->cpu = -1; + } + mutex_unlock(&vcpu->mutex); + } + } + spin_unlock(&kvm_lock); +} - if (s == memslot) - continue; - if (!((base_gfn + npages <= s->base_gfn) || - (base_gfn >= s->base_gfn + s->npages))) - goto out_unlock; +int kvm_dev_ioctl_check_extension(long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_HLT: + case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: + case KVM_CAP_USER_MEMORY: + case KVM_CAP_SET_TSS_ADDR: + case KVM_CAP_EXT_CPUID: + r = 1; + break; + case KVM_CAP_VAPIC: + r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + break; + default: + r = 0; + break; } + return r; - /* Deallocate if slot is being removed */ - if (!npages) - new.phys_mem = NULL; +} - /* Free page dirty bitmap if unneeded */ - if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) - new.dirty_bitmap = NULL; +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r; - r = -ENOMEM; + switch (ioctl) { + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = argp; + struct kvm_msr_list msr_list; + unsigned n; - /* Allocate if a slot is being created */ - if (npages && !new.phys_mem) { - new.phys_mem = vmalloc(npages * sizeof(struct page *)); + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < num_msrs_to_save) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + goto out; + if (copy_to_user(user_msr_list->indices + + num_msrs_to_save * sizeof(u32), + &emulated_msrs, + ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + goto out; + r = 0; + break; + } + default: + r = -EINVAL; + } +out: + return r; +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + kvm_x86_ops->vcpu_load(vcpu, cpu); +} - if (!new.phys_mem) - goto out_unlock; +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); +} - memset(new.phys_mem, 0, npages * sizeof(struct page *)); - for (i = 0; i < npages; ++i) { - new.phys_mem[i] = alloc_page(GFP_HIGHUSER - | __GFP_ZERO); - if (!new.phys_mem[i]) - goto out_unlock; - set_page_private(new.phys_mem[i],0); - } - } +static int is_efer_nx(void) +{ + u64 efer; + + rdmsrl(MSR_EFER, efer); + return efer & EFER_NX; +} - /* Allocate page dirty bitmap if needed */ - if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_cpuid_entry2 *e, *entry; - new.dirty_bitmap = vmalloc(dirty_bytes); - if (!new.dirty_bitmap) - goto out_unlock; - memset(new.dirty_bitmap, 0, dirty_bytes); + entry = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } } + if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} - if (mem->slot >= kvm->nmemslots) - kvm->nmemslots = mem->slot + 1; +/* when an old userspace process fills a new kernel module */ +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct 
kvm_cpuid_entry __user *entries) +{ + int r, i; + struct kvm_cpuid_entry *cpuid_entries; - *memslot = new; + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); + if (!cpuid_entries) + goto out; + r = -EFAULT; + if (copy_from_user(cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out_free; + for (i = 0; i < cpuid->nent; i++) { + vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; + vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; + vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; + vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; + vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; + vcpu->arch.cpuid_entries[i].index = 0; + vcpu->arch.cpuid_entries[i].flags = 0; + vcpu->arch.cpuid_entries[i].padding[0] = 0; + vcpu->arch.cpuid_entries[i].padding[1] = 0; + vcpu->arch.cpuid_entries[i].padding[2] = 0; + } + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + r = 0; - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - kvm_flush_remote_tlbs(kvm); +out_free: + vfree(cpuid_entries); +out: + return r; +} - mutex_unlock(&kvm->lock); +static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; - kvm_free_physmem_slot(&old, &new); + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->arch.cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + vcpu->arch.cpuid_nent = cpuid->nent; return 0; -out_unlock: - mutex_unlock(&kvm->lock); - kvm_free_physmem_slot(&new, &old); out: return r; } -/* - * Get (and clear) the dirty memory log for a memory slot. 
- */ -static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { - struct kvm_memory_slot *memslot; - int r, i; - int n; - unsigned long any = 0; - - mutex_lock(&kvm->lock); + int r; - r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) + r = -E2BIG; + if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; - - memslot = &kvm->memslots[log->slot]; - r = -ENOENT; - if (!memslot->dirty_bitmap) + r = -EFAULT; + if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; + return 0; + +out: + cpuid->nent = vcpu->arch.cpuid_nent; + return r; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + +static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) +{ + const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | + bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | + bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | + bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | + bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | + bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | + bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | + bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | + bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | + bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); + const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | + bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | + bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | + bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | + bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | + bit(X86_FEATURE_PGE) | + bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | + bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | + bit(X86_FEATURE_SYSCALL) | + (bit(X86_FEATURE_NX) && is_efer_nx()) | +#ifdef CONFIG_X86_64 + bit(X86_FEATURE_LM) | +#endif + bit(X86_FEATURE_MMXEXT) | + bit(X86_FEATURE_3DNOWEXT) | + bit(X86_FEATURE_3DNOW); + const u32 kvm_supported_word3_x86_features = + bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); + const u32 kvm_supported_word6_x86_features = + bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); + + /* all func 2 cpuid_count() should be called on the same cpu */ + get_cpu(); + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (u32)0xb); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + entry->ecx &= kvm_supported_word3_x86_features; + break; + /* function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 and 0xb have additional index. 
*/ + case 4: { + int index, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (index = 1; *nent < maxnent; ++index) { + cache_type = entry[index - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[index], function, index); + entry[index].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xb: { + int index, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (index = 1; *nent < maxnent; ++index) { + level_type = entry[index - 1].ecx & 0xff; + if (!level_type) + break; + do_cpuid_1_ent(&entry[index], function, index); + entry[index].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + entry->ecx &= kvm_supported_word6_x86_features; + break; + } + put_cpu(); +} - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; +static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = -E2BIG; + u32 func; - for (i = 0; !any && i < n/sizeof(long); ++i) - any = memslot->dirty_bitmap[i]; + if (cpuid->nent < 1) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + if (!cpuid_entries) + goto out; + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) - goto out; + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + goto out_free; + cpuid->nent = nent; + r = 0; - /* If nothing is dirty, don't bother messing with page tables. 
*/ - if (any) { - kvm_mmu_slot_remove_write_access(kvm, log->slot); - kvm_flush_remote_tlbs(kvm); - memset(memslot->dirty_bitmap, 0, n); +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + kvm_apic_post_state_restore(vcpu); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; + vcpu_load(vcpu); + + set_bit(irq->irq, vcpu->arch.irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, + struct kvm_tpr_access_ctl *tac) +{ + if (tac->flags) + return -EINVAL; + vcpu->arch.tpr_access_reporting = !!tac->enabled; + return 0; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + + switch (ioctl) { + case KVM_GET_LAPIC: { + struct kvm_lapic_state lapic; + + memset(&lapic, 0, sizeof lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &lapic, sizeof lapic)) + goto out; + r = 0; + break; } + case KVM_SET_LAPIC: { + struct kvm_lapic_state lapic; - r = 0; + r = -EFAULT; + if (copy_from_user(&lapic, argp, sizeof lapic)) + goto out; + r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + if (r) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof irq)) + goto out; + r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_CPUID2: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_GET_CPUID2: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; + } + case KVM_GET_MSRS: + r = msr_io(vcpu, argp, kvm_get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(vcpu, argp, do_set_msr, 0); + break; + case KVM_TPR_ACCESS_REPORTING: { + struct kvm_tpr_access_ctl tac; + + r = -EFAULT; + if (copy_from_user(&tac, argp, sizeof tac)) + goto out; + r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tac, sizeof tac)) + goto out; + r = 0; + break; + }; + case KVM_SET_VAPIC_ADDR: { + struct 
kvm_vapic_addr va; + + r = -EINVAL; + if (!irqchip_in_kernel(vcpu->kvm)) + goto out; + r = -EFAULT; + if (copy_from_user(&va, argp, sizeof va)) + goto out; + r = 0; + kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + break; + } + default: + r = -EINVAL; + } out: return r; } +static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +{ + int ret; + + if (addr > (unsigned int)(-3 * PAGE_SIZE)) + return -1; + ret = kvm_x86_ops->set_tss_addr(kvm, addr); + return ret; +} + +static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, + u32 kvm_nr_mmu_pages) +{ + if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) + return -EINVAL; + + down_write(&current->mm->mmap_sem); + + kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); + kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; + + up_write(&current->mm->mmap_sem); + return 0; +} + +static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +{ + return kvm->arch.n_alloc_mmu_pages; +} + +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + + for (i = 0; i < kvm->arch.naliases; ++i) { + alias = &kvm->arch.aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + /* * Set a new alias region. Aliases map a portion of physical memory into * another portion. This is useful for memory windows, for example the PC @@ -832,21 +1261,21 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - mutex_lock(&kvm->lock); + down_write(&current->mm->mmap_sem); - p = &kvm->aliases[alias->slot]; + p = &kvm->arch.aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->aliases[n - 1].npages) + if (kvm->arch.aliases[n - 1].npages) break; - kvm->naliases = n; + kvm->arch.naliases = n; kvm_mmu_zap_all(kvm); - mutex_unlock(&kvm->lock); + up_write(&current->mm->mmap_sem); return 0; @@ -861,17 +1290,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - memcpy (&chip->chip.pic, + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: - memcpy (&chip->chip.pic, + memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - memcpy (&chip->chip.ioapic, + memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm), sizeof(struct kvm_ioapic_state)); break; @@ -889,17 +1318,17 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - memcpy (&pic_irqchip(kvm)->pics[0], + memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: - memcpy (&pic_irqchip(kvm)->pics[1], + memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - memcpy (ioapic_irqchip(kvm), + memcpy(ioapic_irqchip(kvm), &chip->chip.ioapic, sizeof(struct kvm_ioapic_state)); break; @@ -911,110 +1340,191 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) return r; } -static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +/* + * Get (and clear) the dirty memory log for a memory slot. 
+ */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) { - int i; - struct kvm_mem_alias *alias; - - for (i = 0; i < kvm->naliases; ++i) { - alias = &kvm->aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} + int r; + int n; + struct kvm_memory_slot *memslot; + int is_dirty = 0; -static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) -{ - int i; + down_write(&current->mm->mmap_sem); - for (i = 0; i < kvm->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) - return memslot; + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + kvm_mmu_slot_remove_write_access(kvm, log->slot); + kvm_flush_remote_tlbs(kvm); + memslot = &kvm->memslots[log->slot]; + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + memset(memslot->dirty_bitmap, 0, n); } - return NULL; -} - -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) -{ - gfn = unalias_gfn(kvm, gfn); - return __gfn_to_memslot(kvm, gfn); -} - -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - - gfn = unalias_gfn(kvm, gfn); - slot = __gfn_to_memslot(kvm, gfn); - if (!slot) - return NULL; - return slot->phys_mem[gfn - slot->base_gfn]; + r = 0; +out: + up_write(&current->mm->mmap_sem); + return r; } -EXPORT_SYMBOL_GPL(gfn_to_page); -/* WARNING: Does not work on aliased pages. */ -void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) { - struct kvm_memory_slot *memslot; + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r = -EINVAL; - memslot = __gfn_to_memslot(kvm, gfn); - if (memslot && memslot->dirty_bitmap) { - unsigned long rel_gfn = gfn - memslot->base_gfn; + switch (ioctl) { + case KVM_SET_TSS_ADDR: + r = kvm_vm_ioctl_set_tss_addr(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + struct kvm_userspace_memory_region kvm_userspace_mem; - /* avoid RMW */ - if (!test_bit(rel_gfn, memslot->dirty_bitmap)) - set_bit(rel_gfn, memslot->dirty_bitmap); + r = -EFAULT; + if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) + goto out; + kvm_userspace_mem.slot = kvm_mem.slot; + kvm_userspace_mem.flags = kvm_mem.flags; + kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; + kvm_userspace_mem.memory_size = kvm_mem.memory_size; + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + break; } + case KVM_SET_NR_MMU_PAGES: + r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); + if (r) + goto out; + break; + case KVM_GET_NR_MMU_PAGES: + r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); + break; + case KVM_SET_MEMORY_ALIAS: { + struct kvm_memory_alias alias; -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) -{ - void *data = val; + r = -EFAULT; + if (copy_from_user(&alias, argp, sizeof alias)) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + if (r) + goto out; + break; + } + case KVM_CREATE_IRQCHIP: + r = -ENOMEM; + kvm->arch.vpic = kvm_create_pic(kvm); + if (kvm->arch.vpic) { + r = kvm_ioapic_init(kvm); + if (r) { + kfree(kvm->arch.vpic); + kvm->arch.vpic = NULL; + goto out; + } + } else + goto out; + break; + 
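kvm_vm_ioctl_get_dirty_log() above snapshots a slot's dirty bitmap for userspace and only then clears it, with the bitmap sized up to a whole number of longs (ALIGN(npages, BITS_PER_LONG) / 8 bytes). A standalone sketch of that get-and-clear protocol (sizes and names here are illustrative, not the kernel's):

#include <limits.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(long))
#define ALIGN(x, a)   (((x) + (a) - 1) / (a) * (a))

static unsigned long npages = 70;	/* hypothetical slot size in pages */
static unsigned char bitmap[ALIGN(70, BITS_PER_LONG) / CHAR_BIT];

/* One bit per guest page; set when the page is written. */
static void mark_dirty(unsigned long gfn)
{
	bitmap[gfn / CHAR_BIT] |= 1u << (gfn % CHAR_BIT);
}

static void get_dirty_log(unsigned char *user_copy)
{
	size_t n = ALIGN(npages, BITS_PER_LONG) / CHAR_BIT;

	memcpy(user_copy, bitmap, n);	/* copy_to_user() in the kernel */
	memset(bitmap, 0, n);		/* ...then start a fresh dirty epoch */
}

int main(void)
{
	unsigned char snapshot[sizeof(bitmap)];

	mark_dirty(5);
	mark_dirty(69);
	get_dirty_log(snapshot);
	printf("page 5 dirty: %d, bitmap now clear: %d\n",
	       !!(snapshot[0] & (1 << 5)), bitmap[0] == 0);
	return 0;
}

Clearing only after the copy succeeds is what makes the log lossless: a write that races with the copy lands in the next epoch instead of disappearing.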
case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; - while (bytes) { - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); - unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); - unsigned long pfn; - struct page *page; - void *page_virt; + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; + if (irqchip_in_kernel(kvm)) { + mutex_lock(&kvm->lock); + if (irq_event.irq < 16) + kvm_pic_set_irq(pic_irqchip(kvm), + irq_event.irq, + irq_event.level); + kvm_ioapic_set_irq(kvm->arch.vioapic, + irq_event.irq, + irq_event.level); + mutex_unlock(&kvm->lock); + r = 0; + } + break; + } + case KVM_GET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - pfn = gpa >> PAGE_SHIFT; - page = gfn_to_page(vcpu->kvm, pfn); - if (!page) - return X86EMUL_UNHANDLEABLE; - page_virt = kmap_atomic(page, KM_USER0); + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &chip, sizeof chip)) + goto out; + r = 0; + break; + } + case KVM_SET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; - memcpy(data, page_virt + offset, tocopy); + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_SUPPORTED_CPUID: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; - kunmap_atomic(page_virt, KM_USER0); + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, + cpuid_arg->entries); + if (r) + goto out; - bytes -= tocopy; - data += tocopy; - addr += tocopy; + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; } - - return X86EMUL_CONTINUE; + default: + ; + } +out: + return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); -static int emulator_write_std(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +static void kvm_init_msr_list(void) { - pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); - return X86EMUL_UNHANDLEABLE; + u32 dummy[2]; + unsigned i, j; + + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; + } + num_msrs_to_save = j; } /* @@ -1025,14 +1535,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, { struct kvm_io_device *dev; - if (vcpu->apic) { - dev = &vcpu->apic->dev; + if (vcpu->arch.apic) { + dev = &vcpu->arch.apic->dev; if (dev->in_range(dev, addr)) return dev; } return NULL; } + static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, gpa_t addr) { @@ -1044,11 +1555,40 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); + void *data = val; + int r = X86EMUL_CONTINUE; + + down_read(&current->mm->mmap_sem); + while (bytes) { + 
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= tocopy; + data += tocopy; + addr += tocopy; + } +out: + up_read(&current->mm->mmap_sem); + return r; } +EXPORT_SYMBOL_GPL(emulator_read_std); static int emulator_read_emulated(unsigned long addr, void *val, @@ -1062,22 +1602,34 @@ static int emulator_read_emulated(unsigned long addr, memcpy(val, vcpu->mmio_data, bytes); vcpu->mmio_read_completed = 0; return X86EMUL_CONTINUE; - } else if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; + } + + down_read(&current->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(&current->mm->mmap_sem); + + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; - gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (emulator_read_std(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; +mmio: /* * Is this MMIO handled locally? */ + mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); if (mmio_dev) { kvm_iodevice_read(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -1090,19 +1642,16 @@ static int emulator_read_emulated(unsigned long addr, static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes) { - struct page *page; - void *virt; + int ret; - if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) + down_read(&current->mm->mmap_sem); + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + if (ret < 0) { + up_read(&current->mm->mmap_sem); return 0; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!page) - return 0; - mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); - virt = kmap_atomic(page, KM_USER0); + } kvm_mmu_pte_write(vcpu, gpa, val, bytes); - memcpy(virt + offset_in_page(gpa), val, bytes); - kunmap_atomic(virt, KM_USER0); + up_read(&current->mm->mmap_sem); return 1; } @@ -1112,24 +1661,36 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { struct kvm_io_device *mmio_dev; - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa; + + down_read(&current->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(&current->mm->mmap_sem); if (gpa == UNMAPPED_GVA) { - kvm_x86_ops->inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, 2); return X86EMUL_PROPAGATE_FAULT; } + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + if (emulator_write_phys(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; +mmio: /* * Is this MMIO handled locally? 
*/ + mutex_lock(&vcpu->kvm->lock); mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); if (mmio_dev) { kvm_iodevice_write(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); return X86EMUL_CONTINUE; } + mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -1173,6 +1734,35 @@ static int emulator_cmpxchg_emulated(unsigned long addr, reported = 1; printk(KERN_WARNING "kvm: emulating exchange as write\n"); } +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + struct page *page; + char *addr; + u64 val; + + down_read(&current->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + goto emul_write; + + val = *(u64 *)new; + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + addr = kmap_atomic(page, KM_USER0); + set_64bit((u64 *)(addr + offset_in_page(gpa)), val); + kunmap_atomic(addr, KM_USER0); + kvm_release_page_dirty(page); + emul_write: + up_read(&current->mm->mmap_sem); + } +#endif + return emulator_write_emulated(addr, new, bytes, vcpu); } @@ -1188,11 +1778,11 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); return X86EMUL_CONTINUE; } -int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { struct kvm_vcpu *vcpu = ctxt->vcpu; @@ -1223,7 +1813,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { static int reported; u8 opcodes[4]; - unsigned long rip = vcpu->rip; + unsigned long rip = vcpu->arch.rip; unsigned long rip_linear; rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); @@ -1241,7 +1831,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); struct x86_emulate_ops emulate_ops = { .read_std = emulator_read_std, - .write_std = emulator_write_std, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -1250,44 +1839,74 @@ struct x86_emulate_ops emulate_ops = { int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, - u16 error_code) + u16 error_code, + int emulation_type) { - struct x86_emulate_ctxt emulate_ctxt; int r; - int cs_db, cs_l; + struct decode_cache *c; - vcpu->mmio_fault_cr2 = cr2; + vcpu->arch.mmio_fault_cr2 = cr2; kvm_x86_ops->cache_regs(vcpu); - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - emulate_ctxt.vcpu = vcpu; - emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - emulate_ctxt.cr2 = cr2; - emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - - if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { - emulate_ctxt.cs_base = 0; - emulate_ctxt.ds_base = 0; - emulate_ctxt.es_base = 0; - emulate_ctxt.ss_base = 0; - } else { - emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); - emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); - emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); - emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { + vcpu->arch.emulate_ctxt.cs_base = 0; + vcpu->arch.emulate_ctxt.ds_base = 0; + vcpu->arch.emulate_ctxt.es_base = 0; + vcpu->arch.emulate_ctxt.ss_base = 0; + } else { + vcpu->arch.emulate_ctxt.cs_base = + get_segment_base(vcpu, VCPU_SREG_CS); + vcpu->arch.emulate_ctxt.ds_base = + get_segment_base(vcpu, VCPU_SREG_DS); + vcpu->arch.emulate_ctxt.es_base = + get_segment_base(vcpu, VCPU_SREG_ES); + vcpu->arch.emulate_ctxt.ss_base = + get_segment_base(vcpu, VCPU_SREG_SS); + } + + vcpu->arch.emulate_ctxt.gs_base = + get_segment_base(vcpu, VCPU_SREG_GS); + vcpu->arch.emulate_ctxt.fs_base = + get_segment_base(vcpu, VCPU_SREG_FS); + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + /* Reject instructions other than VMCALL/VMMCALL when + * trying to emulate an invalid opcode */ + c = &vcpu->arch.emulate_ctxt.decode; + if ((emulation_type & EMULTYPE_TRAP_UD) && + (!(c->twobyte && c->b == 0x01 && + (c->modrm_reg == 0 || c->modrm_reg == 3) && + c->modrm_mod == 3 && c->modrm_rm == 1))) + return EMULATE_FAIL; + + ++vcpu->stat.insn_emulation; + if (r) { + ++vcpu->stat.insn_emulation_fail; + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + return EMULATE_FAIL; + } } - emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); - emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - vcpu->mmio_is_write = 0; - vcpu->pio.string = 0; - r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); - if (vcpu->pio.string) + if (vcpu->arch.pio.string) return EMULATE_DO_MMIO; if ((r || vcpu->mmio_is_write) && run) { @@ -1309,7 +1928,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (vcpu->mmio_is_write) { vcpu->mmio_needed = 0; @@ -1320,439 +1939,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -/* - * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
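The chained ternary that picks the emulation mode is easier to audit unrolled; its precedence is: the VM flag in EFLAGS forces real-mode semantics, then a long-mode code segment wins, then CS.DB selects 32- versus 16-bit protected mode. A flat restatement (the enum mirrors, but is not, the X86EMUL_MODE_* constants):

enum emul_mode { MODE_REAL, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

static enum emul_mode pick_emul_mode(int eflags_vm, int cs_l, int cs_db)
{
	if (eflags_vm)			/* virtual-8086 mode */
		return MODE_REAL;
	if (cs_l)			/* 64-bit code segment */
		return MODE_PROT64;
	return cs_db ? MODE_PROT32 : MODE_PROT16;
}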
- */ -static void kvm_vcpu_block(struct kvm_vcpu *vcpu) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&vcpu->wq, &wait); - - /* - * We will block until either an interrupt or a signal wakes us up - */ - while (!kvm_cpu_has_interrupt(vcpu) - && !signal_pending(current) - && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE - && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) { - set_current_state(TASK_INTERRUPTIBLE); - vcpu_put(vcpu); - schedule(); - vcpu_load(vcpu); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&vcpu->wq, &wait); -} - -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->mp_state = VCPU_MP_STATE_HALTED; - kvm_vcpu_block(vcpu); - if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE) - return -EINTR; - return 1; - } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; - return 0; - } -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - - kvm_x86_ops->cache_regs(vcpu); - ret = -KVM_EINVAL; -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - nr = vcpu->regs[VCPU_REGS_RAX]; - a0 = vcpu->regs[VCPU_REGS_RDI]; - a1 = vcpu->regs[VCPU_REGS_RSI]; - a2 = vcpu->regs[VCPU_REGS_RDX]; - a3 = vcpu->regs[VCPU_REGS_RCX]; - a4 = vcpu->regs[VCPU_REGS_R8]; - a5 = vcpu->regs[VCPU_REGS_R9]; - } else -#endif - { - nr = vcpu->regs[VCPU_REGS_RBX] & -1u; - a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; - a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; - a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; - a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; - a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; - a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; - } - switch (nr) { - default: - run->hypercall.nr = nr; - run->hypercall.args[0] = a0; - run->hypercall.args[1] = a1; - run->hypercall.args[2] = a2; - run->hypercall.args[3] = a3; - run->hypercall.args[4] = a4; - run->hypercall.args[5] = a5; - run->hypercall.ret = ret; - run->hypercall.longmode = is_long_mode(vcpu); - kvm_x86_ops->decache_regs(vcpu); - return 0; - } - vcpu->regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); - return 1; -} -EXPORT_SYMBOL_GPL(kvm_hypercall); - -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} - -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; - - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); -} - -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags) -{ - lmsw(vcpu, msw); - *rflags = kvm_x86_ops->get_rflags(vcpu); -} - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) -{ - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - switch (cr) { - case 0: - return vcpu->cr0; - case 2: - return vcpu->cr2; - case 3: - return vcpu->cr3; - case 4: - return vcpu->cr4; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); - return 0; - } -} - -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, - unsigned long *rflags) -{ - switch (cr) { - case 0: - set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); - *rflags = kvm_x86_ops->get_rflags(vcpu); - break; - case 2: - vcpu->cr2 = val; - break; - case 3: - set_cr3(vcpu, val); - break; - case 4: - set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); 
- } -} - -/* - * Register the para guest with the host: - */ -static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) -{ - struct kvm_vcpu_para_state *para_state; - hpa_t para_state_hpa, hypercall_hpa; - struct page *para_state_page; - unsigned char *hypercall; - gpa_t hypercall_gpa; - - printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); - printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); - - /* - * Needs to be page aligned: - */ - if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) - goto err_gp; - - para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); - printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); - if (is_error_hpa(para_state_hpa)) - goto err_gp; - - mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); - para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); - para_state = kmap(para_state_page); - - printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); - printk(KERN_DEBUG ".... size: %d\n", para_state->size); - - para_state->host_version = KVM_PARA_API_VERSION; - /* - * We cannot support guests that try to register themselves - * with a newer API version than the host supports: - */ - if (para_state->guest_version > KVM_PARA_API_VERSION) { - para_state->ret = -KVM_EINVAL; - goto err_kunmap_skip; - } - - hypercall_gpa = para_state->hypercall_gpa; - hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); - printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); - if (is_error_hpa(hypercall_hpa)) { - para_state->ret = -KVM_EINVAL; - goto err_kunmap_skip; - } - - printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); - vcpu->para_state_page = para_state_page; - vcpu->para_state_gpa = para_state_gpa; - vcpu->hypercall_gpa = hypercall_gpa; - - mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); - hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), - KM_USER1) + (hypercall_hpa & ~PAGE_MASK); - kvm_x86_ops->patch_hypercall(vcpu, hypercall); - kunmap_atomic(hypercall, KM_USER1); - - para_state->ret = 0; -err_kunmap_skip: - kunmap(para_state_page); - return 0; -err_gp: - return 1; -} - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; - - switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_UCODE_REV: - case MSR_IA32_PERF_STATUS: - case MSR_IA32_EBL_CR_POWERON: - /* MTRR registers */ - case 0xfe: - case 0x200 ... 0x2ff: - data = 0; - break; - case 0xcd: /* fsb frequency */ - data = 3; - break; - case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); - break; - case MSR_IA32_MISC_ENABLE: - data = vcpu->ia32_misc_enable_msr; - break; -#ifdef CONFIG_X86_64 - case MSR_EFER: - data = vcpu->shadow_efer; - break; -#endif - default: - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_get_msr_common); - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. 
- */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); -} - -#ifdef CONFIG_X86_64 - -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - if (efer & EFER_RESERVED_BITS) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); - inject_gp(vcpu); - return; - } - - if (is_paging(vcpu) - && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); - inject_gp(vcpu); - return; - } - - kvm_x86_ops->set_efer(vcpu, efer); - - efer &= ~EFER_LMA; - efer |= vcpu->shadow_efer & EFER_LMA; - - vcpu->shadow_efer = efer; -} - -#endif - -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { -#ifdef CONFIG_X86_64 - case MSR_EFER: - set_efer(vcpu, data); - break; -#endif - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case 0x200 ... 0x2ff: /* MTRRs */ - break; - case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; - case MSR_IA32_MISC_ENABLE: - vcpu->ia32_misc_enable_msr = data; - break; - /* - * This is the 'probe whether the host is KVM' logic: - */ - case MSR_KVM_API_MAGIC: - return vcpu_register_para(vcpu, data); - - default: - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_msr_common); - -/* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - return kvm_x86_ops->set_msr(vcpu, msr_index, data); -} - -void kvm_resched(struct kvm_vcpu *vcpu) -{ - if (!need_resched()) - return; - cond_resched(); -} -EXPORT_SYMBOL_GPL(kvm_resched); - -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +static void free_pio_guest_pages(struct kvm_vcpu *vcpu) { int i; - u32 function; - struct kvm_cpuid_entry *e, *best; - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->regs[VCPU_REGS_RAX]; - vcpu->regs[VCPU_REGS_RAX] = 0; - vcpu->regs[VCPU_REGS_RBX] = 0; - vcpu->regs[VCPU_REGS_RCX] = 0; - vcpu->regs[VCPU_REGS_RDX] = 0; - best = NULL; - for (i = 0; i < vcpu->cpuid_nent; ++i) { - e = &vcpu->cpuid_entries[i]; - if (e->function == function) { - best = e; - break; + for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) + if (vcpu->arch.pio.guest_pages[i]) { + kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); + vcpu->arch.pio.guest_pages[i] = NULL; } - /* - * Both basic or both extended? - */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; - } - if (best) { - vcpu->regs[VCPU_REGS_RAX] = best->eax; - vcpu->regs[VCPU_REGS_RBX] = best->ebx; - vcpu->regs[VCPU_REGS_RCX] = best->ecx; - vcpu->regs[VCPU_REGS_RDX] = best->edx; - } - kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); static int pio_copy_data(struct kvm_vcpu *vcpu) { - void *p = vcpu->pio_data; + void *p = vcpu->arch.pio_data; void *q; unsigned bytes; - int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; + int nr_pages = vcpu->arch.pio.guest_pages[1] ? 
2 : 1; - q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, + q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, PAGE_KERNEL); if (!q) { free_pio_guest_pages(vcpu); return -ENOMEM; } - q += vcpu->pio.guest_page_offset; - bytes = vcpu->pio.size * vcpu->pio.cur_count; - if (vcpu->pio.in) + q += vcpu->arch.pio.guest_page_offset; + bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; + if (vcpu->arch.pio.in) memcpy(q, p, bytes); else memcpy(p, q, bytes); - q -= vcpu->pio.guest_page_offset; + q -= vcpu->arch.pio.guest_page_offset; vunmap(q); free_pio_guest_pages(vcpu); return 0; } -static int complete_pio(struct kvm_vcpu *vcpu) +int complete_pio(struct kvm_vcpu *vcpu) { - struct kvm_pio_request *io = &vcpu->pio; + struct kvm_pio_request *io = &vcpu->arch.pio; long delta; int r; @@ -1760,7 +1985,7 @@ static int complete_pio(struct kvm_vcpu *vcpu) if (!io->string) { if (io->in) - memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, + memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, io->size); } else { if (io->in) { @@ -1778,15 +2003,15 @@ static int complete_pio(struct kvm_vcpu *vcpu) * The size of the register should really depend on * current address size. */ - vcpu->regs[VCPU_REGS_RCX] -= delta; + vcpu->arch.regs[VCPU_REGS_RCX] -= delta; } if (io->down) delta = -delta; delta *= io->size; if (io->in) - vcpu->regs[VCPU_REGS_RDI] += delta; + vcpu->arch.regs[VCPU_REGS_RDI] += delta; else - vcpu->regs[VCPU_REGS_RSI] += delta; + vcpu->arch.regs[VCPU_REGS_RSI] += delta; } kvm_x86_ops->decache_regs(vcpu); @@ -1804,13 +2029,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev, /* TODO: String I/O for in kernel device */ mutex_lock(&vcpu->kvm->lock); - if (vcpu->pio.in) - kvm_iodevice_read(pio_dev, vcpu->pio.port, - vcpu->pio.size, + if (vcpu->arch.pio.in) + kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); else - kvm_iodevice_write(pio_dev, vcpu->pio.port, - vcpu->pio.size, + kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); mutex_unlock(&vcpu->kvm->lock); } @@ -1818,8 +2043,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev, static void pio_string_write(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) { - struct kvm_pio_request *io = &vcpu->pio; - void *pd = vcpu->pio_data; + struct kvm_pio_request *io = &vcpu->arch.pio; + void *pd = vcpu->arch.pio_data; int i; mutex_lock(&vcpu->kvm->lock); @@ -1832,32 +2057,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev, mutex_unlock(&vcpu->kvm->lock); } -int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); +} + +int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? 
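The register fixups in complete_pio above are easiest to check with the arithmetic written out: RCX drops by the number of elements moved (for REP forms), and RDI (INS) or RSI (OUTS) advances by that count times the element size, negated when the direction flag walks memory downward. A standalone rendering of that bookkeeping (struct and names are illustrative):

struct pio_regs { long rcx, rsi, rdi; };

/* Post-transfer register deltas for a string PIO op, following the
 * order of operations in complete_pio(): count, sign, scale, apply. */
static void string_pio_fixup(struct pio_regs *r, long moved, int size,
			     int in, int down)
{
	long delta = moved;

	r->rcx -= delta;		/* REP iteration count consumed */
	if (down)
		delta = -delta;		/* EFLAGS.DF set: descending copy */
	delta *= size;
	if (in)
		r->rdi += delta;	/* INS writes through ES:RDI */
	else
		r->rsi += delta;	/* OUTS reads through DS:RSI */
}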
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->pio.size = size; + vcpu->run->io.size = vcpu->arch.pio.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; - vcpu->run->io.port = vcpu->pio.port = port; - vcpu->pio.in = in; - vcpu->pio.string = 0; - vcpu->pio.down = 0; - vcpu->pio.guest_page_offset = 0; - vcpu->pio.rep = 0; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 0; + vcpu->arch.pio.down = 0; + vcpu->arch.pio.guest_page_offset = 0; + vcpu->arch.pio.rep = 0; kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); + memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); pio_dev = vcpu_find_pio_dev(vcpu, port); if (pio_dev) { - kernel_pio(pio_dev, vcpu, vcpu->pio_data); + kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); complete_pio(vcpu); return 1; } @@ -1877,15 +2108,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->pio.size = size; + vcpu->run->io.size = vcpu->arch.pio.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; - vcpu->run->io.port = vcpu->pio.port = port; - vcpu->pio.in = in; - vcpu->pio.string = 1; - vcpu->pio.down = down; - vcpu->pio.guest_page_offset = offset_in_page(address); - vcpu->pio.rep = rep; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 1; + vcpu->arch.pio.down = down; + vcpu->arch.pio.guest_page_offset = offset_in_page(address); + vcpu->arch.pio.rep = rep; if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -1911,37 +2142,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, * String I/O in reverse. Yuck. Kill the guest, fix later. 
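The fields filled in above (direction, size, port, count, data_offset) are exactly what a launcher sees when KVM_RUN returns KVM_EXIT_IO, with the data bytes living inside the mmap'd kvm_run page at data_offset. A minimal userspace consumer, error handling elided:

#include <linux/kvm.h>
#include <stdio.h>

/* Service one port-I/O exit; 'run' is the mmap'd kvm_run structure. */
static void handle_io_exit(struct kvm_run *run)
{
	unsigned char *data = (unsigned char *)run + run->io.data_offset;

	if (run->io.direction == KVM_EXIT_IO_OUT)
		printf("out: port 0x%x, %u byte(s), count %u, first 0x%02x\n",
		       (unsigned)run->io.port, (unsigned)run->io.size,
		       (unsigned)run->io.count, (unsigned)data[0]);
	else			/* KVM_EXIT_IO_IN: fill 'data' before KVM_RUN */
		data[0] = 0;
}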
*/ pr_unimpl(vcpu, "guest string pio down\n"); - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); return 1; } vcpu->run->io.count = now; - vcpu->pio.cur_count = now; + vcpu->arch.pio.cur_count = now; - if (vcpu->pio.cur_count == vcpu->pio.count) + if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); for (i = 0; i < nr_pages; ++i) { - mutex_lock(&vcpu->kvm->lock); + down_read(¤t->mm->mmap_sem); page = gva_to_page(vcpu, address + i * PAGE_SIZE); - if (page) - get_page(page); - vcpu->pio.guest_pages[i] = page; - mutex_unlock(&vcpu->kvm->lock); + vcpu->arch.pio.guest_pages[i] = page; + up_read(¤t->mm->mmap_sem); if (!page) { - inject_gp(vcpu); + kvm_inject_gp(vcpu, 0); free_pio_guest_pages(vcpu); return 1; } } pio_dev = vcpu_find_pio_dev(vcpu, port); - if (!vcpu->pio.in) { + if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); if (ret >= 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); - if (vcpu->pio.count == 0) + if (vcpu->arch.pio.count == 0) ret = 1; } } else if (pio_dev) @@ -1953,6 +2182,263 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +int kvm_arch_init(void *opaque) +{ + int r; + struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + + if (kvm_x86_ops) { + printk(KERN_ERR "kvm: already loaded the other module\n"); + r = -EEXIST; + goto out; + } + + if (!ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no hardware support\n"); + r = -EOPNOTSUPP; + goto out; + } + if (ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); + r = -EOPNOTSUPP; + goto out; + } + + r = kvm_mmu_module_init(); + if (r) + goto out; + + kvm_init_msr_list(); + + kvm_x86_ops = ops; + kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + return 0; + +out: + return r; +} + +void kvm_arch_exit(void) +{ + kvm_x86_ops = NULL; + kvm_mmu_module_exit(); +} + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.halt_exits; + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; + kvm_vcpu_block(vcpu); + if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) + return -EINTR; + return 1; + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return 0; + } +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + + kvm_x86_ops->cache_regs(vcpu); + + nr = vcpu->arch.regs[VCPU_REGS_RAX]; + a0 = vcpu->arch.regs[VCPU_REGS_RBX]; + a1 = vcpu->arch.regs[VCPU_REGS_RCX]; + a2 = vcpu->arch.regs[VCPU_REGS_RDX]; + a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + default: + ret = -KVM_ENOSYS; + break; + } + vcpu->arch.regs[VCPU_REGS_RAX] = ret; + kvm_x86_ops->decache_regs(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); + +int kvm_fix_hypercall(struct kvm_vcpu *vcpu) +{ + char instruction[3]; + int ret = 0; + + + /* + * Blow out the MMU to ensure that no other VCPU has an active mapping + * to ensure that the updated hypercall appears atomically across all + * VCPUs. 
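kvm_emulate_hypercall above fixes the guest-visible ABI: the call number in RAX, up to four arguments in RBX, RCX, RDX and RSI (truncated to 32 bits outside long mode), and the result back in RAX. A one-argument guest-side stub in GCC inline assembly (vmcall shown; AMD's vmmcall is what the hypercall-patching path elsewhere in this diff substitutes):

/* Issue a KVM hypercall from guest code; Intel vmcall encoding. */
static unsigned long kvm_hypercall1_sketch(unsigned long nr,
					   unsigned long a0)
{
	unsigned long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(a0)
		     : "memory");
	return ret;
}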
+ */ + kvm_mmu_zap_all(vcpu->kvm); + + kvm_x86_ops->cache_regs(vcpu); + kvm_x86_ops->patch_hypercall(vcpu, instruction); + if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) + != X86EMUL_CONTINUE) + ret = -EFAULT; + + return ret; +} + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_x86_ops->set_gdt(vcpu, &dt); +} + +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_x86_ops->set_idt(vcpu, &dt); +} + +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) +{ + lmsw(vcpu, msw); + *rflags = kvm_x86_ops->get_rflags(vcpu); +} + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + switch (cr) { + case 0: + return vcpu->arch.cr0; + case 2: + return vcpu->arch.cr2; + case 3: + return vcpu->arch.cr3; + case 4: + return vcpu->arch.cr4; + case 8: + return get_cr8(vcpu); + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + return 0; + } +} + +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + *rflags = kvm_x86_ops->get_rflags(vcpu); + break; + case 2: + vcpu->arch.cr2 = val; + break; + case 3: + set_cr3(vcpu, val); + break; + case 4: + set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + break; + case 8: + set_cr8(vcpu, val & 0xfUL); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + } +} + +static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = (i + 1) % nent; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return j; + } + } + return 0; /* silence gcc, even though control never reaches here */ +} + +/* find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) */ +static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + u32 function, u32 index) +{ + if (e->function != function) + return 0; + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return 0; + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return 0; + return 1; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + int i; + u32 function, index; + struct kvm_cpuid_entry2 *e, *best; + + kvm_x86_ops->cache_regs(vcpu); + function = vcpu->arch.regs[VCPU_REGS_RAX]; + index = vcpu->arch.regs[VCPU_REGS_RCX]; + vcpu->arch.regs[VCPU_REGS_RAX] = 0; + vcpu->arch.regs[VCPU_REGS_RBX] = 0; + vcpu->arch.regs[VCPU_REGS_RCX] = 0; + vcpu->arch.regs[VCPU_REGS_RDX] = 0; + best = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; + } + /* + * Both basic or both extended? 
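For context on the READ_NEXT machinery just added: leaves such as CPUID 2 return successive chunks of data on repeated execution on real hardware, so entries sharing one function number form a ring and each read passes the baton to the next. The rotation over a plain array (hypothetical entry type and flag value, not the kvm_cpuid_entry2 ABI):

#define STATE_READ_NEXT 0x4u		/* illustrative flag bit */

struct cpuid_ent { unsigned function, flags; };

/* Move the READ_NEXT mark to the next entry with the same function,
 * wrapping; when no other entry exists, j comes back around to i and
 * entry i reselects itself, so the loop always returns from inside. */
static int next_stateful(struct cpuid_ent *e, int nent, int i)
{
	int j;

	e[i].flags &= ~STATE_READ_NEXT;
	for (j = (i + 1) % nent; ; j = (j + 1) % nent)
		if (e[j].function == e[i].function) {
			e[j].flags |= STATE_READ_NEXT;
			return j;
		}
}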
+ */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + if (best) { + vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; + vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; + vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; + vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; + } + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); + /* * Check if userspace requested an interrupt window, and that the * interrupt window is open. @@ -1962,9 +2448,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - return (!vcpu->irq_summary && + return (!vcpu->arch.irq_summary && kvm_run->request_interrupt_window && - vcpu->interrupt_window_open && + vcpu->arch.interrupt_window_open && (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); } @@ -1978,22 +2464,51 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, kvm_run->ready_for_interrupt_injection = 1; else kvm_run->ready_for_interrupt_injection = - (vcpu->interrupt_window_open && - vcpu->irq_summary == 0); + (vcpu->arch.interrupt_window_open && + vcpu->arch.irq_summary == 0); +} + +static void vapic_enter(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct page *page; + + if (!apic || !apic->vapic_addr) + return; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + vcpu->arch.apic->vapic_page = page; + up_read(¤t->mm->mmap_sem); +} + +static void vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!apic || !apic->vapic_addr) + return; + + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); } static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { - printk("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->sipi_vector); + if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - kvm_x86_ops->vcpu_reset(vcpu); - vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + r = kvm_x86_ops->vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; } + vapic_enter(vcpu); + preempted: if (vcpu->guest_debug.enabled) kvm_x86_ops->guest_debug_pre(vcpu); @@ -2003,6 +2518,19 @@ again: if (unlikely(r)) goto out; + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + __kvm_migrate_apic_timer(vcpu); + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { + kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + } + + kvm_inject_pending_timer_irqs(vcpu); + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -2010,6 +2538,13 @@ again: local_irq_disable(); + if (need_resched()) { + local_irq_enable(); + preempt_enable(); + r = 1; + goto out; + } + if (signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -2019,16 +2554,20 @@ again: goto out; } - if (irqchip_in_kernel(vcpu->kvm)) + if (vcpu->arch.exception.pending) + __queue_exception(vcpu); + else if (irqchip_in_kernel(vcpu->kvm)) kvm_x86_ops->inject_pending_irq(vcpu); - else if (!vcpu->mmio_read_completed) + else kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); + kvm_lapic_sync_to_vapic(vcpu); + vcpu->guest_mode = 1; 
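dm_request_for_irq_injection above only bounces to userspace when four conditions line up at once; restated flatly (plain booleans standing in for the vcpu fields):

#include <stdbool.h>

/* Exit for userspace interrupt injection only when nothing is queued,
 * userspace asked for the window, the window is open, and IF is set. */
static bool want_irq_window_exit(bool irq_summary, bool userspace_asked,
				 bool window_open, bool rflags_if)
{
	return !irq_summary && userspace_asked && window_open && rflags_if;
}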
kvm_guest_enter(); if (vcpu->requests) - if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); kvm_x86_ops->run(vcpu, kvm_run); @@ -2055,9 +2594,14 @@ again: */ if (unlikely(prof_on == KVM_PROFILING)) { kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->rip); + profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); } + if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) + vcpu->arch.exception.pending = false; + + kvm_lapic_sync_from_vapic(vcpu); + r = kvm_x86_ops->handle_exit(kvm_run, vcpu); if (r > 0) { @@ -2067,10 +2611,8 @@ again: ++vcpu->stat.request_irq_exits; goto out; } - if (!need_resched()) { - ++vcpu->stat.light_exits; + if (!need_resched()) goto again; - } } out: @@ -2081,18 +2623,19 @@ out: post_kvm_run_save(vcpu, kvm_run); + vapic_exit(vcpu); + return r; } - -static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; sigset_t sigsaved; vcpu_load(vcpu); - if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { + if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); vcpu_put(vcpu); return -EAGAIN; @@ -2105,18 +2648,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) set_cr8(vcpu, kvm_run->cr8); - if (vcpu->pio.cur_count) { + if (vcpu->arch.pio.cur_count) { r = complete_pio(vcpu); if (r) goto out; } - +#if CONFIG_HAS_IOMEM if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; r = emulate_instruction(vcpu, kvm_run, - vcpu->mmio_fault_cr2, 0); + vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. 
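The CONFIG_HAS_IOMEM block above is the kernel half of an MMIO round trip: userspace gets KVM_EXIT_MMIO, services the access, and for a read deposits the bytes in run->mmio.data before the next KVM_RUN, at which point the saved instruction is re-emulated with EMULTYPE_NO_DECODE. The userspace half, schematically (device model elided):

#include <linux/kvm.h>
#include <string.h>

/* Complete one MMIO exit against a toy device model. */
static void handle_mmio_exit(struct kvm_run *run)
{
	if (run->mmio.is_write) {
		/* forward run->mmio.data[0..len) to the device at
		 * run->mmio.phys_addr */
	} else {
		/* read: the next KVM_RUN resumes the faulting insn
		 * with these bytes */
		memset(run->mmio.data, 0xff, run->mmio.len);
	}
}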
@@ -2125,10 +2669,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } } - +#endif if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { kvm_x86_ops->cache_regs(vcpu); - vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; kvm_x86_ops->decache_regs(vcpu); } @@ -2142,33 +2686,32 @@ out: return r; } -static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, - struct kvm_regs *regs) +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); kvm_x86_ops->cache_regs(vcpu); - regs->rax = vcpu->regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->regs[VCPU_REGS_RBP]; + regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; + regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; + regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; + regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; + regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; + regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; + regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; #ifdef CONFIG_X86_64 - regs->r8 = vcpu->regs[VCPU_REGS_R8]; - regs->r9 = vcpu->regs[VCPU_REGS_R9]; - regs->r10 = vcpu->regs[VCPU_REGS_R10]; - regs->r11 = vcpu->regs[VCPU_REGS_R11]; - regs->r12 = vcpu->regs[VCPU_REGS_R12]; - regs->r13 = vcpu->regs[VCPU_REGS_R13]; - regs->r14 = vcpu->regs[VCPU_REGS_R14]; - regs->r15 = vcpu->regs[VCPU_REGS_R15]; + regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; + regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; + regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; + regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; + regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; + regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; + regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; + regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; #endif - regs->rip = vcpu->rip; + regs->rip = vcpu->arch.rip; regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* @@ -2182,31 +2725,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, return 0; } -static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, - struct kvm_regs *regs) +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - vcpu->regs[VCPU_REGS_RAX] = regs->rax; - vcpu->regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->regs[VCPU_REGS_RBP] = regs->rbp; + vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; + vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; + vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; + vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; + vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; + vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; + vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; + vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; #ifdef CONFIG_X86_64 - vcpu->regs[VCPU_REGS_R8] = regs->r8; - vcpu->regs[VCPU_REGS_R9] = regs->r9; - vcpu->regs[VCPU_REGS_R10] = regs->r10; - vcpu->regs[VCPU_REGS_R11] = regs->r11; - vcpu->regs[VCPU_REGS_R12] = regs->r12; - vcpu->regs[VCPU_REGS_R13] = regs->r13; - vcpu->regs[VCPU_REGS_R14] = regs->r14; - vcpu->regs[VCPU_REGS_R15] = regs->r15; + vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; + vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; + 
vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; + vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; + vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; + vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; + vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; + vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; #endif - vcpu->rip = regs->rip; + vcpu->arch.rip = regs->rip; kvm_x86_ops->set_rflags(vcpu, regs->rflags); kvm_x86_ops->decache_regs(vcpu); @@ -2222,8 +2764,18 @@ static void get_segment(struct kvm_vcpu *vcpu, return kvm_x86_ops->get_segment(vcpu, var, seg); } -static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} +EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct descriptor_table dt; int pending_vec; @@ -2248,12 +2800,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.base = dt.base; kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->cr0; - sregs->cr2 = vcpu->cr2; - sregs->cr3 = vcpu->cr3; - sregs->cr4 = vcpu->cr4; + sregs->cr0 = vcpu->arch.cr0; + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = vcpu->arch.cr3; + sregs->cr4 = vcpu->arch.cr4; sregs->cr8 = get_cr8(vcpu); - sregs->efer = vcpu->shadow_efer; + sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) { @@ -2261,9 +2813,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sizeof sregs->interrupt_bitmap); pending_vec = kvm_x86_ops->get_irq(vcpu); if (pending_vec >= 0) - set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); + set_bit(pending_vec, + (unsigned long *)sregs->interrupt_bitmap); } else - memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, + memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, sizeof sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -2277,8 +2830,8 @@ static void set_segment(struct kvm_vcpu *vcpu, return kvm_x86_ops->set_segment(vcpu, var, seg); } -static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int mmu_reset_needed = 0; int i, pending_vec, max_bits; @@ -2293,13 +2846,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, dt.base = sregs->gdt.base; kvm_x86_ops->set_gdt(vcpu, &dt); - vcpu->cr2 = sregs->cr2; - mmu_reset_needed |= vcpu->cr3 != sregs->cr3; - vcpu->cr3 = sregs->cr3; + vcpu->arch.cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; + vcpu->arch.cr3 = sregs->cr3; set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; #ifdef CONFIG_X86_64 kvm_x86_ops->set_efer(vcpu, sregs->efer); #endif @@ -2307,25 +2860,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->decache_cr4_guest_bits(vcpu); - mmu_reset_needed |= vcpu->cr0 != sregs->cr0; - vcpu->cr0 = sregs->cr0; + mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + vcpu->arch.cr0 = sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); - mmu_reset_needed |= vcpu->cr4 != sregs->cr4; + mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) - load_pdptrs(vcpu, vcpu->cr3); + load_pdptrs(vcpu, vcpu->arch.cr3); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); if 
(!irqchip_in_kernel(vcpu->kvm)) { - memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->irq_pending); - vcpu->irq_summary = 0; - for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) - if (vcpu->irq_pending[i]) - __set_bit(i, &vcpu->irq_summary); + memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, + sizeof vcpu->arch.irq_pending); + vcpu->arch.irq_summary = 0; + for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) + if (vcpu->arch.irq_pending[i]) + __set_bit(i, &vcpu->arch.irq_summary); } else { max_bits = (sizeof sregs->interrupt_bitmap) << 3; pending_vec = find_first_bit( @@ -2334,7 +2887,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Only pending external irq is handled here */ if (pending_vec < max_bits) { kvm_x86_ops->set_irq(vcpu, pending_vec); - printk("Set back pending irq %d\n", pending_vec); + pr_debug("Set back pending irq %d\n", + pending_vec); } } @@ -2353,174 +2907,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - -/* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * This list is modified at module load time to reflect the - * capabilities of the host cpu. - */ -static u32 msrs_to_save[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TIME_STAMP_COUNTER, -}; - -static unsigned num_msrs_to_save; - -static u32 emulated_msrs[] = { - MSR_IA32_MISC_ENABLE, -}; - -static __init void kvm_init_msr_list(void) -{ - u32 dummy[2]; - unsigned i, j; - - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) - continue; - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; - } - num_msrs_to_save = j; -} - -/* - * Adapt set_msr() to msr_io()'s calling convention - */ -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - return kvm_set_msr(vcpu, index, *data); -} - -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. - */ -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, - struct kvm_msr_entry *entries, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data)) -{ - int i; - - vcpu_load(vcpu); - - for (i = 0; i < msrs->nmsrs; ++i) - if (do_msr(vcpu, entries[i].index, &entries[i].data)) - break; - - vcpu_put(vcpu); - - return i; -} - -/* - * Read or write a bunch of msrs. Parameters are user addresses. - * - * @return number of msrs set successfully. 
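The userspace-irqchip branch above rebuilds irq_summary from the raw bitmap: one summary bit per word of irq_pending, so the injection fast path can test a single long. The same reduction as a standalone helper (sizes illustrative):

/* One summary bit per pending-bitmap word, as rebuilt in
 * kvm_arch_vcpu_ioctl_set_sregs() for the userspace-irqchip case. */
static unsigned long rebuild_irq_summary(const unsigned long *pending,
					 unsigned nwords)
{
	unsigned long summary = 0;
	unsigned i;

	for (i = 0; i < nwords; i++)
		if (pending[i])
			summary |= 1UL << i;
	return summary;
}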
- */ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data), - int writeback) -{ - struct kvm_msrs msrs; - struct kvm_msr_entry *entries; - int r, n; - unsigned size; - - r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof msrs)) - goto out; - - r = -E2BIG; - if (msrs.nmsrs >= MAX_IO_MSRS) - goto out; - - r = -ENOMEM; - size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = vmalloc(size); - if (!entries) - goto out; - - r = -EFAULT; - if (copy_from_user(entries, user_msrs->entries, size)) - goto out_free; - - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; - - r = -EFAULT; - if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; - - r = n; - -out_free: - vfree(entries); -out: - return r; -} - -/* - * Translate a guest virtual address to a guest physical address. - */ -static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) -{ - unsigned long vaddr = tr->linear_address; - gpa_t gpa; - - vcpu_load(vcpu); - mutex_lock(&vcpu->kvm->lock); - gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); - tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; - tr->writeable = 1; - tr->usermode = 0; - mutex_unlock(&vcpu->kvm->lock); - vcpu_put(vcpu); - - return 0; -} - -static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) -{ - if (irq->irq < 0 || irq->irq >= 256) - return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) - return -ENXIO; - vcpu_load(vcpu); - - set_bit(irq->irq, vcpu->irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); - - vcpu_put(vcpu); - - return 0; -} - -static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg) { int r; @@ -2533,179 +2921,6 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, return r; } -static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) -{ - struct kvm_vcpu *vcpu = vma->vm_file->private_data; - unsigned long pgoff; - struct page *page; - - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - if (pgoff == 0) - page = virt_to_page(vcpu->run); - else if (pgoff == KVM_PIO_PAGE_OFFSET) - page = virt_to_page(vcpu->pio_data); - else - return NOPAGE_SIGBUS; - get_page(page); - if (type != NULL) - *type = VM_FAULT_MINOR; - - return page; -} - -static struct vm_operations_struct kvm_vcpu_vm_ops = { - .nopage = kvm_vcpu_nopage, -}; - -static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_vcpu_vm_ops; - return 0; -} - -static int kvm_vcpu_release(struct inode *inode, struct file *filp) -{ - struct kvm_vcpu *vcpu = filp->private_data; - - fput(vcpu->kvm->filp); - return 0; -} - -static struct file_operations kvm_vcpu_fops = { - .release = kvm_vcpu_release, - .unlocked_ioctl = kvm_vcpu_ioctl, - .compat_ioctl = kvm_vcpu_ioctl, - .mmap = kvm_vcpu_mmap, -}; - -/* - * Allocates an inode for the vcpu. - */ -static int create_vcpu_fd(struct kvm_vcpu *vcpu) -{ - int fd, r; - struct inode *inode; - struct file *file; - - r = anon_inode_getfd(&fd, &inode, &file, - "kvm-vcpu", &kvm_vcpu_fops, vcpu); - if (r) - return r; - atomic_inc(&vcpu->kvm->filp->f_count); - return fd; -} - -/* - * Creates some virtual cpus. Good luck creating more than one. 
- */ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) -{ - int r; - struct kvm_vcpu *vcpu; - - if (!valid_vcpu(n)) - return -EINVAL; - - vcpu = kvm_x86_ops->vcpu_create(kvm, n); - if (IS_ERR(vcpu)) - return PTR_ERR(vcpu); - - preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - - /* We do fxsave: this must be aligned. */ - BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); - - vcpu_load(vcpu); - r = kvm_mmu_setup(vcpu); - vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - - mutex_lock(&kvm->lock); - if (kvm->vcpus[n]) { - r = -EEXIST; - mutex_unlock(&kvm->lock); - goto mmu_unload; - } - kvm->vcpus[n] = vcpu; - mutex_unlock(&kvm->lock); - - /* Now it's all set up, let userspace reach it */ - r = create_vcpu_fd(vcpu); - if (r < 0) - goto unlink; - return r; - -unlink: - mutex_lock(&kvm->lock); - kvm->vcpus[n] = NULL; - mutex_unlock(&kvm->lock); - -mmu_unload: - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); - -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); - return r; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - u64 efer; - int i; - struct kvm_cpuid_entry *e, *entry; - - rdmsrl(MSR_EFER, efer); - entry = NULL; - for (i = 0; i < vcpu->cpuid_nent; ++i) { - e = &vcpu->cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) { - entry->edx &= ~(1 << 20); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - -static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out; - vcpu->cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - return 0; - -out: - return r; -} - -static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) -{ - if (sigset) { - sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - vcpu->sigset_active = 1; - vcpu->sigset = *sigset; - } else - vcpu->sigset_active = 0; - return 0; -} - /* * fxsave fpu state. Taken from x86_64/processor.h. To be killed when * we have asm/x86/processor.h @@ -2727,9 +2942,31 @@ struct fxsave { #endif }; -static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +/* + * Translate a guest virtual address to a guest physical address. 
+ */ +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; + unsigned long vaddr = tr->linear_address; + gpa_t gpa; + + vcpu_load(vcpu); + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); + up_read(¤t->mm->mmap_sem); + tr->physical_address = gpa; + tr->valid = gpa != UNMAPPED_GVA; + tr->writeable = 1; + tr->usermode = 0; + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; vcpu_load(vcpu); @@ -2747,9 +2984,9 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; vcpu_load(vcpu); @@ -2767,862 +3004,284 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +void fx_init(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); - memcpy(s->regs, vcpu->apic->regs, sizeof *s); - vcpu_put(vcpu); - - return 0; -} + unsigned after_mxcsr_mask; -static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - memcpy(vcpu->apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); - vcpu_put(vcpu); + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + fx_save(&vcpu->arch.host_fx_image); + fpu_init(); + fx_save(&vcpu->arch.guest_fx_image); + fx_restore(&vcpu->arch.host_fx_image); + preempt_enable(); - return 0; + vcpu->arch.cr0 |= X86_CR0_ET; + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->arch.guest_fx_image.mxcsr = 0x1f80; + memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } +EXPORT_SYMBOL_GPL(fx_init); -static long kvm_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - int r = -EINVAL; - - switch (ioctl) { - case KVM_RUN: - r = -EINVAL; - if (arg) - goto out; - r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); - break; - case KVM_GET_REGS: { - struct kvm_regs kvm_regs; - - memset(&kvm_regs, 0, sizeof kvm_regs); - r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) - goto out; - r = 0; - break; - } - case KVM_SET_REGS: { - struct kvm_regs kvm_regs; - - r = -EFAULT; - if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) - goto out; - r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); - if (r) - goto out; - r = 0; - break; - } - case KVM_GET_SREGS: { - struct kvm_sregs kvm_sregs; - - memset(&kvm_sregs, 0, sizeof kvm_sregs); - r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) - goto out; - r = 0; - break; - } - case KVM_SET_SREGS: { - struct kvm_sregs kvm_sregs; - - r = -EFAULT; - if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) - goto out; - r = kvm_vcpu_ioctl_set_sregs(vcpu, 
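fx_init above keeps the control words it just programmed (including the 0x1f80 default MXCSR) and zeroes only the register image that follows, using offsetof to find the boundary. The pattern in miniature, over a stand-in struct rather than the real i387_fxsave_struct layout:

#include <stddef.h>
#include <string.h>

struct fx_like {
	unsigned short cwd, swd, twd, fop;
	unsigned int mxcsr, mxcsr_mask;
	unsigned int st_space[32];	/* register image starts here */
	unsigned int xmm_space[64];
};

/* Zero everything from st_space onward, preserving the control words. */
static void clear_reg_image(struct fx_like *fx)
{
	size_t off = offsetof(struct fx_like, st_space);

	memset((char *)fx + off, 0, sizeof(*fx) - off);
}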
&kvm_sregs); - if (r) - goto out; - r = 0; - break; - } - case KVM_TRANSLATE: { - struct kvm_translation tr; - - r = -EFAULT; - if (copy_from_user(&tr, argp, sizeof tr)) - goto out; - r = kvm_vcpu_ioctl_translate(vcpu, &tr); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &tr, sizeof tr)) - goto out; - r = 0; - break; - } - case KVM_INTERRUPT: { - struct kvm_interrupt irq; - - r = -EFAULT; - if (copy_from_user(&irq, argp, sizeof irq)) - goto out; - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; - break; - } - case KVM_DEBUG_GUEST: { - struct kvm_debug_guest dbg; - - r = -EFAULT; - if (copy_from_user(&dbg, argp, sizeof dbg)) - goto out; - r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); - if (r) - goto out; - r = 0; - break; - } - case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); - break; - case KVM_SET_MSRS: - r = msr_io(vcpu, argp, do_set_msr, 0); - break; - case KVM_SET_CPUID: { - struct kvm_cpuid __user *cpuid_arg = argp; - struct kvm_cpuid cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_SET_SIGNAL_MASK: { - struct kvm_signal_mask __user *sigmask_arg = argp; - struct kvm_signal_mask kvm_sigmask; - sigset_t sigset, *p; - - p = NULL; - if (argp) { - r = -EFAULT; - if (copy_from_user(&kvm_sigmask, argp, - sizeof kvm_sigmask)) - goto out; - r = -EINVAL; - if (kvm_sigmask.len != sizeof sigset) - goto out; - r = -EFAULT; - if (copy_from_user(&sigset, sigmask_arg->sigset, - sizeof sigset)) - goto out; - p = &sigset; - } - r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); - break; - } - case KVM_GET_FPU: { - struct kvm_fpu fpu; - - memset(&fpu, 0, sizeof fpu); - r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &fpu, sizeof fpu)) - goto out; - r = 0; - break; - } - case KVM_SET_FPU: { - struct kvm_fpu fpu; - - r = -EFAULT; - if (copy_from_user(&fpu, argp, sizeof fpu)) - goto out; - r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); - if (r) - goto out; - r = 0; - break; - } - case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; - - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) - goto out; - r = 0; - break; - } - case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; + if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + return; - r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) - goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; - if (r) - goto out; - r = 0; - break; - } - default: - ; - } -out: - return r; + vcpu->guest_fpu_loaded = 1; + fx_save(&vcpu->arch.host_fx_image); + fx_restore(&vcpu->arch.guest_fx_image); } +EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); -static long kvm_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; - int r = -EINVAL; - - switch (ioctl) { - case KVM_CREATE_VCPU: - r = kvm_vm_ioctl_create_vcpu(kvm, arg); - if (r < 0) - goto out; - break; - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); - if (r) - goto out; - break; - } - case KVM_GET_DIRTY_LOG: { - struct kvm_dirty_log log; - - r = -EFAULT; - if 
(copy_from_user(&log, argp, sizeof log)) - goto out; - r = kvm_vm_ioctl_get_dirty_log(kvm, &log); - if (r) - goto out; - break; - } - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; - - r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); - if (r) - goto out; - break; - } - case KVM_CREATE_IRQCHIP: - r = -ENOMEM; - kvm->vpic = kvm_create_pic(kvm); - if (kvm->vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - kfree(kvm->vpic); - kvm->vpic = NULL; - goto out; - } - } - else - goto out; - break; - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - if (irqchip_in_kernel(kvm)) { - mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->vioapic, - irq_event.irq, - irq_event.level); - mutex_unlock(&kvm->lock); - r = 0; - } - break; - } - case KVM_GET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; - - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; - r = 0; - break; - } - case KVM_SET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + if (!vcpu->guest_fpu_loaded) + return; - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); - if (r) - goto out; - r = 0; - break; - } - default: - ; - } -out: - return r; + vcpu->guest_fpu_loaded = 0; + fx_save(&vcpu->arch.guest_fx_image); + fx_restore(&vcpu->arch.host_fx_image); + ++vcpu->stat.fpu_reload; } +EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); -static struct page *kvm_vm_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vma->vm_file->private_data; - unsigned long pgoff; - struct page *page; - - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - page = gfn_to_page(kvm, pgoff); - if (!page) - return NOPAGE_SIGBUS; - get_page(page); - if (type != NULL) - *type = VM_FAULT_MINOR; - - return page; + kvm_x86_ops->vcpu_free(vcpu); } -static struct vm_operations_struct kvm_vm_vm_ops = { - .nopage = kvm_vm_nopage, -}; - -static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, + unsigned int id) { - vma->vm_ops = &kvm_vm_vm_ops; - return 0; + return kvm_x86_ops->vcpu_create(kvm, id); } -static struct file_operations kvm_vm_fops = { - .release = kvm_vm_release, - .unlocked_ioctl = kvm_vm_ioctl, - .compat_ioctl = kvm_vm_ioctl, - .mmap = kvm_vm_mmap, -}; - -static int kvm_dev_ioctl_create_vm(void) +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - int fd, r; - struct inode *inode; - struct file *file; - struct kvm *kvm; + int r; - kvm = kvm_create_vm(); - if (IS_ERR(kvm)) - return PTR_ERR(kvm); - r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); - if (r) { - kvm_destroy_vm(kvm); - return r; - } + /* We do fxsave: this must be aligned. 
*/ + BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - kvm->filp = file; + vcpu_load(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r == 0) + r = kvm_mmu_setup(vcpu); + vcpu_put(vcpu); + if (r < 0) + goto free_vcpu; - return fd; + return 0; +free_vcpu: + kvm_x86_ops->vcpu_free(vcpu); + return r; } -static long kvm_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - void __user *argp = (void __user *)arg; - long r = -EINVAL; - - switch (ioctl) { - case KVM_GET_API_VERSION: - r = -EINVAL; - if (arg) - goto out; - r = KVM_API_VERSION; - break; - case KVM_CREATE_VM: - r = -EINVAL; - if (arg) - goto out; - r = kvm_dev_ioctl_create_vm(); - break; - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; - unsigned n; - - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) - goto out; - r = -E2BIG; - if (n < num_msrs_to_save) - goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) - goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), - &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) - goto out; - r = 0; - break; - } - case KVM_CHECK_EXTENSION: { - int ext = (long)argp; + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_HLT: - r = 1; - break; - default: - r = 0; - break; - } - break; - } - case KVM_GET_VCPU_MMAP_SIZE: - r = -EINVAL; - if (arg) - goto out; - r = 2 * PAGE_SIZE; - break; - default: - ; - } -out: - return r; + kvm_x86_ops->vcpu_free(vcpu); } -static struct file_operations kvm_chardev_ops = { - .unlocked_ioctl = kvm_dev_ioctl, - .compat_ioctl = kvm_dev_ioctl, -}; - -static struct miscdevice kvm_dev = { - KVM_MINOR, - "kvm", - &kvm_chardev_ops, -}; - -/* - * Make sure that a cpu that is being hot-unplugged does not have any vcpus - * cached on it. - */ -static void decache_vcpus_on_cpu(int cpu) +int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { - struct kvm *vm; - struct kvm_vcpu *vcpu; - int i; - - spin_lock(&kvm_lock); - list_for_each_entry(vm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = vm->vcpus[i]; - if (!vcpu) - continue; - /* - * If the vcpu is locked, then it is running on some - * other cpu and therefore it is not cached on the - * cpu in question. - * - * If it's not locked, check the last cpu it executed - * on. 
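The trylock below is the whole trick: if mutex_trylock succeeds the vcpu is not running anywhere, so its cached state may be examined; if it fails, the vcpu is live on some other cpu. A minimal sketch of the same probe using POSIX threads (hypothetical names; the kernel code uses struct mutex, not pthreads):

#include <pthread.h>

/* Returns 1 iff the vcpu is idle and last ran on @cpu; a failed trylock
 * means it is running elsewhere, so its cache cannot be on @cpu. */
static int vcpu_cached_on_cpu(pthread_mutex_t *vcpu_lock,
                              const int *last_cpu, int cpu)
{
        int cached = 0;

        if (pthread_mutex_trylock(vcpu_lock) == 0) {
                cached = (*last_cpu == cpu);
                pthread_mutex_unlock(vcpu_lock);
        }
        return cached;
}
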
- */ - if (mutex_trylock(&vcpu->mutex)) { - if (vcpu->cpu == cpu) { - kvm_x86_ops->vcpu_decache(vcpu); - vcpu->cpu = -1; - } - mutex_unlock(&vcpu->mutex); - } - } - spin_unlock(&kvm_lock); + return kvm_x86_ops->vcpu_reset(vcpu); } -static void hardware_enable(void *junk) +void kvm_arch_hardware_enable(void *garbage) { - int cpu = raw_smp_processor_id(); - - if (cpu_isset(cpu, cpus_hardware_enabled)) - return; - cpu_set(cpu, cpus_hardware_enabled); - kvm_x86_ops->hardware_enable(NULL); + kvm_x86_ops->hardware_enable(garbage); } -static void hardware_disable(void *junk) +void kvm_arch_hardware_disable(void *garbage) { - int cpu = raw_smp_processor_id(); - - if (!cpu_isset(cpu, cpus_hardware_enabled)) - return; - cpu_clear(cpu, cpus_hardware_enabled); - decache_vcpus_on_cpu(cpu); - kvm_x86_ops->hardware_disable(NULL); + kvm_x86_ops->hardware_disable(garbage); } -static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, - void *v) +int kvm_arch_hardware_setup(void) { - int cpu = (long)v; - - switch (val) { - case CPU_DYING: - case CPU_DYING_FROZEN: - printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", - cpu); - hardware_disable(NULL); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", - cpu); - smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", - cpu); - smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); - break; - } - return NOTIFY_OK; + return kvm_x86_ops->hardware_setup(); } -static int kvm_reboot(struct notifier_block *notifier, unsigned long val, - void *v) +void kvm_arch_hardware_unsetup(void) { - if (val == SYS_RESTART) { - /* - * Some (well, at least mine) BIOSes hang on reboot if - * in vmx root mode. 
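The handler below is the standard reboot-notifier pattern; any module can hook SYS_RESTART the same way. A minimal sketch with hypothetical names, assuming <linux/reboot.h> and <linux/notifier.h>:

#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot(struct notifier_block *nb, unsigned long val, void *v)
{
        if (val == SYS_RESTART)
                printk(KERN_INFO "my: quiescing hardware before reboot\n");
        return NOTIFY_OK;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call = my_reboot,
        .priority = 0,
};

/* Call register_reboot_notifier(&my_reboot_nb) at init time and
 * unregister_reboot_notifier(&my_reboot_nb) on teardown. */
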
- */ - printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(hardware_disable, NULL, 0, 1); - } - return NOTIFY_OK; + kvm_x86_ops->hardware_unsetup(); } -static struct notifier_block kvm_reboot_notifier = { - .notifier_call = kvm_reboot, - .priority = 0, -}; - -void kvm_io_bus_init(struct kvm_io_bus *bus) +void kvm_arch_check_processor_compat(void *rtn) { - memset(bus, 0, sizeof(*bus)); + kvm_x86_ops->check_processor_compatibility(rtn); } -void kvm_io_bus_destroy(struct kvm_io_bus *bus) +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - int i; + struct page *page; + struct kvm *kvm; + int r; - for (i = 0; i < bus->dev_count; i++) { - struct kvm_io_device *pos = bus->devs[i]; + BUG_ON(vcpu->kvm == NULL); + kvm = vcpu->kvm; - kvm_iodevice_destructor(pos); - } -} + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) + vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) -{ - int i; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->arch.pio_data = page_address(page); - for (i = 0; i < bus->dev_count; i++) { - struct kvm_io_device *pos = bus->devs[i]; + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; - if (pos->in_range(pos, addr)) - return pos; + if (irqchip_in_kernel(kvm)) { + r = kvm_create_lapic(vcpu); + if (r < 0) + goto fail_mmu_destroy; } - return NULL; -} - -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) -{ - BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + return 0; - bus->devs[bus->dev_count++] = dev; +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); +fail: + return r; } -static struct notifier_block kvm_cpu_notifier = { - .notifier_call = kvm_cpu_hotplug, - .priority = 20, /* must be > scheduler priority */ -}; - -static u64 stat_get(void *_offset) +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - unsigned offset = (long)_offset; - u64 total = 0; - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; - - spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (vcpu) - total += *(u32 *)((void *)vcpu + offset); - } - spin_unlock(&kvm_lock); - return total; + kvm_free_lapic(vcpu); + kvm_mmu_destroy(vcpu); + free_page((unsigned long)vcpu->arch.pio_data); } -DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n"); - -static __init void kvm_init_debug(void) +struct kvm *kvm_arch_create_vm(void) { - struct kvm_stats_debugfs_item *p; - - debugfs_dir = debugfs_create_dir("kvm", NULL); - for (p = debugfs_entries; p->name; ++p) - p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, - (void *)(long)p->offset, - &stat_fops); -} + struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); -static void kvm_exit_debug(void) -{ - struct kvm_stats_debugfs_item *p; + if (!kvm) + return ERR_PTR(-ENOMEM); - for (p = debugfs_entries; p->name; ++p) - debugfs_remove(p->dentry); - debugfs_remove(debugfs_dir); -} + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); -static int kvm_suspend(struct sys_device *dev, pm_message_t state) -{ - hardware_disable(NULL); - return 0; + return kvm; } -static int kvm_resume(struct sys_device *dev) +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - hardware_enable(NULL); - return 0; + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + 
vcpu_put(vcpu); } -static struct sysdev_class kvm_sysdev_class = { - .name = "kvm", - .suspend = kvm_suspend, - .resume = kvm_resume, -}; - -static struct sys_device kvm_sysdev = { - .id = 0, - .cls = &kvm_sysdev_class, -}; - -hpa_t bad_page_address; - -static inline -struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +static void kvm_free_vcpus(struct kvm *kvm) { - return container_of(pn, struct kvm_vcpu, preempt_notifier); -} + unsigned int i; -static void kvm_sched_in(struct preempt_notifier *pn, int cpu) -{ - struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + /* + * Unpin any mmu pages first. + */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i]) + kvm_unload_vcpu_mmu(kvm->vcpus[i]); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_arch_vcpu_free(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } - kvm_x86_ops->vcpu_load(vcpu, cpu); } -static void kvm_sched_out(struct preempt_notifier *pn, - struct task_struct *next) +void kvm_arch_destroy_vm(struct kvm *kvm) { - struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - - kvm_x86_ops->vcpu_put(vcpu); + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); + kfree(kvm); } -int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, - struct module *module) +int kvm_arch_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) { - int r; - int cpu; - - if (kvm_x86_ops) { - printk(KERN_ERR "kvm: already loaded the other module\n"); - return -EEXIST; - } + int npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; - if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); - return -EOPNOTSUPP; - } - if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); - return -EOPNOTSUPP; - } - - kvm_x86_ops = ops; - - r = kvm_x86_ops->hardware_setup(); - if (r < 0) - goto out; - - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, - kvm_x86_ops->check_processor_compatibility, - &r, 0, 1); - if (r < 0) - goto out_free_0; - } - - on_each_cpu(hardware_enable, NULL, 0, 1); - r = register_cpu_notifier(&kvm_cpu_notifier); - if (r) - goto out_free_1; - register_reboot_notifier(&kvm_reboot_notifier); - - r = sysdev_class_register(&kvm_sysdev_class); - if (r) - goto out_free_2; - - r = sysdev_register(&kvm_sysdev); - if (r) - goto out_free_3; - - /* A kmem cache lets us meet the alignment requirements of fx_save. */ - kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, - __alignof__(struct kvm_vcpu), 0, 0); - if (!kvm_vcpu_cache) { - r = -ENOMEM; - goto out_free_4; + /*To keep backward compatibility with older userspace, + *x86 needs to handle !user_alloc case. 
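For contrast, current userspace takes the user_alloc path: the launcher mmap()s guest RAM itself and hands the mapping to the kernel via KVM_SET_USER_MEMORY_REGION, so the in-kernel do_mmap() fallback below is never needed. A sketch of that launcher-side call (error handling elided; set_guest_ram is a hypothetical helper):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Map guest RAM in userspace and register it with the VM: the
 * user_alloc case of kvm_arch_set_memory_region(). */
static int set_guest_ram(int vm_fd, __u64 size)
{
        void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot = 0,
                .guest_phys_addr = 0,
                .memory_size = size,
                .userspace_addr = (__u64)(unsigned long)ram,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
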
+ */ + if (!user_alloc) { + if (npages && !old.rmap) { + memslot->userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + 0); + + if (IS_ERR((void *)memslot->userspace_addr)) + return PTR_ERR((void *)memslot->userspace_addr); + } else { + if (!old.user_alloc && old.rmap) { + int ret; + + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + } } - kvm_chardev_ops.owner = module; - - r = misc_register(&kvm_dev); - if (r) { - printk (KERN_ERR "kvm: misc device register failed\n"); - goto out_free; + if (!kvm->arch.n_requested_mmu_pages) { + unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); } - kvm_preempt_ops.sched_in = kvm_sched_in; - kvm_preempt_ops.sched_out = kvm_sched_out; - - return r; + kvm_mmu_slot_remove_write_access(kvm, mem->slot); + kvm_flush_remote_tlbs(kvm); -out_free: - kmem_cache_destroy(kvm_vcpu_cache); -out_free_4: - sysdev_unregister(&kvm_sysdev); -out_free_3: - sysdev_class_unregister(&kvm_sysdev_class); -out_free_2: - unregister_reboot_notifier(&kvm_reboot_notifier); - unregister_cpu_notifier(&kvm_cpu_notifier); -out_free_1: - on_each_cpu(hardware_disable, NULL, 0, 1); -out_free_0: - kvm_x86_ops->hardware_unsetup(); -out: - kvm_x86_ops = NULL; - return r; + return 0; } -void kvm_exit_x86(void) +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - misc_deregister(&kvm_dev); - kmem_cache_destroy(kvm_vcpu_cache); - sysdev_unregister(&kvm_sysdev); - sysdev_class_unregister(&kvm_sysdev_class); - unregister_reboot_notifier(&kvm_reboot_notifier); - unregister_cpu_notifier(&kvm_cpu_notifier); - on_each_cpu(hardware_disable, NULL, 0, 1); - kvm_x86_ops->hardware_unsetup(); - kvm_x86_ops = NULL; + return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE + || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; } -static __init int kvm_init(void) +static void vcpu_kick_intr(void *info) { - static struct page *bad_page; - int r; - - r = kvm_mmu_module_init(); - if (r) - goto out4; - - kvm_init_debug(); - - kvm_init_msr_list(); - - if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { - r = -ENOMEM; - goto out; - } - - bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; - memset(__va(bad_page_address), 0, PAGE_SIZE); - - return 0; - -out: - kvm_exit_debug(); - kvm_mmu_module_exit(); -out4: - return r; +#ifdef DEBUG + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; + printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); +#endif } -static __exit void kvm_exit(void) +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { - kvm_exit_debug(); - __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); - kvm_mmu_module_exit(); -} - -module_init(kvm_init) -module_exit(kvm_exit) + int ipi_pcpu = vcpu->cpu; -EXPORT_SYMBOL_GPL(kvm_init_x86); -EXPORT_SYMBOL_GPL(kvm_exit_x86); + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + ++vcpu->stat.halt_wakeup; + } + if (vcpu->guest_mode) + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); +} diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c new file mode 100644 index 00000000000..79586003397 --- /dev/null +++ b/arch/x86/kvm/x86_emulate.c @@ -0,0 +1,1912 @@ +/****************************************************************************** + * x86_emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. 
+ * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privileged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include <stdio.h> +#include <stdint.h> +#include <public/xen.h> +#define DPRINTF(_f, _a ...) printf(_f , ## _a) +#else +#include <linux/kvm_host.h> +#define DPRINTF(x...) do {} while (0) +#endif +#include <linux/module.h> +#include <asm/kvm_x86_emulate.h> + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov <imm>,<reg>' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstMask (3<<1) +/* Source operand type. */ +#define SrcNone (0<<3) /* No source operand. */ +#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<3) /* Register operand. */ +#define SrcMem (2<<3) /* Memory operand. */ +#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ +#define SrcImm (5<<3) /* Immediate operand. */ +#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<3) +/* Generic ModRM decode. */ +#define ModRM (1<<6) +/* Destination is only written; never read. 
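Each opcode_table entry is one of these packed u16 words: ByteOp sits in bit 0, the destination type in bits 2:1 and the source type in bits 5:3. A small illustrative decoder, mirroring the switch (c->d & SrcMask) and switch (c->d & DstMask) dispatch later in x86_decode_insn (hypothetical helper, not part of the patch):

#include <stdio.h>

#define ByteOp  (1 << 0)
#define DstMask (3 << 1)
#define SrcMask (7 << 3)

/* E.g. the 0x00 entry, ByteOp | DstMem | SrcReg | ModRM, decodes to
 * byteop=1 dst=3 (DstMem) src=1 (SrcReg). */
static void show_decode_word(unsigned short d)
{
        printf("byteop=%u dst=%u src=%u\n",
               d & ByteOp, (d & DstMask) >> 1, (d & SrcMask) >> 3);
}
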
*/ +#define Mov (1<<7) +#define BitOp (1<<8) +#define MemAbs (1<<9) /* Memory operand is absolute displacement */ +#define String (1<<10) /* String instruction (rep capable) */ +#define Stack (1<<11) /* Stack instruction (push/pop) */ + +static u16 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + SrcImmByte, SrcImm, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x48 - 0x4F */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x50 - 0x57 */ + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + /* 0x58 - 0x5F */ + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + /* 0x60 - 0x67 */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + 0, 0, ImplicitOps | Mov | Stack, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x78 - 0x7F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x80 - 0x87 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov | String, 
ImplicitOps | Mov | String, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xB0 - 0xBF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps | Stack, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE8 - 0xEF */ + ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, + 0, 0, 0, 0, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + ImplicitOps, ImplicitOps, + ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + /* 0xF8 - 0xFF */ + ImplicitOps, 0, ImplicitOps, ImplicitOps, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM +}; + +static u16 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* EFLAGS bit definitions. 
*/ +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(CONFIG_X86_64) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ + "movl %"_sav",%"_LO32 _tmp"; " \ + "push %"_tmp"; " \ + "push %"_tmp"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + "pop %"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"w %"_wx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _wy ((_src).val), "i" (EFLAGS_MASK)); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"l %"_lx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _ly ((_src).val), "i" (EFLAGS_MASK)); \ + break; \ + case 8: \ + __emulate_2op_8byte(_op, _src, _dst, \ + _eflags, _qx, _qy); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ((_dst).bytes) { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"b %"_bx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _by ((_src).val), "i" (EFLAGS_MASK)); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. 
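What these macros generate boils down to: patch the saved EFLAGS bits into the live flags, execute exactly one arithmetic instruction, then capture EFLAGS again. A simplified, runnable distillation of the capture half for a 32-bit add (GCC inline asm, x86 host assumed; this is a sketch, not the literal macro expansion, and it omits the _PRE_EFLAGS restore step):

#include <stdint.h>
#include <stdio.h>

/* Emulate "addl src,dst" and capture the resulting EFLAGS. */
static unsigned long emulate_add32(uint32_t *dst, uint32_t src)
{
        unsigned long flags;

        __asm__ __volatile__(
                "addl %2, %1\n\t"
                "pushf\n\t"
                "pop %0"
                : "=r" (flags), "+m" (*dst)
                : "r" (src)
                : "cc");
        return flags;
}

int main(void)
{
        uint32_t x = 0xffffffff;
        unsigned long f = emulate_add32(&x, 1);

        /* 0xffffffff + 1 wraps to 0: both CF (bit 0) and ZF (bit 6) set. */
        printf("x=%u CF=%lu ZF=%lu\n", x, f & 1, (f >> 6) & 1);
        return 0;
}
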
*/ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"b %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + break; \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"w %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"l %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + break; \ + case 8: \ + __emulate_1op_8byte(_op, _dst, _eflags); \ + break; \ + } \ + } while (0) + +/* Emulate an instruction with quadword operands (x86/64 only). */ +#if defined(CONFIG_X86_64) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"q %"_qx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _qy ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + +#define __emulate_1op_8byte(_op, _dst, _eflags) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"q %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + } while (0) + +#elif defined(__i386__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) +#define __emulate_1op_8byte(_op, _dst, _eflags) +#endif /* __i386__ */ + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ + if (rc != 0) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +/* Access/update address held in a register, based on addressing mode. */ +#define address_mask(reg) \ + ((c->ad_bytes == sizeof(unsigned long)) ? 
\ + (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) +#define register_address(base, reg) \ + ((base) + address_mask(reg)) +#define register_address_increment(reg, inc) \ + do { \ + /* signed type ensures sign extension to long */ \ + int _inc = (inc); \ + if (c->ad_bytes == sizeof(unsigned long)) \ + (reg) += _inc; \ + else \ + (reg) = ((reg) & \ + ~((1UL << (c->ad_bytes << 3)) - 1)) | \ + (((reg) + _inc) & \ + ((1UL << (c->ad_bytes << 3)) - 1)); \ + } while (0) + +#define JMP_REL(rel) \ + do { \ + register_address_increment(c->eip, rel); \ + } while (0) + +static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long linear, u8 *dest) +{ + struct fetch_cache *fc = &ctxt->decode.fetch; + int rc; + int size; + + if (linear < fc->start || linear >= fc->end) { + size = min(15UL, PAGE_SIZE - offset_in_page(linear)); + rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + if (rc) + return rc; + fc->start = linear; + fc->end = linear + size; + } + *dest = fc->data[linear - fc->start]; + return 0; +} + +static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long eip, void *dest, unsigned size) +{ + int rc = 0; + + eip += ctxt->cs_base; + while (size--) { + rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); + if (rc) + return rc; + } + return 0; +} + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); + return rc; +} + +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. 
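Concretely: condition 0x4 selects the ZF predicate ("z/e"), and 0x5 is the same predicate with the sense flipped by the low bit, which is how je and jne share one table entry. A cut-down check (cc_zero is a hypothetical specialization of test_cc, using the EFLG_* values defined above):

#include <assert.h>

#define EFLG_ZF (1 << 6)

static int cc_zero(unsigned int condition, unsigned int flags)
{
        int rc = !!(flags & EFLG_ZF);   /* predicate for (condition>>1)==2 */

        return rc ^ (condition & 1);    /* low bit inverts the sense */
}

static void demo(void)
{
        assert(cc_zero(0x4, EFLG_ZF) == 1);     /* je taken: ZF=1 */
        assert(cc_zero(0x5, EFLG_ZF) == 0);     /* jne skipped: ZF=1 */
        assert(cc_zero(0x5, 0) == 1);           /* jne taken: ZF=0 */
}
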
*/ + return (!!rc ^ (condition & 1)); +} + +static void decode_register_operand(struct operand *op, + struct decode_cache *c, + int inhibit_bytereg) +{ + unsigned reg = c->modrm_reg; + int highbyte_regs = c->rex_prefix == 0; + + if (!(c->d & ModRM)) + reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + op->type = OP_REG; + if ((c->d & ByteOp) && !inhibit_bytereg) { + op->ptr = decode_register(reg, c->regs, highbyte_regs); + op->val = *(u8 *)op->ptr; + op->bytes = 1; + } else { + op->ptr = decode_register(reg, c->regs, 0); + op->bytes = c->op_bytes; + switch (op->bytes) { + case 2: + op->val = *(u16 *)op->ptr; + break; + case 4: + op->val = *(u32 *)op->ptr; + break; + case 8: + op->val = *(u64 *) op->ptr; + break; + } + } + op->orig_val = op->val; +} + +static int decode_modrm(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + u8 sib; + int index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int rc = 0; + + if (c->rex_prefix) { + c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ + index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ + c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ + } + + c->modrm = insn_fetch(u8, 1, c->eip); + c->modrm_mod |= (c->modrm & 0xc0) >> 6; + c->modrm_reg |= (c->modrm & 0x38) >> 3; + c->modrm_rm |= (c->modrm & 0x07); + c->modrm_ea = 0; + c->use_modrm_ea = 1; + + if (c->modrm_mod == 3) { + c->modrm_val = *(unsigned long *) + decode_register(c->modrm_rm, c->regs, c->d & ByteOp); + return rc; + } + + if (c->ad_bytes == 2) { + unsigned bx = c->regs[VCPU_REGS_RBX]; + unsigned bp = c->regs[VCPU_REGS_RBP]; + unsigned si = c->regs[VCPU_REGS_RSI]; + unsigned di = c->regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 6) + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + } + switch (c->modrm_rm) { + case 0: + c->modrm_ea += bx + si; + break; + case 1: + c->modrm_ea += bx + di; + break; + case 2: + c->modrm_ea += bp + si; + break; + case 3: + c->modrm_ea += bp + di; + break; + case 4: + c->modrm_ea += si; + break; + case 5: + c->modrm_ea += di; + break; + case 6: + if (c->modrm_mod != 0) + c->modrm_ea += bp; + break; + case 7: + c->modrm_ea += bx; + break; + } + if (c->modrm_rm == 2 || c->modrm_rm == 3 || + (c->modrm_rm == 6 && c->modrm_mod != 0)) + if (!c->override_base) + c->override_base = &ctxt->ss_base; + c->modrm_ea = (u16)c->modrm_ea; + } else { + /* 32/64-bit ModR/M decode. 
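The 16-bit switch just above maps rm to fixed register sums, with rm==6 meaning a bare disp16 when mod==0 and BP-based otherwise. A freestanding restatement (ea16 is a hypothetical helper; the final cast mirrors c->modrm_ea = (u16)c->modrm_ea):

#include <stdint.h>

/* 16-bit ModR/M effective address: disp is the already-fetched
 * displacement (0, disp8 or disp16 depending on mod). */
static uint16_t ea16(int mod, int rm, uint16_t bx, uint16_t bp,
                     uint16_t si, uint16_t di, uint16_t disp)
{
        uint32_t ea = disp;

        switch (rm) {
        case 0: ea += bx + si; break;
        case 1: ea += bx + di; break;
        case 2: ea += bp + si; break;
        case 3: ea += bp + di; break;
        case 4: ea += si;      break;
        case 5: ea += di;      break;
        case 6: ea += (mod != 0) ? bp : 0; break; /* mod==0: pure disp16 */
        case 7: ea += bx;      break;
        }
        return (uint16_t)ea;    /* wrap within the 64K segment */
}
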
*/ + switch (c->modrm_rm) { + case 4: + case 12: + sib = insn_fetch(u8, 1, c->eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + switch (base_reg) { + case 5: + if (c->modrm_mod != 0) + c->modrm_ea += c->regs[base_reg]; + else + c->modrm_ea += + insn_fetch(s32, 4, c->eip); + break; + default: + c->modrm_ea += c->regs[base_reg]; + } + switch (index_reg) { + case 4: + break; + default: + c->modrm_ea += c->regs[index_reg] << scale; + } + break; + case 5: + if (c->modrm_mod != 0) + c->modrm_ea += c->regs[c->modrm_rm]; + else if (ctxt->mode == X86EMUL_MODE_PROT64) + rip_relative = 1; + break; + default: + c->modrm_ea += c->regs[c->modrm_rm]; + break; + } + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 5) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + } + } + if (rip_relative) { + c->modrm_ea += c->eip; + switch (c->d & SrcMask) { + case SrcImmByte: + c->modrm_ea += 1; + break; + case SrcImm: + if (c->d & ByteOp) + c->modrm_ea += 1; + else + if (c->op_bytes == 8) + c->modrm_ea += 4; + else + c->modrm_ea += c->op_bytes; + } + } +done: + return rc; +} + +static int decode_abs(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->ad_bytes) { + case 2: + c->modrm_ea = insn_fetch(u16, 2, c->eip); + break; + case 4: + c->modrm_ea = insn_fetch(u32, 4, c->eip); + break; + case 8: + c->modrm_ea = insn_fetch(u64, 8, c->eip); + break; + } +done: + return rc; +} + +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes; + + /* Shadow copy of register state. Committed on successful emulation. */ + + memset(c, 0, sizeof(struct decode_cache)); + c->eip = ctxt->vcpu->arch.rip; + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x2e: /* CS override */ + c->override_base = &ctxt->cs_base; + break; + case 0x3e: /* DS override */ + c->override_base = &ctxt->ds_base; + break; + case 0x26: /* ES override */ + c->override_base = &ctxt->es_base; + break; + case 0x64: /* FS override */ + c->override_base = &ctxt->fs_base; + break; + case 0x65: /* GS override */ + c->override_base = &ctxt->gs_base; + break; + case 0x36: /* SS override */ + c->override_base = &ctxt->ss_base; + break; + case 0x40 ... 
0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix) + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + c->d = opcode_table[c->b]; + if (c->d == 0) { + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + c->d = twobyte_table[c->b]; + } + + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) + rc = decode_modrm(ctxt, ops); + else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops); + if (rc) + goto done; + + if (!c->override_base) + c->override_base = &ctxt->ds_base; + if (mode == X86EMUL_MODE_PROT64 && + c->override_base != &ctxt->fs_base && + c->override_base != &ctxt->gs_base) + c->override_base = NULL; + + if (c->override_base) + c->modrm_ea += *c->override_base; + + if (c->ad_bytes != 8) + c->modrm_ea = (u32)c->modrm_ea; + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + c->src.bytes = 2; + goto srcmem_common; + case SrcMem32: + c->src.bytes = 4; + goto srcmem_common; + case SrcMem: + c->src.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->src.type = OP_REG; + break; + } + c->src.type = OP_MEM; + break; + case SrcImm: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + break; + case SrcImmByte: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = 1; + c->src.val = insn_fetch(s8, 1, c->eip); + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + return 0; + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstMem: + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.type = OP_REG; + break; + } + c->dst.type = OP_MEM; + break; + } + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; +} + +static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.ptr = (void *) register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]); +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = ops->read_std(register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]), + &c->dst.val, c->dst.bytes, ctxt->vcpu); + if (rc != 0) + return rc; + + register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); + + return 0; +} + +static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + switch (c->modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); + break; + } +} + +static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->modrm_reg) { + case 0 ... 1: /* test */ + /* + * Special case in Grp3: test has an immediate + * source operand. + */ + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 2: /* not */ + c->dst.val = ~c->dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", c->dst, ctxt->eflags); + break; + default: + DPRINTF("Cannot emulate %02x\n", c->b); + rc = X86EMUL_UNHANDLEABLE; + break; + } +done: + return rc; +} + +static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + switch (c->modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 1: /* dec */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 4: /* jmp abs */ + if (c->b == 0xff) + c->eip = c->dst.val; + else { + DPRINTF("Cannot emulate %02x\n", c->b); + return X86EMUL_UNHANDLEABLE; + } + break; + case 6: /* push */ + + /* 64-bit mode: PUSH always pushes a 64-bit operand. 
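Both emulate_push() and the Grp5 push case reduce to the same two steps: predecrement the stack pointer by the operand size, then store the value at ss_base + RSP. A toy self-contained version against a flat buffer (guest_ram and guest_write are hypothetical stand-ins for write_emulated; little-endian host assumed):

#include <stdint.h>
#include <string.h>

static uint8_t guest_ram[0x10000];     /* toy guest memory */

static void guest_write(uint64_t addr, const void *val, int bytes)
{
        memcpy(&guest_ram[addr], val, bytes);
}

/* Same two steps as emulate_push(): drop RSP by the operand size,
 * then store the value at ss_base + RSP. */
static void push(uint64_t ss_base, uint64_t *rsp, uint64_t val, int bytes)
{
        *rsp -= bytes;
        guest_write(ss_base + *rsp, &val, bytes);
}
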
*/ + + if (ctxt->mode == X86EMUL_MODE_PROT64) { + c->dst.bytes = 8; + rc = ops->read_std((unsigned long)c->dst.ptr, + &c->dst.val, 8, ctxt->vcpu); + if (rc != 0) + return rc; + } + register_address_increment(c->regs[VCPU_REGS_RSP], + -c->dst.bytes); + rc = ops->write_emulated(register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]), &c->dst.val, + c->dst.bytes, ctxt->vcpu); + if (rc != 0) + return rc; + c->dst.type = OP_NONE; + break; + default: + DPRINTF("Cannot emulate %02x\n", c->b); + return X86EMUL_UNHANDLEABLE; + } + return 0; +} + +static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long memop) +{ + struct decode_cache *c = &ctxt->decode; + u64 old, new; + int rc; + + rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); + if (rc != 0) + return rc; + + if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { + + c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); + c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + ctxt->eflags &= ~EFLG_ZF; + + } else { + new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; + + rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); + if (rc != 0) + return rc; + ctxt->eflags |= EFLG_ZF; + } + return 0; +} + +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != 0) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return 0; +} + +int +x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + unsigned long memop = 0; + u64 msr_data; + unsigned long saved_eip = 0; + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + /* Shadow copy of register state. Committed on successful emulation. + * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't + * modify them. + */ + + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + saved_eip = c->eip; + + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) + memop = c->modrm_ea; + + if (c->rep_prefix && (c->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (c->regs[VCPU_REGS_RCX] == 0) { + ctxt->vcpu->arch.rip = c->eip; + goto done; + } + /* The second termination condition only applies for REPE + * and REPNE. 
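Sketched as plain C, the full REP termination test looks like this (rep_done is a hypothetical restatement, not code from the patch):

/* All REP prefixes stop when RCX is 0; REPE additionally stops when
 * ZF==0 and REPNE when ZF==1, checked after each cmps/scas step. */
static int rep_done(int is_repe, int is_repne, unsigned long rcx, int zf)
{
        if (rcx == 0)
                return 1;
        if (is_repe && !zf)
                return 1;
        if (is_repne && zf)
                return 1;
        return 0;
}
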
Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if ((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) { + if ((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) { + ctxt->vcpu->arch.rip = c->eip; + goto done; + } + if ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { + ctxt->vcpu->arch.rip = c->eip; + goto done; + } + } + c->regs[VCPU_REGS_RCX]--; + c->eip = ctxt->vcpu->arch.rip; + } + + if (c->src.type == OP_MEM) { + c->src.ptr = (unsigned long *)memop; + c->src.val = 0; + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != 0) + goto done; + c->src.orig_val = c->src.val; + } + + if ((c->d & DstMask) == ImplicitOps) + goto special_insn; + + + if (c->dst.type == OP_MEM) { + c->dst.ptr = (unsigned long *)memop; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } + if (!(c->d & Mov) && + /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0)) + goto done; + } + c->dst.orig_val = c->dst.val; + +special_insn: + + if (c->twobyte) + goto twobyte_insn; + + switch (c->b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + break; + case 0x20 ... 0x23: + and: /* and */ + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + break; + case 0x24: /* and al imm8 */ + c->dst.type = OP_REG; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + c->dst.val = *(u8 *)c->dst.ptr; + c->dst.bytes = 1; + c->dst.orig_val = c->dst.val; + goto and; + case 0x25: /* and ax imm16, or eax imm32 */ + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + if (c->op_bytes == 2) + c->dst.val = *(u16 *)c->dst.ptr; + else + c->dst.val = *(u32 *)c->dst.ptr; + c->dst.orig_val = c->dst.val; + goto and; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + break; + case 0x40 ... 0x47: /* inc r16/r32 */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 0x48 ... 0x4f: /* dec r16/r32 */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 0x50 ... 0x57: /* push reg */ + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c->regs[VCPU_REGS_RSP], + -c->op_bytes); + c->dst.ptr = (void *) register_address( + ctxt->ss_base, c->regs[VCPU_REGS_RSP]); + break; + case 0x58 ... 
0x5f: /* pop reg */ + pop_instruction: + if ((rc = ops->read_std(register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]), c->dst.ptr, + c->op_bytes, ctxt->vcpu)) != 0) + goto done; + + register_address_increment(c->regs[VCPU_REGS_RSP], + c->op_bytes); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0x63: /* movsxd */ + if (ctxt->mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + c->dst.val = (s32) c->src.val; + break; + case 0x6a: /* push imm8 */ + c->src.val = 0L; + c->src.val = insn_fetch(s8, 1, c->eip); + emulate_push(ctxt); + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 1, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(ctxt->es_base, + c->regs[VCPU_REGS_RDI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 0, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c->override_base ? + *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x70 ... 0x7f: /* jcc (short) */ { + int rel = insn_fetch(s8, 1, c->eip); + + if (test_cc(c->b, ctxt->eflags)) + JMP_REL(rel); + break; + } + case 0x80 ... 0x83: /* Grp1 */ + switch (c->modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 0x86 ... 0x87: /* xchg */ + /* Write back the register source. */ + switch (c->dst.bytes) { + case 1: + *(u8 *) c->src.ptr = (u8) c->dst.val; + break; + case 2: + *(u16 *) c->src.ptr = (u16) c->dst.val; + break; + case 4: + *c->src.ptr = (u32) c->dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *c->src.ptr = c->dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + c->dst.val = c->src.val; + c->lock_prefix = 1; + break; + case 0x88 ... 0x8b: /* mov */ + goto mov; + case 0x8d: /* lea r16/r32, m */ + c->dst.val = c->modrm_val; + break; + case 0x8f: /* pop (sole member of Grp1a) */ + rc = emulate_grp1a(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0x9c: /* pushf */ + c->src.val = (unsigned long) ctxt->eflags; + emulate_push(ctxt); + break; + case 0x9d: /* popf */ + c->dst.ptr = (unsigned long *) &ctxt->eflags; + goto pop_instruction; + case 0xa0 ... 0xa1: /* mov */ + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.val = c->src.val; + break; + case 0xa2 ... 0xa3: /* mov */ + c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; + break; + case 0xa4 ... 0xa5: /* movs */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address( + ctxt->es_base, + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address( + c->override_base ? *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? 
-c->dst.bytes + : c->dst.bytes); + register_address_increment(c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + c->src.type = OP_NONE; /* Disable writeback. */ + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *)register_address( + c->override_base ? *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]); + if ((rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu)) != 0) + goto done; + + c->dst.type = OP_NONE; /* Disable writeback. */ + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address( + ctxt->es_base, + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + + register_address_increment(c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->src.bytes + : c->src.bytes); + register_address_increment(c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + + break; + case 0xaa ... 0xab: /* stos */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address( + ctxt->es_base, + c->regs[VCPU_REGS_RDI]); + c->dst.val = c->regs[VCPU_REGS_RAX]; + register_address_increment(c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(register_address( + c->override_base ? *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + register_address_increment(c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + case 0xc0 ... 0xc1: + emulate_grp2(ctxt); + break; + case 0xc3: /* ret */ + c->dst.ptr = &c->eip; + goto pop_instruction; + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + c->dst.val = c->src.val; + break; + case 0xd0 ... 0xd1: /* Grp2 */ + c->src.val = 1; + emulate_grp2(ctxt); + break; + case 0xd2 ... 0xd3: /* Grp2 */ + c->src.val = c->regs[VCPU_REGS_RCX]; + emulate_grp2(ctxt); + break; + case 0xe8: /* call (near) */ { + long int rel; + switch (c->op_bytes) { + case 2: + rel = insn_fetch(s16, 2, c->eip); + break; + case 4: + rel = insn_fetch(s32, 4, c->eip); + break; + default: + DPRINTF("Call: Invalid op_bytes\n"); + goto cannot_emulate; + } + c->src.val = (unsigned long) c->eip; + JMP_REL(rel); + c->op_bytes = c->ad_bytes; + emulate_push(ctxt); + break; + } + case 0xe9: /* jmp rel */ + case 0xeb: /* jmp rel short */ + JMP_REL(c->src.val); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf4: /* hlt */ + ctxt->vcpu->arch.halt_request = 1; + goto done; + case 0xf5: /* cmc */ + /* complement carry flag from eflags reg */ + ctxt->eflags ^= EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf6 ... 0xf7: /* Grp3 */ + rc = emulate_grp3(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0xf8: /* clc */ + ctxt->eflags &= ~EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. 
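All the string cases above (movs, cmps, stos, lods) advance RSI/RDI by plus or minus the operand size depending on EFLAGS.DF, via register_address_increment(). That choice in isolation (string_step is a hypothetical helper):

/* Direction flag set: string ops walk downward; clear: upward. */
static long string_step(unsigned long eflags, int bytes)
{
        const unsigned long EFLG_DF = 1UL << 10;

        return (eflags & EFLG_DF) ? -(long)bytes : (long)bytes;
}
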
+				 */
+		break;
+	case 0xfa: /* cli */
+		ctxt->eflags &= ~X86_EFLAGS_IF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xfb: /* sti */
+		ctxt->eflags |= X86_EFLAGS_IF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xfe ... 0xff:	/* Grp4/Grp5 */
+		rc = emulate_grp45(ctxt, ops);
+		if (rc != 0)
+			goto done;
+		break;
+	}
+
+writeback:
+	rc = writeback(ctxt, ops);
+	if (rc != 0)
+		goto done;
+
+	/* Commit shadow register state. */
+	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+	ctxt->vcpu->arch.rip = c->eip;
+
+done:
+	if (rc == X86EMUL_UNHANDLEABLE) {
+		c->eip = saved_eip;
+		return -1;
+	}
+	return 0;
+
+twobyte_insn:
+	switch (c->b) {
+	case 0x01: /* lgdt, lidt, lmsw */
+		switch (c->modrm_reg) {
+			u16 size;
+			unsigned long address;
+
+		case 0: /* vmcall */
+			if (c->modrm_mod != 3 || c->modrm_rm != 1)
+				goto cannot_emulate;
+
+			rc = kvm_fix_hypercall(ctxt->vcpu);
+			if (rc)
+				goto done;
+
+			kvm_emulate_hypercall(ctxt->vcpu);
+			break;
+		case 2: /* lgdt */
+			rc = read_descriptor(ctxt, ops, c->src.ptr,
+					     &size, &address, c->op_bytes);
+			if (rc)
+				goto done;
+			realmode_lgdt(ctxt->vcpu, size, address);
+			break;
+		case 3: /* lidt/vmmcall */
+			if (c->modrm_mod == 3 && c->modrm_rm == 1) {
+				rc = kvm_fix_hypercall(ctxt->vcpu);
+				if (rc)
+					goto done;
+				kvm_emulate_hypercall(ctxt->vcpu);
+			} else {
+				rc = read_descriptor(ctxt, ops, c->src.ptr,
+						     &size, &address,
+						     c->op_bytes);
+				if (rc)
+					goto done;
+				realmode_lidt(ctxt->vcpu, size, address);
+			}
+			break;
+		case 4: /* smsw */
+			if (c->modrm_mod != 3)
+				goto cannot_emulate;
+			*(u16 *)&c->regs[c->modrm_rm]
+				= realmode_get_cr(ctxt->vcpu, 0);
+			break;
+		case 6: /* lmsw */
+			if (c->modrm_mod != 3)
+				goto cannot_emulate;
+			realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+				      &ctxt->eflags);
+			break;
+		case 7: /* invlpg */
+			emulate_invlpg(ctxt->vcpu, memop);
+			break;
+		default:
+			goto cannot_emulate;
+		}
+		/* Disable writeback.
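+		 * None of these Grp7 handlers leaves a result for the
+		 * common write-back pass: smsw stores straight into the
+		 * register above, and the rest have no destination operand.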
*/ + c->dst.type = OP_NONE; + break; + case 0x06: + emulate_clts(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ + case 0x09: /* wbinvd */ + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + c->dst.type = OP_NONE; + break; + case 0x20: /* mov cr, reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + c->regs[c->modrm_rm] = + realmode_get_cr(ctxt->vcpu, c->modrm_reg); + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x21: /* mov from dr to reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x22: /* mov reg, cr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, + c->modrm_reg, c->modrm_val, &ctxt->eflags); + c->dst.type = OP_NONE; + break; + case 0x23: /* mov from reg to dr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, c->modrm_reg, + c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x30: + /* wrmsr */ + msr_data = (u32)c->regs[VCPU_REGS_RAX] + | ((u64)c->regs[VCPU_REGS_RDX] << 32); + rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = ctxt->vcpu->arch.rip; + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x32: + /* rdmsr */ + rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = ctxt->vcpu->arch.rip; + } else { + c->regs[VCPU_REGS_RAX] = (u32)msr_data; + c->regs[VCPU_REGS_RDX] = msr_data >> 32; + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x40 ... 0x4f: /* cmov */ + c->dst.val = c->dst.orig_val = c->src.val; + if (!test_cc(c->b, ctxt->eflags)) + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ { + long int rel; + + switch (c->op_bytes) { + case 2: + rel = insn_fetch(s16, 2, c->eip); + break; + case 4: + rel = insn_fetch(s32, 4, c->eip); + break; + case 8: + rel = insn_fetch(s64, 8, c->eip); + break; + default: + DPRINTF("jnz: Invalid op_bytes\n"); + goto cannot_emulate; + } + if (test_cc(c->b, ctxt->eflags)) + JMP_REL(rel); + c->dst.type = OP_NONE; + break; + } + case 0xa3: + bt: /* bt */ + c->dst.type = OP_NONE; + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); + break; + case 0xab: + bts: /* bts */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + c->src.orig_val = c->src.val; + c->src.val = c->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + if (ctxt->eflags & EFLG_ZF) { + /* Success: write back to memory. */ + c->dst.val = c->src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + c->dst.type = OP_REG; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + } + break; + case 0xb3: + btr: /* btr */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? 
(u8) c->src.val + : (u16) c->src.val; + break; + case 0xba: /* Grp8 */ + switch (c->modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbb: + btc: /* btc */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); + break; + case 0xbe ... 0xbf: /* movsx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : + (s16) c->src.val; + break; + case 0xc3: /* movnti */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : + (u64) c->src.val; + break; + case 0xc7: /* Grp9 (cmpxchg8b) */ + rc = emulate_grp9(ctxt, ops, memop); + if (rc != 0) + goto done; + c->dst.type = OP_NONE; + break; + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", c->b); + c->eip = saved_eip; + return -1; +} diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a63373759f0..5afdde4895d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -67,6 +67,7 @@ #include <asm/mce.h> #include <asm/io.h> #include <asm/i387.h> +#include <asm/reboot.h> /* for struct machine_ops */ /*G:010 Welcome to the Guest! * @@ -813,7 +814,7 @@ static void lguest_safe_halt(void) * rather than virtual addresses, so we use __pa() here. */ static void lguest_power_off(void) { - hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); + hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0); } /* @@ -823,7 +824,7 @@ static void lguest_power_off(void) */ static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) { - hcall(LHCALL_CRASH, __pa(p), 0, 0); + hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0); /* The hcall won't return, but to keep gcc happy, we're "done". */ return NOTIFY_DONE; } @@ -927,6 +928,11 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } +static void lguest_restart(char *reason) +{ + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); +} + /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops * structures in the kernel provide points for (almost) every routine we have * to override to avoid privileged instructions. */ @@ -1060,6 +1066,7 @@ __init void lguest_init(void) * the Guest routine to power off. */ pm_power_off = lguest_power_off; + machine_ops.restart = lguest_restart; /* Now we're set up, call start_kernel() in init/main.c and we proceed * to boot as normal. It never returns. 
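 * (machine_ops.restart now points at lguest_restart() above, so a Guest
 * reboot becomes an LHCALL_SHUTDOWN hypercall whose LGUEST_SHUTDOWN_RESTART
 * argument lets the Host tell a restart apart from a power-off.)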
 */
	start_kernel();
diff --git a/block/bsg.c b/block/bsg.c
index 69b0a9d3330..8917c5174dc 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -279,6 +279,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr)
 			goto out;
 		}
 		rq->next_rq = next_rq;
+		next_rq->cmd_type = rq->cmd_type;
 
 		dxferp = (void*)(unsigned long)hdr->din_xferp;
 		ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len);
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e990..08d4ae20159 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d82d00..0ee9a8a4095 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI)		+= spi/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
 obj-$(CONFIG_DIO)		+= dio/
 obj-$(CONFIG_SBUS)		+= sbus/
-obj-$(CONFIG_KVM)		+= kvm/
 obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
@@ -73,7 +72,7 @@ obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
 obj-$(CONFIG_MCA)		+= mca/
 obj-$(CONFIG_EISA)		+= eisa/
-obj-$(CONFIG_LGUEST_GUEST)	+= lguest/
+obj-y				+= lguest/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
 obj-$(CONFIG_MMC)		+= mmc/
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f484495b2ad..055989e9479 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -163,15 +163,6 @@ static struct kset *bus_kset;
 
 #ifdef CONFIG_HOTPLUG
 /* Manually detach a device from its associated driver. */
-static int driver_helper(struct device *dev, void *data)
-{
-	const char *name = data;
-
-	if (strcmp(name, dev->bus_id) == 0)
-		return 1;
-	return 0;
-}
-
 static ssize_t driver_unbind(struct device_driver *drv,
 			     const char *buf, size_t count)
 {
@@ -179,7 +170,7 @@ static ssize_t driver_unbind(struct device_driver *drv,
 	struct device *dev;
 	int err = -ENODEV;
 
-	dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+	dev = bus_find_device_by_name(bus, NULL, buf);
 	if (dev && dev->driver == drv) {
 		if (dev->parent)	/* Needed for USB */
 			down(&dev->parent->sem);
@@ -206,7 +197,7 @@ static ssize_t driver_bind(struct device_driver *drv,
 	struct device *dev;
 	int err = -ENODEV;
 
-	dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+	dev = bus_find_device_by_name(bus, NULL, buf);
 	if (dev && dev->driver == NULL) {
 		if (dev->parent)	/* Needed for USB */
 			down(&dev->parent->sem);
@@ -250,7 +241,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus,
 {
 	struct device *dev;
 
-	dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+	dev = bus_find_device_by_name(bus, NULL, buf);
 	if (!dev)
 		return -ENODEV;
 	if (bus_rescan_devices_helper(dev, NULL) != 0)
@@ -338,6 +329,32 @@ struct device *bus_find_device(struct bus_type *bus,
 }
 EXPORT_SYMBOL_GPL(bus_find_device);
 
+static int match_name(struct device *dev, void *data)
+{
+	const char *name = data;
+
+	if (strcmp(name, dev->bus_id) == 0)
+		return 1;
+	return 0;
+}
+
+/**
+ * bus_find_device_by_name - device iterator for locating a particular device by name
+ * @bus: bus type
+ * @start: Device to begin with
+ * @name: name of the device to match
+ *
+ * This is similar to the bus_find_device() function above, but it handles
+ * searching by a name automatically, so there is no need to write another
+ * strcmp matching function.
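+ *
+ * Example (an illustrative sketch; "serial8250" is just a plausible
+ * platform device name, and the reference taken by the iterator must
+ * be dropped with put_device() when done):
+ *
+ *	struct device *dev;
+ *
+ *	dev = bus_find_device_by_name(&platform_bus_type, NULL, "serial8250");
+ *	if (dev) {
+ *		...
+ *		put_device(dev);
+ *	}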
+ */ +struct device *bus_find_device_by_name(struct bus_type *bus, + struct device *start, const char *name) +{ + return bus_find_device(bus, start, (void *)name, match_name); +} +EXPORT_SYMBOL_GPL(bus_find_device_by_name); + static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); diff --git a/drivers/base/class.c b/drivers/base/class.c index 59cf35894cf..9d915376c31 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -149,7 +149,7 @@ int class_register(struct class *cls) if (error) return error; -#ifdef CONFIG_SYSFS_DEPRECATED +#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) /* let the block class directory show up in the root of sysfs */ if (cls != &block_class) cls->subsys.kobj.kset = class_kset; @@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device); * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. - + * * Note, you will need to drop the reference with put_device() after use. * * We hold class->sem in this function, so it can not be diff --git a/drivers/base/core.c b/drivers/base/core.c index edf3bbeb8d6..b1727876182 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -27,9 +27,17 @@ int (*platform_notify)(struct device *dev) = NULL; int (*platform_notify_remove)(struct device *dev) = NULL; -/* - * sysfs bindings for devices. - */ +#ifdef CONFIG_BLOCK +static inline int device_is_not_partition(struct device *dev) +{ + return !(dev->type == &part_type); +} +#else +static inline int device_is_not_partition(struct device *dev) +{ + return 1; +} +#endif /** * dev_driver_string - Return a device's driver name, if at all possible @@ -652,14 +660,14 @@ static int device_add_class_symlinks(struct device *dev) #ifdef CONFIG_SYSFS_DEPRECATED /* stacked class devices need a symlink in the class directory */ if (dev->kobj.parent != &dev->class->subsys.kobj && - dev->type != &part_type) { + device_is_not_partition(dev)) { error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, dev->bus_id); if (error) goto out_subsys; } - if (dev->parent && dev->type != &part_type) { + if (dev->parent && device_is_not_partition(dev)) { struct device *parent = dev->parent; char *class_name; @@ -688,11 +696,11 @@ static int device_add_class_symlinks(struct device *dev) return 0; out_device: - if (dev->parent && dev->type != &part_type) + if (dev->parent && device_is_not_partition(dev)) sysfs_remove_link(&dev->kobj, "device"); out_busid: if (dev->kobj.parent != &dev->class->subsys.kobj && - dev->type != &part_type) + device_is_not_partition(dev)) sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); #else /* link in the class directory pointing to the device */ @@ -701,7 +709,7 @@ out_busid: if (error) goto out_subsys; - if (dev->parent && dev->type != &part_type) { + if (dev->parent && device_is_not_partition(dev)) { error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, "device"); if (error) @@ -725,7 +733,7 @@ static void device_remove_class_symlinks(struct device *dev) return; #ifdef CONFIG_SYSFS_DEPRECATED - if (dev->parent && dev->type != &part_type) { + if (dev->parent && device_is_not_partition(dev)) { char *class_name; class_name = make_class_name(dev->class->name, &dev->kobj); @@ -737,10 +745,10 @@ static void device_remove_class_symlinks(struct device *dev) } if (dev->kobj.parent != &dev->class->subsys.kobj && - dev->type != &part_type) + 
device_is_not_partition(dev)) sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); #else - if (dev->parent && dev->type != &part_type) + if (dev->parent && device_is_not_partition(dev)) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f2d2c7e2c76..195ce7c1231 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1571,7 +1571,6 @@ static struct scsi_host_template srp_template = { .this_id = -1, .cmd_per_lun = SRP_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = srp_host_attrs }; diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h deleted file mode 100644 index 11fc014e2b3..00000000000 --- a/drivers/kvm/irq.h +++ /dev/null @@ -1,165 +0,0 @@ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2007, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include "kvm.h" - -typedef void irq_request_func(void *opaque, int level); - -struct kvm_kpic_state { - u8 last_irr; /* edge detection */ - u8 irr; /* interrupt request register */ - u8 imr; /* interrupt mask register */ - u8 isr; /* interrupt service register */ - u8 priority_add; /* highest irq priority */ - u8 irq_base; - u8 read_reg_select; - u8 poll; - u8 special_mask; - u8 init_state; - u8 auto_eoi; - u8 rotate_on_auto_eoi; - u8 special_fully_nested_mode; - u8 init4; /* true if 4 byte init */ - u8 elcr; /* PIIX edge/trigger selection */ - u8 elcr_mask; - struct kvm_pic *pics_state; -}; - -struct kvm_pic { - struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; - int output; /* intr from master PIC */ - struct kvm_io_device dev; -}; - -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -int kvm_cpu_has_interrupt(struct kvm_vcpu *v); -void kvm_pic_update_irq(struct kvm_pic *s); - -#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS -#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ -#define IOAPIC_EDGE_TRIG 0 -#define IOAPIC_LEVEL_TRIG 1 - -#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 -#define IOAPIC_MEM_LENGTH 0x100 - -/* Direct registers. */ -#define IOAPIC_REG_SELECT 0x00 -#define IOAPIC_REG_WINDOW 0x10 -#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ - -/* Indirect registers. 
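- * (reached by writing the register index to IOAPIC_REG_SELECT and
- *  transferring the data through IOAPIC_REG_WINDOW)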
*/ -#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ -#define IOAPIC_REG_VERSION 0x01 -#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ - -struct kvm_ioapic { - u64 base_address; - u32 ioregsel; - u32 id; - u32 irr; - u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; - struct kvm_io_device dev; - struct kvm *kvm; -}; - -struct kvm_lapic { - unsigned long base_address; - struct kvm_io_device dev; - struct { - atomic_t pending; - s64 period; /* unit: ns */ - u32 divide_count; - ktime_t last_update; - struct hrtimer dev; - } timer; - struct kvm_vcpu *vcpu; - struct page *regs_page; - void *regs; -}; - -#ifdef DEBUG -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ - __FILE__, __LINE__, #x); \ - BUG(); \ - } \ -} while (0) -#else -#define ASSERT(x) do { } while (0) -#endif - -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); -int kvm_create_lapic(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); -void kvm_free_apic(struct kvm_lapic *apic); -u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); -struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, - unsigned long bitmap); -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_ioapic_init(struct kvm *kvm); -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); -void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); - -#endif diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c deleted file mode 100644 index feb5ac986c5..00000000000 --- a/drivers/kvm/mmu.c +++ /dev/null @@ -1,1498 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include "vmx.h" -#include "kvm.h" - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/module.h> - -#include <asm/page.h> -#include <asm/cmpxchg.h> - -#undef MMU_DEBUG - -#undef AUDIT - -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif - -#ifdef MMU_DEBUG - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - -#else - -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) - -#endif - -#if defined(MMU_DEBUG) || defined(AUDIT) -static int dbg = 1; -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } -#endif - -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - -#define PT_WRITABLE_SHIFT 1 - -#define PT_PRESENT_MASK (1ULL << 0) -#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) -#define PT_PWT_MASK (1ULL << 3) -#define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) -#define PT_PAT_MASK (1ULL << 7) -#define PT_GLOBAL_MASK (1ULL << 8) -#define PT64_NX_MASK (1ULL << 63) - -#define PT_PAT_SHIFT 7 -#define PT_DIR_PAT_SHIFT 12 -#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) - -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - - -#define PT_FIRST_AVAIL_BITS_SHIFT 9 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 - -#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) - -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) - -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) - - -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT64_ROOT_LEVEL 4 -#define PT32_ROOT_LEVEL 2 -#define PT32E_ROOT_LEVEL 3 - -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - -#define RMAP_EXT 4 - -struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; - struct kvm_rmap_desc *more; -}; - -static struct kmem_cache *pte_chain_cache; -static struct kmem_cache *rmap_desc_cache; -static struct kmem_cache *mmu_page_header_cache; - -static int 
is_write_protection(struct kvm_vcpu *vcpu) -{ - return vcpu->cr0 & X86_CR0_WP; -} - -static int is_cpuid_PSE36(void) -{ - return 1; -} - -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->shadow_efer & EFER_NX; -} - -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - -static int is_writeble_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -static int is_io_pte(unsigned long pte) -{ - return pte & PT_SHADOW_IO_MARK; -} - -static int is_rmap_pte(u64 pte) -{ - return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK)) - == (PT_WRITABLE_MASK | PT_PRESENT_MASK); -} - -static void set_shadow_pte(u64 *sptep, u64 spte) -{ -#ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); -#else - set_64bit((unsigned long long *)sptep, spte); -#endif -} - -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); - if (!obj) - return -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - struct page *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - set_page_private(page, 0); - cache->objects[cache->nobjs++] = page_address(page); - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) -{ - int r; - - kvm_mmu_free_some_pages(vcpu); - r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, - pte_chain_cache, 4); - if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, - rmap_desc_cache, 1); - if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4); - if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; -} - -static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); - mmu_free_memory_cache_page(&vcpu->mmu_page_cache); - mmu_free_memory_cache(&vcpu->mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - memset(p, 0, size); - return p; -} - -static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, - sizeof(struct kvm_pte_chain)); -} - -static void mmu_free_pte_chain(struct kvm_pte_chain *pc) -{ - kfree(pc); -} - -static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, - sizeof(struct kvm_rmap_desc)); -} - -static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) -{ - kfree(rd); -} - -/* - * Reverse mapping data structures: - * - * If page->private bit zero is zero, then page->private points to the - * shadow page table entry that points to page_address(page). 
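- * (A single mapping is the common case, so it is stored inline and bit
- * zero of page->private doubles as the type tag; readers below test it
- * with "page_private(page) & 1" and strip it with "& ~1ul".)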
- * - * If page->private bit zero is one, (then page->private & ~1) points - * to a struct kvm_rmap_desc containing more mappings. - */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) -{ - struct page *page; - struct kvm_rmap_desc *desc; - int i; - - if (!is_rmap_pte(*spte)) - return; - page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - if (!page_private(page)) { - rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); - set_page_private(page,(unsigned long)spte); - } else if (!(page_private(page) & 1)) { - rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)page_private(page); - desc->shadow_ptes[1] = spte; - set_page_private(page,(unsigned long)desc | 1); - } else { - rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) - desc = desc->more; - if (desc->shadow_ptes[RMAP_EXT-1]) { - desc->more = mmu_alloc_rmap_desc(vcpu); - desc = desc->more; - } - for (i = 0; desc->shadow_ptes[i]; ++i) - ; - desc->shadow_ptes[i] = spte; - } -} - -static void rmap_desc_remove_entry(struct page *page, - struct kvm_rmap_desc *desc, - int i, - struct kvm_rmap_desc *prev_desc) -{ - int j; - - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) - ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; - if (j != 0) - return; - if (!prev_desc && !desc->more) - set_page_private(page,(unsigned long)desc->shadow_ptes[0]); - else - if (prev_desc) - prev_desc->more = desc->more; - else - set_page_private(page,(unsigned long)desc->more | 1); - mmu_free_rmap_desc(desc); -} - -static void rmap_remove(u64 *spte) -{ - struct page *page; - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - int i; - - if (!is_rmap_pte(*spte)) - return; - page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - if (!page_private(page)) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); - BUG(); - } else if (!(page_private(page) & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); - if ((u64 *)page_private(page) != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); - BUG(); - } - set_page_private(page,0); - } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); - prev_desc = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { - rmap_desc_remove_entry(page, - desc, i, - prev_desc); - return; - } - prev_desc = desc; - desc = desc->more; - } - BUG(); - } -} - -static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) -{ - struct kvm *kvm = vcpu->kvm; - struct page *page; - struct kvm_rmap_desc *desc; - u64 *spte; - - page = gfn_to_page(kvm, gfn); - BUG_ON(!page); - - while (page_private(page)) { - if (!(page_private(page) & 1)) - spte = (u64 *)page_private(page); - else { - desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); - spte = desc->shadow_ptes[0]; - } - BUG_ON(!spte); - BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT - != page_to_pfn(page)); - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON(!(*spte & PT_WRITABLE_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - rmap_remove(spte); - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 
*spt) -{ - u64 *pos; - u64 *end; - - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if (*pos != 0) { - printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, - pos, *pos); - return 0; - } - return 1; -} -#endif - -static void kvm_mmu_free_page(struct kvm *kvm, - struct kvm_mmu_page *page_head) -{ - ASSERT(is_empty_shadow_page(page_head->spt)); - list_del(&page_head->link); - __free_page(virt_to_page(page_head->spt)); - kfree(page_head); - ++kvm->n_free_mmu_pages; -} - -static unsigned kvm_page_table_hashfn(gfn_t gfn) -{ - return gfn; -} - -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) -{ - struct kvm_mmu_page *page; - - if (!vcpu->kvm->n_free_mmu_pages) - return NULL; - - page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, - sizeof *page); - page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); - set_page_private(virt_to_page(page->spt), (unsigned long)page); - list_add(&page->link, &vcpu->kvm->active_mmu_pages); - ASSERT(is_empty_shadow_page(page->spt)); - page->slot_bitmap = 0; - page->multimapped = 0; - page->parent_pte = parent_pte; - --vcpu->kvm->n_free_mmu_pages; - return page; -} - -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page, u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!parent_pte) - return; - if (!page->multimapped) { - u64 *old = page->parent_pte; - - if (!old) { - page->parent_pte = parent_pte; - return; - } - page->multimapped = 1; - pte_chain = mmu_alloc_pte_chain(vcpu); - INIT_HLIST_HEAD(&page->parent_ptes); - hlist_add_head(&pte_chain->link, &page->parent_ptes); - pte_chain->parent_ptes[0] = old; - } - hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { - if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) - continue; - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) - if (!pte_chain->parent_ptes[i]) { - pte_chain->parent_ptes[i] = parent_pte; - return; - } - } - pte_chain = mmu_alloc_pte_chain(vcpu); - BUG_ON(!pte_chain); - hlist_add_head(&pte_chain->link, &page->parent_ptes); - pte_chain->parent_ptes[0] = parent_pte; -} - -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, - u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!page->multimapped) { - BUG_ON(page->parent_pte != parent_pte); - page->parent_pte = NULL; - return; - } - hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - if (pte_chain->parent_ptes[i] != parent_pte) - continue; - while (i + 1 < NR_PTE_CHAIN_ENTRIES - && pte_chain->parent_ptes[i + 1]) { - pte_chain->parent_ptes[i] - = pte_chain->parent_ptes[i + 1]; - ++i; - } - pte_chain->parent_ptes[i] = NULL; - if (i == 0) { - hlist_del(&pte_chain->link); - mmu_free_pte_chain(pte_chain); - if (hlist_empty(&page->parent_ptes)) { - page->multimapped = 0; - page->parent_pte = NULL; - } - } - return; - } - BUG(); -} - -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, - gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *page; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->mmu_page_hash[index]; - hlist_for_each_entry(page, node, bucket, hash_link) - if (page->gfn == gfn && !page->role.metaphysical) { - pgprintk("%s: found role %x\n", - 
__FUNCTION__, page->role.word); - return page; - } - return NULL; -} - -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int metaphysical, - unsigned hugepage_access, - u64 *parent_pte) -{ - union kvm_mmu_page_role role; - unsigned index; - unsigned quadrant; - struct hlist_head *bucket; - struct kvm_mmu_page *page; - struct hlist_node *node; - - role.word = 0; - role.glevels = vcpu->mmu.root_level; - role.level = level; - role.metaphysical = metaphysical; - role.hugepage_access = hugepage_access; - if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, - gfn, role.word); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->mmu_page_hash[index]; - hlist_for_each_entry(page, node, bucket, hash_link) - if (page->gfn == gfn && page->role.word == role.word) { - mmu_page_add_parent_pte(vcpu, page, parent_pte); - pgprintk("%s: found\n", __FUNCTION__); - return page; - } - page = kvm_mmu_alloc_page(vcpu, parent_pte); - if (!page) - return page; - pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); - page->gfn = gfn; - page->role = role; - hlist_add_head(&page->hash_link, bucket); - if (!metaphysical) - rmap_write_protect(vcpu, gfn); - return page; -} - -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *page) -{ - unsigned i; - u64 *pt; - u64 ent; - - pt = page->spt; - - if (page->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (pt[i] & PT_PRESENT_MASK) - rmap_remove(&pt[i]); - pt[i] = 0; - } - kvm_flush_remote_tlbs(kvm); - return; - } - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - ent = pt[i]; - - pt[i] = 0; - if (!(ent & PT_PRESENT_MASK)) - continue; - ent &= PT64_BASE_ADDR_MASK; - mmu_page_remove_parent_pte(page_header(ent), &pt[i]); - } - kvm_flush_remote_tlbs(kvm); -} - -static void kvm_mmu_put_page(struct kvm_mmu_page *page, - u64 *parent_pte) -{ - mmu_page_remove_parent_pte(page, parent_pte); -} - -static void kvm_mmu_zap_page(struct kvm *kvm, - struct kvm_mmu_page *page) -{ - u64 *parent_pte; - - while (page->multimapped || page->parent_pte) { - if (!page->multimapped) - parent_pte = page->parent_pte; - else { - struct kvm_pte_chain *chain; - - chain = container_of(page->parent_ptes.first, - struct kvm_pte_chain, link); - parent_pte = chain->parent_ptes[0]; - } - BUG_ON(!parent_pte); - kvm_mmu_put_page(page, parent_pte); - set_shadow_pte(parent_pte, 0); - } - kvm_mmu_page_unlink_children(kvm, page); - if (!page->root_count) { - hlist_del(&page->hash_link); - kvm_mmu_free_page(kvm, page); - } else - list_move(&page->link, &kvm->active_mmu_pages); -} - -static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *page; - struct hlist_node *node, *n; - int r; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - r = 0; - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->mmu_page_hash[index]; - hlist_for_each_entry_safe(page, node, n, bucket, hash_link) - if (page->gfn == gfn && !page->role.metaphysical) { - pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, - page->role.word); - kvm_mmu_zap_page(vcpu->kvm, page); - r = 1; - } - return r; -} - -static void mmu_unshadow(struct kvm_vcpu 
*vcpu, gfn_t gfn) -{ - struct kvm_mmu_page *page; - - while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", - __FUNCTION__, gfn, page->role.word); - kvm_mmu_zap_page(vcpu->kvm, page); - } -} - -static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) -{ - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); - struct kvm_mmu_page *page_head = page_header(__pa(pte)); - - __set_bit(slot, &page_head->slot_bitmap); -} - -hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - hpa_t hpa = gpa_to_hpa(vcpu, gpa); - - return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; -} - -hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - struct page *page; - - ASSERT((gpa & HPA_ERR_MASK) == 0); - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!page) - return gpa | HPA_ERR_MASK; - return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) - | (gpa & (PAGE_SIZE-1)); -} - -hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); - - if (gpa == UNMAPPED_GVA) - return UNMAPPED_GVA; - return gpa_to_hpa(vcpu, gpa); -} - -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); - - if (gpa == UNMAPPED_GVA) - return NULL; - return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); -} - -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ -} - -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) -{ - int level = PT32E_ROOT_LEVEL; - hpa_t table_addr = vcpu->mmu.root_hpa; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; - u64 pte; - - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); - - if (level == 1) { - pte = table[index]; - if (is_present_pte(pte) && is_writeble_pte(pte)) - return 0; - mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); - page_header_update_slot(vcpu->kvm, table, v); - table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | - PT_USER_MASK; - rmap_add(vcpu, &table[index]); - return 0; - } - - if (table[index] == 0) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, 0, &table[index]); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - return -ENOMEM; - } - - table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; - } -} - -static void mmu_free_roots(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_mmu_page *page; - - if (!VALID_PAGE(vcpu->mmu.root_hpa)) - return; -#ifdef CONFIG_X86_64 - if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->mmu.root_hpa; - - page = page_header(root); - --page->root_count; - vcpu->mmu.root_hpa = INVALID_PAGE; - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->mmu.pae_root[i]; - - if (root) { - root &= PT64_BASE_ADDR_MASK; - page = page_header(root); - --page->root_count; - } - vcpu->mmu.pae_root[i] = INVALID_PAGE; - } - vcpu->mmu.root_hpa = INVALID_PAGE; -} - -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - int i; - gfn_t root_gfn; - struct kvm_mmu_page *page; - - root_gfn = vcpu->cr3 >> PAGE_SHIFT; - -#ifdef CONFIG_X86_64 - if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->mmu.root_hpa; - - ASSERT(!VALID_PAGE(root)); - page = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, 0, NULL); - root = __pa(page->spt); - 
++page->root_count; - vcpu->mmu.root_hpa = root; - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->mmu.pae_root[i]; - - ASSERT(!VALID_PAGE(root)); - if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->pdptrs[i])) { - vcpu->mmu.pae_root[i] = 0; - continue; - } - root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; - } else if (vcpu->mmu.root_level == 0) - root_gfn = 0; - page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, !is_paging(vcpu), - 0, NULL); - root = __pa(page->spt); - ++page->root_count; - vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; - } - vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); -} - -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) -{ - return vaddr; -} - -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code) -{ - gpa_t addr = gva; - hpa_t paddr; - int r; - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); - - - paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); - - if (is_error_hpa(paddr)) - return 1; - - return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); -} - -static void nonpaging_free(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->mmu; - - context->new_cr3 = nonpaging_new_cr3; - context->page_fault = nonpaging_page_fault; - context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; - context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); -} - -static void paging_new_cr3(struct kvm_vcpu *vcpu) -{ - pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); - mmu_free_roots(vcpu); -} - -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) -{ - kvm_x86_ops->inject_page_fault(vcpu, addr, err_code); -} - -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - -#define PTTYPE 64 -#include "paging_tmpl.h" -#undef PTTYPE - -#define PTTYPE 32 -#include "paging_tmpl.h" -#undef PTTYPE - -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) -{ - struct kvm_mmu *context = &vcpu->mmu; - - ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; - context->page_fault = paging64_page_fault; - context->gva_to_gpa = paging64_gva_to_gpa; - context->free = paging_free; - context->root_level = level; - context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging64_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); -} - -static int paging32_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->mmu; - - context->new_cr3 = paging_new_cr3; - context->page_fault = paging32_page_fault; - context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; - context->root_level = PT32_ROOT_LEVEL; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging32E_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); -} - -static int init_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - - if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); - else if (is_long_mode(vcpu)) - 
return paging64_init_context(vcpu); - else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); - else - return paging32_init_context(vcpu); -} - -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - if (VALID_PAGE(vcpu->mmu.root_hpa)) { - vcpu->mmu.free(vcpu); - vcpu->mmu.root_hpa = INVALID_PAGE; - } -} - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); - -int kvm_mmu_load(struct kvm_vcpu *vcpu) -{ - int r; - - mutex_lock(&vcpu->kvm->lock); - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - mmu_alloc_roots(vcpu); - kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); - kvm_mmu_flush_tlb(vcpu); -out: - mutex_unlock(&vcpu->kvm->lock); - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_load); - -void kvm_mmu_unload(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page, - u64 *spte) -{ - u64 pte; - struct kvm_mmu_page *child; - - pte = *spte; - if (is_present_pte(pte)) { - if (page->role.level == PT_PAGE_TABLE_LEVEL) - rmap_remove(spte); - else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, spte); - } - } - set_shadow_pte(spte, 0); - kvm_flush_remote_tlbs(vcpu->kvm); -} - -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page, - u64 *spte, - const void *new, int bytes) -{ - if (page->role.level != PT_PAGE_TABLE_LEVEL) - return; - - if (page->role.glevels == PT32_ROOT_LEVEL) - paging32_update_pte(vcpu, page, spte, new, bytes); - else - paging64_update_pte(vcpu, page, spte, new, bytes); -} - -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - struct kvm_mmu_page *page; - struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned quadrant; - int level; - int flooded = 0; - int npte; - - pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); - if (gfn == vcpu->last_pt_write_gfn) { - ++vcpu->last_pt_write_count; - if (vcpu->last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->last_pt_write_gfn = gfn; - vcpu->last_pt_write_count = 1; - } - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->mmu_page_hash[index]; - hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { - if (page->gfn != gfn || page->role.metaphysical) - continue; - pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; - misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); - misaligned |= bytes < 4; - if (misaligned || flooded) { - /* - * Misaligned accesses are too much trouble to fix - * up; also, they usually indicate a page is not used - * as a page table. - * - * If we're seeing too many writes to a page, - * it may no longer be a page table, or we may be - * forking, in which case it is better to unmap the - * page. - */ - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, page->role.word); - kvm_mmu_zap_page(vcpu->kvm, page); - continue; - } - page_offset = offset; - level = page->role.level; - npte = 1; - if (page->role.glevels == PT32_ROOT_LEVEL) { - page_offset <<= 1; /* 32->64 */ - /* - * A 32-bit pde maps 4MB while the shadow pdes map - * only 2MB. So we need to double the offset again - * and zap two pdes instead of one. 
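- * (Concretely: a guest pte is 4 bytes but a shadow pte is 8, hence the
- * first "32->64" doubling above; one guest 4MB pde covers two 2MB
- * shadow pdes, hence the second doubling and npte = 2 below.)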
- */ - if (level == PT32_ROOT_LEVEL) { - page_offset &= ~7; /* kill rounding error */ - page_offset <<= 1; - npte = 2; - } - quadrant = page_offset >> PAGE_SHIFT; - page_offset &= ~PAGE_MASK; - if (quadrant != page->role.quadrant) - continue; - } - spte = &page->spt[page_offset / sizeof(*spte)]; - while (npte--) { - mmu_pte_write_zap_pte(vcpu, page, spte); - mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); - ++spte; - } - } -} - -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); - - return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); -} - -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { - struct kvm_mmu_page *page; - - page = container_of(vcpu->kvm->active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, page); - } -} - -static void free_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *page; - - while (!list_empty(&vcpu->kvm->active_mmu_pages)) { - page = container_of(vcpu->kvm->active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, page); - } - free_page((unsigned long)vcpu->mmu.pae_root); -} - -static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct page *page; - int i; - - ASSERT(vcpu); - - vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; - - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone. - */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) - goto error_1; - vcpu->mmu.pae_root = page_address(page); - for (i = 0; i < 4; ++i) - vcpu->mmu.pae_root[i] = INVALID_PAGE; - - return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; -} - -int kvm_mmu_create(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - - return alloc_mmu_pages(vcpu); -} - -int kvm_mmu_setup(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - - return init_kvm_mmu(vcpu); -} - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) -{ - struct kvm_mmu_page *page; - - list_for_each_entry(page, &kvm->active_mmu_pages, link) { - int i; - u64 *pt; - - if (!test_bit(slot, &page->slot_bitmap)) - continue; - - pt = page->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) { - rmap_remove(&pt[i]); - pt[i] &= ~PT_WRITABLE_MASK; - } - } -} - -void kvm_mmu_zap_all(struct kvm *kvm) -{ - struct kvm_mmu_page *page, *node; - - list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link) - kvm_mmu_zap_page(kvm, page); - - kvm_flush_remote_tlbs(kvm); -} - -void kvm_mmu_module_exit(void) -{ - if (pte_chain_cache) - kmem_cache_destroy(pte_chain_cache); - if (rmap_desc_cache) - kmem_cache_destroy(rmap_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); -} - -int kvm_mmu_module_init(void) -{ - pte_chain_cache = kmem_cache_create("kvm_pte_chain", - sizeof(struct kvm_pte_chain), - 0, 0, NULL); - if (!pte_chain_cache) - goto nomem; - rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", - sizeof(struct kvm_rmap_desc), - 0, 0, NULL); - if (!rmap_desc_cache) - goto nomem; - - mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), 
- 0, 0, NULL); - if (!mmu_page_header_cache) - goto nomem; - - return 0; - -nomem: - kvm_mmu_module_exit(); - return -ENOMEM; -} - -#ifdef AUDIT - -static const char *audit_msg; - -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; -#endif - return gva; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - - va = canonicalize(va); - if (level > 1) - audit_mappings_page(vcpu, ent, va, level - 1); - else { - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); - hpa_t hpa = gpa_to_hpa(vcpu, gpa); - - if ((ent & PT_PRESENT_MASK) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx\n", - audit_msg, vcpu->mmu.root_level, - va, gpa, hpa, ent); - } - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->mmu.pae_root[i], - i << 30, - 2); -} - -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - int i, j, k; - - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; - struct kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - struct page *page = m->phys_mem[j]; - - if (!page->private) - continue; - if (!(page->private & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(page->private & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - return nmaps; -} - -static int count_writable_mappings(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - struct kvm_mmu_page *page; - int i; - - list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { - u64 *pt = page->spt; - - if (page->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!(ent & PT_WRITABLE_MASK)) - continue; - ++nmaps; - } - } - return nmaps; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __FUNCTION__, audit_msg, n_rmap, n_actual); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *page; - - list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { - hfn_t hfn; - struct page *pg; - - if (page->role.metaphysical) - continue; - - hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) - >> PAGE_SHIFT; - pg = pfn_to_page(hfn); - if (pg->private) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", - __FUNCTION__, audit_msg, page->gfn, - page->role.word); - } -} - -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) -{ - int olddbg = dbg; - - dbg = 0; - audit_msg = msg; - audit_rmap(vcpu); - audit_write_protection(vcpu); - audit_mappings(vcpu); - dbg = olddbg; -} - -#endif diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h deleted file mode 100644 index 6b094b44f8f..00000000000 --- 
a/drivers/kvm/paging_tmpl.h +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -/* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. - */ - -#if PTTYPE == 64 - #define pt_element_t u64 - #define guest_walker guest_walker64 - #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) - #ifdef CONFIG_X86_64 - #define PT_MAX_FULL_LEVELS 4 - #else - #define PT_MAX_FULL_LEVELS 2 - #endif -#elif PTTYPE == 32 - #define pt_element_t u32 - #define guest_walker guest_walker32 - #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) - #define PT_MAX_FULL_LEVELS 2 -#else - #error Invalid PTTYPE value -#endif - -/* - * The guest_walker structure emulates the behavior of the hardware page - * table walker. - */ -struct guest_walker { - int level; - gfn_t table_gfn[PT_MAX_FULL_LEVELS]; - pt_element_t *table; - pt_element_t pte; - pt_element_t *ptep; - struct page *page; - int index; - pt_element_t inherited_ar; - gfn_t gfn; - u32 error_code; -}; - -/* - * Fetch a guest pte for a guest virtual address - */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) -{ - hpa_t hpa; - struct kvm_memory_slot *slot; - pt_element_t *ptep; - pt_element_t root; - gfn_t table_gfn; - - pgprintk("%s: addr %lx\n", __FUNCTION__, addr); - walker->level = vcpu->mmu.root_level; - walker->table = NULL; - walker->page = NULL; - walker->ptep = NULL; - root = vcpu->cr3; -#if PTTYPE == 64 - if (!is_long_mode(vcpu)) { - walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; - root = *walker->ptep; - walker->pte = root; - if (!(root & PT_PRESENT_MASK)) - goto not_present; - --walker->level; - } -#endif - table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - walker->table_gfn[walker->level - 1] = table_gfn; - pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, - walker->level - 1, table_gfn); - slot = gfn_to_memslot(vcpu->kvm, table_gfn); - hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); - walker->page = pfn_to_page(hpa >> PAGE_SHIFT); - walker->table = kmap_atomic(walker->page, KM_USER0); - - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); - - walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; - - for (;;) { - int index = PT_INDEX(addr, walker->level); - hpa_t paddr; - - ptep = &walker->table[index]; - walker->index = index; - ASSERT(((unsigned long)walker->table & PAGE_MASK) == - ((unsigned long)ptep & PAGE_MASK)); - - if (!is_present_pte(*ptep)) 
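/* A minimal sketch of the compile-twice idiom described at the top of
 * this file; the driving file and the exact include lines are an
 * assumption for illustration, not quoted from the patch:
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 * With that, FNAME(walk_addr) expands to paging64_walk_addr() on the
 * first pass and paging32_walk_addr() on the second, each reading
 * guest ptes of the matching pt_element_t width. */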
- goto not_present; - - if (write_fault && !is_writeble_pte(*ptep)) - if (user_fault || is_write_protection(vcpu)) - goto access_error; - - if (user_fault && !(*ptep & PT_USER_MASK)) - goto access_error; - -#if PTTYPE == 64 - if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK)) - goto access_error; -#endif - - if (!(*ptep & PT_ACCESSED_MASK)) { - mark_page_dirty(vcpu->kvm, table_gfn); - *ptep |= PT_ACCESSED_MASK; - } - - if (walker->level == PT_PAGE_TABLE_LEVEL) { - walker->gfn = (*ptep & PT_BASE_ADDR_MASK) - >> PAGE_SHIFT; - break; - } - - if (walker->level == PT_DIRECTORY_LEVEL - && (*ptep & PT_PAGE_SIZE_MASK) - && (PTTYPE == 64 || is_pse(vcpu))) { - walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); - break; - } - - walker->inherited_ar &= walker->table[index]; - table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; - kunmap_atomic(walker->table, KM_USER0); - paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); - walker->page = pfn_to_page(paddr >> PAGE_SHIFT); - walker->table = kmap_atomic(walker->page, KM_USER0); - --walker->level; - walker->table_gfn[walker->level - 1 ] = table_gfn; - pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, - walker->level - 1, table_gfn); - } - walker->pte = *ptep; - if (walker->page) - walker->ptep = NULL; - if (walker->table) - kunmap_atomic(walker->table, KM_USER0); - pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); - return 1; - -not_present: - walker->error_code = 0; - goto err; - -access_error: - walker->error_code = PFERR_PRESENT_MASK; - -err: - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) - walker->error_code |= PFERR_FETCH_MASK; - if (walker->table) - kunmap_atomic(walker->table, KM_USER0); - return 0; -} - -static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, - struct guest_walker *walker) -{ - mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); -} - -static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, - u64 *shadow_pte, - gpa_t gaddr, - pt_element_t gpte, - u64 access_bits, - int user_fault, - int write_fault, - int *ptwrite, - struct guest_walker *walker, - gfn_t gfn) -{ - hpa_t paddr; - int dirty = gpte & PT_DIRTY_MASK; - u64 spte = *shadow_pte; - int was_rmapped = is_rmap_pte(spte); - - pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" - " user_fault %d gfn %lx\n", - __FUNCTION__, spte, (u64)gpte, access_bits, - write_fault, user_fault, gfn); - - if (write_fault && !dirty) { - pt_element_t *guest_ent, *tmp = NULL; - - if (walker->ptep) - guest_ent = walker->ptep; - else { - tmp = kmap_atomic(walker->page, KM_USER0); - guest_ent = &tmp[walker->index]; - } - - *guest_ent |= PT_DIRTY_MASK; - if (!walker->ptep) - kunmap_atomic(tmp, KM_USER0); - dirty = 1; - FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); - } - - spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; - spte |= gpte & PT64_NX_MASK; - if (!dirty) - access_bits &= ~PT_WRITABLE_MASK; - - paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - - spte |= PT_PRESENT_MASK; - if (access_bits & PT_USER_MASK) - spte |= PT_USER_MASK; - - if (is_error_hpa(paddr)) { - spte |= gaddr; - spte |= PT_SHADOW_IO_MARK; - spte &= ~PT_PRESENT_MASK; - set_shadow_pte(shadow_pte, spte); - return; - } - - spte |= paddr; - - if ((access_bits & PT_WRITABLE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; - - spte |= PT_WRITABLE_MASK; - if (user_fault) { 
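/* Why unshadow on a user-level fault?  (Reasoning inferred from the
 * surrounding code, not stated in the patch.)  Guest page tables are
 * written by the guest kernel; a CPL3 write to a page we shadow as a
 * page table suggests the guest now uses that page as ordinary data,
 * so it is cheaper to stop shadowing it than to trap every write. */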
- mmu_unshadow(vcpu, gfn); - goto unshadowed; - } - - shadow = kvm_mmu_lookup_page(vcpu, gfn); - if (shadow) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - access_bits &= ~PT_WRITABLE_MASK; - if (is_writeble_pte(spte)) { - spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; - } - } - -unshadowed: - - if (access_bits & PT_WRITABLE_MASK) - mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); - - set_shadow_pte(shadow_pte, spte); - page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); - if (!was_rmapped) - rmap_add(vcpu, shadow_pte); -} - -static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, - u64 *shadow_pte, u64 access_bits, - int user_fault, int write_fault, int *ptwrite, - struct guest_walker *walker, gfn_t gfn) -{ - access_bits &= gpte; - FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, - gpte, access_bits, user_fault, write_fault, - ptwrite, walker, gfn); -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, - u64 *spte, const void *pte, int bytes) -{ - pt_element_t gpte; - - if (bytes < sizeof(pt_element_t)) - return; - gpte = *(const pt_element_t *)pte; - if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) - return; - pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, - 0, NULL, NULL, - (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); -} - -static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, - u64 *shadow_pte, u64 access_bits, - int user_fault, int write_fault, int *ptwrite, - struct guest_walker *walker, gfn_t gfn) -{ - gpa_t gaddr; - - access_bits &= gpde; - gaddr = (gpa_t)gfn << PAGE_SHIFT; - if (PTTYPE == 32 && is_cpuid_PSE36()) - gaddr |= (gpde & PT32_DIR_PSE36_MASK) << - (32 - PT32_DIR_PSE36_SHIFT); - FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, - gpde, access_bits, user_fault, write_fault, - ptwrite, walker, gfn); -} - -/* - * Fetch a shadow pte for a specific level in the paging hierarchy. 
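 * (Orientation, inferred from the code below rather than stated here:
 * for a 64-bit guest the walk starts at the 4-level root in
 * vcpu->mmu.root_hpa; for a PAE guest it starts one level down, at the
 * pae_root entry picked by bits 31:30 of the address.  Missing
 * intermediate shadow pages are created on the way down by
 * kvm_mmu_get_page().)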
- */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int *ptwrite) -{ - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - u64 *prev_shadow_ent = NULL; - - if (!is_present_pte(walker->pte)) - return NULL; - - shadow_addr = vcpu->mmu.root_hpa; - level = vcpu->mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; - } - - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - unsigned hugepage_access = 0; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { - if (level == PT_PAGE_TABLE_LEVEL) - break; - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - prev_shadow_ent = shadow_ent; - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) - break; - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - hugepage_access = walker->pte; - hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; - if (walker->pte & PT64_NX_MASK) - hugepage_access |= (1 << 2); - hugepage_access >>= PT_WRITABLE_SHIFT; - table_gfn = (walker->pte & PT_BASE_ADDR_MASK) - >> PAGE_SHIFT; - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, hugepage_access, - shadow_ent); - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *shadow_ent = shadow_pte; - prev_shadow_ent = shadow_ent; - } - - if (walker->level == PT_DIRECTORY_LEVEL) { - FNAME(set_pde)(vcpu, walker->pte, shadow_ent, - walker->inherited_ar, user_fault, write_fault, - ptwrite, walker, walker->gfn); - } else { - ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); - FNAME(set_pte)(vcpu, walker->pte, shadow_ent, - walker->inherited_ar, user_fault, write_fault, - ptwrite, walker, walker->gfn); - } - return shadow_ent; -} - -/* - * Page fault handler. There are several causes for a page fault: - * - there is no shadow pte for the guest pte - * - write access through a shadow pte marked read only so that we can set - * the dirty bit - * - write access to a shadow pte marked read only so we can update the page - * dirty bitmap, when userspace requests it - * - mmio access; in this case we will never install a present shadow pte - * - normal guest page fault due to the guest pte marked not present, not - * writable, or not executable - * - * Returns: 1 if we need to emulate the instruction, 0 otherwise, or - * a negative value on error. - */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, - u32 error_code) -{ - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; - struct guest_walker walker; - u64 *shadow_pte; - int write_pt = 0; - int r; - - pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - /* - * Look up the shadow pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); - - /* - * The page is not mapped by the guest. Let the guest handle it. 
- */
-	if (!r) {
-		pgprintk("%s: guest page fault\n", __FUNCTION__);
-		inject_page_fault(vcpu, addr, walker.error_code);
-		vcpu->last_pt_write_count = 0; /* reset fork detector */
-		return 0;
-	}
-
-	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  &write_pt);
-	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
-		 shadow_pte, *shadow_pte, write_pt);
-
-	if (!write_pt)
-		vcpu->last_pt_write_count = 0; /* reset fork detector */
-
-	/*
-	 * mmio: emulate if accessible, otherwise it's a guest fault.
-	 */
-	if (is_io_pte(*shadow_pte))
-		return 1;
-
-	++vcpu->stat.pf_fixed;
-	kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
-	return write_pt;
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
-	struct guest_walker walker;
-	gpa_t gpa = UNMAPPED_GVA;
-	int r;
-
-	r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-
-	if (r) {
-		gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
-		gpa |= vaddr & ~PAGE_MASK;
-	}
-
-	return gpa;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef SHADOW_PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
-#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6bf89..00000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
-/******************************************************************************
- * x86_emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- *
- *   Avi Kivity <avi@qumranet.com>
- *   Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
-#else
-#include "kvm.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include "x86_emulate.h"
-#include <linux/module.h>
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp      (1<<0)	/* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
-#define DstReg      (2<<1)	/* Register operand. */
-#define DstMem      (3<<1)	/* Memory operand. */
-#define DstMask     (3<<1)
-/* Source operand type. */
-#define SrcNone     (0<<3)	/* No source operand. */
-#define SrcImplicit (0<<3)	/* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<3)	/* Register operand. */
-#define SrcMem      (2<<3)	/* Memory operand. */
-#define SrcMem16    (3<<3)	/* Memory operand (16-bit). */
-#define SrcMem32    (4<<3)	/* Memory operand (32-bit). */
-#define SrcImm      (5<<3)	/* Immediate operand. */
-#define SrcImmByte  (6<<3)	/* 8-bit sign-extended immediate operand. */
-#define SrcMask     (7<<3)
-/* Generic ModRM decode.
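 * (Worked example of what these flags encode, using the first entry of
 * opcode_table[] below: opcode 0x00 is ADD r/m8,r8, and its entry
 * ByteOp | DstMem | SrcReg | ModRM says: byte-sized operands, the
 * destination comes from the ModRM r/m field, the source from the
 * ModRM reg field.)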
*/ -#define ModRM (1<<6) -/* Destination is only written; never read. */ -#define Mov (1<<7) -#define BitOp (1<<8) - -static u8 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x40 - 0x4F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x50 - 0x57 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x58 - 0x5F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x60 - 0x67 */ - 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - 0, 0, ImplicitOps|Mov, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, - ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, - ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, - ByteOp | ImplicitOps, ImplicitOps, - /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, - ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, - ByteOp | ImplicitOps, ImplicitOps, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, 0, 0, 
0, 0, 0, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE8 - 0xEF */ - ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps, 0, - ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - /* 0xF8 - 0xFF */ - 0, 0, 0, 0, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM -}; - -static u16 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; -}; - -/* EFLAGS bit definitions. */ -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. 
This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* - * These EFLAGS bits are restored from saved value during emulation, and - * any changes are written back to the saved value after emulation. - */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) - -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ - "push %"_sav"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - /* _sav &= ~msk; */ \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",%"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","4","2") \ - _op"w %"_wx"3,%1; " \ - _POST_EFLAGS("0","4","2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","4","2") \ - _op"l %"_lx"3,%1; " \ - _POST_EFLAGS("0","4","2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ - break; \ - case 8: \ - __emulate_2op_8byte(_op, _src, _dst, \ - _eflags, _qx, _qy); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ( (_dst).bytes ) \ - { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","4","2") \ - _op"b %"_bx"3,%1; " \ - _POST_EFLAGS("0","4","2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _by ((_src).val), "i" (EFLAGS_MASK) ); \ - break; \ - default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") - -/* Instruction has only one explicit operand (no source operand). 
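 * (The _PRE_EFLAGS/_POST_EFLAGS dance used by these macros boils down
 * to the following standalone sketch, assuming gcc-style x86 inline
 * asm; it is an illustration, not code from this file:
 *
 *	unsigned long flags;
 *	asm("add %2,%0; pushf; pop %1"
 *	    : "+r"(a), "=r"(flags)
 *	    : "r"(b));
 *
 * After the add, 'flags' holds the resulting EFLAGS, which is how the
 * emulator picks up CF/ZF/etc. without recomputing them by hand.)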
*/ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - unsigned long _tmp; \ - \ - switch ( (_dst).bytes ) \ - { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","3","2") \ - _op"b %1; " \ - _POST_EFLAGS("0","3","2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK) ); \ - break; \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","3","2") \ - _op"w %1; " \ - _POST_EFLAGS("0","3","2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK) ); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","3","2") \ - _op"l %1; " \ - _POST_EFLAGS("0","3","2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK) ); \ - break; \ - case 8: \ - __emulate_1op_8byte(_op, _dst, _eflags); \ - break; \ - } \ - } while (0) - -/* Emulate an instruction with quadword operands (x86/64 only). */ -#if defined(CONFIG_X86_64) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","4","2") \ - _op"q %"_qx"3,%1; " \ - _POST_EFLAGS("0","4","2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ - } while (0) - -#define __emulate_1op_8byte(_op, _dst, _eflags) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0","3","2") \ - _op"q %1; " \ - _POST_EFLAGS("0","3","2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : "i" (EFLAGS_MASK) ); \ - } while (0) - -#elif defined(__i386__) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) -#define __emulate_1op_8byte(_op, _dst, _eflags) -#endif /* __i386__ */ - -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ - (_size), ctxt->vcpu); \ - if ( rc != 0 ) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -/* Access/update address held in a register, based on addressing mode. */ -#define address_mask(reg) \ - ((ad_bytes == sizeof(unsigned long)) ? \ - (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1))) -#define register_address(base, reg) \ - ((base) + address_mask(reg)) -#define register_address_increment(reg, inc) \ - do { \ - /* signed type ensures sign extension to long */ \ - int _inc = (inc); \ - if ( ad_bytes == sizeof(unsigned long) ) \ - (reg) += _inc; \ - else \ - (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ - (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ - } while (0) - -#define JMP_REL(rel) \ - do { \ - register_address_increment(_eip, rel); \ - } while (0) - -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
- */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) -{ - void *p; - - p = ®s[modrm_reg]; - if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; - return p; -} - -static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *ptr, - u16 *size, unsigned long *address, int op_bytes) -{ - int rc; - - if (op_bytes == 2) - op_bytes = 3; - *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) - return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); - return rc; -} - -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); -} - -int -x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - unsigned d; - u8 b, sib, twobyte = 0, rex_prefix = 0; - u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; - unsigned long *override_base = NULL; - unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; - int rc = 0; - struct operand src, dst; - unsigned long cr2 = ctxt->cr2; - int mode = ctxt->mode; - unsigned long modrm_ea; - int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; - int no_wb = 0; - u64 msr_data; - - /* Shadow copy of register state. Committed on successful emulation. */ - unsigned long _regs[NR_VCPU_REGS]; - unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; - unsigned long modrm_val = 0; - - memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_PROT16: - op_bytes = ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - op_bytes = ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - op_bytes = 4; - ad_bytes = 8; - break; -#endif - default: - return -1; - } - - /* Legacy prefixes. */ - for (i = 0; i < 8; i++) { - switch (b = insn_fetch(u8, 1, _eip)) { - case 0x66: /* operand-size override */ - op_bytes ^= 6; /* switch between 2/4 bytes */ - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - ad_bytes ^= 12; /* switch between 4/8 bytes */ - else - ad_bytes ^= 6; /* switch between 2/4 bytes */ - break; - case 0x2e: /* CS override */ - override_base = &ctxt->cs_base; - break; - case 0x3e: /* DS override */ - override_base = &ctxt->ds_base; - break; - case 0x26: /* ES override */ - override_base = &ctxt->es_base; - break; - case 0x64: /* FS override */ - override_base = &ctxt->fs_base; - break; - case 0x65: /* GS override */ - override_base = &ctxt->gs_base; - break; - case 0x36: /* SS override */ - override_base = &ctxt->ss_base; - break; - case 0xf0: /* LOCK */ - lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - case 0xf3: /* REP/REPE/REPZ */ - rep_prefix = 1; - break; - default: - goto done_prefixes; - } - } - -done_prefixes: - - /* REX prefix. 
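 * (Worked example for the decode below, from the x86-64 encoding: a
 * 0x44 prefix sets only REX.R, so (b & 4) << 1 == 8 is folded into
 * modrm_reg and the ModRM reg field then selects r8-r15; a 0x48 prefix
 * sets only REX.W, forcing 64-bit operand size.)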
*/ - if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { - rex_prefix = b; - if (b & 8) - op_bytes = 8; /* REX.W */ - modrm_reg = (b & 4) << 1; /* REX.R */ - index_reg = (b & 2) << 2; /* REX.X */ - modrm_rm = base_reg = (b & 1) << 3; /* REG.B */ - b = insn_fetch(u8, 1, _eip); - } - - /* Opcode byte(s). */ - d = opcode_table[b]; - if (d == 0) { - /* Two-byte opcode? */ - if (b == 0x0f) { - twobyte = 1; - b = insn_fetch(u8, 1, _eip); - d = twobyte_table[b]; - } - - /* Unrecognised? */ - if (d == 0) - goto cannot_emulate; - } - - /* ModRM and SIB bytes. */ - if (d & ModRM) { - modrm = insn_fetch(u8, 1, _eip); - modrm_mod |= (modrm & 0xc0) >> 6; - modrm_reg |= (modrm & 0x38) >> 3; - modrm_rm |= (modrm & 0x07); - modrm_ea = 0; - use_modrm_ea = 1; - - if (modrm_mod == 3) { - modrm_val = *(unsigned long *) - decode_register(modrm_rm, _regs, d & ByteOp); - goto modrm_done; - } - - if (ad_bytes == 2) { - unsigned bx = _regs[VCPU_REGS_RBX]; - unsigned bp = _regs[VCPU_REGS_RBP]; - unsigned si = _regs[VCPU_REGS_RSI]; - unsigned di = _regs[VCPU_REGS_RDI]; - - /* 16-bit ModR/M decode. */ - switch (modrm_mod) { - case 0: - if (modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, _eip); - break; - case 1: - modrm_ea += insn_fetch(s8, 1, _eip); - break; - case 2: - modrm_ea += insn_fetch(u16, 2, _eip); - break; - } - switch (modrm_rm) { - case 0: - modrm_ea += bx + si; - break; - case 1: - modrm_ea += bx + di; - break; - case 2: - modrm_ea += bp + si; - break; - case 3: - modrm_ea += bp + di; - break; - case 4: - modrm_ea += si; - break; - case 5: - modrm_ea += di; - break; - case 6: - if (modrm_mod != 0) - modrm_ea += bp; - break; - case 7: - modrm_ea += bx; - break; - } - if (modrm_rm == 2 || modrm_rm == 3 || - (modrm_rm == 6 && modrm_mod != 0)) - if (!override_base) - override_base = &ctxt->ss_base; - modrm_ea = (u16)modrm_ea; - } else { - /* 32/64-bit ModR/M decode. */ - switch (modrm_rm) { - case 4: - case 12: - sib = insn_fetch(u8, 1, _eip); - index_reg |= (sib >> 3) & 7; - base_reg |= sib & 7; - scale = sib >> 6; - - switch (base_reg) { - case 5: - if (modrm_mod != 0) - modrm_ea += _regs[base_reg]; - else - modrm_ea += insn_fetch(s32, 4, _eip); - break; - default: - modrm_ea += _regs[base_reg]; - } - switch (index_reg) { - case 4: - break; - default: - modrm_ea += _regs[index_reg] << scale; - - } - break; - case 5: - if (modrm_mod != 0) - modrm_ea += _regs[modrm_rm]; - else if (mode == X86EMUL_MODE_PROT64) - rip_relative = 1; - break; - default: - modrm_ea += _regs[modrm_rm]; - break; - } - switch (modrm_mod) { - case 0: - if (modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, _eip); - break; - case 1: - modrm_ea += insn_fetch(s8, 1, _eip); - break; - case 2: - modrm_ea += insn_fetch(s32, 4, _eip); - break; - } - } - if (!override_base) - override_base = &ctxt->ds_base; - if (mode == X86EMUL_MODE_PROT64 && - override_base != &ctxt->fs_base && - override_base != &ctxt->gs_base) - override_base = NULL; - - if (override_base) - modrm_ea += *override_base; - - if (rip_relative) { - modrm_ea += _eip; - switch (d & SrcMask) { - case SrcImmByte: - modrm_ea += 1; - break; - case SrcImm: - if (d & ByteOp) - modrm_ea += 1; - else - if (op_bytes == 8) - modrm_ea += 4; - else - modrm_ea += op_bytes; - } - } - if (ad_bytes != 8) - modrm_ea = (u32)modrm_ea; - cr2 = modrm_ea; - modrm_done: - ; - } - - /* - * Decode and fetch the source operand: register, memory - * or immediate. 
- */ - switch (d & SrcMask) { - case SrcNone: - break; - case SrcReg: - src.type = OP_REG; - if (d & ByteOp) { - src.ptr = decode_register(modrm_reg, _regs, - (rex_prefix == 0)); - src.val = src.orig_val = *(u8 *) src.ptr; - src.bytes = 1; - } else { - src.ptr = decode_register(modrm_reg, _regs, 0); - switch ((src.bytes = op_bytes)) { - case 2: - src.val = src.orig_val = *(u16 *) src.ptr; - break; - case 4: - src.val = src.orig_val = *(u32 *) src.ptr; - break; - case 8: - src.val = src.orig_val = *(u64 *) src.ptr; - break; - } - } - break; - case SrcMem16: - src.bytes = 2; - goto srcmem_common; - case SrcMem32: - src.bytes = 4; - goto srcmem_common; - case SrcMem: - src.bytes = (d & ByteOp) ? 1 : op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (twobyte && b == 0x01 && modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((d & ModRM) && modrm_mod == 3) { - src.type = OP_REG; - break; - } - src.type = OP_MEM; - src.ptr = (unsigned long *)cr2; - src.val = 0; - if ((rc = ops->read_emulated((unsigned long)src.ptr, - &src.val, src.bytes, ctxt->vcpu)) != 0) - goto done; - src.orig_val = src.val; - break; - case SrcImm: - src.type = OP_IMM; - src.ptr = (unsigned long *)_eip; - src.bytes = (d & ByteOp) ? 1 : op_bytes; - if (src.bytes == 8) - src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (src.bytes) { - case 1: - src.val = insn_fetch(s8, 1, _eip); - break; - case 2: - src.val = insn_fetch(s16, 2, _eip); - break; - case 4: - src.val = insn_fetch(s32, 4, _eip); - break; - } - break; - case SrcImmByte: - src.type = OP_IMM; - src.ptr = (unsigned long *)_eip; - src.bytes = 1; - src.val = insn_fetch(s8, 1, _eip); - break; - } - - /* Decode and fetch the destination operand: register or memory. */ - switch (d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - goto special_insn; - case DstReg: - dst.type = OP_REG; - if ((d & ByteOp) - && !(twobyte && (b == 0xb6 || b == 0xb7))) { - dst.ptr = decode_register(modrm_reg, _regs, - (rex_prefix == 0)); - dst.val = *(u8 *) dst.ptr; - dst.bytes = 1; - } else { - dst.ptr = decode_register(modrm_reg, _regs, 0); - switch ((dst.bytes = op_bytes)) { - case 2: - dst.val = *(u16 *)dst.ptr; - break; - case 4: - dst.val = *(u32 *)dst.ptr; - break; - case 8: - dst.val = *(u64 *)dst.ptr; - break; - } - } - break; - case DstMem: - dst.type = OP_MEM; - dst.ptr = (unsigned long *)cr2; - dst.bytes = (d & ByteOp) ? 1 : op_bytes; - dst.val = 0; - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((d & ModRM) && modrm_mod == 3) { - dst.type = OP_REG; - break; - } - if (d & BitOp) { - unsigned long mask = ~(dst.bytes * 8 - 1); - - dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; - } - if (!(d & Mov) && /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)dst.ptr, - &dst.val, dst.bytes, ctxt->vcpu)) != 0)) - goto done; - break; - } - dst.orig_val = dst.val; - - if (twobyte) - goto twobyte_insn; - - switch (b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", src, dst, _eflags); - break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", src, dst, _eflags); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", src, dst, _eflags); - break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", src, dst, _eflags); - break; - case 0x20 ... 
0x23: - and: /* and */ - emulate_2op_SrcV("and", src, dst, _eflags); - break; - case 0x24: /* and al imm8 */ - dst.type = OP_REG; - dst.ptr = &_regs[VCPU_REGS_RAX]; - dst.val = *(u8 *)dst.ptr; - dst.bytes = 1; - dst.orig_val = dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - dst.type = OP_REG; - dst.bytes = op_bytes; - dst.ptr = &_regs[VCPU_REGS_RAX]; - if (op_bytes == 2) - dst.val = *(u16 *)dst.ptr; - else - dst.val = *(u32 *)dst.ptr; - dst.orig_val = dst.val; - goto and; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", src, dst, _eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", src, dst, _eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", src, dst, _eflags); - break; - case 0x63: /* movsxd */ - if (mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - dst.val = (s32) src.val; - break; - case 0x80 ... 0x83: /* Grp1 */ - switch (modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; - case 0x84 ... 0x85: - test: /* test */ - emulate_2op_SrcV("test", src, dst, _eflags); - break; - case 0x86 ... 0x87: /* xchg */ - /* Write back the register source. */ - switch (dst.bytes) { - case 1: - *(u8 *) src.ptr = (u8) dst.val; - break; - case 2: - *(u16 *) src.ptr = (u16) dst.val; - break; - case 4: - *src.ptr = (u32) dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *src.ptr = dst.val; - break; - } - /* - * Write back the memory destination with implicit LOCK - * prefix. - */ - dst.val = src.val; - lock_prefix = 1; - break; - case 0x88 ... 0x8b: /* mov */ - goto mov; - case 0x8d: /* lea r16/r32, m */ - dst.val = modrm_val; - break; - case 0x8f: /* pop (sole member of Grp1a) */ - /* 64-bit mode: POP always pops a 64-bit operand. */ - if (mode == X86EMUL_MODE_PROT64) - dst.bytes = 8; - if ((rc = ops->read_std(register_address(ctxt->ss_base, - _regs[VCPU_REGS_RSP]), - &dst.val, dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); - break; - case 0xa0 ... 0xa1: /* mov */ - dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; - dst.val = src.val; - _eip += ad_bytes; /* skip src displacement */ - break; - case 0xa2 ... 0xa3: /* mov */ - dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; - _eip += ad_bytes; /* skip dst displacement */ - break; - case 0xc0 ... 0xc1: - grp2: /* Grp2 */ - switch (modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB("rol", src, dst, _eflags); - break; - case 1: /* ror */ - emulate_2op_SrcB("ror", src, dst, _eflags); - break; - case 2: /* rcl */ - emulate_2op_SrcB("rcl", src, dst, _eflags); - break; - case 3: /* rcr */ - emulate_2op_SrcB("rcr", src, dst, _eflags); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB("sal", src, dst, _eflags); - break; - case 5: /* shr */ - emulate_2op_SrcB("shr", src, dst, _eflags); - break; - case 7: /* sar */ - emulate_2op_SrcB("sar", src, dst, _eflags); - break; - } - break; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - dst.val = src.val; - break; - case 0xd0 ... 0xd1: /* Grp2 */ - src.val = 1; - goto grp2; - case 0xd2 ... 0xd3: /* Grp2 */ - src.val = _regs[VCPU_REGS_RCX]; - goto grp2; - case 0xf6 ... 0xf7: /* Grp3 */ - switch (modrm_reg) { - case 0 ... 1: /* test */ - /* - * Special case in Grp3: test has an immediate - * source operand. 
- */ - src.type = OP_IMM; - src.ptr = (unsigned long *)_eip; - src.bytes = (d & ByteOp) ? 1 : op_bytes; - if (src.bytes == 8) - src.bytes = 4; - switch (src.bytes) { - case 1: - src.val = insn_fetch(s8, 1, _eip); - break; - case 2: - src.val = insn_fetch(s16, 2, _eip); - break; - case 4: - src.val = insn_fetch(s32, 4, _eip); - break; - } - goto test; - case 2: /* not */ - dst.val = ~dst.val; - break; - case 3: /* neg */ - emulate_1op("neg", dst, _eflags); - break; - default: - goto cannot_emulate; - } - break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ - switch (modrm_reg) { - case 0: /* inc */ - emulate_1op("inc", dst, _eflags); - break; - case 1: /* dec */ - emulate_1op("dec", dst, _eflags); - break; - case 4: /* jmp abs */ - if (b == 0xff) - _eip = dst.val; - else - goto cannot_emulate; - break; - case 6: /* push */ - /* 64-bit mode: PUSH always pushes a 64-bit operand. */ - if (mode == X86EMUL_MODE_PROT64) { - dst.bytes = 8; - if ((rc = ops->read_std((unsigned long)dst.ptr, - &dst.val, 8, - ctxt->vcpu)) != 0) - goto done; - } - register_address_increment(_regs[VCPU_REGS_RSP], - -dst.bytes); - if ((rc = ops->write_emulated( - register_address(ctxt->ss_base, - _regs[VCPU_REGS_RSP]), - &dst.val, dst.bytes, ctxt->vcpu)) != 0) - goto done; - no_wb = 1; - break; - default: - goto cannot_emulate; - } - break; - } - -writeback: - if (!no_wb) { - switch (dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ - switch (dst.bytes) { - case 1: - *(u8 *)dst.ptr = (u8)dst.val; - break; - case 2: - *(u16 *)dst.ptr = (u16)dst.val; - break; - case 4: - *dst.ptr = (u32)dst.val; - break; /* 64b: zero-ext */ - case 8: - *dst.ptr = dst.val; - break; - } - break; - case OP_MEM: - if (lock_prefix) - rc = ops->cmpxchg_emulated((unsigned long)dst. - ptr, &dst.orig_val, - &dst.val, dst.bytes, - ctxt->vcpu); - else - rc = ops->write_emulated((unsigned long)dst.ptr, - &dst.val, dst.bytes, - ctxt->vcpu); - if (rc != 0) - goto done; - default: - break; - } - } - - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); - ctxt->eflags = _eflags; - ctxt->vcpu->rip = _eip; - -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; - -special_insn: - if (twobyte) - goto twobyte_special_insn; - switch(b) { - case 0x50 ... 0x57: /* push reg */ - if (op_bytes == 2) - src.val = (u16) _regs[b & 0x7]; - else - src.val = (u32) _regs[b & 0x7]; - dst.type = OP_MEM; - dst.bytes = op_bytes; - dst.val = src.val; - register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); - dst.ptr = (void *) register_address( - ctxt->ss_base, _regs[VCPU_REGS_RSP]); - break; - case 0x58 ... 0x5f: /* pop reg */ - dst.ptr = (unsigned long *)&_regs[b & 0x7]; - pop_instruction: - if ((rc = ops->read_std(register_address(ctxt->ss_base, - _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) - != 0) - goto done; - - register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); - no_wb = 1; /* Disable writeback. */ - break; - case 0x6a: /* push imm8 */ - src.val = 0L; - src.val = insn_fetch(s8, 1, _eip); - push: - dst.type = OP_MEM; - dst.bytes = op_bytes; - dst.val = src.val; - register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); - dst.ptr = (void *) register_address(ctxt->ss_base, - _regs[VCPU_REGS_RSP]); - break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 1, /* in */ - (d & ByteOp) ? 1 : op_bytes, /* size */ - rep_prefix ? 
- address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ - (_eflags & EFLG_DF), /* down */ - register_address(ctxt->es_base, - _regs[VCPU_REGS_RDI]), /* address */ - rep_prefix, - _regs[VCPU_REGS_RDX] /* port */ - ) == 0) - return -1; - return 0; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 0, /* in */ - (d & ByteOp) ? 1 : op_bytes, /* size */ - rep_prefix ? - address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ - (_eflags & EFLG_DF), /* down */ - register_address(override_base ? - *override_base : ctxt->ds_base, - _regs[VCPU_REGS_RSI]), /* address */ - rep_prefix, - _regs[VCPU_REGS_RDX] /* port */ - ) == 0) - return -1; - return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, _eip); - - if (test_cc(b, _eflags)) - JMP_REL(rel); - break; - } - case 0x9c: /* pushf */ - src.val = (unsigned long) _eflags; - goto push; - case 0x9d: /* popf */ - dst.ptr = (unsigned long *) &_eflags; - goto pop_instruction; - case 0xc3: /* ret */ - dst.ptr = &_eip; - goto pop_instruction; - case 0xf4: /* hlt */ - ctxt->vcpu->halt_request = 1; - goto done; - } - if (rep_prefix) { - if (_regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->rip = _eip; - goto done; - } - _regs[VCPU_REGS_RCX]--; - _eip = ctxt->vcpu->rip; - } - switch (b) { - case 0xa4 ... 0xa5: /* movs */ - dst.type = OP_MEM; - dst.bytes = (d & ByteOp) ? 1 : op_bytes; - dst.ptr = (unsigned long *)register_address(ctxt->es_base, - _regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address( - override_base ? *override_base : ctxt->ds_base, - _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(_regs[VCPU_REGS_RSI], - (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); - register_address_increment(_regs[VCPU_REGS_RDI], - (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - DPRINTF("Urk! I don't handle CMPS.\n"); - goto cannot_emulate; - case 0xaa ... 0xab: /* stos */ - dst.type = OP_MEM; - dst.bytes = (d & ByteOp) ? 1 : op_bytes; - dst.ptr = (unsigned long *)cr2; - dst.val = _regs[VCPU_REGS_RAX]; - register_address_increment(_regs[VCPU_REGS_RDI], - (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); - break; - case 0xac ... 0xad: /* lods */ - dst.type = OP_REG; - dst.bytes = (d & ByteOp) ? 1 : op_bytes; - dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, - ctxt->vcpu)) != 0) - goto done; - register_address_increment(_regs[VCPU_REGS_RSI], - (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); - break; - case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xe8: /* call (near) */ { - long int rel; - switch (op_bytes) { - case 2: - rel = insn_fetch(s16, 2, _eip); - break; - case 4: - rel = insn_fetch(s32, 4, _eip); - break; - case 8: - rel = insn_fetch(s64, 8, _eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } - src.val = (unsigned long) _eip; - JMP_REL(rel); - op_bytes = ad_bytes; - goto push; - } - case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ - JMP_REL(src.val); - no_wb = 1; /* Disable writeback. */ - break; - - - } - goto writeback; - -twobyte_insn: - switch (b) { - case 0x01: /* lgdt, lidt, lmsw */ - /* Disable writeback. 
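 * (These 0f 01 group instructions load the descriptor-table registers
 * or CR0 through the realmode_* helpers below, so the generic
 * register/memory writeback path has nothing to commit; this reading
 * is inferred from the code rather than spelled out in the patch.)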
*/ - no_wb = 1; - switch (modrm_reg) { - u16 size; - unsigned long address; - - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, src.ptr, - &size, &address, op_bytes); - if (rc) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - break; - case 3: /* lidt */ - rc = read_descriptor(ctxt, ops, src.ptr, - &size, &address, op_bytes); - if (rc) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - break; - case 4: /* smsw */ - if (modrm_mod != 3) - goto cannot_emulate; - *(u16 *)&_regs[modrm_rm] - = realmode_get_cr(ctxt->vcpu, 0); - break; - case 6: /* lmsw */ - if (modrm_mod != 3) - goto cannot_emulate; - realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); - break; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, cr2); - break; - default: - goto cannot_emulate; - } - break; - case 0x21: /* mov from dr to reg */ - no_wb = 1; - if (modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); - break; - case 0x23: /* mov from reg to dr */ - no_wb = 1; - if (modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); - break; - case 0x40 ... 0x4f: /* cmov */ - dst.val = dst.orig_val = src.val; - no_wb = 1; - /* - * First, assume we're decoding an even cmov opcode - * (lsb == 0). - */ - switch ((b & 15) >> 1) { - case 0: /* cmovo */ - no_wb = (_eflags & EFLG_OF) ? 0 : 1; - break; - case 1: /* cmovb/cmovc/cmovnae */ - no_wb = (_eflags & EFLG_CF) ? 0 : 1; - break; - case 2: /* cmovz/cmove */ - no_wb = (_eflags & EFLG_ZF) ? 0 : 1; - break; - case 3: /* cmovbe/cmovna */ - no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; - break; - case 4: /* cmovs */ - no_wb = (_eflags & EFLG_SF) ? 0 : 1; - break; - case 5: /* cmovp/cmovpe */ - no_wb = (_eflags & EFLG_PF) ? 0 : 1; - break; - case 7: /* cmovle/cmovng */ - no_wb = (_eflags & EFLG_ZF) ? 0 : 1; - /* fall through */ - case 6: /* cmovl/cmovnge */ - no_wb &= (!(_eflags & EFLG_SF) != - !(_eflags & EFLG_OF)) ? 0 : 1; - break; - } - /* Odd cmov opcodes (lsb == 1) have inverted sense. */ - no_wb ^= b & 1; - break; - case 0xa3: - bt: /* bt */ - src.val &= (dst.bytes << 3) - 1; /* only subword offset */ - emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); - break; - case 0xab: - bts: /* bts */ - src.val &= (dst.bytes << 3) - 1; /* only subword offset */ - emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); - break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - src.orig_val = src.val; - src.val = _regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", src, dst, _eflags); - if (_eflags & EFLG_ZF) { - /* Success: write back to memory. */ - dst.val = src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - dst.type = OP_REG; - dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; - } - break; - case 0xb3: - btr: /* btr */ - src.val &= (dst.bytes << 3) - 1; /* only subword offset */ - emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); - break; - case 0xb6 ... 0xb7: /* movzx */ - dst.bytes = op_bytes; - dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; - break; - case 0xba: /* Grp8 */ - switch (modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - src.val &= (dst.bytes << 3) - 1; /* only subword offset */ - emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); - break; - case 0xbe ... 0xbf: /* movsx */ - dst.bytes = op_bytes; - dst.val = (d & ByteOp) ? 
(s8) src.val : (s16) src.val; - break; - case 0xc3: /* movnti */ - dst.bytes = op_bytes; - dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val; - break; - } - goto writeback; - -twobyte_special_insn: - /* Disable writeback. */ - no_wb = 1; - switch (b) { - case 0x06: - emulate_clts(ctxt->vcpu); - break; - case 0x08: /* invd */ - break; - case 0x09: /* wbinvd */ - break; - case 0x0d: /* GrpP (prefetch) */ - case 0x18: /* Grp16 (prefetch/nop) */ - break; - case 0x20: /* mov cr, reg */ - if (modrm_mod != 3) - goto cannot_emulate; - _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); - break; - case 0x22: /* mov reg, cr */ - if (modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); - break; - case 0x30: - /* wrmsr */ - msr_data = (u32)_regs[VCPU_REGS_RAX] - | ((u64)_regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); - if (rc) { - kvm_x86_ops->inject_gp(ctxt->vcpu, 0); - _eip = ctxt->vcpu->rip; - } - rc = X86EMUL_CONTINUE; - break; - case 0x32: - /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); - if (rc) { - kvm_x86_ops->inject_gp(ctxt->vcpu, 0); - _eip = ctxt->vcpu->rip; - } else { - _regs[VCPU_REGS_RAX] = (u32)msr_data; - _regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (op_bytes) { - case 2: - rel = insn_fetch(s16, 2, _eip); - break; - case 4: - rel = insn_fetch(s32, 4, _eip); - break; - case 8: - rel = insn_fetch(s64, 8, _eip); - break; - default: - DPRINTF("jnz: Invalid op_bytes\n"); - goto cannot_emulate; - } - if (test_cc(b, _eflags)) - JMP_REL(rel); - break; - } - case 0xc7: /* Grp9 (cmpxchg8b) */ - { - u64 old, new; - if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) - != 0) - goto done; - if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { - _regs[VCPU_REGS_RAX] = (u32) (old >> 0); - _regs[VCPU_REGS_RDX] = (u32) (old >> 32); - _eflags &= ~EFLG_ZF; - } else { - new = ((u64)_regs[VCPU_REGS_RCX] << 32) - | (u32) _regs[VCPU_REGS_RBX]; - if ((rc = ops->cmpxchg_emulated(cr2, &old, - &new, 8, ctxt->vcpu)) != 0) - goto done; - _eflags |= EFLG_ZF; - } - break; - } - } - goto writeback; - -cannot_emulate: - DPRINTF("Cannot emulate %02x\n", b); - return -1; -} - -#ifdef __XEN__ - -#include <asm/mm.h> -#include <asm/uaccess.h> - -int -x86_emulate_read_std(unsigned long addr, - unsigned long *val, - unsigned int bytes, struct x86_emulate_ctxt *ctxt) -{ - unsigned int rc; - - *val = 0; - - if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { - propagate_page_fault(addr + bytes - rc, 0); /* read fault */ - return X86EMUL_PROPAGATE_FAULT; - } - - return X86EMUL_CONTINUE; -} - -int -x86_emulate_write_std(unsigned long addr, - unsigned long val, - unsigned int bytes, struct x86_emulate_ctxt *ctxt) -{ - unsigned int rc; - - if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { - propagate_page_fault(addr + bytes - rc, PGERR_write_access); - return X86EMUL_PROPAGATE_FAULT; - } - - return X86EMUL_CONTINUE; -} - -#endif diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cb4c67025d5..7743d73768d 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg, /* This routine copies memory from the Guest. 
Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random * value (all zeroes) instead of needing to return an error. */ -void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) +void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) { - if (!lguest_address_ok(lg, addr, bytes) - || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { + if (!lguest_address_ok(cpu->lg, addr, bytes) + || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) { /* copy_from_user should do this, but as we rely on it... */ memset(b, 0, bytes); - kill_guest(lg, "bad read address %#lx len %u", addr, bytes); + kill_guest(cpu, "bad read address %#lx len %u", addr, bytes); } } /* This is the write (copy into guest) version. */ -void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, +void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, unsigned bytes) { - if (!lguest_address_ok(lg, addr, bytes) - || copy_to_user(lg->mem_base + addr, b, bytes) != 0) - kill_guest(lg, "bad write address %#lx len %u", addr, bytes); + if (!lguest_address_ok(cpu->lg, addr, bytes) + || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0) + kill_guest(cpu, "bad write address %#lx len %u", addr, bytes); } /*:*/ /*H:030 Let's jump straight to the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep * going around and around until something interesting happens. */ -int run_guest(struct lguest *lg, unsigned long __user *user) +int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ - while (!lg->dead) { + while (!cpu->lg->dead) { /* First we run any hypercalls the Guest wants done. */ - if (lg->hcall) - do_hypercalls(lg); + if (cpu->hcall) + do_hypercalls(cpu); /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ - if (lg->pending_notify) { - if (put_user(lg->pending_notify, user)) + if (cpu->pending_notify) { + if (put_user(cpu->pending_notify, user)) return -EFAULT; - return sizeof(lg->pending_notify); + return sizeof(cpu->pending_notify); } /* Check for signals */ @@ -195,13 +195,13 @@ int run_guest(struct lguest *lg, unsigned long __user *user) return -ERESTARTSYS; /* If Waker set break_out, return to Launcher. */ - if (lg->break_out) + if (cpu->break_out) return -EAGAIN; /* Check if there are any interrupts which can be delivered * now: if so, this sets up the handler to be executed when we * next run the Guest. */ - maybe_do_interrupt(lg); + maybe_do_interrupt(cpu); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -210,12 +210,12 @@ int run_guest(struct lguest *lg, unsigned long __user *user) /* Just make absolutely sure the Guest is still alive. One of * those hypercalls could have been fatal, for example. */ - if (lg->dead) + if (cpu->lg->dead) break; /* If the Guest asked to be stopped, we sleep. The Guest's * clock timer or LHCALL_BREAK from the Waker will wake us. */ - if (lg->halted) { + if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); schedule(); continue; } @@ -226,15 +226,17 @@ int run_guest(struct lguest *lg, unsigned long __user *user) local_irq_disable(); /* Actually run the Guest until something happens. 
*/ - lguest_arch_run_guest(lg); + lguest_arch_run_guest(cpu); /* Now we're ready to be interrupted or moved to other CPUs */ local_irq_enable(); /* Now we deal with whatever happened to the Guest. */ - lguest_arch_handle_trap(lg); + lguest_arch_handle_trap(cpu); } + if (cpu->lg->dead == ERR_PTR(-ERESTART)) + return -ERESTART; /* The Guest is dead => "No such file or directory" */ return -ENOENT; } @@ -253,7 +255,7 @@ static int __init init(void) /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ if (paravirt_enabled()) { - printk("lguest is afraid of %s\n", pv_info.name); + printk("lguest is afraid of being a guest\n"); return -EPERM; } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index b478affe8f9..0f2cb4fd7c6 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -23,13 +23,14 @@ #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/mm.h> +#include <linux/ktime.h> #include <asm/page.h> #include <asm/pgtable.h> #include "lg.h" /*H:120 This is the core hypercall routine: where the Guest gets what it wants. * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ -static void do_hcall(struct lguest *lg, struct hcall_args *args) +static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: @@ -39,60 +40,62 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args) case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ - kill_guest(lg, "already have lguest_data"); + kill_guest(cpu, "already have lguest_data"); break; - case LHCALL_CRASH: { - /* Crash is such a trivial hypercall that we do it in four + case LHCALL_SHUTDOWN: { + /* Shutdown is such a trivial hypercall that we do it in four * lines right here. */ char msg[128]; /* If the lgread fails, it will call kill_guest() itself; the * kill_guest() with the message will be ignored. */ - __lgread(lg, msg, args->arg1, sizeof(msg)); + __lgread(cpu, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; - kill_guest(lg, "CRASH: %s", msg); + kill_guest(cpu, "CRASH: %s", msg); + if (args->arg2 == LGUEST_SHUTDOWN_RESTART) + cpu->lg->dead = ERR_PTR(-ERESTART); break; } case LHCALL_FLUSH_TLB: /* FLUSH_TLB comes in two flavors, depending on the * argument: */ if (args->arg1) - guest_pagetable_clear_all(lg); + guest_pagetable_clear_all(cpu); else - guest_pagetable_flush_user(lg); + guest_pagetable_flush_user(cpu); break; /* All these calls simply pass the arguments through to the right * routines. */ case LHCALL_NEW_PGTABLE: - guest_new_pagetable(lg, args->arg1); + guest_new_pagetable(cpu, args->arg1); break; case LHCALL_SET_STACK: - guest_set_stack(lg, args->arg1, args->arg2, args->arg3); + guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: - guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); + guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); break; case LHCALL_SET_PMD: - guest_set_pmd(lg, args->arg1, args->arg2); + guest_set_pmd(cpu->lg, args->arg1, args->arg2); break; case LHCALL_SET_CLOCKEVENT: - guest_set_clockevent(lg, args->arg1); + guest_set_clockevent(cpu, args->arg1); break; case LHCALL_TS: /* This sets the TS flag, as we saw used in run_guest(). */ - lg->ts = args->arg1; + cpu->ts = args->arg1; break; case LHCALL_HALT: /* Similarly, this sets the halted flag for run_guest(). 
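[Editor's aside: do_hcall() above dispatches on a small argument block which the architecture code fills in before the main loop calls do_hypercalls(). Its definition lives in the lguest headers, not in this hunk; roughly as below, though the exact field order follows the Guest's register convention and is an assumption here:

struct hcall_args {
	/* arg0 is the hypercall number; the rest are its operands,
	 * taken from the registers the Guest loaded before trapping. */
	unsigned long arg0, arg1, arg2, arg3;
};

End of aside.]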
*/ - lg->halted = 1; + cpu->halted = 1; break; case LHCALL_NOTIFY: - lg->pending_notify = args->arg1; + cpu->pending_notify = args->arg1; break; default: /* It should be an architecture-specific hypercall. */ - if (lguest_arch_do_hcall(lg, args)) - kill_guest(lg, "Bad hypercall %li\n", args->arg0); + if (lguest_arch_do_hcall(cpu, args)) + kill_guest(cpu, "Bad hypercall %li\n", args->arg0); } } /*:*/ @@ -104,13 +107,13 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args) * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before * checking for a normal hcall). */ -static void do_async_hcalls(struct lguest *lg) +static void do_async_hcalls(struct lg_cpu *cpu) { unsigned int i; u8 st[LHCALL_RING_SIZE]; /* For simplicity, we copy the entire call status array in at once. */ - if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) + if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) return; /* We process "struct lguest_data"s hcalls[] ring once. */ @@ -119,7 +122,7 @@ static void do_async_hcalls(struct lguest *lg) /* We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest * places them in the ring. */ - unsigned int n = lg->next_hcall; + unsigned int n = cpu->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) @@ -127,65 +130,65 @@ static void do_async_hcalls(struct lguest *lg) /* OK, we have hypercall. Increment the "next_hcall" cursor, * and wrap back to 0 if we reach the end. */ - if (++lg->next_hcall == LHCALL_RING_SIZE) - lg->next_hcall = 0; + if (++cpu->next_hcall == LHCALL_RING_SIZE) + cpu->next_hcall = 0; /* Copy the hypercall arguments into a local copy of * the hcall_args struct. */ - if (copy_from_user(&args, &lg->lguest_data->hcalls[n], + if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { - kill_guest(lg, "Fetching async hypercalls"); + kill_guest(cpu, "Fetching async hypercalls"); break; } /* Do the hypercall, same as a normal one. */ - do_hcall(lg, &args); + do_hcall(cpu, &args); /* Mark the hypercall done. */ - if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { - kill_guest(lg, "Writing result for async hypercall"); + if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { + kill_guest(cpu, "Writing result for async hypercall"); break; } /* Stop doing hypercalls if they want to notify the Launcher: * it needs to service this first. */ - if (lg->pending_notify) + if (cpu->pending_notify) break; } } /* Last of all, we look at what happens first of all. The very first time the * Guest makes a hypercall, we end up here to set things up: */ -static void initialize(struct lguest *lg) +static void initialize(struct lg_cpu *cpu) { /* You can't do anything until you're initialized. The Guest knows the * rules, so we're unforgiving here. */ - if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { - kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); + if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { + kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); return; } - if (lguest_arch_init_hypercalls(lg)) - kill_guest(lg, "bad guest page %p", lg->lguest_data); + if (lguest_arch_init_hypercalls(cpu)) + kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* The Guest tells us where we're not to deliver interrupts by putting * the range of addresses into "struct lguest_data". 
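[Editor's aside: do_async_hcalls() above is only the consumer half of the ring; the producer lives in the Guest. A sketch of that side, modelled on the async_hcall() idea in the 32-bit guest boot code; names and details recalled from memory, so treat them as assumptions:

static void async_hcall(unsigned long call, unsigned long arg1,
			unsigned long arg2, unsigned long arg3)
{
	/* This simple ring only works because the Guest is UP. */
	static unsigned int next_call;
	unsigned long flags;

	local_irq_save(flags);
	if (lguest_data.hcall_status[next_call] != 0xFF) {
		/* Ring full: fall back to a real hypercall, which also
		 * drains everything queued so far. */
		hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
	} else {
		lguest_data.hcalls[next_call].arg0 = call;
		lguest_data.hcalls[next_call].arg1 = arg1;
		lguest_data.hcalls[next_call].arg2 = arg2;
		lguest_data.hcalls[next_call].arg3 = arg3;
		/* Fill the entry before flipping its status away from
		 * 0xFF, so the Host never sees a half-written call. */
		wmb();
		lguest_data.hcall_status[next_call] = 0;
		if (++next_call == LHCALL_RING_SIZE)
			next_call = 0;
	}
	local_irq_restore(flags);
}

End of aside.]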
*/ - if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) - || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) - kill_guest(lg, "bad guest page %p", lg->lguest_data); + if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) + || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) + kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* We write the current time into the Guest's data page once so it can * set its clock. */ - write_timestamp(lg); + write_timestamp(cpu); /* page_tables.c will also do some setup. */ - page_table_guest_data_init(lg); + page_table_guest_data_init(cpu); /* This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest * pagetable. */ - guest_pagetable_clear_all(lg); + guest_pagetable_clear_all(cpu); } /*H:100 @@ -194,27 +197,27 @@ static void initialize(struct lguest *lg) * Remember from the Guest, hypercalls come in two flavors: normal and * asynchronous. This file handles both types. */ -void do_hypercalls(struct lguest *lg) +void do_hypercalls(struct lg_cpu *cpu) { /* Not initialized yet? This hypercall must do it. */ - if (unlikely(!lg->lguest_data)) { + if (unlikely(!cpu->lg->lguest_data)) { /* Set up the "struct lguest_data" */ - initialize(lg); + initialize(cpu); /* Hcall is done. */ - lg->hcall = NULL; + cpu->hcall = NULL; return; } /* The Guest has initialized. * * Look in the hypercall ring for the async hypercalls: */ - do_async_hcalls(lg); + do_async_hcalls(cpu); /* If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do * the hypercall. */ - if (!lg->pending_notify) { - do_hcall(lg, lg->hcall); + if (!cpu->pending_notify) { + do_hcall(cpu, cpu->hcall); /* Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. @@ -225,16 +228,17 @@ void do_hypercalls(struct lguest *lg) * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the * hypercall. */ - lg->hcall = NULL; + cpu->hcall = NULL; } } /* This routine supplies the Guest with time: it's used for wallclock time at * initial boot and as a rough time source if the TSC isn't available. */ -void write_timestamp(struct lguest *lg) +void write_timestamp(struct lg_cpu *cpu) { struct timespec now; ktime_get_real_ts(&now); - if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) - kill_guest(lg, "Writing timestamp"); + if (copy_to_user(&cpu->lg->lguest_data->time, + &now, sizeof(struct timespec))) + kill_guest(cpu, "Writing timestamp"); } diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 2b66f79c208..32e97c1858e 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi) /* We need a helper to "push" a value onto the Guest's stack, since that's a * big part of what delivering an interrupt does. */ -static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) +static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. 
*/ *gstack -= 4; - lgwrite(lg, *gstack, u32, val); + lgwrite(cpu, *gstack, u32, val); } /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or @@ -60,7 +60,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo * it). */ -static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) +static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) { unsigned long gstack, origstack; u32 eflags, ss, irq_enable; @@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) /* There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in * userspace. We check the privilege level to find out. */ - if ((lg->regs->ss&0x3) != GUEST_PL) { + if ((cpu->regs->ss&0x3) != GUEST_PL) { /* The Guest told us their kernel stack with the SET_STACK * hypercall: both the virtual address and the segment */ - virtstack = lg->esp1; - ss = lg->ss1; + virtstack = cpu->esp1; + ss = cpu->ss1; - origstack = gstack = guest_pa(lg, virtstack); + origstack = gstack = guest_pa(cpu, virtstack); /* We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege * levels and expect these here. */ - push_guest_stack(lg, &gstack, lg->regs->ss); - push_guest_stack(lg, &gstack, lg->regs->esp); + push_guest_stack(cpu, &gstack, cpu->regs->ss); + push_guest_stack(cpu, &gstack, cpu->regs->esp); } else { /* We're staying on the same Guest (kernel) stack. */ - virtstack = lg->regs->esp; - ss = lg->regs->ss; + virtstack = cpu->regs->esp; + ss = cpu->regs->ss; - origstack = gstack = guest_pa(lg, virtstack); + origstack = gstack = guest_pa(cpu, virtstack); } /* Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest * copy it back in "lguest_iret". */ - eflags = lg->regs->eflags; - if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 + eflags = cpu->regs->eflags; + if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; /* An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction * pointer. */ - push_guest_stack(lg, &gstack, eflags); - push_guest_stack(lg, &gstack, lg->regs->cs); - push_guest_stack(lg, &gstack, lg->regs->eip); + push_guest_stack(cpu, &gstack, eflags); + push_guest_stack(cpu, &gstack, cpu->regs->cs); + push_guest_stack(cpu, &gstack, cpu->regs->eip); /* For the six traps which supply an error code, we push that, too. */ if (has_err) - push_guest_stack(lg, &gstack, lg->regs->errcode); + push_guest_stack(cpu, &gstack, cpu->regs->errcode); /* Now we've pushed all the old state, we change the stack, the code * segment and the address to execute. 
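[Editor's aside: read bottom-up, set_guest_interrupt() has just built an ordinary x86 interrupt frame. In memory order (lowest address first) the Guest's handler sees something like this hypothetical struct; a picture for orientation, not code from the patch:

struct guest_irq_frame {
	u32 errcode;	/* only pushed for the six error-code traps */
	u32 eip;	/* where the Guest was interrupted */
	u32 cs;
	u32 eflags;	/* IF forced to match lguest_data.irq_enabled */
	u32 esp;	/* only present if we interrupted userspace... */
	u32 ss;		/* ...along with the old stack segment */
};

End of aside.]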
*/ - lg->regs->ss = ss; - lg->regs->esp = virtstack + (gstack - origstack); - lg->regs->cs = (__KERNEL_CS|GUEST_PL); - lg->regs->eip = idt_address(lo, hi); + cpu->regs->ss = ss; + cpu->regs->esp = virtstack + (gstack - origstack); + cpu->regs->cs = (__KERNEL_CS|GUEST_PL); + cpu->regs->eip = idt_address(lo, hi); /* There are two kinds of interrupt handlers: 0xE is an "interrupt * gate" which expects interrupts to be disabled on entry. */ if (idt_type(lo, hi) == 0xE) - if (put_user(0, &lg->lguest_data->irq_enabled)) - kill_guest(lg, "Disabling interrupts"); + if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) + kill_guest(cpu, "Disabling interrupts"); } /*H:205 @@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) * * maybe_do_interrupt() gets called before every entry to the Guest, to see if * we should divert the Guest to running an interrupt handler. */ -void maybe_do_interrupt(struct lguest *lg) +void maybe_do_interrupt(struct lg_cpu *cpu) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); struct desc_struct *idt; /* If the Guest hasn't even initialized yet, we can do nothing. */ - if (!lg->lguest_data) + if (!cpu->lg->lguest_data) return; /* Take our "irqs_pending" array and remove any interrupts the Guest * wants blocked: the result ends up in "blk". */ - if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, + if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return; - bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); + bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); @@ -155,19 +155,20 @@ void maybe_do_interrupt(struct lguest *lg) /* They may be in the middle of an iret, where they asked us never to * deliver interrupts. */ - if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) + if (cpu->regs->eip >= cpu->lg->noirq_start && + (cpu->regs->eip < cpu->lg->noirq_end)) return; /* If they're halted, interrupts restart them. */ - if (lg->halted) { + if (cpu->halted) { /* Re-enable interrupts. */ - if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) - kill_guest(lg, "Re-enabling interrupts"); - lg->halted = 0; + if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) + kill_guest(cpu, "Re-enabling interrupts"); + cpu->halted = 0; } else { /* Otherwise we check if they have interrupts disabled. */ u32 irq_enabled; - if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) + if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) irq_enabled = 0; if (!irq_enabled) return; @@ -176,15 +177,15 @@ void maybe_do_interrupt(struct lguest *lg) /* Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip * over them. */ - idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; + idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ - clear_bit(irq, lg->irqs_pending); + clear_bit(irq, cpu->irqs_pending); /* set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto * the stack as well: virtual interrupts never do. 
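[Editor's aside: idt_present() and idt_type(), used throughout this file, only appear here as hunk context. They pull bits out of the high word of an x86 gate descriptor; sketched from the descriptor format (an assumption, not quoted from the file):

/* In the high word of a gate descriptor, the gate type occupies
 * bits 8-11 and the "present" bit is bit 15. */
static int idt_type(u32 lo, u32 hi)
{
	return (hi >> 8) & 0xF;
}

static int idt_present(u32 lo, u32 hi)
{
	return hi & 0x8000;
}

End of aside.]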
*/ - set_guest_interrupt(lg, idt->a, idt->b, 0); + set_guest_interrupt(cpu, idt->a, idt->b, 0); } /* Every time we deliver an interrupt, we update the timestamp in the @@ -192,7 +193,7 @@ void maybe_do_interrupt(struct lguest *lg) * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every * timer interrupt. */ - write_timestamp(lg); + write_timestamp(cpu); } /*:*/ @@ -245,19 +246,19 @@ static int has_err(unsigned int trap) } /* deliver_trap() returns true if it could deliver the trap. */ -int deliver_trap(struct lguest *lg, unsigned int num) +int deliver_trap(struct lg_cpu *cpu, unsigned int num) { /* Trap numbers are always 8 bit, but we set an impossible trap number * for traps inside the Switcher, so check that here. */ - if (num >= ARRAY_SIZE(lg->arch.idt)) + if (num >= ARRAY_SIZE(cpu->arch.idt)) return 0; /* Early on the Guest hasn't set the IDT entries (or maybe it put a * bogus one in): if we fail here, the Guest will be killed. */ - if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) + if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return 0; - set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, - has_err(num)); + set_guest_interrupt(cpu, cpu->arch.idt[num].a, + cpu->arch.idt[num].b, has_err(num)); return 1; } @@ -309,18 +310,18 @@ static int direct_trap(unsigned int num) * the Guest. * * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ -void pin_stack_pages(struct lguest *lg) +void pin_stack_pages(struct lg_cpu *cpu) { unsigned int i; /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or * two pages of stack space. */ - for (i = 0; i < lg->stack_pages; i++) + for (i = 0; i < cpu->lg->stack_pages; i++) /* The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to * get to the rest of the stack pages. */ - pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE); + pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); } /* Direct traps also mean that we need to know whenever the Guest wants to use @@ -331,21 +332,21 @@ void pin_stack_pages(struct lguest *lg) * * In Linux each process has its own kernel stack, so this happens a lot: we * change stacks on each context switch. */ -void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) +void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) { /* You are not allowed have a stack segment with privilege level 0: bad * Guest! */ if ((seg & 0x3) != GUEST_PL) - kill_guest(lg, "bad stack segment %i", seg); + kill_guest(cpu, "bad stack segment %i", seg); /* We only expect one or two stack pages. 
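[Editor's aside: the Guest-side trigger for the guest_set_stack() routine being patched right here is its stack-switch hook: whenever the Guest kernel changes its per-thread kernel stack it tells the Host. Approximately as below; lazy_hcall(), stack_top and the constants are recalled or hypothetical, not taken from this diff:

	/* Guest side: register the new kernel stack with the Host.
	 * The segment is the Guest's kernel data segment, qualified
	 * with the privilege level the Guest actually runs at. */
	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS | GUEST_PL, stack_top,
		   THREAD_SIZE / PAGE_SIZE);

End of aside.]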
*/ if (pages > 2) - kill_guest(lg, "bad stack pages %u", pages); + kill_guest(cpu, "bad stack pages %u", pages); /* Save where the stack is, and how many pages */ - lg->ss1 = seg; - lg->esp1 = esp; - lg->stack_pages = pages; + cpu->ss1 = seg; + cpu->esp1 = esp; + cpu->lg->stack_pages = pages; /* Make sure the new stack pages are mapped */ - pin_stack_pages(lg); + pin_stack_pages(cpu); } /* All this reference to mapping stacks leads us neatly into the other complex @@ -353,7 +354,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) /*H:235 This is the routine which actually checks the Guest's IDT entry and * transfers it into the entry in "struct lguest": */ -static void set_trap(struct lguest *lg, struct desc_struct *trap, +static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { u8 type = idt_type(lo, hi); @@ -366,7 +367,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap, /* We only support interrupt and trap gates. */ if (type != 0xE && type != 0xF) - kill_guest(lg, "bad IDT type %i", type); + kill_guest(cpu, "bad IDT type %i", type); /* We only copy the handler address, present bit, privilege level and * type. The privilege level controls where the trap can be triggered @@ -383,7 +384,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap, * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ -void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) +void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) { /* Guest never handles: NMI, doublefault, spurious interrupt or * hypercall. We ignore when it tries to set them. */ @@ -392,13 +393,13 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) /* Mark the IDT as changed: next time the Guest runs we'll know we have * to copy this again. */ - lg->changed |= CHANGED_IDT; + cpu->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ - if (num >= ARRAY_SIZE(lg->arch.idt)) - kill_guest(lg, "Setting idt entry %u", num); + if (num >= ARRAY_SIZE(cpu->arch.idt)) + kill_guest(cpu, "Setting idt entry %u", num); else - set_trap(lg, &lg->arch.idt[num], num, lo, hi); + set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); } /* The default entry for each interrupt points into the Switcher routines which @@ -434,14 +435,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state, /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just * before we run the Guest. This routine does that copy. */ -void copy_traps(const struct lguest *lg, struct desc_struct *idt, +void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def) { unsigned int i; /* We can simply copy the direct traps, otherwise we use the default * ones in the Switcher: they will return to the Host. */ - for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { + for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { /* If no Guest can ever override this trap, leave it alone. */ if (!direct_trap(i)) continue; @@ -450,8 +451,8 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt, * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. 
*/ - if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) - idt[i] = lg->arch.idt[i]; + if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF) + idt[i] = cpu->arch.idt[i]; else /* Reset it to the default. */ default_idt_entry(&idt[i], i, def[i]); @@ -470,13 +471,13 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt, * infrastructure to set a callback at that time. * * 0 means "turn off the clock". */ -void guest_set_clockevent(struct lguest *lg, unsigned long delta) +void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) { ktime_t expires; if (unlikely(delta == 0)) { /* Clock event device is shutting down. */ - hrtimer_cancel(&lg->hrt); + hrtimer_cancel(&cpu->hrt); return; } @@ -484,25 +485,25 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta) * all the time between now and the timer interrupt it asked for. This * is almost always the right thing to do. */ expires = ktime_add_ns(ktime_get_real(), delta); - hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); + hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); } /* This is the function called when the Guest's timer expires. */ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) { - struct lguest *lg = container_of(timer, struct lguest, hrt); + struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); /* Remember the first interrupt is the timer interrupt. */ - set_bit(0, lg->irqs_pending); + set_bit(0, cpu->irqs_pending); /* If the Guest is actually stopped, we need to wake it up. */ - if (lg->halted) - wake_up_process(lg->tsk); + if (cpu->halted) + wake_up_process(cpu->tsk); return HRTIMER_NORESTART; } /* This sets up the timer for this Guest. */ -void init_clockdev(struct lguest *lg) +void init_clockdev(struct lg_cpu *cpu) { - hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); - lg->hrt.function = clockdev_fn; + hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); + cpu->hrt.function = clockdev_fn; } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 86924891b5e..2337e1a06f0 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -8,6 +8,7 @@ #include <linux/lguest.h> #include <linux/lguest_launcher.h> #include <linux/wait.h> +#include <linux/hrtimer.h> #include <linux/err.h> #include <asm/semaphore.h> @@ -38,58 +39,72 @@ struct lguest_pages #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ #define CHANGED_ALL 3 -/* The private info the thread maintains about the guest. */ -struct lguest -{ - /* At end of a page shared mapped over lguest_pages in guest. */ - unsigned long regs_page; - struct lguest_regs *regs; - struct lguest_data __user *lguest_data; +struct lguest; + +struct lg_cpu { + unsigned int id; + struct lguest *lg; struct task_struct *tsk; struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ - u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ - void __user *mem_base; - unsigned long kernel_address; + u32 cr2; - int halted; int ts; - u32 next_hcall; u32 esp1; u8 ss1; + /* Bitmap of what has changed: see CHANGED_* above. */ + int changed; + + unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ + + /* At end of a page shared mapped over lguest_pages in guest. */ + unsigned long regs_page; + struct lguest_regs *regs; + + struct lguest_pages *last_pages; + + int cpu_pgd; /* which pgd this cpu is currently using */ + /* If a hypercall was asked for, this points to the arguments. 
*/ struct hcall_args *hcall; + u32 next_hcall; + + /* Virtual clock device */ + struct hrtimer hrt; /* Do we need to stop what we're doing and return to userspace? */ int break_out; wait_queue_head_t break_wq; + int halted; - /* Bitmap of what has changed: see CHANGED_* above. */ - int changed; - struct lguest_pages *last_pages; + /* Pending virtual interrupts */ + DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); + + struct lg_cpu_arch arch; +}; + +/* The private info the thread maintains about the guest. */ +struct lguest +{ + struct lguest_data __user *lguest_data; + struct lg_cpu cpus[NR_CPUS]; + unsigned int nr_cpus; + + u32 pfn_limit; + /* This provides the offset to the base of guest-physical + * memory in the Launcher. */ + void __user *mem_base; + unsigned long kernel_address; - /* We keep a small number of these. */ - u32 pgdidx; struct pgdir pgdirs[4]; unsigned long noirq_start, noirq_end; - unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ unsigned int stack_pages; u32 tsc_khz; /* Dead? */ const char *dead; - - struct lguest_arch arch; - - /* Virtual clock device */ - struct hrtimer hrt; - - /* Pending virtual interrupts */ - DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); }; extern struct mutex lguest_lock; @@ -97,26 +112,26 @@ extern struct mutex lguest_lock; /* core.c: */ int lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); -void __lgread(struct lguest *, void *, unsigned long, unsigned); -void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); +void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); +void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); /*H:035 Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * * This reads into a variable of the given type then returns that. */ -#define lgread(lg, addr, type) \ - ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) +#define lgread(cpu, addr, type) \ + ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) /* This checks that the variable is of the given type, then writes it out. */ -#define lgwrite(lg, addr, type, val) \ +#define lgwrite(cpu, addr, type, val) \ do { \ typecheck(type, val); \ - __lgwrite((lg), (addr), &(val), sizeof(val)); \ + __lgwrite((cpu), (addr), &(val), sizeof(val)); \ } while(0) /* (end of memory access helper routines) :*/ -int run_guest(struct lguest *lg, unsigned long __user *user); +int run_guest(struct lg_cpu *cpu, unsigned long __user *user); /* Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. 
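[Editor's aside: a usage note on the lgread()/lgwrite() macros above, since the typeof trickery is easy to misread: lgread() evaluates to the value read, while lgwrite() typechecks what you hand it. The pattern, as the page-table code later in this patch uses it:

	/* Fetch a Guest pte, mark it accessed, write it back. */
	pte_t gpte = lgread(cpu, gpte_ptr, pte_t);
	gpte = pte_mkyoung(gpte);
	lgwrite(cpu, gpte_ptr, pte_t, gpte);

End of aside.]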
pte_pfn is already defined @@ -126,52 +141,53 @@ int run_guest(struct lguest *lg, unsigned long __user *user); #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -void maybe_do_interrupt(struct lguest *lg); -int deliver_trap(struct lguest *lg, unsigned int num); -void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); -void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); -void pin_stack_pages(struct lguest *lg); +void maybe_do_interrupt(struct lg_cpu *cpu); +int deliver_trap(struct lg_cpu *cpu, unsigned int num); +void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, + u32 low, u32 hi); +void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); +void pin_stack_pages(struct lg_cpu *cpu); void setup_default_idt_entries(struct lguest_ro_state *state, const unsigned long *def); -void copy_traps(const struct lguest *lg, struct desc_struct *idt, +void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def); -void guest_set_clockevent(struct lguest *lg, unsigned long delta); -void init_clockdev(struct lguest *lg); +void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); +void init_clockdev(struct lg_cpu *cpu); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); void free_interrupts(void); /* segments.c: */ void setup_default_gdt_entries(struct lguest_ro_state *state); -void setup_guest_gdt(struct lguest *lg); -void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num); -void guest_load_tls(struct lguest *lg, unsigned long tls_array); -void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); -void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); +void setup_guest_gdt(struct lg_cpu *cpu); +void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num); +void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); +void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); +void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); /* page_tables.c: */ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); void free_guest_pagetable(struct lguest *lg); -void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); +void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); -void guest_pagetable_clear_all(struct lguest *lg); -void guest_pagetable_flush_user(struct lguest *lg); -void guest_set_pte(struct lguest *lg, unsigned long gpgdir, +void guest_pagetable_clear_all(struct lg_cpu *cpu); +void guest_pagetable_flush_user(struct lg_cpu *cpu); +void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t val); -void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); -int demand_page(struct lguest *info, unsigned long cr2, int errcode); -void pin_page(struct lguest *lg, unsigned long vaddr); -unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); -void page_table_guest_data_init(struct lguest *lg); +void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); +int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); +void pin_page(struct lg_cpu *cpu, unsigned long vaddr); +unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); +void page_table_guest_data_init(struct lg_cpu *cpu); /* <arch>/core.c: */ void lguest_arch_host_init(void); void lguest_arch_host_fini(void); -void 
lguest_arch_run_guest(struct lguest *lg); -void lguest_arch_handle_trap(struct lguest *lg); -int lguest_arch_init_hypercalls(struct lguest *lg); -int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); -void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); +void lguest_arch_run_guest(struct lg_cpu *cpu); +void lguest_arch_handle_trap(struct lg_cpu *cpu); +int lguest_arch_init_hypercalls(struct lg_cpu *cpu); +int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); +void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); /* <arch>/switcher.S: */ extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; @@ -181,8 +197,8 @@ int lguest_device_init(void); void lguest_device_remove(void); /* hypercalls.c: */ -void do_hypercalls(struct lguest *lg); -void write_timestamp(struct lguest *lg); +void do_hypercalls(struct lg_cpu *cpu); +void write_timestamp(struct lg_cpu *cpu); /*L:035 * Let's step aside for the moment, to study one important routine that's used @@ -208,12 +224,12 @@ void write_timestamp(struct lguest *lg); * Like any macro which uses an "if", it is safely wrapped in a run-once "do { * } while(0)". */ -#define kill_guest(lg, fmt...) \ +#define kill_guest(cpu, fmt...) \ do { \ - if (!(lg)->dead) { \ - (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ - if (!(lg)->dead) \ - (lg)->dead = ERR_PTR(-ENOMEM); \ + if (!(cpu)->lg->dead) { \ + (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ + if (!(cpu)->lg->dead) \ + (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ } \ } while(0) /* (End of aside) :*/ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 3b92a61ba8d..85d42d3d01a 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -6,6 +6,7 @@ #include <linux/uaccess.h> #include <linux/miscdevice.h> #include <linux/fs.h> +#include <linux/sched.h> #include "lg.h" /*L:055 When something happens, the Waker process needs a way to stop the @@ -13,7 +14,7 @@ * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release * the Waker. */ -static int break_guest_out(struct lguest *lg, const unsigned long __user *input) +static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) { unsigned long on; @@ -22,21 +23,21 @@ static int break_guest_out(struct lguest *lg, const unsigned long __user *input) return -EFAULT; if (on) { - lg->break_out = 1; + cpu->break_out = 1; /* Pop it out of the Guest (may be running on different CPU) */ - wake_up_process(lg->tsk); + wake_up_process(cpu->tsk); /* Wait for them to reset it */ - return wait_event_interruptible(lg->break_wq, !lg->break_out); + return wait_event_interruptible(cpu->break_wq, !cpu->break_out); } else { - lg->break_out = 0; - wake_up(&lg->break_wq); + cpu->break_out = 0; + wake_up(&cpu->break_wq); return 0; } } /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. */ -static int user_send_irq(struct lguest *lg, const unsigned long __user *input) +static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long irq; @@ -46,7 +47,7 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input) return -EINVAL; /* Next time the Guest runs, the core code will see if it can deliver * this interrupt. 
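[Editor's aside: from the Launcher's side, user_send_irq() above is reached by a plain write to /dev/lguest, with the vcpu selected by the file offset. A sketch; lguest_fd, irq and cpu_id are whatever the Launcher has on hand:

	/* Launcher side: inject interrupt "irq" into vcpu "cpu_id". */
	unsigned long buf[] = { LHREQ_IRQ, irq };

	if (pwrite(lguest_fd, buf, sizeof(buf), cpu_id) < 0)
		err(1, "Injecting irq %lu", irq);

End of aside.]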
*/ - set_bit(irq, lg->irqs_pending); + set_bit(irq, cpu->irqs_pending); return 0; } @@ -55,13 +56,21 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input) static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; + struct lg_cpu *cpu; + unsigned int cpu_id = *o; /* You must write LHREQ_INITIALIZE first! */ if (!lg) return -EINVAL; + /* Watch out for arbitrary vcpu indexes! */ + if (cpu_id >= lg->nr_cpus) + return -EINVAL; + + cpu = &lg->cpus[cpu_id]; + /* If you're not the task which owns the Guest, go away. */ - if (current != lg->tsk) + if (current != cpu->tsk) return -EPERM; /* If the guest is already dead, we indicate why */ @@ -81,11 +90,53 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) /* If we returned from read() last time because the Guest notified, * clear the flag. */ - if (lg->pending_notify) - lg->pending_notify = 0; + if (cpu->pending_notify) + cpu->pending_notify = 0; /* Run the Guest until something interesting happens. */ - return run_guest(lg, (unsigned long __user *)user); + return run_guest(cpu, (unsigned long __user *)user); +} + +static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) +{ + if (id >= NR_CPUS) + return -EINVAL; + + cpu->id = id; + cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); + cpu->lg->nr_cpus++; + init_clockdev(cpu); + + /* We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. */ + cpu->regs_page = get_zeroed_page(GFP_KERNEL); + if (!cpu->regs_page) + return -ENOMEM; + + /* We actually put the registers at the bottom of the page. */ + cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); + + /* Now we initialize the Guest's registers, handing it the start + * address. */ + lguest_arch_setup_regs(cpu, start_ip); + + /* Initialize the queue for the waker to wait on */ + init_waitqueue_head(&cpu->break_wq); + + /* We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (inter-Guest I/O). */ + cpu->tsk = current; + + /* We need to keep a pointer to the Launcher's memory map, because if + * the Launcher dies we need to clean it up. If we don't keep a + * reference, it is destroyed before close() is called. */ + cpu->mm = get_task_mm(cpu->tsk); + + /* We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. */ + cpu->last_pages = NULL; + + return 0; } /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) @@ -134,15 +185,10 @@ static int initialize(struct file *file, const unsigned long __user *input) lg->mem_base = (void __user *)(long)args[0]; lg->pfn_limit = args[1]; - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ - lg->regs_page = get_zeroed_page(GFP_KERNEL); - if (!lg->regs_page) { - err = -ENOMEM; + /* This is the first cpu */ + err = lg_cpu_start(&lg->cpus[0], 0, args[3]); + if (err) goto release_guest; - } - /* We actually put the registers at the bottom of the page. */ - lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us. 
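[Editor's aside: the back-pointer computation in lg_cpu_start() above deserves a second look. The caller hands in &lg->cpus[id], so stepping back id elements lands on cpus[0], and container_of() walks from there to the enclosing structure; spelled out:

	/* cpu == &lg->cpus[id], so (cpu - id) == &lg->cpus[0], and
	 * cpus[0] sits inside struct lguest at a known offset. */
	struct lguest *lg = container_of(cpu - id, struct lguest, cpus[0]);

End of aside.]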
This allocates memory, so can @@ -151,28 +197,6 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_regs; - /* Now we initialize the Guest's registers, handing it the start - * address. */ - lguest_arch_setup_regs(lg, args[3]); - - /* The timer for lguest's clock needs initialization. */ - init_clockdev(lg); - - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (inter-Guest I/O). */ - lg->tsk = current; - /* We need to keep a pointer to the Launcher's memory map, because if - * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ - lg->mm = get_task_mm(lg->tsk); - - /* Initialize the queue for the waker to wait on */ - init_waitqueue_head(&lg->break_wq); - - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ - lg->last_pages = NULL; - /* We keep our "struct lguest" in the file's private_data. */ file->private_data = lg; @@ -182,7 +206,8 @@ static int initialize(struct file *file, const unsigned long __user *input) return sizeof(args); free_regs: - free_page(lg->regs_page); + /* FIXME: This should be in free_vcpu */ + free_page(lg->cpus[0].regs_page); release_guest: kfree(lg); unlock: @@ -202,30 +227,37 @@ static ssize_t write(struct file *file, const char __user *in, struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; + struct lg_cpu *uninitialized_var(cpu); + unsigned int cpu_id = *off; if (get_user(req, input) != 0) return -EFAULT; input++; /* If you haven't initialized, you must do that first. */ - if (req != LHREQ_INITIALIZE && !lg) - return -EINVAL; + if (req != LHREQ_INITIALIZE) { + if (!lg || (cpu_id >= lg->nr_cpus)) + return -EINVAL; + cpu = &lg->cpus[cpu_id]; + if (!cpu) + return -EINVAL; + } /* Once the Guest is dead, all you can do is read() why it died. */ if (lg && lg->dead) return -ENOENT; /* If you're not the task which owns the Guest, you can only break */ - if (lg && current != lg->tsk && req != LHREQ_BREAK) + if (lg && current != cpu->tsk && req != LHREQ_BREAK) return -EPERM; switch (req) { case LHREQ_INITIALIZE: return initialize(file, input); case LHREQ_IRQ: - return user_send_irq(lg, input); + return user_send_irq(cpu, input); case LHREQ_BREAK: - return break_guest_out(lg, input); + return break_guest_out(cpu, input); default: return -EINVAL; } @@ -241,6 +273,7 @@ static ssize_t write(struct file *file, const char __user *in, static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; + unsigned int i; /* If we never successfully initialized, there's nothing to clean up */ if (!lg) @@ -249,19 +282,23 @@ static int close(struct inode *inode, struct file *file) /* We need the big lock, to protect from inter-guest I/O and other * Launchers initializing guests. */ mutex_lock(&lguest_lock); - /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ - hrtimer_cancel(&lg->hrt); + /* Free up the shadow page tables for the Guest. */ free_guest_pagetable(lg); - /* Now all the memory cleanups are done, it's safe to release the - * Launcher's memory management structure. */ - mmput(lg->mm); + + for (i = 0; i < lg->nr_cpus; i++) { + /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ + hrtimer_cancel(&lg->cpus[i].hrt); + /* We can free up the register page we allocated. 
*/ + free_page(lg->cpus[i].regs_page); + /* Now all the memory cleanups are done, it's safe to release + * the Launcher's memory management structure. */ + mmput(lg->cpus[i].mm); + } /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) kfree(lg->dead); - /* We can free up the register page we allocated. */ - free_page(lg->regs_page); /* We clear the entire structure, which also marks it as free for the * next user. */ memset(lg, 0, sizeof(*lg)); diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index fffabb32715..74b4cf2a6c4 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's * usually the current one). */ -static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) +static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { - kill_guest(lg, "attempt to access switcher pages"); + kill_guest(cpu, "attempt to access switcher pages"); index = 0; } /* Return a pointer index'th pgd entry for the i'th page table. */ - return &lg->pgdirs[i].pgdir[index]; + return &cpu->lg->pgdirs[i].pgdir[index]; } /* This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a * pointer to the PTE entry for the given address. */ -static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) { pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ @@ -94,14 +94,13 @@ static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) /* These two functions just like the above two, except they access the Guest * page tables. Hence they return a Guest address. */ -static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) +static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); - return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); + return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); } -static unsigned long gpte_addr(struct lguest *lg, - pgd_t gpgd, unsigned long vaddr) +static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); @@ -138,7 +137,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page * number. */ -static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) +static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) { unsigned long pfn, base, flags; @@ -149,7 +148,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. 
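[Editor's aside: the *_addr() helpers above are all pieces of the same two-level walk. For the 32-bit, non-PAE Guests lguest supports, the address split is as follows; illustrative arithmetic, not from the patch:

	/* A virtual address is two 10-bit indexes plus a 12-bit offset:
	 * the pgd index selects a PTE page, the pte index a page. */
	unsigned int pgd_idx = vaddr >> PGDIR_SHIFT;	     /* top 10 bits */
	unsigned int pte_idx = (vaddr >> PAGE_SHIFT) & 1023; /* middle 10 */
	unsigned int off     = vaddr & ~PAGE_MASK;	     /* low 12 bits */

End of aside.]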
*/ - base = (unsigned long)lg->mem_base / PAGE_SIZE; + base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; /* We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't @@ -157,7 +156,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) * page, given the virtual number. */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { - kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); + kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); /* When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think * this one is valid! */ @@ -177,17 +176,18 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lguest *lg, pte_t gpte) +static void check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) - || pte_pfn(gpte) >= lg->pfn_limit) - kill_guest(lg, "bad page table entry"); + || pte_pfn(gpte) >= cpu->lg->pfn_limit) + kill_guest(cpu, "bad page table entry"); } -static void check_gpgd(struct lguest *lg, pgd_t gpgd) +static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { - if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) - kill_guest(lg, "bad page directory entry"); + if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || + (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) + kill_guest(cpu, "bad page directory entry"); } /*H:330 @@ -200,7 +200,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd) * * If we fixed up the fault (ie. we mapped the address), this routine returns * true. Otherwise, it was a real fault and we need to tell the Guest. */ -int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) +int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; pgd_t *spgd; @@ -209,24 +209,24 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) pte_t *spte; /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); + gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) return 0; /* Now look at the matching shadow entry. */ - spgd = spgd_addr(lg, lg->pgdidx, vaddr); + spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); /* This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { - kill_guest(lg, "out of memory allocating pte page"); + kill_guest(cpu, "out of memory allocating pte page"); return 0; } /* We check that the Guest pgd is OK. */ - check_gpgd(lg, gpgd); + check_gpgd(cpu, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); @@ -234,8 +234,8 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ - gpte_ptr = gpte_addr(lg, gpgd, vaddr); - gpte = lgread(lg, gpte_ptr, pte_t); + gpte_ptr = gpte_addr(gpgd, vaddr); + gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. 
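[Editor's aside: demand_page()'s errcode follows the hardware page-fault error code, which is also why pin_page() further down passes 2 for "write". The relevant bits, with ad-hoc names for reference:

	/* x86 page-fault error code bits, as demand_page() reads them: */
	#define FAULT_PROT	1	/* 0: page not present, 1: protection */
	#define FAULT_WRITE	2	/* the faulting access was a write */
	#define FAULT_USER	4	/* the fault came from user mode */

End of aside.]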
*/ if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -252,7 +252,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) /* Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(lg, gpte); + check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -260,7 +260,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(lg, *spgd, vaddr); + spte = spte_addr(*spgd, vaddr); /* If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. */ release_pte(*spte); @@ -268,17 +268,17 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) /* If this is a write, we insist that the Guest page is writable (the * final arg to gpte_to_spte()). */ if (pte_dirty(gpte)) - *spte = gpte_to_spte(lg, gpte, 1); + *spte = gpte_to_spte(cpu, gpte, 1); else /* If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); + *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ - lgwrite(lg, gpte_ptr, pte_t, gpte); + lgwrite(cpu, gpte_ptr, pte_t, gpte); /* The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small @@ -297,19 +297,19 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) * * This is a quick version which answers the question: is this virtual address * mapped by the shadow page tables, and is it writable? */ -static int page_writable(struct lguest *lg, unsigned long vaddr) +static int page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; unsigned long flags; /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(lg, lg->pgdidx, vaddr); + spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return 0; /* Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); + flags = pte_flags(*(spte_addr(*spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -317,10 +317,10 @@ static int page_writable(struct lguest *lg, unsigned long vaddr) /* So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 * (meaning "write"). */ -void pin_page(struct lguest *lg, unsigned long vaddr) +void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { - if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) - kill_guest(lg, "bad stack page %#lx", vaddr); + if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) + kill_guest(cpu, "bad stack page %#lx", vaddr); } /*H:450 If we chase down the release_pgd() code, it looks like this: */ @@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx) * * The Guest has a hypercall to throw away the page tables: it's used when a * large number of mappings have been changed. 
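[Editor's aside: the Guest-side callers of these two flushes are its tlb_flush hooks; approximately as below, recalled from the guest boot code and not part of this diff:

	/* arg1 == 0: drop only user mappings (guest_pagetable_flush_user);
	 * arg1 == 1: throw everything away (guest_pagetable_clear_all). */
	static void lguest_flush_tlb_user(void)
	{
		lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
	}

	static void lguest_flush_tlb_kernel(void)
	{
		lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
	}

End of aside.]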
*/ -void guest_pagetable_flush_user(struct lguest *lg) +void guest_pagetable_flush_user(struct lg_cpu *cpu) { /* Drop the userspace part of the current page table. */ - flush_user_mappings(lg, lg->pgdidx); + flush_user_mappings(cpu->lg, cpu->cpu_pgd); } /*:*/ /* We walk down the guest page tables to get a guest-physical address */ -unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) +unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t gpgd; pte_t gpte; /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); + gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - kill_guest(lg, "Bad address %#lx", vaddr); + kill_guest(cpu, "Bad address %#lx", vaddr); - gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); + gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); if (!(pte_flags(gpte) & _PAGE_PRESENT)) - kill_guest(lg, "Bad address %#lx", vaddr); + kill_guest(cpu, "Bad address %#lx", vaddr); return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } @@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) /*H:435 And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set * blank_pgdir. */ -static unsigned int new_pgdir(struct lguest *lg, +static unsigned int new_pgdir(struct lg_cpu *cpu, unsigned long gpgdir, int *blank_pgdir) { @@ -407,22 +407,23 @@ static unsigned int new_pgdir(struct lguest *lg, /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ - next = random32() % ARRAY_SIZE(lg->pgdirs); + next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ - if (!lg->pgdirs[next].pgdir) { - lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); + if (!cpu->lg->pgdirs[next].pgdir) { + cpu->lg->pgdirs[next].pgdir = + (pgd_t *)get_zeroed_page(GFP_KERNEL); /* If the allocation fails, just keep using the one we have */ - if (!lg->pgdirs[next].pgdir) - next = lg->pgdidx; + if (!cpu->lg->pgdirs[next].pgdir) + next = cpu->cpu_pgd; else /* This is a blank page, so there are no kernel * mappings: caller must map the stack! */ *blank_pgdir = 1; } /* Record which Guest toplevel this shadows. */ - lg->pgdirs[next].gpgdir = gpgdir; + cpu->lg->pgdirs[next].gpgdir = gpgdir; /* Release all the non-kernel mappings. */ - flush_user_mappings(lg, next); + flush_user_mappings(cpu->lg, next); return next; } @@ -432,21 +433,21 @@ static unsigned int new_pgdir(struct lguest *lg, * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ -void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) +void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ - newpgdir = find_pgdir(lg, pgtable); + newpgdir = find_pgdir(cpu->lg, pgtable); /* If not, we allocate or mug an existing one: if it's a fresh one, * repin gets set to 1. */ - if (newpgdir == ARRAY_SIZE(lg->pgdirs)) - newpgdir = new_pgdir(lg, pgtable, &repin); + if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) + newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one.
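new_pgdir() above evicts a shadow top level chosen by random32() rather than tracking LRU, as the comment says: for an array this small the bookkeeping would cost more than the occasional bad eviction. The policy in miniature, with rand() standing in for random32() (userspace sketch, toy names):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NSLOTS 4	/* lguest's pgdirs[] array is similarly tiny */

struct slot { unsigned long gpgdir; int in_use; };

/* Random replacement needs no access history at all: that is the
 * whole attraction over LRU for a cache this small. */
static unsigned int pick_victim(void)
{
	return (unsigned int)(rand() % NSLOTS);
}

int main(void)
{
	struct slot cache[NSLOTS] = { { 0, 0 } };
	unsigned long toplevels[] = { 0x1000, 0x2000, 0x3000, 0x4000, 0x5000 };
	unsigned int i, v;

	srand((unsigned int)time(NULL));
	for (i = 0; i < sizeof(toplevels) / sizeof(toplevels[0]); i++) {
		v = pick_victim();
		cache[v].gpgdir = toplevels[i];	/* evict whatever was there */
		cache[v].in_use = 1;
		printf("guest pgdir %#lx -> slot %u\n", toplevels[i], v);
	}
	return 0;
}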
*/ - lg->pgdidx = newpgdir; + cpu->cpu_pgd = newpgdir; /* If it was completely blank, we map in the Guest kernel stack */ if (repin) - pin_stack_pages(lg); + pin_stack_pages(cpu); } /*H:470 Finally, a routine which throws away everything: all PGD entries in all @@ -468,11 +469,11 @@ static void release_all_pagetables(struct lguest *lg) * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as * everything faults back in, but it's rare. */ -void guest_pagetable_clear_all(struct lguest *lg) +void guest_pagetable_clear_all(struct lg_cpu *cpu) { - release_all_pagetables(lg); + release_all_pagetables(cpu->lg); /* We need the Guest kernel stack mapped again. */ - pin_stack_pages(lg); + pin_stack_pages(cpu); } /*:*/ /*M:009 Since we throw away all mappings when a kernel mapping changes, our @@ -497,24 +498,24 @@ void guest_pagetable_clear_all(struct lguest *lg) * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */ -static void do_set_pte(struct lguest *lg, int idx, +static void do_set_pte(struct lg_cpu *cpu, int idx, unsigned long vaddr, pte_t gpte) { /* Look up the matching shadow page directory entry. */ - pgd_t *spgd = spgd_addr(lg, idx, vaddr); + pgd_t *spgd = spgd_addr(cpu, idx, vaddr); /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { /* Otherwise, we start by releasing the existing entry. */ - pte_t *spte = spte_addr(lg, *spgd, vaddr); + pte_t *spte = spte_addr(*spgd, vaddr); release_pte(*spte); /* If they're setting this entry as dirty or accessed, we might * as well put that entry they've given us in now. This shaves * 10% off a copy-on-write micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(lg, gpte); - *spte = gpte_to_spte(lg, gpte, + check_gpte(cpu, gpte); + *spte = gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY); } else /* Otherwise kill it and we can demand_page() it in @@ -533,22 +534,22 @@ static void do_set_pte(struct lguest *lg, int idx, * * The benefit is that when we have to track a new page table, we can keep * all the kernel mappings. This speeds up context switch immensely. */ -void guest_set_pte(struct lguest *lg, +void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { /* Kernel mappings must be changed on all top levels. Slow, but * doesn't happen often. */ - if (vaddr >= lg->kernel_address) { + if (vaddr >= cpu->lg->kernel_address) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) - do_set_pte(lg, i, vaddr, gpte); + for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) + if (cpu->lg->pgdirs[i].pgdir) + do_set_pte(cpu, i, vaddr, gpte); } else { /* Is this page table one we have a shadow for? */ - int pgdir = find_pgdir(lg, gpgdir); - if (pgdir != ARRAY_SIZE(lg->pgdirs)) + int pgdir = find_pgdir(cpu->lg, gpgdir); + if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) /* If so, do the update. */ - do_set_pte(lg, pgdir, vaddr, gpte); + do_set_pte(cpu, pgdir, vaddr, gpte); } } @@ -590,30 +591,32 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) { /* We start on the first shadow page table, and give it a blank PGD * page.
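guest_set_pte() above splits on the address: kernel mappings appear in every shadow page table, so they must be updated in all of them, while a userspace mapping only matters in the shadow (if any) tracking that particular guest pgdir. The same control flow over toy types (a sketch; none of these names are the kernel's):

#include <stdio.h>

#define NPGDIRS		4
#define KERNEL_START	0xc0000000UL

struct shadow { unsigned long gpgdir; int in_use; };
static struct shadow pgdirs[NPGDIRS];

static void update_one(unsigned int idx, unsigned long vaddr)
{
	printf("update slot %u for %#lx\n", idx, vaddr);
}

/* Same "not found" convention as find_pgdir(): return the array size. */
static unsigned int find_shadow(unsigned long gpgdir)
{
	unsigned int i;

	for (i = 0; i < NPGDIRS; i++)
		if (pgdirs[i].in_use && pgdirs[i].gpgdir == gpgdir)
			break;
	return i;
}

static void set_pte(unsigned long gpgdir, unsigned long vaddr)
{
	unsigned int i;

	if (vaddr >= KERNEL_START) {		/* kernel: hit every shadow */
		for (i = 0; i < NPGDIRS; i++)
			if (pgdirs[i].in_use)
				update_one(i, vaddr);
	} else {				/* user: only the match */
		i = find_shadow(gpgdir);
		if (i != NPGDIRS)
			update_one(i, vaddr);
	}
}

int main(void)
{
	pgdirs[1].in_use = 1;
	pgdirs[1].gpgdir = 0x1000;
	set_pte(0x1000, 0x08048000UL);	/* user address: one update */
	set_pte(0x1000, 0xc0100000UL);	/* kernel address: all shadows */
	return 0;
}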
*/ - lg->pgdidx = 0; - lg->pgdirs[lg->pgdidx].gpgdir = pgtable; - lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); - if (!lg->pgdirs[lg->pgdidx].pgdir) + lg->pgdirs[0].gpgdir = pgtable; + lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[0].pgdir) return -ENOMEM; + lg->cpus[0].cpu_pgd = 0; return 0; } /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ -void page_table_guest_data_init(struct lguest *lg) +void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ - if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) + if (get_user(cpu->lg->kernel_address, + &cpu->lg->lguest_data->kernel_address) /* We tell the Guest that it can't use the top 4MB of virtual * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) - || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) - kill_guest(lg, "bad guest page %p", lg->lguest_data); + || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) + || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) + kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ - if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) - kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); + if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) + kill_guest(cpu, "bad kernel address %#lx", + cpu->lg->kernel_address); } /* When a Guest dies, our cleanup is fairly simple. */ @@ -634,17 +637,18 @@ void free_guest_pagetable(struct lguest *lg) * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which * Guest is about to run on this CPU. */ -void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) +void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); pgd_t switcher_pgd; pte_t regs_pte; + unsigned long pfn; /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); + switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); - lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher @@ -653,7 +657,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out * again. 
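The sanity check at the end of page_table_guest_data_init() is pure index arithmetic: flush_user_mappings() walks PGD slots 0 up to pgd_index(kernel_address), so that index must stay below the Switcher's slot. A standalone check of the arithmetic, assuming the two-level layout and taking SWITCHER_PGD_INDEX as the top slot, 1023 (both constants assumed here, not quoted from the tree):

#include <stdio.h>

#define PGDIR_SHIFT		22
#define SWITCHER_PGD_INDEX	1023	/* assumed: top 4MB of the address space */

static unsigned int pgd_index(unsigned long vaddr)
{
	return (unsigned int)(vaddr >> PGDIR_SHIFT);
}

int main(void)
{
	unsigned long kernel_address = 0xc0000000UL;	/* the usual PAGE_OFFSET */

	if (pgd_index(kernel_address) >= SWITCHER_PGD_INDEX)
		printf("bad kernel address %#lx\n", kernel_address);
	else
		printf("kernel starts in pgd slot %u, switcher owns slot %u\n",
		       pgd_index(kernel_address), SWITCHER_PGD_INDEX);
	return 0;
}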
*/ - regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); + pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; + regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; } /*:*/ diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 9e189cbec7d..ec6aa3f1c36 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num) * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a * mess: the message will be "unhandled trap 256". */ -static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) +static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) { unsigned int i; @@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) /* Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's * running at privilege level 1. If so, we fix it here. */ - if ((lg->arch.gdt[i].b & 0x00006000) == 0) - lg->arch.gdt[i].b |= (GUEST_PL << 13); + if ((cpu->arch.gdt[i].b & 0x00006000) == 0) + cpu->arch.gdt[i].b |= (GUEST_PL << 13); /* Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't * writable by the Guest, so bad things can happen. */ - lg->arch.gdt[i].b |= 0x00000100; + cpu->arch.gdt[i].b |= 0x00000100; } } @@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) /* This routine sets up the initial Guest GDT for booting. All entries start * as 0 (unusable). */ -void setup_guest_gdt(struct lguest *lg) +void setup_guest_gdt(struct lg_cpu *cpu) { /* Start with full 0-4G segments... */ - lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; - lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; + cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; + cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; /* ...except the Guest is allowed to use them, so set the privilege * level appropriately in the flags. */ - lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); - lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); + cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); + cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } /*H:650 An optimization of copy_gdt(), for just the three "thread-local storage" * entries. */ -void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) +void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) - gdt[i] = lg->arch.gdt[i]; + gdt[i] = cpu->arch.gdt[i]; } /*H:640 When the Guest is run on a different CPU, or the GDT entries have * changed, copy_gdt() is called to copy the Guest's GDT entries across to this * CPU's GDT. */ -void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) +void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; @@ -141,38 +141,38 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) * replaced. See ignored_gdt() above. */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) - gdt[i] = lg->arch.gdt[i]; + gdt[i] = cpu->arch.gdt[i]; } /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
* We copy it from the Guest and tweak the entries. */ -void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) +void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num) { /* We assume the Guest has the same number of GDT entries as the * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ - if (num > ARRAY_SIZE(lg->arch.gdt)) - kill_guest(lg, "too many gdt entries %i", num); + if (num > ARRAY_SIZE(cpu->arch.gdt)) + kill_guest(cpu, "too many gdt entries %i", num); /* We read the whole thing in, then fix it up. */ - __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); - fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); + __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0])); + fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt)); /* Mark that the GDT changed so the core knows it has to copy it again, * even if the Guest is run on the same CPU. */ - lg->changed |= CHANGED_GDT; + cpu->changed |= CHANGED_GDT; } /* This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover * both cases? */ -void guest_load_tls(struct lguest *lg, unsigned long gtls) +void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) { - struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; + struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; - __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); - fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); + __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); + fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); /* Note that just the TLS entries have changed. */ - lg->changed |= CHANGED_GDT_TLS; + cpu->changed |= CHANGED_GDT_TLS; } /*:*/ diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 44adb00e149..61f2f8eb8ca 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu) (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); } -static DEFINE_PER_CPU(struct lguest *, last_guest); +static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); /*S:010 * We approach the Switcher. @@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lguest *, last_guest); * since it last ran. We saw this set in interrupts_and_traps.c and * segments.c. */ -static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) +static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { /* Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the * Guest has changed. */ - if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { - __get_cpu_var(last_guest) = lg; - lg->last_pages = pages; - lg->changed = CHANGED_ALL; + if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { + __get_cpu_var(last_cpu) = cpu; + cpu->last_pages = pages; + cpu->changed = CHANGED_ALL; } /* These copies are pretty cheap, so we do them unconditionally: */ @@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) pages->state.host_cr3 = __pa(current->mm->pgd); /* Set up the Guest's page tables to see this CPU's pages (and no * other CPU's pages). 
*/ - map_switcher_in_guest(lg, pages); + map_switcher_in_guest(cpu, pages); /* Set up the two "TSS" members which tell the CPU what stack to use * for traps which go directly into the Guest (ie. traps at privilege * level 1). */ - pages->state.guest_tss.sp1 = lg->esp1; - pages->state.guest_tss.ss1 = lg->ss1; + pages->state.guest_tss.esp1 = cpu->esp1; + pages->state.guest_tss.ss1 = cpu->ss1; /* Copy direct-to-Guest trap entries. */ - if (lg->changed & CHANGED_IDT) - copy_traps(lg, pages->state.guest_idt, default_idt_entries); + if (cpu->changed & CHANGED_IDT) + copy_traps(cpu, pages->state.guest_idt, default_idt_entries); /* Copy all GDT entries which the Guest can change. */ - if (lg->changed & CHANGED_GDT) - copy_gdt(lg, pages->state.guest_gdt); + if (cpu->changed & CHANGED_GDT) + copy_gdt(cpu, pages->state.guest_gdt); /* If only the TLS entries have changed, copy them. */ - else if (lg->changed & CHANGED_GDT_TLS) - copy_gdt_tls(lg, pages->state.guest_gdt); + else if (cpu->changed & CHANGED_GDT_TLS) + copy_gdt_tls(cpu, pages->state.guest_gdt); /* Mark the Guest as unchanged for next time. */ - lg->changed = 0; + cpu->changed = 0; } /* Finally: the code to actually call into the Switcher to run the Guest. */ -static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) +static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) { /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; /* Copy the guest-specific information into this CPU's "struct * lguest_pages". */ - copy_in_guest_info(lg, pages); + copy_in_guest_info(cpu, pages); /* Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will * cause us to abort the Guest. */ - lg->regs->trapnum = 256; + cpu->regs->trapnum = 256; /* Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using @@ -143,7 +143,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page * directory. */ - : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) + : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) /* We tell gcc that all these registers could change, * which means we don't have to save and restore them in * the Switcher. */ @@ -161,12 +161,12 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts * are disabled: we own the CPU. */ -void lguest_arch_run_guest(struct lguest *lg) +void lguest_arch_run_guest(struct lg_cpu *cpu) { /* Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it * uses the FPU. */ - if (lg->ts) + if (cpu->ts) lguest_set_ts(); /* SYSENTER is an optimized way of doing system calls. We can't allow @@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lguest *lg) /* Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it * was doing.
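copy_in_guest_info() above is a textbook dirty-flag scheme: a per-CPU pointer remembers which vcpu ran on this CPU last, and the changed bitmask gates each copy, with CHANGED_ALL forced whenever a different vcpu shows up. The shape of it over toy types (flag names follow the diff; the struct layouts and sizes are made up):

#include <string.h>

#define CHANGED_IDT	0x1
#define CHANGED_GDT	0x2
#define CHANGED_GDT_TLS	0x4
#define CHANGED_ALL	(CHANGED_IDT | CHANGED_GDT | CHANGED_GDT_TLS)

#define GDT_BYTES	256
#define TLS_OFF		96	/* made-up offset/size for the TLS slice */
#define TLS_BYTES	24

struct vcpu { unsigned int changed; unsigned char gdt[GDT_BYTES]; };
struct cpu_state { unsigned char gdt[GDT_BYTES]; };

static struct vcpu *last_vcpu;	/* DEFINE_PER_CPU in the kernel; one CPU here */

static void copy_in(struct vcpu *v, struct cpu_state *st)
{
	if (last_vcpu != v) {		/* someone else ran here: trust nothing */
		last_vcpu = v;
		v->changed = CHANGED_ALL;
	}
	if (v->changed & CHANGED_GDT)
		memcpy(st->gdt, v->gdt, GDT_BYTES);
	else if (v->changed & CHANGED_GDT_TLS)
		memcpy(st->gdt + TLS_OFF, v->gdt + TLS_OFF, TLS_BYTES);
	v->changed = 0;			/* clean until the guest dirties it */
}

int main(void)
{
	static struct vcpu v;
	static struct cpu_state st;

	copy_in(&v, &st);	/* first run: everything gets copied */
	copy_in(&v, &st);	/* same vcpu, nothing changed: no copies */
	return 0;
}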
*/ - run_guest_once(lg, lguest_pages(raw_smp_processor_id())); + run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); /* Note that the "regs" pointer contains two extra entries which are * not really registers: a trap number which says what interrupt or @@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lguest *lg) * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite * cr2, or we could even move off to a different CPU. */ - if (lg->regs->trapnum == 14) - lg->arch.last_pagefault = read_cr2(); + if (cpu->regs->trapnum == 14) + cpu->arch.last_pagefault = read_cr2(); /* Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. */ - else if (lg->regs->trapnum == 7) + else if (cpu->regs->trapnum == 7) math_state_restore(); /* Restore SYSENTER if it's supposed to be on. */ @@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lguest *lg) * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome * instructions and skip over it. We return true if we did. */ -static int emulate_insn(struct lguest *lg) +static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; /* The eip contains the *virtual* address of the Guest's instruction: * guest_pa just subtracts the Guest's page_offset. */ - unsigned long physaddr = guest_pa(lg, lg->regs->eip); + unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); /* This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege * level. */ - if ((lg->regs->cs & 3) != GUEST_PL) + if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ - insn = lgread(lg, physaddr, u8); + insn = lgread(cpu, physaddr, u8); /* 0x66 is an "operand prefix". It means it's using the upper 16 bits of the eax register. */ @@ -237,7 +237,7 @@ static int emulate_insn(struct lguest *lg) shift = 16; /* The instruction is 1 byte so far, read the next byte. */ insnlen = 1; - insn = lgread(lg, physaddr + insnlen, u8); + insn = lgread(cpu, physaddr + insnlen, u8); } /* We can ignore the lower bit for the moment and decode the 4 opcodes @@ -268,26 +268,26 @@ static int emulate_insn(struct lguest *lg) if (in) { /* Lower bit tells us whether it's a 16 or 32 bit access */ if (insn & 0x1) - lg->regs->eax = 0xFFFFFFFF; + cpu->regs->eax = 0xFFFFFFFF; else - lg->regs->eax |= (0xFFFF << shift); + cpu->regs->eax |= (0xFFFF << shift); } /* Finally, we've "done" the instruction, so move past it. */ - lg->regs->eip += insnlen; + cpu->regs->eip += insnlen; /* Success! */ return 1; } /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ -void lguest_arch_handle_trap(struct lguest *lg) +void lguest_arch_handle_trap(struct lg_cpu *cpu) { - switch (lg->regs->trapnum) { + switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ /* Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go * back into the Guest after we've done it. */ - if (lg->regs->errcode == 0) { - if (emulate_insn(lg)) + if (cpu->regs->errcode == 0) { + if (emulate_insn(cpu)) return; } break; @@ -301,7 +301,8 @@ void lguest_arch_handle_trap(struct lguest *lg) * * The errcode tells whether this was a read or a write, and * whether kernel or userspace code.
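emulate_insn() above only has to make a stray IN harmless, not accurate: it stuffs all-ones into the addressed part of eax, sized by the low opcode bit and the 0x66 prefix, then skips the instruction. The register fixup in isolation (a sketch mirroring the hunk's logic only; the real routine decodes the opcode and direction first):

#include <stdio.h>

/* Mirror of the IN fixup: 'wide' is the low opcode bit (full-width
 * access), 'shift' is 16 when a 0x66 operand prefix was seen. */
static unsigned int fake_in(int wide, unsigned int shift, unsigned int eax)
{
	if (wide)
		eax = 0xFFFFFFFF;		/* whole register reads ~0 */
	else
		eax |= 0xFFFFu << shift;	/* just the addressed half */
	return eax;
}

int main(void)
{
	printf("%#x\n", fake_in(1, 0, 0));	/* full-width IN */
	printf("%#x\n", fake_in(0, 16, 0));	/* 0x66-prefixed form */
	return 0;
}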
*/ - if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) + if (demand_page(cpu, cpu->arch.last_pagefault, + cpu->regs->errcode)) return; /* OK, it's really not there (or not OK): the Guest needs to @@ -311,15 +312,16 @@ void lguest_arch_handle_trap(struct lguest *lg) * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so * lg->lguest_data could be NULL */ - if (lg->lguest_data && - put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) - kill_guest(lg, "Writing cr2"); + if (cpu->lg->lguest_data && + put_user(cpu->arch.last_pagefault, + &cpu->lg->lguest_data->cr2)) + kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ /* If the Guest doesn't want to know, we already restored the * Floating Point Unit, so we just continue without telling * it. */ - if (!lg->ts) + if (!cpu->ts) return; break; case 32 ... 255: @@ -332,19 +334,19 @@ void lguest_arch_handle_trap(struct lguest *lg) case LGUEST_TRAP_ENTRY: /* Our 'struct hcall_args' maps directly over our regs: we set * up the pointer now to indicate a hypercall is pending. */ - lg->hcall = (struct hcall_args *)lg->regs; + cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ - if (!deliver_trap(lg, lg->regs->trapnum)) + if (!deliver_trap(cpu, cpu->regs->trapnum)) /* If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let * it handle), it dies with a cryptic error message. */ - kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", - lg->regs->trapnum, lg->regs->eip, - lg->regs->trapnum == 14 ? lg->arch.last_pagefault - : lg->regs->errcode); + kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", + cpu->regs->trapnum, cpu->regs->eip, + cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault + : cpu->regs->errcode); } /* Now we can look at each of the routines this calls, in increasing order of @@ -487,17 +489,17 @@ void __exit lguest_arch_host_fini(void) /*H:122 The i386-specific hypercalls simply farm out to the right functions. */ -int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) +int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_LOAD_GDT: - load_guest_gdt(lg, args->arg1, args->arg2); + load_guest_gdt(cpu, args->arg1, args->arg2); break; case LHCALL_LOAD_IDT_ENTRY: - load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3); + load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); break; case LHCALL_LOAD_TLS: - guest_load_tls(lg, args->arg1); + guest_load_tls(cpu, args->arg1); break; default: /* Bad Guest. Bad! */ @@ -507,13 +509,14 @@ int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) } /*H:126 i386-specific hypercall initialization: */ -int lguest_arch_init_hypercalls(struct lguest *lg) +int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; /* The pointer to the Guest's "struct lguest_data" is the only * argument. We check that address now. */ - if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) + if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, + sizeof(*cpu->lg->lguest_data))) return -EFAULT; /* Having checked it, we simply set lg->lguest_data to point straight @@ -521,7 +524,7 @@ int lguest_arch_init_hypercalls(struct lguest *lg) * copy_to_user/from_user from now on, instead of lgread/write. 
I put * this in to show that I'm not immune to writing stupid * optimizations. */ - lg->lguest_data = lg->mem_base + lg->hcall->arg1; + cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; /* We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC @@ -534,12 +537,12 @@ int lguest_arch_init_hypercalls(struct lguest *lg) tsc_speed = tsc_khz; else tsc_speed = 0; - if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) + if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) return -EFAULT; /* The interrupt code might not like the system call vector. */ - if (!check_syscall_vector(lg)) - kill_guest(lg, "bad syscall vector"); + if (!check_syscall_vector(cpu->lg)) + kill_guest(cpu, "bad syscall vector"); return 0; } @@ -548,9 +551,9 @@ int lguest_arch_init_hypercalls(struct lguest *lg) * * Most of the Guest's registers are left alone: we used get_zeroed_page() to * allocate the structure, so they will be 0. */ -void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) +void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { - struct lguest_regs *regs = lg->regs; + struct lguest_regs *regs = cpu->regs; /* There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment @@ -577,5 +580,5 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) /* There are a couple of GDT entries the Guest expects when first * booting. */ - setup_guest_gdt(lg); + setup_guest_gdt(cpu); } diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index e45f85f7c7e..0dff05840ee 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req) ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", fcp_rsp_iu->fcp_sns_len); - memcpy(&scpnt->sense_buffer, + memcpy(scpnt->sense_buffer, zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, - (void *) &scpnt->sense_buffer, sns_len); + (void *)scpnt->sense_buffer, sns_len); } /* check for overrun */ diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 1c244832c6c..b4912d1cee2 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -1990,7 +1990,6 @@ static struct scsi_host_template driver_template = { .max_sectors = TW_MAX_SECTORS, .cmd_per_lun = TW_MAX_CMDS_PER_LUN, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = twa_host_attrs, .emulated = 1 }; diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 59716ebeb10..d0953216221 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -2261,7 +2261,6 @@ static struct scsi_host_template driver_template = { .max_sectors = TW_MAX_SECTORS, .cmd_per_lun = TW_MAX_CMDS_PER_LUN, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = tw_host_attrs, .emulated = 1 }; diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index ead47c143ce..4d3ebb1af49 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3575,7 +3575,6 @@ static struct scsi_host_template Bus_Logic_template = { .unchecked_isa_dma = 1, .max_sectors = 128, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; /* diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 3e161cd6646..14fc7f39e83 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig 
@@ -345,7 +345,7 @@ config ISCSI_TCP config SGIWD93_SCSI tristate "SGI WD93C93 SCSI Driver" - depends on SGI_IP22 && SCSI + depends on SGI_HAS_WD93 && SCSI help If you have a Western Digital WD93 SCSI controller on an SGI MIPS system, say Y. Otherwise, say N. diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c index 137d065db3d..6961f78742a 100644 --- a/drivers/scsi/NCR53c406a.c +++ b/drivers/scsi/NCR53c406a.c @@ -1065,7 +1065,6 @@ static struct scsi_host_template driver_template = .cmd_per_lun = 1 /* commands per lun */, .unchecked_isa_dma = 1 /* unchecked_isa_dma */, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c index d3a6d15fb77..f608d4a1d6d 100644 --- a/drivers/scsi/a100u2w.c +++ b/drivers/scsi/a100u2w.c @@ -1071,7 +1071,6 @@ static struct scsi_host_template inia100_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static int __devinit inia100_probe_one(struct pci_dev *pdev, diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c index 851a7e599c5..f8afa358b6b 100644 --- a/drivers/scsi/aacraid/commctrl.c +++ b/drivers/scsi/aacraid/commctrl.c @@ -243,7 +243,6 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) * Search the list of AdapterFibContext addresses on the adapter * to be sure this is a valid address */ - spin_lock_irqsave(&dev->fib_lock, flags); entry = dev->fib_list.next; fibctx = NULL; @@ -252,25 +251,24 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) /* * Extract the AdapterFibContext from the Input parameters. */ - if (fibctx->unique == f.fibctx) { /* We found a winner */ + if (fibctx->unique == f.fibctx) { /* We found a winner */ break; } entry = entry->next; fibctx = NULL; } if (!fibctx) { - spin_unlock_irqrestore(&dev->fib_lock, flags); dprintk ((KERN_INFO "Fib Context not found\n")); return -EINVAL; } if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || (fibctx->size != sizeof(struct aac_fib_context))) { - spin_unlock_irqrestore(&dev->fib_lock, flags); dprintk ((KERN_INFO "Fib Context corrupt?\n")); return -EINVAL; } status = 0; + spin_lock_irqsave(&dev->fib_lock, flags); /* * If there are no fibs to send back, then either wait or return * -EAGAIN @@ -328,9 +326,7 @@ return_fib: int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) { struct fib *fib; - unsigned long flags; - spin_lock_irqsave(&dev->fib_lock, flags); /* * First free any FIBs that have not been consumed. */ @@ -353,7 +349,6 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) * Remove the Context from the AdapterFibContext List */ list_del(&fibctx->next); - spin_unlock_irqrestore(&dev->fib_lock, flags); /* * Invalidate context */ @@ -419,8 +414,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg) * @arg: ioctl arguments * * This routine returns the driver version. - * Under Linux, there have been no version incompatibilities, so this is - * simple! + * Under Linux, there have been no version incompatibilities, so this is + * simple! 
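The aacraid hunks above narrow the fib_lock scope: the fibctx lookup and validation now run unlocked, and the lock is taken only once FIBs start being handed back. Whether the unlocked walk is safe depends on list invariants not visible in the hunk, so treat the following as the shape of the change rather than a locking recommendation; a pthread mutex stands in for spin_lock_irqsave(), and all names are invented:

#include <pthread.h>
#include <stdio.h>

struct fibctx { unsigned long unique; int valid; int pending; };

static struct fibctx contexts[4];
static pthread_mutex_t fib_lock = PTHREAD_MUTEX_INITIALIZER;

static struct fibctx *lookup(unsigned long unique)
{
	unsigned int i;

	for (i = 0; i < 4; i++)			/* unlocked read-only scan */
		if (contexts[i].valid && contexts[i].unique == unique)
			return &contexts[i];
	return NULL;
}

static int next_fib(unsigned long unique)
{
	struct fibctx *ctx = lookup(unique);	/* validate first... */

	if (!ctx)
		return -1;

	pthread_mutex_lock(&fib_lock);		/* ...lock only the queue work */
	while (ctx->pending > 0)
		ctx->pending--;			/* stand-in for returning a FIB */
	pthread_mutex_unlock(&fib_lock);
	return 0;
}

int main(void)
{
	contexts[0].valid = 1;
	contexts[0].unique = 42;
	contexts[0].pending = 3;
	printf("%d\n", next_fib(42));
	return 0;
}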
*/ static int check_revision(struct aac_dev *dev, void __user *arg) @@ -468,7 +463,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) u32 data_dir; void __user *sg_user[32]; void *sg_list[32]; - u32 sg_indx = 0; + u32 sg_indx = 0; u32 byte_count = 0; u32 actual_fibsize64, actual_fibsize = 0; int i; @@ -522,11 +517,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) // Fix up srb for endian and force some values srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this - srbcmd->channel = cpu_to_le32(user_srbcmd->channel); + srbcmd->channel = cpu_to_le32(user_srbcmd->channel); srbcmd->id = cpu_to_le32(user_srbcmd->id); - srbcmd->lun = cpu_to_le32(user_srbcmd->lun); - srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); - srbcmd->flags = cpu_to_le32(flags); + srbcmd->lun = cpu_to_le32(user_srbcmd->lun); + srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); + srbcmd->flags = cpu_to_le32(flags); srbcmd->retry_limit = 0; // Obsolete parameter srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); @@ -791,9 +786,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg) pci_info.bus = dev->pdev->bus->number; pci_info.slot = PCI_SLOT(dev->pdev->devfn); - if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { - dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); - return -EFAULT; + if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { + dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); + return -EFAULT; } return 0; } diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 61be22774e9..0e8267c1e91 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -1032,7 +1032,6 @@ static struct scsi_host_template aac_driver_template = { .cmd_per_lun = AAC_NUM_IO_FIB, #endif .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .emulated = 1, }; diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c index be58a0b097c..7c45d88a205 100644 --- a/drivers/scsi/aha1740.c +++ b/drivers/scsi/aha1740.c @@ -563,7 +563,6 @@ static struct scsi_host_template aha1740_template = { .sg_tablesize = AHA1740_SCATTER, .cmd_per_lun = AHA1740_CMDLUN, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = aha1740_eh_abort_handler, }; diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h index ce638aa6005..2f00467b6b8 100644 --- a/drivers/scsi/aic7xxx/aic79xx.h +++ b/drivers/scsi/aic7xxx/aic79xx.h @@ -1340,8 +1340,10 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t); int ahd_pci_config(struct ahd_softc *, struct ahd_pci_identity *); int ahd_pci_test_register_access(struct ahd_softc *); +#ifdef CONFIG_PM void ahd_pci_suspend(struct ahd_softc *); void ahd_pci_resume(struct ahd_softc *); +#endif /************************** SCB and SCB queue management **********************/ void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, @@ -1352,8 +1354,10 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name); int ahd_softc_init(struct ahd_softc *); void ahd_controller_info(struct ahd_softc *ahd, char *buf); int ahd_init(struct ahd_softc *ahd); +#ifdef CONFIG_PM int ahd_suspend(struct ahd_softc *ahd); void ahd_resume(struct ahd_softc *ahd); +#endif int ahd_default_config(struct ahd_softc *ahd); int ahd_parse_vpddata(struct ahd_softc *ahd, struct vpd_config *vpd); @@ -1361,7 +1365,6 @@ int ahd_parse_cfgdata(struct 
ahd_softc *ahd, struct seeprom_config *sc); void ahd_intr_enable(struct ahd_softc *ahd, int enable); void ahd_pause_and_flushwork(struct ahd_softc *ahd); -int ahd_suspend(struct ahd_softc *ahd); void ahd_set_unit(struct ahd_softc *, int); void ahd_set_name(struct ahd_softc *, char *); struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c index a7dd8cdda47..ade0fb8fbdb 100644 --- a/drivers/scsi/aic7xxx/aic79xx_core.c +++ b/drivers/scsi/aic7xxx/aic79xx_core.c @@ -7175,6 +7175,7 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd) ahd->flags &= ~AHD_ALL_INTERRUPTS; } +#ifdef CONFIG_PM int ahd_suspend(struct ahd_softc *ahd) { @@ -7197,6 +7198,7 @@ ahd_resume(struct ahd_softc *ahd) ahd_intr_enable(ahd, TRUE); ahd_restart(ahd); } +#endif /************************** Busy Target Table *********************************/ /* diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 0e4708fd43c..01465479290 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -766,7 +766,6 @@ struct scsi_host_template aic79xx_driver_template = { .max_sectors = 8192, .cmd_per_lun = 2, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .slave_alloc = ahd_linux_slave_alloc, .slave_configure = ahd_linux_slave_configure, .target_alloc = ahd_linux_target_alloc, @@ -1922,7 +1921,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd) struct scsi_sense_data *sense; sense = (struct scsi_sense_data *) - &cmd->sense_buffer; + cmd->sense_buffer; if (sense->extra_len >= 5 && (sense->add_sense_code == 0x47 || sense->add_sense_code == 0x48)) diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c index 66f0259edb6..4150c8a8fdc 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c @@ -43,17 +43,6 @@ #include "aic79xx_inline.h" #include "aic79xx_pci.h" -static int ahd_linux_pci_dev_probe(struct pci_dev *pdev, - const struct pci_device_id *ent); -static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd, - u_long *base, u_long *base2); -static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd, - u_long *bus_addr, - uint8_t __iomem **maddr); -static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); -static int ahd_linux_pci_dev_resume(struct pci_dev *pdev); -static void ahd_linux_pci_dev_remove(struct pci_dev *pdev); - /* Define the macro locally since it's different for different class of chips. 
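Several hunks in this series (zfcp_fsf earlier, the aic79xx sense cast above, hptiop and ncr53c8xx below) drop a stray & or cast in front of sense_buffer. With the old embedded array, &cmd->sense_buffer and cmd->sense_buffer yielded the same address, so the sloppy spelling happened to work; once sense_buffer becomes a pointer, &cmd->sense_buffer is the address of the pointer itself, and a memcpy there tramples the struct. A standalone demonstration (struct names invented; 96 is SCSI_SENSE_BUFFERSIZE):

#include <stdio.h>

#define SENSE_LEN 96	/* SCSI_SENSE_BUFFERSIZE */

struct cmd_array   { char sense_buffer[SENSE_LEN]; };
struct cmd_pointer { char *sense_buffer; };

int main(void)
{
	struct cmd_array a;
	static char buf[SENSE_LEN];
	struct cmd_pointer p = { buf };

	/* Array member: same address, merely different types, so the old
	 * memcpy(&cmd->sense_buffer, ...) still hit the right bytes. */
	printf("array:   %p vs %p\n",
	       (void *)a.sense_buffer, (void *)&a.sense_buffer);

	/* Pointer member: &p.sense_buffer is where the pointer lives, not
	 * where it points -- copying there corrupts the structure. */
	printf("pointer: %p vs %p\n",
	       (void *)p.sense_buffer, (void *)&p.sense_buffer);
	return 0;
}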
*/ #define ID(x) \ @@ -85,17 +74,7 @@ static struct pci_device_id ahd_linux_pci_id_table[] = { MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); -static struct pci_driver aic79xx_pci_driver = { - .name = "aic79xx", - .probe = ahd_linux_pci_dev_probe, #ifdef CONFIG_PM - .suspend = ahd_linux_pci_dev_suspend, - .resume = ahd_linux_pci_dev_resume, -#endif - .remove = ahd_linux_pci_dev_remove, - .id_table = ahd_linux_pci_id_table -}; - static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) { @@ -139,6 +118,7 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev) return rc; } +#endif static void ahd_linux_pci_dev_remove(struct pci_dev *pdev) @@ -245,6 +225,17 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return (0); } +static struct pci_driver aic79xx_pci_driver = { + .name = "aic79xx", + .probe = ahd_linux_pci_dev_probe, +#ifdef CONFIG_PM + .suspend = ahd_linux_pci_dev_suspend, + .resume = ahd_linux_pci_dev_resume, +#endif + .remove = ahd_linux_pci_dev_remove, + .id_table = ahd_linux_pci_id_table +}; + int ahd_linux_pci_init(void) { diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c index 7a203a90601..df853676e66 100644 --- a/drivers/scsi/aic7xxx/aic79xx_pci.c +++ b/drivers/scsi/aic7xxx/aic79xx_pci.c @@ -389,6 +389,7 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry) return error; } +#ifdef CONFIG_PM void ahd_pci_suspend(struct ahd_softc *ahd) { @@ -415,6 +416,7 @@ ahd_pci_resume(struct ahd_softc *ahd) ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); } +#endif /* * Perform some simple tests that should catch situations where diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h index 3d4e42d9045..c0344e61765 100644 --- a/drivers/scsi/aic7xxx/aic7xxx.h +++ b/drivers/scsi/aic7xxx/aic7xxx.h @@ -1143,7 +1143,9 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t); int ahc_pci_config(struct ahc_softc *, struct ahc_pci_identity *); int ahc_pci_test_register_access(struct ahc_softc *); +#ifdef CONFIG_PM void ahc_pci_resume(struct ahc_softc *ahc); +#endif /*************************** EISA/VL Front End ********************************/ struct aic7770_identity *aic7770_find_device(uint32_t); @@ -1170,8 +1172,10 @@ int ahc_chip_init(struct ahc_softc *ahc); int ahc_init(struct ahc_softc *ahc); void ahc_intr_enable(struct ahc_softc *ahc, int enable); void ahc_pause_and_flushwork(struct ahc_softc *ahc); +#ifdef CONFIG_PM int ahc_suspend(struct ahc_softc *ahc); int ahc_resume(struct ahc_softc *ahc); +#endif void ahc_set_unit(struct ahc_softc *, int); void ahc_set_name(struct ahc_softc *, char *); void ahc_alloc_scbs(struct ahc_softc *ahc); diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c index f350b5e89e7..6d2ae641273 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_core.c +++ b/drivers/scsi/aic7xxx/aic7xxx_core.c @@ -5078,6 +5078,7 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc) ahc->flags &= ~AHC_ALL_INTERRUPTS; } +#ifdef CONFIG_PM int ahc_suspend(struct ahc_softc *ahc) { @@ -5113,7 +5114,7 @@ ahc_resume(struct ahc_softc *ahc) ahc_restart(ahc); return (0); } - +#endif /************************** Busy Target Table *********************************/ /* * Return the untagged transaction id for a given target/channel lun. 
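The CONFIG_PM hunks above all follow one pattern: the suspend/resume prototypes and definitions sit under the same preprocessor guard, so a kernel built without power management neither emits nor references that code. The skeleton, with an invented driver (mydev/mydrv are placeholders):

struct mydev { int state; };	/* stand-in device structure */

#ifdef CONFIG_PM
int mydrv_suspend(struct mydev *dev);
int mydrv_resume(struct mydev *dev);
#endif

#ifdef CONFIG_PM
int mydrv_suspend(struct mydev *dev)
{
	dev->state = 0;		/* quiesce hardware, stash state */
	return 0;
}

int mydrv_resume(struct mydev *dev)
{
	dev->state = 1;		/* reprogram hardware from saved state */
	return 0;
}
#endif

Any caller of mydrv_suspend() must itself live under CONFIG_PM, which is exactly why the pci_driver .suspend/.resume hooks in these files are guarded the same way.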
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index e310e414067..99a3b33a323 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -747,7 +747,6 @@ struct scsi_host_template aic7xxx_driver_template = { .max_sectors = 8192, .cmd_per_lun = 2, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .slave_alloc = ahc_linux_slave_alloc, .slave_configure = ahc_linux_slave_configure, .target_alloc = ahc_linux_target_alloc, @@ -1658,9 +1657,12 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb) untagged_q = &(ahc->untagged_queues[target_offset]); TAILQ_REMOVE(untagged_q, scb, links.tqe); BUG_ON(!TAILQ_EMPTY(untagged_q)); - } - - if ((scb->flags & SCB_ACTIVE) == 0) { + } else if ((scb->flags & SCB_ACTIVE) == 0) { + /* + * Transactions aborted from the untagged queue may + * not have been dispatched to the controller, so + * only check the SCB_ACTIVE flag for tagged transactions. + */ printf("SCB %d done'd twice\n", scb->hscb->tag); ahc_dump_card_state(ahc); panic("Stopping for safety"); diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c index 4488946cff2..dd6e21d6f1d 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c @@ -42,17 +42,6 @@ #include "aic7xxx_osm.h" #include "aic7xxx_pci.h" -static int ahc_linux_pci_dev_probe(struct pci_dev *pdev, - const struct pci_device_id *ent); -static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc, - u_long *base); -static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc, - u_long *bus_addr, - uint8_t __iomem **maddr); -static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); -static int ahc_linux_pci_dev_resume(struct pci_dev *pdev); -static void ahc_linux_pci_dev_remove(struct pci_dev *pdev); - /* Define the macro locally since it's different for different class of chips. 
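The osm_pci hunks above and below make the same tidy-up: the struct pci_driver definition moves below the callbacks it names, and the block of static forward declarations disappears. The reordering pattern with invented names (a sketch against the pci_driver layout of this era; not buildable outside a kernel tree):

#include <linux/pci.h>

static int mydrv_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	return 0;	/* claim and set up the device */
}

static void mydrv_remove(struct pci_dev *pdev)
{
	/* tear down the device */
}

/* Defined after the callbacks, so no forward declarations are needed;
 * suspend/resume would slot in here under #ifdef CONFIG_PM as above. */
static struct pci_driver mydrv_pci_driver = {
	.name	= "mydrv",
	.probe	= mydrv_probe,
	.remove	= mydrv_remove,
};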
*/ #define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) @@ -132,17 +121,7 @@ static struct pci_device_id ahc_linux_pci_id_table[] = { MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); -static struct pci_driver aic7xxx_pci_driver = { - .name = "aic7xxx", - .probe = ahc_linux_pci_dev_probe, #ifdef CONFIG_PM - .suspend = ahc_linux_pci_dev_suspend, - .resume = ahc_linux_pci_dev_resume, -#endif - .remove = ahc_linux_pci_dev_remove, - .id_table = ahc_linux_pci_id_table -}; - static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) { @@ -182,6 +161,7 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev) return (ahc_resume(ahc)); } +#endif static void ahc_linux_pci_dev_remove(struct pci_dev *pdev) @@ -289,6 +269,17 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return (0); } +static struct pci_driver aic7xxx_pci_driver = { + .name = "aic7xxx", + .probe = ahc_linux_pci_dev_probe, +#ifdef CONFIG_PM + .suspend = ahc_linux_pci_dev_suspend, + .resume = ahc_linux_pci_dev_resume, +#endif + .remove = ahc_linux_pci_dev_remove, + .id_table = ahc_linux_pci_id_table +}; + int ahc_linux_pci_init(void) { diff --git a/drivers/scsi/aic7xxx/aic7xxx_pci.c b/drivers/scsi/aic7xxx/aic7xxx_pci.c index ae35937b805..56848f41e4f 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_pci.c +++ b/drivers/scsi/aic7xxx/aic7xxx_pci.c @@ -2020,6 +2020,7 @@ ahc_pci_chip_init(struct ahc_softc *ahc) return (ahc_chip_init(ahc)); } +#ifdef CONFIG_PM void ahc_pci_resume(struct ahc_softc *ahc) { @@ -2051,6 +2052,7 @@ ahc_pci_resume(struct ahc_softc *ahc) ahc_release_seeprom(&sd); } } +#endif static int ahc_aic785X_setup(struct ahc_softc *ahc) diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c index bcb0b870320..3bfd9296bbf 100644 --- a/drivers/scsi/aic7xxx_old.c +++ b/drivers/scsi/aic7xxx_old.c @@ -11141,7 +11141,6 @@ static struct scsi_host_template driver_template = { .max_sectors = 2048, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index d80dba913a7..f4a202e8df2 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -122,7 +122,6 @@ static struct scsi_host_template arcmsr_scsi_host_template = { .max_sectors = ARCMSR_MAX_XFER_SECTORS, .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = arcmsr_host_attrs, }; #ifdef CONFIG_SCSI_ARCMSR_AER diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index f93c73c0ba5..22ef3716e78 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c @@ -4763,7 +4763,6 @@ static struct scsi_host_template dc395x_driver_template = { .eh_bus_reset_handler = dc395x_eh_bus_reset, .unchecked_isa_dma = 0, .use_clustering = DISABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index 19cce125124..c9dd8392aab 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -3340,7 +3340,6 @@ static struct scsi_host_template driver_template = { .this_id = 7, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c index 05163cefec1..8be3d76656f 100644 --- a/drivers/scsi/eata.c +++ b/drivers/scsi/eata.c @@ -524,7 +524,6 @@ static struct scsi_host_template driver_template = { 
.this_id = 7, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 5ea1f986220..880c78bff0e 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -342,7 +342,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->use_clustering = sht->use_clustering; shost->ordered_tag = sht->ordered_tag; shost->active_mode = sht->supported_mode; - shost->use_sg_chaining = sht->use_sg_chaining; if (sht->supported_mode == MODE_UNKNOWN) /* means we didn't set it ... default to INITIATOR */ diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c index e7b2f3575ce..ff149ad6bc4 100644 --- a/drivers/scsi/hptiop.c +++ b/drivers/scsi/hptiop.c @@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag, scsi_set_resid(scp, scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); scp->result = SAM_STAT_CHECK_CONDITION; - memcpy(&scp->sense_buffer, &req->sg_list, + memcpy(scp->sense_buffer, &req->sg_list, min_t(size_t, SCSI_SENSE_BUFFERSIZE, le32_to_cpu(req->dataxfer_length))); break; @@ -906,7 +906,6 @@ static struct scsi_host_template driver_template = { .unchecked_isa_dma = 0, .emulated = 0, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .proc_name = driver_name, .shost_attrs = hptiop_attrs, .this_id = -1, diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c index db004a45073..4d15a62914e 100644 --- a/drivers/scsi/ibmmca.c +++ b/drivers/scsi/ibmmca.c @@ -1501,7 +1501,6 @@ static struct scsi_host_template ibmmca_driver_template = { .sg_tablesize = 16, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static int ibmmca_probe(struct device *dev) diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 30819012898..78d46a900bb 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -1600,7 +1600,6 @@ static struct scsi_host_template driver_template = { .this_id = -1, .sg_tablesize = SG_ALL, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = ibmvscsi_attrs, }; diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index a10a5c74b48..0cc8868ea35 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static int initio_probe_one(struct pci_dev *pdev, diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index e5be5fd4ef5..b6f99dfbb03 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -1933,7 +1933,6 @@ static struct scsi_host_template iscsi_sht = { .eh_device_reset_handler= iscsi_eh_device_reset, .eh_host_reset_handler = iscsi_eh_host_reset, .use_clustering = DISABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .slave_configure = iscsi_tcp_slave_configure, .proc_name = "iscsi_tcp", .this_id = -1, diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c index 5cff0204227..6d6a76e65a6 100644 --- a/drivers/scsi/libsrp.c +++ b/drivers/scsi/libsrp.c @@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info, sc->SCp.ptr = info; memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); - sc->request_bufflen = 
len; - sc->request_buffer = (void *) (unsigned long) addr; + sc->sdb.length = len; + sc->sdb.table.sgl = (void *) (unsigned long) addr; sc->tag = tag; err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, cmd->tag); diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 6483c62730b..fc5c3a42b05 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -1459,7 +1459,6 @@ struct scsi_host_template lpfc_template = { .scan_finished = lpfc_scan_finished, .this_id = -1, .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, - .use_sg_chaining = ENABLE_SG_CHAINING, .cmd_per_lun = LPFC_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = lpfc_hba_attrs, @@ -1482,7 +1481,6 @@ struct scsi_host_template lpfc_vport_template = { .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, .cmd_per_lun = LPFC_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = lpfc_vport_attrs, .max_sectors = 0xFFFF, }; diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c index a035001f443..b12ad7c7c67 100644 --- a/drivers/scsi/mac53c94.c +++ b/drivers/scsi/mac53c94.c @@ -402,7 +402,6 @@ static struct scsi_host_template mac53c94_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 765c24d2bc3..4d59ae8491a 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -4490,7 +4490,6 @@ static struct scsi_host_template megaraid_template = { .sg_tablesize = MAX_SGLIST, .cmd_per_lun = DEF_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = megaraid_abort, .eh_device_reset_handler = megaraid_reset, .eh_bus_reset_handler = megaraid_reset, diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c index 24e32e446e7..6db77c00e3e 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.c +++ b/drivers/scsi/megaraid/megaraid_mbox.c @@ -361,7 +361,6 @@ static struct scsi_host_template megaraid_template_g = { .eh_host_reset_handler = megaraid_reset_handler, .change_queue_depth = megaraid_change_queue_depth, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .sdev_attrs = megaraid_sdev_attrs, .shost_attrs = megaraid_shost_attrs, }; diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c index d7ec921865c..672c759ac24 100644 --- a/drivers/scsi/megaraid/megaraid_sas.c +++ b/drivers/scsi/megaraid/megaraid_sas.c @@ -1192,7 +1192,6 @@ static struct scsi_host_template megasas_template = { .eh_timed_out = megasas_reset_timer, .bios_param = megasas_bios_param, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; /** diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index 7470ff39ab2..651d09b08f2 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -1843,7 +1843,6 @@ static struct scsi_host_template mesh_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c index c02771aa6c9..c5ebf018b37 100644 --- a/drivers/scsi/ncr53c8xx.c +++ b/drivers/scsi/ncr53c8xx.c @@ -4967,7 +4967,7 @@ void 
ncr_complete (struct ncb *np, struct ccb *cp) sizeof(cp->sense_buf))); if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { - u_char * p = (u_char*) & cmd->sense_buffer; + u_char *p = cmd->sense_buffer; int i; PRINT_ADDR(cmd, "sense data:"); for (i=0; i<14; i++) printk (" %x", *p++); diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c index 28161dc95e0..7fed3537215 100644 --- a/drivers/scsi/nsp32.c +++ b/drivers/scsi/nsp32.c @@ -281,7 +281,6 @@ static struct scsi_host_template nsp32_template = { .cmd_per_lun = 1, .this_id = NSP32_HOST_SCSIID, .use_clustering = DISABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = nsp32_eh_abort, .eh_bus_reset_handler = nsp32_eh_bus_reset, .eh_host_reset_handler = nsp32_eh_host_reset, diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c index 969b9387a0c..3454a571474 100644 --- a/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/drivers/scsi/pcmcia/sym53c500_cs.c @@ -692,7 +692,6 @@ static struct scsi_host_template sym53c500_driver_template = { .sg_tablesize = 32, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = SYM53C500_shost_attrs }; diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index c94906abfee..68c0d09ffe7 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -4204,7 +4204,6 @@ static struct scsi_host_template qla1280_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index aba1e6d4806..3954ed2d7b5 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -131,7 +131,6 @@ static struct scsi_host_template qla2x00_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, /* @@ -163,7 +162,6 @@ struct scsi_host_template qla24xx_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, .max_sectors = 0xFFFF, diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index d3f86646cb0..2e2b9fedffc 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -94,7 +94,6 @@ static struct scsi_host_template qla4xxx_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, .max_sectors = 0xFFFF, diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c index 1769f965eed..1e874f1fb5c 100644 --- a/drivers/scsi/qlogicfas.c +++ b/drivers/scsi/qlogicfas.c @@ -197,7 +197,6 @@ static struct scsi_host_template qlogicfas_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static __init int qlogicfas_init(void) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1a9fba6a9f9..b35d19472ca 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd) "Notifying upper driver of completion " "(result %x)\n", cmd->result)); - good_bytes = cmd->request_bufflen; + good_bytes = scsi_bufflen(cmd); if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { drv = scsi_cmd_to_driver(cmd); if (drv->done) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 
82c06f0a9d0..1541c174937 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -280,6 +280,8 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba, unsigned int num, struct sdebug_dev_info * devip); static int resp_report_luns(struct scsi_cmnd * SCpnt, struct sdebug_dev_info * devip); +static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, + unsigned int num, struct sdebug_dev_info *devip); static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, int arr_len); static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, @@ -311,12 +313,48 @@ static void sdebug_max_tgts_luns(void); static struct device pseudo_primary; static struct bus_type pseudo_lld_bus; +static void get_data_transfer_info(unsigned char *cmd, + unsigned long long *lba, unsigned int *num) +{ + int i; + + switch (*cmd) { + case WRITE_16: + case READ_16: + for (*lba = 0, i = 0; i < 8; ++i) { + if (i > 0) + *lba <<= 8; + *lba += cmd[2 + i]; + } + *num = cmd[13] + (cmd[12] << 8) + + (cmd[11] << 16) + (cmd[10] << 24); + break; + case WRITE_12: + case READ_12: + *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); + *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24); + break; + case WRITE_10: + case READ_10: + case XDWRITEREAD_10: + *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); + *num = cmd[8] + (cmd[7] << 8); + break; + case WRITE_6: + case READ_6: + *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16); + *num = (0 == cmd[4]) ? 256 : cmd[4]; + break; + default: + break; + } +} static int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) { unsigned char *cmd = (unsigned char *) SCpnt->cmnd; - int len, k, j; + int len, k; unsigned int num; unsigned long long lba; int errsts = 0; @@ -452,28 +490,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) break; if (scsi_debug_fake_rw) break; - if ((*cmd) == READ_16) { - for (lba = 0, j = 0; j < 8; ++j) { - if (j > 0) - lba <<= 8; - lba += cmd[2 + j]; - } - num = cmd[13] + (cmd[12] << 8) + - (cmd[11] << 16) + (cmd[10] << 24); - } else if ((*cmd) == READ_12) { - lba = cmd[5] + (cmd[4] << 8) + - (cmd[3] << 16) + (cmd[2] << 24); - num = cmd[9] + (cmd[8] << 8) + - (cmd[7] << 16) + (cmd[6] << 24); - } else if ((*cmd) == READ_10) { - lba = cmd[5] + (cmd[4] << 8) + - (cmd[3] << 16) + (cmd[2] << 24); - num = cmd[8] + (cmd[7] << 8); - } else { /* READ (6) */ - lba = cmd[3] + (cmd[2] << 8) + - ((cmd[1] & 0x1f) << 16); - num = (0 == cmd[4]) ? 256 : cmd[4]; - } + get_data_transfer_info(cmd, &lba, &num); errsts = resp_read(SCpnt, lba, num, devip); if (inj_recovered && (0 == errsts)) { mk_sense_buffer(devip, RECOVERED_ERROR, @@ -500,28 +517,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) break; if (scsi_debug_fake_rw) break; - if ((*cmd) == WRITE_16) { - for (lba = 0, j = 0; j < 8; ++j) { - if (j > 0) - lba <<= 8; - lba += cmd[2 + j]; - } - num = cmd[13] + (cmd[12] << 8) + - (cmd[11] << 16) + (cmd[10] << 24); - } else if ((*cmd) == WRITE_12) { - lba = cmd[5] + (cmd[4] << 8) + - (cmd[3] << 16) + (cmd[2] << 24); - num = cmd[9] + (cmd[8] << 8) + - (cmd[7] << 16) + (cmd[6] << 24); - } else if ((*cmd) == WRITE_10) { - lba = cmd[5] + (cmd[4] << 8) + - (cmd[3] << 16) + (cmd[2] << 24); - num = cmd[8] + (cmd[7] << 8); - } else { /* WRITE (6) */ - lba = cmd[3] + (cmd[2] << 8) + - ((cmd[1] & 0x1f) << 16); - num = (0 == cmd[4]) ? 
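
/*
 * Illustrative sketch (not from the patch): the big-endian CDB decoding that
 * the new get_data_transfer_info() helper centralizes, run in userspace on a
 * made-up READ(10) CDB. The byte values below are example assumptions.
 */
#include <stdio.h>

int main(void)
{
	/* READ(10): opcode 0x28, LBA in bytes 2-5, transfer length in 7-8 */
	unsigned char cdb[10] = { 0x28, 0, 0x00, 0x12, 0x34, 0x56, 0, 0x00, 0x08, 0 };
	unsigned long long lba;
	unsigned int num;

	lba = cdb[5] + (cdb[4] << 8) + (cdb[3] << 16) +
	      ((unsigned long long)cdb[2] << 24);
	num = cdb[8] + (cdb[7] << 8);
	printf("lba=%llu num=%u\n", lba, num);	/* prints lba=1193046 num=8 */
	return 0;
}
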
256 : cmd[4]; - } + get_data_transfer_info(cmd, &lba, &num); errsts = resp_write(SCpnt, lba, num, devip); if (inj_recovered && (0 == errsts)) { mk_sense_buffer(devip, RECOVERED_ERROR, @@ -549,6 +545,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) case WRITE_BUFFER: errsts = check_readiness(SCpnt, 1, devip); break; + case XDWRITEREAD_10: + if (!scsi_bidi_cmnd(SCpnt)) { + mk_sense_buffer(devip, ILLEGAL_REQUEST, + INVALID_FIELD_IN_CDB, 0); + errsts = check_condition_result; + break; + } + + errsts = check_readiness(SCpnt, 0, devip); + if (errsts) + break; + if (scsi_debug_fake_rw) + break; + get_data_transfer_info(cmd, &lba, &num); + errsts = resp_read(SCpnt, lba, num, devip); + if (errsts) + break; + errsts = resp_write(SCpnt, lba, num, devip); + if (errsts) + break; + errsts = resp_xdwriteread(SCpnt, lba, num, devip); + break; default: if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " @@ -601,18 +619,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, int k, req_len, act_len, len, active; void * kaddr; void * kaddr_off; - struct scatterlist * sg; + struct scatterlist *sg; + struct scsi_data_buffer *sdb = scsi_in(scp); - if (0 == scsi_bufflen(scp)) + if (!sdb->length) return 0; - if (NULL == scsi_sglist(scp)) + if (!sdb->table.sgl) return (DID_ERROR << 16); - if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || - (scp->sc_data_direction == DMA_FROM_DEVICE))) + if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE)) return (DID_ERROR << 16); active = 1; req_len = act_len = 0; - scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { + for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) { if (active) { kaddr = (unsigned char *) kmap_atomic(sg_page(sg), KM_USER0); @@ -630,10 +648,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, } req_len += sg->length; } - if (scsi_get_resid(scp)) - scsi_set_resid(scp, scsi_get_resid(scp) - act_len); + if (sdb->resid) + sdb->resid -= act_len; else - scsi_set_resid(scp, req_len - act_len); + sdb->resid = req_len - act_len; return 0; } @@ -650,8 +668,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, return 0; if (NULL == scsi_sglist(scp)) return -1; - if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || - (scp->sc_data_direction == DMA_TO_DEVICE))) + if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE)) return -1; req_len = fin = 0; scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { @@ -1956,6 +1973,50 @@ static int resp_report_luns(struct scsi_cmnd * scp, min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); } +static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, + unsigned int num, struct sdebug_dev_info *devip) +{ + int i, j, ret = -1; + unsigned char *kaddr, *buf; + unsigned int offset; + struct scatterlist *sg; + struct scsi_data_buffer *sdb = scsi_in(scp); + + /* better not to use temporary buffer. 
*/ + buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC); + if (!buf) + return ret; + + offset = 0; + scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) { + kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); + if (!kaddr) + goto out; + + memcpy(buf + offset, kaddr + sg->offset, sg->length); + offset += sg->length; + kunmap_atomic(kaddr, KM_USER0); + } + + offset = 0; + for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) { + kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); + if (!kaddr) + goto out; + + for (j = 0; j < sg->length; j++) + *(kaddr + sg->offset + j) ^= *(buf + offset + j); + + offset += sg->length; + kunmap_atomic(kaddr, KM_USER0); + } + ret = 0; +out: + kfree(buf); + + return ret; +} + /* When timer goes off this function is called. */ static void timer_intr_handler(unsigned long indx) { @@ -1989,6 +2050,7 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp) if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); + set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags); return 0; } diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 547e85aa414..045a0868fc7 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -617,29 +617,27 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, ses->cmd_len = scmd->cmd_len; memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); ses->data_direction = scmd->sc_data_direction; - ses->bufflen = scmd->request_bufflen; - ses->buffer = scmd->request_buffer; - ses->use_sg = scmd->use_sg; - ses->resid = scmd->resid; + ses->sdb = scmd->sdb; + ses->next_rq = scmd->request->next_rq; ses->result = scmd->result; + memset(&scmd->sdb, 0, sizeof(scmd->sdb)); + scmd->request->next_rq = NULL; + if (sense_bytes) { - scmd->request_bufflen = min_t(unsigned, - SCSI_SENSE_BUFFERSIZE, sense_bytes); + scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, + sense_bytes); sg_init_one(&ses->sense_sgl, scmd->sense_buffer, - scmd->request_bufflen); - scmd->request_buffer = &ses->sense_sgl; + scmd->sdb.length); + scmd->sdb.table.sgl = &ses->sense_sgl; scmd->sc_data_direction = DMA_FROM_DEVICE; - scmd->use_sg = 1; + scmd->sdb.table.nents = 1; memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); scmd->cmnd[0] = REQUEST_SENSE; - scmd->cmnd[4] = scmd->request_bufflen; + scmd->cmnd[4] = scmd->sdb.length; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } else { - scmd->request_buffer = NULL; - scmd->request_bufflen = 0; scmd->sc_data_direction = DMA_NONE; - scmd->use_sg = 0; if (cmnd) { memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); memcpy(scmd->cmnd, cmnd, cmnd_size); @@ -676,10 +674,8 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) scmd->cmd_len = ses->cmd_len; memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); scmd->sc_data_direction = ses->data_direction; - scmd->request_bufflen = ses->bufflen; - scmd->request_buffer = ses->buffer; - scmd->use_sg = ses->use_sg; - scmd->resid = ses->resid; + scmd->sdb = ses->sdb; + scmd->request->next_rq = ses->next_rq; scmd->result = ses->result; } EXPORT_SYMBOL(scsi_eh_restore_cmnd); @@ -1700,8 +1696,7 @@ scsi_reset_provider(struct scsi_device *dev, int flag) memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); scmd->scsi_done = scsi_reset_provider_done_command; - scmd->request_buffer = NULL; - scmd->request_bufflen = 0; + memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->cmd_len = 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c 
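
/*
 * Illustrative sketch (not from the patch): the XOR step resp_xdwriteread()
 * applies to the bidirectional in-buffer, modeled in userspace. After the
 * emulated read and write phases, the in-buffer carries old-media XOR
 * new-data, which is the XDWRITEREAD(10) payload. Values are assumptions.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char old_media[4] = { 0x11, 0x22, 0x33, 0x44 };
	unsigned char new_data[4]  = { 0x10, 0x20, 0x30, 0x40 };
	unsigned char in_buf[4];
	unsigned int i;

	memcpy(in_buf, old_media, sizeof(in_buf));	/* read phase */
	/* write phase would store new_data; the XOR phase combines both */
	for (i = 0; i < sizeof(in_buf); i++)
		in_buf[i] ^= new_data[i];
	for (i = 0; i < sizeof(in_buf); i++)
		printf("%02x ", in_buf[i]);		/* 01 02 03 04 */
	printf("\n");
	return 0;
}
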
index 7c4c889c522..b12fb310e39 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -8,6 +8,7 @@ */ #include <linux/bio.h> +#include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/completion.h> #include <linux/kernel.h> @@ -34,13 +35,6 @@ #define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) #define SG_MEMPOOL_SIZE 2 -/* - * The maximum number of SG segments that we will put inside a scatterlist - * (unless chaining is used). Should ideally fit inside a single page, to - * avoid a higher order allocation. - */ -#define SCSI_MAX_SG_SEGMENTS 128 - struct scsi_host_sg_pool { size_t size; char *name; @@ -48,22 +42,31 @@ struct scsi_host_sg_pool { mempool_t *pool; }; -#define SP(x) { x, "sgpool-" #x } +#define SP(x) { x, "sgpool-" __stringify(x) } +#if (SCSI_MAX_SG_SEGMENTS < 32) +#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater) +#endif static struct scsi_host_sg_pool scsi_sg_pools[] = { SP(8), SP(16), -#if (SCSI_MAX_SG_SEGMENTS > 16) - SP(32), #if (SCSI_MAX_SG_SEGMENTS > 32) - SP(64), + SP(32), #if (SCSI_MAX_SG_SEGMENTS > 64) + SP(64), +#if (SCSI_MAX_SG_SEGMENTS > 128) SP(128), +#if (SCSI_MAX_SG_SEGMENTS > 256) +#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX) +#endif #endif #endif #endif + SP(SCSI_MAX_SG_SEGMENTS) }; #undef SP +static struct kmem_cache *scsi_bidi_sdb_cache; + static void scsi_run_queue(struct request_queue *q); /* @@ -440,7 +443,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async); static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) { cmd->serial_number = 0; - cmd->resid = 0; + scsi_set_resid(cmd, 0); memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); if (cmd->cmd_len == 0) cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); @@ -690,42 +693,16 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, return NULL; } -/* - * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit - * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. - */ -#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 - static inline unsigned int scsi_sgtable_index(unsigned short nents) { unsigned int index; - switch (nents) { - case 1 ... 8: + BUG_ON(nents > SCSI_MAX_SG_SEGMENTS); + + if (nents <= 8) index = 0; - break; - case 9 ... 16: - index = 1; - break; -#if (SCSI_MAX_SG_SEGMENTS > 16) - case 17 ... 32: - index = 2; - break; -#if (SCSI_MAX_SG_SEGMENTS > 32) - case 33 ... 64: - index = 3; - break; -#if (SCSI_MAX_SG_SEGMENTS > 64) - case 65 ... 
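
/*
 * Illustrative sketch (not from the patch): the pool-index arithmetic that
 * replaces the case ladder in scsi_sgtable_index(). count_order() is a
 * portable stand-in for the kernel's get_count_order(); the BUG_ON() in the
 * real code guarantees nents never exceeds SCSI_MAX_SG_SEGMENTS.
 */
#include <stdio.h>

static int count_order(unsigned int n)	/* smallest order with (1 << order) >= n */
{
	int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int tests[] = { 1, 8, 9, 16, 17, 64, 65, 128 };
	unsigned int i;

	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
		unsigned int nents = tests[i];
		int index = (nents <= 8) ? 0 : count_order(nents) - 3;

		printf("nents=%3u -> index %d (sgpool-%d)\n",
		       nents, index, 8 << index);
	}
	return 0;
}
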
128: - index = 4; - break; -#endif -#endif -#endif - default: - printk(KERN_ERR "scsi: bad segment count=%d\n", nents); - BUG(); - } + else + index = get_count_order(nents) - 3; return index; } @@ -746,31 +723,27 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask) return mempool_alloc(sgp->pool, gfp_mask); } -int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) +static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents, + gfp_t gfp_mask) { int ret; - BUG_ON(!cmd->use_sg); + BUG_ON(!nents); - ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg, - SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc); + ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS, + gfp_mask, scsi_sg_alloc); if (unlikely(ret)) - __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, + __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); - cmd->request_buffer = cmd->sg_table.sgl; return ret; } -EXPORT_SYMBOL(scsi_alloc_sgtable); - -void scsi_free_sgtable(struct scsi_cmnd *cmd) +static void scsi_free_sgtable(struct scsi_data_buffer *sdb) { - __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); + __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); } -EXPORT_SYMBOL(scsi_free_sgtable); - /* * Function: scsi_release_buffers() * @@ -788,17 +761,49 @@ EXPORT_SYMBOL(scsi_free_sgtable); * the scatter-gather table, and potentially any bounce * buffers. */ -static void scsi_release_buffers(struct scsi_cmnd *cmd) +void scsi_release_buffers(struct scsi_cmnd *cmd) +{ + if (cmd->sdb.table.nents) + scsi_free_sgtable(&cmd->sdb); + + memset(&cmd->sdb, 0, sizeof(cmd->sdb)); + + if (scsi_bidi_cmnd(cmd)) { + struct scsi_data_buffer *bidi_sdb = + cmd->request->next_rq->special; + scsi_free_sgtable(bidi_sdb); + kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb); + cmd->request->next_rq->special = NULL; + } +} +EXPORT_SYMBOL(scsi_release_buffers); + +/* + * Bidi commands Must be complete as a whole, both sides at once. + * If part of the bytes were written and lld returned + * scsi_in()->resid and/or scsi_out()->resid this information will be left + * in req->data_len and req->next_rq->data_len. The upper-layer driver can + * decide what to do with this information. + */ +void scsi_end_bidi_request(struct scsi_cmnd *cmd) { - if (cmd->use_sg) - scsi_free_sgtable(cmd); + struct request *req = cmd->request; + unsigned int dlen = req->data_len; + unsigned int next_dlen = req->next_rq->data_len; + + req->data_len = scsi_out(cmd)->resid; + req->next_rq->data_len = scsi_in(cmd)->resid; + + /* The req and req->next_rq have not been completed */ + BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen)); + + scsi_release_buffers(cmd); /* - * Zero these out. They now point to freed memory, and it is - * dangerous to hang onto the pointers. + * This will goose the queue request function at the end, so we don't + * need to worry about launching another command. 
*/ - cmd->request_buffer = NULL; - cmd->request_bufflen = 0; + scsi_next_command(cmd); } /* @@ -832,7 +837,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd) void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) { int result = cmd->result; - int this_count = cmd->request_bufflen; + int this_count = scsi_bufflen(cmd); struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; int clear_errors = 1; @@ -840,8 +845,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) int sense_valid = 0; int sense_deferred = 0; - scsi_release_buffers(cmd); - if (result) { sense_valid = scsi_command_normalize_sense(cmd, &sshdr); if (sense_valid) @@ -864,9 +867,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) req->sense_len = len; } } - req->data_len = cmd->resid; + if (scsi_bidi_cmnd(cmd)) { + /* will also release_buffers */ + scsi_end_bidi_request(cmd); + return; + } + req->data_len = scsi_get_resid(cmd); } + BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */ + scsi_release_buffers(cmd); + /* * Next deal with any sectors which we were able to correctly * handle. @@ -874,7 +885,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " "%d bytes done.\n", req->nr_sectors, good_bytes)); - SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg)); if (clear_errors) req->errors = 0; @@ -991,52 +1001,80 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_end_request(cmd, -EIO, this_count, !result); } -/* - * Function: scsi_init_io() - * - * Purpose: SCSI I/O initialize function. - * - * Arguments: cmd - Command descriptor we wish to initialize - * - * Returns: 0 on success - * BLKPREP_DEFER if the failure is retryable - */ -static int scsi_init_io(struct scsi_cmnd *cmd) +static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, + gfp_t gfp_mask) { - struct request *req = cmd->request; - int count; - - /* - * We used to not use scatter-gather for single segment request, - * but now we do (it makes highmem I/O easier to support without - * kmapping pages) - */ - cmd->use_sg = req->nr_phys_segments; + int count; /* * If sg table allocation fails, requeue request later. */ - if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) { - scsi_unprep_request(req); + if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments, + gfp_mask))) { return BLKPREP_DEFER; } req->buffer = NULL; if (blk_pc_request(req)) - cmd->request_bufflen = req->data_len; + sdb->length = req->data_len; else - cmd->request_bufflen = req->nr_sectors << 9; + sdb->length = req->nr_sectors << 9; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. */ - count = blk_rq_map_sg(req->q, req, cmd->request_buffer); - BUG_ON(count > cmd->use_sg); - cmd->use_sg = count; + count = blk_rq_map_sg(req->q, req, sdb->table.sgl); + BUG_ON(count > sdb->table.nents); + sdb->table.nents = count; return BLKPREP_OK; } +/* + * Function: scsi_init_io() + * + * Purpose: SCSI I/O initialize function. 
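
/*
 * Illustrative model (not the kernel API): how scsi_end_bidi_request()
 * hands both sides' residual byte counts back through the paired requests.
 * The struct names and values here are stand-ins invented for the sketch.
 */
#include <stdio.h>

struct sdb { unsigned int length, resid; };
struct req { unsigned int data_len; struct req *next_rq; };

static void end_bidi(struct req *r, struct sdb *out, struct sdb *in)
{
	/* leave any unfinished byte counts where the upper layer can see them */
	r->data_len = out->resid;
	r->next_rq->data_len = in->resid;
}

int main(void)
{
	struct req in_rq  = { 4096, NULL };
	struct req out_rq = { 8192, &in_rq };
	struct sdb out = { 8192, 512 };	/* 512 bytes of the out side left over */
	struct sdb in  = { 4096, 0 };

	end_bidi(&out_rq, &out, &in);
	printf("out resid=%u in resid=%u\n", out_rq.data_len, in_rq.data_len);
	return 0;
}
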
+ * + * Arguments: cmd - Command descriptor we wish to initialize + * + * Returns: 0 on success + * BLKPREP_DEFER if the failure is retryable + * BLKPREP_KILL if the failure is fatal + */ +int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) +{ + int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); + if (error) + goto err_exit; + + if (blk_bidi_rq(cmd->request)) { + struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( + scsi_bidi_sdb_cache, GFP_ATOMIC); + if (!bidi_sdb) { + error = BLKPREP_DEFER; + goto err_exit; + } + + cmd->request->next_rq->special = bidi_sdb; + error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, + GFP_ATOMIC); + if (error) + goto err_exit; + } + + return BLKPREP_OK ; + +err_exit: + scsi_release_buffers(cmd); + if (error == BLKPREP_KILL) + scsi_put_command(cmd); + else /* BLKPREP_DEFER */ + scsi_unprep_request(cmd->request); + + return error; +} +EXPORT_SYMBOL(scsi_init_io); + static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, struct request *req) { @@ -1081,16 +1119,14 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) BUG_ON(!req->nr_phys_segments); - ret = scsi_init_io(cmd); + ret = scsi_init_io(cmd, GFP_ATOMIC); if (unlikely(ret)) return ret; } else { BUG_ON(req->data_len); BUG_ON(req->data); - cmd->request_bufflen = 0; - cmd->request_buffer = NULL; - cmd->use_sg = 0; + memset(&cmd->sdb, 0, sizeof(cmd->sdb)); req->buffer = NULL; } @@ -1132,7 +1168,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) if (unlikely(!cmd)) return BLKPREP_DEFER; - return scsi_init_io(cmd); + return scsi_init_io(cmd, GFP_ATOMIC); } EXPORT_SYMBOL(scsi_setup_fs_cmnd); @@ -1542,20 +1578,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, * this limit is imposed by hardware restrictions */ blk_queue_max_hw_segments(q, shost->sg_tablesize); - - /* - * In the future, sg chaining support will be mandatory and this - * ifdef can then go away. Right now we don't have all archs - * converted, so better keep it safe. 
- */ -#ifdef ARCH_HAS_SG_CHAIN - if (shost->use_sg_chaining) - blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); - else - blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); -#else - blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); -#endif + blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); blk_queue_max_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); @@ -1654,6 +1677,14 @@ int __init scsi_init_queue(void) return -ENOMEM; } + scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb", + sizeof(struct scsi_data_buffer), + 0, 0, NULL); + if (!scsi_bidi_sdb_cache) { + printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n"); + goto cleanup_io_context; + } + for (i = 0; i < SG_MEMPOOL_NR; i++) { struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); @@ -1663,6 +1694,7 @@ int __init scsi_init_queue(void) if (!sgp->slab) { printk(KERN_ERR "SCSI: can't init sg slab %s\n", sgp->name); + goto cleanup_bidi_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, @@ -1670,10 +1702,25 @@ int __init scsi_init_queue(void) if (!sgp->pool) { printk(KERN_ERR "SCSI: can't init sg mempool %s\n", sgp->name); + goto cleanup_bidi_sdb; } } return 0; + +cleanup_bidi_sdb: + for (i = 0; i < SG_MEMPOOL_NR; i++) { + struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; + if (sgp->pool) + mempool_destroy(sgp->pool); + if (sgp->slab) + kmem_cache_destroy(sgp->slab); + } + kmem_cache_destroy(scsi_bidi_sdb_cache); +cleanup_io_context: + kmem_cache_destroy(scsi_io_context_cache); + + return -ENOMEM; } void scsi_exit_queue(void) @@ -1681,6 +1728,7 @@ void scsi_exit_queue(void) int i; kmem_cache_destroy(scsi_io_context_cache); + kmem_cache_destroy(scsi_bidi_sdb_cache); for (i = 0; i < SG_MEMPOOL_NR; i++) { struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c index 01e03f3f6ff..91630baea53 100644 --- a/drivers/scsi/scsi_tgt_lib.c +++ b/drivers/scsi/scsi_tgt_lib.c @@ -331,8 +331,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd) scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); - if (scsi_sglist(cmd)) - scsi_free_sgtable(cmd); + scsi_release_buffers(cmd); queue_work(scsi_tgtd, &tcmd->work); } @@ -353,25 +352,6 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd) return 0; } -static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask) -{ - struct request *rq = cmd->request; - int count; - - cmd->use_sg = rq->nr_phys_segments; - if (scsi_alloc_sgtable(cmd, gfp_mask)) - return -ENOMEM; - - cmd->request_bufflen = rq->data_len; - - dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd), - rq_data_dir(rq)); - count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd)); - BUG_ON(count > cmd->use_sg); - cmd->use_sg = count; - return 0; -} - /* TODO: test this crap and replace bio_map_user with new interface maybe */ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, unsigned long uaddr, unsigned int len, int rw) @@ -397,9 +377,11 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, } tcmd->bio = rq->bio; - err = scsi_tgt_init_cmd(cmd, GFP_KERNEL); - if (err) + err = scsi_init_io(cmd, GFP_KERNEL); + if (err) { + scsi_release_buffers(cmd); goto unmap_rq; + } return 0; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 24eba3118b5..51a5557f42d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue 
*q, struct request *rq) SCpnt->cmnd[4] = (unsigned char) this_count; SCpnt->cmnd[5] = 0; } - SCpnt->request_bufflen = this_count * sdp->sector_size; + SCpnt->sdb.length = this_count * sdp->sector_size; /* * We shouldn't disconnect in the middle of a sector, so with a dumb @@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = { static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; - unsigned int xfer_size = SCpnt->request_bufflen; + unsigned int xfer_size = scsi_bufflen(SCpnt); unsigned int good_bytes = result ? 0 : xfer_size; u64 start_lba = SCpnt->request->sector; u64 bad_lba; diff --git a/drivers/scsi/sgiwd93.c b/drivers/scsi/sgiwd93.c index d4ebe8c67ba..26cfc56c709 100644 --- a/drivers/scsi/sgiwd93.c +++ b/drivers/scsi/sgiwd93.c @@ -33,10 +33,9 @@ struct ip22_hostdata { struct WD33C93_hostdata wh; - struct hpc_data { - dma_addr_t dma; - void *cpu; - } hd; + dma_addr_t dma; + void *cpu; + struct device *dev; }; #define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) @@ -46,6 +45,11 @@ struct hpc_chunk { u32 _padding; /* align to quadword boundary */ }; +/* space for hpc dma descriptors */ +#define HPC_DMA_SIZE PAGE_SIZE + +#define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) + static irqreturn_t sgiwd93_intr(int irq, void *dev_id) { struct Scsi_Host * host = dev_id; @@ -59,15 +63,17 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id) } static inline -void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) +void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din) { unsigned long len = cmd->SCp.this_residual; void *addr = cmd->SCp.ptr; dma_addr_t physaddr; unsigned long count; + struct hpc_chunk *hcp; - physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction); + physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din)); cmd->SCp.dma_handle = physaddr; + hcp = hd->cpu; while (len) { /* @@ -89,6 +95,9 @@ void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) */ hcp->desc.pbuf = 0; hcp->desc.cntinfo = HPCDMA_EOX; + dma_cache_sync(hd->dev, hd->cpu, + (unsigned long)(hcp + 1) - (unsigned long)hd->cpu, + DMA_TO_DEVICE); } static int dma_setup(struct scsi_cmnd *cmd, int datainp) @@ -96,9 +105,8 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); struct hpc3_scsiregs *hregs = (struct hpc3_scsiregs *) cmd->device->host->base; - struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu; - pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp); + pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu); hdata->wh.dma_dir = datainp; @@ -111,12 +119,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) return 1; - fill_hpc_entries(hcp, cmd, datainp); + fill_hpc_entries(hdata, cmd, datainp); pr_debug(" HPCGO\n"); /* Start up the HPC. 
*/ - hregs->ndptr = hdata->hd.dma; + hregs->ndptr = hdata->dma; if (datainp) hregs->ctrl = HPC3_SCTRL_ACTIVE; else @@ -134,6 +142,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, if (!SCpnt) return; + if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0) + return; + hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; pr_debug("dma_stop: status<%d> ", status); @@ -145,8 +156,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, barrier(); } hregs->ctrl = 0; - dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual, - SCpnt->sc_data_direction); + dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle, + SCpnt->SCp.this_residual, + DMA_DIR(hdata->wh.dma_dir)); pr_debug("\n"); } @@ -161,22 +173,23 @@ void sgiwd93_reset(unsigned long base) } EXPORT_SYMBOL_GPL(sgiwd93_reset); -static inline void init_hpc_chain(struct hpc_data *hd) +static inline void init_hpc_chain(struct ip22_hostdata *hdata) { - struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu; - struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma; + struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu; + dma_addr_t dma = hdata->dma; unsigned long start, end; start = (unsigned long) hcp; - end = start + PAGE_SIZE; + end = start + HPC_DMA_SIZE; while (start < end) { - hcp->desc.pnext = (u32) (dma + 1); + hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk)); hcp->desc.cntinfo = HPCDMA_EOX; - hcp++; dma++; + hcp++; + dma += sizeof(struct hpc_chunk); start += sizeof(struct hpc_chunk); }; hcp--; - hcp->desc.pnext = hd->dma; + hcp->desc.pnext = hdata->dma; } static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) @@ -235,16 +248,17 @@ static int __init sgiwd93_probe(struct platform_device *pdev) host->irq = irq; hdata = host_to_hostdata(host); - hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, - &hdata->hd.dma, GFP_KERNEL); - if (!hdata->hd.cpu) { + hdata->dev = &pdev->dev; + hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE, + &hdata->dma, GFP_KERNEL); + if (!hdata->cpu) { printk(KERN_WARNING "sgiwd93: Could not allocate memory for " "host %d buffer.\n", unit); err = -ENOMEM; goto out_put; } - init_hpc_chain(&hdata->hd); + init_hpc_chain(hdata); regs.SASR = wdregs + 3; regs.SCMD = wdregs + 7; @@ -274,7 +288,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev) out_irq: free_irq(irq, host); out_free: - dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); + dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); out_put: scsi_host_put(host); out: @@ -290,7 +304,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev) scsi_remove_host(host); free_irq(pd->irq, host); - dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); + dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); scsi_host_put(host); } diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 1fcee16fa36..50ba4925020 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -231,7 +231,7 @@ out: static int sr_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; - int this_count = SCpnt->request_bufflen; + int this_count = scsi_bufflen(SCpnt); int good_bytes = (result == 0 ? 
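
/*
 * Illustrative sketch (not from the patch): the circular descriptor chain
 * init_hpc_chain() builds, with the address stepping done on the bus
 * address itself rather than through pointer arithmetic. The sizes and the
 * pretend bus address are assumptions for the example.
 */
#include <stdint.h>
#include <stdio.h>

struct desc { uint32_t pnext; uint32_t cntinfo; };
#define NDESC 4

int main(void)
{
	struct desc ring[NDESC];
	uint32_t dma_base = 0x08000000;	/* pretend bus address of ring[0] */
	int i;

	for (i = 0; i < NDESC; i++) {
		/* the last entry wraps back to the first, as in the driver */
		ring[i].pnext = dma_base +
			(uint32_t)(((i + 1) % NDESC) * sizeof(struct desc));
		ring[i].cntinfo = 0;	/* stands in for HPCDMA_EOX */
	}
	for (i = 0; i < NDESC; i++)
		printf("desc %d -> 0x%08x\n", i, (unsigned)ring[i].pnext);
	return 0;
}
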
this_count : 0); int block_sectors = 0; long error_sector; @@ -379,17 +379,18 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) } { - struct scatterlist *sg = SCpnt->request_buffer; - int i, size = 0; - for (i = 0; i < SCpnt->use_sg; i++) - size += sg[i].length; + struct scatterlist *sg; + int i, size = 0, sg_count = scsi_sg_count(SCpnt); - if (size != SCpnt->request_bufflen && SCpnt->use_sg) { + scsi_for_each_sg(SCpnt, sg, sg_count, i) + size += sg->length; + + if (size != scsi_bufflen(SCpnt)) { scmd_printk(KERN_ERR, SCpnt, "mismatch count %d, bytes %d\n", - size, SCpnt->request_bufflen); - if (SCpnt->request_bufflen > size) - SCpnt->request_bufflen = size; + size, scsi_bufflen(SCpnt)); + if (scsi_bufflen(SCpnt) > size) + SCpnt->sdb.length = size; } } @@ -397,12 +398,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) * request doesn't start on hw block boundary, add scatter pads */ if (((unsigned int)rq->sector % (s_size >> 9)) || - (SCpnt->request_bufflen % s_size)) { + (scsi_bufflen(SCpnt) % s_size)) { scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); goto out; } - this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9); + this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9); SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", @@ -416,7 +417,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) if (this_count > 0xffff) { this_count = 0xffff; - SCpnt->request_bufflen = this_count * s_size; + SCpnt->sdb.length = this_count * s_size; } SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index e3fab3a6aed..72f6d801535 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1123,7 +1123,6 @@ static struct scsi_host_template driver_template = { .this_id = -1, .sg_tablesize = ST_MAX_SG, .cmd_per_lun = ST_CMD_PER_LUN, - .use_sg_chaining = ENABLE_SG_CHAINING, }; static int stex_set_dma_mask(struct pci_dev * pdev) diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c index 1f6fd168033..6325901e509 100644 --- a/drivers/scsi/sym53c416.c +++ b/drivers/scsi/sym53c416.c @@ -840,6 +840,5 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = 1, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c index 21e926dcdab..d39107b7669 100644 --- a/drivers/scsi/sym53c8xx_2/sym_glue.c +++ b/drivers/scsi/sym53c8xx_2/sym_glue.c @@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid) /* * Bounce back the sense data to user. 
*/ - memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); + memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); memcpy(cmd->sense_buffer, cp->sns_bbuf, min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); #if 0 @@ -1681,7 +1681,6 @@ static struct scsi_host_template sym2_template = { .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, .this_id = 7, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, .max_sectors = 0xFFFF, #ifdef SYM_LINUX_PROC_INFO_SUPPORT .proc_info = sym53c8xx_proc_info, diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c index 4bc5407f969..662c00451be 100644 --- a/drivers/scsi/u14-34f.c +++ b/drivers/scsi/u14-34f.c @@ -451,7 +451,6 @@ static struct scsi_host_template driver_template = { .this_id = 7, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c index 75eca6b22db..f385dce8dfb 100644 --- a/drivers/scsi/ultrastor.c +++ b/drivers/scsi/ultrastor.c @@ -1204,6 +1204,5 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c index b4304ae7852..c975c01b3a0 100644 --- a/drivers/scsi/wd7000.c +++ b/drivers/scsi/wd7000.c @@ -1671,7 +1671,6 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = 1, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, - .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c index 178e8c2a8a2..0db488624ab 100644 --- a/drivers/usb/storage/isd200.c +++ b/drivers/usb/storage/isd200.c @@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info, sg_init_one(&info->sg, buff, bufflen); srb->sc_data_direction = dir; - srb->request_buffer = buff ? &info->sg : NULL; - srb->request_bufflen = bufflen; - srb->use_sg = buff ? 1 : 0; + srb->sdb.table.sgl = buff ? &info->sg : NULL; + srb->sdb.length = bufflen; + srb->sdb.table.nents = buff ? 1 : 0; } static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) { - srb->request_bufflen = bufflen; + srb->sdb.length = bufflen; } diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 899fc13d061..afcdc69e37d 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -609,7 +609,7 @@ config SBC_EPX_C3_WATCHDOG config INDYDOG tristate "Indy/I2 Hardware Watchdog" - depends on SGI_IP22 + depends on SGI_HAS_INDYDOG help Hardware driver for the Indy's/I2's watchdog. 
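
/*
 * Illustrative sketch (not from the patch): why the sense-buffer fixes
 * above drop the '&'. Once sense_buffer is a pointer field rather than an
 * embedded array, &field yields the address of the pointer itself, not of
 * the sense data. The struct here is a stand-in.
 */
#include <stdio.h>

struct cmd { unsigned char *sense_buffer; };

int main(void)
{
	unsigned char sense[4] = { 0x70, 0x00, 0x05, 0x00 };
	struct cmd c = { sense };
	unsigned char *ok  = c.sense_buffer;			/* the data */
	unsigned char *bad = (unsigned char *)&c.sense_buffer;	/* the pointer */

	printf("ok[0]=%02x, ok == bad? %s\n", ok[0],
	       ok == bad ? "yes" : "no");			/* 70, no */
	return 0;
}
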
This is a watchdog timer that will reboot the machine after a 60 second diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 46754553fdc..ff97ba92433 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) spin_unlock(&ls->ls_recover_list_lock); if (!found) - de = allocate_direntry(ls, len); + de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL); return de; } @@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls) de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, list); list_del(&de->list); - free_direntry(de); + kfree(de); } spin_unlock(&ls->ls_recover_list_lock); } @@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen } list_del(&de->list); - free_direntry(de); + kfree(de); out: write_unlock(&ls->ls_dirtbl[bucket].lock); } @@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, write_unlock(&ls->ls_dirtbl[bucket].lock); - de = allocate_direntry(ls, namelen); + de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); if (!de) return -ENOMEM; @@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, write_lock(&ls->ls_dirtbl[bucket].lock); tmp = search_bucket(ls, name, namelen, bucket); if (tmp) { - free_direntry(de); + kfree(de); de = tmp; } else { list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); @@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, return get_entry(ls, nodeid, name, namelen, r_nodeid); } -/* Copy the names of master rsb's into the buffer provided. - Only select names whose dir node is the given nodeid. */ +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) +{ + struct dlm_rsb *r; + + down_read(&ls->ls_root_sem); + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { + if (len == r->res_length && !memcmp(name, r->res_name, len)) { + up_read(&ls->ls_root_sem); + return r; + } + } + up_read(&ls->ls_root_sem); + return NULL; +} + +/* Find the rsb where we left off (or start again), then send rsb names + for rsb's we're master of and whose directory node matches the requesting + node. inbuf is the rsb name last sent, inlen is the name's length */ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, char *outbuf, int outlen, int nodeid) { struct list_head *list; - struct dlm_rsb *start_r = NULL, *r = NULL; - int offset = 0, start_namelen, error, dir_nodeid; - char *start_name; + struct dlm_rsb *r; + int offset = 0, dir_nodeid; uint16_t be_namelen; - /* - * Find the rsb where we left off (or start again) - */ - - start_namelen = inlen; - start_name = inbuf; - - if (start_namelen > 1) { - /* - * We could also use a find_rsb_root() function here that - * searched the ls_root_list. - */ - error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER, - &start_r); - DLM_ASSERT(!error && start_r, - printk("error %d\n", error);); - DLM_ASSERT(!list_empty(&start_r->res_root_list), - dlm_print_rsb(start_r);); - dlm_put_rsb(start_r); - } - - /* - * Send rsb names for rsb's we're master of and whose directory node - * matches the requesting node. 
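
/*
 * Illustrative sketch (not from the patch): the counted-name lookup that
 * find_rsb_root() performs over the root list. rsb names carry an explicit
 * length instead of a NUL terminator, hence memcmp(). Types and data are
 * stand-ins for the example.
 */
#include <stdio.h>
#include <string.h>

struct rsb { const char *name; int len; };

static struct rsb *find_root(struct rsb *tbl, int n, const char *name, int len)
{
	int i;

	for (i = 0; i < n; i++)
		if (tbl[i].len == len && !memcmp(tbl[i].name, name, len))
			return &tbl[i];
	return NULL;	/* caller logs the miss and aborts the copy */
}

int main(void)
{
	struct rsb roots[] = { { "alpha", 5 }, { "beta", 4 } };

	printf("%s\n", find_root(roots, 2, "beta", 4) ? "found" : "missing");
	return 0;
}
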
- */ - down_read(&ls->ls_root_sem); - if (start_r) - list = start_r->res_root_list.next; - else + + if (inlen > 1) { + r = find_rsb_root(ls, inbuf, inlen); + if (!r) { + inbuf[inlen - 1] = '\0'; + log_error(ls, "copy_master_names from %d start %d %s", + nodeid, inlen, inbuf); + goto out; + } + list = r->res_root_list.next; + } else { list = ls->ls_root_list.next; + } for (offset = 0; list != &ls->ls_root_list; list = list->next) { r = list_entry(list, struct dlm_rsb, res_root_list); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index d2fc2384c3b..ec61bbaf25d 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; } +int dlm_netlink_init(void); +void dlm_netlink_exit(void); +void dlm_timeout_warn(struct dlm_lkb *lkb); + +#ifdef CONFIG_DLM_DEBUG +int dlm_register_debugfs(void); +void dlm_unregister_debugfs(void); +int dlm_create_debug_file(struct dlm_ls *ls); +void dlm_delete_debug_file(struct dlm_ls *ls); +#else +static inline int dlm_register_debugfs(void) { return 0; } +static inline void dlm_unregister_debugfs(void) { } +static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } +static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +#endif + #endif /* __DLM_INTERNAL_DOT_H__ */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 3915b8e1414..ff4a198fa67 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, static int receive_extralen(struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); static void del_timeout(struct dlm_lkb *lkb); -void dlm_timeout_warn(struct dlm_lkb *lkb); /* * Lock compatibilty matrix - thanks Steve @@ -335,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) { struct dlm_rsb *r; - r = allocate_rsb(ls, len); + r = dlm_allocate_rsb(ls, len); if (!r) return NULL; @@ -478,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); if (!error) { write_unlock(&ls->ls_rsbtbl[bucket].lock); - free_rsb(r); + dlm_free_rsb(r); r = tmp; goto out; } @@ -490,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, return error; } -int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, - unsigned int flags, struct dlm_rsb **r_ret) -{ - return find_rsb(ls, name, namelen, flags, r_ret); -} - /* This is only called to add a reference when the code already holds a valid reference to the rsb, so there's no need for locking. 
*/ @@ -519,7 +512,7 @@ static void toss_rsb(struct kref *kref) list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; if (r->res_lvbptr) { - free_lvb(r->res_lvbptr); + dlm_free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; } } @@ -589,7 +582,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) uint32_t lkid = 0; uint16_t bucket; - lkb = allocate_lkb(ls); + lkb = dlm_allocate_lkb(ls); if (!lkb) return -ENOMEM; @@ -683,8 +676,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) /* for local/process lkbs, lvbptr points to caller's lksb */ if (lkb->lkb_lvbptr && is_master_copy(lkb)) - free_lvb(lkb->lkb_lvbptr); - free_lkb(lkb); + dlm_free_lvb(lkb->lkb_lvbptr); + dlm_free_lkb(lkb); return 1; } else { write_unlock(&ls->ls_lkbtbl[bucket].lock); @@ -988,7 +981,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) if (is_master(r)) dir_remove(r); - free_rsb(r); + dlm_free_rsb(r); count++; } else { write_unlock(&ls->ls_rsbtbl[b].lock); @@ -1171,7 +1164,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) return; if (!r->res_lvbptr) - r->res_lvbptr = allocate_lvb(r->res_ls); + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); if (!r->res_lvbptr) return; @@ -1203,7 +1196,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) return; if (!r->res_lvbptr) - r->res_lvbptr = allocate_lvb(r->res_ls); + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); if (!r->res_lvbptr) return; @@ -1852,7 +1845,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) { struct dlm_ls *ls = r->res_ls; - int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); + int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); @@ -1886,7 +1879,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 1; } - for (;;) { + for (i = 0; i < 2; i++) { /* It's possible for dlm_scand to remove an old rsb for this same resource from the toss list, us to create a new one, look up the master locally, and find it @@ -1900,6 +1893,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) log_debug(ls, "dir_lookup error %d %s", error, r->res_name); schedule(); } + if (error && error != -EEXIST) + return error; if (ret_nodeid == our_nodeid) { r->res_first_lkid = 0; @@ -1941,8 +1936,11 @@ static void confirm_master(struct dlm_rsb *r, int error) break; case -EAGAIN: - /* the remote master didn't queue our NOQUEUE request; - make a waiting lkb the first_lkid */ + case -EBADR: + case -ENOTBLK: + /* the remote request failed and won't be retried (it was + a NOQUEUE, or has been canceled/unlocked); make a waiting + lkb the first_lkid */ r->res_first_lkid = 0; @@ -2108,17 +2106,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) /* an lkb may be waiting for an rsb lookup to complete where the lookup was initiated by another lock */ - if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { - if (!list_empty(&lkb->lkb_rsb_lookup)) { + if (!list_empty(&lkb->lkb_rsb_lookup)) { + if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); list_del_init(&lkb->lkb_rsb_lookup); queue_cast(lkb->lkb_resource, lkb, args->flags & DLM_LKF_CANCEL ? 
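
/*
 * Illustrative sketch (not from the patch): the bounded retry that replaces
 * set_master()'s infinite lookup loop. dir_lookup() here is a fake that
 * races once and then succeeds; a persistent error is now returned to the
 * caller rather than retried forever.
 */
#include <stdio.h>

#define FAKE_EEXIST (-17)	/* transient: rsb re-created under us */

static int dir_lookup(int attempt)
{
	return attempt == 0 ? FAKE_EEXIST : 0;
}

static int set_master_lookup(void)
{
	int i, error = 0;

	for (i = 0; i < 2; i++) {
		error = dir_lookup(i);
		if (!error)
			return 0;		/* got the master nodeid */
		if (error != FAKE_EEXIST)
			return error;		/* persistent failure: give up */
	}
	return error;
}

int main(void)
{
	printf("lookup -> %d\n", set_master_lookup());	/* 0 after one retry */
	return 0;
}
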
-DLM_ECANCEL : -DLM_EUNLOCK); unhold_lkb(lkb); /* undoes create_lkb() */ - rv = -EBUSY; - goto out; } + /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ + rv = -EBUSY; + goto out; } /* cancel not allowed with another cancel/unlock in progress */ @@ -2986,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (lkb->lkb_exflags & DLM_LKF_VALBLK) { if (!lkb->lkb_lvbptr) - lkb->lkb_lvbptr = allocate_lvb(ls); + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); @@ -3006,11 +3005,9 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); - DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb);); - if (lkb->lkb_exflags & DLM_LKF_VALBLK) { /* lkb was just created so there won't be an lvb yet */ - lkb->lkb_lvbptr = allocate_lvb(ls); + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; } @@ -3021,16 +3018,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms) { - if (lkb->lkb_nodeid != ms->m_header.h_nodeid) { - log_error(ls, "convert_args nodeid %d %d lkid %x %x", - lkb->lkb_nodeid, ms->m_header.h_nodeid, - lkb->lkb_id, lkb->lkb_remid); - return -EINVAL; - } - - if (!is_master_copy(lkb)) - return -EINVAL; - if (lkb->lkb_status != DLM_LKSTS_GRANTED) return -EBUSY; @@ -3046,8 +3033,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms) { - if (!is_master_copy(lkb)) - return -EINVAL; if (receive_lvb(ls, lkb, ms)) return -ENOMEM; return 0; @@ -3063,6 +3048,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_remid = ms->m_lkid; } +/* This is called after the rsb is locked so that we can safely inspect + fields in the lkb. 
*/ + +static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) +{ + int from = ms->m_header.h_nodeid; + int error = 0; + + switch (ms->m_type) { + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_CANCEL: + if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_UNLOCK_REPLY: + case DLM_MSG_CANCEL_REPLY: + case DLM_MSG_GRANT: + case DLM_MSG_BAST: + if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + case DLM_MSG_REQUEST_REPLY: + if (!is_process_copy(lkb)) + error = -EINVAL; + else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) + error = -EINVAL; + break; + + default: + error = -EINVAL; + } + + if (error) + log_error(lkb->lkb_resource->res_ls, + "ignore invalid message %d from %d %x %x %x %d", + ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, + lkb->lkb_flags, lkb->lkb_nodeid); + return error; +} + static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; @@ -3124,17 +3153,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + receive_flags(lkb, ms); error = receive_convert_args(ls, lkb, ms); if (error) - goto out; + goto out_reply; reply = !down_conversion(lkb); error = do_convert(r, lkb); - out: + out_reply: if (reply) send_convert_reply(r, lkb, error); - + out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3160,15 +3193,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + receive_flags(lkb, ms); error = receive_unlock_args(ls, lkb, ms); if (error) - goto out; + goto out_reply; error = do_unlock(r, lkb); - out: + out_reply: send_unlock_reply(r, lkb, error); - + out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3196,9 +3233,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + error = do_cancel(r, lkb); send_cancel_reply(r, lkb, error); - + out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3217,22 +3258,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_error(ls, "receive_grant no lkb"); + log_debug(ls, "receive_grant from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); return; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + receive_flags_reply(lkb, ms); if (is_altmode(lkb)) munge_altmode(lkb, ms); grant_lock_pc(r, lkb, ms); queue_cast(r, lkb, 0); - + out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3246,18 +3291,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_error(ls, "receive_bast no lkb"); + log_debug(ls, "receive_bast from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); return; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - queue_bast(r, lkb, ms->m_bastmode); + error = validate_message(lkb, ms); + if (error) + goto out; + queue_bast(r, lkb, ms->m_bastmode); + out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3323,15 +3372,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message 
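
/*
 * Illustrative model (not the kernel API): the direction check at the heart
 * of the new validate_message(). Requests must land on a master copy and
 * replies on a process copy, and the sender must match the recorded nodeid.
 * Types and message names are stand-ins for the sketch.
 */
#include <stdio.h>

enum msg { MSG_CONVERT, MSG_CONVERT_REPLY };
struct lkb { int is_master_copy; int nodeid; };

static int validate(const struct lkb *l, enum msg type, int from)
{
	switch (type) {
	case MSG_CONVERT:		/* master-bound request */
		return (l->is_master_copy && l->nodeid == from) ? 0 : -1;
	case MSG_CONVERT_REPLY:		/* process-bound reply */
		return (!l->is_master_copy && l->nodeid == from) ? 0 : -1;
	}
	return -1;
}

int main(void)
{
	struct lkb l = { 1, 3 };	/* master copy owned by nodeid 3 */

	printf("%d %d\n", validate(&l, MSG_CONVERT, 3),	/* 0: accept */
	       validate(&l, MSG_CONVERT, 5));		/* -1: ignore */
	return 0;
}
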
*ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_error(ls, "receive_request_reply no lkb"); + log_debug(ls, "receive_request_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); return; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + mstype = lkb->lkb_wait_type; error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); if (error) @@ -3383,6 +3436,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) if (is_overlap(lkb)) { /* we'll ignore error in cancel/unlock reply */ queue_cast_overlap(r, lkb); + confirm_master(r, result); unhold_lkb(lkb); /* undoes create_lkb() */ } else _request_lock(r, lkb); @@ -3463,6 +3517,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3481,10 +3539,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_error(ls, "receive_convert_reply no lkb"); + log_debug(ls, "receive_convert_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); return; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_convert_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3498,6 +3556,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3529,10 +3591,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_error(ls, "receive_unlock_reply no lkb"); + log_debug(ls, "receive_unlock_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); return; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_unlock_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3546,6 +3608,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); + error = validate_message(lkb, ms); + if (error) + goto out; + /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3577,10 +3643,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_error(ls, "receive_cancel_reply no lkb"); + log_debug(ls, "receive_cancel_reply from %d no lkb %x", + ms->m_header.h_nodeid, ms->m_remid); return; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_cancel_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3640,6 +3706,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) { + if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { + log_debug(ls, "ignore non-member message %d from %d %x %x %d", + ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_remid, ms->m_result); + return; + } + switch (ms->m_type) { /* messages sent to a master node */ @@ -3778,8 +3851,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid) ls = dlm_find_lockspace_global(hd->h_lockspace); if (!ls) { - log_print("invalid h_lockspace %x from %d 
cmd %d type %d", - hd->h_lockspace, nodeid, hd->h_cmd, type); + if (dlm_config.ci_log_debug) + log_print("invalid lockspace %x from %d cmd %d type %d", + hd->h_lockspace, nodeid, hd->h_cmd, type); if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) dlm_send_ls_not_ready(nodeid, rc); @@ -3806,6 +3880,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; ls->ls_stub_ms.m_result = -EINPROGRESS; ls->ls_stub_ms.m_flags = lkb->lkb_flags; + ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_convert_reply(lkb, &ls->ls_stub_ms); /* Same special case as in receive_rcom_lock_args() */ @@ -3847,6 +3922,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) void dlm_recover_waiters_pre(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; + int wait_type, stub_unlock_result, stub_cancel_result; mutex_lock(&ls->ls_waiters_mutex); @@ -3865,7 +3941,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) if (!waiter_needs_recovery(ls, lkb)) continue; - switch (lkb->lkb_wait_type) { + wait_type = lkb->lkb_wait_type; + stub_unlock_result = -DLM_EUNLOCK; + stub_cancel_result = -DLM_ECANCEL; + + /* Main reply may have been received leaving a zero wait_type, + but a reply for the overlapping op may not have been + received. In that case we need to fake the appropriate + reply for the overlap op. */ + + if (!wait_type) { + if (is_overlap_cancel(lkb)) { + wait_type = DLM_MSG_CANCEL; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_cancel_result = 0; + } + if (is_overlap_unlock(lkb)) { + wait_type = DLM_MSG_UNLOCK; + if (lkb->lkb_grmode == DLM_LOCK_IV) + stub_unlock_result = -ENOENT; + } + + log_debug(ls, "rwpre overlap %x %x %d %d %d", + lkb->lkb_id, lkb->lkb_flags, wait_type, + stub_cancel_result, stub_unlock_result); + } + + switch (wait_type) { case DLM_MSG_REQUEST: lkb->lkb_flags |= DLM_IFL_RESEND; @@ -3878,8 +3980,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_UNLOCK: hold_lkb(lkb); ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; - ls->ls_stub_ms.m_result = -DLM_EUNLOCK; + ls->ls_stub_ms.m_result = stub_unlock_result; ls->ls_stub_ms.m_flags = lkb->lkb_flags; + ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_unlock_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); break; @@ -3887,15 +3990,16 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_CANCEL: hold_lkb(lkb); ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; - ls->ls_stub_ms.m_result = -DLM_ECANCEL; + ls->ls_stub_ms.m_result = stub_cancel_result; ls->ls_stub_ms.m_flags = lkb->lkb_flags; + ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_cancel_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); break; default: - log_error(ls, "invalid lkb wait_type %d", - lkb->lkb_wait_type); + log_error(ls, "invalid lkb wait_type %d %d", + lkb->lkb_wait_type, wait_type); } schedule(); } @@ -4184,7 +4288,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); if (lkb->lkb_exflags & DLM_LKF_VALBLK) { - lkb->lkb_lvbptr = allocate_lvb(ls); + lkb->lkb_lvbptr = dlm_allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - @@ -4259,7 +4363,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) put_rsb(r); out: if (error) - log_print("recover_master_copy %d %x", error, rl->rl_lkid); + log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid); rl->rl_result = error; return error; 
} @@ -4342,7 +4446,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } } - /* After ua is attached to lkb it will be freed by free_lkb(). + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). When DLM_IFL_USER is set, the dlm knows that this is a userspace lock and that lkb_astparam is the dlm_user_args structure. */ @@ -4679,6 +4783,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + lkb->lkb_ast_type = 0; list_del(&lkb->lkb_astqueue); dlm_put_lkb(lkb); } diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index ada04680a1e..27b6ed30291 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb); void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); void dlm_receive_buffer(struct dlm_header *hd, int nodeid); int dlm_modes_compat(int mode1, int mode2); -int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, - unsigned int flags, struct dlm_rsb **r_ret); void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); int dlm_put_lkb(struct dlm_lkb *lkb); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 5c108c49cb8..b180fdc5108 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -24,14 +24,6 @@ #include "recover.h" #include "requestqueue.h" -#ifdef CONFIG_DLM_DEBUG -int dlm_create_debug_file(struct dlm_ls *ls); -void dlm_delete_debug_file(struct dlm_ls *ls); -#else -static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } -static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } -#endif - static int ls_count; static struct mutex ls_lock; static struct list_head lslist; @@ -684,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_del_ast(lkb); if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) - free_lvb(lkb->lkb_lvbptr); + dlm_free_lvb(lkb->lkb_lvbptr); - free_lkb(lkb); + dlm_free_lkb(lkb); } } dlm_astd_resume(); @@ -704,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) res_hashchain); list_del(&rsb->res_hashchain); - free_rsb(rsb); + dlm_free_rsb(rsb); } head = &ls->ls_rsbtbl[i].toss; @@ -712,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) rsb = list_entry(head->next, struct dlm_rsb, res_hashchain); list_del(&rsb->res_hashchain); - free_rsb(rsb); + dlm_free_rsb(rsb); } } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e9923ca9c2d..7c1e5e5cccd 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con) static void tcp_connect_to_sock(struct connection *con) { int result = -EHOSTUNREACH; - struct sockaddr_storage saddr; + struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock; @@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con) con->connect_action = tcp_connect_to_sock; add_sock(sock, con); + /* Bind to our cluster-known address connecting to avoid + routing problems */ + memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, + addr_len); + if (result < 0) { + log_print("could not bind for connect: %d", result); + /* This *may* not indicate a critical error */ + } + make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); log_print("connecting to %d", con->nodeid); @@ -1426,6 +1437,8 @@ void dlm_lowcomms_stop(void) con = __nodeid2con(i, 0); if (con) { 
close_connection(con, true); + if (con->othercon) + kmem_cache_free(con_cache, con->othercon); kmem_cache_free(con_cache, con); } } diff --git a/fs/dlm/main.c b/fs/dlm/main.c index eca2907f238..58487fb95a4 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -18,16 +18,6 @@ #include "memory.h" #include "config.h" -#ifdef CONFIG_DLM_DEBUG -int dlm_register_debugfs(void); -void dlm_unregister_debugfs(void); -#else -static inline int dlm_register_debugfs(void) { return 0; } -static inline void dlm_unregister_debugfs(void) { } -#endif -int dlm_netlink_init(void); -void dlm_netlink_exit(void); - static int __init init_dlm(void) { int error; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index e9cdcab306e..fa17f5a2788 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) ls->ls_num_nodes--; } -static int dlm_is_member(struct dlm_ls *ls, int nodeid) +int dlm_is_member(struct dlm_ls *ls, int nodeid) { struct dlm_member *memb; diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 927c08c1921..7a26fca1e0b 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls); void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); +int dlm_is_member(struct dlm_ls *ls, int nodeid); #endif /* __MEMBER_DOT_H__ */ diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index ecf0e5cb203..f7783867491 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -35,7 +35,7 @@ void dlm_memory_exit(void) kmem_cache_destroy(lkb_cache); } -char *allocate_lvb(struct dlm_ls *ls) +char *dlm_allocate_lvb(struct dlm_ls *ls) { char *p; @@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls) return p; } -void free_lvb(char *p) +void dlm_free_lvb(char *p) { kfree(p); } @@ -51,7 +51,7 @@ void free_lvb(char *p) /* FIXME: have some minimal space built-in to rsb for the name and kmalloc a separate name if needed, like dentries are done */ -struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) { struct dlm_rsb *r; @@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) return r; } -void free_rsb(struct dlm_rsb *r) +void dlm_free_rsb(struct dlm_rsb *r) { if (r->res_lvbptr) - free_lvb(r->res_lvbptr); + dlm_free_lvb(r->res_lvbptr); kfree(r); } -struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; @@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) return lkb; } -void free_lkb(struct dlm_lkb *lkb) +void dlm_free_lkb(struct dlm_lkb *lkb) { if (lkb->lkb_flags & DLM_IFL_USER) { struct dlm_user_args *ua; @@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } -struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen) -{ - struct dlm_direntry *de; - - DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN, - printk("namelen = %d\n", namelen);); - - de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL); - return de; -} - -void free_direntry(struct dlm_direntry *de) -{ - kfree(de); -} - diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 6ead158ccc5..485fb29143b 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -16,14 +16,12 @@ int dlm_memory_init(void); void dlm_memory_exit(void); -struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); -void free_rsb(struct dlm_rsb *r); -struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); -void free_lkb(struct dlm_lkb *l); -struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); -void free_direntry(struct dlm_direntry *de); -char *allocate_lvb(struct dlm_ls *ls); -void free_lvb(char *l); +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); +void dlm_free_rsb(struct dlm_rsb *r); +struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); +void dlm_free_lkb(struct dlm_lkb *l); +char *dlm_allocate_lvb(struct dlm_ls *ls); +void dlm_free_lvb(char *l); #endif /* __MEMORY_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index f8c69dda16a..e69926e984d 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset, int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, unsigned len, unsigned limit) { - unsigned char __tmp[DLM_INBUF_LEN]; - struct dlm_header *msg = (struct dlm_header *) __tmp; + union { + unsigned char __buf[DLM_INBUF_LEN]; + /* this is to force proper alignment on some arches */ + struct dlm_header dlm; + } __tmp; + struct dlm_header *msg = &__tmp.dlm; int ret = 0; int err = 0; uint16_t msglen; @@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, in the buffer on the stack (which should work for most ordinary messages). */ - if (msglen > sizeof(__tmp) && - msg == (struct dlm_header *) __tmp) { + if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) { msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); if (msg == NULL) return ret; @@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, dlm_receive_buffer(msg, nodeid); } - if (msg != (struct dlm_header *) __tmp) + if (msg != &__tmp.dlm) kfree(msg); return err ? err : ret; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index ae2fd97fa4a..026824cd3ac 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_rcom_spin); } -static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) -{ - receive_sync_reply(ls, rc_in); -} - int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) { struct dlm_rcom *rc; @@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(ls, mh, rc); } -static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) -{ - receive_sync_reply(ls, rc_in); -} - int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) { struct dlm_rcom *rc; @@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(ls, mh, rc); } -static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) -{ - dlm_recover_process_copy(ls, rc_in); -} - /* If the lockspace doesn't exist then still send a status message back; it's possible that it just doesn't have its global_id yet. 
*/ @@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_STATUS_REPLY: - receive_rcom_status_reply(ls, rc); + receive_sync_reply(ls, rc); break; case DLM_RCOM_NAMES_REPLY: - receive_rcom_names_reply(ls, rc); + receive_sync_reply(ls, rc); break; case DLM_RCOM_LOOKUP_REPLY: @@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_LOCK_REPLY: - receive_rcom_lock_reply(ls, rc); + dlm_recover_process_copy(ls, rc); break; default: - DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); + log_error(ls, "receive_rcom bad type %d", rc->rc_type); } out: return; diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index c2cc7694cd1..df075dc300f 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r) goto out; if (!r->res_lvbptr) { - r->res_lvbptr = allocate_lvb(r->res_ls); + r->res_lvbptr = dlm_allocate_lvb(r->res_ls); if (!r->res_lvbptr) goto out; } @@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls) list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } + + /* If we're using a directory, add tossed rsbs to the root + list; they'll have entries created in the new directory, + but no other recovery steps should do anything with them. */ + + if (dlm_no_directory(ls)) { + read_unlock(&ls->ls_rsbtbl[i].lock); + continue; + } + + list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { + list_add(&r->res_root_list, &ls->ls_root_list); + dlm_hold_rsb(r); + } read_unlock(&ls->ls_rsbtbl[i].lock); } out: @@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls) up_write(&ls->ls_root_sem); } +/* If not using a directory, clear the entire toss list, there's no benefit to + caching the master value since it's fixed. If we are using a dir, keep the + rsb's we're the master of. Recovery will add them to the root list and from + there they'll be entered in the rebuilt directory. */ + void dlm_clear_toss_list(struct dlm_ls *ls) { struct dlm_rsb *r, *safe; @@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls) write_lock(&ls->ls_rsbtbl[i].lock); list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, res_hashchain) { - list_del(&r->res_hashchain); - free_rsb(r); + if (dlm_no_directory(ls) || !is_master(r)) { + list_del(&r->res_hashchain); + dlm_free_rsb(r); + } } write_unlock(&ls->ls_rsbtbl[i].lock); } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 4b89e20eebe..997f9531d59 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_astd_resume(); /* - * This list of root rsb's will be the basis of most of the recovery - * routines. + * Free non-master tossed rsb's. Master rsb's are kept on toss + * list and put on root list to be included in resdir recovery. */ - dlm_create_root_list(ls); + dlm_clear_toss_list(ls); /* - * Free all the tossed rsb's so we don't have to recover them. + * This list of root rsb's will be the basis of most of the recovery + * routines. */ - dlm_clear_toss_list(ls); + dlm_create_root_list(ls); /* * Add or remove nodes from the lockspace's ls_nodes list. 
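The recoverd.c hunk above swaps two recovery steps, and the intent is easy to miss in diff form. A condensed sketch of the new ordering in ls_recover(), using only the helpers this patch itself touches; the comments are illustrative, not quoted from the source:

	/* Prune tossed rsbs first: with a resource directory, rsbs this
	 * node masters are deliberately left on the toss list... */
	dlm_clear_toss_list(ls);

	/* ...so that dlm_create_root_list(), which now also walks the
	 * toss lists, can hold them and have them re-entered in the
	 * rebuilt directory. Building the root list first, as the old
	 * code did, would miss them. */
	dlm_create_root_list(ls);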
diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 4f741546f4b..7cbc6826239 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -24,8 +24,7 @@ #include "lvb_table.h" #include "user.h" -static const char *name_prefix="dlm"; -static struct miscdevice ctl_device; +static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; #ifdef CONFIG_COMPAT @@ -82,7 +81,8 @@ struct dlm_lock_result32 { }; static void compat_input(struct dlm_write_request *kb, - struct dlm_write_request32 *kb32) + struct dlm_write_request32 *kb32, + int max_namelen) { kb->version[0] = kb32->version[0]; kb->version[1] = kb32->version[1]; @@ -112,7 +112,11 @@ static void compat_input(struct dlm_write_request *kb, kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); - memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen); + if (kb->i.lock.namelen <= max_namelen) + memcpy(kb->i.lock.name, kb32->i.lock.name, + kb->i.lock.namelen); + else + kb->i.lock.namelen = max_namelen; } } @@ -236,12 +240,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) spin_unlock(&proc->asts_spin); if (eol) { - spin_lock(&ua->proc->locks_spin); + spin_lock(&proc->locks_spin); if (!list_empty(&lkb->lkb_ownqueue)) { list_del_init(&lkb->lkb_ownqueue); dlm_put_lkb(lkb); } - spin_unlock(&ua->proc->locks_spin); + spin_unlock(&proc->locks_spin); } out: mutex_unlock(&ls->ls_clear_proc_locks); @@ -529,7 +533,8 @@ static ssize_t device_write(struct file *file, const char __user *buf, if (proc) set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); - compat_input(kbuf, k32buf); + compat_input(kbuf, k32buf, + count - sizeof(struct dlm_write_request32)); kfree(k32buf); } #endif @@ -896,14 +901,16 @@ static const struct file_operations ctl_device_fops = { .owner = THIS_MODULE, }; +static struct miscdevice ctl_device = { + .name = "dlm-control", + .fops = &ctl_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + int dlm_user_init(void) { int error; - ctl_device.name = "dlm-control"; - ctl_device.fops = &ctl_device_fops; - ctl_device.minor = MISC_DYNAMIC_MINOR; - error = misc_register(&ctl_device); if (error) log_print("misc_register failed for control device"); diff --git a/fs/dlm/util.c b/fs/dlm/util.c index 963889cf674..4d9c1f4e1bd 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -14,6 +14,14 @@ #include "rcom.h" #include "util.h" +#define DLM_ERRNO_EDEADLK 35 +#define DLM_ERRNO_EBADR 53 +#define DLM_ERRNO_EBADSLT 57 +#define DLM_ERRNO_EPROTO 71 +#define DLM_ERRNO_EOPNOTSUPP 95 +#define DLM_ERRNO_ETIMEDOUT 110 +#define DLM_ERRNO_EINPROGRESS 115 + static void header_out(struct dlm_header *hd) { hd->h_version = cpu_to_le32(hd->h_version); @@ -30,11 +38,54 @@ static void header_in(struct dlm_header *hd) hd->h_length = le16_to_cpu(hd->h_length); } -void dlm_message_out(struct dlm_message *ms) +/* higher errno values are inconsistent across architectures, so select + one set of values for on the wire */ + +static int to_dlm_errno(int err) +{ + switch (err) { + case -EDEADLK: + return -DLM_ERRNO_EDEADLK; + case -EBADR: + return -DLM_ERRNO_EBADR; + case -EBADSLT: + return -DLM_ERRNO_EBADSLT; + case -EPROTO: + return -DLM_ERRNO_EPROTO; + case -EOPNOTSUPP: + return -DLM_ERRNO_EOPNOTSUPP; + case -ETIMEDOUT: + return -DLM_ERRNO_ETIMEDOUT; + case -EINPROGRESS: + return -DLM_ERRNO_EINPROGRESS; + } + return err; +} + +static int from_dlm_errno(int err) { - struct dlm_header *hd = (struct dlm_header *) ms; + switch (err) { + case -DLM_ERRNO_EDEADLK: + return -EDEADLK; + case -DLM_ERRNO_EBADR: + return -EBADR; + case -DLM_ERRNO_EBADSLT: + return -EBADSLT; + case -DLM_ERRNO_EPROTO: + return -EPROTO; + case -DLM_ERRNO_EOPNOTSUPP: + return -EOPNOTSUPP; + case -DLM_ERRNO_ETIMEDOUT: + return -ETIMEDOUT; + case -DLM_ERRNO_EINPROGRESS: + return -EINPROGRESS; + } + return err; +} - header_out(hd); +void dlm_message_out(struct dlm_message *ms) +{ + header_out(&ms->m_header); ms->m_type = cpu_to_le32(ms->m_type); ms->m_nodeid = cpu_to_le32(ms->m_nodeid); @@ -53,14 +104,12 @@ void dlm_message_out(struct dlm_message *ms) ms->m_rqmode = cpu_to_le32(ms->m_rqmode); ms->m_bastmode = cpu_to_le32(ms->m_bastmode); ms->m_asts = cpu_to_le32(ms->m_asts); - ms->m_result = cpu_to_le32(ms->m_result); + ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); } void dlm_message_in(struct dlm_message *ms) { - struct dlm_header *hd = (struct dlm_header *) ms; - - header_in(hd); + header_in(&ms->m_header); ms->m_type = le32_to_cpu(ms->m_type); ms->m_nodeid = le32_to_cpu(ms->m_nodeid); @@ -79,7 +128,7 @@ void dlm_message_in(struct dlm_message *ms) ms->m_rqmode = le32_to_cpu(ms->m_rqmode); ms->m_bastmode = le32_to_cpu(ms->m_bastmode); ms->m_asts = le32_to_cpu(ms->m_asts); - ms->m_result = le32_to_cpu(ms->m_result); + ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); } static void rcom_lock_out(struct rcom_lock *rl) @@ -126,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf) void dlm_rcom_out(struct dlm_rcom *rc) { - struct dlm_header *hd = (struct dlm_header *) rc; int type = rc->rc_type; - header_out(hd); + header_out(&rc->rc_header); rc->rc_type = cpu_to_le32(rc->rc_type); rc->rc_result = cpu_to_le32(rc->rc_result); @@ -137,7 +185,7 @@ void dlm_rcom_out(struct dlm_rcom *rc) rc->rc_seq = cpu_to_le64(rc->rc_seq); rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); - if (type == DLM_RCOM_LOCK) + if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) rcom_lock_out((struct rcom_lock *) rc->rc_buf); else if (type == DLM_RCOM_STATUS_REPLY) @@ -146,9 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc) void dlm_rcom_in(struct dlm_rcom *rc) { - struct dlm_header *hd = (struct dlm_header *) rc; + int type; - header_in(hd); + header_in(&rc->rc_header); 
rc->rc_type = le32_to_cpu(rc->rc_type); rc->rc_result = le32_to_cpu(rc->rc_result); @@ -156,10 +204,12 @@ void dlm_rcom_in(struct dlm_rcom *rc) rc->rc_seq = le64_to_cpu(rc->rc_seq); rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); - if (rc->rc_type == DLM_RCOM_LOCK) + type = rc->rc_type; + + if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) rcom_lock_in((struct rcom_lock *) rc->rc_buf); - else if (rc->rc_type == DLM_RCOM_STATUS_REPLY) + else if (type == DLM_RCOM_STATUS_REPLY) rcom_config_in((struct rcom_config *) rc->rc_buf); } diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild index e6189b22914..3c6f0f80e82 100644 --- a/include/asm-x86/Kbuild +++ b/include/asm-x86/Kbuild @@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm header-y += boot.h header-y += bootparam.h header-y += debugreg.h +header-y += kvm.h header-y += ldt.h header-y += msr-index.h header-y += prctl.h diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h new file mode 100644 index 00000000000..7a71120426a --- /dev/null +++ b/include/asm-x86/kvm.h @@ -0,0 +1,191 @@ +#ifndef __LINUX_KVM_X86_H +#define __LINUX_KVM_X86_H + +/* + * KVM x86 specific structures and definitions + * + */ + +#include <asm/types.h> +#include <linux/ioctl.h> + +/* Architectural interrupt line count. */ +#define KVM_NR_INTERRUPTS 256 + +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; + +/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define KVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 
interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 +#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 +#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 + +/* for KVM_SET_CPUID2 */ +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +#endif diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h index 3b0bc4bda5f..4702b04b979 100644 --- a/drivers/kvm/kvm.h +++ b/include/asm-x86/kvm_host.h @@ -1,23 +1,24 @@ -#ifndef __KVM_H -#define __KVM_H - -/* +#/* + * Kernel-based Virtual Machine driver for Linux + * + * This header defines architecture specific interfaces, x86 version + * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. 
+ * */ +#ifndef ASM_KVM_HOST_H +#define ASM_KVM_HOST_H + #include <linux/types.h> -#include <linux/list.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> -#include <linux/signal.h> -#include <linux/sched.h> #include <linux/mm.h> -#include <linux/preempt.h> -#include <asm/signal.h> #include <linux/kvm.h> #include <linux/kvm_para.h> +#include <linux/kvm_types.h> + +#include <asm/desc.h> #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) @@ -37,15 +38,8 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -#define KVM_MAX_VCPUS 4 -#define KVM_ALIAS_SLOTS 4 -#define KVM_MEMORY_SLOTS 8 -#define KVM_NUM_MMU_PAGES 1024 -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 - #define DE_VECTOR 0 +#define UD_VECTOR 6 #define NM_VECTOR 7 #define DF_VECTOR 8 #define TS_VECTOR 10 @@ -59,31 +53,66 @@ #define IOPL_SHIFT 12 -#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_ALIAS_SLOTS 4 -/* - * vcpu->requests bit members - */ -#define KVM_TLB_FLUSH 0 +#define KVM_PERMILLE_MMU_PAGES 20 +#define KVM_MIN_ALLOC_MMU_PAGES 64 +#define KVM_NUM_MMU_PAGES 1024 +#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 40 -/* - * Address types: - * - * gva - guest virtual address - * gpa - guest physical address - * gfn - guest frame number - * hva - host virtual address - * hpa - host physical address - * hfn - host frame number - */ +extern spinlock_t kvm_lock; +extern struct list_head vm_list; + +struct kvm_vcpu; +struct kvm; + +enum { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, + VCPU_REGS_RDX = 2, + VCPU_REGS_RBX = 3, + VCPU_REGS_RSP = 4, + VCPU_REGS_RBP = 5, + VCPU_REGS_RSI = 6, + VCPU_REGS_RDI = 7, +#ifdef CONFIG_X86_64 + VCPU_REGS_R8 = 8, + VCPU_REGS_R9 = 9, + VCPU_REGS_R10 = 10, + VCPU_REGS_R11 = 11, + VCPU_REGS_R12 = 12, + VCPU_REGS_R13 = 13, + VCPU_REGS_R14 = 14, + VCPU_REGS_R15 = 15, +#endif + NR_VCPU_REGS +}; + +enum { + VCPU_SREG_CS, + VCPU_SREG_DS, + VCPU_SREG_ES, + VCPU_SREG_FS, + VCPU_SREG_GS, + VCPU_SREG_SS, + VCPU_SREG_TR, + VCPU_SREG_LDTR, +}; -typedef unsigned long gva_t; -typedef u64 gpa_t; -typedef unsigned long gfn_t; +#include <asm/kvm_x86_emulate.h> -typedef unsigned long hva_t; -typedef u64 hpa_t; -typedef unsigned long hfn_t; +#define KVM_NR_MEM_OBJS 40 + +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. + */ +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; #define NR_PTE_CHAIN_ENTRIES 5 @@ -99,7 +128,7 @@ struct kvm_pte_chain { * bits 4:7 - page table level for this shadow (1-4) * bits 8:9 - page table quadrant for 2-level guests * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) - * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde + * bits 17:19 - common access permissions for all ptes in this shadow page */ union kvm_mmu_page_role { unsigned word; @@ -109,7 +138,7 @@ union kvm_mmu_page_role { unsigned quadrant : 2; unsigned pad_for_nice_hex_output : 6; unsigned metaphysical : 1; - unsigned hugepage_access : 3; + unsigned access : 3; }; }; @@ -125,6 +154,8 @@ struct kvm_mmu_page { union kvm_mmu_page_role role; u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. 
*/ @@ -136,9 +167,6 @@ struct kvm_mmu_page { }; }; -struct kvm_vcpu; -extern struct kmem_cache *kvm_vcpu_cache; - /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu @@ -149,6 +177,8 @@ struct kvm_mmu { int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + void (*prefetch_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -156,159 +186,9 @@ struct kvm_mmu { u64 *pae_root; }; -#define KVM_NR_MEM_OBJS 20 - -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - -enum { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, -#ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, -#endif - NR_VCPU_REGS -}; - -enum { - VCPU_SREG_CS, - VCPU_SREG_DS, - VCPU_SREG_ES, - VCPU_SREG_FS, - VCPU_SREG_GS, - VCPU_SREG_SS, - VCPU_SREG_TR, - VCPU_SREG_LDTR, -}; - -struct kvm_pio_request { - unsigned long count; - int cur_count; - struct page *guest_pages[2]; - unsigned guest_page_offset; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - -struct kvm_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 halt_exits; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 light_exits; - u32 efer_reload; -}; - -struct kvm_io_device { - void (*read)(struct kvm_io_device *this, - gpa_t addr, - int len, - void *val); - void (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - int (*in_range)(struct kvm_io_device *this, gpa_t addr); - void (*destructor)(struct kvm_io_device *this); - - void *private; -}; - -static inline void kvm_iodevice_read(struct kvm_io_device *dev, - gpa_t addr, - int len, - void *val) -{ - dev->read(dev, addr, len, val); -} - -static inline void kvm_iodevice_write(struct kvm_io_device *dev, - gpa_t addr, - int len, - const void *val) -{ - dev->write(dev, addr, len, val); -} - -static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) -{ - return dev->in_range(dev, addr); -} - -static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) -{ - if (dev->destructor) - dev->destructor(dev); -} - -/* - * It would be nice to use something smarter than a linear search, TBD... - * Thankfully we dont expect many devices to register (famous last words :), - * so until then it will suffice. At least its abstracted so we can change - * in one place. 
- */ -struct kvm_io_bus { - int dev_count; -#define NR_IOBUS_DEVS 6 - struct kvm_io_device *devs[NR_IOBUS_DEVS]; -}; - -void kvm_io_bus_init(struct kvm_io_bus *bus); -void kvm_io_bus_destroy(struct kvm_io_bus *bus); -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); - -struct kvm_vcpu { - struct kvm *kvm; - struct preempt_notifier preempt_notifier; - int vcpu_id; - struct mutex mutex; - int cpu; +struct kvm_vcpu_arch { u64 host_tsc; - struct kvm_run *run; int interrupt_window_open; - int guest_mode; - unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ @@ -317,9 +197,6 @@ struct kvm_vcpu { unsigned long cr0; unsigned long cr2; unsigned long cr3; - gpa_t para_state_gpa; - struct page *para_state_page; - gpa_t hypercall_gpa; unsigned long cr4; unsigned long cr8; u64 pdptrs[4]; /* pae */ @@ -334,6 +211,7 @@ struct kvm_vcpu { int mp_state; int sipi_vector; u64 ia32_misc_enable_msr; + bool tpr_access_reporting; struct kvm_mmu mmu; @@ -344,29 +222,26 @@ struct kvm_vcpu { gfn_t last_pt_write_gfn; int last_pt_write_count; + u64 *last_pte_updated; - struct kvm_guest_debug guest_debug; + struct { + gfn_t gfn; /* presumed gfn during guest pte update */ + struct page *page; /* page corresponding to that gfn */ + } update_pte; struct i387_fxsave_struct host_fx_image; struct i387_fxsave_struct guest_fx_image; - int fpu_active; - int guest_fpu_loaded; - - int mmio_needed; - int mmio_read_completed; - int mmio_is_write; - int mmio_size; - unsigned char mmio_data[8]; - gpa_t mmio_phys_addr; + gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; - wait_queue_head_t wq; - int sigset_active; - sigset_t sigset; - - struct kvm_stat stat; + struct kvm_queued_exception { + bool pending; + bool has_error_code; + u8 nr; + u32 error_code; + } exception; struct { int active; @@ -381,7 +256,10 @@ struct kvm_vcpu { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + /* emulate context */ + + struct x86_emulate_ctxt emulate_ctxt; }; struct kvm_mem_alias { @@ -390,51 +268,58 @@ struct kvm_mem_alias { gfn_t target_gfn; }; -struct kvm_memory_slot { - gfn_t base_gfn; - unsigned long npages; - unsigned long flags; - struct page **phys_mem; - unsigned long *dirty_bitmap; -}; - -struct kvm { - struct mutex lock; /* protects everything except vcpus */ +struct kvm_arch{ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int nmemslots; - struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; + + unsigned int n_free_mmu_pages; + unsigned int n_requested_mmu_pages; + unsigned int n_alloc_mmu_pages; + struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. 
*/ struct list_head active_mmu_pages; - int n_free_mmu_pages; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; - unsigned long rmap_overflow; - struct list_head vm_list; - struct file *filp; - struct kvm_io_bus mmio_bus; - struct kvm_io_bus pio_bus; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; + int round_robin_prev_vcpu; + unsigned int tss_addr; + struct page *apic_access_page; }; -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->vpic; -} +struct kvm_vm_stat { + u32 mmu_shadow_zapped; + u32 mmu_pte_write; + u32 mmu_pte_updated; + u32 mmu_pde_zapped; + u32 mmu_flooded; + u32 mmu_recycled; + u32 mmu_cache_miss; + u32 remote_tlb_flush; +}; -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->vioapic; -} +struct kvm_vcpu_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return pic_irqchip(kvm) != 0; -} + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_window_exits; + u32 halt_exits; + u32 halt_wakeup; + u32 request_irq_exits; + u32 irq_exits; + u32 host_state_reload; + u32 efer_reload; + u32 fpu_reload; + u32 insn_emulation; + u32 insn_emulation_fail; +}; struct descriptor_table { u16 limit; @@ -449,11 +334,12 @@ struct kvm_x86_ops { void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ + bool (*cpu_has_accelerated_tpr)(void); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - void (*vcpu_reset)(struct kvm_vcpu *vcpu); + int (*vcpu_reset)(struct kvm_vcpu *vcpu); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); @@ -489,10 +375,6 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu); - void (*inject_page_fault)(struct kvm_vcpu *vcpu, - unsigned long addr, u32 err_code); - - void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -501,54 +383,31 @@ struct kvm_x86_ops { unsigned char *hypercall_addr); int (*get_irq)(struct kvm_vcpu *vcpu); void (*set_irq)(struct kvm_vcpu *vcpu, int vec); + void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code); + bool (*exception_injected)(struct kvm_vcpu *vcpu); void (*inject_pending_irq)(struct kvm_vcpu *vcpu); void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, struct kvm_run *run); + + int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); }; extern struct kvm_x86_ops *kvm_x86_ops; -/* The guest did something we don't support. */ -#define pr_unimpl(vcpu, fmt, ...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ - current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ - } while(0) - -#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) -#define vcpu_printf(vcpu, fmt...) 
kvm_printf(vcpu->kvm, fmt) - -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); - -int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, - struct module *module); -void kvm_exit_x86(void); - int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); +void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); - -hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); -#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) -#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) -static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } -hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); - -extern hpa_t bad_page_address; - -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -556,8 +415,10 @@ enum emulation_result { EMULATE_FAIL, /* can't emulate this instruction */ }; +#define EMULTYPE_NO_DECODE (1 << 0) +#define EMULTYPE_TRAP_UD (1 << 1) int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long cr2, u16 error_code); + unsigned long cr2, u16 error_code, int emulation_type); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); @@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port); int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned long count, int down, @@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest); int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); @@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); -void fx_init(struct kvm_vcpu *vcpu); +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, + u32 error_code); -void kvm_resched(struct kvm_vcpu *vcpu); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_flush_remote_tlbs(struct kvm *kvm); +void fx_init(struct kvm_vcpu *vcpu); int 
emulator_read_std(unsigned long addr, - void *val, + void *val, unsigned int bytes, struct kvm_vcpu *vcpu); int emulator_write_emulated(unsigned long addr, @@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr, unsigned long segment_base(u16 selector); +void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); @@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); -int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -static inline void kvm_guest_enter(void) -{ - current->flags |= PF_VCPU; -} +int kvm_fix_hypercall(struct kvm_vcpu *vcpu); -static inline void kvm_guest_exit(void) -{ - current->flags &= ~PF_VCPU; -} +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); -static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code) -{ - return vcpu->mmu.page_fault(vcpu, gva, error_code); -} - -static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - __kvm_mmu_free_some_pages(vcpu); -} - -static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) -{ - if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) - return 0; - - return kvm_mmu_load(vcpu); -} - -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->shadow_efer & EFER_LME; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->cr0 & X86_CR0_PG; -} - -static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - return slot - kvm->memslots; -} +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); +int complete_pio(struct kvm_vcpu *vcpu); static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { @@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) static inline u16 read_fs(void) { u16 seg; - asm ("mov %%fs, %0" : "=g"(seg)); + asm("mov %%fs, %0" : "=g"(seg)); return seg; } static inline u16 read_gs(void) { u16 seg; - asm ("mov %%gs, %0" : "=g"(seg)); + asm("mov %%gs, %0" : "=g"(seg)); return seg; } static inline u16 read_ldt(void) { u16 ldt; - asm ("sldt %0" : "=g"(ldt)); + asm("sldt %0" : "=g"(ldt)); return ldt; } static inline void load_fs(u16 sel) { - asm ("mov %0, %%fs" : : "rm"(sel)); + asm("mov %0, %%fs" : : "rm"(sel)); } static inline void load_gs(u16 sel) { - asm ("mov %0, %%gs" : : "rm"(sel)); + asm("mov %0, %%gs" : : "rm"(sel)); } #ifndef load_ldt static inline void load_ldt(u16 sel) { - asm ("lldt %0" : : "rm"(sel)); + asm("lldt %0" : : "rm"(sel)); } #endif static inline void get_idt(struct descriptor_table *table) { - asm ("sidt %0" : "=m"(*table)); + asm("sidt %0" : "=m"(*table)); } static inline void get_gdt(struct descriptor_table *table) { - asm ("sgdt %0" : "=m"(*table)); + asm("sgdt %0" : "=m"(*table)); } static inline unsigned long read_tr_base(void) { u16 tr; - asm ("str %0" : "=g"(tr)); + asm("str %0" : "=g"(tr)); return segment_base(tr); } @@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr) static inline void fx_save(struct 
i387_fxsave_struct *image) { - asm ("fxsave (%0)":: "r" (image)); + asm("fxsave (%0)":: "r" (image)); } static inline void fx_restore(struct i387_fxsave_struct *image) { - asm ("fxrstor (%0)":: "r" (image)); + asm("fxrstor (%0)":: "r" (image)); } static inline void fpu_init(void) { - asm ("finit"); + asm("finit"); } static inline u32 get_rdx_init_val(void) @@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void) return 0x600; /* P6 family */ } +static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) +{ + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); +} + #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h new file mode 100644 index 00000000000..c6f3fd8d8c5 --- /dev/null +++ b/include/asm-x86/kvm_para.h @@ -0,0 +1,105 @@ +#ifndef __X86_KVM_PARA_H +#define __X86_KVM_PARA_H + +/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It + * should be used to determine that a VM is running under KVM. + */ +#define KVM_CPUID_SIGNATURE 0x40000000 + +/* This CPUID returns a feature bitmap in eax. Before enabling a particular + * paravirtualization, the appropriate feature bit should be checked. + */ +#define KVM_CPUID_FEATURES 0x40000001 + +#ifdef __KERNEL__ +#include <asm/processor.h> + +/* This instruction is vmcall. On non-VT architectures, it will generate a + * trap that we will then rewrite to the appropriate instruction. + */ +#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" + +/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun + * instruction. The hypervisor may replace it with something else but only the + * instructions are guaranteed to be supported. + * + * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. + * The hypercall number should be placed in rax and the return value will be + * placed in rax. No other registers will be clobbered unless explicited + * noted by the particular hypercall. 
+ */ + +static inline long kvm_hypercall0(unsigned int nr) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr)); + return ret; +} + +static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1)); + return ret; +} + +static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, + unsigned long p2) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2)); + return ret; +} + +static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3)); + return ret; +} + +static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3, + unsigned long p4) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)); + return ret; +} + +static inline int kvm_para_available(void) +{ + unsigned int eax, ebx, ecx, edx; + char signature[13]; + + cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + + if (strcmp(signature, "KVMKVMKVM") == 0) + return 1; + + return 0; +} + +static inline unsigned int kvm_arch_para_features(void) +{ + return cpuid_eax(KVM_CPUID_FEATURES); +} + +#endif + +#endif diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h index 92c73aa7f9a..7db91b9bdcd 100644 --- a/drivers/kvm/x86_emulate.h +++ b/include/asm-x86/kvm_x86_emulate.h @@ -63,17 +63,6 @@ struct x86_emulate_ops { unsigned int bytes, struct kvm_vcpu *vcpu); /* - * write_std: Write bytes of standard (non-emulated/special) memory. - * Used for stack operations, and others. - * @addr: [IN ] Linear address to which to write. - * @val: [IN ] Value to write to memory (low-order bytes used as - * required). - * @bytes: [IN ] Number of bytes to write to memory. - */ - int (*write_std)(unsigned long addr, const void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); - - /* * read_emulated: Read bytes from emulated/special memory area. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. @@ -112,13 +101,50 @@ struct x86_emulate_ops { }; +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +struct fetch_cache { + u8 data[15]; + unsigned long start; + unsigned long end; +}; + +struct decode_cache { + u8 twobyte; + u8 b; + u8 lock_prefix; + u8 rep_prefix; + u8 op_bytes; + u8 ad_bytes; + u8 rex_prefix; + struct operand src; + struct operand dst; + unsigned long *override_base; + unsigned int d; + unsigned long regs[NR_VCPU_REGS]; + unsigned long eip; + /* modrm */ + u8 modrm; + u8 modrm_mod; + u8 modrm_reg; + u8 modrm_rm; + u8 use_modrm_ea; + unsigned long modrm_ea; + unsigned long modrm_val; + struct fetch_cache fetch; +}; + struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; /* Linear faulting address (if emulating a page-faulting instruction). */ unsigned long eflags; - unsigned long cr2; /* Emulated execution mode, represented by an X86EMUL_MODE value. 
*/ int mode; @@ -129,8 +155,16 @@ struct x86_emulate_ctxt { unsigned long ss_base; unsigned long gs_base; unsigned long fs_base; + + /* decode cache */ + + struct decode_cache decode; }; +/* Repeat String Operation Prefix */ +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 + /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ @@ -144,12 +178,9 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -/* - * x86_emulate_memop: Emulate an instruction that faulted attempting to - * read/write a 'special' memory area. - * Returns -1 on failure, 0 on success. - */ -int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); #endif /* __X86_EMULATE_H__ */ diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h index 1c8367a692f..4d9367b7297 100644 --- a/include/asm-x86/lguest.h +++ b/include/asm-x86/lguest.h @@ -56,7 +56,7 @@ struct lguest_ro_state struct desc_struct guest_gdt[GDT_ENTRIES]; }; -struct lguest_arch +struct lg_cpu_arch { /* The GDT entries copied into lguest_ro_state when running. */ struct desc_struct gdt[GDT_ENTRIES]; diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h index 2091779e91f..758b9a5d453 100644 --- a/include/asm-x86/lguest_hcall.h +++ b/include/asm-x86/lguest_hcall.h @@ -4,7 +4,7 @@ #define LHCALL_FLUSH_ASYNC 0 #define LHCALL_LGUEST_INIT 1 -#define LHCALL_CRASH 2 +#define LHCALL_SHUTDOWN 2 #define LHCALL_LOAD_GDT 3 #define LHCALL_NEW_PGTABLE 4 #define LHCALL_FLUSH_TLB 5 @@ -20,6 +20,10 @@ #define LGUEST_TRAP_ENTRY 0x1F +/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ +#define LGUEST_SHUTDOWN_POWEROFF 1 +#define LGUEST_SHUTDOWN_RESTART 2 + #ifndef __ASSEMBLY__ #include <asm/hw_irq.h> diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 27b9350052b..85b2482cc73 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -100,7 +100,6 @@ header-y += iso_fs.h header-y += ixjuser.h header-y += jffs2.h header-y += keyctl.h -header-y += kvm.h header-y += limits.h header-y += lock_dlm_plock.h header-y += magic.h @@ -256,6 +255,7 @@ unifdef-y += kd.h unifdef-y += kernelcapi.h unifdef-y += kernel.h unifdef-y += keyboard.h +unifdef-$(CONFIG_HAVE_KVM) += kvm.h unifdef-y += llc.h unifdef-y += loop.h unifdef-y += lp.h diff --git a/include/linux/audit.h b/include/linux/audit.h index c6878169283..bdd6f5de5fc 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -115,6 +115,8 @@ #define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */ #define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */ #define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ +#define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */ +#define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/include/linux/device.h b/include/linux/device.h index 1880208964d..db375be333c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -84,6 +84,9 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, struct device *bus_find_device(struct bus_type *bus, struct device *start, void *data, int (*match)(struct device *dev, void *data)); +struct device *bus_find_device_by_name(struct bus_type *bus, + struct device 
*start, + const char *name); int __must_check bus_for_each_drv(struct bus_type *bus, struct device_driver *start, void *data, diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 057a7f34ee3..4de4fd2d860 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -9,12 +9,10 @@ #include <asm/types.h> #include <linux/ioctl.h> +#include <asm/kvm.h> #define KVM_API_VERSION 12 -/* Architectural interrupt line count. */ -#define KVM_NR_INTERRUPTS 256 - /* for KVM_CREATE_MEMORY_REGION */ struct kvm_memory_region { __u32 slot; @@ -23,17 +21,19 @@ struct kvm_memory_region { __u64 memory_size; /* bytes */ }; -/* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES 1UL - -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ +/* for KVM_SET_USER_MEMORY_REGION */ +struct kvm_userspace_memory_region { + __u32 slot; __u32 flags; __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL + + /* for KVM_IRQ_LINE */ struct kvm_irq_level { /* @@ -45,62 +45,18 @@ struct kvm_irq_level { __u32 level; }; -/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ -struct kvm_pic_state { - __u8 last_irr; /* edge detection */ - __u8 irr; /* interrupt request register */ - __u8 imr; /* interrupt mask register */ - __u8 isr; /* interrupt service register */ - __u8 priority_add; /* highest irq priority */ - __u8 irq_base; - __u8 read_reg_select; - __u8 poll; - __u8 special_mask; - __u8 init_state; - __u8 auto_eoi; - __u8 rotate_on_auto_eoi; - __u8 special_fully_nested_mode; - __u8 init4; /* true if 4 byte init */ - __u8 elcr; /* PIIX edge/trigger selection */ - __u8 elcr_mask; -}; - -#define KVM_IOAPIC_NUM_PINS 24 -struct kvm_ioapic_state { - __u64 base_address; - __u32 ioregsel; - __u32 id; - __u32 irr; - __u32 pad; - union { - __u64 bits; - struct { - __u8 vector; - __u8 delivery_mode:3; - __u8 dest_mode:1; - __u8 delivery_status:1; - __u8 polarity:1; - __u8 remote_irr:1; - __u8 trig_mode:1; - __u8 mask:1; - __u8 reserve:7; - __u8 reserved[4]; - __u8 dest_id; - } fields; - } redirtbl[KVM_IOAPIC_NUM_PINS]; -}; - -#define KVM_IRQCHIP_PIC_MASTER 0 -#define KVM_IRQCHIP_PIC_SLAVE 1 -#define KVM_IRQCHIP_IOAPIC 2 struct kvm_irqchip { __u32 chip_id; __u32 pad; union { char dummy[512]; /* reserving space */ +#ifdef CONFIG_X86 struct kvm_pic_state pic; +#endif +#if defined(CONFIG_X86) || defined(CONFIG_IA64) struct kvm_ioapic_state ioapic; +#endif } chip; }; @@ -116,6 +72,7 @@ struct kvm_irqchip { #define KVM_EXIT_FAIL_ENTRY 9 #define KVM_EXIT_INTR 10 #define KVM_EXIT_SET_TPR 11 +#define KVM_EXIT_TPR_ACCESS 12 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { @@ -174,90 +131,17 @@ struct kvm_run { __u32 longmode; __u32 pad; } hypercall; + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; /* Fix the size of the union. 
*/ char padding[256]; }; }; -/* for KVM_GET_REGS and KVM_SET_REGS */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - -/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -struct kvm_segment { - __u64 base; - __u32 limit; - __u16 selector; - __u8 type; - __u8 present, dpl, db, s, l, g, avl; - __u8 unusable; - __u8 padding; -}; - -struct kvm_dtable { - __u64 base; - __u16 limit; - __u16 padding[3]; -}; - -/* for KVM_GET_SREGS and KVM_SET_SREGS */ -struct kvm_sregs { - /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -/* for KVM_GET_MSRS and KVM_SET_MSRS */ -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -/* for KVM_GET_MSR_INDEX_LIST */ -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - /* for KVM_TRANSLATE */ struct kvm_translation { /* in */ @@ -302,28 +186,24 @@ struct kvm_dirty_log { }; }; -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - /* for KVM_SET_SIGNAL_MASK */ struct kvm_signal_mask { __u32 len; __u8 sigset[0]; }; +/* for KVM_TPR_ACCESS_REPORTING */ +struct kvm_tpr_access_ctl { + __u32 enabled; + __u32 flags; + __u32 reserved[8]; +}; + +/* for KVM_SET_VAPIC_ADDR */ +struct kvm_vapic_addr { + __u64 vapic_addr; +}; + #define KVMIO 0xAE /* @@ -347,11 +227,21 @@ struct kvm_signal_mask { */ #define KVM_CAP_IRQCHIP 0 #define KVM_CAP_HLT 1 +#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 +#define KVM_CAP_USER_MEMORY 3 +#define KVM_CAP_SET_TSS_ADDR 4 +#define KVM_CAP_EXT_CPUID 5 +#define KVM_CAP_VAPIC 6 /* * ioctls for VM fds */ #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) +#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) +#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) +#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ + struct kvm_userspace_memory_region) +#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. 
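To make the reworked userspace interface concrete, a minimal launcher sketch follows. This is an illustration only: error handling is omitted, and KVM_GET_API_VERSION and KVM_CREATE_VM belong to the pre-existing /dev/kvm interface rather than to this patch; only kvm_userspace_memory_region, KVM_SET_USER_MEMORY_REGION and KVM_CREATE_VCPU come from the header changes above.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_userspace_memory_region region;
	int kvm, vm, vcpu;
	void *mem;

	kvm = open("/dev/kvm", O_RDWR);
	if (ioctl(kvm, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return 1;
	vm = ioctl(kvm, KVM_CREATE_VM, 0);

	/* Guest RAM is now plain anonymous memory owned by userspace;
	 * the slot describes where it sits in guest physical space. */
	mem = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(&region, 0, sizeof(region));
	region.slot = 0;
	region.guest_phys_addr = 0;
	region.memory_size = 1 << 20;
	region.userspace_addr = (unsigned long)mem;
	ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);

	/* vcpu slot 0; the returned fd is then mmap()ed to reach the
	 * shared struct kvm_run before issuing KVM_RUN */
	vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	return vcpu < 0 ? 1 : 0;
}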
@@ -359,6 +249,7 @@ struct kvm_signal_mask { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) @@ -384,5 +275,11 @@ struct kvm_signal_mask { #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) #define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) #define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) +#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) +#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) +/* Available with KVM_CAP_VAPIC */ +#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) +/* Available with KVM_CAP_VAPIC */ +#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h new file mode 100644 index 00000000000..ea4764b0a2f --- /dev/null +++ b/include/linux/kvm_host.h @@ -0,0 +1,299 @@ +#ifndef __KVM_HOST_H +#define __KVM_HOST_H + +/* + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/types.h> +#include <linux/hardirq.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/preempt.h> +#include <asm/signal.h> + +#include <linux/kvm.h> +#include <linux/kvm_para.h> + +#include <linux/kvm_types.h> + +#include <asm/kvm_host.h> + +#define KVM_MAX_VCPUS 4 +#define KVM_MEMORY_SLOTS 8 +/* memory slots that are not exposed to userspace */ +#define KVM_PRIVATE_MEM_SLOTS 4 + +#define KVM_PIO_PAGE_OFFSET 1 + +/* + * vcpu->requests bit members + */ +#define KVM_REQ_TLB_FLUSH 0 +#define KVM_REQ_MIGRATE_TIMER 1 +#define KVM_REQ_REPORT_TPR_ACCESS 2 + +struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; + +struct kvm_guest_debug { + int enabled; + unsigned long bp[4]; + int singlestep; +}; + +/* + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we don't expect many devices to register (famous last words :), + * so until then it will suffice. At least it's abstracted so we can change + * in one place.
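+ *
+ * For illustration only (this sketch is not part of the patch), the
+ * linear search amounts to something like the following; it assumes
+ * that struct kvm_io_device, which is only forward-declared in this
+ * header, supplies an in_range() callback:
+ *
+ *	struct kvm_io_device *find(struct kvm_io_bus *bus, gpa_t addr)
+ *	{
+ *		int i;
+ *
+ *		for (i = 0; i < bus->dev_count; i++)
+ *			if (bus->devs[i]->in_range(bus->devs[i], addr))
+ *				return bus->devs[i];
+ *		return NULL;
+ *	}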
+ */ +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 6 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; +}; + +void kvm_io_bus_init(struct kvm_io_bus *bus); +void kvm_io_bus_destroy(struct kvm_io_bus *bus); +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); + +struct kvm_vcpu { + struct kvm *kvm; + struct preempt_notifier preempt_notifier; + int vcpu_id; + struct mutex mutex; + int cpu; + struct kvm_run *run; + int guest_mode; + unsigned long requests; + struct kvm_guest_debug guest_debug; + int fpu_active; + int guest_fpu_loaded; + wait_queue_head_t wq; + int sigset_active; + sigset_t sigset; + struct kvm_vcpu_stat stat; + +#ifdef CONFIG_HAS_IOMEM + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char mmio_data[8]; + gpa_t mmio_phys_addr; +#endif + + struct kvm_vcpu_arch arch; +}; + +struct kvm_memory_slot { + gfn_t base_gfn; + unsigned long npages; + unsigned long flags; + unsigned long *rmap; + unsigned long *dirty_bitmap; + unsigned long userspace_addr; + int user_alloc; +}; + +struct kvm { + struct mutex lock; /* protects the vcpus array and APIC accesses */ + spinlock_t mmu_lock; + struct mm_struct *mm; /* userspace tied to this vm */ + int nmemslots; + struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + + KVM_PRIVATE_MEM_SLOTS]; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + struct list_head vm_list; + struct file *filp; + struct kvm_io_bus mmio_bus; + struct kvm_io_bus pio_bus; + struct kvm_vm_stat stat; + struct kvm_arch arch; +}; + +/* The guest did something we don't support. */ +#define pr_unimpl(vcpu, fmt, ...) \ + do { \ + if (printk_ratelimit()) \ + printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ + current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ + } while (0) + +#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#define vcpu_printf(vcpu, fmt...) 
kvm_printf(vcpu->kvm, fmt) + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +void vcpu_load(struct kvm_vcpu *vcpu); +void vcpu_put(struct kvm_vcpu *vcpu); + +void decache_vcpus_on_cpu(int cpu); + + +int kvm_init(void *opaque, unsigned int vcpu_size, + struct module *module); +void kvm_exit(void); + +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); + +extern struct page *bad_page; + +int is_error_page(struct page *page); +int kvm_is_error_hva(unsigned long addr); +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc); +int kvm_arch_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc); +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +void kvm_release_page_clean(struct page *page); +void kvm_release_page_dirty(struct page *page); +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len); +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len); +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len); +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len); +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); + +void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_resched(struct kvm_vcpu *vcpu); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); + +int kvm_dev_ioctl_check_extension(long ext); + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty); +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc); +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); +void kvm_arch_destroy_vm(struct kvm *kvm); + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr); + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu 
*vcpu, + struct kvm_sregs *sregs); +int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg); +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); + +int kvm_arch_init(void *opaque); +void kvm_arch_exit(void); + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); + +int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_arch_hardware_enable(void *garbage); +void kvm_arch_hardware_disable(void *garbage); +int kvm_arch_hardware_setup(void); +void kvm_arch_hardware_unsetup(void); +void kvm_arch_check_processor_compat(void *rtn); +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); + +void kvm_free_physmem(struct kvm *kvm); + +struct kvm *kvm_arch_create_vm(void); +void kvm_arch_destroy_vm(struct kvm *kvm); + +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +int kvm_cpu_has_interrupt(struct kvm_vcpu *v); +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +static inline void kvm_guest_enter(void) +{ + account_system_vtime(current); + current->flags |= PF_VCPU; +} + +static inline void kvm_guest_exit(void) +{ + account_system_vtime(current); + current->flags &= ~PF_VCPU; +} + +static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + return slot - kvm->memslots; +} + +static inline gpa_t gfn_to_gpa(gfn_t gfn) +{ + return (gpa_t)gfn << PAGE_SHIFT; +} + +static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +{ + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); +} + +enum kvm_stat_kind { + KVM_STAT_VM, + KVM_STAT_VCPU, +}; + +struct kvm_stats_debugfs_item { + const char *name; + int offset; + enum kvm_stat_kind kind; + struct dentry *dentry; +}; +extern struct kvm_stats_debugfs_item debugfs_entries[]; + +#endif diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index 3b292565a69..5497aac0d2f 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h @@ -2,72 +2,30 @@ #define __LINUX_KVM_PARA_H /* - * Guest OS interface for KVM paravirtualization - * - * Note: this interface is totally experimental, and is certain to change - * as we make progress. + * This header file provides a method for making a hypercall to the host + * Architectures should define: + * - kvm_hypercall0, kvm_hypercall1... + * - kvm_arch_para_features + * - kvm_para_available */ -/* - * Per-VCPU descriptor area shared between guest and host. Writable to - * both guest and host. Registered with the host by the guest when - * a guest acknowledges paravirtual mode. - * - * NOTE: all addresses are guest-physical addresses (gpa), to make it - * easier for the hypervisor to map between the various addresses. - */ -struct kvm_vcpu_para_state { - /* - * API version information for compatibility. If there's any support - * mismatch (too old host trying to execute too new guest) then - * the host will deny entry into paravirtual mode. Any other - * combination (new host + old guest and new host + new guest) - * is supposed to work - new host versions will support all old - * guest API versions. 
- */ - u32 guest_version; - u32 host_version; - u32 size; - u32 ret; - - /* - * The address of the vm exit instruction (VMCALL or VMMCALL), - * which the host will patch according to the CPU model the - * VM runs on: - */ - u64 hypercall_gpa; - -} __attribute__ ((aligned(PAGE_SIZE))); - -#define KVM_PARA_API_VERSION 1 - -/* - * This is used for an RDMSR's ECX parameter to probe for a KVM host. - * Hopefully no CPU vendor will use up this number. This is placed well - * out of way of the typical space occupied by CPU vendors' MSR indices, - * and we think (or at least hope) it wont be occupied in the future - * either. - */ -#define MSR_KVM_API_MAGIC 0x87655678 +/* Return values for hypercalls */ +#define KVM_ENOSYS 1000 -#define KVM_EINVAL 1 +#define KVM_HC_VAPIC_POLL_IRQ 1 /* - * Hypercall calling convention: - * - * Each hypercall may have 0-6 parameters. - * - * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1 - * - * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention - * order: RDI, RSI, RDX, RCX, R8, R9. - * - * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP. - * (the first 3 are according to the gcc regparm calling convention) - * - * No registers are clobbered by the hypercall, except that the - * return value is in RAX. + * hypercalls use architecture specific conventions */ -#define __NR_hypercalls 0 +#include <asm/kvm_para.h> + +#ifdef __KERNEL__ +static inline int kvm_para_has_feature(unsigned int feature) +{ + if (kvm_arch_para_features() & (1UL << feature)) + return 1; + return 0; +} +#endif /* __KERNEL__ */ +#endif /* __LINUX_KVM_PARA_H */ -#endif diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h new file mode 100644 index 00000000000..1c4e46decb2 --- /dev/null +++ b/include/linux/kvm_types.h @@ -0,0 +1,54 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * + */ +#ifndef __KVM_TYPES_H__ +#define __KVM_TYPES_H__ + +#include <asm/types.h> + +/* + * Address types: + * + * gva - guest virtual address + * gpa - guest physical address + * gfn - guest frame number + * hva - host virtual address + * hpa - host physical address + * hfn - host frame number + */ + +typedef unsigned long gva_t; +typedef u64 gpa_t; +typedef unsigned long gfn_t; + +typedef unsigned long hva_t; +typedef u64 hpa_t; +typedef unsigned long hfn_t; + +struct kvm_pio_request { + unsigned long count; + int cur_count; + struct page *guest_pages[2]; + unsigned guest_page_offset; + int in; + int port; + int size; + int string; + int down; + int rep; +}; + +#endif /* __KVM_TYPES_H__ */ diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 6080f73fc85..8c2cc4c0252 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -120,16 +120,35 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid); int selinux_string_to_sid(char *str, u32 *sid); /** - * selinux_relabel_packet_permission - check permission to relabel a packet - * @sid: ID value to be applied to network packet (via SECMARK, most likely) + * selinux_secmark_relabel_packet_permission - secmark permission check + * @sid: SECMARK ID value to be applied to network packet * - * Returns 0 if the current task is allowed to label packets with the - * supplied security ID. Note that it is implicit that the packet is always - * being relabeled from the default unlabled value, and that the access - * control decision is made in the AVC. + * Returns 0 if the current task is allowed to set the SECMARK label of + * packets with the supplied security ID. Note that it is implicit that + * the packet is always being relabeled from the default unlabeled value, + * and that the access control decision is made in the AVC. */ -int selinux_relabel_packet_permission(u32 sid); +int selinux_secmark_relabel_packet_permission(u32 sid); +/** + * selinux_secmark_refcount_inc - increments the secmark use counter + * + * SELinux keeps track of the current SECMARK targets in use so it knows + * when to apply SECMARK label access checks to network packets. This + * function increments this reference count to indicate that a new SECMARK + * target has been configured. + */ +void selinux_secmark_refcount_inc(void); + +/** + * selinux_secmark_refcount_dec - decrements the secmark use counter + * + * SELinux keeps track of the current SECMARK targets in use so it knows + * when to apply SECMARK label access checks to network packets. This + * function decrements this reference count to indicate that one of the + * existing SECMARK targets has been removed/flushed.
+ */ +void selinux_secmark_refcount_dec(void); #else static inline int selinux_audit_rule_init(u32 field, u32 op, @@ -184,11 +203,21 @@ static inline int selinux_string_to_sid(const char *str, u32 *sid) return 0; } -static inline int selinux_relabel_packet_permission(u32 sid) +static inline int selinux_secmark_relabel_packet_permission(u32 sid) { return 0; } +static inline void selinux_secmark_refcount_inc(void) +{ + return; +} + +static inline void selinux_secmark_refcount_dec(void) +{ + return; +} + #endif /* CONFIG_SECURITY_SELINUX */ #endif /* _LINUX_SELINUX_H */ diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 2e5b2f6f9fa..b3213c7c530 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -67,7 +67,11 @@ * NetLabel NETLINK protocol */ -#define NETLBL_PROTO_VERSION 1 +/* NetLabel NETLINK protocol version + * 1: initial version + * 2: added static labels for unlabeled connections + */ +#define NETLBL_PROTO_VERSION 2 /* NetLabel NETLINK types/families */ #define NETLBL_NLTYPE_NONE 0 @@ -105,17 +109,49 @@ struct netlbl_dom_map; /* Domain mapping operations */ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); -/* LSM security attributes */ +/* + * LSM security attributes + */ + +/** + * struct netlbl_lsm_cache - NetLabel LSM security attribute cache + * @refcount: atomic reference counter + * @free: LSM supplied function to free the cache data + * @data: LSM supplied cache data + * + * Description: + * This structure is provided for LSMs which wish to make use of the NetLabel + * caching mechanism to store LSM specific data/attributes in the NetLabel + * cache. If the LSM has to perform a lot of translation from the NetLabel + * security attributes into its own internal representation then the cache + * mechanism can provide a way to eliminate some or all of that translation + * overhead on a cache hit. + * + */ struct netlbl_lsm_cache { atomic_t refcount; void (*free) (const void *data); void *data; }; -/* The catmap bitmap field MUST be a power of two in length and large + +/** + * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap + * @startbit: the value of the lowest order bit in the bitmap + * @bitmap: the category bitmap + * @next: pointer to the next bitmap "node" or NULL + * + * Description: + * This structure is used to represent category bitmaps. Due to the large + * number of categories supported by most labeling protocols it is not + * practical to transfer a full bitmap internally so NetLabel adopts a sparse + * bitmap structure modeled after SELinux's ebitmap structure. + * The catmap bitmap field MUST be a power of two in length and large * enough to hold at least 240 bits. Special care (i.e. check the code!) * should be used when changing these values as the LSM implementation * probably has functions which rely on the sizes of these types to speed - * processing.
+ * + */ #define NETLBL_CATMAP_MAPTYPE u64 #define NETLBL_CATMAP_MAPCNT 4 #define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) @@ -127,22 +163,48 @@ struct netlbl_lsm_secattr_catmap { NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; struct netlbl_lsm_secattr_catmap *next; }; + +/** + * struct netlbl_lsm_secattr - NetLabel LSM security attributes + * @flags: indicate which attributes are contained in this structure + * @type: indicate the NLTYPE of the attributes + * @domain: the NetLabel LSM domain + * @cache: NetLabel LSM specific cache + * @attr.mls: MLS sensitivity label + * @attr.mls.cat: MLS category bitmap + * @attr.mls.lvl: MLS sensitivity level + * @attr.secid: LSM specific secid token + * + * Description: + * This structure is used to pass security attributes between NetLabel and the + * LSM modules. The flags field is used to specify which fields within the + * struct are valid and valid values can be created by bitwise OR'ing the + * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to + * specify domain specific configuration settings and is not usually used by + * NetLabel itself when returning security attributes to the LSM. + * + */ #define NETLBL_SECATTR_NONE 0x00000000 #define NETLBL_SECATTR_DOMAIN 0x00000001 #define NETLBL_SECATTR_CACHE 0x00000002 #define NETLBL_SECATTR_MLS_LVL 0x00000004 #define NETLBL_SECATTR_MLS_CAT 0x00000008 +#define NETLBL_SECATTR_SECID 0x00000010 #define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ - NETLBL_SECATTR_MLS_CAT) + NETLBL_SECATTR_MLS_CAT | \ + NETLBL_SECATTR_SECID) struct netlbl_lsm_secattr { u32 flags; - + u32 type; char *domain; - - u32 mls_lvl; - struct netlbl_lsm_secattr_catmap *mls_cat; - struct netlbl_lsm_cache *cache; + union { + struct { + struct netlbl_lsm_secattr_catmap *cat; + u32 lvl; + } mls; + u32 secid; + } attr; }; /* @@ -231,10 +293,7 @@ static inline void netlbl_secattr_catmap_free( */ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) { - secattr->flags = 0; - secattr->domain = NULL; - secattr->mls_cat = NULL; - secattr->cache = NULL; + memset(secattr, 0, sizeof(*secattr)); } /** @@ -248,11 +307,11 @@ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) */ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) { - if (secattr->cache) - netlbl_secattr_cache_free(secattr->cache); kfree(secattr->domain); - if (secattr->mls_cat) - netlbl_secattr_catmap_free(secattr->mls_cat); + if (secattr->flags & NETLBL_SECATTR_CACHE) + netlbl_secattr_cache_free(secattr->cache); + if (secattr->flags & NETLBL_SECATTR_MLS_CAT) + netlbl_secattr_catmap_free(secattr->attr.mls.cat); } /** @@ -300,7 +359,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap, gfp_t flags); /* - * LSM protocol operations + * LSM protocol operations (NetLabel LSM/kernel API) */ int netlbl_enabled(void); int netlbl_sock_setattr(struct sock *sk, @@ -308,6 +367,7 @@ int netlbl_sock_setattr(struct sock *sk, int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); int netlbl_skbuff_getattr(const struct sk_buff *skb, + u16 family, struct netlbl_lsm_secattr *secattr); void netlbl_skbuff_err(struct sk_buff *skb, int error); @@ -360,6 +420,7 @@ static inline int netlbl_sock_getattr(struct sock *sk, return -ENOSYS; } static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, + u16 family, struct netlbl_lsm_secattr *secattr) { return -ENOSYS; diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 
702fcfeb37f..82251575a9b 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -11,6 +11,25 @@ #include <linux/types.h> /* + * The maximum number of SG segments that we will put inside a + * scatterlist (unless chaining is used). Should ideally fit inside a + * single page, to avoid a higher order allocation. We could define this + * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The + * minimum value is 32 + */ +#define SCSI_MAX_SG_SEGMENTS 128 + +/* + * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit + * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. + */ +#ifdef ARCH_HAS_SG_CHAIN +#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 +#else +#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS +#endif + +/* * SCSI command lengths */ @@ -83,6 +102,7 @@ extern const unsigned char scsi_command_size[8]; #define READ_TOC 0x43 #define LOG_SELECT 0x4c #define LOG_SENSE 0x4d +#define XDWRITEREAD_10 0x53 #define MODE_SELECT_10 0x55 #define RESERVE_10 0x56 #define RELEASE_10 0x57 diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index a457fca66f6..de28aab820b 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -2,15 +2,20 @@ #define _SCSI_SCSI_CMND_H #include <linux/dma-mapping.h> +#include <linux/blkdev.h> #include <linux/list.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/scatterlist.h> -struct request; struct Scsi_Host; struct scsi_device; +struct scsi_data_buffer { + struct sg_table table; + unsigned length; + int resid; +}; /* embedded in scsi_cmnd */ struct scsi_pointer { @@ -61,15 +66,11 @@ struct scsi_cmnd { /* These elements define the operation we are about to perform */ #define MAX_COMMAND_SIZE 16 unsigned char cmnd[MAX_COMMAND_SIZE]; - unsigned request_bufflen; /* Actual request size */ struct timer_list eh_timeout; /* Used to time out the command. */ - void *request_buffer; /* Actual requested buffer */ /* These elements define the operation we ultimately want to perform */ - struct sg_table sg_table; - unsigned short use_sg; /* Number of pieces of scatter-gather */ - + struct scsi_data_buffer sdb; unsigned underflow; /* Return error if less than this amount is transferred */ @@ -79,10 +80,6 @@ struct scsi_cmnd { reconnects. 
Probably == sector size */ - - int resid; /* Number of bytes requested to be - transferred less actual number - transferred (0 if not supported) */ - - struct request *request; /* The command we are working on */ @@ -127,27 +124,55 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); -extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t); -extern void scsi_free_sgtable(struct scsi_cmnd *); +extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask); +extern void scsi_release_buffers(struct scsi_cmnd *cmd); extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); -#define scsi_sg_count(cmd) ((cmd)->use_sg) -#define scsi_sglist(cmd) ((cmd)->sg_table.sgl) -#define scsi_bufflen(cmd) ((cmd)->request_bufflen) +static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) +{ + return cmd->sdb.table.nents; +} + +static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) +{ + return cmd->sdb.table.sgl; +} + +static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) +{ + return cmd->sdb.length; +} static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid) { - cmd->resid = resid; + cmd->sdb.resid = resid; } static inline int scsi_get_resid(struct scsi_cmnd *cmd) { - return cmd->resid; + return cmd->sdb.resid; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) +static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd) +{ + return blk_bidi_rq(cmd->request) && + (cmd->request->next_rq->special != NULL); +} + +static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd) +{ + return scsi_bidi_cmnd(cmd) ? + cmd->request->next_rq->special : &cmd->sdb; +} + +static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd) +{ + return &cmd->sdb; +} + #endif /* _SCSI_SCSI_CMND_H */ diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h index d21b8913ceb..25071d5d9bf 100644 --- a/include/scsi/scsi_eh.h +++ b/include/scsi/scsi_eh.h @@ -68,16 +68,15 @@ extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len, extern int scsi_reset_provider(struct scsi_device *, int); struct scsi_eh_save { + /* saved state */ int result; enum dma_data_direction data_direction; unsigned char cmd_len; unsigned char cmnd[MAX_COMMAND_SIZE]; + struct scsi_data_buffer sdb; + struct request *next_rq; - void *buffer; - unsigned bufflen; - unsigned short use_sg; - int resid; - + /* new command support */ struct scatterlist sense_sgl; }; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 0fd4746ee39..5c58d594126 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -39,9 +39,6 @@ struct blk_queue_tags; #define DISABLE_CLUSTERING 0 #define ENABLE_CLUSTERING 1 -#define DISABLE_SG_CHAINING 0 -#define ENABLE_SG_CHAINING 1 - enum scsi_eh_timer_return { EH_NOT_HANDLED, EH_HANDLED, @@ -136,9 +133,9 @@ struct scsi_host_template { * the done callback is invoked. * * This is called to inform the LLD to transfer - * cmd->request_bufflen bytes. The cmd->use_sg speciefies the + * scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) specifies the * number of scatterlist entried in the command and - * cmd->request_buffer contains the scatterlist. + * scsi_sglist(cmd) returns the scatterlist. * * return values: see queuecommand * @@ -446,15 +443,6 @@ struct scsi_host_template { unsigned ordered_tag:1; /* - * true if the low-level driver can support sg chaining.
this - * will be removed eventually when all the drivers are - * converted to support sg chaining. - * - * Status: OBSOLETE - */ - unsigned use_sg_chaining:1; - - /* * Countdown for host blocking with no commands outstanding */ unsigned int max_host_blocked; @@ -598,7 +586,6 @@ struct Scsi_Host { unsigned unchecked_isa_dma:1; unsigned use_clustering:1; unsigned use_blk_tcq:1; - unsigned use_sg_chaining:1; /* * Host has requested that no further requests come through for the diff --git a/kernel/fork.c b/kernel/fork.c index 314f5101d2b..05e0b6f4365 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm) destroy_context(mm); free_mm(mm); } +EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index d4dc4eb48d9..a2241060113 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -348,6 +348,7 @@ static int cipso_v4_cache_check(const unsigned char *key, atomic_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; + secattr->type = NETLBL_NLTYPE_CIPSOV4; if (prev_entry == NULL) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; @@ -865,7 +866,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, } for (;;) { - host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat, + host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, host_spot + 1); if (host_spot < 0) break; @@ -948,7 +949,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, return -EPERM; break; } - ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, + ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, host_spot, GFP_ATOMIC); if (ret_val != 0) @@ -1014,7 +1015,8 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, u32 cat_iter = 0; for (;;) { - cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1); + cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, + cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) @@ -1049,7 +1051,7 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { - ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, + ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, ntohs(get_unaligned((__be16 *)&net_cat[iter])), GFP_ATOMIC); if (ret_val != 0) @@ -1130,7 +1132,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, return -ENOSPC; for (;;) { - iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); + iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, + iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 
0 : sizeof(u16)); @@ -1138,7 +1141,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, return -ENOSPC; array[array_cnt++] = iter; - iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter); + iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat, + iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); @@ -1191,7 +1195,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, else cat_low = 0; - ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat, + ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat, cat_low, cat_high, GFP_ATOMIC); @@ -1251,7 +1255,9 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, + secattr->attr.mls.lvl, + &level); if (ret_val != 0) return ret_val; @@ -1303,12 +1309,13 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->mls_lvl = level; + secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->mls_cat == NULL) + secattr->attr.mls.cat = + netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->attr.mls.cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, @@ -1316,7 +1323,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->mls_cat); + netlbl_secattr_catmap_free(secattr->attr.mls.cat); return ret_val; } @@ -1350,7 +1357,9 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, + secattr->attr.mls.lvl, + &level); if (ret_val != 0) return ret_val; @@ -1396,12 +1405,13 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->mls_lvl = level; + secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->mls_cat == NULL) + secattr->attr.mls.cat = + netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->attr.mls.cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, @@ -1409,7 +1419,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->mls_cat); + netlbl_secattr_catmap_free(secattr->attr.mls.cat); return ret_val; } @@ -1443,7 +1453,9 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, + secattr->attr.mls.lvl, + &level); if (ret_val != 0) return ret_val; @@ -1488,12 +1500,13 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->mls_lvl = level; + secattr->attr.mls.lvl = level; secattr->flags |= 
NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->mls_cat == NULL) + secattr->attr.mls.cat = + netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->attr.mls.cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, @@ -1501,7 +1514,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->mls_cat); + netlbl_secattr_catmap_free(secattr->attr.mls.cat); return ret_val; } @@ -1850,6 +1863,8 @@ static int cipso_v4_getattr(const unsigned char *cipso, ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; } + if (ret_val == 0) + secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index b11b3ecbb39..7708e2084ce 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -72,12 +72,13 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info) return false; } - err = selinux_relabel_packet_permission(sel->selsid); + err = selinux_secmark_relabel_packet_permission(sel->selsid); if (err) { printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); return false; } + selinux_secmark_refcount_inc(); return true; } @@ -110,11 +111,20 @@ secmark_tg_check(const char *tablename, const void *entry, return true; } +void secmark_tg_destroy(const struct xt_target *target, void *targinfo) +{ + switch (mode) { + case SECMARK_MODE_SEL: + selinux_secmark_refcount_dec(); + } +} + static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .family = AF_INET, .checkentry = secmark_tg_check, + .destroy = secmark_tg_destroy, .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", @@ -124,6 +134,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { .name = "SECMARK", .family = AF_INET6, .checkentry = secmark_tg_check, + .destroy = secmark_tg_destroy, .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index ba0ca8d3f77..becf91a952a 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -38,6 +38,7 @@ #include <net/genetlink.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> +#include <asm/atomic.h> #include "netlabel_user.h" #include "netlabel_cipso_v4.h" @@ -421,7 +422,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) break; } if (ret_val == 0) - netlbl_mgmt_protocount_inc(); + atomic_inc(&netlabel_mgmt_protocount); audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, &audit_info); @@ -698,7 +699,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) &audit_info, netlbl_cipsov4_doi_free); if (ret_val == 0) - netlbl_mgmt_protocount_dec(); + atomic_dec(&netlabel_mgmt_protocount); audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, &audit_info); diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index b3675bd7db3..9a8ea0195c4 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -54,9 +54,6 @@ struct netlbl_domhsh_tbl { * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_domhsh_lock); static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; - -/* Default domain mapping */ -static DEFINE_SPINLOCK(netlbl_domhsh_def_lock); static 
struct netlbl_dom_map *netlbl_domhsh_def = NULL; /* @@ -109,17 +106,14 @@ static u32 netlbl_domhsh_hash(const char *key) /** * netlbl_domhsh_search - Search for a domain entry * @domain: the domain - * @def: return default if no match is found * * Description: * Searches the domain hash table and returns a pointer to the hash table - * entry if found, otherwise NULL is returned. If @def is non-zero and a - * match is not found in the domain hash table the default mapping is returned - * if it exists. The caller is responsibile for the rcu hash table locks - * (i.e. the caller much call rcu_read_[un]lock()). + * entry if found, otherwise NULL is returned. The caller is responsible for + * the rcu hash table locks (i.e. the caller must call rcu_read_[un]lock()). * */ -static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) +static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) { u32 bkt; struct netlbl_dom_map *iter; @@ -133,10 +127,31 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) return iter; } - if (def != 0) { - iter = rcu_dereference(netlbl_domhsh_def); - if (iter != NULL && iter->valid) - return iter; + return NULL; +} + +/** + * netlbl_domhsh_search_def - Search for a domain entry + * @domain: the domain + * @def: return default if no match is found + * + * Description: + * Searches the domain hash table and returns a pointer to the hash table + * entry if an exact match is found, if an exact match is not present in the + * hash table then the default entry is returned if valid otherwise NULL is + * returned. The caller is responsible for the rcu hash table locks + * (i.e. the caller must call rcu_read_[un]lock()). + * + */ +static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) +{ + struct netlbl_dom_map *entry; + + entry = netlbl_domhsh_search(domain); + if (entry == NULL) { + entry = rcu_dereference(netlbl_domhsh_def); + if (entry != NULL && entry->valid) + return entry; } return NULL; @@ -221,24 +236,22 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, INIT_RCU_HEAD(&entry->rcu); rcu_read_lock(); + spin_lock(&netlbl_domhsh_lock); if (entry->domain != NULL) { bkt = netlbl_domhsh_hash(entry->domain); - spin_lock(&netlbl_domhsh_lock); - if (netlbl_domhsh_search(entry->domain, 0) == NULL) + if (netlbl_domhsh_search(entry->domain) == NULL) list_add_tail_rcu(&entry->list, &rcu_dereference(netlbl_domhsh)->tbl[bkt]); else ret_val = -EEXIST; - spin_unlock(&netlbl_domhsh_lock); } else { INIT_LIST_HEAD(&entry->list); - spin_lock(&netlbl_domhsh_def_lock); if (rcu_dereference(netlbl_domhsh_def) == NULL) rcu_assign_pointer(netlbl_domhsh_def, entry); else ret_val = -EEXIST; - spin_unlock(&netlbl_domhsh_def_lock); } + spin_unlock(&netlbl_domhsh_lock); audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, @@ -307,7 +320,10 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) struct audit_buffer *audit_buf; rcu_read_lock(); - entry = netlbl_domhsh_search(domain, (domain != NULL ?
0 : 1)); + if (domain) + entry = netlbl_domhsh_search(domain); + else + entry = netlbl_domhsh_search_def(domain); if (entry == NULL) goto remove_return; switch (entry->type) { @@ -316,23 +332,16 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) entry->domain); break; } - if (entry != rcu_dereference(netlbl_domhsh_def)) { - spin_lock(&netlbl_domhsh_lock); - if (entry->valid) { - entry->valid = 0; + spin_lock(&netlbl_domhsh_lock); + if (entry->valid) { + entry->valid = 0; + if (entry != rcu_dereference(netlbl_domhsh_def)) list_del_rcu(&entry->list); - ret_val = 0; - } - spin_unlock(&netlbl_domhsh_lock); - } else { - spin_lock(&netlbl_domhsh_def_lock); - if (entry->valid) { - entry->valid = 0; + else rcu_assign_pointer(netlbl_domhsh_def, NULL); - ret_val = 0; - } - spin_unlock(&netlbl_domhsh_def_lock); + ret_val = 0; } + spin_unlock(&netlbl_domhsh_lock); audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); if (audit_buf != NULL) { @@ -377,7 +386,7 @@ int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) */ struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) { - return netlbl_domhsh_search(domain, 1); + return netlbl_domhsh_search_def(domain); } /** diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 4f50949722a..c69e3e1f05c 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -34,6 +34,7 @@ #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <asm/bug.h> +#include <asm/atomic.h> #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" @@ -262,7 +263,7 @@ int netlbl_enabled(void) /* At some point we probably want to expose this mechanism to the user * as well so that admins can toggle NetLabel regardless of the * configuration */ - return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0); + return (atomic_read(&netlabel_mgmt_protocount) > 0); } /** @@ -311,7 +312,7 @@ socket_setattr_return: * @secattr: the security attributes * * Description: - * Examines the given sock to see any NetLabel style labeling has been + * Examines the given sock to see if any NetLabel style labeling has been * applied to the sock, if so it parses the socket label and returns the * security attributes in @secattr. Returns zero on success, negative values * on failure. 
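To see the reworked kernel API in use, here is a hedged sketch of an LSM-side caller of netlbl_skbuff_getattr() with the new family argument and attr union (it assumes <net/netlabel.h>; the helper name and the secid-only fallback policy are invented for illustration, while the NetLabel calls and flags themselves come from this patch):

static int example_skb_to_secid(struct sk_buff *skb, u16 family, u32 *secid)
{
	struct netlbl_lsm_secattr secattr;
	int ret_val;

	netlbl_secattr_init(&secattr);
	ret_val = netlbl_skbuff_getattr(skb, family, &secattr);
	if (ret_val == 0) {
		/* the flags field says which members of the new attr
		 * union were actually filled in by the protocol engine */
		if (secattr.flags & NETLBL_SECATTR_SECID)
			*secid = secattr.attr.secid;
		else
			ret_val = -ENOENT;
	}
	netlbl_secattr_destroy(&secattr);
	return ret_val;
}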
@@ -319,18 +320,13 @@ socket_setattr_return: */ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { - int ret_val; - - ret_val = cipso_v4_sock_getattr(sk, secattr); - if (ret_val == 0) - return 0; - - return netlbl_unlabel_getattr(secattr); + return cipso_v4_sock_getattr(sk, secattr); } /** * netlbl_skbuff_getattr - Determine the security attributes of a packet * @skb: the packet + * @family: protocol family * @secattr: the security attributes * * Description: @@ -341,13 +337,14 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) * */ int netlbl_skbuff_getattr(const struct sk_buff *skb, + u16 family, struct netlbl_lsm_secattr *secattr) { if (CIPSO_V4_OPTEXIST(skb) && cipso_v4_skbuff_getattr(skb, secattr) == 0) return 0; - return netlbl_unlabel_getattr(secattr); + return netlbl_unlabel_getattr(skb, family, secattr); } /** @@ -431,6 +428,10 @@ static int __init netlbl_init(void) if (ret_val != 0) goto init_failure; + ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE); + if (ret_val != 0) + goto init_failure; + ret_val = netlbl_netlink_init(); if (ret_val != 0) goto init_failure; diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 9c41464d58d..e2258dc3c84 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -37,14 +37,14 @@ #include <net/genetlink.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> +#include <asm/atomic.h> #include "netlabel_domainhash.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" -/* NetLabel configured protocol count */ -static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock); -static u32 netlabel_mgmt_protocount = 0; +/* NetLabel configured protocol counter */ +atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0); /* Argument struct for netlbl_domhsh_walk() */ struct netlbl_domhsh_walk_arg { @@ -71,63 +71,6 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { }; /* - * NetLabel Misc Management Functions - */ - -/** - * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count - * - * Description: - * Increment the number of labeled protocol configurations in the current - * NetLabel configuration. Keep track of this for use in determining if - * NetLabel label enforcement should be active/enabled or not in the LSM. - * - */ -void netlbl_mgmt_protocount_inc(void) -{ - spin_lock(&netlabel_mgmt_protocount_lock); - netlabel_mgmt_protocount++; - spin_unlock(&netlabel_mgmt_protocount_lock); -} - -/** - * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count - * - * Description: - * Decrement the number of labeled protocol configurations in the current - * NetLabel configuration. Keep track of this for use in determining if - * NetLabel label enforcement should be active/enabled or not in the LSM. - * - */ -void netlbl_mgmt_protocount_dec(void) -{ - spin_lock(&netlabel_mgmt_protocount_lock); - if (netlabel_mgmt_protocount > 0) - netlabel_mgmt_protocount--; - spin_unlock(&netlabel_mgmt_protocount_lock); -} - -/** - * netlbl_mgmt_protocount_value - Return the number of configured protocols - * - * Description: - * Return the number of labeled protocols in the current NetLabel - * configuration. This value is useful in determining if NetLabel label - * enforcement should be active/enabled or not in the LSM. 
- * - */ -u32 netlbl_mgmt_protocount_value(void) -{ - u32 val; - - rcu_read_lock(); - val = netlabel_mgmt_protocount; - rcu_read_unlock(); - - return val; -} - -/* * NetLabel Command Handlers */ diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h index ccb2b392359..a43bff169d6 100644 --- a/net/netlabel/netlabel_mgmt.h +++ b/net/netlabel/netlabel_mgmt.h @@ -32,6 +32,7 @@ #define _NETLABEL_MGMT_H #include <net/netlabel.h> +#include <asm/atomic.h> /* * The following NetLabel payloads are supported by the management interface. @@ -168,9 +169,7 @@ enum { /* NetLabel protocol functions */ int netlbl_mgmt_genl_init(void); -/* NetLabel misc management functions */ -void netlbl_mgmt_protocount_inc(void); -void netlbl_mgmt_protocount_dec(void); -u32 netlbl_mgmt_protocount_value(void); +/* NetLabel configured protocol reference counter */ +extern atomic_t netlabel_mgmt_protocount; #endif diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 348292450de..42e81fd8cc4 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -10,7 +10,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,22 +36,92 @@ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/security.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> - +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/net_namespace.h> #include <net/netlabel.h> #include <asm/bug.h> +#include <asm/atomic.h> #include "netlabel_user.h" #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" +#include "netlabel_mgmt.h" + +/* NOTE: at present we always use init's network namespace since we don't + * presently support different namespaces even though the majority of + * the functions in this file are "namespace safe" */ + +/* The unlabeled connection hash table which we use to map network interfaces + * and addresses of unlabeled packets to a user specified secid value for the + * LSM. The hash table is used to lookup the network interface entry + * (struct netlbl_unlhsh_iface) and then the interface entry is used to + * lookup an IP address match from an ordered list. If a network interface + * match can not be found in the hash table then the default entry + * (netlbl_unlhsh_def) is used. The IP address entry list + * (struct netlbl_unlhsh_addr) is ordered such that the entries with a + * larger netmask come first. 
+ */ +struct netlbl_unlhsh_tbl { + struct list_head *tbl; + u32 size; +}; +struct netlbl_unlhsh_addr4 { + __be32 addr; + __be32 mask; + u32 secid; + + u32 valid; + struct list_head list; + struct rcu_head rcu; +}; +struct netlbl_unlhsh_addr6 { + struct in6_addr addr; + struct in6_addr mask; + u32 secid; + + u32 valid; + struct list_head list; + struct rcu_head rcu; +}; +struct netlbl_unlhsh_iface { + int ifindex; + struct list_head addr4_list; + struct list_head addr6_list; + + u32 valid; + struct list_head list; + struct rcu_head rcu; +}; + +/* Argument struct for netlbl_unlhsh_walk() */ +struct netlbl_unlhsh_walk_arg { + struct netlink_callback *nl_cb; + struct sk_buff *skb; + u32 seq; +}; + +/* Unlabeled connection hash table */ +/* updates should be so rare that having one spinlock for the entire + * hash table should be okay */ +static DEFINE_SPINLOCK(netlbl_unlhsh_lock); +static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; +static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; /* Accept unlabeled packets flag */ -static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock); static u8 netlabel_unlabel_acceptflg = 0; -/* NetLabel Generic NETLINK CIPSOv4 family */ +/* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -63,11 +133,841 @@ static struct genl_family netlbl_unlabel_gnl_family = { /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, + [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, + .len = sizeof(struct in_addr) }, + [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, + .len = sizeof(struct in_addr) }, + [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, + [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } }; /* - * Helper Functions + * Audit Helper Functions + */ + +/** + * netlbl_unlabel_audit_addr4 - Audit an IPv4 address + * @audit_buf: audit buffer + * @dev: network interface + * @addr: IP address + * @mask: IP address mask + * + * Description: + * Write the IPv4 address and address mask, if necessary, to @audit_buf. + * + */ +static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf, + const char *dev, + __be32 addr, __be32 mask) +{ + u32 mask_val = ntohl(mask); + + if (dev != NULL) + audit_log_format(audit_buf, " netif=%s", dev); + audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr)); + if (mask_val != 0xffffffff) { + u32 mask_len = 0; + while (mask_val > 0) { + mask_val <<= 1; + mask_len++; + } + audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); + } +} + +/** + * netlbl_unlabel_audit_addr6 - Audit an IPv6 address + * @audit_buf: audit buffer + * @dev: network interface + * @addr: IP address + * @mask: IP address mask + * + * Description: + * Write the IPv6 address and address mask, if necessary, to @audit_buf. 
+ * + */ +static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf, + const char *dev, + const struct in6_addr *addr, + const struct in6_addr *mask) +{ + if (dev != NULL) + audit_log_format(audit_buf, " netif=%s", dev); + audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr)); + if (ntohl(mask->s6_addr32[3]) != 0xffffffff) { + u32 mask_len = 0; + u32 mask_val; + int iter = -1; + while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff) + mask_len += 32; + mask_val = ntohl(mask->s6_addr32[iter]); + while (mask_val > 0) { + mask_val <<= 1; + mask_len++; + } + audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); + } +} + +/* + * Unlabeled Connection Hash Table Functions + */ + +/** + * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that memory allocated to a hash table address entry can be + * released safely. + * + */ +static void netlbl_unlhsh_free_addr4(struct rcu_head *entry) +{ + struct netlbl_unlhsh_addr4 *ptr; + + ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu); + kfree(ptr); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that memory allocated to a hash table address entry can be + * released safely. + * + */ +static void netlbl_unlhsh_free_addr6(struct rcu_head *entry) +{ + struct netlbl_unlhsh_addr6 *ptr; + + ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu); + kfree(ptr); +} +#endif /* IPv6 */ + +/** + * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that memory allocated to a hash table interface entry can be + * released safely. It is important to note that this function does not free + * the IPv4 and IPv6 address lists contained as part of an interface entry. It + * is up to the rest of the code to make sure an interface entry is only freed + * once its address lists are empty. + * + */ +static void netlbl_unlhsh_free_iface(struct rcu_head *entry) +{ + struct netlbl_unlhsh_iface *iface; + struct netlbl_unlhsh_addr4 *iter4; + struct netlbl_unlhsh_addr4 *tmp4; + struct netlbl_unlhsh_addr6 *iter6; + struct netlbl_unlhsh_addr6 *tmp6; + + iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); + + /* no need for locks here since we are the only one with access to this + * structure */ + + list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list) + if (iter4->valid) { + list_del_rcu(&iter4->list); + kfree(iter4); + } + list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list) + if (iter6->valid) { + list_del_rcu(&iter6->list); + kfree(iter6); + } + kfree(iface); +} + +/** + * netlbl_unlhsh_hash - Hashing function for the hash table + * @ifindex: the network interface/device to hash + * + * Description: + * This is the hashing function for the unlabeled hash table, it returns the + * bucket number for the given device/interface. The caller is responsible for + * calling the rcu_read_[un]lock() functions.
+ *
+ */
+static u32 netlbl_unlhsh_hash(int ifindex)
+{
+	/* this is taken _almost_ directly from
+	 * security/selinux/netif.c:sel_netif_hashfn() as they do pretty much
+	 * the same thing */
+	return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1);
+}
+
+/**
+ * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry
+ * @addr: IPv4 address
+ * @iface: the network interface entry
+ *
+ * Description:
+ * Searches the IPv4 address list of the network interface specified by
+ * @iface.  If a matching address entry is found it is returned, otherwise
+ * NULL is returned.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4(
+				       __be32 addr,
+				       const struct netlbl_unlhsh_iface *iface)
+{
+	struct netlbl_unlhsh_addr4 *iter;
+
+	list_for_each_entry_rcu(iter, &iface->addr4_list, list)
+		if (iter->valid && (addr & iter->mask) == iter->addr)
+			return iter;
+
+	return NULL;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry
+ * @addr: IPv6 address
+ * @iface: the network interface entry
+ *
+ * Description:
+ * Searches the IPv6 address list of the network interface specified by
+ * @iface.  If a matching address entry is found it is returned, otherwise
+ * NULL is returned.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6(
+				       const struct in6_addr *addr,
+				       const struct netlbl_unlhsh_iface *iface)
+{
+	struct netlbl_unlhsh_addr6 *iter;
+
+	list_for_each_entry_rcu(iter, &iface->addr6_list, list)
+		if (iter->valid &&
+		    ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
+			return iter;
+
+	return NULL;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_search_iface - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex, otherwise NULL is returned.  The
+ * caller is responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
+{
+	u32 bkt;
+	struct netlbl_unlhsh_iface *iter;
+
+	bkt = netlbl_unlhsh_hash(ifindex);
+	list_for_each_entry_rcu(iter,
+				&rcu_dereference(netlbl_unlhsh)->tbl[bkt],
+				list)
+		if (iter->valid && iter->ifindex == ifindex)
+			return iter;
+
+	return NULL;
+}
+
+/**
+ * netlbl_unlhsh_search_iface_def - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex.  If an exact match can not be found
+ * and there is a valid default entry, the default entry is returned,
+ * otherwise NULL is returned.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
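+ * In other words, a valid default entry acts as a catch-all for traffic
+ * arriving on interfaces which do not have an entry of their own.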
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex)
+{
+	struct netlbl_unlhsh_iface *entry;
+
+	entry = netlbl_unlhsh_search_iface(ifindex);
+	if (entry != NULL)
+		return entry;
+
+	entry = rcu_dereference(netlbl_unlhsh_def);
+	if (entry != NULL && entry->valid)
+		return entry;
+
+	return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv4 address in network byte order
+ * @mask: IPv4 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface.  On success zero is returned,
+ * otherwise a negative value is returned.  The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
+				   const struct in_addr *addr,
+				   const struct in_addr *mask,
+				   u32 secid)
+{
+	struct netlbl_unlhsh_addr4 *entry;
+	struct netlbl_unlhsh_addr4 *iter;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	entry->addr = addr->s_addr & mask->s_addr;
+	entry->mask = mask->s_addr;
+	entry->secid = secid;
+	entry->valid = 1;
+	INIT_RCU_HEAD(&entry->rcu);
+
+	spin_lock(&netlbl_unlhsh_lock);
+	iter = netlbl_unlhsh_search_addr4(entry->addr, iface);
+	if (iter != NULL &&
+	    iter->addr == addr->s_addr && iter->mask == mask->s_addr) {
+		spin_unlock(&netlbl_unlhsh_lock);
+		kfree(entry);
+		return -EEXIST;
+	}
+	/* in order to speed up address searches through the list (the common
+	 * case) we need to keep the list in order based on the size of the
+	 * address mask such that the entry with the most specific mask (the
+	 * largest numerical value) appears first in the list */
+	list_for_each_entry_rcu(iter, &iface->addr4_list, list)
+		if (iter->valid &&
+		    ntohl(entry->mask) > ntohl(iter->mask)) {
+			__list_add_rcu(&entry->list,
+				       iter->list.prev,
+				       &iter->list);
+			spin_unlock(&netlbl_unlhsh_lock);
+			return 0;
+		}
+	list_add_tail_rcu(&entry->list, &iface->addr4_list);
+	spin_unlock(&netlbl_unlhsh_lock);
+	return 0;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv6 address in network byte order
+ * @mask: IPv6 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface.  On success zero is returned,
+ * otherwise a negative value is returned.  The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
+				   const struct in6_addr *addr,
+				   const struct in6_addr *mask,
+				   u32 secid)
+{
+	struct netlbl_unlhsh_addr6 *entry;
+	struct netlbl_unlhsh_addr6 *iter;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	ipv6_addr_copy(&entry->addr, addr);
+	entry->addr.s6_addr32[0] &= mask->s6_addr32[0];
+	entry->addr.s6_addr32[1] &= mask->s6_addr32[1];
+	entry->addr.s6_addr32[2] &= mask->s6_addr32[2];
+	entry->addr.s6_addr32[3] &= mask->s6_addr32[3];
+	ipv6_addr_copy(&entry->mask, mask);
+	entry->secid = secid;
+	entry->valid = 1;
+	INIT_RCU_HEAD(&entry->rcu);
+
+	spin_lock(&netlbl_unlhsh_lock);
+	iter = netlbl_unlhsh_search_addr6(&entry->addr, iface);
+	if (iter != NULL &&
+	    (ipv6_addr_equal(&iter->addr, addr) &&
+	     ipv6_addr_equal(&iter->mask, mask))) {
+		spin_unlock(&netlbl_unlhsh_lock);
+		kfree(entry);
+		return -EEXIST;
+	}
+	/* in order to speed up address searches through the list (the common
+	 * case) we need to keep the list in order based on the size of the
+	 * address mask such that the entry with the most specific mask (the
+	 * largest numerical value) appears first in the list */
+	list_for_each_entry_rcu(iter, &iface->addr6_list, list)
+		if (iter->valid &&
+		    ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
+			__list_add_rcu(&entry->list,
+				       iter->list.prev,
+				       &iter->list);
+			spin_unlock(&netlbl_unlhsh_lock);
+			return 0;
+		}
+	list_add_tail_rcu(&entry->list, &iface->addr6_list);
+	spin_unlock(&netlbl_unlhsh_lock);
+	return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
+ * @ifindex: network interface
+ *
+ * Description:
+ * Add a new, empty, interface entry into the unlabeled connection hash table.
+ * On success a pointer to the new interface entry is returned, on failure
+ * NULL is returned.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
+{
+	u32 bkt;
+	struct netlbl_unlhsh_iface *iface;
+
+	iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
+	if (iface == NULL)
+		return NULL;
+
+	iface->ifindex = ifindex;
+	INIT_LIST_HEAD(&iface->addr4_list);
+	INIT_LIST_HEAD(&iface->addr6_list);
+	iface->valid = 1;
+	INIT_RCU_HEAD(&iface->rcu);
+
+	spin_lock(&netlbl_unlhsh_lock);
+	if (ifindex > 0) {
+		bkt = netlbl_unlhsh_hash(ifindex);
+		if (netlbl_unlhsh_search_iface(ifindex) != NULL)
+			goto add_iface_failure;
+		list_add_tail_rcu(&iface->list,
+				  &rcu_dereference(netlbl_unlhsh)->tbl[bkt]);
+	} else {
+		INIT_LIST_HEAD(&iface->list);
+		if (rcu_dereference(netlbl_unlhsh_def) != NULL)
+			goto add_iface_failure;
+		rcu_assign_pointer(netlbl_unlhsh_def, iface);
+	}
+	spin_unlock(&netlbl_unlhsh_lock);
+
+	return iface;
+
+add_iface_failure:
+	spin_unlock(&netlbl_unlhsh_lock);
+	kfree(iface);
+	return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @secid: LSM secid value for the entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new entry to the unlabeled connection hash table.  Returns zero on
+ * success, negative values on failure.
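+ * A minimal usage sketch (hypothetical values; error handling omitted):
+ *
+ *	struct in_addr addr = { .s_addr = htonl(0x0a000000) };
+ *	struct in_addr mask = { .s_addr = htonl(0xff000000) };
+ *	rc = netlbl_unlhsh_add(&init_net, "eth0", &addr, &mask,
+ *			       sizeof(struct in_addr), secid, &audit_info);
+ *
+ * With such an entry in place, unlabeled packets received on "eth0" from a
+ * 10.x.x.x source address are assigned the LSM secid given in @secid.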
+ * + */ +static int netlbl_unlhsh_add(struct net *net, + const char *dev_name, + const void *addr, + const void *mask, + u32 addr_len, + u32 secid, + struct netlbl_audit *audit_info) +{ + int ret_val; + int ifindex; + struct net_device *dev; + struct netlbl_unlhsh_iface *iface; + struct in_addr *addr4, *mask4; + struct in6_addr *addr6, *mask6; + struct audit_buffer *audit_buf = NULL; + char *secctx = NULL; + u32 secctx_len; + + if (addr_len != sizeof(struct in_addr) && + addr_len != sizeof(struct in6_addr)) + return -EINVAL; + + rcu_read_lock(); + if (dev_name != NULL) { + dev = dev_get_by_name(net, dev_name); + if (dev == NULL) { + ret_val = -ENODEV; + goto unlhsh_add_return; + } + ifindex = dev->ifindex; + dev_put(dev); + iface = netlbl_unlhsh_search_iface(ifindex); + } else { + ifindex = 0; + iface = rcu_dereference(netlbl_unlhsh_def); + } + if (iface == NULL) { + iface = netlbl_unlhsh_add_iface(ifindex); + if (iface == NULL) { + ret_val = -ENOMEM; + goto unlhsh_add_return; + } + } + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, + audit_info); + switch (addr_len) { + case sizeof(struct in_addr): + addr4 = (struct in_addr *)addr; + mask4 = (struct in_addr *)mask; + ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); + if (audit_buf != NULL) + netlbl_unlabel_audit_addr4(audit_buf, + dev_name, + addr4->s_addr, + mask4->s_addr); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case sizeof(struct in6_addr): + addr6 = (struct in6_addr *)addr; + mask6 = (struct in6_addr *)mask; + ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); + if (audit_buf != NULL) + netlbl_unlabel_audit_addr6(audit_buf, + dev_name, + addr6, mask6); + break; +#endif /* IPv6 */ + default: + ret_val = -EINVAL; + } + if (ret_val == 0) + atomic_inc(&netlabel_mgmt_protocount); + +unlhsh_add_return: + rcu_read_unlock(); + if (audit_buf != NULL) { + if (security_secid_to_secctx(secid, + &secctx, + &secctx_len) == 0) { + audit_log_format(audit_buf, " sec_obj=%s", secctx); + security_release_secctx(secctx, secctx_len); + } + audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + return ret_val; +} + +/** + * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry + * @net: network namespace + * @iface: interface entry + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Remove an IP address entry from the unlabeled connection hash table. + * Returns zero on success, negative values on failure. The caller is + * responsible for calling the rcu_read_[un]lock() functions. + * + */ +static int netlbl_unlhsh_remove_addr4(struct net *net, + struct netlbl_unlhsh_iface *iface, + const struct in_addr *addr, + const struct in_addr *mask, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOENT; + struct netlbl_unlhsh_addr4 *entry; + struct audit_buffer *audit_buf = NULL; + struct net_device *dev; + char *secctx = NULL; + u32 secctx_len; + + spin_lock(&netlbl_unlhsh_lock); + entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface); + if (entry != NULL && + entry->addr == addr->s_addr && entry->mask == mask->s_addr) { + entry->valid = 0; + list_del_rcu(&entry->list); + ret_val = 0; + } + spin_unlock(&netlbl_unlhsh_lock); + + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, + audit_info); + if (audit_buf != NULL) { + dev = dev_get_by_index(net, iface->ifindex); + netlbl_unlabel_audit_addr4(audit_buf, + (dev != NULL ? 
dev->name : NULL),
+					   addr->s_addr, mask->s_addr);
+		if (dev != NULL)
+			dev_put(dev);
+		if (entry != NULL &&
+		    security_secid_to_secctx(entry->secid,
+					     &secctx,
+					     &secctx_len) == 0) {
+			audit_log_format(audit_buf, " sec_obj=%s", secctx);
+			security_release_secctx(secctx, secctx_len);
+		}
+		audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	if (ret_val == 0)
+		call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4);
+	return ret_val;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.  The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_remove_addr6(struct net *net,
+				      struct netlbl_unlhsh_iface *iface,
+				      const struct in6_addr *addr,
+				      const struct in6_addr *mask,
+				      struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOENT;
+	struct netlbl_unlhsh_addr6 *entry;
+	struct audit_buffer *audit_buf = NULL;
+	struct net_device *dev;
+	char *secctx = NULL;
+	u32 secctx_len;
+
+	spin_lock(&netlbl_unlhsh_lock);
+	entry = netlbl_unlhsh_search_addr6(addr, iface);
+	if (entry != NULL &&
+	    (ipv6_addr_equal(&entry->addr, addr) &&
+	     ipv6_addr_equal(&entry->mask, mask))) {
+		entry->valid = 0;
+		list_del_rcu(&entry->list);
+		ret_val = 0;
+	}
+	spin_unlock(&netlbl_unlhsh_lock);
+
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+					      audit_info);
+	if (audit_buf != NULL) {
+		dev = dev_get_by_index(net, iface->ifindex);
+		netlbl_unlabel_audit_addr6(audit_buf,
+					   (dev != NULL ? dev->name : NULL),
+					   addr, mask);
+		if (dev != NULL)
+			dev_put(dev);
+		if (entry != NULL &&
+		    security_secid_to_secctx(entry->secid,
+					     &secctx,
+					     &secctx_len) == 0) {
+			audit_log_format(audit_buf, " sec_obj=%s", secctx);
+			security_release_secctx(secctx, secctx_len);
+		}
+		audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	if (ret_val == 0)
+		call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6);
+	return ret_val;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_condremove_iface - Remove an interface entry
+ * @iface: the interface entry
+ *
+ * Description:
+ * Remove an interface entry from the unlabeled connection hash table if it is
+ * empty.  An interface entry is considered to be empty if there are no
+ * address entries assigned to it.
+ *
+ */
+static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
+{
+	struct netlbl_unlhsh_addr4 *iter4;
+	struct netlbl_unlhsh_addr6 *iter6;
+
+	spin_lock(&netlbl_unlhsh_lock);
+	list_for_each_entry_rcu(iter4, &iface->addr4_list, list)
+		if (iter4->valid)
+			goto unlhsh_condremove_failure;
+	list_for_each_entry_rcu(iter6, &iface->addr6_list, list)
+		if (iter6->valid)
+			goto unlhsh_condremove_failure;
+	iface->valid = 0;
+	if (iface->ifindex > 0)
+		list_del_rcu(&iface->list);
+	else
+		rcu_assign_pointer(netlbl_unlhsh_def, NULL);
+	spin_unlock(&netlbl_unlhsh_lock);
+
+	call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+	return;
+
+unlhsh_condremove_failure:
+	spin_unlock(&netlbl_unlhsh_lock);
+	return;
+}
+
+/**
+ * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an existing entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlhsh_remove(struct net *net,
+				const char *dev_name,
+				const void *addr,
+				const void *mask,
+				u32 addr_len,
+				struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	struct net_device *dev;
+	struct netlbl_unlhsh_iface *iface;
+
+	if (addr_len != sizeof(struct in_addr) &&
+	    addr_len != sizeof(struct in6_addr))
+		return -EINVAL;
+
+	rcu_read_lock();
+	if (dev_name != NULL) {
+		dev = dev_get_by_name(net, dev_name);
+		if (dev == NULL) {
+			ret_val = -ENODEV;
+			goto unlhsh_remove_return;
+		}
+		iface = netlbl_unlhsh_search_iface(dev->ifindex);
+		dev_put(dev);
+	} else
+		iface = rcu_dereference(netlbl_unlhsh_def);
+	if (iface == NULL) {
+		ret_val = -ENOENT;
+		goto unlhsh_remove_return;
+	}
+	switch (addr_len) {
+	case sizeof(struct in_addr):
+		ret_val = netlbl_unlhsh_remove_addr4(net,
+						     iface, addr, mask,
+						     audit_info);
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case sizeof(struct in6_addr):
+		ret_val = netlbl_unlhsh_remove_addr6(net,
+						     iface, addr, mask,
+						     audit_info);
+		break;
+#endif /* IPv6 */
+	default:
+		ret_val = -EINVAL;
+	}
+	if (ret_val == 0) {
+		netlbl_unlhsh_condremove_iface(iface);
+		atomic_dec(&netlabel_mgmt_protocount);
+	}
+
+unlhsh_remove_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/*
+ * General Helper Functions
+ */
+
+/**
+ * netlbl_unlhsh_netdev_handler - Network device notification handler
+ * @this: notifier block
+ * @event: the event
+ * @ptr: the network device (cast to void)
+ *
+ * Description:
+ * Handle network device events, although at present all we care about is a
+ * network device going away.  In the case of a device going away we clear any
+ * related entries from the unlabeled connection hash table.
+ *
+ */
+static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
+					unsigned long event,
+					void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct netlbl_unlhsh_iface *iface = NULL;
+
+	if (dev->nd_net != &init_net)
+		return NOTIFY_DONE;
+
+	/* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER?
+	 */
+	if (event == NETDEV_DOWN) {
+		spin_lock(&netlbl_unlhsh_lock);
+		iface = netlbl_unlhsh_search_iface(dev->ifindex);
+		if (iface != NULL && iface->valid) {
+			iface->valid = 0;
+			list_del_rcu(&iface->list);
+		} else
+			iface = NULL;
+		spin_unlock(&netlbl_unlhsh_lock);
+	}
+
+	if (iface != NULL)
+		call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+
+	return NOTIFY_DONE;
+}
 
 /**
  * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
@@ -84,11 +984,8 @@ static void netlbl_unlabel_acceptflg_set(u8 value,
 	struct audit_buffer *audit_buf;
 	u8 old_val;
 
-	spin_lock(&netlabel_unlabel_acceptflg_lock);
 	old_val = netlabel_unlabel_acceptflg;
 	netlabel_unlabel_acceptflg = value;
-	spin_unlock(&netlabel_unlabel_acceptflg_lock);
-
 	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
 					      audit_info);
 	if (audit_buf != NULL) {
@@ -98,6 +995,48 @@
 	}
 }
 
+/**
+ * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
+ * @info: the Generic NETLINK info block
+ * @addr: the IP address
+ * @mask: the IP address mask
+ * @len: the address length
+ *
+ * Description:
+ * Examine the Generic NETLINK message and extract the IP address information.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
+				       void **addr,
+				       void **mask,
+				       u32 *len)
+{
+	u32 addr_len;
+
+	if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
+		addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+		if (addr_len != sizeof(struct in_addr) &&
+		    addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
+			return -EINVAL;
+		*len = addr_len;
+		*addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+		*mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
+		return 0;
+	} else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
+		addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+		if (addr_len != sizeof(struct in6_addr) &&
+		    addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
+			return -EINVAL;
+		*len = addr_len;
+		*addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+		*mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 /*
  * NetLabel Command Handlers
  */
@@ -155,11 +1094,9 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
 		goto list_failure;
 	}
 
-	rcu_read_lock();
 	ret_val = nla_put_u8(ans_skb,
 			     NLBL_UNLABEL_A_ACPTFLG,
 			     netlabel_unlabel_acceptflg);
-	rcu_read_unlock();
 	if (ret_val != 0)
 		goto list_failure;
 
@@ -175,11 +1112,489 @@ list_failure:
 	return ret_val;
 }
 
+/**
+ * netlbl_unlabel_staticadd - Handle a STATICADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADD message and add a new unlabeled
+ * connection entry to the hash table.  Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticadd(struct sk_buff *skb,
+				    struct genl_info *info)
+{
+	int ret_val;
+	char *dev_name;
+	void *addr;
+	void *mask;
+	u32 addr_len;
+	u32 secid;
+	struct netlbl_audit audit_info;
+
+	/* Don't allow users to add both IPv4 and IPv6 addresses for a
+	 * single entry.  However, allow users to create two entries, one each
+	 * for IPv4 and IPv6, with the same LSM security context, which should
+	 * achieve the same result.
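+	 * The attribute check below enforces this by requiring one, and only
+	 * one, complete IPv4 or IPv6 address/mask attribute pair.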
*/ + if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || + !info->attrs[NLBL_UNLABEL_A_IFACE] || + !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ + (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); + if (ret_val != 0) + return ret_val; + dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); + ret_val = security_secctx_to_secid( + nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), + nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), + &secid); + if (ret_val != 0) + return ret_val; + + return netlbl_unlhsh_add(&init_net, + dev_name, addr, mask, addr_len, secid, + &audit_info); +} + +/** + * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated STATICADDDEF message and add a new default + * unlabeled connection entry. Returns zero on success, negative values on + * failure. + * + */ +static int netlbl_unlabel_staticadddef(struct sk_buff *skb, + struct genl_info *info) +{ + int ret_val; + void *addr; + void *mask; + u32 addr_len; + u32 secid; + struct netlbl_audit audit_info; + + /* Don't allow users to add both IPv4 and IPv6 addresses for a + * single entry. However, allow users to create two entries, one each + * for IPv4 and IPv6, with the same LSM security context which should + * achieve the same result. */ + if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || + !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ + (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); + if (ret_val != 0) + return ret_val; + ret_val = security_secctx_to_secid( + nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), + nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), + &secid); + if (ret_val != 0) + return ret_val; + + return netlbl_unlhsh_add(&init_net, + NULL, addr, mask, addr_len, secid, + &audit_info); +} + +/** + * netlbl_unlabel_staticremove - Handle a STATICREMOVE message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated STATICREMOVE message and remove the specified + * unlabeled connection entry. Returns zero on success, negative values on + * failure. + * + */ +static int netlbl_unlabel_staticremove(struct sk_buff *skb, + struct genl_info *info) +{ + int ret_val; + char *dev_name; + void *addr; + void *mask; + u32 addr_len; + struct netlbl_audit audit_info; + + /* See the note in netlbl_unlabel_staticadd() about not allowing both + * IPv4 and IPv6 in the same entry. 
*/
+	if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
+	    !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+	      (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+	if (ret_val != 0)
+		return ret_val;
+	dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+
+	return netlbl_unlhsh_remove(&init_net,
+				    dev_name, addr, mask, addr_len,
+				    &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVEDEF message and remove the default
+ * unlabeled connection entry.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	int ret_val;
+	void *addr;
+	void *mask;
+	u32 addr_len;
+	struct netlbl_audit audit_info;
+
+	/* See the note in netlbl_unlabel_staticadd() about not allowing both
+	 * IPv4 and IPv6 in the same entry. */
+	if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+	      (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+	if (ret_val != 0)
+		return ret_val;
+
+	return netlbl_unlhsh_remove(&init_net,
+				    NULL, addr, mask, addr_len,
+				    &audit_info);
+}
+
+
+/**
+ * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
+ * @cmd: command/message
+ * @iface: the interface entry
+ * @addr4: the IPv4 address entry
+ * @addr6: the IPv6 address entry
+ * @arg: the netlbl_unlhsh_walk_arg structure
+ *
+ * Description:
+ * This function is designed to be used to generate a response for a
+ * STATICLIST or STATICLISTDEF message.  When called, either @addr4 or @addr6
+ * should be specified, never both; the caller must set the unused address
+ * entry to NULL.  Returns the size of the message on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticlist_gen(u32 cmd,
+				       const struct netlbl_unlhsh_iface *iface,
+				       const struct netlbl_unlhsh_addr4 *addr4,
+				       const struct netlbl_unlhsh_addr6 *addr6,
+				       void *arg)
+{
+	int ret_val = -ENOMEM;
+	struct netlbl_unlhsh_walk_arg *cb_arg = arg;
+	struct net_device *dev;
+	void *data;
+	u32 secid;
+	char *secctx;
+	u32 secctx_len;
+
+	data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
+			   cb_arg->seq, &netlbl_unlabel_gnl_family,
+			   NLM_F_MULTI, cmd);
+	if (data == NULL)
+		goto list_cb_failure;
+
+	if (iface->ifindex > 0) {
+		dev = dev_get_by_index(&init_net, iface->ifindex);
+		if (dev == NULL) {
+			ret_val = -ENODEV;
+			goto list_cb_failure;
+		}
+		ret_val = nla_put_string(cb_arg->skb,
+					 NLBL_UNLABEL_A_IFACE, dev->name);
+		dev_put(dev);
+		if (ret_val != 0)
+			goto list_cb_failure;
+	}
+
+	if (addr4) {
+		struct in_addr addr_struct;
+
+		addr_struct.s_addr = addr4->addr;
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV4ADDR,
+				  sizeof(struct in_addr),
+				  &addr_struct);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		addr_struct.s_addr = addr4->mask;
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV4MASK,
+				  sizeof(struct in_addr),
+				  &addr_struct);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		secid = addr4->secid;
+	} else {
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV6ADDR,
+				  sizeof(struct in6_addr),
+				  &addr6->addr);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV6MASK,
+				  sizeof(struct in6_addr),
+				  &addr6->mask);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		secid = addr6->secid;
+	}
+
+	ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
+	if (ret_val != 0)
+		goto list_cb_failure;
+	ret_val = nla_put(cb_arg->skb,
+			  NLBL_UNLABEL_A_SECCTX,
+			  secctx_len,
+			  secctx);
+	security_release_secctx(secctx, secctx_len);
+	if (ret_val != 0)
+		goto list_cb_failure;
+
+	cb_arg->seq++;
+	return genlmsg_end(cb_arg->skb, data);
+
+list_cb_failure:
+	genlmsg_cancel(cb_arg->skb, data);
+	return ret_val;
+}
+
+/**
+ * netlbl_unlabel_staticlist - Handle a STATICLIST message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLIST message and dump the unlabeled
+ * connection hash table in a form suitable for use in a kernel generated
+ * STATICLIST message.  Returns the length of @skb.
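+ * The dump is resumable; the current bucket, chain, and address list
+ * positions are saved in @cb->args[] so that a multi-part dump can pick up
+ * where the previous message left off.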
+ *
+ */
+static int netlbl_unlabel_staticlist(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	struct netlbl_unlhsh_walk_arg cb_arg;
+	u32 skip_bkt = cb->args[0];
+	u32 skip_chain = cb->args[1];
+	u32 skip_addr4 = cb->args[2];
+	u32 skip_addr6 = cb->args[3];
+	u32 iter_bkt;
+	u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
+	struct netlbl_unlhsh_iface *iface;
+	struct netlbl_unlhsh_addr4 *addr4;
+	struct netlbl_unlhsh_addr6 *addr6;
+
+	cb_arg.nl_cb = cb;
+	cb_arg.skb = skb;
+	cb_arg.seq = cb->nlh->nlmsg_seq;
+
+	rcu_read_lock();
+	for (iter_bkt = skip_bkt;
+	     iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
+	     iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
+		list_for_each_entry_rcu(iface,
+				&rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt],
+				list) {
+			if (!iface->valid ||
+			    iter_chain++ < skip_chain)
+				continue;
+			list_for_each_entry_rcu(addr4,
+						&iface->addr4_list,
+						list) {
+				if (!addr4->valid || iter_addr4++ < skip_addr4)
+					continue;
+				if (netlbl_unlabel_staticlist_gen(
+					      NLBL_UNLABEL_C_STATICLIST,
+					      iface,
+					      addr4,
+					      NULL,
+					      &cb_arg) < 0) {
+					iter_addr4--;
+					iter_chain--;
+					goto unlabel_staticlist_return;
+				}
+			}
+			list_for_each_entry_rcu(addr6,
+						&iface->addr6_list,
+						list) {
+				if (!addr6->valid || iter_addr6++ < skip_addr6)
+					continue;
+				if (netlbl_unlabel_staticlist_gen(
+					      NLBL_UNLABEL_C_STATICLIST,
+					      iface,
+					      NULL,
+					      addr6,
+					      &cb_arg) < 0) {
+					iter_addr6--;
+					iter_chain--;
+					goto unlabel_staticlist_return;
+				}
+			}
+		}
+	}
+
+unlabel_staticlist_return:
+	rcu_read_unlock();
+	cb->args[0] = iter_bkt;
+	cb->args[1] = iter_chain;
+	cb->args[2] = iter_addr4;
+	cb->args[3] = iter_addr6;
+	return skb->len;
+}
+
+/**
+ * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLISTDEF message and dump the default
+ * unlabeled connection entry in a form suitable for use in a kernel generated
+ * STATICLISTDEF message.  Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
+					struct netlink_callback *cb)
+{
+	struct netlbl_unlhsh_walk_arg cb_arg;
+	struct netlbl_unlhsh_iface *iface;
+	u32 skip_addr4 = cb->args[0];
+	u32 skip_addr6 = cb->args[1];
+	u32 iter_addr4 = 0, iter_addr6 = 0;
+	struct netlbl_unlhsh_addr4 *addr4;
+	struct netlbl_unlhsh_addr6 *addr6;
+
+	cb_arg.nl_cb = cb;
+	cb_arg.skb = skb;
+	cb_arg.seq = cb->nlh->nlmsg_seq;
+
+	rcu_read_lock();
+	iface = rcu_dereference(netlbl_unlhsh_def);
+	if (iface == NULL || !iface->valid)
+		goto unlabel_staticlistdef_return;
+
+	list_for_each_entry_rcu(addr4, &iface->addr4_list, list) {
+		if (!addr4->valid || iter_addr4++ < skip_addr4)
+			continue;
+		if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+						  iface,
+						  addr4,
+						  NULL,
+						  &cb_arg) < 0) {
+			iter_addr4--;
+			goto unlabel_staticlistdef_return;
+		}
+	}
+	list_for_each_entry_rcu(addr6, &iface->addr6_list, list) {
+		if (!addr6->valid || iter_addr6++ < skip_addr6)
+			continue;
+		if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+						  iface,
+						  NULL,
+						  addr6,
+						  &cb_arg) < 0) {
+			iter_addr6--;
+			goto unlabel_staticlistdef_return;
+		}
+	}
+
+unlabel_staticlistdef_return:
+	rcu_read_unlock();
+	cb->args[0] = iter_addr4;
+	cb->args[1] = iter_addr6;
+	return skb->len;
+}
 
 /*
  * NetLabel Generic NETLINK Command Definitions
  */
 
+static struct genl_ops netlbl_unlabel_genl_c_staticadd = {
+	.cmd = NLBL_UNLABEL_C_STATICADD,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticadd,
+	.dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticremove = {
+	.cmd = NLBL_UNLABEL_C_STATICREMOVE,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticremove,
+	.dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticlist = {
+	.cmd = NLBL_UNLABEL_C_STATICLIST,
+	.flags = 0,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_unlabel_staticlist,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticadddef = {
+	.cmd = NLBL_UNLABEL_C_STATICADDDEF,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticadddef,
+	.dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = {
+	.cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticremovedef,
+	.dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = {
+	.cmd = NLBL_UNLABEL_C_STATICLISTDEF,
+	.flags = 0,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_unlabel_staticlistdef,
+};
+
 static struct genl_ops netlbl_unlabel_genl_c_accept = {
 	.cmd = NLBL_UNLABEL_C_ACCEPT,
 	.flags = GENL_ADMIN_PERM,
@@ -196,7 +1611,6 @@ static struct genl_ops netlbl_unlabel_genl_c_list = {
 	.dumpit = NULL,
 };
 
-
 /*
  * NetLabel Generic NETLINK Protocol Functions
  */
@@ -218,6 +1632,36 @@
 		return ret_val;
 
 	ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+				    &netlbl_unlabel_genl_c_staticadd);
+	if (ret_val != 0)
+		return ret_val;
+
+	ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+				    &netlbl_unlabel_genl_c_staticremove);
+	if (ret_val != 0)
+		return ret_val;
+
+	ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+				    &netlbl_unlabel_genl_c_staticlist);
+	if (ret_val != 0)
+		return ret_val;
+
+	ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+				    &netlbl_unlabel_genl_c_staticadddef);
+	if (ret_val != 0)
+		return ret_val;
+
+	ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+				    &netlbl_unlabel_genl_c_staticremovedef);
+	if (ret_val != 0)
+		return ret_val;
+
+	ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+				    &netlbl_unlabel_genl_c_staticlistdef);
+	if (ret_val != 0)
+		return ret_val;
+
+	ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
 				    &netlbl_unlabel_genl_c_accept);
 	if (ret_val != 0)
 		return ret_val;
@@ -234,8 +1678,58 @@
  * NetLabel KAPI Hooks
  */
 
+static struct notifier_block netlbl_unlhsh_netdev_notifier = {
+	.notifier_call = netlbl_unlhsh_netdev_handler,
+};
+
+/**
+ * netlbl_unlabel_init - Initialize the unlabeled connection hash table
+ * @size: the number of bits to use for the hash buckets
+ *
+ * Description:
+ * Initializes the unlabeled connection hash table and registers a network
+ * device notification handler.  This function should only be called by the
+ * NetLabel subsystem itself during initialization.  Returns zero on success,
+ * non-zero values on error.
+ *
+ */
+int netlbl_unlabel_init(u32 size)
+{
+	u32 iter;
+	struct netlbl_unlhsh_tbl *hsh_tbl;
+
+	if (size == 0)
+		return -EINVAL;
+
+	hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
+	if (hsh_tbl == NULL)
+		return -ENOMEM;
+	hsh_tbl->size = 1 << size;
+	hsh_tbl->tbl = kcalloc(hsh_tbl->size,
+			       sizeof(struct list_head),
+			       GFP_KERNEL);
+	if (hsh_tbl->tbl == NULL) {
+		kfree(hsh_tbl);
+		return -ENOMEM;
+	}
+	for (iter = 0; iter < hsh_tbl->size; iter++)
+		INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
+
+	rcu_read_lock();
+	spin_lock(&netlbl_unlhsh_lock);
+	rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
+	spin_unlock(&netlbl_unlhsh_lock);
+	rcu_read_unlock();
+
+	register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
+
+	return 0;
+}
+
 /**
  * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
+ * @skb: the packet
+ * @family: protocol family
  * @secattr: the security attributes
  *
  * Description:
@@ -243,19 +1737,52 @@ int netlbl_unlabel_genl_init(void)
  * them in @secattr.  Returns zero on success and negative values on failure.
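+ * The lookup is keyed on the packet's ingress interface and source address;
+ * if no static entry matches, the packet is only accepted (with an empty
+ * label) when the unlabeled accept flag is set.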
  *
  */
-int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr)
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+			   u16 family,
+			   struct netlbl_lsm_secattr *secattr)
 {
-	int ret_val;
+	struct iphdr *hdr4;
+	struct ipv6hdr *hdr6;
+	struct netlbl_unlhsh_addr4 *addr4;
+	struct netlbl_unlhsh_addr6 *addr6;
+	struct netlbl_unlhsh_iface *iface;
 
 	rcu_read_lock();
-	if (netlabel_unlabel_acceptflg == 1) {
-		netlbl_secattr_init(secattr);
-		ret_val = 0;
-	} else
-		ret_val = -ENOMSG;
+	iface = netlbl_unlhsh_search_iface_def(skb->iif);
+	if (iface == NULL)
+		goto unlabel_getattr_nolabel;
+	switch (family) {
+	case PF_INET:
+		hdr4 = ip_hdr(skb);
+		addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface);
+		if (addr4 == NULL)
+			goto unlabel_getattr_nolabel;
+		secattr->attr.secid = addr4->secid;
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case PF_INET6:
+		hdr6 = ipv6_hdr(skb);
+		addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface);
+		if (addr6 == NULL)
+			goto unlabel_getattr_nolabel;
+		secattr->attr.secid = addr6->secid;
+		break;
+#endif /* IPv6 */
+	default:
+		goto unlabel_getattr_nolabel;
+	}
 	rcu_read_unlock();
-	return ret_val;
+	secattr->flags |= NETLBL_SECATTR_SECID;
+	secattr->type = NETLBL_NLTYPE_UNLABELED;
+	return 0;
+
+unlabel_getattr_nolabel:
+	rcu_read_unlock();
+	if (netlabel_unlabel_acceptflg == 0)
+		return -ENOMSG;
+	secattr->type = NETLBL_NLTYPE_UNLABELED;
+	return 0;
 }
 
 /**
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h
index c2917fbb42c..06b1301ac07 100644
--- a/net/netlabel/netlabel_unlabeled.h
+++ b/net/netlabel/netlabel_unlabeled.h
@@ -36,6 +36,116 @@
 /*
  * The following NetLabel payloads are supported by the Unlabeled subsystem.
  *
+ * o STATICADD
+ *   This message is sent from an application to add a new static label for
+ *   incoming unlabeled connections.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVE
+ *   This message is sent from an application to remove an existing static
+ *   label for incoming unlabeled connections.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLIST
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated STATICLIST message.  When sent by an
+ *   application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with a series of the following messages.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICADDDEF
+ *   This message is sent from an application to set the default static
+ *   label for incoming unlabeled connections.
+ *
+ *   Required attribute:
+ *
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVEDEF
+ *   This message is sent from an application to remove the existing default
+ *   static label for incoming unlabeled connections.
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLISTDEF
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated STATICLISTDEF message.  When sent by
+ *   an application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with the following message.
+ *
+ *   Required attribute:
+ *
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
  * o ACCEPT
  *   This message is sent from an application to specify if the kernel should
  *   allow unlabeled packets to pass if they do not match any of the static
@@ -62,6 +172,12 @@
 enum {
 	NLBL_UNLABEL_C_UNSPEC,
 	NLBL_UNLABEL_C_ACCEPT,
 	NLBL_UNLABEL_C_LIST,
+	NLBL_UNLABEL_C_STATICADD,
+	NLBL_UNLABEL_C_STATICREMOVE,
+	NLBL_UNLABEL_C_STATICLIST,
+	NLBL_UNLABEL_C_STATICADDDEF,
+	NLBL_UNLABEL_C_STATICREMOVEDEF,
+	NLBL_UNLABEL_C_STATICLISTDEF,
 	__NLBL_UNLABEL_C_MAX,
 };
 #define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1)
@@ -73,6 +189,24 @@
 	/* (NLA_U8)
 	 * if true then unlabeled packets are allowed to pass, else unlabeled
 	 * packets are rejected */
+	NLBL_UNLABEL_A_IPV6ADDR,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address */
+	NLBL_UNLABEL_A_IPV6MASK,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address mask */
+	NLBL_UNLABEL_A_IPV4ADDR,
+	/* (NLA_BINARY, struct in_addr)
+	 * an IPv4 address */
+	NLBL_UNLABEL_A_IPV4MASK,
+	/* (NLA_BINARY, struct in_addr)
+	 * an IPv4 address mask */
+	NLBL_UNLABEL_A_IFACE,
+	/* (NLA_NUL_STRING)
+	 * network interface */
+	NLBL_UNLABEL_A_SECCTX,
+	/* (NLA_BINARY)
+	 * an LSM-specific security context */
 	__NLBL_UNLABEL_A_MAX,
 };
 #define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
@@ -80,8 +214,17 @@
 /* NetLabel protocol functions */
 int netlbl_unlabel_genl_init(void);
 
+/* Unlabeled connection hash table size */
+/* XXX - currently this number is an uneducated guess */
+#define NETLBL_UNLHSH_BITSIZE       7
+
+/* General Unlabeled init function */
+int netlbl_unlabel_init(u32 size);
+
 /* Process Unlabeled incoming network packets */
-int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr);
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+			   u16 family,
+			   struct netlbl_lsm_secattr *secattr);
 
 /* Set the default configuration to allow Unlabeled packets */
 int netlbl_unlabel_defconf(void);
diff --git a/security/Kconfig b/security/Kconfig
index 8086e61058e..389e151e3b6 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -76,6 +76,7 @@ config SECURITY_NETWORK_XFRM
 config SECURITY_CAPABILITIES
 	bool "Default Linux Capabilities"
 	depends on SECURITY
+	default y
 	help
 	  This enables the "default" Linux capabilities
functionality. If you are unsure how to answer this question, answer Y. diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index b32a459c068..2b517d61867 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE int "NSA SELinux maximum supported policy format version value" depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX - range 15 21 + range 15 22 default 19 help This option sets the value for the maximum policy format version diff --git a/security/selinux/Makefile b/security/selinux/Makefile index dc3502e30b1..00afd85f1ed 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -4,7 +4,14 @@ obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ -selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o +selinux-y := avc.o \ + hooks.o \ + selinuxfs.o \ + netlink.o \ + nlmsgtab.o \ + netif.o \ + netnode.o \ + exports.o selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 81b3dff3cbf..e8529e2f51e 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -661,9 +661,18 @@ void avc_audit(u32 ssid, u32 tsid, "daddr", "dest"); break; } - if (a->u.net.netif) - audit_log_format(ab, " netif=%s", - a->u.net.netif); + if (a->u.net.netif > 0) { + struct net_device *dev; + + /* NOTE: we always use init's namespace */ + dev = dev_get_by_index(&init_net, + a->u.net.netif); + if (dev) { + audit_log_format(ab, " netif=%s", + dev->name); + dev_put(dev); + } + } break; } } diff --git a/security/selinux/exports.c b/security/selinux/exports.c index b6f96943be1..87d2bb3ea35 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -17,10 +17,14 @@ #include <linux/selinux.h> #include <linux/fs.h> #include <linux/ipc.h> +#include <asm/atomic.h> #include "security.h" #include "objsec.h" +/* SECMARK reference count */ +extern atomic_t selinux_secmark_refcount; + int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) { if (selinux_enabled) @@ -74,7 +78,7 @@ int selinux_string_to_sid(char *str, u32 *sid) } EXPORT_SYMBOL_GPL(selinux_string_to_sid); -int selinux_relabel_packet_permission(u32 sid) +int selinux_secmark_relabel_packet_permission(u32 sid) { if (selinux_enabled) { struct task_security_struct *tsec = current->security; @@ -84,4 +88,16 @@ int selinux_relabel_packet_permission(u32 sid) } return 0; } -EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission); +EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission); + +void selinux_secmark_refcount_inc(void) +{ + atomic_inc(&selinux_secmark_refcount); +} +EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc); + +void selinux_secmark_refcount_dec(void) +{ + atomic_dec(&selinux_secmark_refcount); +} +EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 64d414efb40..be6de0b8734 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -12,8 +12,8 @@ * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * <dgoeddel@trustedcs.com> - * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. - * Paul Moore, <paul.moore@hp.com> + * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. + * Paul Moore <paul.moore@hp.com> * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. 
* Yuichi Nakamura <ynakam@hitachisoft.jp> * @@ -50,8 +50,11 @@ #include <net/icmp.h> #include <net/ip.h> /* for local_port_range[] */ #include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ +#include <net/net_namespace.h> +#include <net/netlabel.h> #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <asm/atomic.h> #include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/netdevice.h> /* for network interface checks */ @@ -76,6 +79,7 @@ #include "avc.h" #include "objsec.h" #include "netif.h" +#include "netnode.h" #include "xfrm.h" #include "netlabel.h" @@ -89,6 +93,9 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm); extern int selinux_compat_net; extern struct security_operations *security_ops; +/* SECMARK reference count */ +atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); + #ifdef CONFIG_SECURITY_SELINUX_DEVELOP int selinux_enforcing = 0; @@ -155,6 +162,21 @@ getsecurity_exit: return len; } +/** + * selinux_secmark_enabled - Check to see if SECMARK is currently enabled + * + * Description: + * This function checks the SECMARK reference counter to see if any SECMARK + * targets are currently configured, if the reference counter is greater than + * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is + * enabled, false (0) if SECMARK is disabled. + * + */ +static int selinux_secmark_enabled(void) +{ + return (atomic_read(&selinux_secmark_refcount) > 0); +} + /* Allocate and free functions for each kind of security blob. */ static int task_alloc_security(struct task_struct *task) @@ -561,8 +583,8 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag, * Allow filesystems with binary mount data to explicitly set mount point * labeling information. */ -int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, - int *flags, int num_opts) +static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, + int *flags, int num_opts) { int rc = 0, i; struct task_security_struct *tsec = current->security; @@ -3395,7 +3417,7 @@ out: #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, - char **addrp, int *len, int src, u8 *proto) + char **addrp, int src, u8 *proto) { int ret = 0; @@ -3404,7 +3426,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, ret = selinux_parse_skb_ipv4(skb, ad, proto); if (ret || !addrp) break; - *len = 4; *addrp = (char *)(src ? &ad->u.net.v4info.saddr : &ad->u.net.v4info.daddr); break; @@ -3414,7 +3435,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, ret = selinux_parse_skb_ipv6(skb, ad, proto); if (ret || !addrp) break; - *len = 16; *addrp = (char *)(src ? &ad->u.net.v6info.saddr : &ad->u.net.v6info.daddr); break; @@ -3423,36 +3443,48 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, break; } + if (unlikely(ret)) + printk(KERN_WARNING + "SELinux: failure in selinux_parse_skb()," + " unable to parse packet\n"); + return ret; } /** - * selinux_skb_extlbl_sid - Determine the external label of a packet + * selinux_skb_peerlbl_sid - Determine the peer label of a packet * @skb: the packet - * @sid: the packet's SID + * @family: protocol family + * @sid: the packet's peer label SID * * Description: - * Check the various different forms of external packet labeling and determine - * the external SID for the packet. 
If only one form of external labeling is - * present then it is used, if both labeled IPsec and NetLabel labels are - * present then the SELinux type information is taken from the labeled IPsec - * SA and the MLS sensitivity label information is taken from the NetLabel - * security attributes. This bit of "magic" is done in the call to - * selinux_netlbl_skbuff_getsid(). + * Check the various different forms of network peer labeling and determine + * the peer label/SID for the packet; most of the magic actually occurs in + * the security server function security_net_peersid_cmp(). The function + * returns zero if the value in @sid is valid (although it may be SECSID_NULL) + * or -EACCES if @sid is invalid due to inconsistencies with the different + * peer labels. * */ -static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid) +static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) { + int err; u32 xfrm_sid; u32 nlbl_sid; + u32 nlbl_type; selinux_skb_xfrm_sid(skb, &xfrm_sid); - if (selinux_netlbl_skbuff_getsid(skb, - (xfrm_sid == SECSID_NULL ? - SECINITSID_NETMSG : xfrm_sid), - &nlbl_sid) != 0) - nlbl_sid = SECSID_NULL; - *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); + selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); + + err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid); + if (unlikely(err)) { + printk(KERN_WARNING + "SELinux: failure in selinux_skb_peerlbl_sid()," + " unable to determine packet's peer label\n"); + return -EACCES; + } + + return 0; } /* socket security operations */ @@ -3518,6 +3550,7 @@ static int selinux_socket_post_create(struct socket *sock, int family, if (sock->sk) { sksec = sock->sk->sk_security; sksec->sid = isec->sid; + sksec->sclass = isec->sclass; err = selinux_netlbl_socket_post_create(sock); } @@ -3610,7 +3643,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in break; } - err = security_node_sid(family, addrp, addrlen, &sid); + err = sel_netnode_sid(addrp, family, &sid); if (err) goto out; @@ -3821,131 +3854,182 @@ static int selinux_socket_unix_may_send(struct socket *sock, return 0; } -static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, - struct avc_audit_data *ad, u16 family, char *addrp, int len) +static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family, + u32 peer_sid, + struct avc_audit_data *ad) { - int err = 0; - u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0; - struct socket *sock; - u16 sock_class = 0; - u32 sock_sid = 0; - - read_lock_bh(&sk->sk_callback_lock); - sock = sk->sk_socket; - if (sock) { - struct inode *inode; - inode = SOCK_INODE(sock); - if (inode) { - struct inode_security_struct *isec; - isec = inode->i_security; - sock_sid = isec->sid; - sock_class = isec->sclass; - } - } - read_unlock_bh(&sk->sk_callback_lock); - if (!sock_sid) - goto out; + int err; + u32 if_sid; + u32 node_sid; - if (!skb->dev) - goto out; + err = sel_netif_sid(ifindex, &if_sid); + if (err) + return err; + err = avc_has_perm(peer_sid, if_sid, + SECCLASS_NETIF, NETIF__INGRESS, ad); + if (err) + return err; - err = sel_netif_sids(skb->dev, &if_sid, NULL); + err = sel_netnode_sid(addrp, family, &node_sid); if (err) - goto out; + return err; + return avc_has_perm(peer_sid, node_sid, + SECCLASS_NODE, NODE__RECVFROM, ad); +} + +static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk, + struct sk_buff *skb, + struct avc_audit_data *ad, + u16 family, + char *addrp) +{ + int err; + struct 
sk_security_struct *sksec = sk->sk_security; + u16 sk_class; + u32 netif_perm, node_perm, recv_perm; + u32 port_sid, node_sid, if_sid, sk_sid; - switch (sock_class) { + sk_sid = sksec->sid; + sk_class = sksec->sclass; + + switch (sk_class) { case SECCLASS_UDP_SOCKET: netif_perm = NETIF__UDP_RECV; node_perm = NODE__UDP_RECV; recv_perm = UDP_SOCKET__RECV_MSG; break; - case SECCLASS_TCP_SOCKET: netif_perm = NETIF__TCP_RECV; node_perm = NODE__TCP_RECV; recv_perm = TCP_SOCKET__RECV_MSG; break; - case SECCLASS_DCCP_SOCKET: netif_perm = NETIF__DCCP_RECV; node_perm = NODE__DCCP_RECV; recv_perm = DCCP_SOCKET__RECV_MSG; break; - default: netif_perm = NETIF__RAWIP_RECV; node_perm = NODE__RAWIP_RECV; + recv_perm = 0; break; } - err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); + err = sel_netif_sid(skb->iif, &if_sid); if (err) - goto out; - - err = security_node_sid(family, addrp, len, &node_sid); + return err; + err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); if (err) - goto out; + return err; - err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad); + err = sel_netnode_sid(addrp, family, &node_sid); if (err) - goto out; + return err; + err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad); + if (err) + return err; - if (recv_perm) { - u32 port_sid; + if (!recv_perm) + return 0; + err = security_port_sid(sk->sk_family, sk->sk_type, + sk->sk_protocol, ntohs(ad->u.net.sport), + &port_sid); + if (unlikely(err)) { + printk(KERN_WARNING + "SELinux: failure in" + " selinux_sock_rcv_skb_iptables_compat()," + " network port label not found\n"); + return err; + } + return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad); +} - err = security_port_sid(sk->sk_family, sk->sk_type, - sk->sk_protocol, ntohs(ad->u.net.sport), - &port_sid); - if (err) - goto out; +static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, + struct avc_audit_data *ad, + u16 family, char *addrp) +{ + int err; + struct sk_security_struct *sksec = sk->sk_security; + u32 peer_sid; + u32 sk_sid = sksec->sid; - err = avc_has_perm(sock_sid, port_sid, - sock_class, recv_perm, ad); + if (selinux_compat_net) + err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad, + family, addrp); + else + err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, + PACKET__RECV, ad); + if (err) + return err; + + if (selinux_policycap_netpeer) { + err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); + if (err) + return err; + err = avc_has_perm(sk_sid, peer_sid, + SECCLASS_PEER, PEER__RECV, ad); + } else { + err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad); + if (err) + return err; + err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad); } -out: return err; } static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { - u16 family; - char *addrp; - int len, err = 0; - struct avc_audit_data ad; + int err; struct sk_security_struct *sksec = sk->sk_security; + u16 family = sk->sk_family; + u32 sk_sid = sksec->sid; + struct avc_audit_data ad; + char *addrp; - family = sk->sk_family; if (family != PF_INET && family != PF_INET6) - goto out; + return 0; /* Handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; AVC_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.netif = skb->dev ? 
skb->dev->name : "[unknown]"; + ad.u.net.netif = skb->iif; ad.u.net.family = family; - - err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL); + err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); if (err) - goto out; + return err; - if (selinux_compat_net) - err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family, - addrp, len); - else - err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, - PACKET__RECV, &ad); - if (err) - goto out; + /* If any sort of compatibility mode is enabled then handoff processing + * to the selinux_sock_rcv_skb_compat() function to deal with the + * special handling. We do this in an attempt to keep this function + * as fast and as clean as possible. */ + if (selinux_compat_net || !selinux_policycap_netpeer) + return selinux_sock_rcv_skb_compat(sk, skb, &ad, + family, addrp); - err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad); - if (err) - goto out; + if (netlbl_enabled() || selinux_xfrm_enabled()) { + u32 peer_sid; + + err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); + if (err) + return err; + err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family, + peer_sid, &ad); + if (err) + return err; + err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, + PEER__RECV, &ad); + } + + if (selinux_secmark_enabled()) { + err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, + PACKET__RECV, &ad); + if (err) + return err; + } - err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); -out: return err; } @@ -3996,18 +4080,25 @@ out: static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { u32 peer_secid = SECSID_NULL; - int err = 0; + u16 family; + + if (sock) + family = sock->sk->sk_family; + else if (skb && skb->sk) + family = skb->sk->sk_family; + else + goto out; - if (sock && sock->sk->sk_family == PF_UNIX) + if (sock && family == PF_UNIX) selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); else if (skb) - selinux_skb_extlbl_sid(skb, &peer_secid); + selinux_skb_peerlbl_sid(skb, family, &peer_secid); - if (peer_secid == SECSID_NULL) - err = -EINVAL; +out: *secid = peer_secid; - - return err; + if (peer_secid == SECSID_NULL) + return -EINVAL; + return 0; } static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) @@ -4027,6 +4118,7 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk) newssec->sid = ssec->sid; newssec->peer_sid = ssec->peer_sid; + newssec->sclass = ssec->sclass; selinux_netlbl_sk_security_clone(ssec, newssec); } @@ -4050,6 +4142,7 @@ static void selinux_sock_graft(struct sock* sk, struct socket *parent) if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || sk->sk_family == PF_UNIX) isec->sid = sksec->sid; + sksec->sclass = isec->sclass; selinux_netlbl_sock_graft(sk, parent); } @@ -4062,7 +4155,9 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, u32 newsid; u32 peersid; - selinux_skb_extlbl_sid(skb, &peersid); + err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid); + if (err) + return err; if (peersid == SECSID_NULL) { req->secid = sksec->sid; req->peer_secid = SECSID_NULL; @@ -4100,7 +4195,7 @@ static void selinux_inet_conn_established(struct sock *sk, { struct sk_security_struct *sksec = sk->sk_security; - selinux_skb_extlbl_sid(skb, &sksec->peer_sid); + selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid); } static void selinux_req_classify_flow(const struct request_sock *req, @@ -4147,149 +4242,260 @@ out: #ifdef CONFIG_NETFILTER -static int selinux_ip_postroute_last_compat(struct sock 
*sk, struct net_device *dev,
-					    struct avc_audit_data *ad,
-					    u16 family, char *addrp, int len)
+static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
+				       u16 family)
 {
-	int err = 0;
-	u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0;
-	struct socket *sock;
-	struct inode *inode;
-	struct inode_security_struct *isec;
+	char *addrp;
+	u32 peer_sid;
+	struct avc_audit_data ad;
+	u8 secmark_active;
+	u8 peerlbl_active;
 
-	sock = sk->sk_socket;
-	if (!sock)
-		goto out;
+	if (!selinux_policycap_netpeer)
+		return NF_ACCEPT;
 
-	inode = SOCK_INODE(sock);
-	if (!inode)
-		goto out;
+	secmark_active = selinux_secmark_enabled();
+	peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
+	if (!secmark_active && !peerlbl_active)
+		return NF_ACCEPT;
 
-	isec = inode->i_security;
-
-	err = sel_netif_sids(dev, &if_sid, NULL);
-	if (err)
-		goto out;
+	AVC_AUDIT_DATA_INIT(&ad, NET);
+	ad.u.net.netif = ifindex;
+	ad.u.net.family = family;
+	if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
+		return NF_DROP;
+
+	if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
+		return NF_DROP;
+
+	if (peerlbl_active)
+		if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
+					     peer_sid, &ad) != 0)
+			return NF_DROP;
+
+	if (secmark_active)
+		if (avc_has_perm(peer_sid, skb->secmark,
+				 SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
+			return NF_DROP;
+
+	return NF_ACCEPT;
+}
+
+static unsigned int selinux_ipv4_forward(unsigned int hooknum,
+					 struct sk_buff *skb,
+					 const struct net_device *in,
+					 const struct net_device *out,
+					 int (*okfn)(struct sk_buff *))
+{
+	return selinux_ip_forward(skb, in->ifindex, PF_INET);
+}
 
-	switch (isec->sclass) {
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static unsigned int selinux_ipv6_forward(unsigned int hooknum,
+					 struct sk_buff *skb,
+					 const struct net_device *in,
+					 const struct net_device *out,
+					 int (*okfn)(struct sk_buff *))
+{
+	return selinux_ip_forward(skb, in->ifindex, PF_INET6);
+}
+#endif	/* IPV6 */
+
+static int selinux_ip_postroute_iptables_compat(struct sock *sk,
+						int ifindex,
+						struct avc_audit_data *ad,
+						u16 family, char *addrp)
+{
+	int err;
+	struct sk_security_struct *sksec = sk->sk_security;
+	u16 sk_class;
+	u32 netif_perm, node_perm, send_perm;
+	u32 port_sid, node_sid, if_sid, sk_sid;
+
+	sk_sid = sksec->sid;
+	sk_class = sksec->sclass;
+
+	switch (sk_class) {
 	case SECCLASS_UDP_SOCKET:
 		netif_perm = NETIF__UDP_SEND;
 		node_perm = NODE__UDP_SEND;
 		send_perm = UDP_SOCKET__SEND_MSG;
 		break;
-
 	case SECCLASS_TCP_SOCKET:
 		netif_perm = NETIF__TCP_SEND;
 		node_perm = NODE__TCP_SEND;
 		send_perm = TCP_SOCKET__SEND_MSG;
 		break;
-
 	case SECCLASS_DCCP_SOCKET:
 		netif_perm = NETIF__DCCP_SEND;
 		node_perm = NODE__DCCP_SEND;
 		send_perm = DCCP_SOCKET__SEND_MSG;
 		break;
-
 	default:
 		netif_perm = NETIF__RAWIP_SEND;
 		node_perm = NODE__RAWIP_SEND;
+		send_perm = 0;
 		break;
 	}
 
-	err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+	err = sel_netif_sid(ifindex, &if_sid);
 	if (err)
-		goto out;
+		return err;
+	err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+	if (err)
+		return err;
 
-	err = security_node_sid(family, addrp, len, &node_sid);
+	err = sel_netnode_sid(addrp, family, &node_sid);
 	if (err)
-		goto out;
-
-	err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad);
+		return err;
+	err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
 	if (err)
-		goto out;
+		return err;
 
-	if (send_perm) {
-		u32 port_sid;
-
-		err = security_port_sid(sk->sk_family,
-					sk->sk_type,
-					sk->sk_protocol,
-					ntohs(ad->u.net.dport),
-					&port_sid);
-		if (err)
-			goto out;
+	if (!send_perm)
+		return 0;
 
-		err = avc_has_perm(isec->sid, port_sid, isec->sclass,
-				   send_perm, ad);
-	}
-out:
-	return err;
+	err = security_port_sid(sk->sk_family, sk->sk_type,
+				sk->sk_protocol, ntohs(ad->u.net.dport),
+				&port_sid);
+	if (unlikely(err)) {
+		printk(KERN_WARNING
+		       "SELinux: failure in"
+		       " selinux_ip_postroute_iptables_compat(),"
+		       " network port label not found\n");
+		return err;
+	}
+	return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad);
 }
 
-static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
-					      struct sk_buff *skb,
-					      const struct net_device *in,
-					      const struct net_device *out,
-					      int (*okfn)(struct sk_buff *),
-					      u16 family)
+static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
+						int ifindex,
+						struct avc_audit_data *ad,
+						u16 family,
+						char *addrp,
+						u8 proto)
 {
-	char *addrp;
-	int len, err = 0;
-	struct sock *sk;
-	struct avc_audit_data ad;
-	struct net_device *dev = (struct net_device *)out;
+	struct sock *sk = skb->sk;
 	struct sk_security_struct *sksec;
-	u8 proto;
-
-	sk = skb->sk;
-	if (!sk)
-		goto out;
 
+	if (sk == NULL)
+		return NF_ACCEPT;
 	sksec = sk->sk_security;
 
-	AVC_AUDIT_DATA_INIT(&ad, NET);
-	ad.u.net.netif = dev->name;
-	ad.u.net.family = family;
+	if (selinux_compat_net) {
+		if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex,
+							 ad, family, addrp))
+			return NF_DROP;
+	} else {
+		if (avc_has_perm(sksec->sid, skb->secmark,
+				 SECCLASS_PACKET, PACKET__SEND, ad))
+			return NF_DROP;
+	}
 
-	err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto);
-	if (err)
-		goto out;
+	if (selinux_policycap_netpeer)
+		if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto))
+			return NF_DROP;
 
-	if (selinux_compat_net)
-		err = selinux_ip_postroute_last_compat(sk, dev, &ad,
-						       family, addrp, len);
-	else
-		err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
-				   PACKET__SEND, &ad);
+	return NF_ACCEPT;
+}
 
-	if (err)
-		goto out;
+static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
+					 u16 family)
+{
+	u32 secmark_perm;
+	u32 peer_sid;
+	struct sock *sk;
+	struct avc_audit_data ad;
+	char *addrp;
+	u8 proto;
+	u8 secmark_active;
+	u8 peerlbl_active;
 
-	err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto);
-out:
-	return err ? NF_DROP : NF_ACCEPT;
+	AVC_AUDIT_DATA_INIT(&ad, NET);
+	ad.u.net.netif = ifindex;
+	ad.u.net.family = family;
+	if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
+		return NF_DROP;
+
+	/* If any sort of compatibility mode is enabled then hand off
+	 * processing to the selinux_ip_postroute_compat() function to deal
+	 * with the special handling.  We do this in an attempt to keep this
+	 * function as fast and as clean as possible. */
+	if (selinux_compat_net || !selinux_policycap_netpeer)
+		return selinux_ip_postroute_compat(skb, ifindex, &ad,
+						   family, addrp, proto);
+
+	/* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
+	 * packet transformation so allow the packet to pass without any checks
+	 * since we'll have another chance to perform access control checks
+	 * when the packet is on its final way out.
+	 * NOTE: there appear to be some IPv6 multicast cases where skb->dst
+	 *       is NULL, in this case go ahead and apply access control.
*/ + if (skb->dst != NULL && skb->dst->xfrm != NULL) + return NF_ACCEPT; + + secmark_active = selinux_secmark_enabled(); + peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); + if (!secmark_active && !peerlbl_active) + return NF_ACCEPT; + + /* if the packet is locally generated (skb->sk != NULL) then use the + * socket's label as the peer label, otherwise the packet is being + * forwarded through this system and we need to fetch the peer label + * directly from the packet */ + sk = skb->sk; + if (sk) { + struct sk_security_struct *sksec = sk->sk_security; + peer_sid = sksec->sid; + secmark_perm = PACKET__SEND; + } else { + if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) + return NF_DROP; + secmark_perm = PACKET__FORWARD_OUT; + } + + if (secmark_active) + if (avc_has_perm(peer_sid, skb->secmark, + SECCLASS_PACKET, secmark_perm, &ad)) + return NF_DROP; + + if (peerlbl_active) { + u32 if_sid; + u32 node_sid; + + if (sel_netif_sid(ifindex, &if_sid)) + return NF_DROP; + if (avc_has_perm(peer_sid, if_sid, + SECCLASS_NETIF, NETIF__EGRESS, &ad)) + return NF_DROP; + + if (sel_netnode_sid(addrp, family, &node_sid)) + return NF_DROP; + if (avc_has_perm(peer_sid, node_sid, + SECCLASS_NODE, NODE__SENDTO, &ad)) + return NF_DROP; + } + + return NF_ACCEPT; } -static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int selinux_ipv4_postroute(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET); + return selinux_ip_postroute(skb, out->ifindex, PF_INET); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - -static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int selinux_ipv6_postroute(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6); + return selinux_ip_postroute(skb, out->ifindex, PF_INET6); } - #endif /* IPV6 */ #endif /* CONFIG_NETFILTER */ @@ -5277,22 +5483,40 @@ security_initcall(selinux_init); #if defined(CONFIG_NETFILTER) -static struct nf_hook_ops selinux_ipv4_op = { - .hook = selinux_ipv4_postroute_last, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_SELINUX_LAST, +static struct nf_hook_ops selinux_ipv4_ops[] = { + { + .hook = selinux_ipv4_postroute, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_SELINUX_LAST, + }, + { + .hook = selinux_ipv4_forward, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = NF_IP_PRI_SELINUX_FIRST, + } }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct nf_hook_ops selinux_ipv6_op = { - .hook = selinux_ipv6_postroute_last, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_SELINUX_LAST, +static struct nf_hook_ops selinux_ipv6_ops[] = { + { + .hook = selinux_ipv6_postroute, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_SELINUX_LAST, + }, + { + .hook = 
selinux_ipv6_forward, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = NF_IP6_PRI_SELINUX_FIRST, + } }; #endif /* IPV6 */ @@ -5300,22 +5524,27 @@ static struct nf_hook_ops selinux_ipv6_op = { static int __init selinux_nf_ip_init(void) { int err = 0; + u32 iter; if (!selinux_enabled) goto out; printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n"); - err = nf_register_hook(&selinux_ipv4_op); - if (err) - panic("SELinux: nf_register_hook for IPv4: error %d\n", err); + for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) { + err = nf_register_hook(&selinux_ipv4_ops[iter]); + if (err) + panic("SELinux: nf_register_hook for IPv4: error %d\n", + err); + } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - - err = nf_register_hook(&selinux_ipv6_op); - if (err) - panic("SELinux: nf_register_hook for IPv6: error %d\n", err); - + for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) { + err = nf_register_hook(&selinux_ipv6_ops[iter]); + if (err) + panic("SELinux: nf_register_hook for IPv6: error %d\n", + err); + } #endif /* IPV6 */ out: @@ -5327,11 +5556,15 @@ __initcall(selinux_nf_ip_init); #ifdef CONFIG_SECURITY_SELINUX_DISABLE static void selinux_nf_ip_exit(void) { + u32 iter; + printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n"); - nf_unregister_hook(&selinux_ipv4_op); + for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) + nf_unregister_hook(&selinux_ipv4_ops[iter]); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - nf_unregister_hook(&selinux_ipv6_op); + for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) + nf_unregister_hook(&selinux_ipv6_ops[iter]); #endif /* IPV6 */ } #endif diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index 049bf69429b..399f868c5c8 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -37,6 +37,8 @@ S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest") S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv") S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send") + S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom") + S_(SECCLASS_NODE, NODE__SENDTO, "sendto") S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv") S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send") S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv") @@ -45,6 +47,8 @@ S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send") S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv") S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send") + S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress") + S_(SECCLASS_NETIF, NETIF__EGRESS, "egress") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom") @@ -149,6 +153,10 @@ S_(SECCLASS_PACKET, PACKET__SEND, "send") S_(SECCLASS_PACKET, PACKET__RECV, "recv") S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") + S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in") + S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out") + S_(SECCLASS_PACKET, PACKET__FORWARD_IN, "forward_in") + S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out") S_(SECCLASS_KEY, KEY__VIEW, "view") S_(SECCLASS_KEY, KEY__READ, "read") S_(SECCLASS_KEY, KEY__WRITE, "write") @@ -159,3 +167,4 @@ S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero") + S_(SECCLASS_PEER, 
PEER__RECV, "recv") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index eda89a2ec63..84c9abc8097 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -292,6 +292,8 @@ #define NODE__ENFORCE_DEST 0x00000040UL #define NODE__DCCP_RECV 0x00000080UL #define NODE__DCCP_SEND 0x00000100UL +#define NODE__RECVFROM 0x00000200UL +#define NODE__SENDTO 0x00000400UL #define NETIF__TCP_RECV 0x00000001UL #define NETIF__TCP_SEND 0x00000002UL #define NETIF__UDP_RECV 0x00000004UL @@ -300,6 +302,8 @@ #define NETIF__RAWIP_SEND 0x00000020UL #define NETIF__DCCP_RECV 0x00000040UL #define NETIF__DCCP_SEND 0x00000080UL +#define NETIF__INGRESS 0x00000100UL +#define NETIF__EGRESS 0x00000200UL #define NETLINK_SOCKET__IOCTL 0x00000001UL #define NETLINK_SOCKET__READ 0x00000002UL #define NETLINK_SOCKET__WRITE 0x00000004UL @@ -792,6 +796,10 @@ #define PACKET__SEND 0x00000001UL #define PACKET__RECV 0x00000002UL #define PACKET__RELABELTO 0x00000004UL +#define PACKET__FLOW_IN 0x00000008UL +#define PACKET__FLOW_OUT 0x00000010UL +#define PACKET__FORWARD_IN 0x00000020UL +#define PACKET__FORWARD_OUT 0x00000040UL #define KEY__VIEW 0x00000001UL #define KEY__READ 0x00000002UL #define KEY__WRITE 0x00000004UL @@ -824,3 +832,4 @@ #define DCCP_SOCKET__NODE_BIND 0x00400000UL #define DCCP_SOCKET__NAME_CONNECT 0x00800000UL #define MEMPROTECT__MMAP_ZERO 0x00000001UL +#define PEER__RECV 0x00000001UL diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 553607a19e9..80c28fa6621 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -51,7 +51,7 @@ struct avc_audit_data { struct inode *inode; } fs; struct { - char *netif; + int netif; struct sock *sk; u16 family; __be16 dport; diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h index e77de0e62ea..b1b0d1d8f95 100644 --- a/security/selinux/include/class_to_string.h +++ b/security/selinux/include/class_to_string.h @@ -64,3 +64,10 @@ S_(NULL) S_("dccp_socket") S_("memprotect") + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_("peer") diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h index a9c2b20f14b..09e9dd23ee1 100644 --- a/security/selinux/include/flask.h +++ b/security/selinux/include/flask.h @@ -50,6 +50,7 @@ #define SECCLASS_KEY 58 #define SECCLASS_DCCP_SOCKET 60 #define SECCLASS_MEMPROTECT 61 +#define SECCLASS_PEER 68 /* * Security identifier indices for initial entities diff --git a/security/selinux/include/netif.h b/security/selinux/include/netif.h index 8bd6f9992d2..ce23edd128b 100644 --- a/security/selinux/include/netif.h +++ b/security/selinux/include/netif.h @@ -7,6 +7,8 @@ * Author: James Morris <jmorris@redhat.com> * * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> + * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. 
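A note on the permission constants added in this hunk: each value is a single bit in the 32-bit access vector of its class, which is why NODE__RECVFROM/NODE__SENDTO and NETIF__INGRESS/NETIF__EGRESS simply continue the power-of-two sequence. Below is a minimal, self-contained userspace sketch of how such a vector is tested; the constants are copied from av_permissions.h above, while may() is an illustrative stand-in for the AVC decision, not a kernel function:

#include <stdio.h>
#include <stdint.h>

#define NETIF__INGRESS 0x00000100UL	/* from av_permissions.h above */
#define NETIF__EGRESS  0x00000200UL

/* grant iff every requested bit is present in the allowed vector */
static int may(uint32_t allowed, uint32_t requested)
{
	return (requested & ~allowed) == 0;
}

int main(void)
{
	uint32_t allowed = NETIF__INGRESS;	/* policy grants ingress only */

	printf("ingress: %d\n", may(allowed, NETIF__INGRESS));	/* 1 */
	printf("egress:  %d\n", may(allowed, NETIF__EGRESS));	/* 0 */
	return 0;
}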
+ * Paul Moore, <paul.moore@hp.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -15,7 +17,7 @@ #ifndef _SELINUX_NETIF_H_ #define _SELINUX_NETIF_H_ -int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid); +int sel_netif_sid(int ifindex, u32 *sid); #endif /* _SELINUX_NETIF_H_ */ diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h index 218e3f77c35..00a2809c850 100644 --- a/security/selinux/include/netlabel.h +++ b/security/selinux/include/netlabel.h @@ -46,13 +46,17 @@ void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, struct sk_security_struct *newssec); -int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, + u16 family, + u32 *type, + u32 *sid); void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); int selinux_netlbl_socket_post_create(struct socket *sock); int selinux_netlbl_inode_permission(struct inode *inode, int mask); int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, + u16 family, struct avc_audit_data *ad); int selinux_netlbl_socket_setsockopt(struct socket *sock, int level, @@ -83,9 +87,11 @@ static inline void selinux_netlbl_sk_security_clone( } static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u32 base_sid, + u16 family, + u32 *type, u32 *sid) { + *type = NETLBL_NLTYPE_NONE; *sid = SECSID_NULL; return 0; } @@ -106,6 +112,7 @@ static inline int selinux_netlbl_inode_permission(struct inode *inode, } static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, + u16 family, struct avc_audit_data *ad) { return 0; diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h new file mode 100644 index 00000000000..1b94450d11d --- /dev/null +++ b/security/selinux/include/netnode.h @@ -0,0 +1,32 @@ +/* + * Network node table + * + * SELinux must keep a mapping of network nodes to labels/SIDs. This + * mapping is maintained as part of the normal policy but a fast cache is + * needed to reduce the lookup overhead since most of these queries happen on + * a per-packet basis. + * + * Author: Paul Moore <paul.moore@hp.com> + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
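The netif and netnode caches declared in these headers share one shape: a fixed, power-of-two array of list heads indexed by a cheap hash of the key, with the security policy consulted only on a miss. A self-contained userspace sketch of that shape follows; all names here are illustrative, and the in-kernel versions layer RCU protection and locking on top of it:

#include <stdio.h>
#include <stdlib.h>

#define HASH_SIZE 64			/* like SEL_NETIF_HASH_SIZE */

struct entry {
	int ifindex;			/* the key */
	unsigned int sid;		/* the cached value */
	struct entry *next;
};

static struct entry *buckets[HASH_SIZE];

static unsigned int hashfn(int ifindex)
{
	return ifindex & (HASH_SIZE - 1);	/* mask == cheap modulo */
}

static struct entry *cache_find(int ifindex)
{
	struct entry *e;

	for (e = buckets[hashfn(ifindex)]; e != NULL; e = e->next)
		if (e->ifindex == ifindex)
			return e;
	return NULL;
}

static void cache_insert(int ifindex, unsigned int sid)
{
	struct entry *e = malloc(sizeof(*e));

	if (e == NULL)
		exit(1);
	e->ifindex = ifindex;
	e->sid = sid;
	e->next = buckets[hashfn(ifindex)];
	buckets[hashfn(ifindex)] = e;
}

int main(void)
{
	cache_insert(2, 42);
	printf("hit:  %u\n", cache_find(2)->sid);
	printf("miss: %s\n", cache_find(3) ? "entry" : "NULL");
	return 0;
}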
+ * + */ + +#ifndef _SELINUX_NETNODE_H +#define _SELINUX_NETNODE_H + +int sel_netnode_sid(void *addr, u16 family, u32 *sid); + +#endif diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 4138a80f8e2..c6c2bb4ebac 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -96,17 +96,25 @@ struct bprm_security_struct { }; struct netif_security_struct { - struct net_device *dev; /* back pointer */ - u32 if_sid; /* SID for this interface */ - u32 msg_sid; /* default SID for messages received on this interface */ + int ifindex; /* device index */ + u32 sid; /* SID for this interface */ +}; + +struct netnode_security_struct { + union { + __be32 ipv4; /* IPv4 node address */ + struct in6_addr ipv6; /* IPv6 node address */ + } addr; + u32 sid; /* SID for this node */ + u16 family; /* address family */ }; struct sk_security_struct { struct sock *sk; /* back pointer to sk object */ u32 sid; /* SID of this object */ u32 peer_sid; /* SID of peer */ -#ifdef CONFIG_NETLABEL u16 sclass; /* sock security class */ +#ifdef CONFIG_NETLABEL enum { /* NetLabel state */ NLBL_UNSET = 0, NLBL_REQUIRE, diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 39337afffec..23137c17f91 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -25,13 +25,14 @@ #define POLICYDB_VERSION_MLS 19 #define POLICYDB_VERSION_AVTAB 20 #define POLICYDB_VERSION_RANGETRANS 21 +#define POLICYDB_VERSION_POLCAP 22 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE #else -#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS +#define POLICYDB_VERSION_MAX POLICYDB_VERSION_POLCAP #endif struct netlbl_lsm_secattr; @@ -39,8 +40,19 @@ struct netlbl_lsm_secattr; extern int selinux_enabled; extern int selinux_mls_enabled; +/* Policy capabilities */ +enum { + POLICYDB_CAPABILITY_NETPEER, + __POLICYDB_CAPABILITY_MAX +}; +#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) + +extern int selinux_policycap_netpeer; + int security_load_policy(void * data, size_t len); +int security_policycap_supported(unsigned int req_cap); + #define SEL_VEC_MAX 32 struct av_decision { u32 allowed; @@ -77,8 +89,7 @@ int security_get_user_sids(u32 callsid, char *username, int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port, u32 *out_sid); -int security_netif_sid(char *name, u32 *if_sid, - u32 *msg_sid); +int security_netif_sid(char *name, u32 *if_sid); int security_node_sid(u16 domain, void *addr, u32 addrlen, u32 *out_sid); @@ -88,10 +99,15 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); +int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, + u32 xfrm_sid, + u32 *peer_sid); + int security_get_classes(char ***classes, int *nclasses); int security_get_permissions(char *class, char ***perms, int *nperms); int security_get_reject_unknown(void); int security_get_allow_unknown(void); +int security_get_policycaps(int *len, int **values); #define SECURITY_FS_USE_XATTR 1 /* use xattr */ #define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. 
devpts/tmpfs */ @@ -108,7 +124,6 @@ int security_genfs_sid(const char *fstype, char *name, u16 sclass, #ifdef CONFIG_NETLABEL int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, - u32 base_sid, u32 *sid); int security_netlbl_sid_to_secattr(u32 sid, @@ -116,7 +131,6 @@ int security_netlbl_sid_to_secattr(u32 sid, #else static inline int security_netlbl_secattr_to_sid( struct netlbl_lsm_secattr *secattr, - u32 base_sid, u32 *sid) { return -EIDRM; diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 31929e39f5c..36b0510efa7 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -32,6 +32,13 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk) } #ifdef CONFIG_SECURITY_NETWORK_XFRM +extern atomic_t selinux_xfrm_refcount; + +static inline int selinux_xfrm_enabled(void) +{ + return (atomic_read(&selinux_xfrm_refcount) > 0); +} + int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, struct avc_audit_data *ad); int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, @@ -43,6 +50,11 @@ static inline void selinux_xfrm_notify_policyload(void) atomic_inc(&flow_cache_genid); } #else +static inline int selinux_xfrm_enabled(void) +{ + return 0; +} + static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, struct avc_audit_data *ad) { diff --git a/security/selinux/netif.c b/security/selinux/netif.c index e87ab948104..013d3117a86 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -7,6 +7,8 @@ * Author: James Morris <jmorris@redhat.com> * * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> + * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. + * Paul Moore <paul.moore@hp.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -29,14 +31,6 @@ #define SEL_NETIF_HASH_SIZE 64 #define SEL_NETIF_HASH_MAX 1024 -#undef DEBUG - -#ifdef DEBUG -#define DEBUGP printk -#else -#define DEBUGP(format, args...) -#endif - struct sel_netif { struct list_head list; @@ -49,174 +43,226 @@ static LIST_HEAD(sel_netif_list); static DEFINE_SPINLOCK(sel_netif_lock); static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; -static inline u32 sel_netif_hasfn(struct net_device *dev) +/** + * sel_netif_hashfn - Hashing function for the interface table + * @ifindex: the network interface + * + * Description: + * This is the hashing function for the network interface table, it returns the + * bucket number for the given interface. + * + */ +static inline u32 sel_netif_hashfn(int ifindex) { - return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1)); + return (ifindex & (SEL_NETIF_HASH_SIZE - 1)); } -/* - * All of the devices should normally fit in the hash, so we optimize - * for that case. +/** + * sel_netif_find - Search for an interface record + * @ifindex: the network interface + * + * Description: + * Search the network interface table and return the record matching @ifindex. + * If an entry can not be found in the table return NULL. 
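selinux_xfrm_enabled() in the xfrm.h hunk above is the usual config-gated helper: with CONFIG_SECURITY_NETWORK_XFRM disabled the stub returns 0, so the labeled-IPsec branches in the hooks compile down to nothing. The same pattern in runnable userspace form, where HAVE_XFRM is a made-up macro standing in for the Kconfig option (build with "cc -DHAVE_XFRM" to model the enabled case):

#include <stdio.h>

#ifdef HAVE_XFRM
static int xfrm_refcount = 1;		/* selinux_xfrm_refcount stand-in */

static inline int xfrm_enabled(void)
{
	return xfrm_refcount > 0;
}
#else
static inline int xfrm_enabled(void)
{
	return 0;			/* branch folds away at compile time */
}
#endif

int main(void)
{
	printf("labeled IPsec checks: %s\n",
	       xfrm_enabled() ? "enabled" : "skipped");
	return 0;
}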
+ * */ -static inline struct sel_netif *sel_netif_find(struct net_device *dev) +static inline struct sel_netif *sel_netif_find(int ifindex) { - struct list_head *pos; - int idx = sel_netif_hasfn(dev); + int idx = sel_netif_hashfn(ifindex); + struct sel_netif *netif; - __list_for_each_rcu(pos, &sel_netif_hash[idx]) { - struct sel_netif *netif = list_entry(pos, - struct sel_netif, list); - if (likely(netif->nsec.dev == dev)) + list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list) + /* all of the devices should normally fit in the hash, so we + * optimize for that case */ + if (likely(netif->nsec.ifindex == ifindex)) return netif; - } + return NULL; } +/** + * sel_netif_insert - Insert a new interface into the table + * @netif: the new interface record + * + * Description: + * Add a new interface record to the network interface hash table. Returns + * zero on success, negative values on failure. + * + */ static int sel_netif_insert(struct sel_netif *netif) { - int idx, ret = 0; + int idx; - if (sel_netif_total >= SEL_NETIF_HASH_MAX) { - ret = -ENOSPC; - goto out; - } + if (sel_netif_total >= SEL_NETIF_HASH_MAX) + return -ENOSPC; - idx = sel_netif_hasfn(netif->nsec.dev); + idx = sel_netif_hashfn(netif->nsec.ifindex); list_add_rcu(&netif->list, &sel_netif_hash[idx]); sel_netif_total++; -out: - return ret; + + return 0; } +/** + * sel_netif_free - Frees an interface entry + * @p: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that memory allocated to a hash table interface entry can be + * released safely. + * + */ static void sel_netif_free(struct rcu_head *p) { struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); - - DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); kfree(netif); } +/** + * sel_netif_destroy - Remove an interface record from the table + * @netif: the existing interface record + * + * Description: + * Remove an existing interface record from the network interface table. + * + */ static void sel_netif_destroy(struct sel_netif *netif) { - DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); - list_del_rcu(&netif->list); sel_netif_total--; call_rcu(&netif->rcu_head, sel_netif_free); } -static struct sel_netif *sel_netif_lookup(struct net_device *dev) +/** + * sel_netif_sid_slow - Lookup the SID of a network interface using the policy + * @ifindex: the network interface + * @sid: interface SID + * + * Description: + * This function determines the SID of a network interface by quering the + * security policy. The result is added to the network interface table to + * speedup future queries. Returns zero on success, negative values on + * failure. 
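Note that sel_netif_insert() above refuses to grow the table past SEL_NETIF_HASH_MAX rather than evicting entries, so a flood of interfaces degrades into slow-path policy queries instead of cache churn. A tiny runnable sketch of that bound, with illustrative names:

#include <errno.h>
#include <stdio.h>

#define CACHE_MAX 1024			/* SEL_NETIF_HASH_MAX */

static int cache_total;

static int cache_insert(void)
{
	if (cache_total >= CACHE_MAX)
		return -ENOSPC;		/* caller falls back to a policy query */
	cache_total++;
	return 0;
}

int main(void)
{
	int i, rc = 0;

	for (i = 0; rc == 0; i++)
		rc = cache_insert();
	printf("inserts before -ENOSPC: %d\n", i - 1);
	return 0;
}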
+ * + */ +static int sel_netif_sid_slow(int ifindex, u32 *sid) { int ret; - struct sel_netif *netif, *new; - struct netif_security_struct *nsec; - - netif = sel_netif_find(dev); - if (likely(netif != NULL)) - goto out; - - new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (!new) { - netif = ERR_PTR(-ENOMEM); - goto out; + struct sel_netif *netif; + struct sel_netif *new = NULL; + struct net_device *dev; + + /* NOTE: we always use init's network namespace since we don't + * currently support containers */ + + dev = dev_get_by_index(&init_net, ifindex); + if (unlikely(dev == NULL)) { + printk(KERN_WARNING + "SELinux: failure in sel_netif_sid_slow()," + " invalid network interface (%d)\n", ifindex); + return -ENOENT; } - - nsec = &new->nsec; - ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid); - if (ret < 0) { - kfree(new); - netif = ERR_PTR(ret); + spin_lock_bh(&sel_netif_lock); + netif = sel_netif_find(ifindex); + if (netif != NULL) { + *sid = netif->nsec.sid; + ret = 0; goto out; } - - nsec->dev = dev; - - spin_lock_bh(&sel_netif_lock); - - netif = sel_netif_find(dev); - if (netif) { - spin_unlock_bh(&sel_netif_lock); - kfree(new); + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (new == NULL) { + ret = -ENOMEM; goto out; } - + ret = security_netif_sid(dev->name, &new->nsec.sid); + if (ret != 0) + goto out; + new->nsec.ifindex = ifindex; ret = sel_netif_insert(new); - spin_unlock_bh(&sel_netif_lock); - - if (ret) { - kfree(new); - netif = ERR_PTR(ret); + if (ret != 0) goto out; - } + *sid = new->nsec.sid; - netif = new; - - DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name, - nsec->if_sid, nsec->msg_sid); out: - return netif; -} - -static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out) -{ - if (if_sid_out) - *if_sid_out = if_sid_in; - if (msg_sid_out) - *msg_sid_out = msg_sid_in; -} - -static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid) -{ - int ret = 0; - u32 tmp_if_sid, tmp_msg_sid; - - ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid); - if (!ret) - sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid); + spin_unlock_bh(&sel_netif_lock); + dev_put(dev); + if (unlikely(ret)) { + printk(KERN_WARNING + "SELinux: failure in sel_netif_sid_slow()," + " unable to determine network interface label (%d)\n", + ifindex); + kfree(new); + } return ret; } -int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid) +/** + * sel_netif_sid - Lookup the SID of a network interface + * @ifindex: the network interface + * @sid: interface SID + * + * Description: + * This function determines the SID of a network interface using the fastest + * method possible. First the interface table is queried, but if an entry + * can't be found then the policy is queried and the result is added to the + * table to speedup future queries. Returns zero on success, negative values + * on failure. 
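Note the discipline in sel_netif_sid_slow() above: the table is searched again after taking the spinlock, because two CPUs can both miss on the lock-free fast path and race into the slow path, and only the re-check under the lock keeps the table free of duplicates. The same discipline in runnable userspace form, with a pthread mutex standing in for the spinlock and a single pointer standing in for the table (link with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *cached;			/* stands in for the hash table entry */

static int lookup_slow(int value)
{
	int ret;

	pthread_mutex_lock(&lock);
	if (cached) {			/* someone beat us to the insert */
		ret = *cached;
		pthread_mutex_unlock(&lock);
		return ret;
	}
	cached = malloc(sizeof(*cached));
	if (cached == NULL)
		exit(1);
	*cached = value;
	ret = *cached;
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	printf("%d\n", lookup_slow(7));	/* inserts 7 */
	printf("%d\n", lookup_slow(9));	/* still 7: cached */
	return 0;
}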
+ * + */ +int sel_netif_sid(int ifindex, u32 *sid) { - int ret = 0; struct sel_netif *netif; rcu_read_lock(); - netif = sel_netif_lookup(dev); - if (IS_ERR(netif)) { + netif = sel_netif_find(ifindex); + if (likely(netif != NULL)) { + *sid = netif->nsec.sid; rcu_read_unlock(); - ret = sel_netif_sids_slow(dev, if_sid, msg_sid); - goto out; + return 0; } - sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid); rcu_read_unlock(); -out: - return ret; + + return sel_netif_sid_slow(ifindex, sid); } -static void sel_netif_kill(struct net_device *dev) +/** + * sel_netif_kill - Remove an entry from the network interface table + * @ifindex: the network interface + * + * Description: + * This function removes the entry matching @ifindex from the network interface + * table if it exists. + * + */ +static void sel_netif_kill(int ifindex) { struct sel_netif *netif; spin_lock_bh(&sel_netif_lock); - netif = sel_netif_find(dev); + netif = sel_netif_find(ifindex); if (netif) sel_netif_destroy(netif); spin_unlock_bh(&sel_netif_lock); } +/** + * sel_netif_flush - Flush the entire network interface table + * + * Description: + * Remove all entries from the network interface table. + * + */ static void sel_netif_flush(void) { int idx; + struct sel_netif *netif; spin_lock_bh(&sel_netif_lock); - for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) { - struct sel_netif *netif; - + for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) list_for_each_entry(netif, &sel_netif_hash[idx], list) sel_netif_destroy(netif); - } spin_unlock_bh(&sel_netif_lock); } @@ -239,7 +285,7 @@ static int sel_netif_netdev_notifier_handler(struct notifier_block *this, return NOTIFY_DONE; if (event == NETDEV_DOWN) - sel_netif_kill(dev); + sel_netif_kill(dev->ifindex); return NOTIFY_DONE; } @@ -250,10 +296,10 @@ static struct notifier_block sel_netif_netdev_notifier = { static __init int sel_netif_init(void) { - int i, err = 0; + int i, err; if (!selinux_enabled) - goto out; + return 0; for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) INIT_LIST_HEAD(&sel_netif_hash[i]); @@ -265,7 +311,6 @@ static __init int sel_netif_init(void) if (err) panic("avc_add_callback() failed, error %d\n", err); -out: return err; } diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 66e013d6f6f..0fa2be4149e 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -36,6 +36,33 @@ #include "security.h" /** + * selinux_netlbl_sidlookup_cached - Cache a SID lookup + * @skb: the packet + * @secattr: the NetLabel security attributes + * @sid: the SID + * + * Description: + * Query the SELinux security server to lookup the correct SID for the given + * security attributes. If the query is successful, cache the result to speed + * up future lookups. Returns zero on success, negative values on failure. 
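selinux_netlbl_sidlookup_cached(), whose kernel-doc begins above, only adds a mapping to the NetLabel cache when the attribute is marked cacheable and a cache slot is actually present, hence the two-flag test. A runnable illustration of that test; the flag names and values below are illustrative only and do not match NetLabel's real bit assignments:

#include <stdio.h>

#define SECATTR_CACHE		0x01	/* a cache slot is present */
#define SECATTR_CACHEABLE	0x02	/* the protocol allows caching */

static int should_cache(unsigned int flags)
{
	return (flags & SECATTR_CACHEABLE) && (flags & SECATTR_CACHE);
}

int main(void)
{
	printf("cacheable only: %d\n", should_cache(SECATTR_CACHEABLE));
	printf("both flags:     %d\n",
	       should_cache(SECATTR_CACHEABLE | SECATTR_CACHE));
	return 0;
}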
+ * + */ +static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb, + struct netlbl_lsm_secattr *secattr, + u32 *sid) +{ + int rc; + + rc = security_netlbl_secattr_to_sid(secattr, sid); + if (rc == 0 && + (secattr->flags & NETLBL_SECATTR_CACHEABLE) && + (secattr->flags & NETLBL_SECATTR_CACHE)) + netlbl_cache_add(skb, secattr); + + return rc; +} + +/** * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism * @sk: the socket to label * @sid: the SID to use @@ -137,14 +164,14 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, * lock as other threads could have access to ssec */ rcu_read_lock(); selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); - newssec->sclass = ssec->sclass; rcu_read_unlock(); } /** * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel * @skb: the packet - * @base_sid: the SELinux SID to use as a context for MLS only attributes + * @family: protocol family + * @type: NetLabel labeling protocol type * @sid: the SID * * Description: @@ -153,7 +180,10 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, * assign to the packet. Returns zero on success, negative values on failure. * */ -int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, + u16 family, + u32 *type, + u32 *sid) { int rc; struct netlbl_lsm_secattr secattr; @@ -164,15 +194,12 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) } netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { - rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid); - if (rc == 0 && - (secattr.flags & NETLBL_SECATTR_CACHEABLE) && - (secattr.flags & NETLBL_SECATTR_CACHE)) - netlbl_cache_add(skb, &secattr); - } else + rc = netlbl_skbuff_getattr(skb, family, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) + rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid); + else *sid = SECSID_NULL; + *type = secattr.type; netlbl_secattr_destroy(&secattr); return rc; @@ -190,13 +217,10 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) */ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) { - struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; struct sk_security_struct *sksec = sk->sk_security; struct netlbl_lsm_secattr secattr; u32 nlbl_peer_sid; - sksec->sclass = isec->sclass; - rcu_read_lock(); if (sksec->nlbl_state != NLBL_REQUIRE) { @@ -207,9 +231,7 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) netlbl_secattr_init(&secattr); if (netlbl_sock_getattr(sk, &secattr) == 0 && secattr.flags != NETLBL_SECATTR_NONE && - security_netlbl_secattr_to_sid(&secattr, - SECINITSID_NETMSG, - &nlbl_peer_sid) == 0) + security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0) sksec->peer_sid = nlbl_peer_sid; netlbl_secattr_destroy(&secattr); @@ -234,11 +256,8 @@ int selinux_netlbl_socket_post_create(struct socket *sock) { int rc = 0; struct sock *sk = sock->sk; - struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; struct sk_security_struct *sksec = sk->sk_security; - sksec->sclass = isec->sclass; - rcu_read_lock(); if (sksec->nlbl_state == NLBL_REQUIRE) rc = selinux_netlbl_sock_setsid(sk, sksec->sid); @@ -292,6 +311,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) * selinux_netlbl_sock_rcv_skb - Do an inbound access check using 
NetLabel * @sksec: the sock's sk_security_struct * @skb: the packet + * @family: protocol family * @ad: the audit data * * Description: @@ -302,6 +322,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) */ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, + u16 family, struct avc_audit_data *ad) { int rc; @@ -313,16 +334,10 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, return 0; netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { - rc = security_netlbl_secattr_to_sid(&secattr, - SECINITSID_NETMSG, - &nlbl_sid); - if (rc == 0 && - (secattr.flags & NETLBL_SECATTR_CACHEABLE) && - (secattr.flags & NETLBL_SECATTR_CACHE)) - netlbl_cache_add(skb, &secattr); - } else + rc = netlbl_skbuff_getattr(skb, family, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) + rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid); + else nlbl_sid = SECINITSID_UNLABELED; netlbl_secattr_destroy(&secattr); if (rc != 0) diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c new file mode 100644 index 00000000000..f3c526f2cac --- /dev/null +++ b/security/selinux/netnode.c @@ -0,0 +1,354 @@ +/* + * Network node table + * + * SELinux must keep a mapping of network nodes to labels/SIDs. This + * mapping is maintained as part of the normal policy but a fast cache is + * needed to reduce the lookup overhead since most of these queries happen on + * a per-packet basis. + * + * Author: Paul Moore <paul.moore@hp.com> + * + * This code is heavily based on the "netif" concept originally developed by + * James Morris <jmorris@redhat.com> + * (see security/selinux/netif.c for more information) + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <asm/bug.h> + +#include "objsec.h" + +#define SEL_NETNODE_HASH_SIZE 256 +#define SEL_NETNODE_HASH_BKT_LIMIT 16 + +struct sel_netnode { + struct netnode_security_struct nsec; + + struct list_head list; + struct rcu_head rcu; +}; + +/* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason + * for this is that I suspect most users will not make heavy use of both + * address families at the same time so one table will usually end up wasted, + * if this becomes a problem we can always add a hash table for each address + * family later */ + +static LIST_HEAD(sel_netnode_list); +static DEFINE_SPINLOCK(sel_netnode_lock); +static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE]; + +/** + * sel_netnode_free - Frees a node entry + * @p: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that memory allocated to a hash table node entry can be + * released safely. 
+ * + */ +static void sel_netnode_free(struct rcu_head *p) +{ + struct sel_netnode *node = container_of(p, struct sel_netnode, rcu); + kfree(node); +} + +/** + * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table + * @addr: IPv4 address + * + * Description: + * This is the IPv4 hashing function for the node interface table, it returns + * the bucket number for the given IP address. + * + */ +static u32 sel_netnode_hashfn_ipv4(__be32 addr) +{ + /* at some point we should determine if the mismatch in byte order + * affects the hash function dramatically */ + return (addr & (SEL_NETNODE_HASH_SIZE - 1)); +} + +/** + * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table + * @addr: IPv6 address + * + * Description: + * This is the IPv6 hashing function for the node interface table, it returns + * the bucket number for the given IP address. + * + */ +static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr) +{ + /* just hash the least significant 32 bits to keep things fast (they + * are the most likely to be different anyway), we can revisit this + * later if needed */ + return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1)); +} + +/** + * sel_netnode_find - Search for a node record + * @addr: IP address + * @family: address family + * + * Description: + * Search the network node table and return the record matching @addr. If an + * entry can not be found in the table return NULL. + * + */ +static struct sel_netnode *sel_netnode_find(const void *addr, u16 family) +{ + u32 idx; + struct sel_netnode *node; + + switch (family) { + case PF_INET: + idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr); + break; + case PF_INET6: + idx = sel_netnode_hashfn_ipv6(addr); + break; + default: + BUG(); + } + + list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list) + if (node->nsec.family == family) + switch (family) { + case PF_INET: + if (node->nsec.addr.ipv4 == *(__be32 *)addr) + return node; + break; + case PF_INET6: + if (ipv6_addr_equal(&node->nsec.addr.ipv6, + addr)) + return node; + break; + } + + return NULL; +} + +/** + * sel_netnode_insert - Insert a new node into the table + * @node: the new node record + * + * Description: + * Add a new node record to the network address hash table. Returns zero on + * success, negative values on failure. + * + */ +static int sel_netnode_insert(struct sel_netnode *node) +{ + u32 idx; + u32 count = 0; + struct sel_netnode *iter; + + switch (node->nsec.family) { + case PF_INET: + idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4); + break; + case PF_INET6: + idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6); + break; + default: + BUG(); + } + list_add_rcu(&node->list, &sel_netnode_hash[idx]); + + /* we need to impose a limit on the growth of the hash table so check + * this bucket to make sure it is within the specified bounds */ + list_for_each_entry(iter, &sel_netnode_hash[idx], list) + if (++count > SEL_NETNODE_HASH_BKT_LIMIT) { + list_del_rcu(&iter->list); + call_rcu(&iter->rcu, sel_netnode_free); + break; + } + + return 0; +} + +/** + * sel_netnode_destroy - Remove a node record from the table + * @node: the existing node record + * + * Description: + * Remove an existing node record from the network address table. 
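sel_netnode_hashfn_ipv6() above hashes only the last 32 bits of the address, on the theory that for typical unicast traffic the low bits vary the most; the byte-order caveat in the IPv4 hash comment applies here as well. A runnable userspace equivalent; the kernel reads s6_addr32[3] directly, while this sketch uses memcpy() for portability:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define HASH_SIZE 256			/* SEL_NETNODE_HASH_SIZE */

static uint32_t hash_ipv6(const struct in6_addr *addr)
{
	uint32_t last;

	/* the loaded word stays in network byte order, so which of the four
	 * bytes survives the mask depends on host endianness, the same
	 * caveat the patch notes for the IPv4 hash */
	memcpy(&last, &addr->s6_addr[12], sizeof(last));
	return last & (HASH_SIZE - 1);
}

int main(void)
{
	struct in6_addr a;

	inet_pton(AF_INET6, "2001:db8::1", &a);
	printf("bucket: %u\n", hash_ipv6(&a));
	return 0;
}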
+ * + */ +static void sel_netnode_destroy(struct sel_netnode *node) +{ + list_del_rcu(&node->list); + call_rcu(&node->rcu, sel_netnode_free); +} + +/** + * sel_netnode_sid_slow - Lookup the SID of a network address using the policy + * @addr: the IP address + * @family: the address family + * @sid: node SID + * + * Description: + * This function determines the SID of a network address by quering the + * security policy. The result is added to the network address table to + * speedup future queries. Returns zero on success, negative values on + * failure. + * + */ +static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) +{ + int ret; + struct sel_netnode *node; + struct sel_netnode *new = NULL; + + spin_lock_bh(&sel_netnode_lock); + node = sel_netnode_find(addr, family); + if (node != NULL) { + *sid = node->nsec.sid; + ret = 0; + goto out; + } + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (new == NULL) { + ret = -ENOMEM; + goto out; + } + switch (family) { + case PF_INET: + ret = security_node_sid(PF_INET, + addr, sizeof(struct in_addr), + &new->nsec.sid); + new->nsec.addr.ipv4 = *(__be32 *)addr; + break; + case PF_INET6: + ret = security_node_sid(PF_INET6, + addr, sizeof(struct in6_addr), + &new->nsec.sid); + ipv6_addr_copy(&new->nsec.addr.ipv6, addr); + break; + default: + BUG(); + } + if (ret != 0) + goto out; + new->nsec.family = family; + ret = sel_netnode_insert(new); + if (ret != 0) + goto out; + *sid = new->nsec.sid; + +out: + spin_unlock_bh(&sel_netnode_lock); + if (unlikely(ret)) { + printk(KERN_WARNING + "SELinux: failure in sel_netnode_sid_slow()," + " unable to determine network node label\n"); + kfree(new); + } + return ret; +} + +/** + * sel_netnode_sid - Lookup the SID of a network address + * @addr: the IP address + * @family: the address family + * @sid: node SID + * + * Description: + * This function determines the SID of a network address using the fastest + * method possible. First the address table is queried, but if an entry + * can't be found then the policy is queried and the result is added to the + * table to speedup future queries. Returns zero on success, negative values + * on failure. + * + */ +int sel_netnode_sid(void *addr, u16 family, u32 *sid) +{ + struct sel_netnode *node; + + rcu_read_lock(); + node = sel_netnode_find(addr, family); + if (node != NULL) { + *sid = node->nsec.sid; + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + + return sel_netnode_sid_slow(addr, family, sid); +} + +/** + * sel_netnode_flush - Flush the entire network address table + * + * Description: + * Remove all entries from the network address table. 
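Further down in this file, sel_netnode_flush() is wired to an AVC reset callback so that a policy (re)load empties the cache; stale address-to-SID mappings must not outlive the policy that produced them. A minimal userspace sketch of that hookup; the registration API below is invented for illustration and is not the kernel's avc_add_callback():

#include <stdio.h>

#define NR_CALLBACKS 8

typedef int (*reset_cb)(void);

static reset_cb callbacks[NR_CALLBACKS];
static int nr_callbacks;

static void register_reset(reset_cb cb)
{
	if (nr_callbacks < NR_CALLBACKS)
		callbacks[nr_callbacks++] = cb;
}

static void policy_reload(void)
{
	int i;

	for (i = 0; i < nr_callbacks; i++)
		callbacks[i]();		/* every cache drops its stale SIDs */
}

static int flush_node_cache(void)
{
	printf("netnode cache flushed\n");
	return 0;
}

int main(void)
{
	register_reset(flush_node_cache);
	policy_reload();
	return 0;
}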
+ * + */ +static void sel_netnode_flush(void) +{ + u32 idx; + struct sel_netnode *node; + + spin_lock_bh(&sel_netnode_lock); + for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++) + list_for_each_entry(node, &sel_netnode_hash[idx], list) + sel_netnode_destroy(node); + spin_unlock_bh(&sel_netnode_lock); +} + +static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid, + u16 class, u32 perms, u32 *retained) +{ + if (event == AVC_CALLBACK_RESET) { + sel_netnode_flush(); + synchronize_net(); + } + return 0; +} + +static __init int sel_netnode_init(void) +{ + int iter; + int ret; + + if (!selinux_enabled) + return 0; + + for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++) + INIT_LIST_HEAD(&sel_netnode_hash[iter]); + + ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET, + SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0); + if (ret != 0) + panic("avc_add_callback() failed, error %d\n", ret); + + return ret; +} + +__initcall(sel_netnode_init); diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 397fd4955fe..a85740530af 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -2,6 +2,11 @@ * * Added conditional policy language extensions * + * Updated: Hewlett-Packard <paul.moore@hp.com> + * + * Added support for the policy capability bitmap + * + * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * This program is free software; you can redistribute it and/or modify @@ -35,6 +40,11 @@ #include "objsec.h" #include "conditional.h" +/* Policy capability filenames */ +static char *policycap_names[] = { + "network_peer_controls" +}; + unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; #ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT @@ -72,6 +82,9 @@ static int *bool_pending_values = NULL; static struct dentry *class_dir = NULL; static unsigned long last_class_ino; +/* global data for policy capabilities */ +static struct dentry *policycap_dir = NULL; + extern void selnl_notify_setenforce(int val); /* Check whether a task is allowed to use a security operation. 
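The selinuxfs hunk that follows extends the inode numbering scheme: each dynamically created file encodes its index in the low SEL_INO_MASK bits and its kind in a high offset bit, which is how sel_read_policycap() later recovers the capability index from nothing but the inode number. A runnable sketch using the constants from the patch:

#include <stdio.h>

#define SEL_POLICYCAP_INO_OFFSET	0x08000000
#define SEL_INO_MASK			0x00ffffff

int main(void)
{
	/* inode number for policy capability index 1, built the way
	 * sel_make_policycap() builds it: index OR'd with the offset */
	unsigned long ino = 1 | SEL_POLICYCAP_INO_OFFSET;

	printf("capability index = %lu\n", ino & SEL_INO_MASK);
	printf("class bits       = %#lx\n", ino & ~SEL_INO_MASK);
	return 0;
}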
*/ @@ -111,10 +124,11 @@ enum sel_inos { static unsigned long sel_last_ino = SEL_INO_NEXT - 1; -#define SEL_INITCON_INO_OFFSET 0x01000000 -#define SEL_BOOL_INO_OFFSET 0x02000000 -#define SEL_CLASS_INO_OFFSET 0x04000000 -#define SEL_INO_MASK 0x00ffffff +#define SEL_INITCON_INO_OFFSET 0x01000000 +#define SEL_BOOL_INO_OFFSET 0x02000000 +#define SEL_CLASS_INO_OFFSET 0x04000000 +#define SEL_POLICYCAP_INO_OFFSET 0x08000000 +#define SEL_INO_MASK 0x00ffffff #define TMPBUFLEN 12 static ssize_t sel_read_enforce(struct file *filp, char __user *buf, @@ -263,6 +277,7 @@ static const struct file_operations sel_policyvers_ops = { /* declaration for sel_write_load */ static int sel_make_bools(void); static int sel_make_classes(void); +static int sel_make_policycap(void); /* declaration for sel_make_class_dirs */ static int sel_make_dir(struct inode *dir, struct dentry *dentry, @@ -323,6 +338,12 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, } ret = sel_make_classes(); + if (ret) { + length = ret; + goto out1; + } + + ret = sel_make_policycap(); if (ret) length = ret; else @@ -1399,6 +1420,24 @@ static const struct file_operations sel_perm_ops = { .read = sel_read_perm, }; +static ssize_t sel_read_policycap(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int value; + char tmpbuf[TMPBUFLEN]; + ssize_t length; + unsigned long i_ino = file->f_path.dentry->d_inode->i_ino; + + value = security_policycap_supported(i_ino & SEL_INO_MASK); + length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value); + + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static const struct file_operations sel_policycap_ops = { + .read = sel_read_policycap, +}; + static int sel_make_perm_files(char *objclass, int classvalue, struct dentry *dir) { @@ -1545,6 +1584,36 @@ out: return rc; } +static int sel_make_policycap(void) +{ + unsigned int iter; + struct dentry *dentry = NULL; + struct inode *inode = NULL; + + sel_remove_entries(policycap_dir); + + for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) { + if (iter < ARRAY_SIZE(policycap_names)) + dentry = d_alloc_name(policycap_dir, + policycap_names[iter]); + else + dentry = d_alloc_name(policycap_dir, "unknown"); + + if (dentry == NULL) + return -ENOMEM; + + inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO); + if (inode == NULL) + return -ENOMEM; + + inode->i_fop = &sel_policycap_ops; + inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET; + d_add(dentry, inode); + } + + return 0; +} + static int sel_make_dir(struct inode *dir, struct dentry *dentry, unsigned long *ino) { @@ -1673,6 +1742,18 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) class_dir = dentry; + dentry = d_alloc_name(sb->s_root, "policy_capabilities"); + if (!dentry) { + ret = -ENOMEM; + goto err; + } + + ret = sel_make_dir(root_inode, dentry, &sel_last_ino); + if (ret) + goto err; + + policycap_dir = dentry; + out: return ret; err: diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c index 3bbcb5369af..feaf0a5b828 100644 --- a/security/selinux/ss/mls.c +++ b/security/selinux/ss/mls.c @@ -562,7 +562,7 @@ void mls_export_netlbl_lvl(struct context *context, if (!selinux_mls_enabled) return; - secattr->mls_lvl = context->range.level[0].sens - 1; + secattr->attr.mls.lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } @@ -582,7 +582,7 @@ void mls_import_netlbl_lvl(struct context *context, if (!selinux_mls_enabled) return; - context->range.level[0].sens = 
secattr->mls_lvl + 1; + context->range.level[0].sens = secattr->attr.mls.lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } @@ -605,8 +605,8 @@ int mls_export_netlbl_cat(struct context *context, return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, - &secattr->mls_cat); - if (rc == 0 && secattr->mls_cat != NULL) + &secattr->attr.mls.cat); + if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; @@ -633,7 +633,7 @@ int mls_import_netlbl_cat(struct context *context, return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, - secattr->mls_cat); + secattr->attr.mls.cat); if (rc != 0) goto import_netlbl_cat_failure; diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index b582aae3c62..bd7d6a00342 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -13,6 +13,11 @@ * * Added conditional policy language extensions * + * Updated: Hewlett-Packard <paul.moore@hp.com> + * + * Added support for the policy capability bitmap + * + * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify @@ -102,6 +107,11 @@ static struct policydb_compat_info policydb_compat[] = { .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, + { + .version = POLICYDB_VERSION_POLCAP, + .sym_num = SYM_NUM, + .ocon_num = OCON_NUM, + } }; static struct policydb_compat_info *policydb_lookup_compat(int version) @@ -183,6 +193,8 @@ static int policydb_init(struct policydb *p) if (rc) goto out_free_symtab; + ebitmap_init(&p->policycaps); + out: return rc; @@ -673,8 +685,8 @@ void policydb_destroy(struct policydb *p) ebitmap_destroy(&p->type_attr_map[i]); } kfree(p->type_attr_map); - kfree(p->undefined_perms); + ebitmap_destroy(&p->policycaps); return; } @@ -1554,6 +1566,10 @@ int policydb_read(struct policydb *p, void *fp) p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); + if (p->policyvers >= POLICYDB_VERSION_POLCAP && + ebitmap_read(&p->policycaps, fp) != 0) + goto bad; + info = policydb_lookup_compat(p->policyvers); if (!info) { printk(KERN_ERR "security: unable to find policy compat info " diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index ed6fc687c66..c4ce996e202 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -241,6 +241,8 @@ struct policydb { /* type -> attribute reverse mapping */ struct ebitmap *type_attr_map; + struct ebitmap policycaps; + unsigned int policyvers; unsigned int reject_unknown : 1; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 4bf715d4cf2..f96dec1f925 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -16,12 +16,13 @@ * Updated: Hewlett-Packard <paul.moore@hp.com> * * Added support for NetLabel + * Added support for the policy capability bitmap * * Updated: Chad Sellers <csellers@tresys.com> * * Added validation of kernel classes and permissions * - * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. 
* Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> @@ -59,6 +60,8 @@ extern void selnl_notify_policyload(u32 seqno); unsigned int policydb_loaded_version; +int selinux_policycap_netpeer; + /* * This is declared in avc.c */ @@ -1299,6 +1302,12 @@ bad: goto out; } +static void security_load_policycaps(void) +{ + selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps, + POLICYDB_CAPABILITY_NETPEER); +} + extern void selinux_complete_init(void); static int security_preserve_bools(struct policydb *p); @@ -1346,6 +1355,7 @@ int security_load_policy(void *data, size_t len) avtab_cache_destroy(); return -EINVAL; } + security_load_policycaps(); policydb_loaded_version = policydb.policyvers; ss_initialized = 1; seqno = ++latest_granting; @@ -1404,6 +1414,7 @@ int security_load_policy(void *data, size_t len) POLICY_WRLOCK; memcpy(&policydb, &newpolicydb, sizeof policydb); sidtab_set(&sidtab, &newsidtab); + security_load_policycaps(); seqno = ++latest_granting; policydb_loaded_version = policydb.policyvers; POLICY_WRUNLOCK; @@ -1478,11 +1489,8 @@ out: * security_netif_sid - Obtain the SID for a network interface. * @name: interface name * @if_sid: interface SID - * @msg_sid: default SID for received packets */ -int security_netif_sid(char *name, - u32 *if_sid, - u32 *msg_sid) +int security_netif_sid(char *name, u32 *if_sid) { int rc = 0; struct ocontext *c; @@ -1510,11 +1518,8 @@ int security_netif_sid(char *name, goto out; } *if_sid = c->sid[0]; - *msg_sid = c->sid[1]; - } else { + } else *if_sid = SECINITSID_NETIF; - *msg_sid = SECINITSID_NETMSG; - } out: POLICY_RDUNLOCK; @@ -2049,6 +2054,91 @@ out: return rc; } +/** + * security_net_peersid_resolve - Compare and resolve two network peer SIDs + * @nlbl_sid: NetLabel SID + * @nlbl_type: NetLabel labeling protocol type + * @xfrm_sid: XFRM SID + * + * Description: + * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be + * resolved into a single SID it is returned via @peer_sid and the function + * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function + * returns a negative value. 
A table summarizing the behavior is below: + * + * | function return | @peer_sid + * ------------------------------+-----------------+----------------- + * no peer labels | 0 | SECSID_NULL + * single peer label | 0 | <peer_label> + * multiple, consistent labels | 0 | <peer_label> + * multiple, inconsistent labels | -<errno> | SECSID_NULL + * + */ +int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, + u32 xfrm_sid, + u32 *peer_sid) +{ + int rc; + struct context *nlbl_ctx; + struct context *xfrm_ctx; + + /* handle the common (which also happens to be the set of easy) cases + * right away; these two if statements catch everything involving a + * single or absent peer SID/label */ + if (xfrm_sid == SECSID_NULL) { + *peer_sid = nlbl_sid; + return 0; + } + /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label + * and is treated as if nlbl_sid == SECSID_NULL when an XFRM SID/label + * is present */ + if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) { + *peer_sid = xfrm_sid; + return 0; + } + + /* we don't need to check ss_initialized here since the only way both + * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the + * security server was initialized and ss_initialized was true */ + if (!selinux_mls_enabled) { + *peer_sid = SECSID_NULL; + return 0; + } + + POLICY_RDLOCK; + + nlbl_ctx = sidtab_search(&sidtab, nlbl_sid); + if (!nlbl_ctx) { + printk(KERN_ERR + "security_net_peersid_resolve: unrecognized SID %d\n", + nlbl_sid); + rc = -EINVAL; + goto out_slowpath; + } + xfrm_ctx = sidtab_search(&sidtab, xfrm_sid); + if (!xfrm_ctx) { + printk(KERN_ERR + "security_net_peersid_resolve: unrecognized SID %d\n", + xfrm_sid); + rc = -EINVAL; + goto out_slowpath; + } + rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES); + +out_slowpath: + POLICY_RDUNLOCK; + if (rc == 0) + /* at present NetLabel SIDs/labels really only carry MLS + * information so if the MLS portion of the NetLabel SID + * matches the MLS portion of the labeled XFRM SID/label + * then pass along the XFRM SID as it is the most + * expressive */ + *peer_sid = xfrm_sid; + else + *peer_sid = SECSID_NULL; + return rc; +} + static int get_classes_callback(void *k, void *d, void *args) { struct class_datum *datum = d; @@ -2154,6 +2244,60 @@ int security_get_allow_unknown(void) return policydb.allow_unknown; } +/** + * security_get_policycaps - Query the loaded policy for its capabilities + * @len: the number of capability bits + * @values: the capability bit array + * + * Description: + * Get an array of the policy capabilities in @values where each entry in + * @values is either true (1) or false (0) depending on the policy's support of + * that feature. The policy capabilities are defined by the + * POLICYDB_CAPABILITY_* enums. The size of the array is stored in @len and it + * is up to the caller to free the array in @values. Returns zero on success, + * negative values on failure. 
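Before continuing with security_get_policycaps(), here is a runnable user-space sketch of the resolution table documented above. The SID values are arbitrary, NLTYPE_UNLABELED is a stand-in for NETLBL_NLTYPE_UNLABELED, and plain equality stands in for the mls_context_cmp() comparison:

#include <stdio.h>

#define SECSID_NULL 0
#define NLTYPE_UNLABELED 1   /* stand-in for NETLBL_NLTYPE_UNLABELED */

/* Mirrors the table above: an absent label resolves to the other side,
 * and two present labels must agree (modelled here as equality). */
static int peersid_resolve(unsigned nlbl_sid, unsigned nlbl_type,
                           unsigned xfrm_sid, unsigned *peer_sid)
{
	if (xfrm_sid == SECSID_NULL) {
		*peer_sid = nlbl_sid;
		return 0;
	}
	if (nlbl_sid == SECSID_NULL || nlbl_type == NLTYPE_UNLABELED) {
		*peer_sid = xfrm_sid;
		return 0;
	}
	if (nlbl_sid == xfrm_sid) {	/* stand-in for the MLS compare */
		*peer_sid = xfrm_sid;	/* XFRM SID is the most expressive */
		return 0;
	}
	*peer_sid = SECSID_NULL;
	return -1;			/* inconsistent labels */
}

int main(void)
{
	unsigned sid;
	int rc;

	rc = peersid_resolve(0, 0, 0, &sid);
	printf("no labels: rc=%d sid=%u\n", rc, sid);	/* 0, SECSID_NULL */
	rc = peersid_resolve(7, 0, 0, &sid);
	printf("single:    rc=%d sid=%u\n", rc, sid);	/* 0, 7 */
	rc = peersid_resolve(7, 0, 9, &sid);
	printf("conflict:  rc=%d sid=%u\n", rc, sid);	/* -1, SECSID_NULL */
	return 0;
}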
+ * + */ +int security_get_policycaps(int *len, int **values) +{ + int rc = -ENOMEM; + unsigned int iter; + + POLICY_RDLOCK; + + *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC); + if (*values == NULL) + goto out; + for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++) + (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter); + *len = POLICYDB_CAPABILITY_MAX; + rc = 0; + +out: + POLICY_RDUNLOCK; + return rc; +} + +/** + * security_policycap_supported - Check for a specific policy capability + * @req_cap: capability + * + * Description: + * This function queries the currently loaded policy to see if it supports the + * capability specified by @req_cap. Returns true (1) if the capability is + * supported, false (0) if it isn't supported. + * + */ +int security_policycap_supported(unsigned int req_cap) +{ + int rc; + + POLICY_RDLOCK; + rc = ebitmap_get_bit(&policydb.policycaps, req_cap); + POLICY_RDUNLOCK; + + return rc; +} + struct selinux_audit_rule { u32 au_seqno; struct context au_ctxt; @@ -2403,50 +2547,10 @@ void selinux_audit_set_callback(int (*callback)(void)) } #ifdef CONFIG_NETLABEL -/* - * NetLabel cache structure - */ -#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x)) -#define NETLBL_CACHE_T_NONE 0 -#define NETLBL_CACHE_T_SID 1 -#define NETLBL_CACHE_T_MLS 2 -struct selinux_netlbl_cache { - u32 type; - union { - u32 sid; - struct mls_range mls_label; - } data; -}; - -/** - * security_netlbl_cache_free - Free the NetLabel cached data - * @data: the data to free - * - * Description: - * This function is intended to be used as the free() callback inside the - * netlbl_lsm_cache structure. - * - */ -static void security_netlbl_cache_free(const void *data) -{ - struct selinux_netlbl_cache *cache; - - if (data == NULL) - return; - - cache = NETLBL_CACHE(data); - switch (cache->type) { - case NETLBL_CACHE_T_MLS: - ebitmap_destroy(&cache->data.mls_label.level[0].cat); - break; - } - kfree(data); -} - /** * security_netlbl_cache_add - Add an entry to the NetLabel cache * @secattr: the NetLabel packet security attributes - * @ctx: the SELinux context + * @sid: the SELinux SID * * Description: * Attempt to cache the context in @ctx, which was derived from the packet in @@ -2455,60 +2559,46 @@ static void security_netlbl_cache_free(const void *data) * */ static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, - struct context *ctx) + u32 sid) { - struct selinux_netlbl_cache *cache = NULL; + u32 *sid_cache; - secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); - if (secattr->cache == NULL) - return; - - cache = kzalloc(sizeof(*cache), GFP_ATOMIC); - if (cache == NULL) + sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC); + if (sid_cache == NULL) return; - - cache->type = NETLBL_CACHE_T_MLS; - if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, - &ctx->range.level[0].cat) != 0) { - kfree(cache); + secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); + if (secattr->cache == NULL) { + kfree(sid_cache); return; } - cache->data.mls_label.level[1].cat.highbit = - cache->data.mls_label.level[0].cat.highbit; - cache->data.mls_label.level[1].cat.node = - cache->data.mls_label.level[0].cat.node; - cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; - cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; - secattr->cache->free = security_netlbl_cache_free; - secattr->cache->data = (void *)cache; + *sid_cache = sid; + secattr->cache->free = kfree; + secattr->cache->data = sid_cache; + secattr->flags |= NETLBL_SECATTR_CACHE; } /** 
* security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID * @secattr: the NetLabel packet security attributes - * @base_sid: the SELinux SID to use as a context for MLS only attributes * @sid: the SELinux SID * * Description: * Convert the given NetLabel security attributes in @secattr into a * SELinux SID. If the @secattr field does not contain a full SELinux - * SID/context then use the context in @base_sid as the foundation. If - * possibile the 'cache' field of @secattr is set and the CACHE flag is set; - * this is to allow the @secattr to be used by NetLabel to cache the secattr to - * SID conversion for future lookups. Returns zero on success, negative - * values on failure. + * SID/context then use SECINITSID_NETMSG as the foundation. If possible the + * 'cache' field of @secattr is set and the CACHE flag is set; this is to + * allow the @secattr to be used by NetLabel to cache the secattr to SID + * conversion for future lookups. Returns zero on success, negative values on + * failure. * */ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, - u32 base_sid, u32 *sid) { int rc = -EIDRM; struct context *ctx; struct context ctx_new; - struct selinux_netlbl_cache *cache; if (!ss_initialized) { *sid = SECSID_NULL; @@ -2518,40 +2608,13 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, POLICY_RDLOCK; if (secattr->flags & NETLBL_SECATTR_CACHE) { - cache = NETLBL_CACHE(secattr->cache->data); - switch (cache->type) { - case NETLBL_CACHE_T_SID: - *sid = cache->data.sid; - rc = 0; - break; - case NETLBL_CACHE_T_MLS: - ctx = sidtab_search(&sidtab, base_sid); - if (ctx == NULL) - goto netlbl_secattr_to_sid_return; - - ctx_new.user = ctx->user; - ctx_new.role = ctx->role; - ctx_new.type = ctx->type; - ctx_new.range.level[0].sens = - cache->data.mls_label.level[0].sens; - ctx_new.range.level[0].cat.highbit = - cache->data.mls_label.level[0].cat.highbit; - ctx_new.range.level[0].cat.node = - cache->data.mls_label.level[0].cat.node; - ctx_new.range.level[1].sens = - cache->data.mls_label.level[1].sens; - ctx_new.range.level[1].cat.highbit = - cache->data.mls_label.level[1].cat.highbit; - ctx_new.range.level[1].cat.node = - cache->data.mls_label.level[1].cat.node; - - rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid); - break; - default: - goto netlbl_secattr_to_sid_return; - } + *sid = *(u32 *)secattr->cache->data; + rc = 0; + } else if (secattr->flags & NETLBL_SECATTR_SECID) { + *sid = secattr->attr.secid; + rc = 0; } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { - ctx = sidtab_search(&sidtab, base_sid); + ctx = sidtab_search(&sidtab, SECINITSID_NETMSG); if (ctx == NULL) goto netlbl_secattr_to_sid_return; @@ -2561,7 +2624,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, mls_import_netlbl_lvl(&ctx_new, secattr); if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat, - secattr->mls_cat) != 0) + secattr->attr.mls.cat) != 0) goto netlbl_secattr_to_sid_return; ctx_new.range.level[1].cat.highbit = ctx_new.range.level[0].cat.highbit; @@ -2578,7 +2641,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, if (rc != 0) goto netlbl_secattr_to_sid_return_cleanup; - security_netlbl_cache_add(secattr, &ctx_new); + security_netlbl_cache_add(secattr, *sid); ebitmap_destroy(&ctx_new.range.level[0].cat); } else { diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index e0760396903..7e158205d08 100644 --- a/security/selinux/xfrm.c +++ 
b/security/selinux/xfrm.c @@ -46,11 +46,14 @@ #include <net/checksum.h> #include <net/udp.h> #include <asm/semaphore.h> +#include <asm/atomic.h> #include "avc.h" #include "objsec.h" #include "xfrm.h" +/* Labeled XFRM instance counter */ +atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0); /* * Returns true if an LSM/SELinux context @@ -293,6 +296,9 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, BUG_ON(!uctx); err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0); + if (err == 0) + atomic_inc(&selinux_xfrm_refcount); + return err; } @@ -340,10 +346,13 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp) struct xfrm_sec_ctx *ctx = xp->security; int rc = 0; - if (ctx) + if (ctx) { rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); + if (rc == 0) + atomic_dec(&selinux_xfrm_refcount); + } return rc; } @@ -360,6 +369,8 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct BUG_ON(!x); err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); + if (err == 0) + atomic_inc(&selinux_xfrm_refcount); return err; } @@ -382,10 +393,13 @@ int selinux_xfrm_state_delete(struct xfrm_state *x) struct xfrm_sec_ctx *ctx = x->security; int rc = 0; - if (ctx) + if (ctx) { rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); + if (rc == 0) + atomic_dec(&selinux_xfrm_refcount); + } return rc; } diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c index c7992e667fd..317f8e211cd 100644 --- a/drivers/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -26,7 +26,7 @@ * Based on Xen 3.1 code. */ -#include "kvm.h" +#include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/mm.h> #include <linux/highmem.h> @@ -34,14 +34,17 @@ #include <linux/hrtimer.h> #include <linux/io.h> #include <asm/processor.h> -#include <asm/msr.h> #include <asm/page.h> #include <asm/current.h> -#include <asm/apicdef.h> -#include <asm/io_apic.h> -#include "irq.h" -/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ + +#include "ioapic.h" +#include "lapic.h" + +#if 0 +#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#else #define ioapic_debug(fmt, arg...) 
+#endif static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, @@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) default: index = (ioapic->ioregsel - 0x10) >> 1; - ioapic_debug("change redir index %x val %x", index, val); + ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; if (ioapic->ioregsel & 1) { @@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } static void ioapic_inj_irq(struct kvm_ioapic *ioapic, - struct kvm_lapic *target, + struct kvm_vcpu *vcpu, u8 vector, u8 trig_mode, u8 delivery_mode) { - ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, + ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, delivery_mode); - ASSERT((delivery_mode == dest_Fixed) || - (delivery_mode == dest_LowestPrio)); + ASSERT((delivery_mode == IOAPIC_FIXED) || + (delivery_mode == IOAPIC_LOWEST_PRIORITY)); - kvm_apic_set_irq(target, vector, trig_mode); + kvm_apic_set_irq(vcpu, vector, trig_mode); } static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, @@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, struct kvm *kvm = ioapic->kvm; struct kvm_vcpu *vcpu; - ioapic_debug("dest %d dest_mode %d", dest, dest_mode); + ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); if (dest_mode == 0) { /* Physical mode. */ if (dest == 0xFF) { /* Broadcast. */ for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i] && kvm->vcpus[i]->apic) + if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) mask |= 1 << i; return mask; } @@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, vcpu = kvm->vcpus[i]; if (!vcpu) continue; - if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { - if (vcpu->apic) + if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { + if (vcpu->arch.apic) mask = 1 << i; break; } @@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, vcpu = kvm->vcpus[i]; if (!vcpu) continue; - if (vcpu->apic && - kvm_apic_match_logical_addr(vcpu->apic, dest)) + if (vcpu->arch.apic && + kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) mask |= 1 << vcpu->vcpu_id; } - ioapic_debug("mask %x", mask); + ioapic_debug("mask %x\n", mask); return mask; } @@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) u8 vector = ioapic->redirtbl[irq].fields.vector; u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; u32 deliver_bitmask; - struct kvm_lapic *target; struct kvm_vcpu *vcpu; int vcpu_id; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " - "vector=%x trig_mode=%x", + "vector=%x trig_mode=%x\n", dest, dest_mode, delivery_mode, vector, trig_mode); deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); if (!deliver_bitmask) { - ioapic_debug("no target on destination"); + ioapic_debug("no target on destination\n"); return; } switch (delivery_mode) { - case dest_LowestPrio: - target = - kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); - if (target != NULL) - ioapic_inj_irq(ioapic, target, vector, + case IOAPIC_LOWEST_PRIORITY: + vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, + deliver_bitmask); + if (vcpu != NULL) + ioapic_inj_irq(ioapic, vcpu, vector, trig_mode, delivery_mode); else - ioapic_debug("null round robin: " - "mask=%x vector=%x delivery_mode=%x", - deliver_bitmask, vector, 
dest_LowestPrio); + ioapic_debug("null lowest prio vcpu: " + "mask=%x vector=%x delivery_mode=%x\n", + deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); break; - case dest_Fixed: + case IOAPIC_FIXED: for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { if (!(deliver_bitmask & (1 << vcpu_id))) continue; deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; if (vcpu) { - target = vcpu->apic; - ioapic_inj_irq(ioapic, target, vector, + ioapic_inj_irq(ioapic, vcpu, vector, trig_mode, delivery_mode); } } @@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) { - struct kvm_ioapic *ioapic = kvm->vioapic; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; union ioapic_redir_entry *ent; int gsi; @@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; u32 result; - ioapic_debug("addr %lx", (unsigned long)addr); + ioapic_debug("addr %lx\n", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; @@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; u32 data; - ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", - addr, len, val); + ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", + (void*)addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ if (len == 4 || len == 8) data = *(u32 *) val; @@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, case IOAPIC_REG_WINDOW: ioapic_write_indirect(ioapic, data); break; +#ifdef CONFIG_IA64 + case IOAPIC_REG_EOI: + kvm_ioapic_update_eoi(ioapic->kvm, data); + break; +#endif default: break; } } +void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + ioapic->ioregsel = 0; + ioapic->irr = 0; + ioapic->id = 0; +} + int kvm_ioapic_init(struct kvm *kvm) { struct kvm_ioapic *ioapic; - int i; ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); if (!ioapic) return -ENOMEM; - kvm->vioapic = ioapic; - for (i = 0; i < IOAPIC_NUM_PINS; i++) - ioapic->redirtbl[i].fields.mask = 1; - ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + kvm->arch.vioapic = ioapic; + kvm_ioapic_reset(ioapic); ioapic->dev.read = ioapic_mmio_read; ioapic->dev.write = ioapic_mmio_write; ioapic->dev.in_range = ioapic_in_range; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h new file mode 100644 index 00000000000..7f16675fe78 --- /dev/null +++ b/virt/kvm/ioapic.h @@ -0,0 +1,95 @@ +#ifndef __KVM_IO_APIC_H +#define __KVM_IO_APIC_H + +#include <linux/kvm_host.h> + +#include "iodev.h" + +struct kvm; +struct kvm_vcpu; + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 +#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ + +/* Indirect registers. 
*/ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +/*ioapic delivery mode*/ +#define IOAPIC_FIXED 0x0 +#define IOAPIC_LOWEST_PRIORITY 0x1 +#define IOAPIC_PMI 0x2 +#define IOAPIC_NMI 0x4 +#define IOAPIC_INIT 0x5 +#define IOAPIC_EXTINT 0x7 + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union ioapic_redir_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; + } redirtbl[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vioapic; +} + +#ifdef CONFIG_IA64 +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return 1; +} +#endif + +struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, + unsigned long bitmap); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +void kvm_ioapic_reset(struct kvm_ioapic *ioapic); + +#endif diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h new file mode 100644 index 00000000000..c14e642027b --- /dev/null +++ b/virt/kvm/iodev.h @@ -0,0 +1,63 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
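The redirection-table entry in struct kvm_ioapic above overlays bitfields on a raw 64-bit register value through a union. A standalone sketch of decoding such an entry follows; the test value is arbitrary, and note that bitfield packing is compiler- and ABI-dependent, so this layout is only guaranteed to match the hardware format on the toolchains the kernel supports:

#include <stdio.h>
#include <stdint.h>

union redir_entry {
	uint64_t bits;
	struct {
		uint8_t vector;
		uint8_t delivery_mode:3;
		uint8_t dest_mode:1;
		uint8_t delivery_status:1;
		uint8_t polarity:1;
		uint8_t remote_irr:1;
		uint8_t trig_mode:1;
		uint8_t mask:1;
		uint8_t reserve:7;
		uint8_t reserved[4];
		uint8_t dest_id;
	} fields;
};

int main(void)
{
	union redir_entry e = { .bits = 0 };

	e.fields.vector = 0x31;
	e.fields.delivery_mode = 1;	/* IOAPIC_LOWEST_PRIORITY */
	e.fields.mask = 1;		/* pins start masked, as in kvm_ioapic_reset() */

	printf("raw=%#llx vector=%#x masked=%u\n",
	       (unsigned long long)e.bits, e.fields.vector, e.fields.mask);
	return 0;
}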
+ */ + +#ifndef __KVM_IODEV_H__ +#define __KVM_IODEV_H__ + +#include <linux/kvm_types.h> + +struct kvm_io_device { + void (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + void (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + int (*in_range)(struct kvm_io_device *this, gpa_t addr); + void (*destructor)(struct kvm_io_device *this); + + void *private; +}; + +static inline void kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, + int len, + void *val) +{ + dev->read(dev, addr, len, val); +} + +static inline void kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, + int len, + const void *val) +{ + dev->write(dev, addr, len, val); +} + +static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) +{ + return dev->in_range(dev, addr); +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->destructor) + dev->destructor(dev); +} + +#endif /* __KVM_IODEV_H__ */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c new file mode 100644 index 00000000000..3c4fe26096f --- /dev/null +++ b/virt/kvm/kvm_main.c @@ -0,0 +1,1400 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "iodev.h" + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/vmalloc.h> +#include <linux/reboot.h> +#include <linux/debugfs.h> +#include <linux/highmem.h> +#include <linux/file.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/smp.h> +#include <linux/anon_inodes.h> +#include <linux/profile.h> +#include <linux/kvm_para.h> +#include <linux/pagemap.h> +#include <linux/mman.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +DEFINE_SPINLOCK(kvm_lock); +LIST_HEAD(vm_list); + +static cpumask_t cpus_hardware_enabled; + +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); + +static __read_mostly struct preempt_ops kvm_preempt_ops; + +static struct dentry *debugfs_dir; + +static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); + +static inline int valid_vcpu(int n) +{ + return likely(n >= 0 && n < KVM_MAX_VCPUS); +} + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +void vcpu_load(struct kvm_vcpu *vcpu) +{ + int cpu; + + mutex_lock(&vcpu->mutex); + cpu = get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_arch_vcpu_load(vcpu, cpu); + put_cpu(); +} + +void vcpu_put(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); + mutex_unlock(&vcpu->mutex); +} + +static void ack_flush(void *_completed) +{ +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + + cpus_clear(cpus); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { 
+ vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != raw_smp_processor_id()) + cpu_set(cpu, cpus); + } + if (cpus_empty(cpus)) + return; + ++kvm->stat.remote_tlb_flush; + smp_call_function_mask(cpus, ack_flush, NULL, 1); +} + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + init_waitqueue_head(&vcpu->wq); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + r = kvm_arch_vcpu_init(vcpu); + if (r < 0) + goto fail_free_run; + return 0; + +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return r; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_init); + +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_uninit(vcpu); + free_page((unsigned long)vcpu->run); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); + +static struct kvm *kvm_create_vm(void) +{ + struct kvm *kvm = kvm_arch_create_vm(); + + if (IS_ERR(kvm)) + goto out; + + kvm->mm = current->mm; + atomic_inc(&kvm->mm->mm_count); + spin_lock_init(&kvm->mmu_lock); + kvm_io_bus_init(&kvm->pio_bus); + mutex_init(&kvm->lock); + kvm_io_bus_init(&kvm->mmio_bus); + spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + spin_unlock(&kvm_lock); +out: + return kvm; +} + +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + if (!dont || free->rmap != dont->rmap) + vfree(free->rmap); + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + vfree(free->dirty_bitmap); + + free->npages = 0; + free->dirty_bitmap = NULL; + free->rmap = NULL; +} + +void kvm_free_physmem(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) + kvm_free_physmem_slot(&kvm->memslots[i], NULL); +} + +static void kvm_destroy_vm(struct kvm *kvm) +{ + struct mm_struct *mm = kvm->mm; + + spin_lock(&kvm_lock); + list_del(&kvm->vm_list); + spin_unlock(&kvm_lock); + kvm_io_bus_destroy(&kvm->pio_bus); + kvm_io_bus_destroy(&kvm->mmio_bus); + kvm_arch_destroy_vm(kvm); + mmdrop(mm); +} + +static int kvm_vm_release(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = filp->private_data; + + kvm_destroy_vm(kvm); + return 0; +} + +/* + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. + * + * Must be called holding mmap_sem for write. 
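The sanity checks that open __kvm_set_memory_region() below reduce to three questions: is the region page-aligned, does it wrap the address space, and does it overlap another slot. A standalone sketch with simplified types and made-up slot values:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct slot { uint64_t base_gfn, npages; };

/* Two gfn ranges are disjoint iff one ends before the other begins;
 * this is the same test the overlap loop below performs, negated. */
static int ranges_overlap(const struct slot *a, const struct slot *b)
{
	return !(a->base_gfn + a->npages <= b->base_gfn ||
		 a->base_gfn >= b->base_gfn + b->npages);
}

int main(void)
{
	struct slot existing = { .base_gfn = 0x100, .npages = 0x80 };
	struct slot newslot  = { .base_gfn = 0x140, .npages = 0x10 };
	uint64_t gpa = 0x100000, size = 0x200000;

	if ((gpa | size) & (PAGE_SIZE - 1))
		puts("rejected: not page aligned");
	if (gpa + size < gpa)
		puts("rejected: wraps around");
	printf("overlap=%d\n", ranges_overlap(&existing, &newslot));
	return 0;
}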
+ */ +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; + + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + + memslot = &kvm->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; + + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + + new = old = *memslot; + + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_free; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots[i]; + + if (s == memslot) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_free; + } + + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = NULL; + + r = -ENOMEM; + + /* Allocate if a slot is being created */ + if (npages && !new.rmap) { + new.rmap = vmalloc(npages * sizeof(struct page *)); + + if (!new.rmap) + goto out_free; + + memset(new.rmap, 0, npages * sizeof(*new.rmap)); + + new.user_alloc = user_alloc; + new.userspace_addr = mem->userspace_addr; + } + + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; + + new.dirty_bitmap = vmalloc(dirty_bytes); + if (!new.dirty_bitmap) + goto out_free; + memset(new.dirty_bitmap, 0, dirty_bytes); + } + + if (mem->slot >= kvm->nmemslots) + kvm->nmemslots = mem->slot + 1; + + *memslot = new; + + r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); + if (r) { + *memslot = old; + goto out_free; + } + + kvm_free_physmem_slot(&old, &new); + return 0; + +out_free: + kvm_free_physmem_slot(&new, &old); +out: + return r; + +} +EXPORT_SYMBOL_GPL(__kvm_set_memory_region); + +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r; + + down_write(&current->mm->mmap_sem); + r = __kvm_set_memory_region(kvm, mem, user_alloc); + up_write(&current->mm->mmap_sem); + return r; +} +EXPORT_SYMBOL_GPL(kvm_set_memory_region); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc) +{ + if (mem->slot >= KVM_MEMORY_SLOTS) + return -EINVAL; + return kvm_set_memory_region(kvm, mem, user_alloc); +} + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty) +{ + struct kvm_memory_slot *memslot; + int r, i; + int n; + unsigned long any = 0; + + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + + for (i = 0; !any && i < n/sizeof(long); ++i) + any = memslot->dirty_bitmap[i]; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + if (any) + *is_dirty = 1; + + r = 0; +out: + return r; +} + +int 
is_error_page(struct page *page) +{ + return page == bad_page; +} +EXPORT_SYMBOL_GPL(is_error_page); + +static inline unsigned long bad_hva(void) +{ + return PAGE_OFFSET; +} + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} +EXPORT_SYMBOL_GPL(kvm_is_error_hva); + +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return NULL; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + gfn = unalias_gfn(kvm, gfn); + return __gfn_to_memslot(kvm, gfn); +} + +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + + gfn = unalias_gfn(kvm, gfn); + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); + +static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn = unalias_gfn(kvm, gfn); + slot = __gfn_to_memslot(kvm, gfn); + if (!slot) + return bad_hva(); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); +} + +/* + * Requires current->mm->mmap_sem to be held + */ +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + struct page *page[1]; + unsigned long addr; + int npages; + + might_sleep(); + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return bad_page; + } + + npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, + NULL); + + if (npages != 1) { + get_page(bad_page); + return bad_page; + } + + return page[0]; +} + +EXPORT_SYMBOL_GPL(gfn_to_page); + +void kvm_release_page_clean(struct page *page) +{ + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_page_dirty(struct page *page) +{ + if (!PageReserved(page)) + SetPageDirty(page); + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + +static int next_segment(unsigned long len, int offset) +{ + if (len > PAGE_SIZE - offset) + return PAGE_SIZE - offset; + else + return len; +} + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copy_from_user(data, (void __user *)addr + offset, len); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page); + +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest); + +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len) +{ + int r; + unsigned long addr; + gfn_t gfn = gpa >> PAGE_SHIFT; + int offset = offset_in_page(gpa); + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(kvm_read_guest_atomic); + +int kvm_write_guest_page(struct kvm *kvm, 
gfn_t gfn, const void *data, + int offset, int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copy_to_user((void __user *)addr + offset, data, len); + if (r) + return -EFAULT; + mark_page_dirty(kvm, gfn); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_page); + +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +{ + return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_clear_guest_page); + +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_clear_guest_page(kvm, gfn, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_clear_guest); + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + gfn = unalias_gfn(kvm, gfn); + memslot = __gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); + } +} + +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. + */ +void kvm_vcpu_block(struct kvm_vcpu *vcpu) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&vcpu->wq, &wait); + + /* + * We will block until either an interrupt or a signal wakes us up + */ + while (!kvm_cpu_has_interrupt(vcpu) + && !signal_pending(current) + && !kvm_arch_vcpu_runnable(vcpu)) { + set_current_state(TASK_INTERRUPTIBLE); + vcpu_put(vcpu); + schedule(); + vcpu_load(vcpu); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&vcpu->wq, &wait); +} + +void kvm_resched(struct kvm_vcpu *vcpu) +{ + if (!need_resched()) + return; + cond_resched(); +} +EXPORT_SYMBOL_GPL(kvm_resched); + +static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff == 0) + page = virt_to_page(vcpu->run); + else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->arch.pio_data); + else + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + return 0; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .fault = kvm_vcpu_fault, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + +static int kvm_vcpu_release(struct inode *inode, struct file *filp) +{ + struct kvm_vcpu *vcpu = filp->private_data; + + fput(vcpu->kvm->filp); + return 0; +} + +static struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, + .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, +}; + +/* + * Allocates an inode for the vcpu. 
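The kvm_read_guest(), kvm_write_guest(), and kvm_clear_guest() loops above all share the same page walk: next_segment() clips each transfer at the current page boundary and the loop advances one gfn at a time. A standalone model of that walk, with an arbitrary range that crosses one page boundary:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Length of the piece of [offset, offset+len) that fits in this page. */
static unsigned long next_segment(unsigned long len, unsigned long offset)
{
	return len > PAGE_SIZE - offset ? PAGE_SIZE - offset : len;
}

int main(void)
{
	unsigned long gpa = 0x1ff0, len = 0x30;	/* spans two pages */
	unsigned long offset = gpa & (PAGE_SIZE - 1);
	unsigned long gfn = gpa / PAGE_SIZE, seg;

	while ((seg = next_segment(len, offset)) != 0) {
		printf("gfn=%#lx offset=%#lx len=%#lx\n", gfn, offset, seg);
		len -= seg;
		offset = 0;	/* later pages start at offset zero */
		++gfn;
	}
	return 0;
}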
+ */ +static int create_vcpu_fd(struct kvm_vcpu *vcpu) +{ + int fd, r; + struct inode *inode; + struct file *file; + + r = anon_inode_getfd(&fd, &inode, &file, + "kvm-vcpu", &kvm_vcpu_fops, vcpu); + if (r) + return r; + atomic_inc(&vcpu->kvm->filp->f_count); + return fd; +} + +/* + * Creates some virtual cpus. Good luck creating more than one. + */ +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +{ + int r; + struct kvm_vcpu *vcpu; + + if (!valid_vcpu(n)) + return -EINVAL; + + vcpu = kvm_arch_vcpu_create(kvm, n); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); + + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + + r = kvm_arch_vcpu_setup(vcpu); + if (r) + goto vcpu_destroy; + + mutex_lock(&kvm->lock); + if (kvm->vcpus[n]) { + r = -EEXIST; + mutex_unlock(&kvm->lock); + goto vcpu_destroy; + } + kvm->vcpus[n] = vcpu; + mutex_unlock(&kvm->lock); + + /* Now it's all set up, let userspace reach it */ + r = create_vcpu_fd(vcpu); + if (r < 0) + goto unlink; + return r; + +unlink: + mutex_lock(&kvm->lock); + kvm->vcpus[n] = NULL; + mutex_unlock(&kvm->lock); +vcpu_destroy: + kvm_arch_vcpu_destroy(vcpu); + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + +static long kvm_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + + if (vcpu->kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_RUN: + r = -EINVAL; + if (arg) + goto out; + r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + break; + case KVM_GET_REGS: { + struct kvm_regs kvm_regs; + + memset(&kvm_regs, 0, sizeof kvm_regs); + r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) + goto out; + r = 0; + break; + } + case KVM_SET_REGS: { + struct kvm_regs kvm_regs; + + r = -EFAULT; + if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) + goto out; + r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_SREGS: { + struct kvm_sregs kvm_sregs; + + memset(&kvm_sregs, 0, sizeof kvm_sregs); + r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + struct kvm_sregs kvm_sregs; + + r = -EFAULT; + if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) + goto out; + r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, argp, sizeof tr)) + goto out; + r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_DEBUG_GUEST: { + struct kvm_debug_guest dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, argp, sizeof dbg)) + goto out; + r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + 
sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); + break; + } + case KVM_GET_FPU: { + struct kvm_fpu fpu; + + memset(&fpu, 0, sizeof fpu); + r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &fpu, sizeof fpu)) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + struct kvm_fpu fpu; + + r = -EFAULT; + if (copy_from_user(&fpu, argp, sizeof fpu)) + goto out; + r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu); + if (r) + goto out; + r = 0; + break; + } + default: + r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); + } +out: + return r; +} + +static long kvm_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + + if (kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_USER_MEMORY_REGION: { + struct kvm_userspace_memory_region kvm_userspace_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_userspace_mem, argp, + sizeof kvm_userspace_mem)) + goto out; + + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof log)) + goto out; + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } + default: + r = kvm_arch_vm_ioctl(filp, ioctl, arg); + } +out: + return r; +} + +static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvm *kvm = vma->vm_file->private_data; + struct page *page; + + if (!kvm_is_visible_gfn(kvm, vmf->pgoff)) + return VM_FAULT_SIGBUS; + page = gfn_to_page(kvm, vmf->pgoff); + if (is_error_page(page)) { + kvm_release_page_clean(page); + return VM_FAULT_SIGBUS; + } + vmf->page = page; + return 0; +} + +static struct vm_operations_struct kvm_vm_vm_ops = { + .fault = kvm_vm_fault, +}; + +static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vm_vm_ops; + return 0; +} + +static struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, + .compat_ioctl = kvm_vm_ioctl, + .mmap = kvm_vm_mmap, +}; + +static int kvm_dev_ioctl_create_vm(void) +{ + int fd, r; + struct inode *inode; + struct file *file; + struct kvm *kvm; + + kvm = kvm_create_vm(); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); + if (r) { + kvm_destroy_vm(kvm); + return r; + } + + kvm->filp = file; + + return fd; +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r = -EINVAL; + + switch (ioctl) { + case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; + r = KVM_API_VERSION; + break; + case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; + r = kvm_dev_ioctl_create_vm(); + break; + case KVM_CHECK_EXTENSION: + r = kvm_dev_ioctl_check_extension((long)argp); + break; + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto out; + r = 2 * PAGE_SIZE; + break; + default: + return kvm_arch_dev_ioctl(filp, ioctl, arg); + } +out: + return r; +} + +static struct file_operations kvm_chardev_ops 
= { + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, +}; + +static struct miscdevice kvm_dev = { + KVM_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +static void hardware_enable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_set(cpu, cpus_hardware_enabled); + kvm_arch_hardware_enable(NULL); +} + +static void hardware_disable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_clear(cpu, cpus_hardware_enabled); + decache_vcpus_on_cpu(cpu); + kvm_arch_hardware_disable(NULL); +} + +static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, + void *v) +{ + int cpu = (long)v; + + val &= ~CPU_TASKS_FROZEN; + switch (val) { + case CPU_DYING: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + hardware_disable(NULL); + break; + case CPU_UP_CANCELED: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); + break; + case CPU_ONLINE: + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); + break; + } + return NOTIFY_OK; +} + +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + on_each_cpu(hardware_disable, NULL, 0, 1); + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + +void kvm_io_bus_init(struct kvm_io_bus *bus) +{ + memset(bus, 0, sizeof(*bus)); +} + +void kvm_io_bus_destroy(struct kvm_io_bus *bus) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + kvm_iodevice_destructor(pos); + } +} + +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + if (pos->in_range(pos, addr)) + return pos; + } + + return NULL; +} + +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +{ + BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + + bus->devs[bus->dev_count++] = dev; +} + +static struct notifier_block kvm_cpu_notifier = { + .notifier_call = kvm_cpu_hotplug, + .priority = 20, /* must be > scheduler priority */ +}; + +static u64 vm_stat_get(void *_offset) +{ + unsigned offset = (long)_offset; + u64 total = 0; + struct kvm *kvm; + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + total += *(u32 *)((void *)kvm + offset); + spin_unlock(&kvm_lock); + return total; +} + +DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); + +static u64 vcpu_stat_get(void *_offset) +{ + unsigned offset = (long)_offset; + u64 total = 0; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (vcpu) + total += *(u32 *)((void *)vcpu + offset); + } + spin_unlock(&kvm_lock); + return total; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); + +static struct file_operations *stat_fops[] = { + [KVM_STAT_VCPU] = &vcpu_stat_fops, + [KVM_STAT_VM] = &vm_stat_fops, +}; + +static void kvm_init_debug(void) +{ + struct 
kvm_stats_debugfs_item *p; + + debugfs_dir = debugfs_create_dir("kvm", NULL); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, + (void *)(long)p->offset, + stat_fops[p->kind]); +} + +static void kvm_exit_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(debugfs_dir); +} + +static int kvm_suspend(struct sys_device *dev, pm_message_t state) +{ + hardware_disable(NULL); + return 0; +} + +static int kvm_resume(struct sys_device *dev) +{ + hardware_enable(NULL); + return 0; +} + +static struct sysdev_class kvm_sysdev_class = { + .name = "kvm", + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +static struct sys_device kvm_sysdev = { + .id = 0, + .cls = &kvm_sysdev_class, +}; + +struct page *bad_page; + +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +{ + return container_of(pn, struct kvm_vcpu, preempt_notifier); +} + +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_load(vcpu, cpu); +} + +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_put(vcpu); +} + +int kvm_init(void *opaque, unsigned int vcpu_size, + struct module *module) +{ + int r; + int cpu; + + kvm_init_debug(); + + r = kvm_arch_init(opaque); + if (r) + goto out_fail; + + bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (bad_page == NULL) { + r = -ENOMEM; + goto out; + } + + r = kvm_arch_hardware_setup(); + if (r < 0) + goto out_free_0; + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_arch_check_processor_compat, + &r, 0, 1); + if (r < 0) + goto out_free_1; + } + + on_each_cpu(hardware_enable, NULL, 0, 1); + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_2; + register_reboot_notifier(&kvm_reboot_notifier); + + r = sysdev_class_register(&kvm_sysdev_class); + if (r) + goto out_free_3; + + r = sysdev_register(&kvm_sysdev); + if (r) + goto out_free_4; + + /* A kmem cache lets us meet the alignment requirements of fx_save. 
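kvm_init(), whose tail follows, unwinds every successful setup step in reverse through a ladder of out_free_N labels, so a failure at step N tears down exactly steps N-1 through 1. A tiny standalone illustration of the idiom, with dummy steps rather than KVM code:

#include <stdio.h>

static int step(int n, int fail_at)
{
	printf("setup %d\n", n);
	return n == fail_at ? -1 : 0;
}

static void undo(int n)
{
	printf("undo %d\n", n);
}

/* Each failure jumps to the label that undoes only what already
 * succeeded, mirroring the out_free_N ladder below. */
static int init(int fail_at)
{
	if (step(1, fail_at))
		goto out;
	if (step(2, fail_at))
		goto undo1;
	if (step(3, fail_at))
		goto undo2;
	return 0;

undo2:	undo(2);
undo1:	undo(1);
out:	return -1;
}

int main(void)
{
	init(2);	/* fails at step 2, so only step 1 is undone */
	return 0;
}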
*/ + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, + __alignof__(struct kvm_vcpu), + 0, NULL); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_5; + } + + kvm_chardev_ops.owner = module; + + r = misc_register(&kvm_dev); + if (r) { + printk(KERN_ERR "kvm: misc device register failed\n"); + goto out_free; + } + + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; + + return 0; + +out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_5: + sysdev_unregister(&kvm_sysdev); +out_free_4: + sysdev_class_unregister(&kvm_sysdev_class); +out_free_3: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); +out_free_2: + on_each_cpu(hardware_disable, NULL, 0, 1); +out_free_1: + kvm_arch_hardware_unsetup(); +out_free_0: + __free_page(bad_page); +out: + kvm_arch_exit(); + kvm_exit_debug(); +out_fail: + return r; +} +EXPORT_SYMBOL_GPL(kvm_init); + +void kvm_exit(void) +{ + misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); + sysdev_unregister(&kvm_sysdev); + sysdev_class_unregister(&kvm_sysdev_class); + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); + on_each_cpu(hardware_disable, NULL, 0, 1); + kvm_arch_hardware_unsetup(); + kvm_arch_exit(); + kvm_exit_debug(); + __free_page(bad_page); +} +EXPORT_SYMBOL_GPL(kvm_exit);
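For context on how the exported kvm_init()/kvm_exit() pair is consumed, here is a hedged sketch of an architecture module wiring them into module_init()/module_exit(). "struct my_vcpu" and the NULL opaque pointer are hypothetical placeholders; a real backend such as vmx.c passes its ops table (e.g. &vmx_x86_ops) as the opaque argument and its own vcpu container size:

#include <linux/module.h>
#include <linux/kvm_host.h>

struct my_vcpu {
	struct kvm_vcpu vcpu;	/* generic part first, arch state follows */
};

static int __init my_arch_init(void)
{
	/* vcpu_size tells kvm_init() how large a slab object to create;
	 * the opaque pointer is handed through to kvm_arch_init(). */
	return kvm_init(NULL /* arch ops table in a real backend */,
			sizeof(struct my_vcpu), THIS_MODULE);
}

static void __exit my_arch_exit(void)
{
	kvm_exit();
}

module_init(my_arch_init);
module_exit(my_arch_exit);
MODULE_LICENSE("GPL");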