summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile5
-rw-r--r--drivers/acpi/processor_idle.c34
-rw-r--r--drivers/base/bus.c41
-rw-r--r--drivers/base/class.c4
-rw-r--r--drivers/base/core.c30
-rw-r--r--drivers/block/cciss.c1
-rw-r--r--drivers/block/xsysace.c6
-rw-r--r--drivers/char/agp/ali-agp.c2
-rw-r--r--drivers/char/agp/backend.c3
-rw-r--r--drivers/char/agp/generic.c3
-rw-r--r--drivers/char/agp/i460-agp.c2
-rw-r--r--drivers/char/agp/intel-agp.c11
-rw-r--r--drivers/char/hpet.c126
-rw-r--r--drivers/char/rtc.c253
-rw-r--r--drivers/cpufreq/cpufreq.c2
-rw-r--r--drivers/firmware/dmi_scan.c26
-rw-r--r--drivers/ieee1394/Makefile1
-rw-r--r--drivers/ieee1394/init_ohci1394_dma.c285
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c1
-rw-r--r--drivers/input/mouse/pc110pad.c7
-rw-r--r--drivers/kvm/Kconfig54
-rw-r--r--drivers/kvm/Makefile10
-rw-r--r--drivers/kvm/i8259.c450
-rw-r--r--drivers/kvm/ioapic.c388
-rw-r--r--drivers/kvm/irq.c98
-rw-r--r--drivers/kvm/irq.h165
-rw-r--r--drivers/kvm/kvm.h796
-rw-r--r--drivers/kvm/kvm_main.c3628
-rw-r--r--drivers/kvm/kvm_svm.h45
-rw-r--r--drivers/kvm/lapic.c1080
-rw-r--r--drivers/kvm/mmu.c1498
-rw-r--r--drivers/kvm/paging_tmpl.h511
-rw-r--r--drivers/kvm/segment_descriptor.h17
-rw-r--r--drivers/kvm/svm.c1754
-rw-r--r--drivers/kvm/svm.h324
-rw-r--r--drivers/kvm/vmx.c2566
-rw-r--r--drivers/kvm/vmx.h310
-rw-r--r--drivers/kvm/x86_emulate.c1662
-rw-r--r--drivers/kvm/x86_emulate.h155
-rw-r--r--drivers/lguest/core.c46
-rw-r--r--drivers/lguest/hypercalls.c106
-rw-r--r--drivers/lguest/interrupts_and_traps.c149
-rw-r--r--drivers/lguest/lg.h154
-rw-r--r--drivers/lguest/lguest_user.c147
-rw-r--r--drivers/lguest/page_tables.c179
-rw-r--r--drivers/lguest/segments.c48
-rw-r--r--drivers/lguest/x86/core.c129
-rw-r--r--drivers/net/Kconfig3
-rw-r--r--drivers/net/e1000/e1000_main.c33
-rw-r--r--drivers/pnp/pnpbios/bioscalls.c5
-rw-r--r--drivers/s390/scsi/zfcp_fsf.c4
-rw-r--r--drivers/scsi/3w-9xxx.c1
-rw-r--r--drivers/scsi/3w-xxxx.c1
-rw-r--r--drivers/scsi/BusLogic.c1
-rw-r--r--drivers/scsi/Kconfig2
-rw-r--r--drivers/scsi/NCR53c406a.c1
-rw-r--r--drivers/scsi/a100u2w.c1
-rw-r--r--drivers/scsi/aacraid/commctrl.c29
-rw-r--r--drivers/scsi/aacraid/linit.c1
-rw-r--r--drivers/scsi/aha1740.c1
-rw-r--r--drivers/scsi/aic7xxx/aic79xx.h5
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_core.c2
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_osm.c3
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_osm_pci.c33
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_pci.c2
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx.h4
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_core.c3
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_osm.c10
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_osm_pci.c33
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_pci.c2
-rw-r--r--drivers/scsi/aic7xxx_old.c1
-rw-r--r--drivers/scsi/arcmsr/arcmsr_hba.c1
-rw-r--r--drivers/scsi/dc395x.c1
-rw-r--r--drivers/scsi/dpt_i2o.c1
-rw-r--r--drivers/scsi/eata.c1
-rw-r--r--drivers/scsi/hosts.c1
-rw-r--r--drivers/scsi/hptiop.c3
-rw-r--r--drivers/scsi/ibmmca.c1
-rw-r--r--drivers/scsi/ibmvscsi/ibmvscsi.c1
-rw-r--r--drivers/scsi/initio.c1
-rw-r--r--drivers/scsi/iscsi_tcp.c1
-rw-r--r--drivers/scsi/libsrp.c4
-rw-r--r--drivers/scsi/lpfc/lpfc_scsi.c2
-rw-r--r--drivers/scsi/mac53c94.c1
-rw-r--r--drivers/scsi/megaraid.c1
-rw-r--r--drivers/scsi/megaraid/megaraid_mbox.c1
-rw-r--r--drivers/scsi/megaraid/megaraid_sas.c1
-rw-r--r--drivers/scsi/mesh.c1
-rw-r--r--drivers/scsi/ncr53c8xx.c2
-rw-r--r--drivers/scsi/nsp32.c1
-rw-r--r--drivers/scsi/pcmcia/sym53c500_cs.c1
-rw-r--r--drivers/scsi/qla1280.c1
-rw-r--r--drivers/scsi/qla2xxx/qla_os.c2
-rw-r--r--drivers/scsi/qla4xxx/ql4_os.c1
-rw-r--r--drivers/scsi/qlogicfas.c1
-rw-r--r--drivers/scsi/scsi.c2
-rw-r--r--drivers/scsi/scsi_debug.c174
-rw-r--r--drivers/scsi/scsi_error.c33
-rw-r--r--drivers/scsi/scsi_lib.c274
-rw-r--r--drivers/scsi/scsi_tgt_lib.c28
-rw-r--r--drivers/scsi/sd.c4
-rw-r--r--drivers/scsi/sgiwd93.c64
-rw-r--r--drivers/scsi/sr.c25
-rw-r--r--drivers/scsi/stex.c1
-rw-r--r--drivers/scsi/sym53c416.c1
-rw-r--r--drivers/scsi/sym53c8xx_2/sym_glue.c3
-rw-r--r--drivers/scsi/u14-34f.c1
-rw-r--r--drivers/scsi/ultrastor.c1
-rw-r--r--drivers/scsi/wd7000.c1
-rw-r--r--drivers/usb/storage/isd200.c8
-rw-r--r--drivers/video/vermilion/vermilion.c15
112 files changed, 1569 insertions, 16593 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e990..08d4ae20159 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
source "drivers/auxdisplay/Kconfig"
-source "drivers/kvm/Kconfig"
-
source "drivers/uio/Kconfig"
source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 8cb37e3557d..0ee9a8a4095 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -38,7 +38,7 @@ obj-$(CONFIG_SCSI) += scsi/
obj-$(CONFIG_ATA) += ata/
obj-$(CONFIG_FUSION) += message/
obj-$(CONFIG_FIREWIRE) += firewire/
-obj-$(CONFIG_IEEE1394) += ieee1394/
+obj-y += ieee1394/
obj-$(CONFIG_UIO) += uio/
obj-y += cdrom/
obj-y += auxdisplay/
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
obj-$(CONFIG_PCCARD) += pcmcia/
obj-$(CONFIG_DIO) += dio/
obj-$(CONFIG_SBUS) += sbus/
-obj-$(CONFIG_KVM) += kvm/
obj-$(CONFIG_ZORRO) += zorro/
obj-$(CONFIG_MAC) += macintosh/
obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
@@ -73,7 +72,7 @@ obj-$(CONFIG_ISDN) += isdn/
obj-$(CONFIG_EDAC) += edac/
obj-$(CONFIG_MCA) += mca/
obj-$(CONFIG_EISA) += eisa/
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
+obj-y += lguest/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_CPU_IDLE) += cpuidle/
obj-$(CONFIG_MMC) += mmc/
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 2235f4e02d2..eb1f82f7915 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -357,6 +357,26 @@ int acpi_processor_resume(struct acpi_device * device)
return 0;
}
+#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
+static int tsc_halts_in_c(int state)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ /*
+ * AMD Fam10h TSC will tick in all
+ * C/P/S0/S1 states when this bit is set.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return 0;
+ /*FALL THROUGH*/
+ case X86_VENDOR_INTEL:
+ /* Several cases known where TSC halts in C2 too */
+ default:
+ return state > ACPI_STATE_C1;
+ }
+}
+#endif
+
#ifndef CONFIG_CPU_IDLE
static void acpi_processor_idle(void)
{
@@ -516,7 +536,8 @@ static void acpi_processor_idle(void)
#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
/* TSC halts in C2, so notify users */
- mark_tsc_unstable("possible TSC halt in C2");
+ if (tsc_halts_in_c(ACPI_STATE_C2))
+ mark_tsc_unstable("possible TSC halt in C2");
#endif
/* Compute time (ticks) that we were actually asleep */
sleep_ticks = ticks_elapsed(t1, t2);
@@ -534,6 +555,7 @@ static void acpi_processor_idle(void)
break;
case ACPI_STATE_C3:
+ acpi_unlazy_tlb(smp_processor_id());
/*
* Must be done before busmaster disable as we might
* need to access HPET !
@@ -579,7 +601,8 @@ static void acpi_processor_idle(void)
#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
/* TSC halts in C3, so notify users */
- mark_tsc_unstable("TSC halts in C3");
+ if (tsc_halts_in_c(ACPI_STATE_C3))
+ mark_tsc_unstable("TSC halts in C3");
#endif
/* Compute time (ticks) that we were actually asleep */
sleep_ticks = ticks_elapsed(t1, t2);
@@ -1423,6 +1446,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
return 0;
}
+ acpi_unlazy_tlb(smp_processor_id());
/*
* Must be done before busmaster disable as we might need to
* access HPET !
@@ -1443,7 +1467,8 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
/* TSC could halt in idle, so notify users */
- mark_tsc_unstable("TSC halts in idle");;
+ if (tsc_halts_in_c(cx->type))
+ mark_tsc_unstable("TSC halts in idle");;
#endif
sleep_ticks = ticks_elapsed(t1, t2);
@@ -1554,7 +1579,8 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
/* TSC could halt in idle, so notify users */
- mark_tsc_unstable("TSC halts in idle");
+ if (tsc_halts_in_c(ACPI_STATE_C3))
+ mark_tsc_unstable("TSC halts in idle");
#endif
sleep_ticks = ticks_elapsed(t1, t2);
/* Tell the scheduler how much we idled: */
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f484495b2ad..055989e9479 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -163,15 +163,6 @@ static struct kset *bus_kset;
#ifdef CONFIG_HOTPLUG
/* Manually detach a device from its associated driver. */
-static int driver_helper(struct device *dev, void *data)
-{
- const char *name = data;
-
- if (strcmp(name, dev->bus_id) == 0)
- return 1;
- return 0;
-}
-
static ssize_t driver_unbind(struct device_driver *drv,
const char *buf, size_t count)
{
@@ -179,7 +170,7 @@ static ssize_t driver_unbind(struct device_driver *drv,
struct device *dev;
int err = -ENODEV;
- dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+ dev = bus_find_device_by_name(bus, NULL, buf);
if (dev && dev->driver == drv) {
if (dev->parent) /* Needed for USB */
down(&dev->parent->sem);
@@ -206,7 +197,7 @@ static ssize_t driver_bind(struct device_driver *drv,
struct device *dev;
int err = -ENODEV;
- dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+ dev = bus_find_device_by_name(bus, NULL, buf);
if (dev && dev->driver == NULL) {
if (dev->parent) /* Needed for USB */
down(&dev->parent->sem);
@@ -250,7 +241,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus,
{
struct device *dev;
- dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+ dev = bus_find_device_by_name(bus, NULL, buf);
if (!dev)
return -ENODEV;
if (bus_rescan_devices_helper(dev, NULL) != 0)
@@ -338,6 +329,32 @@ struct device *bus_find_device(struct bus_type *bus,
}
EXPORT_SYMBOL_GPL(bus_find_device);
+static int match_name(struct device *dev, void *data)
+{
+ const char *name = data;
+
+ if (strcmp(name, dev->bus_id) == 0)
+ return 1;
+ return 0;
+}
+
+/**
+ * bus_find_device_by_name - device iterator for locating a particular device of a specific name
+ * @bus: bus type
+ * @start: Device to begin with
+ * @name: name of the device to match
+ *
+ * This is similar to the bus_find_device() function above, but it handles
+ * searching by a name automatically, no need to write another strcmp matching
+ * function.
+ */
+struct device *bus_find_device_by_name(struct bus_type *bus,
+ struct device *start, const char *name)
+{
+ return bus_find_device(bus, start, (void *)name, match_name);
+}
+EXPORT_SYMBOL_GPL(bus_find_device_by_name);
+
static struct device_driver *next_driver(struct klist_iter *i)
{
struct klist_node *n = klist_next(i);
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 59cf35894cf..9d915376c31 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -149,7 +149,7 @@ int class_register(struct class *cls)
if (error)
return error;
-#ifdef CONFIG_SYSFS_DEPRECATED
+#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
/* let the block class directory show up in the root of sysfs */
if (cls != &block_class)
cls->subsys.kobj.kset = class_kset;
@@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device);
* The callback should return 0 if the device doesn't match and non-zero
* if it does. If the callback returns non-zero, this function will
* return to the caller and not iterate over any more devices.
-
+ *
* Note, you will need to drop the reference with put_device() after use.
*
* We hold class->sem in this function, so it can not be
diff --git a/drivers/base/core.c b/drivers/base/core.c
index edf3bbeb8d6..b1727876182 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,9 +27,17 @@
int (*platform_notify)(struct device *dev) = NULL;
int (*platform_notify_remove)(struct device *dev) = NULL;
-/*
- * sysfs bindings for devices.
- */
+#ifdef CONFIG_BLOCK
+static inline int device_is_not_partition(struct device *dev)
+{
+ return !(dev->type == &part_type);
+}
+#else
+static inline int device_is_not_partition(struct device *dev)
+{
+ return 1;
+}
+#endif
/**
* dev_driver_string - Return a device's driver name, if at all possible
@@ -652,14 +660,14 @@ static int device_add_class_symlinks(struct device *dev)
#ifdef CONFIG_SYSFS_DEPRECATED
/* stacked class devices need a symlink in the class directory */
if (dev->kobj.parent != &dev->class->subsys.kobj &&
- dev->type != &part_type) {
+ device_is_not_partition(dev)) {
error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj,
dev->bus_id);
if (error)
goto out_subsys;
}
- if (dev->parent && dev->type != &part_type) {
+ if (dev->parent && device_is_not_partition(dev)) {
struct device *parent = dev->parent;
char *class_name;
@@ -688,11 +696,11 @@ static int device_add_class_symlinks(struct device *dev)
return 0;
out_device:
- if (dev->parent && dev->type != &part_type)
+ if (dev->parent && device_is_not_partition(dev))
sysfs_remove_link(&dev->kobj, "device");
out_busid:
if (dev->kobj.parent != &dev->class->subsys.kobj &&
- dev->type != &part_type)
+ device_is_not_partition(dev))
sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
#else
/* link in the class directory pointing to the device */
@@ -701,7 +709,7 @@ out_busid:
if (error)
goto out_subsys;
- if (dev->parent && dev->type != &part_type) {
+ if (dev->parent && device_is_not_partition(dev)) {
error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
"device");
if (error)
@@ -725,7 +733,7 @@ static void device_remove_class_symlinks(struct device *dev)
return;
#ifdef CONFIG_SYSFS_DEPRECATED
- if (dev->parent && dev->type != &part_type) {
+ if (dev->parent && device_is_not_partition(dev)) {
char *class_name;
class_name = make_class_name(dev->class->name, &dev->kobj);
@@ -737,10 +745,10 @@ static void device_remove_class_symlinks(struct device *dev)
}
if (dev->kobj.parent != &dev->class->subsys.kobj &&
- dev->type != &part_type)
+ device_is_not_partition(dev))
sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
#else
- if (dev->parent && dev->type != &part_type)
+ if (dev->parent && device_is_not_partition(dev))
sysfs_remove_link(&dev->kobj, "device");
sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index ef50068def8..855ce8e5efb 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2524,7 +2524,6 @@ after_error_processing:
resend_cciss_cmd(h, cmd);
return;
}
- cmd->rq->data_len = 0;
cmd->rq->completion_data = cmd;
blk_complete_request(cmd->rq);
}
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 2c81465fd60..78ebfffc77e 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -483,7 +483,6 @@ static void ace_fsm_dostate(struct ace_device *ace)
u32 status;
u16 val;
int count;
- int i;
#if defined(DEBUG)
dev_dbg(ace->dev, "fsm_state=%i, id_req_count=%i\n",
@@ -688,7 +687,6 @@ static void ace_fsm_dostate(struct ace_device *ace)
}
/* Transfer the next buffer */
- i = 16;
if (ace->fsm_task == ACE_TASK_WRITE)
ace->reg_ops->dataout(ace);
else
@@ -702,8 +700,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
}
/* bio finished; is there another one? */
- i = ace->req->current_nr_sectors;
- if (__blk_end_request(ace->req, 0, i)) {
+ if (__blk_end_request(ace->req, 0,
+ blk_rq_cur_bytes(ace->req))) {
/* dev_dbg(ace->dev, "next block; h=%li c=%i\n",
* ace->req->hard_nr_sectors,
* ace->req->current_nr_sectors);
diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c
index aa5ddb716ff..1ffb381130c 100644
--- a/drivers/char/agp/ali-agp.c
+++ b/drivers/char/agp/ali-agp.c
@@ -145,7 +145,6 @@ static void *m1541_alloc_page(struct agp_bridge_data *bridge)
void *addr = agp_generic_alloc_page(agp_bridge);
u32 temp;
- global_flush_tlb();
if (!addr)
return NULL;
@@ -162,7 +161,6 @@ static void ali_destroy_page(void * addr, int flags)
if (flags & AGP_PAGE_DESTROY_UNMAP) {
global_cache_flush(); /* is this really needed? --hch */
agp_generic_destroy_page(addr, flags);
- global_flush_tlb();
} else
agp_generic_destroy_page(addr, flags);
}
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index 832ded20fe7..2720882e66f 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -147,7 +147,6 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge)
printk(KERN_ERR PFX "unable to get memory for scratch page.\n");
return -ENOMEM;
}
- flush_agp_mappings();
bridge->scratch_page_real = virt_to_gart(addr);
bridge->scratch_page =
@@ -191,7 +190,6 @@ err_out:
if (bridge->driver->needs_scratch_page) {
bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
AGP_PAGE_DESTROY_UNMAP);
- flush_agp_mappings();
bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
AGP_PAGE_DESTROY_FREE);
}
@@ -219,7 +217,6 @@ static void agp_backend_cleanup(struct agp_bridge_data *bridge)
bridge->driver->needs_scratch_page) {
bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
AGP_PAGE_DESTROY_UNMAP);
- flush_agp_mappings();
bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
AGP_PAGE_DESTROY_FREE);
}
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 64b2f6d7059..1a4674ce0c7 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -197,7 +197,6 @@ void agp_free_memory(struct agp_memory *curr)
for (i = 0; i < curr->page_count; i++) {
curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_UNMAP);
}
- flush_agp_mappings();
for (i = 0; i < curr->page_count; i++) {
curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_FREE);
}
@@ -267,8 +266,6 @@ struct agp_memory *agp_allocate_memory(struct agp_bridge_data *bridge,
}
new->bridge = bridge;
- flush_agp_mappings();
-
return new;
}
EXPORT_SYMBOL(agp_allocate_memory);
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
index e72a83e2bad..76f581c85a7 100644
--- a/drivers/char/agp/i460-agp.c
+++ b/drivers/char/agp/i460-agp.c
@@ -527,7 +527,6 @@ static void *i460_alloc_page (struct agp_bridge_data *bridge)
if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
page = agp_generic_alloc_page(agp_bridge);
- global_flush_tlb();
} else
/* Returning NULL would cause problems */
/* AK: really dubious code. */
@@ -539,7 +538,6 @@ static void i460_destroy_page (void *page, int flags)
{
if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
agp_generic_destroy_page(page, flags);
- global_flush_tlb();
}
}
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index 03eac1eb8e0..189efb6ef97 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -210,13 +210,11 @@ static void *i8xx_alloc_pages(void)
if (page == NULL)
return NULL;
- if (change_page_attr(page, 4, PAGE_KERNEL_NOCACHE) < 0) {
- change_page_attr(page, 4, PAGE_KERNEL);
- global_flush_tlb();
+ if (set_pages_uc(page, 4) < 0) {
+ set_pages_wb(page, 4);
__free_pages(page, 2);
return NULL;
}
- global_flush_tlb();
get_page(page);
atomic_inc(&agp_bridge->current_memory_agp);
return page_address(page);
@@ -230,8 +228,7 @@ static void i8xx_destroy_pages(void *addr)
return;
page = virt_to_page(addr);
- change_page_attr(page, 4, PAGE_KERNEL);
- global_flush_tlb();
+ set_pages_wb(page, 4);
put_page(page);
__free_pages(page, 2);
atomic_dec(&agp_bridge->current_memory_agp);
@@ -341,7 +338,6 @@ static struct agp_memory *alloc_agpphysmem_i8xx(size_t pg_count, int type)
switch (pg_count) {
case 1: addr = agp_bridge->driver->agp_alloc_page(agp_bridge);
- global_flush_tlb();
break;
case 4:
/* kludge to get 4 physical pages for ARGB cursor */
@@ -404,7 +400,6 @@ static void intel_i810_free_by_type(struct agp_memory *curr)
else {
agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]),
AGP_PAGE_DESTROY_UNMAP);
- global_flush_tlb();
agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]),
AGP_PAGE_DESTROY_FREE);
}
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 4c16778e3f8..465ad35ed38 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -600,63 +600,6 @@ static int hpet_is_known(struct hpet_data *hdp)
return 0;
}
-EXPORT_SYMBOL(hpet_alloc);
-EXPORT_SYMBOL(hpet_register);
-EXPORT_SYMBOL(hpet_unregister);
-EXPORT_SYMBOL(hpet_control);
-
-int hpet_register(struct hpet_task *tp, int periodic)
-{
- unsigned int i;
- u64 mask;
- struct hpet_timer __iomem *timer;
- struct hpet_dev *devp;
- struct hpets *hpetp;
-
- switch (periodic) {
- case 1:
- mask = Tn_PER_INT_CAP_MASK;
- break;
- case 0:
- mask = 0;
- break;
- default:
- return -EINVAL;
- }
-
- tp->ht_opaque = NULL;
-
- spin_lock_irq(&hpet_task_lock);
- spin_lock(&hpet_lock);
-
- for (devp = NULL, hpetp = hpets; hpetp && !devp; hpetp = hpetp->hp_next)
- for (timer = hpetp->hp_hpet->hpet_timers, i = 0;
- i < hpetp->hp_ntimer; i++, timer++) {
- if ((readq(&timer->hpet_config) & Tn_PER_INT_CAP_MASK)
- != mask)
- continue;
-
- devp = &hpetp->hp_dev[i];
-
- if (devp->hd_flags & HPET_OPEN || devp->hd_task) {
- devp = NULL;
- continue;
- }
-
- tp->ht_opaque = devp;
- devp->hd_task = tp;
- break;
- }
-
- spin_unlock(&hpet_lock);
- spin_unlock_irq(&hpet_task_lock);
-
- if (tp->ht_opaque)
- return 0;
- else
- return -EBUSY;
-}
-
static inline int hpet_tpcheck(struct hpet_task *tp)
{
struct hpet_dev *devp;
@@ -706,24 +649,6 @@ int hpet_unregister(struct hpet_task *tp)
return 0;
}
-int hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg)
-{
- struct hpet_dev *devp;
- int err;
-
- if ((err = hpet_tpcheck(tp)))
- return err;
-
- spin_lock_irq(&hpet_lock);
- devp = tp->ht_opaque;
- if (devp->hd_task != tp) {
- spin_unlock_irq(&hpet_lock);
- return -ENXIO;
- }
- spin_unlock_irq(&hpet_lock);
- return hpet_ioctl_common(devp, cmd, arg, 1);
-}
-
static ctl_table hpet_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
@@ -806,14 +731,14 @@ static unsigned long hpet_calibrate(struct hpets *hpetp)
int hpet_alloc(struct hpet_data *hdp)
{
- u64 cap, mcfg;
+ u64 cap, mcfg, hpet_config;
struct hpet_dev *devp;
- u32 i, ntimer;
+ u32 i, ntimer, irq;
struct hpets *hpetp;
size_t siz;
struct hpet __iomem *hpet;
static struct hpets *last = NULL;
- unsigned long period;
+ unsigned long period, irq_bitmap;
unsigned long long temp;
/*
@@ -840,11 +765,47 @@ int hpet_alloc(struct hpet_data *hdp)
hpetp->hp_hpet_phys = hdp->hd_phys_address;
hpetp->hp_ntimer = hdp->hd_nirqs;
+ hpet = hpetp->hp_hpet;
- for (i = 0; i < hdp->hd_nirqs; i++)
- hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i];
+ /* Assign IRQs statically for legacy devices */
+ hpetp->hp_dev[0].hd_hdwirq = hdp->hd_irq[0];
+ hpetp->hp_dev[1].hd_hdwirq = hdp->hd_irq[1];
- hpet = hpetp->hp_hpet;
+ /* Assign IRQs dynamically for the others */
+ for (i = 2, devp = &hpetp->hp_dev[2]; i < hdp->hd_nirqs; i++, devp++) {
+ struct hpet_timer __iomem *timer;
+
+ timer = &hpet->hpet_timers[devp - hpetp->hp_dev];
+
+ /* Check if there's already an IRQ assigned to the timer */
+ if (hdp->hd_irq[i]) {
+ hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i];
+ continue;
+ }
+
+ hpet_config = readq(&timer->hpet_config);
+ irq_bitmap = (hpet_config & Tn_INT_ROUTE_CAP_MASK)
+ >> Tn_INT_ROUTE_CAP_SHIFT;
+ if (!irq_bitmap)
+ irq = 0; /* No valid IRQ Assignable */
+ else {
+ irq = find_first_bit(&irq_bitmap, 32);
+ do {
+ hpet_config |= irq << Tn_INT_ROUTE_CNF_SHIFT;
+ writeq(hpet_config, &timer->hpet_config);
+
+ /*
+ * Verify whether we have written a valid
+ * IRQ number by reading it back again
+ */
+ hpet_config = readq(&timer->hpet_config);
+ if (irq == (hpet_config & Tn_INT_ROUTE_CNF_MASK)
+ >> Tn_INT_ROUTE_CNF_SHIFT)
+ break; /* Success */
+ } while ((irq = (find_next_bit(&irq_bitmap, 32, irq))));
+ }
+ hpetp->hp_dev[i].hd_hdwirq = irq;
+ }
cap = readq(&hpet->hpet_cap);
@@ -875,7 +836,8 @@ int hpet_alloc(struct hpet_data *hdp)
hpetp->hp_which, hdp->hd_phys_address,
hpetp->hp_ntimer > 1 ? "s" : "");
for (i = 0; i < hpetp->hp_ntimer; i++)
- printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]);
+ printk("%s %d", i > 0 ? "," : "",
+ hpetp->hp_dev[i].hd_hdwirq);
printk("\n");
printk(KERN_INFO "hpet%u: %u %d-bit timers, %Lu Hz\n",
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 0c66b802736..78b151c4d20 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -1,5 +1,5 @@
/*
- * Real Time Clock interface for Linux
+ * Real Time Clock interface for Linux
*
* Copyright (C) 1996 Paul Gortmaker
*
@@ -17,7 +17,7 @@
* has been received. If a RTC interrupt has already happened,
* it will output an unsigned long and then block. The output value
* contains the interrupt status in the low byte and the number of
- * interrupts since the last read in the remaining high bytes. The
+ * interrupts since the last read in the remaining high bytes. The
* /dev/rtc interface can also be used with the select(2) call.
*
* This program is free software; you can redistribute it and/or
@@ -104,12 +104,14 @@ static int rtc_has_irq = 1;
#ifndef CONFIG_HPET_EMULATE_RTC
#define is_hpet_enabled() 0
-#define hpet_set_alarm_time(hrs, min, sec) 0
-#define hpet_set_periodic_freq(arg) 0
-#define hpet_mask_rtc_irq_bit(arg) 0
-#define hpet_set_rtc_irq_bit(arg) 0
-#define hpet_rtc_timer_init() do { } while (0)
-#define hpet_rtc_dropped_irq() 0
+#define hpet_set_alarm_time(hrs, min, sec) 0
+#define hpet_set_periodic_freq(arg) 0
+#define hpet_mask_rtc_irq_bit(arg) 0
+#define hpet_set_rtc_irq_bit(arg) 0
+#define hpet_rtc_timer_init() do { } while (0)
+#define hpet_rtc_dropped_irq() 0
+#define hpet_register_irq_handler(h) 0
+#define hpet_unregister_irq_handler(h) 0
#ifdef RTC_IRQ
static irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
{
@@ -147,7 +149,7 @@ static int rtc_ioctl(struct inode *inode, struct file *file,
static unsigned int rtc_poll(struct file *file, poll_table *wait);
#endif
-static void get_rtc_alm_time (struct rtc_time *alm_tm);
+static void get_rtc_alm_time(struct rtc_time *alm_tm);
#ifdef RTC_IRQ
static void set_rtc_irq_bit_locked(unsigned char bit);
static void mask_rtc_irq_bit_locked(unsigned char bit);
@@ -185,9 +187,9 @@ static int rtc_proc_open(struct inode *inode, struct file *file);
* rtc_status but before mod_timer is called, which would then reenable the
* timer (but you would need to have an awful timing before you'd trip on it)
*/
-static unsigned long rtc_status = 0; /* bitmapped status byte. */
-static unsigned long rtc_freq = 0; /* Current periodic IRQ rate */
-static unsigned long rtc_irq_data = 0; /* our output to the world */
+static unsigned long rtc_status; /* bitmapped status byte. */
+static unsigned long rtc_freq; /* Current periodic IRQ rate */
+static unsigned long rtc_irq_data; /* our output to the world */
static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */
#ifdef RTC_IRQ
@@ -195,7 +197,7 @@ static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */
* rtc_task_lock nests inside rtc_lock.
*/
static DEFINE_SPINLOCK(rtc_task_lock);
-static rtc_task_t *rtc_callback = NULL;
+static rtc_task_t *rtc_callback;
#endif
/*
@@ -205,7 +207,7 @@ static rtc_task_t *rtc_callback = NULL;
static unsigned long epoch = 1900; /* year corresponding to 0x00 */
-static const unsigned char days_in_mo[] =
+static const unsigned char days_in_mo[] =
{0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
/*
@@ -242,7 +244,7 @@ irqreturn_t rtc_interrupt(int irq, void *dev_id)
* the last read in the remainder of rtc_irq_data.
*/
- spin_lock (&rtc_lock);
+ spin_lock(&rtc_lock);
rtc_irq_data += 0x100;
rtc_irq_data &= ~0xff;
if (is_hpet_enabled()) {
@@ -259,16 +261,16 @@ irqreturn_t rtc_interrupt(int irq, void *dev_id)
if (rtc_status & RTC_TIMER_ON)
mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100);
- spin_unlock (&rtc_lock);
+ spin_unlock(&rtc_lock);
/* Now do the rest of the actions */
spin_lock(&rtc_task_lock);
if (rtc_callback)
rtc_callback->func(rtc_callback->private_data);
spin_unlock(&rtc_task_lock);
- wake_up_interruptible(&rtc_wait);
+ wake_up_interruptible(&rtc_wait);
- kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+ kill_fasync(&rtc_async_queue, SIGIO, POLL_IN);
return IRQ_HANDLED;
}
@@ -335,7 +337,7 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
DECLARE_WAITQUEUE(wait, current);
unsigned long data;
ssize_t retval;
-
+
if (rtc_has_irq == 0)
return -EIO;
@@ -358,11 +360,11 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
* confusing. And no, xchg() is not the answer. */
__set_current_state(TASK_INTERRUPTIBLE);
-
- spin_lock_irq (&rtc_lock);
+
+ spin_lock_irq(&rtc_lock);
data = rtc_irq_data;
rtc_irq_data = 0;
- spin_unlock_irq (&rtc_lock);
+ spin_unlock_irq(&rtc_lock);
if (data != 0)
break;
@@ -378,10 +380,13 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
schedule();
} while (1);
- if (count == sizeof(unsigned int))
- retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int);
- else
- retval = put_user(data, (unsigned long __user *)buf) ?: sizeof(long);
+ if (count == sizeof(unsigned int)) {
+ retval = put_user(data,
+ (unsigned int __user *)buf) ?: sizeof(int);
+ } else {
+ retval = put_user(data,
+ (unsigned long __user *)buf) ?: sizeof(long);
+ }
if (!retval)
retval = count;
out:
@@ -394,7 +399,7 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
{
- struct rtc_time wtime;
+ struct rtc_time wtime;
#ifdef RTC_IRQ
if (rtc_has_irq == 0) {
@@ -426,35 +431,41 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
}
case RTC_PIE_OFF: /* Mask periodic int. enab. bit */
{
- unsigned long flags; /* can be called from isr via rtc_control() */
- spin_lock_irqsave (&rtc_lock, flags);
+ /* can be called from isr via rtc_control() */
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
mask_rtc_irq_bit_locked(RTC_PIE);
if (rtc_status & RTC_TIMER_ON) {
rtc_status &= ~RTC_TIMER_ON;
del_timer(&rtc_irq_timer);
}
- spin_unlock_irqrestore (&rtc_lock, flags);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
return 0;
}
case RTC_PIE_ON: /* Allow periodic ints */
{
- unsigned long flags; /* can be called from isr via rtc_control() */
+ /* can be called from isr via rtc_control() */
+ unsigned long flags;
+
/*
* We don't really want Joe User enabling more
* than 64Hz of interrupts on a multi-user machine.
*/
if (!kernel && (rtc_freq > rtc_max_user_freq) &&
- (!capable(CAP_SYS_RESOURCE)))
+ (!capable(CAP_SYS_RESOURCE)))
return -EACCES;
- spin_lock_irqsave (&rtc_lock, flags);
+ spin_lock_irqsave(&rtc_lock, flags);
if (!(rtc_status & RTC_TIMER_ON)) {
mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq +
2*HZ/100);
rtc_status |= RTC_TIMER_ON;
}
set_rtc_irq_bit_locked(RTC_PIE);
- spin_unlock_irqrestore (&rtc_lock, flags);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
return 0;
}
case RTC_UIE_OFF: /* Mask ints from RTC updates. */
@@ -477,7 +488,7 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
*/
memset(&wtime, 0, sizeof(struct rtc_time));
get_rtc_alm_time(&wtime);
- break;
+ break;
}
case RTC_ALM_SET: /* Store a time into the alarm */
{
@@ -505,16 +516,21 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
*/
}
if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) ||
- RTC_ALWAYS_BCD)
- {
- if (sec < 60) BIN_TO_BCD(sec);
- else sec = 0xff;
-
- if (min < 60) BIN_TO_BCD(min);
- else min = 0xff;
-
- if (hrs < 24) BIN_TO_BCD(hrs);
- else hrs = 0xff;
+ RTC_ALWAYS_BCD) {
+ if (sec < 60)
+ BIN_TO_BCD(sec);
+ else
+ sec = 0xff;
+
+ if (min < 60)
+ BIN_TO_BCD(min);
+ else
+ min = 0xff;
+
+ if (hrs < 24)
+ BIN_TO_BCD(hrs);
+ else
+ hrs = 0xff;
}
CMOS_WRITE(hrs, RTC_HOURS_ALARM);
CMOS_WRITE(min, RTC_MINUTES_ALARM);
@@ -563,11 +579,12 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
if (day > (days_in_mo[mon] + ((mon == 2) && leap_yr)))
return -EINVAL;
-
+
if ((hrs >= 24) || (min >= 60) || (sec >= 60))
return -EINVAL;
- if ((yrs -= epoch) > 255) /* They are unsigned */
+ yrs -= epoch;
+ if (yrs > 255) /* They are unsigned */
return -EINVAL;
spin_lock_irq(&rtc_lock);
@@ -635,9 +652,10 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
{
int tmp = 0;
unsigned char val;
- unsigned long flags; /* can be called from isr via rtc_control() */
+ /* can be called from isr via rtc_control() */
+ unsigned long flags;
- /*
+ /*
* The max we can do is 8192Hz.
*/
if ((arg < 2) || (arg > 8192))
@@ -646,7 +664,8 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
* We don't really want Joe User generating more
* than 64Hz of interrupts on a multi-user machine.
*/
- if (!kernel && (arg > rtc_max_user_freq) && (!capable(CAP_SYS_RESOURCE)))
+ if (!kernel && (arg > rtc_max_user_freq) &&
+ !capable(CAP_SYS_RESOURCE))
return -EACCES;
while (arg > (1<<tmp))
@@ -674,11 +693,11 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
#endif
case RTC_EPOCH_READ: /* Read the epoch. */
{
- return put_user (epoch, (unsigned long __user *)arg);
+ return put_user(epoch, (unsigned long __user *)arg);
}
case RTC_EPOCH_SET: /* Set the epoch. */
{
- /*
+ /*
* There were no RTC clocks before 1900.
*/
if (arg < 1900)
@@ -693,7 +712,8 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
default:
return -ENOTTY;
}
- return copy_to_user((void __user *)arg, &wtime, sizeof wtime) ? -EFAULT : 0;
+ return copy_to_user((void __user *)arg,
+ &wtime, sizeof wtime) ? -EFAULT : 0;
}
static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
@@ -712,26 +732,25 @@ static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
* needed here. Or anywhere else in this driver. */
static int rtc_open(struct inode *inode, struct file *file)
{
- spin_lock_irq (&rtc_lock);
+ spin_lock_irq(&rtc_lock);
- if(rtc_status & RTC_IS_OPEN)
+ if (rtc_status & RTC_IS_OPEN)
goto out_busy;
rtc_status |= RTC_IS_OPEN;
rtc_irq_data = 0;
- spin_unlock_irq (&rtc_lock);
+ spin_unlock_irq(&rtc_lock);
return 0;
out_busy:
- spin_unlock_irq (&rtc_lock);
+ spin_unlock_irq(&rtc_lock);
return -EBUSY;
}
-static int rtc_fasync (int fd, struct file *filp, int on)
-
+static int rtc_fasync(int fd, struct file *filp, int on)
{
- return fasync_helper (fd, filp, on, &rtc_async_queue);
+ return fasync_helper(fd, filp, on, &rtc_async_queue);
}
static int rtc_release(struct inode *inode, struct file *file)
@@ -762,16 +781,16 @@ static int rtc_release(struct inode *inode, struct file *file)
}
spin_unlock_irq(&rtc_lock);
- if (file->f_flags & FASYNC) {
- rtc_fasync (-1, file, 0);
- }
+ if (file->f_flags & FASYNC)
+ rtc_fasync(-1, file, 0);
no_irq:
#endif
- spin_lock_irq (&rtc_lock);
+ spin_lock_irq(&rtc_lock);
rtc_irq_data = 0;
rtc_status &= ~RTC_IS_OPEN;
- spin_unlock_irq (&rtc_lock);
+ spin_unlock_irq(&rtc_lock);
+
return 0;
}
@@ -786,9 +805,9 @@ static unsigned int rtc_poll(struct file *file, poll_table *wait)
poll_wait(file, &rtc_wait, wait);
- spin_lock_irq (&rtc_lock);
+ spin_lock_irq(&rtc_lock);
l = rtc_irq_data;
- spin_unlock_irq (&rtc_lock);
+ spin_unlock_irq(&rtc_lock);
if (l != 0)
return POLLIN | POLLRDNORM;
@@ -796,14 +815,6 @@ static unsigned int rtc_poll(struct file *file, poll_table *wait)
}
#endif
-/*
- * exported stuffs
- */
-
-EXPORT_SYMBOL(rtc_register);
-EXPORT_SYMBOL(rtc_unregister);
-EXPORT_SYMBOL(rtc_control);
-
int rtc_register(rtc_task_t *task)
{
#ifndef RTC_IRQ
@@ -829,6 +840,7 @@ int rtc_register(rtc_task_t *task)
return 0;
#endif
}
+EXPORT_SYMBOL(rtc_register);
int rtc_unregister(rtc_task_t *task)
{
@@ -845,7 +857,7 @@ int rtc_unregister(rtc_task_t *task)
return -ENXIO;
}
rtc_callback = NULL;
-
+
/* disable controls */
if (!hpet_mask_rtc_irq_bit(RTC_PIE | RTC_AIE | RTC_UIE)) {
tmp = CMOS_READ(RTC_CONTROL);
@@ -865,6 +877,7 @@ int rtc_unregister(rtc_task_t *task)
return 0;
#endif
}
+EXPORT_SYMBOL(rtc_unregister);
int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg)
{
@@ -883,7 +896,7 @@ int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg)
return rtc_do_ioctl(cmd, arg, 1);
#endif
}
-
+EXPORT_SYMBOL(rtc_control);
/*
* The various file operations we support.
@@ -910,11 +923,11 @@ static struct miscdevice rtc_dev = {
#ifdef CONFIG_PROC_FS
static const struct file_operations rtc_proc_fops = {
- .owner = THIS_MODULE,
- .open = rtc_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+ .owner = THIS_MODULE,
+ .open = rtc_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
};
#endif
@@ -965,7 +978,7 @@ static int __init rtc_init(void)
#ifdef CONFIG_SPARC32
for_each_ebus(ebus) {
for_each_ebusdev(edev, ebus) {
- if(strcmp(edev->prom_node->name, "rtc") == 0) {
+ if (strcmp(edev->prom_node->name, "rtc") == 0) {
rtc_port = edev->resource[0].start;
rtc_irq = edev->irqs[0];
goto found;
@@ -986,7 +999,8 @@ found:
* XXX Interrupt pin #7 in Espresso is shared between RTC and
* PCI Slot 2 INTA# (and some INTx# in Slot 1).
*/
- if (request_irq(rtc_irq, rtc_interrupt, IRQF_SHARED, "rtc", (void *)&rtc_port)) {
+ if (request_irq(rtc_irq, rtc_interrupt, IRQF_SHARED, "rtc",
+ (void *)&rtc_port)) {
rtc_has_irq = 0;
printk(KERN_ERR "rtc: cannot register IRQ %d\n", rtc_irq);
return -EIO;
@@ -1015,16 +1029,26 @@ no_irq:
#ifdef RTC_IRQ
if (is_hpet_enabled()) {
+ int err;
+
rtc_int_handler_ptr = hpet_rtc_interrupt;
+ err = hpet_register_irq_handler(rtc_interrupt);
+ if (err != 0) {
+ printk(KERN_WARNING "hpet_register_irq_handler failed "
+ "in rtc_init().");
+ return err;
+ }
} else {
rtc_int_handler_ptr = rtc_interrupt;
}
- if(request_irq(RTC_IRQ, rtc_int_handler_ptr, IRQF_DISABLED, "rtc", NULL)) {
+ if (request_irq(RTC_IRQ, rtc_int_handler_ptr, IRQF_DISABLED,
+ "rtc", NULL)) {
/* Yeah right, seeing as irq 8 doesn't even hit the bus. */
rtc_has_irq = 0;
printk(KERN_ERR "rtc: IRQ %d is not free.\n", RTC_IRQ);
rtc_release_region();
+
return -EIO;
}
hpet_rtc_timer_init();
@@ -1036,6 +1060,7 @@ no_irq:
if (misc_register(&rtc_dev)) {
#ifdef RTC_IRQ
free_irq(RTC_IRQ, NULL);
+ hpet_unregister_irq_handler(rtc_interrupt);
rtc_has_irq = 0;
#endif
rtc_release_region();
@@ -1052,21 +1077,21 @@ no_irq:
#if defined(__alpha__) || defined(__mips__)
rtc_freq = HZ;
-
+
/* Each operating system on an Alpha uses its own epoch.
Let's try to guess which one we are using now. */
-
+
if (rtc_is_updating() != 0)
msleep(20);
-
+
spin_lock_irq(&rtc_lock);
year = CMOS_READ(RTC_YEAR);
ctrl = CMOS_READ(RTC_CONTROL);
spin_unlock_irq(&rtc_lock);
-
+
if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
BCD_TO_BIN(year); /* This should never happen... */
-
+
if (year < 20) {
epoch = 2000;
guess = "SRM (post-2000)";
@@ -1087,7 +1112,8 @@ no_irq:
#endif
}
if (guess)
- printk(KERN_INFO "rtc: %s epoch (%lu) detected\n", guess, epoch);
+ printk(KERN_INFO "rtc: %s epoch (%lu) detected\n",
+ guess, epoch);
#endif
#ifdef RTC_IRQ
if (rtc_has_irq == 0)
@@ -1096,8 +1122,12 @@ no_irq:
spin_lock_irq(&rtc_lock);
rtc_freq = 1024;
if (!hpet_set_periodic_freq(rtc_freq)) {
- /* Initialize periodic freq. to CMOS reset default, which is 1024Hz */
- CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06), RTC_FREQ_SELECT);
+ /*
+ * Initialize periodic frequency to CMOS reset default,
+ * which is 1024Hz
+ */
+ CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06),
+ RTC_FREQ_SELECT);
}
spin_unlock_irq(&rtc_lock);
no_irq2:
@@ -1110,20 +1140,22 @@ no_irq2:
return 0;
}
-static void __exit rtc_exit (void)
+static void __exit rtc_exit(void)
{
cleanup_sysctl();
- remove_proc_entry ("driver/rtc", NULL);
+ remove_proc_entry("driver/rtc", NULL);
misc_deregister(&rtc_dev);
#ifdef CONFIG_SPARC32
if (rtc_has_irq)
- free_irq (rtc_irq, &rtc_port);
+ free_irq(rtc_irq, &rtc_port);
#else
rtc_release_region();
#ifdef RTC_IRQ
- if (rtc_has_irq)
- free_irq (RTC_IRQ, NULL);
+ if (rtc_has_irq) {
+ free_irq(RTC_IRQ, NULL);
+ hpet_unregister_irq_handler(hpet_rtc_interrupt);
+ }
#endif
#endif /* CONFIG_SPARC32 */
}
@@ -1133,14 +1165,14 @@ module_exit(rtc_exit);
#ifdef RTC_IRQ
/*
- * At IRQ rates >= 4096Hz, an interrupt may get lost altogether.
+ * At IRQ rates >= 4096Hz, an interrupt may get lost altogether.
* (usually during an IDE disk interrupt, with IRQ unmasking off)
* Since the interrupt handler doesn't get called, the IRQ status
* byte doesn't get read, and the RTC stops generating interrupts.
* A timer is set, and will call this function if/when that happens.
* To get it out of this stalled state, we just read the status.
* At least a jiffy of interrupts (rtc_freq/HZ) will have been lost.
- * (You *really* shouldn't be trying to use a non-realtime system
+ * (You *really* shouldn't be trying to use a non-realtime system
* for something that requires a steady > 1KHz signal anyways.)
*/
@@ -1148,7 +1180,7 @@ static void rtc_dropped_irq(unsigned long data)
{
unsigned long freq;
- spin_lock_irq (&rtc_lock);
+ spin_lock_irq(&rtc_lock);
if (hpet_rtc_dropped_irq()) {
spin_unlock_irq(&rtc_lock);
@@ -1167,13 +1199,15 @@ static void rtc_dropped_irq(unsigned long data)
spin_unlock_irq(&rtc_lock);
- if (printk_ratelimit())
- printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq);
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+ freq);
+ }
/* Now we have new data */
wake_up_interruptible(&rtc_wait);
- kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+ kill_fasync(&rtc_async_queue, SIGIO, POLL_IN);
}
#endif
@@ -1277,7 +1311,7 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
* can take just over 2ms. We wait 20ms. There is no need to
* to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP.
* If you need to know *exactly* when a second has started, enable
- * periodic update complete interrupts, (via ioctl) and then
+ * periodic update complete interrupts, (via ioctl) and then
* immediately read /dev/rtc which will block until you get the IRQ.
* Once the read clears, read the RTC time (again via ioctl). Easy.
*/
@@ -1307,8 +1341,7 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
ctrl = CMOS_READ(RTC_CONTROL);
spin_unlock_irqrestore(&rtc_lock, flags);
- if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- {
+ if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
BCD_TO_BIN(rtc_tm->tm_sec);
BCD_TO_BIN(rtc_tm->tm_min);
BCD_TO_BIN(rtc_tm->tm_hour);
@@ -1326,7 +1359,8 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
* Account for differences between how the RTC uses the values
* and how they are defined in a struct rtc_time;
*/
- if ((rtc_tm->tm_year += (epoch - 1900)) <= 69)
+ rtc_tm->tm_year += epoch - 1900;
+ if (rtc_tm->tm_year <= 69)
rtc_tm->tm_year += 100;
rtc_tm->tm_mon--;
@@ -1347,8 +1381,7 @@ static void get_rtc_alm_time(struct rtc_time *alm_tm)
ctrl = CMOS_READ(RTC_CONTROL);
spin_unlock_irq(&rtc_lock);
- if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- {
+ if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
BCD_TO_BIN(alm_tm->tm_sec);
BCD_TO_BIN(alm_tm->tm_min);
BCD_TO_BIN(alm_tm->tm_hour);
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 5efd5550f4c..b730d670952 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1604,7 +1604,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data,
memcpy(&policy->cpuinfo, &data->cpuinfo,
sizeof(struct cpufreq_cpuinfo));
- if (policy->min > data->min && policy->min > policy->max) {
+ if (policy->min > data->max || policy->max < data->min) {
ret = -EINVAL;
goto error_out;
}
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 5e596a7e360..9008ed5ef4c 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -8,6 +8,8 @@
#include <linux/slab.h>
#include <asm/dmi.h>
+static char dmi_empty_string[] = " ";
+
static char * __init dmi_string(const struct dmi_header *dm, u8 s)
{
const u8 *bp = ((u8 *) dm) + dm->length;
@@ -21,11 +23,16 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s)
}
if (*bp != 0) {
- str = dmi_alloc(strlen(bp) + 1);
+ size_t len = strlen(bp)+1;
+ size_t cmp_len = len > 8 ? 8 : len;
+
+ if (!memcmp(bp, dmi_empty_string, cmp_len))
+ return dmi_empty_string;
+ str = dmi_alloc(len);
if (str != NULL)
strcpy(str, bp);
else
- printk(KERN_ERR "dmi_string: out of memory.\n");
+ printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len);
}
}
@@ -175,12 +182,23 @@ static void __init dmi_save_devices(const struct dmi_header *dm)
}
}
+static struct dmi_device empty_oem_string_dev = {
+ .name = dmi_empty_string,
+};
+
static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
{
int i, count = *(u8 *)(dm + 1);
struct dmi_device *dev;
for (i = 1; i <= count; i++) {
+ char *devname = dmi_string(dm, i);
+
+ if (!strcmp(devname, dmi_empty_string)) {
+ list_add(&empty_oem_string_dev.list, &dmi_devices);
+ continue;
+ }
+
dev = dmi_alloc(sizeof(*dev));
if (!dev) {
printk(KERN_ERR
@@ -189,7 +207,7 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
}
dev->type = DMI_DEV_TYPE_OEM_STRING;
- dev->name = dmi_string(dm, i);
+ dev->name = devname;
dev->device_data = NULL;
list_add(&dev->list, &dmi_devices);
@@ -331,9 +349,11 @@ void __init dmi_scan_machine(void)
rc = dmi_present(q);
if (!rc) {
dmi_available = 1;
+ dmi_iounmap(p, 0x10000);
return;
}
}
+ dmi_iounmap(p, 0x10000);
}
out: printk(KERN_INFO "DMI not present or invalid.\n");
}
diff --git a/drivers/ieee1394/Makefile b/drivers/ieee1394/Makefile
index 489c133664d..1f8153b5750 100644
--- a/drivers/ieee1394/Makefile
+++ b/drivers/ieee1394/Makefile
@@ -15,3 +15,4 @@ obj-$(CONFIG_IEEE1394_SBP2) += sbp2.o
obj-$(CONFIG_IEEE1394_DV1394) += dv1394.o
obj-$(CONFIG_IEEE1394_ETH1394) += eth1394.o
+obj-$(CONFIG_PROVIDE_OHCI1394_DMA_INIT) += init_ohci1394_dma.o
diff --git a/drivers/ieee1394/init_ohci1394_dma.c b/drivers/ieee1394/init_ohci1394_dma.c
new file mode 100644
index 00000000000..ddaab6eb8ac
--- /dev/null
+++ b/drivers/ieee1394/init_ohci1394_dma.c
@@ -0,0 +1,285 @@
+/*
+ * init_ohci1394_dma.c - Initializes physical DMA on all OHCI 1394 controllers
+ *
+ * Copyright (C) 2006-2007 Bernhard Kaindl <bk@suse.de>
+ *
+ * Derived from drivers/ieee1394/ohci1394.c and arch/x86/kernel/early-quirks.c
+ * this file has functions to:
+ * - scan the PCI very early on boot for all OHCI 1394-compliant controllers
+ * - reset and initialize them and make them join the IEEE1394 bus and
+ * - enable physical DMA on them to allow remote debugging
+ *
+ * All code and data is marked as __init and __initdata, respective as
+ * during boot, all OHCI1394 controllers may be claimed by the firewire
+ * stack and at this point, this code should not touch them anymore.
+ *
+ * To use physical DMA after the initialization of the firewire stack,
+ * be sure that the stack enables it and (re-)attach after the bus reset
+ * which may be caused by the firewire stack initialization.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/interrupt.h> /* for ohci1394.h */
+#include <linux/delay.h>
+#include <linux/pci.h> /* for PCI defines */
+#include <linux/init_ohci1394_dma.h>
+#include <asm/pci-direct.h> /* for direct PCI config space access */
+#include <asm/fixmap.h>
+
+#include "ieee1394_types.h"
+#include "ohci1394.h"
+
+int __initdata init_ohci1394_dma_early;
+
+/* Reads a PHY register of an OHCI-1394 controller */
+static inline u8 __init get_phy_reg(struct ti_ohci *ohci, u8 addr)
+{
+ int i;
+ quadlet_t r;
+
+ reg_write(ohci, OHCI1394_PhyControl, (addr << 8) | 0x00008000);
+
+ for (i = 0; i < OHCI_LOOP_COUNT; i++) {
+ if (reg_read(ohci, OHCI1394_PhyControl) & 0x80000000)
+ break;
+ mdelay(1);
+ }
+ r = reg_read(ohci, OHCI1394_PhyControl);
+
+ return (r & 0x00ff0000) >> 16;
+}
+
+/* Writes to a PHY register of an OHCI-1394 controller */
+static inline void __init set_phy_reg(struct ti_ohci *ohci, u8 addr, u8 data)
+{
+ int i;
+
+ reg_write(ohci, OHCI1394_PhyControl, (addr << 8) | data | 0x00004000);
+
+ for (i = 0; i < OHCI_LOOP_COUNT; i++) {
+ u32 r = reg_read(ohci, OHCI1394_PhyControl);
+ if (!(r & 0x00004000))
+ break;
+ mdelay(1);
+ }
+}
+
+/* Resets an OHCI-1394 controller (for sane state before initialization) */
+static inline void __init init_ohci1394_soft_reset(struct ti_ohci *ohci) {
+ int i;
+
+ reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_softReset);
+
+ for (i = 0; i < OHCI_LOOP_COUNT; i++) {
+ if (!(reg_read(ohci, OHCI1394_HCControlSet)
+ & OHCI1394_HCControl_softReset))
+ break;
+ mdelay(1);
+ }
+}
+
+/* Basic OHCI-1394 register and port inititalization */
+static inline void __init init_ohci1394_initialize(struct ti_ohci *ohci)
+{
+ quadlet_t bus_options;
+ int num_ports, i;
+
+ /* Put some defaults to these undefined bus options */
+ bus_options = reg_read(ohci, OHCI1394_BusOptions);
+ bus_options |= 0x60000000; /* Enable CMC and ISC */
+ bus_options &= ~0x00ff0000; /* XXX: Set cyc_clk_acc to zero for now */
+ bus_options &= ~0x18000000; /* Disable PMC and BMC */
+ reg_write(ohci, OHCI1394_BusOptions, bus_options);
+
+ /* Set the bus number */
+ reg_write(ohci, OHCI1394_NodeID, 0x0000ffc0);
+
+ /* Enable posted writes */
+ reg_write(ohci, OHCI1394_HCControlSet,
+ OHCI1394_HCControl_postedWriteEnable);
+
+ /* Clear link control register */
+ reg_write(ohci, OHCI1394_LinkControlClear, 0xffffffff);
+
+ /* enable phys */
+ reg_write(ohci, OHCI1394_LinkControlSet,
+ OHCI1394_LinkControl_RcvPhyPkt);
+
+ /* Don't accept phy packets into AR request context */
+ reg_write(ohci, OHCI1394_LinkControlClear, 0x00000400);
+
+ /* Clear the Isochonouys interrupt masks */
+ reg_write(ohci, OHCI1394_IsoRecvIntMaskClear, 0xffffffff);
+ reg_write(ohci, OHCI1394_IsoRecvIntEventClear, 0xffffffff);
+ reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, 0xffffffff);
+ reg_write(ohci, OHCI1394_IsoXmitIntEventClear, 0xffffffff);
+
+ /* Accept asyncronous transfer requests from all nodes for now */
+ reg_write(ohci,OHCI1394_AsReqFilterHiSet, 0x80000000);
+
+ /* Specify asyncronous transfer retries */
+ reg_write(ohci, OHCI1394_ATRetries,
+ OHCI1394_MAX_AT_REQ_RETRIES |
+ (OHCI1394_MAX_AT_RESP_RETRIES<<4) |
+ (OHCI1394_MAX_PHYS_RESP_RETRIES<<8));
+
+ /* We don't want hardware swapping */
+ reg_write(ohci, OHCI1394_HCControlClear, OHCI1394_HCControl_noByteSwap);
+
+ /* Enable link */
+ reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_linkEnable);
+
+ /* If anything is connected to a port, make sure it is enabled */
+ num_ports = get_phy_reg(ohci, 2) & 0xf;
+ for (i = 0; i < num_ports; i++) {
+ unsigned int status;
+
+ set_phy_reg(ohci, 7, i);
+ status = get_phy_reg(ohci, 8);
+
+ if (status & 0x20)
+ set_phy_reg(ohci, 8, status & ~1);
+ }
+}
+
+/**
+ * init_ohci1394_wait_for_busresets - wait until bus resets are completed
+ *
+ * OHCI1394 initialization itself and any device going on- or offline
+ * and any cable issue cause a IEEE1394 bus reset. The OHCI1394 spec
+ * specifies that physical DMA is disabled on each bus reset and it
+ * has to be enabled after each bus reset when needed. We resort
+ * to polling here because on early boot, we have no interrupts.
+ */
+static inline void __init init_ohci1394_wait_for_busresets(struct ti_ohci *ohci)
+{
+ int i, events;
+
+ for (i=0; i < 9; i++) {
+ mdelay(200);
+ events = reg_read(ohci, OHCI1394_IntEventSet);
+ if (events & OHCI1394_busReset)
+ reg_write(ohci, OHCI1394_IntEventClear,
+ OHCI1394_busReset);
+ }
+}
+
+/**
+ * init_ohci1394_enable_physical_dma - Enable physical DMA for remote debugging
+ * This enables remote DMA access over IEEE1394 from every host for the low
+ * 4GB of address space. DMA accesses above 4GB are not available currently.
+ */
+static inline void __init init_ohci1394_enable_physical_dma(struct ti_ohci *hci)
+{
+ reg_write(hci, OHCI1394_PhyReqFilterHiSet, 0xffffffff);
+ reg_write(hci, OHCI1394_PhyReqFilterLoSet, 0xffffffff);
+ reg_write(hci, OHCI1394_PhyUpperBound, 0xffff0000);
+}
+
+/**
+ * init_ohci1394_reset_and_init_dma - init controller and enable DMA
+ * This initializes the given controller and enables physical DMA engine in it.
+ */
+static inline void __init init_ohci1394_reset_and_init_dma(struct ti_ohci *ohci)
+{
+ /* Start off with a soft reset, clears everything to a sane state. */
+ init_ohci1394_soft_reset(ohci);
+
+ /* Accessing some registers without LPS enabled may cause lock up */
+ reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_LPS);
+
+ /* Disable and clear interrupts */
+ reg_write(ohci, OHCI1394_IntEventClear, 0xffffffff);
+ reg_write(ohci, OHCI1394_IntMaskClear, 0xffffffff);
+
+ mdelay(50); /* Wait 50msec to make sure we have full link enabled */
+
+ init_ohci1394_initialize(ohci);
+ /*
+ * The initialization causes at least one IEEE1394 bus reset. Enabling
+ * physical DMA only works *after* *all* bus resets have calmed down:
+ */
+ init_ohci1394_wait_for_busresets(ohci);
+
+ /* We had to wait and do this now if we want to debug early problems */
+ init_ohci1394_enable_physical_dma(ohci);
+}
+
+/**
+ * init_ohci1394_controller - Map the registers of the controller and init DMA
+ * This maps the registers of the specified controller and initializes it
+ */
+static inline void __init init_ohci1394_controller(int num, int slot, int func)
+{
+ unsigned long ohci_base;
+ struct ti_ohci ohci;
+
+ printk(KERN_INFO "init_ohci1394_dma: initializing OHCI-1394"
+ " at %02x:%02x.%x\n", num, slot, func);
+
+ ohci_base = read_pci_config(num, slot, func, PCI_BASE_ADDRESS_0+(0<<2))
+ & PCI_BASE_ADDRESS_MEM_MASK;
+
+ set_fixmap_nocache(FIX_OHCI1394_BASE, ohci_base);
+
+ ohci.registers = (void *)fix_to_virt(FIX_OHCI1394_BASE);
+
+ init_ohci1394_reset_and_init_dma(&ohci);
+}
+
+/**
+ * debug_init_ohci1394_dma - scan for OHCI1394 controllers and init DMA on them
+ * Scans the whole PCI space for OHCI1394 controllers and inits DMA on them
+ */
+void __init init_ohci1394_dma_on_all_controllers(void)
+{
+ int num, slot, func;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* Poor man's PCI discovery, the only thing we can do at early boot */
+ for (num = 0; num < 32; num++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class = read_pci_config(num,slot,func,
+ PCI_CLASS_REVISION);
+ if ((class == 0xffffffff))
+ continue; /* No device at this func */
+
+ if (class>>8 != PCI_CLASS_SERIAL_FIREWIRE_OHCI)
+ continue; /* Not an OHCI-1394 device */
+
+ init_ohci1394_controller(num, slot, func);
+ break; /* Assume one controller per device */
+ }
+ }
+ }
+ printk(KERN_INFO "init_ohci1394_dma: finished initializing OHCI DMA\n");
+}
+
+/**
+ * setup_init_ohci1394_early - enables early OHCI1394 DMA initialization
+ */
+static int __init setup_ohci1394_dma(char *opt)
+{
+ if (!strcmp(opt, "early"))
+ init_ohci1394_dma_early = 1;
+ return 0;
+}
+
+/* passing ohci1394_dma=early on boot causes early OHCI1394 DMA initialization */
+early_param("ohci1394_dma", setup_ohci1394_dma);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f2d2c7e2c76..195ce7c1231 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1571,7 +1571,6 @@ static struct scsi_host_template srp_template = {
.this_id = -1,
.cmd_per_lun = SRP_SQ_SIZE,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = srp_host_attrs
};
diff --git a/drivers/input/mouse/pc110pad.c b/drivers/input/mouse/pc110pad.c
index 8991ab0b4fe..61cff8374e6 100644
--- a/drivers/input/mouse/pc110pad.c
+++ b/drivers/input/mouse/pc110pad.c
@@ -39,6 +39,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/pci.h>
+#include <linux/delay.h>
#include <asm/io.h>
#include <asm/irq.h>
@@ -62,8 +63,10 @@ static irqreturn_t pc110pad_interrupt(int irq, void *ptr)
int value = inb_p(pc110pad_io);
int handshake = inb_p(pc110pad_io + 2);
- outb_p(handshake | 1, pc110pad_io + 2);
- outb_p(handshake & ~1, pc110pad_io + 2);
+ outb(handshake | 1, pc110pad_io + 2);
+ udelay(2);
+ outb(handshake & ~1, pc110pad_io + 2);
+ udelay(2);
inb_p(0x64);
pc110pad_data[pc110pad_count++] = value;
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
deleted file mode 100644
index 656920636cb..00000000000
--- a/drivers/kvm/Kconfig
+++ /dev/null
@@ -1,54 +0,0 @@
-#
-# KVM configuration
-#
-menuconfig VIRTUALIZATION
- bool "Virtualization"
- depends on X86
- default y
- ---help---
- Say Y here to get to see options for using your Linux host to run other
- operating systems inside virtual machines (guests).
- This option alone does not add any kernel code.
-
- If you say N, all options in this submenu will be skipped and disabled.
-
-if VIRTUALIZATION
-
-config KVM
- tristate "Kernel-based Virtual Machine (KVM) support"
- depends on X86 && EXPERIMENTAL
- select PREEMPT_NOTIFIERS
- select ANON_INODES
- ---help---
- Support hosting fully virtualized guest machines using hardware
- virtualization extensions. You will need a fairly recent
- processor equipped with virtualization extensions. You will also
- need to select one or more of the processor modules below.
-
- This module provides access to the hardware capabilities through
- a character device node named /dev/kvm.
-
- To compile this as a module, choose M here: the module
- will be called kvm.
-
- If unsure, say N.
-
-config KVM_INTEL
- tristate "KVM for Intel processors support"
- depends on KVM
- ---help---
- Provides support for KVM on Intel processors equipped with the VT
- extensions.
-
-config KVM_AMD
- tristate "KVM for AMD processors support"
- depends on KVM
- ---help---
- Provides support for KVM on AMD processors equipped with the AMD-V
- (SVM) extensions.
-
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source drivers/lguest/Kconfig
-
-endif # VIRTUALIZATION
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
deleted file mode 100644
index e5a8f4d3e97..00000000000
--- a/drivers/kvm/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c
deleted file mode 100644
index a679157bc59..00000000000
--- a/drivers/kvm/i8259.c
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * 8259 interrupt controller emulation
- *
- * Copyright (c) 2003-2004 Fabrice Bellard
- * Copyright (c) 2007 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- * Port from Qemu.
- */
-#include <linux/mm.h>
-#include "irq.h"
-
-/*
- * set irq level. If an edge is detected, then the IRR is set to 1
- */
-static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
-{
- int mask;
- mask = 1 << irq;
- if (s->elcr & mask) /* level triggered */
- if (level) {
- s->irr |= mask;
- s->last_irr |= mask;
- } else {
- s->irr &= ~mask;
- s->last_irr &= ~mask;
- }
- else /* edge triggered */
- if (level) {
- if ((s->last_irr & mask) == 0)
- s->irr |= mask;
- s->last_irr |= mask;
- } else
- s->last_irr &= ~mask;
-}
-
-/*
- * return the highest priority found in mask (highest = smallest
- * number). Return 8 if no irq
- */
-static inline int get_priority(struct kvm_kpic_state *s, int mask)
-{
- int priority;
- if (mask == 0)
- return 8;
- priority = 0;
- while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
- priority++;
- return priority;
-}
-
-/*
- * return the pic wanted interrupt. return -1 if none
- */
-static int pic_get_irq(struct kvm_kpic_state *s)
-{
- int mask, cur_priority, priority;
-
- mask = s->irr & ~s->imr;
- priority = get_priority(s, mask);
- if (priority == 8)
- return -1;
- /*
- * compute current priority. If special fully nested mode on the
- * master, the IRQ coming from the slave is not taken into account
- * for the priority computation.
- */
- mask = s->isr;
- if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
- mask &= ~(1 << 2);
- cur_priority = get_priority(s, mask);
- if (priority < cur_priority)
- /*
- * higher priority found: an irq should be generated
- */
- return (priority + s->priority_add) & 7;
- else
- return -1;
-}
-
-/*
- * raise irq to CPU if necessary. must be called every time the active
- * irq may change
- */
-static void pic_update_irq(struct kvm_pic *s)
-{
- int irq2, irq;
-
- irq2 = pic_get_irq(&s->pics[1]);
- if (irq2 >= 0) {
- /*
- * if irq request by slave pic, signal master PIC
- */
- pic_set_irq1(&s->pics[0], 2, 1);
- pic_set_irq1(&s->pics[0], 2, 0);
- }
- irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0)
- s->irq_request(s->irq_request_opaque, 1);
- else
- s->irq_request(s->irq_request_opaque, 0);
-}
-
-void kvm_pic_update_irq(struct kvm_pic *s)
-{
- pic_update_irq(s);
-}
-
-void kvm_pic_set_irq(void *opaque, int irq, int level)
-{
- struct kvm_pic *s = opaque;
-
- pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
- pic_update_irq(s);
-}
-
-/*
- * acknowledge interrupt 'irq'
- */
-static inline void pic_intack(struct kvm_kpic_state *s, int irq)
-{
- if (s->auto_eoi) {
- if (s->rotate_on_auto_eoi)
- s->priority_add = (irq + 1) & 7;
- } else
- s->isr |= (1 << irq);
- /*
- * We don't clear a level sensitive interrupt here
- */
- if (!(s->elcr & (1 << irq)))
- s->irr &= ~(1 << irq);
-}
-
-int kvm_pic_read_irq(struct kvm_pic *s)
-{
- int irq, irq2, intno;
-
- irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0) {
- pic_intack(&s->pics[0], irq);
- if (irq == 2) {
- irq2 = pic_get_irq(&s->pics[1]);
- if (irq2 >= 0)
- pic_intack(&s->pics[1], irq2);
- else
- /*
- * spurious IRQ on slave controller
- */
- irq2 = 7;
- intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
- } else
- intno = s->pics[0].irq_base + irq;
- } else {
- /*
- * spurious IRQ on host controller
- */
- irq = 7;
- intno = s->pics[0].irq_base + irq;
- }
- pic_update_irq(s);
-
- return intno;
-}
-
-static void pic_reset(void *opaque)
-{
- struct kvm_kpic_state *s = opaque;
-
- s->last_irr = 0;
- s->irr = 0;
- s->imr = 0;
- s->isr = 0;
- s->priority_add = 0;
- s->irq_base = 0;
- s->read_reg_select = 0;
- s->poll = 0;
- s->special_mask = 0;
- s->init_state = 0;
- s->auto_eoi = 0;
- s->rotate_on_auto_eoi = 0;
- s->special_fully_nested_mode = 0;
- s->init4 = 0;
-}
-
-static void pic_ioport_write(void *opaque, u32 addr, u32 val)
-{
- struct kvm_kpic_state *s = opaque;
- int priority, cmd, irq;
-
- addr &= 1;
- if (addr == 0) {
- if (val & 0x10) {
- pic_reset(s); /* init */
- /*
- * deassert a pending interrupt
- */
- s->pics_state->irq_request(s->pics_state->
- irq_request_opaque, 0);
- s->init_state = 1;
- s->init4 = val & 1;
- if (val & 0x02)
- printk(KERN_ERR "single mode not supported");
- if (val & 0x08)
- printk(KERN_ERR
- "level sensitive irq not supported");
- } else if (val & 0x08) {
- if (val & 0x04)
- s->poll = 1;
- if (val & 0x02)
- s->read_reg_select = val & 1;
- if (val & 0x40)
- s->special_mask = (val >> 5) & 1;
- } else {
- cmd = val >> 5;
- switch (cmd) {
- case 0:
- case 4:
- s->rotate_on_auto_eoi = cmd >> 2;
- break;
- case 1: /* end of interrupt */
- case 5:
- priority = get_priority(s, s->isr);
- if (priority != 8) {
- irq = (priority + s->priority_add) & 7;
- s->isr &= ~(1 << irq);
- if (cmd == 5)
- s->priority_add = (irq + 1) & 7;
- pic_update_irq(s->pics_state);
- }
- break;
- case 3:
- irq = val & 7;
- s->isr &= ~(1 << irq);
- pic_update_irq(s->pics_state);
- break;
- case 6:
- s->priority_add = (val + 1) & 7;
- pic_update_irq(s->pics_state);
- break;
- case 7:
- irq = val & 7;
- s->isr &= ~(1 << irq);
- s->priority_add = (irq + 1) & 7;
- pic_update_irq(s->pics_state);
- break;
- default:
- break; /* no operation */
- }
- }
- } else
- switch (s->init_state) {
- case 0: /* normal mode */
- s->imr = val;
- pic_update_irq(s->pics_state);
- break;
- case 1:
- s->irq_base = val & 0xf8;
- s->init_state = 2;
- break;
- case 2:
- if (s->init4)
- s->init_state = 3;
- else
- s->init_state = 0;
- break;
- case 3:
- s->special_fully_nested_mode = (val >> 4) & 1;
- s->auto_eoi = (val >> 1) & 1;
- s->init_state = 0;
- break;
- }
-}
-
-static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
-{
- int ret;
-
- ret = pic_get_irq(s);
- if (ret >= 0) {
- if (addr1 >> 7) {
- s->pics_state->pics[0].isr &= ~(1 << 2);
- s->pics_state->pics[0].irr &= ~(1 << 2);
- }
- s->irr &= ~(1 << ret);
- s->isr &= ~(1 << ret);
- if (addr1 >> 7 || ret != 2)
- pic_update_irq(s->pics_state);
- } else {
- ret = 0x07;
- pic_update_irq(s->pics_state);
- }
-
- return ret;
-}
-
-static u32 pic_ioport_read(void *opaque, u32 addr1)
-{
- struct kvm_kpic_state *s = opaque;
- unsigned int addr;
- int ret;
-
- addr = addr1;
- addr &= 1;
- if (s->poll) {
- ret = pic_poll_read(s, addr1);
- s->poll = 0;
- } else
- if (addr == 0)
- if (s->read_reg_select)
- ret = s->isr;
- else
- ret = s->irr;
- else
- ret = s->imr;
- return ret;
-}
-
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
-{
- struct kvm_kpic_state *s = opaque;
- s->elcr = val & s->elcr_mask;
-}
-
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
-{
- struct kvm_kpic_state *s = opaque;
- return s->elcr;
-}
-
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
-{
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- case 0x4d0:
- case 0x4d1:
- return 1;
- default:
- return 0;
- }
-}
-
-static void picdev_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *val)
-{
- struct kvm_pic *s = this->private;
- unsigned char data = *(unsigned char *)val;
-
- if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte write\n");
- return;
- }
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- pic_ioport_write(&s->pics[addr >> 7], addr, data);
- break;
- case 0x4d0:
- case 0x4d1:
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
- break;
- }
-}
-
-static void picdev_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *val)
-{
- struct kvm_pic *s = this->private;
- unsigned char data = 0;
-
- if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte read\n");
- return;
- }
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- data = pic_ioport_read(&s->pics[addr >> 7], addr);
- break;
- case 0x4d0:
- case 0x4d1:
- data = elcr_ioport_read(&s->pics[addr & 1], addr);
- break;
- }
- *(unsigned char *)val = data;
-}
-
-/*
- * callback when PIC0 irq status changed
- */
-static void pic_irq_request(void *opaque, int level)
-{
- struct kvm *kvm = opaque;
- struct kvm_vcpu *vcpu = kvm->vcpus[0];
-
- pic_irqchip(kvm)->output = level;
- if (vcpu)
- kvm_vcpu_kick(vcpu);
-}
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
-{
- struct kvm_pic *s;
- s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
- if (!s)
- return NULL;
- s->pics[0].elcr_mask = 0xf8;
- s->pics[1].elcr_mask = 0xde;
- s->irq_request = pic_irq_request;
- s->irq_request_opaque = kvm;
- s->pics[0].pics_state = s;
- s->pics[1].pics_state = s;
-
- /*
- * Initialize PIO device
- */
- s->dev.read = picdev_read;
- s->dev.write = picdev_write;
- s->dev.in_range = picdev_in_range;
- s->dev.private = s;
- kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
- return s;
-}
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c
deleted file mode 100644
index c7992e667fd..00000000000
--- a/drivers/kvm/ioapic.c
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Copyright (C) 2001 MandrakeSoft S.A.
- *
- * MandrakeSoft S.A.
- * 43, rue d'Aboukir
- * 75002 Paris - France
- * http://www.linux-mandrake.com/
- * http://www.mandrakesoft.com/
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Yunhong Jiang <yunhong.jiang@intel.com>
- * Yaozu (Eddie) Dong <eddie.dong@intel.com>
- * Based on Xen 3.1 code.
- */
-
-#include "kvm.h"
-#include <linux/kvm.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-#include <linux/hrtimer.h>
-#include <linux/io.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/io_apic.h>
-#include "irq.h"
-/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define ioapic_debug(fmt, arg...)
-static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
-
-static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
- unsigned long addr,
- unsigned long length)
-{
- unsigned long result = 0;
-
- switch (ioapic->ioregsel) {
- case IOAPIC_REG_VERSION:
- result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
- | (IOAPIC_VERSION_ID & 0xff));
- break;
-
- case IOAPIC_REG_APIC_ID:
- case IOAPIC_REG_ARB_ID:
- result = ((ioapic->id & 0xf) << 24);
- break;
-
- default:
- {
- u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
- u64 redir_content;
-
- ASSERT(redir_index < IOAPIC_NUM_PINS);
-
- redir_content = ioapic->redirtbl[redir_index].bits;
- result = (ioapic->ioregsel & 0x1) ?
- (redir_content >> 32) & 0xffffffff :
- redir_content & 0xffffffff;
- break;
- }
- }
-
- return result;
-}
-
-static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
-{
- union ioapic_redir_entry *pent;
-
- pent = &ioapic->redirtbl[idx];
-
- if (!pent->fields.mask) {
- ioapic_deliver(ioapic, idx);
- if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
- pent->fields.remote_irr = 1;
- }
- if (!pent->fields.trig_mode)
- ioapic->irr &= ~(1 << idx);
-}
-
-static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
-{
- unsigned index;
-
- switch (ioapic->ioregsel) {
- case IOAPIC_REG_VERSION:
- /* Writes are ignored. */
- break;
-
- case IOAPIC_REG_APIC_ID:
- ioapic->id = (val >> 24) & 0xf;
- break;
-
- case IOAPIC_REG_ARB_ID:
- break;
-
- default:
- index = (ioapic->ioregsel - 0x10) >> 1;
-
- ioapic_debug("change redir index %x val %x", index, val);
- if (index >= IOAPIC_NUM_PINS)
- return;
- if (ioapic->ioregsel & 1) {
- ioapic->redirtbl[index].bits &= 0xffffffff;
- ioapic->redirtbl[index].bits |= (u64) val << 32;
- } else {
- ioapic->redirtbl[index].bits &= ~0xffffffffULL;
- ioapic->redirtbl[index].bits |= (u32) val;
- ioapic->redirtbl[index].fields.remote_irr = 0;
- }
- if (ioapic->irr & (1 << index))
- ioapic_service(ioapic, index);
- break;
- }
-}
-
-static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
- struct kvm_lapic *target,
- u8 vector, u8 trig_mode, u8 delivery_mode)
-{
- ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
- delivery_mode);
-
- ASSERT((delivery_mode == dest_Fixed) ||
- (delivery_mode == dest_LowestPrio));
-
- kvm_apic_set_irq(target, vector, trig_mode);
-}
-
-static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
- u8 dest_mode)
-{
- u32 mask = 0;
- int i;
- struct kvm *kvm = ioapic->kvm;
- struct kvm_vcpu *vcpu;
-
- ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
-
- if (dest_mode == 0) { /* Physical mode. */
- if (dest == 0xFF) { /* Broadcast. */
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
- mask |= 1 << i;
- return mask;
- }
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
- if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
- if (vcpu->apic)
- mask = 1 << i;
- break;
- }
- }
- } else if (dest != 0) /* Logical mode, MDA non-zero. */
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
- if (vcpu->apic &&
- kvm_apic_match_logical_addr(vcpu->apic, dest))
- mask |= 1 << vcpu->vcpu_id;
- }
- ioapic_debug("mask %x", mask);
- return mask;
-}
-
-static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
-{
- u8 dest = ioapic->redirtbl[irq].fields.dest_id;
- u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
- u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
- u8 vector = ioapic->redirtbl[irq].fields.vector;
- u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
- u32 deliver_bitmask;
- struct kvm_lapic *target;
- struct kvm_vcpu *vcpu;
- int vcpu_id;
-
- ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
- "vector=%x trig_mode=%x",
- dest, dest_mode, delivery_mode, vector, trig_mode);
-
- deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
- if (!deliver_bitmask) {
- ioapic_debug("no target on destination");
- return;
- }
-
- switch (delivery_mode) {
- case dest_LowestPrio:
- target =
- kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
- if (target != NULL)
- ioapic_inj_irq(ioapic, target, vector,
- trig_mode, delivery_mode);
- else
- ioapic_debug("null round robin: "
- "mask=%x vector=%x delivery_mode=%x",
- deliver_bitmask, vector, dest_LowestPrio);
- break;
- case dest_Fixed:
- for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
- if (!(deliver_bitmask & (1 << vcpu_id)))
- continue;
- deliver_bitmask &= ~(1 << vcpu_id);
- vcpu = ioapic->kvm->vcpus[vcpu_id];
- if (vcpu) {
- target = vcpu->apic;
- ioapic_inj_irq(ioapic, target, vector,
- trig_mode, delivery_mode);
- }
- }
- break;
-
- /* TODO: NMI */
- default:
- printk(KERN_WARNING "Unsupported delivery mode %d\n",
- delivery_mode);
- break;
- }
-}
-
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
-{
- u32 old_irr = ioapic->irr;
- u32 mask = 1 << irq;
- union ioapic_redir_entry entry;
-
- if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
- entry = ioapic->redirtbl[irq];
- level ^= entry.fields.polarity;
- if (!level)
- ioapic->irr &= ~mask;
- else {
- ioapic->irr |= mask;
- if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
- || !entry.fields.remote_irr)
- ioapic_service(ioapic, irq);
- }
- }
-}
-
-static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
-{
- int i;
-
- for (i = 0; i < IOAPIC_NUM_PINS; i++)
- if (ioapic->redirtbl[i].fields.vector == vector)
- return i;
- return -1;
-}
-
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
-{
- struct kvm_ioapic *ioapic = kvm->vioapic;
- union ioapic_redir_entry *ent;
- int gsi;
-
- gsi = get_eoi_gsi(ioapic, vector);
- if (gsi == -1) {
- printk(KERN_WARNING "Can't find redir item for %d EOI\n",
- vector);
- return;
- }
-
- ent = &ioapic->redirtbl[gsi];
- ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
-
- ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
- ioapic_deliver(ioapic, gsi);
-}
-
-static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
-{
- struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
-
- return ((addr >= ioapic->base_address &&
- (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
-}
-
-static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
- void *val)
-{
- struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
- u32 result;
-
- ioapic_debug("addr %lx", (unsigned long)addr);
- ASSERT(!(addr & 0xf)); /* check alignment */
-
- addr &= 0xff;
- switch (addr) {
- case IOAPIC_REG_SELECT:
- result = ioapic->ioregsel;
- break;
-
- case IOAPIC_REG_WINDOW:
- result = ioapic_read_indirect(ioapic, addr, len);
- break;
-
- default:
- result = 0;
- break;
- }
- switch (len) {
- case 8:
- *(u64 *) val = result;
- break;
- case 1:
- case 2:
- case 4:
- memcpy(val, (char *)&result, len);
- break;
- default:
- printk(KERN_WARNING "ioapic: wrong length %d\n", len);
- }
-}
-
-static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
- const void *val)
-{
- struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
- u32 data;
-
- ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
- addr, len, val);
- ASSERT(!(addr & 0xf)); /* check alignment */
- if (len == 4 || len == 8)
- data = *(u32 *) val;
- else {
- printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
- return;
- }
-
- addr &= 0xff;
- switch (addr) {
- case IOAPIC_REG_SELECT:
- ioapic->ioregsel = data;
- break;
-
- case IOAPIC_REG_WINDOW:
- ioapic_write_indirect(ioapic, data);
- break;
-
- default:
- break;
- }
-}
-
-int kvm_ioapic_init(struct kvm *kvm)
-{
- struct kvm_ioapic *ioapic;
- int i;
-
- ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
- if (!ioapic)
- return -ENOMEM;
- kvm->vioapic = ioapic;
- for (i = 0; i < IOAPIC_NUM_PINS; i++)
- ioapic->redirtbl[i].fields.mask = 1;
- ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
- ioapic->dev.read = ioapic_mmio_read;
- ioapic->dev.write = ioapic_mmio_write;
- ioapic->dev.in_range = ioapic_in_range;
- ioapic->dev.private = ioapic;
- ioapic->kvm = kvm;
- kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
- return 0;
-}
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
deleted file mode 100644
index 7628c7ff628..00000000000
--- a/drivers/kvm/irq.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * irq.c: API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#include <linux/module.h>
-
-#include "kvm.h"
-#include "irq.h"
-
-/*
- * check if there is pending interrupt without
- * intack.
- */
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
-{
- struct kvm_pic *s;
-
- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm); /* PIC */
- return s->output;
- } else
- return 0;
- }
- return 1;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
-
-/*
- * Read pending interrupt vector and intack.
- */
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
-{
- struct kvm_pic *s;
- int vector;
-
- vector = kvm_get_apic_interrupt(v); /* APIC */
- if (vector == -1) {
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm);
- s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(s);
- }
- }
- return vector;
-}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
-
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
- printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
- int ipi_pcpu = vcpu->cpu;
-
- if (waitqueue_active(&vcpu->wq)) {
- wake_up_interruptible(&vcpu->wq);
- ++vcpu->stat.halt_wakeup;
- }
- if (vcpu->guest_mode)
- smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
-{
- kvm_inject_apic_timer_irqs(vcpu);
- /* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
-
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
- kvm_apic_timer_intr_post(vcpu, vec);
- /* TODO: PIT, RTC etc. */
-}
-EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014e2b3..00000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include "kvm.h"
-
-typedef void irq_request_func(void *opaque, int level);
-
-struct kvm_kpic_state {
- u8 last_irr; /* edge detection */
- u8 irr; /* interrupt request register */
- u8 imr; /* interrupt mask register */
- u8 isr; /* interrupt service register */
- u8 priority_add; /* highest irq priority */
- u8 irq_base;
- u8 read_reg_select;
- u8 poll;
- u8 special_mask;
- u8 init_state;
- u8 auto_eoi;
- u8 rotate_on_auto_eoi;
- u8 special_fully_nested_mode;
- u8 init4; /* true if 4 byte init */
- u8 elcr; /* PIIX edge/trigger selection */
- u8 elcr_mask;
- struct kvm_pic *pics_state;
-};
-
-struct kvm_pic {
- struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
- int output; /* intr from master PIC */
- struct kvm_io_device dev;
-};
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
-void kvm_pic_update_irq(struct kvm_pic *s);
-
-#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
-#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
-#define IOAPIC_EDGE_TRIG 0
-#define IOAPIC_LEVEL_TRIG 1
-
-#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
-#define IOAPIC_MEM_LENGTH 0x100
-
-/* Direct registers. */
-#define IOAPIC_REG_SELECT 0x00
-#define IOAPIC_REG_WINDOW 0x10
-#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
-
-/* Indirect registers. */
-#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
-#define IOAPIC_REG_VERSION 0x01
-#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
-
-struct kvm_ioapic {
- u64 base_address;
- u32 ioregsel;
- u32 id;
- u32 irr;
- u32 pad;
- union ioapic_redir_entry {
- u64 bits;
- struct {
- u8 vector;
- u8 delivery_mode:3;
- u8 dest_mode:1;
- u8 delivery_status:1;
- u8 polarity:1;
- u8 remote_irr:1;
- u8 trig_mode:1;
- u8 mask:1;
- u8 reserve:7;
- u8 reserved[4];
- u8 dest_id;
- } fields;
- } redirtbl[IOAPIC_NUM_PINS];
- struct kvm_io_device dev;
- struct kvm *kvm;
-};
-
-struct kvm_lapic {
- unsigned long base_address;
- struct kvm_io_device dev;
- struct {
- atomic_t pending;
- s64 period; /* unit: ns */
- u32 divide_count;
- ktime_t last_update;
- struct hrtimer dev;
- } timer;
- struct kvm_vcpu *vcpu;
- struct page *regs_page;
- void *regs;
-};
-
-#ifdef DEBUG
-#define ASSERT(x) \
-do { \
- if (!(x)) { \
- printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
- __FILE__, __LINE__, #x); \
- BUG(); \
- } \
-} while (0)
-#else
-#define ASSERT(x) do { } while (0)
-#endif
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
-void kvm_free_apic(struct kvm_lapic *apic);
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
- unsigned long bitmap);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_ioapic_init(struct kvm *kvm);
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
-
-#endif
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
deleted file mode 100644
index 3b0bc4bda5f..00000000000
--- a/drivers/kvm/kvm.h
+++ /dev/null
@@ -1,796 +0,0 @@
-#ifndef __KVM_H
-#define __KVM_H
-
-/*
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/preempt.h>
-#include <asm/signal.h>
-
-#include <linux/kvm.h>
-#include <linux/kvm_para.h>
-
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
-
-#define KVM_GUEST_CR0_MASK \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
- | X86_CR0_NW | X86_CR0_CD)
-#define KVM_VM_CR0_ALWAYS_ON \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
- | X86_CR0_MP)
-#define KVM_GUEST_CR4_MASK \
- (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
-#define INVALID_PAGE (~(hpa_t)0)
-#define UNMAPPED_GVA (~(gpa_t)0)
-
-#define KVM_MAX_VCPUS 4
-#define KVM_ALIAS_SLOTS 4
-#define KVM_MEMORY_SLOTS 8
-#define KVM_NUM_MMU_PAGES 1024
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
-
-#define DE_VECTOR 0
-#define NM_VECTOR 7
-#define DF_VECTOR 8
-#define TS_VECTOR 10
-#define NP_VECTOR 11
-#define SS_VECTOR 12
-#define GP_VECTOR 13
-#define PF_VECTOR 14
-
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
-#define IOPL_SHIFT 12
-
-#define KVM_PIO_PAGE_OFFSET 1
-
-/*
- * vcpu->requests bit members
- */
-#define KVM_TLB_FLUSH 0
-
-/*
- * Address types:
- *
- * gva - guest virtual address
- * gpa - guest physical address
- * gfn - guest frame number
- * hva - host virtual address
- * hpa - host physical address
- * hfn - host frame number
- */
-
-typedef unsigned long gva_t;
-typedef u64 gpa_t;
-typedef unsigned long gfn_t;
-
-typedef unsigned long hva_t;
-typedef u64 hpa_t;
-typedef unsigned long hfn_t;
-
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
- u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
- struct hlist_node link;
-};
-
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- * bits 4:7 - page table level for this shadow (1-4)
- * bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
- * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
- */
-union kvm_mmu_page_role {
- unsigned word;
- struct {
- unsigned glevels : 4;
- unsigned level : 4;
- unsigned quadrant : 2;
- unsigned pad_for_nice_hex_output : 6;
- unsigned metaphysical : 1;
- unsigned hugepage_access : 3;
- };
-};
-
-struct kvm_mmu_page {
- struct list_head link;
- struct hlist_node hash_link;
-
- /*
- * The following two entries are used to key the shadow page in the
- * hash table.
- */
- gfn_t gfn;
- union kvm_mmu_page_role role;
-
- u64 *spt;
- unsigned long slot_bitmap; /* One bit set per slot which has memory
- * in this shadow page.
- */
- int multimapped; /* More than one parent_pte? */
- int root_count; /* Currently serving as active root */
- union {
- u64 *parent_pte; /* !multimapped */
- struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
- };
-};
-
-struct kvm_vcpu;
-extern struct kmem_cache *kvm_vcpu_cache;
-
-/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
- * mode.
- */
-struct kvm_mmu {
- void (*new_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
- hpa_t root_hpa;
- int root_level;
- int shadow_root_level;
-
- u64 *pae_root;
-};
-
-#define KVM_NR_MEM_OBJS 20
-
-struct kvm_mmu_memory_cache {
- int nobjs;
- void *objects[KVM_NR_MEM_OBJS];
-};
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_guest_debug {
- int enabled;
- unsigned long bp[4];
- int singlestep;
-};
-
-enum {
- VCPU_REGS_RAX = 0,
- VCPU_REGS_RCX = 1,
- VCPU_REGS_RDX = 2,
- VCPU_REGS_RBX = 3,
- VCPU_REGS_RSP = 4,
- VCPU_REGS_RBP = 5,
- VCPU_REGS_RSI = 6,
- VCPU_REGS_RDI = 7,
-#ifdef CONFIG_X86_64
- VCPU_REGS_R8 = 8,
- VCPU_REGS_R9 = 9,
- VCPU_REGS_R10 = 10,
- VCPU_REGS_R11 = 11,
- VCPU_REGS_R12 = 12,
- VCPU_REGS_R13 = 13,
- VCPU_REGS_R14 = 14,
- VCPU_REGS_R15 = 15,
-#endif
- NR_VCPU_REGS
-};
-
-enum {
- VCPU_SREG_CS,
- VCPU_SREG_DS,
- VCPU_SREG_ES,
- VCPU_SREG_FS,
- VCPU_SREG_GS,
- VCPU_SREG_SS,
- VCPU_SREG_TR,
- VCPU_SREG_LDTR,
-};
-
-struct kvm_pio_request {
- unsigned long count;
- int cur_count;
- struct page *guest_pages[2];
- unsigned guest_page_offset;
- int in;
- int port;
- int size;
- int string;
- int down;
- int rep;
-};
-
-struct kvm_stat {
- u32 pf_fixed;
- u32 pf_guest;
- u32 tlb_flush;
- u32 invlpg;
-
- u32 exits;
- u32 io_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 halt_exits;
- u32 halt_wakeup;
- u32 request_irq_exits;
- u32 irq_exits;
- u32 light_exits;
- u32 efer_reload;
-};
-
-struct kvm_io_device {
- void (*read)(struct kvm_io_device *this,
- gpa_t addr,
- int len,
- void *val);
- void (*write)(struct kvm_io_device *this,
- gpa_t addr,
- int len,
- const void *val);
- int (*in_range)(struct kvm_io_device *this, gpa_t addr);
- void (*destructor)(struct kvm_io_device *this);
-
- void *private;
-};
-
-static inline void kvm_iodevice_read(struct kvm_io_device *dev,
- gpa_t addr,
- int len,
- void *val)
-{
- dev->read(dev, addr, len, val);
-}
-
-static inline void kvm_iodevice_write(struct kvm_io_device *dev,
- gpa_t addr,
- int len,
- const void *val)
-{
- dev->write(dev, addr, len, val);
-}
-
-static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
-{
- return dev->in_range(dev, addr);
-}
-
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
- if (dev->destructor)
- dev->destructor(dev);
-}
-
-/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice. At least its abstracted so we can change
- * in one place.
- */
-struct kvm_io_bus {
- int dev_count;
-#define NR_IOBUS_DEVS 6
- struct kvm_io_device *devs[NR_IOBUS_DEVS];
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev);
-
-struct kvm_vcpu {
- struct kvm *kvm;
- struct preempt_notifier preempt_notifier;
- int vcpu_id;
- struct mutex mutex;
- int cpu;
- u64 host_tsc;
- struct kvm_run *run;
- int interrupt_window_open;
- int guest_mode;
- unsigned long requests;
- unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
- DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
- unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
- unsigned long rip; /* needs vcpu_load_rsp_rip() */
-
- unsigned long cr0;
- unsigned long cr2;
- unsigned long cr3;
- gpa_t para_state_gpa;
- struct page *para_state_page;
- gpa_t hypercall_gpa;
- unsigned long cr4;
- unsigned long cr8;
- u64 pdptrs[4]; /* pae */
- u64 shadow_efer;
- u64 apic_base;
- struct kvm_lapic *apic; /* kernel irqchip context */
-#define VCPU_MP_STATE_RUNNABLE 0
-#define VCPU_MP_STATE_UNINITIALIZED 1
-#define VCPU_MP_STATE_INIT_RECEIVED 2
-#define VCPU_MP_STATE_SIPI_RECEIVED 3
-#define VCPU_MP_STATE_HALTED 4
- int mp_state;
- int sipi_vector;
- u64 ia32_misc_enable_msr;
-
- struct kvm_mmu mmu;
-
- struct kvm_mmu_memory_cache mmu_pte_chain_cache;
- struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
- struct kvm_mmu_memory_cache mmu_page_cache;
- struct kvm_mmu_memory_cache mmu_page_header_cache;
-
- gfn_t last_pt_write_gfn;
- int last_pt_write_count;
-
- struct kvm_guest_debug guest_debug;
-
- struct i387_fxsave_struct host_fx_image;
- struct i387_fxsave_struct guest_fx_image;
- int fpu_active;
- int guest_fpu_loaded;
-
- int mmio_needed;
- int mmio_read_completed;
- int mmio_is_write;
- int mmio_size;
- unsigned char mmio_data[8];
- gpa_t mmio_phys_addr;
- gva_t mmio_fault_cr2;
- struct kvm_pio_request pio;
- void *pio_data;
- wait_queue_head_t wq;
-
- int sigset_active;
- sigset_t sigset;
-
- struct kvm_stat stat;
-
- struct {
- int active;
- u8 save_iopl;
- struct kvm_save_segment {
- u16 selector;
- unsigned long base;
- u32 limit;
- u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
- int halt_request; /* real mode on Intel only */
-
- int cpuid_nent;
- struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
-};
-
-struct kvm_mem_alias {
- gfn_t base_gfn;
- unsigned long npages;
- gfn_t target_gfn;
-};
-
-struct kvm_memory_slot {
- gfn_t base_gfn;
- unsigned long npages;
- unsigned long flags;
- struct page **phys_mem;
- unsigned long *dirty_bitmap;
-};
-
-struct kvm {
- struct mutex lock; /* protects everything except vcpus */
- int naliases;
- struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
- int nmemslots;
- struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
- /*
- * Hash table of struct kvm_mmu_page.
- */
- struct list_head active_mmu_pages;
- int n_free_mmu_pages;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
- struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
- unsigned long rmap_overflow;
- struct list_head vm_list;
- struct file *filp;
- struct kvm_io_bus mmio_bus;
- struct kvm_io_bus pio_bus;
- struct kvm_pic *vpic;
- struct kvm_ioapic *vioapic;
- int round_robin_prev_vcpu;
-};
-
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
- return kvm->vpic;
-}
-
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
-{
- return kvm->vioapic;
-}
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
- return pic_irqchip(kvm) != 0;
-}
-
-struct descriptor_table {
- u16 limit;
- unsigned long base;
-} __attribute__((packed));
-
-struct kvm_x86_ops {
- int (*cpu_has_kvm_support)(void); /* __init */
- int (*disabled_by_bios)(void); /* __init */
- void (*hardware_enable)(void *dummy); /* __init */
- void (*hardware_disable)(void *dummy);
- void (*check_processor_compatibility)(void *rtn);
- int (*hardware_setup)(void); /* __init */
- void (*hardware_unsetup)(void); /* __exit */
-
- /* Create, but do not attach this VCPU */
- struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
- void (*vcpu_free)(struct kvm_vcpu *vcpu);
- void (*vcpu_reset)(struct kvm_vcpu *vcpu);
-
- void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
- void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
- void (*vcpu_put)(struct kvm_vcpu *vcpu);
- void (*vcpu_decache)(struct kvm_vcpu *vcpu);
-
- int (*set_guest_debug)(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg);
- void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
- int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
- u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
- void (*get_segment)(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg);
- void (*set_segment)(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg);
- void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
- void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
- void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
- void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
- void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
- void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
- void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception);
- void (*cache_regs)(struct kvm_vcpu *vcpu);
- void (*decache_regs)(struct kvm_vcpu *vcpu);
- unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
- void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
-
- void (*tlb_flush)(struct kvm_vcpu *vcpu);
- void (*inject_page_fault)(struct kvm_vcpu *vcpu,
- unsigned long addr, u32 err_code);
-
- void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
-
- void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
- int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
- void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
- void (*patch_hypercall)(struct kvm_vcpu *vcpu,
- unsigned char *hypercall_addr);
- int (*get_irq)(struct kvm_vcpu *vcpu);
- void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
- void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
- void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
- struct kvm_run *run);
-};
-
-extern struct kvm_x86_ops *kvm_x86_ops;
-
-/* The guest did something we don't support. */
-#define pr_unimpl(vcpu, fmt, ...) \
- do { \
- if (printk_ratelimit()) \
- printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
- current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while(0)
-
-#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
-#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
-
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
- struct module *module);
-void kvm_exit_x86(void);
-
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
-int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_zap_all(struct kvm *kvm);
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
-
-extern hpa_t bad_page_address;
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
-
-enum emulation_result {
- EMULATE_DONE, /* no further processing */
- EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
- EMULATE_FAIL, /* can't emulate this instruction */
-};
-
-int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long cr2, u16 error_code);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags);
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
- unsigned long *rflags);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
-
-struct x86_emulate_ctxt;
-
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
-int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
- unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long value);
-
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
-unsigned long get_cr8(struct kvm_vcpu *vcpu);
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-
-void fx_init(struct kvm_vcpu *vcpu);
-
-void kvm_resched(struct kvm_vcpu *vcpu);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_flush_remote_tlbs(struct kvm *kvm);
-
-int emulator_read_std(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
-unsigned long segment_base(u16 selector);
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes);
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
-
-static inline void kvm_guest_enter(void)
-{
- current->flags |= PF_VCPU;
-}
-
-static inline void kvm_guest_exit(void)
-{
- current->flags &= ~PF_VCPU;
-}
-
-static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
-{
- return vcpu->mmu.page_fault(vcpu, gva, error_code);
-}
-
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
- __kvm_mmu_free_some_pages(vcpu);
-}
-
-static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
-{
- if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
- return 0;
-
- return kvm_mmu_load(vcpu);
-}
-
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- return vcpu->shadow_efer & EFER_LME;
-#else
- return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr0 & X86_CR0_PG;
-}
-
-static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
- return slot - kvm->memslots;
-}
-
-static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
-{
- struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
-}
-
-static inline u16 read_fs(void)
-{
- u16 seg;
- asm ("mov %%fs, %0" : "=g"(seg));
- return seg;
-}
-
-static inline u16 read_gs(void)
-{
- u16 seg;
- asm ("mov %%gs, %0" : "=g"(seg));
- return seg;
-}
-
-static inline u16 read_ldt(void)
-{
- u16 ldt;
- asm ("sldt %0" : "=g"(ldt));
- return ldt;
-}
-
-static inline void load_fs(u16 sel)
-{
- asm ("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void load_gs(u16 sel)
-{
- asm ("mov %0, %%gs" : : "rm"(sel));
-}
-
-#ifndef load_ldt
-static inline void load_ldt(u16 sel)
-{
- asm ("lldt %0" : : "rm"(sel));
-}
-#endif
-
-static inline void get_idt(struct descriptor_table *table)
-{
- asm ("sidt %0" : "=m"(*table));
-}
-
-static inline void get_gdt(struct descriptor_table *table)
-{
- asm ("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long read_tr_base(void)
-{
- u16 tr;
- asm ("str %0" : "=g"(tr));
- return segment_base(tr);
-}
-
-#ifdef CONFIG_X86_64
-static inline unsigned long read_msr(unsigned long msr)
-{
- u64 value;
-
- rdmsrl(msr, value);
- return value;
-}
-#endif
-
-static inline void fx_save(struct i387_fxsave_struct *image)
-{
- asm ("fxsave (%0)":: "r" (image));
-}
-
-static inline void fx_restore(struct i387_fxsave_struct *image)
-{
- asm ("fxrstor (%0)":: "r" (image));
-}
-
-static inline void fpu_init(void)
-{
- asm ("finit");
-}
-
-static inline u32 get_rdx_init_val(void)
-{
- return 0x600; /* P6 family */
-}
-
-#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
-
-#define MSR_IA32_TIME_STAMP_COUNTER 0x010
-
-#define TSS_IOPB_BASE_OFFSET 0x66
-#define TSS_BASE_SIZE 0x68
-#define TSS_IOPB_SIZE (65536 / 8)
-#define TSS_REDIRECTION_SIZE (256 / 8)
-#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
-
-#endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
deleted file mode 100644
index c0f372f1d76..00000000000
--- a/drivers/kvm/kvm_main.c
+++ /dev/null
@@ -1,3628 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm.h"
-#include "x86_emulate.h"
-#include "segment_descriptor.h"
-#include "irq.h"
-
-#include <linux/kvm.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/percpu.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/reboot.h>
-#include <linux/debugfs.h>
-#include <linux/highmem.h>
-#include <linux/file.h>
-#include <linux/sysdev.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/smp.h>
-#include <linux/anon_inodes.h>
-#include <linux/profile.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static DEFINE_SPINLOCK(kvm_lock);
-static LIST_HEAD(vm_list);
-
-static cpumask_t cpus_hardware_enabled;
-
-struct kvm_x86_ops *kvm_x86_ops;
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
-
-static __read_mostly struct preempt_ops kvm_preempt_ops;
-
-#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
-
-static struct kvm_stats_debugfs_item {
- const char *name;
- int offset;
- struct dentry *dentry;
-} debugfs_entries[] = {
- { "pf_fixed", STAT_OFFSET(pf_fixed) },
- { "pf_guest", STAT_OFFSET(pf_guest) },
- { "tlb_flush", STAT_OFFSET(tlb_flush) },
- { "invlpg", STAT_OFFSET(invlpg) },
- { "exits", STAT_OFFSET(exits) },
- { "io_exits", STAT_OFFSET(io_exits) },
- { "mmio_exits", STAT_OFFSET(mmio_exits) },
- { "signal_exits", STAT_OFFSET(signal_exits) },
- { "irq_window", STAT_OFFSET(irq_window_exits) },
- { "halt_exits", STAT_OFFSET(halt_exits) },
- { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
- { "request_irq", STAT_OFFSET(request_irq_exits) },
- { "irq_exits", STAT_OFFSET(irq_exits) },
- { "light_exits", STAT_OFFSET(light_exits) },
- { "efer_reload", STAT_OFFSET(efer_reload) },
- { NULL }
-};
-
-static struct dentry *debugfs_dir;
-
-#define MAX_IO_MSRS 256
-
-#define CR0_RESERVED_BITS \
- (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
- | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
- | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS \
- (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
- | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
-
-#ifdef CONFIG_X86_64
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct segment_descriptor_64 {
- struct segment_descriptor s;
- u32 base_higher;
- u32 pad_zero;
-};
-
-#endif
-
-static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
- unsigned long arg);
-
-unsigned long segment_base(u16 selector)
-{
- struct descriptor_table gdt;
- struct segment_descriptor *d;
- unsigned long table_base;
- typedef unsigned long ul;
- unsigned long v;
-
- if (selector == 0)
- return 0;
-
- asm ("sgdt %0" : "=m"(gdt));
- table_base = gdt.base;
-
- if (selector & 4) { /* from ldt */
- u16 ldt_selector;
-
- asm ("sldt %0" : "=g"(ldt_selector));
- table_base = segment_base(ldt_selector);
- }
- d = (struct segment_descriptor *)(table_base + (selector & ~7));
- v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
-#ifdef CONFIG_X86_64
- if (d->system == 0
- && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
-#endif
- return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
-static inline int valid_vcpu(int n)
-{
- return likely(n >= 0 && n < KVM_MAX_VCPUS);
-}
-
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
- return;
-
- vcpu->guest_fpu_loaded = 1;
- fx_save(&vcpu->host_fx_image);
- fx_restore(&vcpu->guest_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->guest_fpu_loaded)
- return;
-
- vcpu->guest_fpu_loaded = 0;
- fx_save(&vcpu->guest_fx_image);
- fx_restore(&vcpu->host_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put()
- */
-static void vcpu_load(struct kvm_vcpu *vcpu)
-{
- int cpu;
-
- mutex_lock(&vcpu->mutex);
- cpu = get_cpu();
- preempt_notifier_register(&vcpu->preempt_notifier);
- kvm_x86_ops->vcpu_load(vcpu, cpu);
- put_cpu();
-}
-
-static void vcpu_put(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
- kvm_x86_ops->vcpu_put(vcpu);
- preempt_notifier_unregister(&vcpu->preempt_notifier);
- preempt_enable();
- mutex_unlock(&vcpu->mutex);
-}
-
-static void ack_flush(void *_completed)
-{
-}
-
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
- int i, cpu;
- cpumask_t cpus;
- struct kvm_vcpu *vcpu;
-
- cpus_clear(cpus);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
- if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
- continue;
- cpu = vcpu->cpu;
- if (cpu != -1 && cpu != raw_smp_processor_id())
- cpu_set(cpu, cpus);
- }
- smp_call_function_mask(cpus, ack_flush, NULL, 1);
-}
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
-{
- struct page *page;
- int r;
-
- mutex_init(&vcpu->mutex);
- vcpu->cpu = -1;
- vcpu->mmu.root_hpa = INVALID_PAGE;
- vcpu->kvm = kvm;
- vcpu->vcpu_id = id;
- if (!irqchip_in_kernel(kvm) || id == 0)
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
- else
- vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
- init_waitqueue_head(&vcpu->wq);
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
- }
- vcpu->run = page_address(page);
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail_free_run;
- }
- vcpu->pio_data = page_address(page);
-
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
-
- return 0;
-
-fail_free_pio_data:
- free_page((unsigned long)vcpu->pio_data);
-fail_free_run:
- free_page((unsigned long)vcpu->run);
-fail:
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
-
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
- kvm_mmu_destroy(vcpu);
- if (vcpu->apic)
- hrtimer_cancel(&vcpu->apic->timer.dev);
- kvm_free_apic(vcpu->apic);
- free_page((unsigned long)vcpu->pio_data);
- free_page((unsigned long)vcpu->run);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
-
-static struct kvm *kvm_create_vm(void)
-{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
- if (!kvm)
- return ERR_PTR(-ENOMEM);
-
- kvm_io_bus_init(&kvm->pio_bus);
- mutex_init(&kvm->lock);
- INIT_LIST_HEAD(&kvm->active_mmu_pages);
- kvm_io_bus_init(&kvm->mmio_bus);
- spin_lock(&kvm_lock);
- list_add(&kvm->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
- return kvm;
-}
-
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
- struct kvm_memory_slot *dont)
-{
- int i;
-
- if (!dont || free->phys_mem != dont->phys_mem)
- if (free->phys_mem) {
- for (i = 0; i < free->npages; ++i)
- if (free->phys_mem[i])
- __free_page(free->phys_mem[i]);
- vfree(free->phys_mem);
- }
-
- if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
- vfree(free->dirty_bitmap);
-
- free->phys_mem = NULL;
- free->npages = 0;
- free->dirty_bitmap = NULL;
-}
-
-static void kvm_free_physmem(struct kvm *kvm)
-{
- int i;
-
- for (i = 0; i < kvm->nmemslots; ++i)
- kvm_free_physmem_slot(&kvm->memslots[i], NULL);
-}
-
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
- if (vcpu->pio.guest_pages[i]) {
- __free_page(vcpu->pio.guest_pages[i]);
- vcpu->pio.guest_pages[i] = NULL;
- }
-}
-
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
-{
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-}
-
-static void kvm_free_vcpus(struct kvm *kvm)
-{
- unsigned int i;
-
- /*
- * Unpin any mmu pages first.
- */
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
-
-}
-
-static void kvm_destroy_vm(struct kvm *kvm)
-{
- spin_lock(&kvm_lock);
- list_del(&kvm->vm_list);
- spin_unlock(&kvm_lock);
- kvm_io_bus_destroy(&kvm->pio_bus);
- kvm_io_bus_destroy(&kvm->mmio_bus);
- kfree(kvm->vpic);
- kfree(kvm->vioapic);
- kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
- kfree(kvm);
-}
-
-static int kvm_vm_release(struct inode *inode, struct file *filp)
-{
- struct kvm *kvm = filp->private_data;
-
- kvm_destroy_vm(kvm);
- return 0;
-}
-
-static void inject_gp(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->inject_gp(vcpu, 0);
-}
-
-/*
- * Load the pae pdptrs. Return true is they are all valid.
- */
-static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
- int i;
- u64 *pdpt;
- int ret;
- struct page *page;
- u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
-
- mutex_lock(&vcpu->kvm->lock);
- page = gfn_to_page(vcpu->kvm, pdpt_gfn);
- if (!page) {
- ret = 0;
- goto out;
- }
-
- pdpt = kmap_atomic(page, KM_USER0);
- memcpy(pdpte, pdpt+offset, sizeof(pdpte));
- kunmap_atomic(pdpt, KM_USER0);
-
- for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
- ret = 0;
- goto out;
- }
- }
- ret = 1;
-
- memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
-out:
- mutex_unlock(&vcpu->kvm->lock);
-
- return ret;
-}
-
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- if (cr0 & CR0_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
- cr0, vcpu->cr0);
- inject_gp(vcpu);
- return;
- }
-
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
- printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
- inject_gp(vcpu);
- return;
- }
-
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
- printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
- "and a clear PE flag\n");
- inject_gp(vcpu);
- return;
- }
-
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-#ifdef CONFIG_X86_64
- if ((vcpu->shadow_efer & EFER_LME)) {
- int cs_db, cs_l;
-
- if (!is_pae(vcpu)) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while PAE is disabled\n");
- inject_gp(vcpu);
- return;
- }
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while CS.L == 1\n");
- inject_gp(vcpu);
- return;
-
- }
- } else
-#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
- printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
- "reserved bits\n");
- inject_gp(vcpu);
- return;
- }
-
- }
-
- kvm_x86_ops->set_cr0(vcpu, cr0);
- vcpu->cr0 = cr0;
-
- mutex_lock(&vcpu->kvm->lock);
- kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
- return;
-}
-EXPORT_SYMBOL_GPL(set_cr0);
-
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
-{
- set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
-}
-EXPORT_SYMBOL_GPL(lmsw);
-
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- if (cr4 & CR4_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
-
- if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE)) {
- printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
- "in long mode\n");
- inject_gp(vcpu);
- return;
- }
- } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
- && !load_pdptrs(vcpu, vcpu->cr3)) {
- printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
- inject_gp(vcpu);
- return;
- }
-
- if (cr4 & X86_CR4_VMXE) {
- printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
- inject_gp(vcpu);
- return;
- }
- kvm_x86_ops->set_cr4(vcpu, cr4);
- vcpu->cr4 = cr4;
- mutex_lock(&vcpu->kvm->lock);
- kvm_mmu_reset_context(vcpu);
- mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr4);
-
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- } else {
- if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS) {
- printk(KERN_DEBUG
- "set_cr3: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
- printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
- "reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- } else {
- if (cr3 & CR3_NONPAE_RESERVED_BITS) {
- printk(KERN_DEBUG
- "set_cr3: #GP, reserved bits\n");
- inject_gp(vcpu);
- return;
- }
- }
- }
-
- mutex_lock(&vcpu->kvm->lock);
- /*
- * Does the new cr3 value map to physical memory? (Note, we
- * catch an invalid cr3 even in real-mode, because it would
- * cause trouble later on when we turn on paging anyway.)
- *
- * A real CPU would silently accept an invalid cr3 and would
- * attempt to use it - with largely undefined (and often hard
- * to debug) behavior on the guest side.
- */
- if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- inject_gp(vcpu);
- else {
- vcpu->cr3 = cr3;
- vcpu->mmu.new_cr3(vcpu);
- }
- mutex_unlock(&vcpu->kvm->lock);
-}
-EXPORT_SYMBOL_GPL(set_cr3);
-
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- if (cr8 & CR8_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
- inject_gp(vcpu);
- return;
- }
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_tpr(vcpu, cr8);
- else
- vcpu->cr8 = cr8;
-}
-EXPORT_SYMBOL_GPL(set_cr8);
-
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
-{
- if (irqchip_in_kernel(vcpu->kvm))
- return kvm_lapic_get_cr8(vcpu);
- else
- return vcpu->cr8;
-}
-EXPORT_SYMBOL_GPL(get_cr8);
-
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
-{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->apic_base;
- else
- return vcpu->apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
- /* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->apic_base = data;
-}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-
-void fx_init(struct kvm_vcpu *vcpu)
-{
- unsigned after_mxcsr_mask;
-
- /* Initialize guest FPU by resetting ours and saving into guest's */
- preempt_disable();
- fx_save(&vcpu->host_fx_image);
- fpu_init();
- fx_save(&vcpu->guest_fx_image);
- fx_restore(&vcpu->host_fx_image);
- preempt_enable();
-
- vcpu->cr0 |= X86_CR0_ET;
- after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
- vcpu->guest_fx_image.mxcsr = 0x1f80;
- memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
- 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-/*
- * Allocate some memory and give it an address in the guest physical address
- * space.
- *
- * Discontiguous memory is allowed, mostly for framebuffers.
- */
-static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
- struct kvm_memory_region *mem)
-{
- int r;
- gfn_t base_gfn;
- unsigned long npages;
- unsigned long i;
- struct kvm_memory_slot *memslot;
- struct kvm_memory_slot old, new;
-
- r = -EINVAL;
- /* General sanity checks */
- if (mem->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (mem->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (mem->slot >= KVM_MEMORY_SLOTS)
- goto out;
- if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
- goto out;
-
- memslot = &kvm->memslots[mem->slot];
- base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
- npages = mem->memory_size >> PAGE_SHIFT;
-
- if (!npages)
- mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
- mutex_lock(&kvm->lock);
-
- new = old = *memslot;
-
- new.base_gfn = base_gfn;
- new.npages = npages;
- new.flags = mem->flags;
-
- /* Disallow changing a memory slot's size. */
- r = -EINVAL;
- if (npages && old.npages && npages != old.npages)
- goto out_unlock;
-
- /* Check for overlaps */
- r = -EEXIST;
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *s = &kvm->memslots[i];
-
- if (s == memslot)
- continue;
- if (!((base_gfn + npages <= s->base_gfn) ||
- (base_gfn >= s->base_gfn + s->npages)))
- goto out_unlock;
- }
-
- /* Deallocate if slot is being removed */
- if (!npages)
- new.phys_mem = NULL;
-
- /* Free page dirty bitmap if unneeded */
- if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
- new.dirty_bitmap = NULL;
-
- r = -ENOMEM;
-
- /* Allocate if a slot is being created */
- if (npages && !new.phys_mem) {
- new.phys_mem = vmalloc(npages * sizeof(struct page *));
-
- if (!new.phys_mem)
- goto out_unlock;
-
- memset(new.phys_mem, 0, npages * sizeof(struct page *));
- for (i = 0; i < npages; ++i) {
- new.phys_mem[i] = alloc_page(GFP_HIGHUSER
- | __GFP_ZERO);
- if (!new.phys_mem[i])
- goto out_unlock;
- set_page_private(new.phys_mem[i],0);
- }
- }
-
- /* Allocate page dirty bitmap if needed */
- if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
- unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
-
- new.dirty_bitmap = vmalloc(dirty_bytes);
- if (!new.dirty_bitmap)
- goto out_unlock;
- memset(new.dirty_bitmap, 0, dirty_bytes);
- }
-
- if (mem->slot >= kvm->nmemslots)
- kvm->nmemslots = mem->slot + 1;
-
- *memslot = new;
-
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- kvm_flush_remote_tlbs(kvm);
-
- mutex_unlock(&kvm->lock);
-
- kvm_free_physmem_slot(&old, &new);
- return 0;
-
-out_unlock:
- mutex_unlock(&kvm->lock);
- kvm_free_physmem_slot(&new, &old);
-out:
- return r;
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
- struct kvm_dirty_log *log)
-{
- struct kvm_memory_slot *memslot;
- int r, i;
- int n;
- unsigned long any = 0;
-
- mutex_lock(&kvm->lock);
-
- r = -EINVAL;
- if (log->slot >= KVM_MEMORY_SLOTS)
- goto out;
-
- memslot = &kvm->memslots[log->slot];
- r = -ENOENT;
- if (!memslot->dirty_bitmap)
- goto out;
-
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
-
- for (i = 0; !any && i < n/sizeof(long); ++i)
- any = memslot->dirty_bitmap[i];
-
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
- goto out;
-
- /* If nothing is dirty, don't bother messing with page tables. */
- if (any) {
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- kvm_flush_remote_tlbs(kvm);
- memset(memslot->dirty_bitmap, 0, n);
- }
-
- r = 0;
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-/*
- * Set a new alias region. Aliases map a portion of physical memory into
- * another portion. This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
- struct kvm_memory_alias *alias)
-{
- int r, n;
- struct kvm_mem_alias *p;
-
- r = -EINVAL;
- /* General sanity checks */
- if (alias->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (alias->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (alias->slot >= KVM_ALIAS_SLOTS)
- goto out;
- if (alias->guest_phys_addr + alias->memory_size
- < alias->guest_phys_addr)
- goto out;
- if (alias->target_phys_addr + alias->memory_size
- < alias->target_phys_addr)
- goto out;
-
- mutex_lock(&kvm->lock);
-
- p = &kvm->aliases[alias->slot];
- p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
- p->npages = alias->memory_size >> PAGE_SHIFT;
- p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-
- for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (kvm->aliases[n - 1].npages)
- break;
- kvm->naliases = n;
-
- kvm_mmu_zap_all(kvm);
-
- mutex_unlock(&kvm->lock);
-
- return 0;
-
-out:
- return r;
-}
-
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy (&chip->chip.pic,
- &pic_irqchip(kvm)->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy (&chip->chip.pic,
- &pic_irqchip(kvm)->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- memcpy (&chip->chip.ioapic,
- ioapic_irqchip(kvm),
- sizeof(struct kvm_ioapic_state));
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy (&pic_irqchip(kvm)->pics[0],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy (&pic_irqchip(kvm)->pics[1],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- memcpy (ioapic_irqchip(kvm),
- &chip->chip.ioapic,
- sizeof(struct kvm_ioapic_state));
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic_irqchip(kvm));
- return r;
-}
-
-static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
-
- for (i = 0; i < kvm->naliases; ++i) {
- alias = &kvm->aliases[i];
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
- int i;
-
- for (i = 0; i < kvm->nmemslots; ++i) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
-
- if (gfn >= memslot->base_gfn
- && gfn < memslot->base_gfn + memslot->npages)
- return memslot;
- }
- return NULL;
-}
-
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
- gfn = unalias_gfn(kvm, gfn);
- return __gfn_to_memslot(kvm, gfn);
-}
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
-
- gfn = unalias_gfn(kvm, gfn);
- slot = __gfn_to_memslot(kvm, gfn);
- if (!slot)
- return NULL;
- return slot->phys_mem[gfn - slot->base_gfn];
-}
-EXPORT_SYMBOL_GPL(gfn_to_page);
-
-/* WARNING: Does not work on aliased pages. */
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *memslot;
-
- memslot = __gfn_to_memslot(kvm, gfn);
- if (memslot && memslot->dirty_bitmap) {
- unsigned long rel_gfn = gfn - memslot->base_gfn;
-
- /* avoid RMW */
- if (!test_bit(rel_gfn, memslot->dirty_bitmap))
- set_bit(rel_gfn, memslot->dirty_bitmap);
- }
-}
-
-int emulator_read_std(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- void *data = val;
-
- while (bytes) {
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
- unsigned offset = addr & (PAGE_SIZE-1);
- unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
- unsigned long pfn;
- struct page *page;
- void *page_virt;
-
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
- pfn = gpa >> PAGE_SHIFT;
- page = gfn_to_page(vcpu->kvm, pfn);
- if (!page)
- return X86EMUL_UNHANDLEABLE;
- page_virt = kmap_atomic(page, KM_USER0);
-
- memcpy(data, page_virt + offset, tocopy);
-
- kunmap_atomic(page_virt, KM_USER0);
-
- bytes -= tocopy;
- data += tocopy;
- addr += tocopy;
- }
-
- return X86EMUL_CONTINUE;
-}
-EXPORT_SYMBOL_GPL(emulator_read_std);
-
-static int emulator_write_std(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
- return X86EMUL_UNHANDLEABLE;
-}
-
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
- gpa_t addr)
-{
- struct kvm_io_device *dev;
-
- if (vcpu->apic) {
- dev = &vcpu->apic->dev;
- if (dev->in_range(dev, addr))
- return dev;
- }
- return NULL;
-}
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr)
-{
- struct kvm_io_device *dev;
-
- dev = vcpu_find_pervcpu_dev(vcpu, addr);
- if (dev == NULL)
- dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
- return dev;
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr)
-{
- return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
-}
-
-static int emulator_read_emulated(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_io_device *mmio_dev;
- gpa_t gpa;
-
- if (vcpu->mmio_read_completed) {
- memcpy(val, vcpu->mmio_data, bytes);
- vcpu->mmio_read_completed = 0;
- return X86EMUL_CONTINUE;
- } else if (emulator_read_std(addr, val, bytes, vcpu)
- == X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
-
- gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
-
- /*
- * Is this MMIO handled locally?
- */
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
- if (mmio_dev) {
- kvm_iodevice_read(mmio_dev, gpa, bytes, val);
- return X86EMUL_CONTINUE;
- }
-
- vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 0;
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
-{
- struct page *page;
- void *virt;
-
- if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
- return 0;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!page)
- return 0;
- mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
- virt = kmap_atomic(page, KM_USER0);
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- memcpy(virt + offset_in_page(gpa), val, bytes);
- kunmap_atomic(virt, KM_USER0);
- return 1;
-}
-
-static int emulator_write_emulated_onepage(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_io_device *mmio_dev;
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
-
- if (gpa == UNMAPPED_GVA) {
- kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- if (emulator_write_phys(vcpu, gpa, val, bytes))
- return X86EMUL_CONTINUE;
-
- /*
- * Is this MMIO handled locally?
- */
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
- if (mmio_dev) {
- kvm_iodevice_write(mmio_dev, gpa, bytes, val);
- return X86EMUL_CONTINUE;
- }
-
- vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 1;
- memcpy(vcpu->mmio_data, val, bytes);
-
- return X86EMUL_CONTINUE;
-}
-
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- /* Crossing a page boundary? */
- if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
- int rc, now;
-
- now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- addr += now;
- val += now;
- bytes -= now;
- }
- return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
-}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
-
-static int emulator_cmpxchg_emulated(unsigned long addr,
- const void *old,
- const void *new,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- static int reported;
-
- if (!reported) {
- reported = 1;
- printk(KERN_WARNING "kvm: emulating exchange as write\n");
- }
- return emulator_write_emulated(addr, new, bytes, vcpu);
-}
-
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- return kvm_x86_ops->get_segment_base(vcpu, seg);
-}
-
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
-{
- return X86EMUL_CONTINUE;
-}
-
-int emulate_clts(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
- return X86EMUL_CONTINUE;
-}
-
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
-{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
-
- switch (dr) {
- case 0 ... 3:
- *dest = kvm_x86_ops->get_dr(vcpu, dr);
- return X86EMUL_CONTINUE;
- default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
- return X86EMUL_UNHANDLEABLE;
- }
-}
-
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
-{
- unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- int exception;
-
- kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
- if (exception) {
- /* FIXME: better handling */
- return X86EMUL_UNHANDLEABLE;
- }
- return X86EMUL_CONTINUE;
-}
-
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-{
- static int reported;
- u8 opcodes[4];
- unsigned long rip = vcpu->rip;
- unsigned long rip_linear;
-
- rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
- if (reported)
- return;
-
- emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
-
- printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
- context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
- reported = 1;
-}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-
-struct x86_emulate_ops emulate_ops = {
- .read_std = emulator_read_std,
- .write_std = emulator_write_std,
- .read_emulated = emulator_read_emulated,
- .write_emulated = emulator_write_emulated,
- .cmpxchg_emulated = emulator_cmpxchg_emulated,
-};
-
-int emulate_instruction(struct kvm_vcpu *vcpu,
- struct kvm_run *run,
- unsigned long cr2,
- u16 error_code)
-{
- struct x86_emulate_ctxt emulate_ctxt;
- int r;
- int cs_db, cs_l;
-
- vcpu->mmio_fault_cr2 = cr2;
- kvm_x86_ops->cache_regs(vcpu);
-
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
- emulate_ctxt.vcpu = vcpu;
- emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- emulate_ctxt.cr2 = cr2;
- emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_REAL : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
- if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
- emulate_ctxt.cs_base = 0;
- emulate_ctxt.ds_base = 0;
- emulate_ctxt.es_base = 0;
- emulate_ctxt.ss_base = 0;
- } else {
- emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
- emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
- emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
- emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
- }
-
- emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
- emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
-
- vcpu->mmio_is_write = 0;
- vcpu->pio.string = 0;
- r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
- if (vcpu->pio.string)
- return EMULATE_DO_MMIO;
-
- if ((r || vcpu->mmio_is_write) && run) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr;
- memcpy(run->mmio.data, vcpu->mmio_data, 8);
- run->mmio.len = vcpu->mmio_size;
- run->mmio.is_write = vcpu->mmio_is_write;
- }
-
- if (r) {
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- return EMULATE_DONE;
- if (!vcpu->mmio_needed) {
- kvm_report_emulation_failure(vcpu, "mmio");
- return EMULATE_FAIL;
- }
- return EMULATE_DO_MMIO;
- }
-
- kvm_x86_ops->decache_regs(vcpu);
- kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
-
- if (vcpu->mmio_is_write) {
- vcpu->mmio_needed = 0;
- return EMULATE_DO_MMIO;
- }
-
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-/*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
- */
-static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue(&vcpu->wq, &wait);
-
- /*
- * We will block until either an interrupt or a signal wakes us up
- */
- while (!kvm_cpu_has_interrupt(vcpu)
- && !signal_pending(current)
- && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
- && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
- set_current_state(TASK_INTERRUPTIBLE);
- vcpu_put(vcpu);
- schedule();
- vcpu_load(vcpu);
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&vcpu->wq, &wait);
-}
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->mp_state = VCPU_MP_STATE_HALTED;
- kvm_vcpu_block(vcpu);
- if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
- return -EINTR;
- return 1;
- } else {
- vcpu->run->exit_reason = KVM_EXIT_HLT;
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
- unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
-
- kvm_x86_ops->cache_regs(vcpu);
- ret = -KVM_EINVAL;
-#ifdef CONFIG_X86_64
- if (is_long_mode(vcpu)) {
- nr = vcpu->regs[VCPU_REGS_RAX];
- a0 = vcpu->regs[VCPU_REGS_RDI];
- a1 = vcpu->regs[VCPU_REGS_RSI];
- a2 = vcpu->regs[VCPU_REGS_RDX];
- a3 = vcpu->regs[VCPU_REGS_RCX];
- a4 = vcpu->regs[VCPU_REGS_R8];
- a5 = vcpu->regs[VCPU_REGS_R9];
- } else
-#endif
- {
- nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
- a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
- a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
- a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
- a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
- a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
- a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
- }
- switch (nr) {
- default:
- run->hypercall.nr = nr;
- run->hypercall.args[0] = a0;
- run->hypercall.args[1] = a1;
- run->hypercall.args[2] = a2;
- run->hypercall.args[3] = a3;
- run->hypercall.args[4] = a4;
- run->hypercall.args[5] = a5;
- run->hypercall.ret = ret;
- run->hypercall.longmode = is_long_mode(vcpu);
- kvm_x86_ops->decache_regs(vcpu);
- return 0;
- }
- vcpu->regs[VCPU_REGS_RAX] = ret;
- kvm_x86_ops->decache_regs(vcpu);
- return 1;
-}
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct descriptor_table dt = { limit, base };
-
- kvm_x86_ops->set_gdt(vcpu, &dt);
-}
-
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct descriptor_table dt = { limit, base };
-
- kvm_x86_ops->set_idt(vcpu, &dt);
-}
-
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags)
-{
- lmsw(vcpu, msw);
- *rflags = kvm_x86_ops->get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- switch (cr) {
- case 0:
- return vcpu->cr0;
- case 2:
- return vcpu->cr2;
- case 3:
- return vcpu->cr3;
- case 4:
- return vcpu->cr4;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
- return 0;
- }
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
- unsigned long *rflags)
-{
- switch (cr) {
- case 0:
- set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
- *rflags = kvm_x86_ops->get_rflags(vcpu);
- break;
- case 2:
- vcpu->cr2 = val;
- break;
- case 3:
- set_cr3(vcpu, val);
- break;
- case 4:
- set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
- }
-}
-
-/*
- * Register the para guest with the host:
- */
-static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
-{
- struct kvm_vcpu_para_state *para_state;
- hpa_t para_state_hpa, hypercall_hpa;
- struct page *para_state_page;
- unsigned char *hypercall;
- gpa_t hypercall_gpa;
-
- printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
- printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
-
- /*
- * Needs to be page aligned:
- */
- if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
- goto err_gp;
-
- para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
- printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
- if (is_error_hpa(para_state_hpa))
- goto err_gp;
-
- mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
- para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
- para_state = kmap(para_state_page);
-
- printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
- printk(KERN_DEBUG ".... size: %d\n", para_state->size);
-
- para_state->host_version = KVM_PARA_API_VERSION;
- /*
- * We cannot support guests that try to register themselves
- * with a newer API version than the host supports:
- */
- if (para_state->guest_version > KVM_PARA_API_VERSION) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- }
-
- hypercall_gpa = para_state->hypercall_gpa;
- hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
- printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
- if (is_error_hpa(hypercall_hpa)) {
- para_state->ret = -KVM_EINVAL;
- goto err_kunmap_skip;
- }
-
- printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
- vcpu->para_state_page = para_state_page;
- vcpu->para_state_gpa = para_state_gpa;
- vcpu->hypercall_gpa = hypercall_gpa;
-
- mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
- hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
- KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
- kvm_x86_ops->patch_hypercall(vcpu, hypercall);
- kunmap_atomic(hypercall, KM_USER1);
-
- para_state->ret = 0;
-err_kunmap_skip:
- kunmap(para_state_page);
- return 0;
-err_gp:
- return 1;
-}
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data;
-
- switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
- case MSR_IA32_P5_MC_ADDR:
- case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MCG_CAP:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_PERF_STATUS:
- case MSR_IA32_EBL_CR_POWERON:
- /* MTRR registers */
- case 0xfe:
- case 0x200 ... 0x2ff:
- data = 0;
- break;
- case 0xcd: /* fsb frequency */
- data = 3;
- break;
- case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
- break;
- case MSR_IA32_MISC_ENABLE:
- data = vcpu->ia32_misc_enable_msr;
- break;
-#ifdef CONFIG_X86_64
- case MSR_EFER:
- data = vcpu->shadow_efer;
- break;
-#endif
- default:
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
- return 1;
- }
- *pdata = data;
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-}
-
-#ifdef CONFIG_X86_64
-
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- if (efer & EFER_RESERVED_BITS) {
- printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
- efer);
- inject_gp(vcpu);
- return;
- }
-
- if (is_paging(vcpu)
- && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
- printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
- inject_gp(vcpu);
- return;
- }
-
- kvm_x86_ops->set_efer(vcpu, efer);
-
- efer &= ~EFER_LMA;
- efer |= vcpu->shadow_efer & EFER_LMA;
-
- vcpu->shadow_efer = efer;
-}
-
-#endif
-
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- switch (msr) {
-#ifdef CONFIG_X86_64
- case MSR_EFER:
- set_efer(vcpu, data);
- break;
-#endif
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
- break;
- case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
- break;
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_UCODE_WRITE:
- case 0x200 ... 0x2ff: /* MTRRs */
- break;
- case MSR_IA32_APICBASE:
- kvm_set_apic_base(vcpu, data);
- break;
- case MSR_IA32_MISC_ENABLE:
- vcpu->ia32_misc_enable_msr = data;
- break;
- /*
- * This is the 'probe whether the host is KVM' logic:
- */
- case MSR_KVM_API_MAGIC:
- return vcpu_register_para(vcpu, data);
-
- default:
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
- return kvm_x86_ops->set_msr(vcpu, msr_index, data);
-}
-
-void kvm_resched(struct kvm_vcpu *vcpu)
-{
- if (!need_resched())
- return;
- cond_resched();
-}
-EXPORT_SYMBOL_GPL(kvm_resched);
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
- int i;
- u32 function;
- struct kvm_cpuid_entry *e, *best;
-
- kvm_x86_ops->cache_regs(vcpu);
- function = vcpu->regs[VCPU_REGS_RAX];
- vcpu->regs[VCPU_REGS_RAX] = 0;
- vcpu->regs[VCPU_REGS_RBX] = 0;
- vcpu->regs[VCPU_REGS_RCX] = 0;
- vcpu->regs[VCPU_REGS_RDX] = 0;
- best = NULL;
- for (i = 0; i < vcpu->cpuid_nent; ++i) {
- e = &vcpu->cpuid_entries[i];
- if (e->function == function) {
- best = e;
- break;
- }
- /*
- * Both basic or both extended?
- */
- if (((e->function ^ function) & 0x80000000) == 0)
- if (!best || e->function > best->function)
- best = e;
- }
- if (best) {
- vcpu->regs[VCPU_REGS_RAX] = best->eax;
- vcpu->regs[VCPU_REGS_RBX] = best->ebx;
- vcpu->regs[VCPU_REGS_RCX] = best->ecx;
- vcpu->regs[VCPU_REGS_RDX] = best->edx;
- }
- kvm_x86_ops->decache_regs(vcpu);
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
- void *p = vcpu->pio_data;
- void *q;
- unsigned bytes;
- int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
-
- q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
- PAGE_KERNEL);
- if (!q) {
- free_pio_guest_pages(vcpu);
- return -ENOMEM;
- }
- q += vcpu->pio.guest_page_offset;
- bytes = vcpu->pio.size * vcpu->pio.cur_count;
- if (vcpu->pio.in)
- memcpy(q, p, bytes);
- else
- memcpy(p, q, bytes);
- q -= vcpu->pio.guest_page_offset;
- vunmap(q);
- free_pio_guest_pages(vcpu);
- return 0;
-}
-
-static int complete_pio(struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->pio;
- long delta;
- int r;
-
- kvm_x86_ops->cache_regs(vcpu);
-
- if (!io->string) {
- if (io->in)
- memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
- io->size);
- } else {
- if (io->in) {
- r = pio_copy_data(vcpu);
- if (r) {
- kvm_x86_ops->cache_regs(vcpu);
- return r;
- }
- }
-
- delta = 1;
- if (io->rep) {
- delta *= io->cur_count;
- /*
- * The size of the register should really depend on
- * current address size.
- */
- vcpu->regs[VCPU_REGS_RCX] -= delta;
- }
- if (io->down)
- delta = -delta;
- delta *= io->size;
- if (io->in)
- vcpu->regs[VCPU_REGS_RDI] += delta;
- else
- vcpu->regs[VCPU_REGS_RSI] += delta;
- }
-
- kvm_x86_ops->decache_regs(vcpu);
-
- io->count -= io->cur_count;
- io->cur_count = 0;
-
- return 0;
-}
-
-static void kernel_pio(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu,
- void *pd)
-{
- /* TODO: String I/O for in kernel device */
-
- mutex_lock(&vcpu->kvm->lock);
- if (vcpu->pio.in)
- kvm_iodevice_read(pio_dev, vcpu->pio.port,
- vcpu->pio.size,
- pd);
- else
- kvm_iodevice_write(pio_dev, vcpu->pio.port,
- vcpu->pio.size,
- pd);
- mutex_unlock(&vcpu->kvm->lock);
-}
-
-static void pio_string_write(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->pio;
- void *pd = vcpu->pio_data;
- int i;
-
- mutex_lock(&vcpu->kvm->lock);
- for (i = 0; i < io->cur_count; i++) {
- kvm_iodevice_write(pio_dev, io->port,
- io->size,
- pd);
- pd += io->size;
- }
- mutex_unlock(&vcpu->kvm->lock);
-}
-
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned port)
-{
- struct kvm_io_device *pio_dev;
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
- vcpu->run->io.port = vcpu->pio.port = port;
- vcpu->pio.in = in;
- vcpu->pio.string = 0;
- vcpu->pio.down = 0;
- vcpu->pio.guest_page_offset = 0;
- vcpu->pio.rep = 0;
-
- kvm_x86_ops->cache_regs(vcpu);
- memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
- kvm_x86_ops->decache_regs(vcpu);
-
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
- pio_dev = vcpu_find_pio_dev(vcpu, port);
- if (pio_dev) {
- kernel_pio(pio_dev, vcpu, vcpu->pio_data);
- complete_pio(vcpu);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port)
-{
- unsigned now, in_page;
- int i, ret = 0;
- int nr_pages = 1;
- struct page *page;
- struct kvm_io_device *pio_dev;
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
- vcpu->run->io.port = vcpu->pio.port = port;
- vcpu->pio.in = in;
- vcpu->pio.string = 1;
- vcpu->pio.down = down;
- vcpu->pio.guest_page_offset = offset_in_page(address);
- vcpu->pio.rep = rep;
-
- if (!count) {
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- return 1;
- }
-
- if (!down)
- in_page = PAGE_SIZE - offset_in_page(address);
- else
- in_page = offset_in_page(address) + size;
- now = min(count, (unsigned long)in_page / size);
- if (!now) {
- /*
- * String I/O straddles page boundary. Pin two guest pages
- * so that we satisfy atomicity constraints. Do just one
- * transaction to avoid complexity.
- */
- nr_pages = 2;
- now = 1;
- }
- if (down) {
- /*
- * String I/O in reverse. Yuck. Kill the guest, fix later.
- */
- pr_unimpl(vcpu, "guest string pio down\n");
- inject_gp(vcpu);
- return 1;
- }
- vcpu->run->io.count = now;
- vcpu->pio.cur_count = now;
-
- if (vcpu->pio.cur_count == vcpu->pio.count)
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
- for (i = 0; i < nr_pages; ++i) {
- mutex_lock(&vcpu->kvm->lock);
- page = gva_to_page(vcpu, address + i * PAGE_SIZE);
- if (page)
- get_page(page);
- vcpu->pio.guest_pages[i] = page;
- mutex_unlock(&vcpu->kvm->lock);
- if (!page) {
- inject_gp(vcpu);
- free_pio_guest_pages(vcpu);
- return 1;
- }
- }
-
- pio_dev = vcpu_find_pio_dev(vcpu, port);
- if (!vcpu->pio.in) {
- /* string PIO write */
- ret = pio_copy_data(vcpu);
- if (ret >= 0 && pio_dev) {
- pio_string_write(pio_dev, vcpu);
- complete_pio(vcpu);
- if (vcpu->pio.count == 0)
- ret = 1;
- }
- } else if (pio_dev)
- pr_unimpl(vcpu, "no string pio read support yet, "
- "port %x size %d count %ld\n",
- port, size, count);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- return (!vcpu->irq_summary &&
- kvm_run->request_interrupt_window &&
- vcpu->interrupt_window_open &&
- (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
-}
-
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->cr8 = get_cr8(vcpu);
- kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
- kvm_run->ready_for_interrupt_injection =
- (vcpu->interrupt_window_open &&
- vcpu->irq_summary == 0);
-}
-
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- int r;
-
- if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
- printk("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->sipi_vector);
- kvm_lapic_reset(vcpu);
- kvm_x86_ops->vcpu_reset(vcpu);
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
- }
-
-preempted:
- if (vcpu->guest_debug.enabled)
- kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
- preempt_disable();
-
- kvm_x86_ops->prepare_guest_switch(vcpu);
- kvm_load_guest_fpu(vcpu);
-
- local_irq_disable();
-
- if (signal_pending(current)) {
- local_irq_enable();
- preempt_enable();
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- goto out;
- }
-
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_x86_ops->inject_pending_irq(vcpu);
- else if (!vcpu->mmio_read_completed)
- kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
-
- vcpu->guest_mode = 1;
- kvm_guest_enter();
-
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
- kvm_x86_ops->tlb_flush(vcpu);
-
- kvm_x86_ops->run(vcpu, kvm_run);
-
- vcpu->guest_mode = 0;
- local_irq_enable();
-
- ++vcpu->stat.exits;
-
- /*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
- */
- barrier();
-
- kvm_guest_exit();
-
- preempt_enable();
-
- /*
- * Profile KVM exit RIPs:
- */
- if (unlikely(prof_on == KVM_PROFILING)) {
- kvm_x86_ops->cache_regs(vcpu);
- profile_hit(KVM_PROFILING, (void *)vcpu->rip);
- }
-
- r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
-
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- goto out;
- }
- if (!need_resched()) {
- ++vcpu->stat.light_exits;
- goto again;
- }
- }
-
-out:
- if (r > 0) {
- kvm_resched(vcpu);
- goto preempted;
- }
-
- post_kvm_run_save(vcpu, kvm_run);
-
- return r;
-}
-
-
-static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- int r;
- sigset_t sigsaved;
-
- vcpu_load(vcpu);
-
- if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
- kvm_vcpu_block(vcpu);
- vcpu_put(vcpu);
- return -EAGAIN;
- }
-
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
- /* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm))
- set_cr8(vcpu, kvm_run->cr8);
-
- if (vcpu->pio.cur_count) {
- r = complete_pio(vcpu);
- if (r)
- goto out;
- }
-
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- r = emulate_instruction(vcpu, kvm_run,
- vcpu->mmio_fault_cr2, 0);
- if (r == EMULATE_DO_MMIO) {
- /*
- * Read-modify-write. Back to userspace.
- */
- r = 0;
- goto out;
- }
- }
-
- if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
- kvm_x86_ops->cache_regs(vcpu);
- vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
- kvm_x86_ops->decache_regs(vcpu);
- }
-
- r = __vcpu_run(vcpu, kvm_run);
-
-out:
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
- vcpu_put(vcpu);
- return r;
-}
-
-static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
- struct kvm_regs *regs)
-{
- vcpu_load(vcpu);
-
- kvm_x86_ops->cache_regs(vcpu);
-
- regs->rax = vcpu->regs[VCPU_REGS_RAX];
- regs->rbx = vcpu->regs[VCPU_REGS_RBX];
- regs->rcx = vcpu->regs[VCPU_REGS_RCX];
- regs->rdx = vcpu->regs[VCPU_REGS_RDX];
- regs->rsi = vcpu->regs[VCPU_REGS_RSI];
- regs->rdi = vcpu->regs[VCPU_REGS_RDI];
- regs->rsp = vcpu->regs[VCPU_REGS_RSP];
- regs->rbp = vcpu->regs[VCPU_REGS_RBP];
-#ifdef CONFIG_X86_64
- regs->r8 = vcpu->regs[VCPU_REGS_R8];
- regs->r9 = vcpu->regs[VCPU_REGS_R9];
- regs->r10 = vcpu->regs[VCPU_REGS_R10];
- regs->r11 = vcpu->regs[VCPU_REGS_R11];
- regs->r12 = vcpu->regs[VCPU_REGS_R12];
- regs->r13 = vcpu->regs[VCPU_REGS_R13];
- regs->r14 = vcpu->regs[VCPU_REGS_R14];
- regs->r15 = vcpu->regs[VCPU_REGS_R15];
-#endif
-
- regs->rip = vcpu->rip;
- regs->rflags = kvm_x86_ops->get_rflags(vcpu);
-
- /*
- * Don't leak debug flags in case they were set for guest debugging
- */
- if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
- regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
- struct kvm_regs *regs)
-{
- vcpu_load(vcpu);
-
- vcpu->regs[VCPU_REGS_RAX] = regs->rax;
- vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
- vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
- vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
- vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
- vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
- vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
- vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
-#ifdef CONFIG_X86_64
- vcpu->regs[VCPU_REGS_R8] = regs->r8;
- vcpu->regs[VCPU_REGS_R9] = regs->r9;
- vcpu->regs[VCPU_REGS_R10] = regs->r10;
- vcpu->regs[VCPU_REGS_R11] = regs->r11;
- vcpu->regs[VCPU_REGS_R12] = regs->r12;
- vcpu->regs[VCPU_REGS_R13] = regs->r13;
- vcpu->regs[VCPU_REGS_R14] = regs->r14;
- vcpu->regs[VCPU_REGS_R15] = regs->r15;
-#endif
-
- vcpu->rip = regs->rip;
- kvm_x86_ops->set_rflags(vcpu, regs->rflags);
-
- kvm_x86_ops->decache_regs(vcpu);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static void get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- return kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- struct descriptor_table dt;
- int pending_vec;
-
- vcpu_load(vcpu);
-
- get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.limit;
- sregs->idt.base = dt.base;
- kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.limit;
- sregs->gdt.base = dt.base;
-
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- sregs->cr0 = vcpu->cr0;
- sregs->cr2 = vcpu->cr2;
- sregs->cr3 = vcpu->cr3;
- sregs->cr4 = vcpu->cr4;
- sregs->cr8 = get_cr8(vcpu);
- sregs->efer = vcpu->shadow_efer;
- sregs->apic_base = kvm_get_apic_base(vcpu);
-
- if (irqchip_in_kernel(vcpu->kvm)) {
- memset(sregs->interrupt_bitmap, 0,
- sizeof sregs->interrupt_bitmap);
- pending_vec = kvm_x86_ops->get_irq(vcpu);
- if (pending_vec >= 0)
- set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
- } else
- memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
- sizeof sregs->interrupt_bitmap);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static void set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- return kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- int mmu_reset_needed = 0;
- int i, pending_vec, max_bits;
- struct descriptor_table dt;
-
- vcpu_load(vcpu);
-
- dt.limit = sregs->idt.limit;
- dt.base = sregs->idt.base;
- kvm_x86_ops->set_idt(vcpu, &dt);
- dt.limit = sregs->gdt.limit;
- dt.base = sregs->gdt.base;
- kvm_x86_ops->set_gdt(vcpu, &dt);
-
- vcpu->cr2 = sregs->cr2;
- mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
- vcpu->cr3 = sregs->cr3;
-
- set_cr8(vcpu, sregs->cr8);
-
- mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
- kvm_set_apic_base(vcpu, sregs->apic_base);
-
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-
- mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
- vcpu->cr0 = sregs->cr0;
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
-
- mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (!is_long_mode(vcpu) && is_pae(vcpu))
- load_pdptrs(vcpu, vcpu->cr3);
-
- if (mmu_reset_needed)
- kvm_mmu_reset_context(vcpu);
-
- if (!irqchip_in_kernel(vcpu->kvm)) {
- memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
- sizeof vcpu->irq_pending);
- vcpu->irq_summary = 0;
- for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
- if (vcpu->irq_pending[i])
- __set_bit(i, &vcpu->irq_summary);
- } else {
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap,
- max_bits);
- /* Only pending external irq is handled here */
- if (pending_vec < max_bits) {
- kvm_x86_ops->set_irq(vcpu, pending_vec);
- printk("Set back pending irq %d\n", pending_vec);
- }
- }
-
- set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- struct kvm_segment cs;
-
- get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
-/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
- */
-static u32 msrs_to_save[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_K6_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TIME_STAMP_COUNTER,
-};
-
-static unsigned num_msrs_to_save;
-
-static u32 emulated_msrs[] = {
- MSR_IA32_MISC_ENABLE,
-};
-
-static __init void kvm_init_msr_list(void)
-{
- u32 dummy[2];
- unsigned i, j;
-
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
- continue;
- if (j < i)
- msrs_to_save[j] = msrs_to_save[i];
- j++;
- }
- num_msrs_to_save = j;
-}
-
-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
- return kvm_set_msr(vcpu, index, *data);
-}
-
-/*
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- *
- * @return number of msrs set successfully.
- */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
- struct kvm_msr_entry *entries,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data))
-{
- int i;
-
- vcpu_load(vcpu);
-
- for (i = 0; i < msrs->nmsrs; ++i)
- if (do_msr(vcpu, entries[i].index, &entries[i].data))
- break;
-
- vcpu_put(vcpu);
-
- return i;
-}
-
-/*
- * Read or write a bunch of msrs. Parameters are user addresses.
- *
- * @return number of msrs set successfully.
- */
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data),
- int writeback)
-{
- struct kvm_msrs msrs;
- struct kvm_msr_entry *entries;
- int r, n;
- unsigned size;
-
- r = -EFAULT;
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
- goto out;
-
- r = -E2BIG;
- if (msrs.nmsrs >= MAX_IO_MSRS)
- goto out;
-
- r = -ENOMEM;
- size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = vmalloc(size);
- if (!entries)
- goto out;
-
- r = -EFAULT;
- if (copy_from_user(entries, user_msrs->entries, size))
- goto out_free;
-
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (r < 0)
- goto out_free;
-
- r = -EFAULT;
- if (writeback && copy_to_user(user_msrs->entries, entries, size))
- goto out_free;
-
- r = n;
-
-out_free:
- vfree(entries);
-out:
- return r;
-}
-
-/*
- * Translate a guest virtual address to a guest physical address.
- */
-static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
- struct kvm_translation *tr)
-{
- unsigned long vaddr = tr->linear_address;
- gpa_t gpa;
-
- vcpu_load(vcpu);
- mutex_lock(&vcpu->kvm->lock);
- gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
- tr->physical_address = gpa;
- tr->valid = gpa != UNMAPPED_GVA;
- tr->writeable = 1;
- tr->usermode = 0;
- mutex_unlock(&vcpu->kvm->lock);
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_interrupt *irq)
-{
- if (irq->irq < 0 || irq->irq >= 256)
- return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
- return -ENXIO;
- vcpu_load(vcpu);
-
- set_bit(irq->irq, vcpu->irq_pending);
- set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg)
-{
- int r;
-
- vcpu_load(vcpu);
-
- r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
-
- vcpu_put(vcpu);
-
- return r;
-}
-
-static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
-{
- struct kvm_vcpu *vcpu = vma->vm_file->private_data;
- unsigned long pgoff;
- struct page *page;
-
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (pgoff == 0)
- page = virt_to_page(vcpu->run);
- else if (pgoff == KVM_PIO_PAGE_OFFSET)
- page = virt_to_page(vcpu->pio_data);
- else
- return NOPAGE_SIGBUS;
- get_page(page);
- if (type != NULL)
- *type = VM_FAULT_MINOR;
-
- return page;
-}
-
-static struct vm_operations_struct kvm_vcpu_vm_ops = {
- .nopage = kvm_vcpu_nopage,
-};
-
-static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vcpu_vm_ops;
- return 0;
-}
-
-static int kvm_vcpu_release(struct inode *inode, struct file *filp)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
-
- fput(vcpu->kvm->filp);
- return 0;
-}
-
-static struct file_operations kvm_vcpu_fops = {
- .release = kvm_vcpu_release,
- .unlocked_ioctl = kvm_vcpu_ioctl,
- .compat_ioctl = kvm_vcpu_ioctl,
- .mmap = kvm_vcpu_mmap,
-};
-
-/*
- * Allocates an inode for the vcpu.
- */
-static int create_vcpu_fd(struct kvm_vcpu *vcpu)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
-
- r = anon_inode_getfd(&fd, &inode, &file,
- "kvm-vcpu", &kvm_vcpu_fops, vcpu);
- if (r)
- return r;
- atomic_inc(&vcpu->kvm->filp->f_count);
- return fd;
-}
-
-/*
- * Creates some virtual cpus. Good luck creating more than one.
- */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
-{
- int r;
- struct kvm_vcpu *vcpu;
-
- if (!valid_vcpu(n))
- return -EINVAL;
-
- vcpu = kvm_x86_ops->vcpu_create(kvm, n);
- if (IS_ERR(vcpu))
- return PTR_ERR(vcpu);
-
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
-
- /* We do fxsave: this must be aligned. */
- BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
-
- vcpu_load(vcpu);
- r = kvm_mmu_setup(vcpu);
- vcpu_put(vcpu);
- if (r < 0)
- goto free_vcpu;
-
- mutex_lock(&kvm->lock);
- if (kvm->vcpus[n]) {
- r = -EEXIST;
- mutex_unlock(&kvm->lock);
- goto mmu_unload;
- }
- kvm->vcpus[n] = vcpu;
- mutex_unlock(&kvm->lock);
-
- /* Now it's all set up, let userspace reach it */
- r = create_vcpu_fd(vcpu);
- if (r < 0)
- goto unlink;
- return r;
-
-unlink:
- mutex_lock(&kvm->lock);
- kvm->vcpus[n] = NULL;
- mutex_unlock(&kvm->lock);
-
-mmu_unload:
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-
-free_vcpu:
- kvm_x86_ops->vcpu_free(vcpu);
- return r;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
- u64 efer;
- int i;
- struct kvm_cpuid_entry *e, *entry;
-
- rdmsrl(MSR_EFER, efer);
- entry = NULL;
- for (i = 0; i < vcpu->cpuid_nent; ++i) {
- e = &vcpu->cpuid_entries[i];
- if (e->function == 0x80000001) {
- entry = e;
- break;
- }
- }
- if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
- entry->edx &= ~(1 << 20);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
- }
-}
-
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid *cpuid,
- struct kvm_cpuid_entry __user *entries)
-{
- int r;
-
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&vcpu->cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
- goto out;
- vcpu->cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
- return 0;
-
-out:
- return r;
-}
-
-static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
-{
- if (sigset) {
- sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
- vcpu->sigset_active = 1;
- vcpu->sigset = *sigset;
- } else
- vcpu->sigset_active = 0;
- return 0;
-}
-
-/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
- u16 cwd;
- u16 swd;
- u16 twd;
- u16 fop;
- u64 rip;
- u64 rdp;
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
- u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
- vcpu_load(vcpu);
-
- memcpy(fpu->fpr, fxsave->st_space, 128);
- fpu->fcw = fxsave->cwd;
- fpu->fsw = fxsave->swd;
- fpu->ftwx = fxsave->twd;
- fpu->last_opcode = fxsave->fop;
- fpu->last_ip = fxsave->rip;
- fpu->last_dp = fxsave->rdp;
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
- vcpu_load(vcpu);
-
- memcpy(fxsave->st_space, fpu->fpr, 128);
- fxsave->cwd = fpu->fcw;
- fxsave->swd = fpu->fsw;
- fxsave->twd = fpu->ftwx;
- fxsave->fop = fpu->last_opcode;
- fxsave->rip = fpu->last_ip;
- fxsave->rdp = fpu->last_dp;
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
-
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
-{
- vcpu_load(vcpu);
- memcpy(s->regs, vcpu->apic->regs, sizeof *s);
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
-{
- vcpu_load(vcpu);
- memcpy(vcpu->apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
- vcpu_put(vcpu);
-
- return 0;
-}
-
-static long kvm_vcpu_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r = -EINVAL;
-
- switch (ioctl) {
- case KVM_RUN:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
- break;
- case KVM_GET_REGS: {
- struct kvm_regs kvm_regs;
-
- memset(&kvm_regs, 0, sizeof kvm_regs);
- r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_REGS: {
- struct kvm_regs kvm_regs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
- goto out;
- r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- memset(&kvm_sregs, 0, sizeof kvm_sregs);
- r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
- goto out;
- r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_TRANSLATE: {
- struct kvm_translation tr;
-
- r = -EFAULT;
- if (copy_from_user(&tr, argp, sizeof tr))
- goto out;
- r = kvm_vcpu_ioctl_translate(vcpu, &tr);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &tr, sizeof tr))
- goto out;
- r = 0;
- break;
- }
- case KVM_INTERRUPT: {
- struct kvm_interrupt irq;
-
- r = -EFAULT;
- if (copy_from_user(&irq, argp, sizeof irq))
- goto out;
- r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_DEBUG_GUEST: {
- struct kvm_debug_guest dbg;
-
- r = -EFAULT;
- if (copy_from_user(&dbg, argp, sizeof dbg))
- goto out;
- r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
- break;
- case KVM_SET_MSRS:
- r = msr_io(vcpu, argp, do_set_msr, 0);
- break;
- case KVM_SET_CPUID: {
- struct kvm_cpuid __user *cpuid_arg = argp;
- struct kvm_cpuid cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- goto out;
- r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
- if (r)
- goto out;
- break;
- }
- case KVM_SET_SIGNAL_MASK: {
- struct kvm_signal_mask __user *sigmask_arg = argp;
- struct kvm_signal_mask kvm_sigmask;
- sigset_t sigset, *p;
-
- p = NULL;
- if (argp) {
- r = -EFAULT;
- if (copy_from_user(&kvm_sigmask, argp,
- sizeof kvm_sigmask))
- goto out;
- r = -EINVAL;
- if (kvm_sigmask.len != sizeof sigset)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&sigset, sigmask_arg->sigset,
- sizeof sigset))
- goto out;
- p = &sigset;
- }
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
- break;
- }
- case KVM_GET_FPU: {
- struct kvm_fpu fpu;
-
- memset(&fpu, 0, sizeof fpu);
- r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &fpu, sizeof fpu))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_FPU: {
- struct kvm_fpu fpu;
-
- r = -EFAULT;
- if (copy_from_user(&fpu, argp, sizeof fpu))
- goto out;
- r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_LAPIC: {
- struct kvm_lapic_state lapic;
-
- memset(&lapic, 0, sizeof lapic);
- r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &lapic, sizeof lapic))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_LAPIC: {
- struct kvm_lapic_state lapic;
-
- r = -EFAULT;
- if (copy_from_user(&lapic, argp, sizeof lapic))
- goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
- if (r)
- goto out;
- r = 0;
- break;
- }
- default:
- ;
- }
-out:
- return r;
-}
-
-static long kvm_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm *kvm = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r = -EINVAL;
-
- switch (ioctl) {
- case KVM_CREATE_VCPU:
- r = kvm_vm_ioctl_create_vcpu(kvm, arg);
- if (r < 0)
- goto out;
- break;
- case KVM_SET_MEMORY_REGION: {
- struct kvm_memory_region kvm_mem;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
- goto out;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
- if (r)
- goto out;
- break;
- }
- case KVM_GET_DIRTY_LOG: {
- struct kvm_dirty_log log;
-
- r = -EFAULT;
- if (copy_from_user(&log, argp, sizeof log))
- goto out;
- r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
- if (r)
- goto out;
- break;
- }
- case KVM_SET_MEMORY_ALIAS: {
- struct kvm_memory_alias alias;
-
- r = -EFAULT;
- if (copy_from_user(&alias, argp, sizeof alias))
- goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
- if (r)
- goto out;
- break;
- }
- case KVM_CREATE_IRQCHIP:
- r = -ENOMEM;
- kvm->vpic = kvm_create_pic(kvm);
- if (kvm->vpic) {
- r = kvm_ioapic_init(kvm);
- if (r) {
- kfree(kvm->vpic);
- kvm->vpic = NULL;
- goto out;
- }
- }
- else
- goto out;
- break;
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- if (irqchip_in_kernel(kvm)) {
- mutex_lock(&kvm->lock);
- if (irq_event.irq < 16)
- kvm_pic_set_irq(pic_irqchip(kvm),
- irq_event.irq,
- irq_event.level);
- kvm_ioapic_set_irq(kvm->vioapic,
- irq_event.irq,
- irq_event.level);
- mutex_unlock(&kvm->lock);
- r = 0;
- }
- break;
- }
- case KVM_GET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
-
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
- goto out;
- r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &chip, sizeof chip))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_IRQCHIP: {
- /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
-
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
- goto out;
- r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
- if (r)
- goto out;
- r = 0;
- break;
- }
- default:
- ;
- }
-out:
- return r;
-}
-
-static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
-{
- struct kvm *kvm = vma->vm_file->private_data;
- unsigned long pgoff;
- struct page *page;
-
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- page = gfn_to_page(kvm, pgoff);
- if (!page)
- return NOPAGE_SIGBUS;
- get_page(page);
- if (type != NULL)
- *type = VM_FAULT_MINOR;
-
- return page;
-}
-
-static struct vm_operations_struct kvm_vm_vm_ops = {
- .nopage = kvm_vm_nopage,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vm_vm_ops;
- return 0;
-}
-
-static struct file_operations kvm_vm_fops = {
- .release = kvm_vm_release,
- .unlocked_ioctl = kvm_vm_ioctl,
- .compat_ioctl = kvm_vm_ioctl,
- .mmap = kvm_vm_mmap,
-};
-
-static int kvm_dev_ioctl_create_vm(void)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
- struct kvm *kvm;
-
- kvm = kvm_create_vm();
- if (IS_ERR(kvm))
- return PTR_ERR(kvm);
- r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
- if (r) {
- kvm_destroy_vm(kvm);
- return r;
- }
-
- kvm->filp = file;
-
- return fd;
-}
-
-static long kvm_dev_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- long r = -EINVAL;
-
- switch (ioctl) {
- case KVM_GET_API_VERSION:
- r = -EINVAL;
- if (arg)
- goto out;
- r = KVM_API_VERSION;
- break;
- case KVM_CREATE_VM:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_dev_ioctl_create_vm();
- break;
- case KVM_GET_MSR_INDEX_LIST: {
- struct kvm_msr_list __user *user_msr_list = argp;
- struct kvm_msr_list msr_list;
- unsigned n;
-
- r = -EFAULT;
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
- goto out;
- n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
- goto out;
- r = -E2BIG;
- if (n < num_msrs_to_save)
- goto out;
- r = -EFAULT;
- if (copy_to_user(user_msr_list->indices, &msrs_to_save,
- num_msrs_to_save * sizeof(u32)))
- goto out;
- if (copy_to_user(user_msr_list->indices
- + num_msrs_to_save * sizeof(u32),
- &emulated_msrs,
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
- goto out;
- r = 0;
- break;
- }
- case KVM_CHECK_EXTENSION: {
- int ext = (long)argp;
-
- switch (ext) {
- case KVM_CAP_IRQCHIP:
- case KVM_CAP_HLT:
- r = 1;
- break;
- default:
- r = 0;
- break;
- }
- break;
- }
- case KVM_GET_VCPU_MMAP_SIZE:
- r = -EINVAL;
- if (arg)
- goto out;
- r = 2 * PAGE_SIZE;
- break;
- default:
- ;
- }
-out:
- return r;
-}
-
-static struct file_operations kvm_chardev_ops = {
- .unlocked_ioctl = kvm_dev_ioctl,
- .compat_ioctl = kvm_dev_ioctl,
-};
-
-static struct miscdevice kvm_dev = {
- KVM_MINOR,
- "kvm",
- &kvm_chardev_ops,
-};
-
-/*
- * Make sure that a cpu that is being hot-unplugged does not have any vcpus
- * cached on it.
- */
-static void decache_vcpus_on_cpu(int cpu)
-{
- struct kvm *vm;
- struct kvm_vcpu *vcpu;
- int i;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(vm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = vm->vcpus[i];
- if (!vcpu)
- continue;
- /*
- * If the vcpu is locked, then it is running on some
- * other cpu and therefore it is not cached on the
- * cpu in question.
- *
- * If it's not locked, check the last cpu it executed
- * on.
- */
- if (mutex_trylock(&vcpu->mutex)) {
- if (vcpu->cpu == cpu) {
- kvm_x86_ops->vcpu_decache(vcpu);
- vcpu->cpu = -1;
- }
- mutex_unlock(&vcpu->mutex);
- }
- }
- spin_unlock(&kvm_lock);
-}
-
-static void hardware_enable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_set(cpu, cpus_hardware_enabled);
- kvm_x86_ops->hardware_enable(NULL);
-}
-
-static void hardware_disable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (!cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_clear(cpu, cpus_hardware_enabled);
- decache_vcpus_on_cpu(cpu);
- kvm_x86_ops->hardware_disable(NULL);
-}
-
-static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- int cpu = (long)v;
-
- switch (val) {
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- hardware_disable(NULL);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
- break;
- }
- return NOTIFY_OK;
-}
-
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- if (val == SYS_RESTART) {
- /*
- * Some (well, at least mine) BIOSes hang on reboot if
- * in vmx root mode.
- */
- printk(KERN_INFO "kvm: exiting hardware virtualization\n");
- on_each_cpu(hardware_disable, NULL, 0, 1);
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kvm_reboot_notifier = {
- .notifier_call = kvm_reboot,
- .priority = 0,
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus)
-{
- memset(bus, 0, sizeof(*bus));
-}
-
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- kvm_iodevice_destructor(pos);
- }
-}
-
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- if (pos->in_range(pos, addr))
- return pos;
- }
-
- return NULL;
-}
-
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
-{
- BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
-
- bus->devs[bus->dev_count++] = dev;
-}
-
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_hotplug,
- .priority = 20, /* must be > scheduler priority */
-};
-
-static u64 stat_get(void *_offset)
-{
- unsigned offset = (long)_offset;
- u64 total = 0;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- total += *(u32 *)((void *)vcpu + offset);
- }
- spin_unlock(&kvm_lock);
- return total;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
-
-static __init void kvm_init_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- debugfs_dir = debugfs_create_dir("kvm", NULL);
- for (p = debugfs_entries; p->name; ++p)
- p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
- (void *)(long)p->offset,
- &stat_fops);
-}
-
-static void kvm_exit_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- for (p = debugfs_entries; p->name; ++p)
- debugfs_remove(p->dentry);
- debugfs_remove(debugfs_dir);
-}
-
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
-{
- hardware_disable(NULL);
- return 0;
-}
-
-static int kvm_resume(struct sys_device *dev)
-{
- hardware_enable(NULL);
- return 0;
-}
-
-static struct sysdev_class kvm_sysdev_class = {
- .name = "kvm",
- .suspend = kvm_suspend,
- .resume = kvm_resume,
-};
-
-static struct sys_device kvm_sysdev = {
- .id = 0,
- .cls = &kvm_sysdev_class,
-};
-
-hpa_t bad_page_address;
-
-static inline
-struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
-{
- return container_of(pn, struct kvm_vcpu, preempt_notifier);
-}
-
-static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_x86_ops->vcpu_load(vcpu, cpu);
-}
-
-static void kvm_sched_out(struct preempt_notifier *pn,
- struct task_struct *next)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_x86_ops->vcpu_put(vcpu);
-}
-
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
- struct module *module)
-{
- int r;
- int cpu;
-
- if (kvm_x86_ops) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
- return -EEXIST;
- }
-
- if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
- return -EOPNOTSUPP;
- }
-
- kvm_x86_ops = ops;
-
- r = kvm_x86_ops->hardware_setup();
- if (r < 0)
- goto out;
-
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu,
- kvm_x86_ops->check_processor_compatibility,
- &r, 0, 1);
- if (r < 0)
- goto out_free_0;
- }
-
- on_each_cpu(hardware_enable, NULL, 0, 1);
- r = register_cpu_notifier(&kvm_cpu_notifier);
- if (r)
- goto out_free_1;
- register_reboot_notifier(&kvm_reboot_notifier);
-
- r = sysdev_class_register(&kvm_sysdev_class);
- if (r)
- goto out_free_2;
-
- r = sysdev_register(&kvm_sysdev);
- if (r)
- goto out_free_3;
-
- /* A kmem cache lets us meet the alignment requirements of fx_save. */
- kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
- __alignof__(struct kvm_vcpu), 0, 0);
- if (!kvm_vcpu_cache) {
- r = -ENOMEM;
- goto out_free_4;
- }
-
- kvm_chardev_ops.owner = module;
-
- r = misc_register(&kvm_dev);
- if (r) {
- printk (KERN_ERR "kvm: misc device register failed\n");
- goto out_free;
- }
-
- kvm_preempt_ops.sched_in = kvm_sched_in;
- kvm_preempt_ops.sched_out = kvm_sched_out;
-
- return r;
-
-out_free:
- kmem_cache_destroy(kvm_vcpu_cache);
-out_free_4:
- sysdev_unregister(&kvm_sysdev);
-out_free_3:
- sysdev_class_unregister(&kvm_sysdev_class);
-out_free_2:
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
-out_free_1:
- on_each_cpu(hardware_disable, NULL, 0, 1);
-out_free_0:
- kvm_x86_ops->hardware_unsetup();
-out:
- kvm_x86_ops = NULL;
- return r;
-}
-
-void kvm_exit_x86(void)
-{
- misc_deregister(&kvm_dev);
- kmem_cache_destroy(kvm_vcpu_cache);
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
- on_each_cpu(hardware_disable, NULL, 0, 1);
- kvm_x86_ops->hardware_unsetup();
- kvm_x86_ops = NULL;
-}
-
-static __init int kvm_init(void)
-{
- static struct page *bad_page;
- int r;
-
- r = kvm_mmu_module_init();
- if (r)
- goto out4;
-
- kvm_init_debug();
-
- kvm_init_msr_list();
-
- if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
- memset(__va(bad_page_address), 0, PAGE_SIZE);
-
- return 0;
-
-out:
- kvm_exit_debug();
- kvm_mmu_module_exit();
-out4:
- return r;
-}
-
-static __exit void kvm_exit(void)
-{
- kvm_exit_debug();
- __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
- kvm_mmu_module_exit();
-}
-
-module_init(kvm_init)
-module_exit(kvm_exit)
-
-EXPORT_SYMBOL_GPL(kvm_init_x86);
-EXPORT_SYMBOL_GPL(kvm_exit_x86);
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
deleted file mode 100644
index a0e415daef5..00000000000
--- a/drivers/kvm/kvm_svm.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <asm/msr.h>
-
-#include "svm.h"
-#include "kvm.h"
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define NUM_DB_REGS 4
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
-
- unsigned long db_regs[NUM_DB_REGS];
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
- unsigned long host_cr2;
- unsigned long host_db_regs[NUM_DB_REGS];
- unsigned long host_dr6;
- unsigned long host_dr7;
-};
-
-#endif
-
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
deleted file mode 100644
index 238fcad3cec..00000000000
--- a/drivers/kvm/lapic.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-
-/*
- * Local APIC virtualization
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright (C) 2007 Novell
- * Copyright (C) 2007 Intel
- *
- * Authors:
- * Dor Laor <dor.laor@qumranet.com>
- * Gregory Haskins <ghaskins@novell.com>
- * Yaozu (Eddie) Dong <eddie.dong@intel.com>
- *
- * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include "kvm.h"
-#include <linux/kvm.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-#include <linux/hrtimer.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/page.h>
-#include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/atomic.h>
-#include <asm/div64.h>
-#include "irq.h"
-
-#define PRId64 "d"
-#define PRIx64 "llx"
-#define PRIu64 "u"
-#define PRIo64 "o"
-
-#define APIC_BUS_CYCLE_NS 1
-
-/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
-
-#define APIC_LVT_NUM 6
-/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
-#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK 0xc0000
-#define APIC_DEST_NOSHORT 0x0
-#define APIC_DEST_MASK 0x800
-#define MAX_APIC_VECTOR 256
-
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
- return *((u32 *) (apic->regs + reg_off));
-}
-
-static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
-{
- *((u32 *) (apic->regs + reg_off)) = val;
-}
-
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
- return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_set_vector(int vec, void *bitmap)
-{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void apic_clear_vector(int vec, void *bitmap)
-{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
- return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
-}
-
-static inline int apic_sw_enabled(struct kvm_lapic *apic)
-{
- return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
-}
-
-static inline int apic_enabled(struct kvm_lapic *apic)
-{
- return apic_sw_enabled(apic) && apic_hw_enabled(apic);
-}
-
-#define LVT_MASK \
- (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
-
-#define LINT_MASK \
- (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
- APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-
-static inline int kvm_apic_id(struct kvm_lapic *apic)
-{
- return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
-}
-
-static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
-{
- return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
-}
-
-static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
-{
- return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
-}
-
-static inline int apic_lvtt_period(struct kvm_lapic *apic)
-{
- return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
-}
-
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
- LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
- LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
- LVT_MASK | APIC_MODE_MASK, /* LVTPC */
- LINT_MASK, LINT_MASK, /* LVT0-1 */
- LVT_MASK /* LVTERR */
-};
-
-static int find_highest_vector(void *bitmap)
-{
- u32 *word = bitmap;
- int word_offset = MAX_APIC_VECTOR >> 5;
-
- while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
- continue;
-
- if (likely(!word_offset && !word[0]))
- return -1;
- else
- return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
-}
-
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
-{
- return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
-}
-
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
-{
- apic_clear_vector(vec, apic->regs + APIC_IRR);
-}
-
-static inline int apic_find_highest_irr(struct kvm_lapic *apic)
-{
- int result;
-
- result = find_highest_vector(apic->regs + APIC_IRR);
- ASSERT(result == -1 || result >= 16);
-
- return result;
-}
-
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
- int highest_irr;
-
- if (!apic)
- return 0;
- highest_irr = apic_find_highest_irr(apic);
-
- return highest_irr;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
-
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
-{
- if (!apic_test_and_set_irr(vec, apic)) {
- /* a new pending irq is set in IRR */
- if (trig)
- apic_set_vector(vec, apic->regs + APIC_TMR);
- else
- apic_clear_vector(vec, apic->regs + APIC_TMR);
- kvm_vcpu_kick(apic->vcpu);
- return 1;
- }
- return 0;
-}
-
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
- int result;
-
- result = find_highest_vector(apic->regs + APIC_ISR);
- ASSERT(result == -1 || result >= 16);
-
- return result;
-}
-
-static void apic_update_ppr(struct kvm_lapic *apic)
-{
- u32 tpr, isrv, ppr;
- int isr;
-
- tpr = apic_get_reg(apic, APIC_TASKPRI);
- isr = apic_find_highest_isr(apic);
- isrv = (isr != -1) ? isr : 0;
-
- if ((tpr & 0xf0) >= (isrv & 0xf0))
- ppr = tpr & 0xff;
- else
- ppr = isrv & 0xf0;
-
- apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
- apic, ppr, isr, isrv);
-
- apic_set_reg(apic, APIC_PROCPRI, ppr);
-}
-
-static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
-{
- apic_set_reg(apic, APIC_TASKPRI, tpr);
- apic_update_ppr(apic);
-}
-
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
-{
- return kvm_apic_id(apic) == dest;
-}
-
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
-{
- int result = 0;
- u8 logical_id;
-
- logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
-
- switch (apic_get_reg(apic, APIC_DFR)) {
- case APIC_DFR_FLAT:
- if (logical_id & mda)
- result = 1;
- break;
- case APIC_DFR_CLUSTER:
- if (((logical_id >> 4) == (mda >> 0x4))
- && (logical_id & mda & 0xf))
- result = 1;
- break;
- default:
- printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
- break;
- }
-
- return result;
-}
-
-static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, int dest, int dest_mode)
-{
- int result = 0;
- struct kvm_lapic *target = vcpu->apic;
-
- apic_debug("target %p, source %p, dest 0x%x, "
- "dest_mode 0x%x, short_hand 0x%x",
- target, source, dest, dest_mode, short_hand);
-
- ASSERT(!target);
- switch (short_hand) {
- case APIC_DEST_NOSHORT:
- if (dest_mode == 0) {
- /* Physical mode. */
- if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
- result = 1;
- } else
- /* Logical mode. */
- result = kvm_apic_match_logical_addr(target, dest);
- break;
- case APIC_DEST_SELF:
- if (target == source)
- result = 1;
- break;
- case APIC_DEST_ALLINC:
- result = 1;
- break;
- case APIC_DEST_ALLBUT:
- if (target != source)
- result = 1;
- break;
- default:
- printk(KERN_WARNING "Bad dest shorthand value %x\n",
- short_hand);
- break;
- }
-
- return result;
-}
-
-/*
- * Add a pending IRQ into lapic.
- * Return 1 if successfully added and 0 if discarded.
- */
-static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode)
-{
- int orig_irr, result = 0;
- struct kvm_vcpu *vcpu = apic->vcpu;
-
- switch (delivery_mode) {
- case APIC_DM_FIXED:
- case APIC_DM_LOWEST:
- /* FIXME add logic for vcpu on reset */
- if (unlikely(!apic_enabled(apic)))
- break;
-
- orig_irr = apic_test_and_set_irr(vector, apic);
- if (orig_irr && trig_mode) {
- apic_debug("level trig mode repeatedly for vector %d",
- vector);
- break;
- }
-
- if (trig_mode) {
- apic_debug("level trig mode for vector %d", vector);
- apic_set_vector(vector, apic->regs + APIC_TMR);
- } else
- apic_clear_vector(vector, apic->regs + APIC_TMR);
-
- if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
- kvm_vcpu_kick(vcpu);
- else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
- vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
- }
-
- result = (orig_irr == 0);
- break;
-
- case APIC_DM_REMRD:
- printk(KERN_DEBUG "Ignoring delivery mode 3\n");
- break;
-
- case APIC_DM_SMI:
- printk(KERN_DEBUG "Ignoring guest SMI\n");
- break;
- case APIC_DM_NMI:
- printk(KERN_DEBUG "Ignoring guest NMI\n");
- break;
-
- case APIC_DM_INIT:
- if (level) {
- if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
- printk(KERN_DEBUG
- "INIT on a runnable vcpu %d\n",
- vcpu->vcpu_id);
- vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
- kvm_vcpu_kick(vcpu);
- } else {
- printk(KERN_DEBUG
- "Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
- }
-
- break;
-
- case APIC_DM_STARTUP:
- printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
- if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
- vcpu->sipi_vector = vector;
- vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
- }
- break;
-
- default:
- printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
- delivery_mode);
- break;
- }
- return result;
-}
-
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
- unsigned long bitmap)
-{
- int vcpu_id;
- int last;
- int next;
- struct kvm_lapic *apic;
-
- last = kvm->round_robin_prev_vcpu;
- next = last;
-
- do {
- if (++next == KVM_MAX_VCPUS)
- next = 0;
- if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
- continue;
- apic = kvm->vcpus[next]->apic;
- if (apic && apic_enabled(apic))
- break;
- apic = NULL;
- } while (next != last);
- kvm->round_robin_prev_vcpu = next;
-
- if (!apic) {
- vcpu_id = ffs(bitmap) - 1;
- if (vcpu_id < 0) {
- vcpu_id = 0;
- printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
- }
- apic = kvm->vcpus[vcpu_id]->apic;
- }
-
- return apic;
-}
-
-static void apic_set_eoi(struct kvm_lapic *apic)
-{
- int vector = apic_find_highest_isr(apic);
-
- /*
- * Not every write EOI will has corresponding ISR,
- * one example is when Kernel check timer on setup_IO_APIC
- */
- if (vector == -1)
- return;
-
- apic_clear_vector(vector, apic->regs + APIC_ISR);
- apic_update_ppr(apic);
-
- if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
-}
-
-static void apic_send_ipi(struct kvm_lapic *apic)
-{
- u32 icr_low = apic_get_reg(apic, APIC_ICR);
- u32 icr_high = apic_get_reg(apic, APIC_ICR2);
-
- unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
- unsigned int short_hand = icr_low & APIC_SHORT_MASK;
- unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
- unsigned int level = icr_low & APIC_INT_ASSERT;
- unsigned int dest_mode = icr_low & APIC_DEST_MASK;
- unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
- unsigned int vector = icr_low & APIC_VECTOR_MASK;
-
- struct kvm_lapic *target;
- struct kvm_vcpu *vcpu;
- unsigned long lpr_map = 0;
- int i;
-
- apic_debug("icr_high 0x%x, icr_low 0x%x, "
- "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
- "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
- icr_high, icr_low, short_hand, dest,
- trig_mode, level, dest_mode, delivery_mode, vector);
-
- for (i = 0; i < KVM_MAX_VCPUS; i++) {
- vcpu = apic->vcpu->kvm->vcpus[i];
- if (!vcpu)
- continue;
-
- if (vcpu->apic &&
- apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
- if (delivery_mode == APIC_DM_LOWEST)
- set_bit(vcpu->vcpu_id, &lpr_map);
- else
- __apic_accept_irq(vcpu->apic, delivery_mode,
- vector, level, trig_mode);
- }
- }
-
- if (delivery_mode == APIC_DM_LOWEST) {
- target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
- if (target != NULL)
- __apic_accept_irq(target, delivery_mode,
- vector, level, trig_mode);
- }
-}
-
-static u32 apic_get_tmcct(struct kvm_lapic *apic)
-{
- u64 counter_passed;
- ktime_t passed, now;
- u32 tmcct;
-
- ASSERT(apic != NULL);
-
- now = apic->timer.dev.base->get_time();
- tmcct = apic_get_reg(apic, APIC_TMICT);
-
- /* if initial count is 0, current count should also be 0 */
- if (tmcct == 0)
- return 0;
-
- if (unlikely(ktime_to_ns(now) <=
- ktime_to_ns(apic->timer.last_update))) {
- /* Wrap around */
- passed = ktime_add(( {
- (ktime_t) {
- .tv64 = KTIME_MAX -
- (apic->timer.last_update).tv64}; }
- ), now);
- apic_debug("time elapsed\n");
- } else
- passed = ktime_sub(now, apic->timer.last_update);
-
- counter_passed = div64_64(ktime_to_ns(passed),
- (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
-
- if (counter_passed > tmcct) {
- if (unlikely(!apic_lvtt_period(apic))) {
- /* one-shot timers stick at 0 until reset */
- tmcct = 0;
- } else {
- /*
- * periodic timers reset to APIC_TMICT when they
- * hit 0. The while loop simulates this happening N
- * times. (counter_passed %= tmcct) would also work,
- * but might be slower or not work on 32-bit??
- */
- while (counter_passed > tmcct)
- counter_passed -= tmcct;
- tmcct -= counter_passed;
- }
- } else {
- tmcct -= counter_passed;
- }
-
- return tmcct;
-}
-
-static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
-{
- u32 val = 0;
-
- if (offset >= LAPIC_MMIO_LENGTH)
- return 0;
-
- switch (offset) {
- case APIC_ARBPRI:
- printk(KERN_WARNING "Access APIC ARBPRI register "
- "which is for P6\n");
- break;
-
- case APIC_TMCCT: /* Timer CCR */
- val = apic_get_tmcct(apic);
- break;
-
- default:
- apic_update_ppr(apic);
- val = apic_get_reg(apic, offset);
- break;
- }
-
- return val;
-}
-
-static void apic_mmio_read(struct kvm_io_device *this,
- gpa_t address, int len, void *data)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
- unsigned char alignment = offset & 0xf;
- u32 result;
-
- if ((alignment + len) > 4) {
- printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
- (unsigned long)address, len);
- return;
- }
- result = __apic_read(apic, offset & ~0xf);
-
- switch (len) {
- case 1:
- case 2:
- case 4:
- memcpy(data, (char *)&result + alignment, len);
- break;
- default:
- printk(KERN_ERR "Local APIC read with len = %x, "
- "should be 1,2, or 4 instead\n", len);
- break;
- }
-}
-
-static void update_divide_count(struct kvm_lapic *apic)
-{
- u32 tmp1, tmp2, tdcr;
-
- tdcr = apic_get_reg(apic, APIC_TDCR);
- tmp1 = tdcr & 0xf;
- tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
- apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
-
- apic_debug("timer divide count is 0x%x\n",
- apic->timer.divide_count);
-}
-
-static void start_apic_timer(struct kvm_lapic *apic)
-{
- ktime_t now = apic->timer.dev.base->get_time();
-
- apic->timer.last_update = now;
-
- apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
- APIC_BUS_CYCLE_NS * apic->timer.divide_count;
- atomic_set(&apic->timer.pending, 0);
- hrtimer_start(&apic->timer.dev,
- ktime_add_ns(now, apic->timer.period),
- HRTIMER_MODE_ABS);
-
- apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
- PRIx64 ", "
- "timer initial count 0x%x, period %lldns, "
- "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
- APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- apic_get_reg(apic, APIC_TMICT),
- apic->timer.period,
- ktime_to_ns(ktime_add_ns(now,
- apic->timer.period)));
-}
-
-static void apic_mmio_write(struct kvm_io_device *this,
- gpa_t address, int len, const void *data)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
- unsigned char alignment = offset & 0xf;
- u32 val;
-
- /*
- * APIC register must be aligned on 128-bits boundary.
- * 32/64/128 bits registers must be accessed thru 32 bits.
- * Refer SDM 8.4.1
- */
- if (len != 4 || alignment) {
- if (printk_ratelimit())
- printk(KERN_ERR "apic write: bad size=%d %lx\n",
- len, (long)address);
- return;
- }
-
- val = *(u32 *) data;
-
- /* too common printing */
- if (offset != APIC_EOI)
- apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __FUNCTION__, offset, len, val);
-
- offset &= 0xff0;
-
- switch (offset) {
- case APIC_ID: /* Local APIC ID */
- apic_set_reg(apic, APIC_ID, val);
- break;
-
- case APIC_TASKPRI:
- apic_set_tpr(apic, val & 0xff);
- break;
-
- case APIC_EOI:
- apic_set_eoi(apic);
- break;
-
- case APIC_LDR:
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
- break;
-
- case APIC_DFR:
- apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
- break;
-
- case APIC_SPIV:
- apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
- if (!(val & APIC_SPIV_APIC_ENABLED)) {
- int i;
- u32 lvt_val;
-
- for (i = 0; i < APIC_LVT_NUM; i++) {
- lvt_val = apic_get_reg(apic,
- APIC_LVTT + 0x10 * i);
- apic_set_reg(apic, APIC_LVTT + 0x10 * i,
- lvt_val | APIC_LVT_MASKED);
- }
- atomic_set(&apic->timer.pending, 0);
-
- }
- break;
-
- case APIC_ICR:
- /* No delay here, so we always clear the pending bit */
- apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
- apic_send_ipi(apic);
- break;
-
- case APIC_ICR2:
- apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
- break;
-
- case APIC_LVTT:
- case APIC_LVTTHMR:
- case APIC_LVTPC:
- case APIC_LVT0:
- case APIC_LVT1:
- case APIC_LVTERR:
- /* TODO: Check vector */
- if (!apic_sw_enabled(apic))
- val |= APIC_LVT_MASKED;
-
- val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
- apic_set_reg(apic, offset, val);
-
- break;
-
- case APIC_TMICT:
- hrtimer_cancel(&apic->timer.dev);
- apic_set_reg(apic, APIC_TMICT, val);
- start_apic_timer(apic);
- return;
-
- case APIC_TDCR:
- if (val & 4)
- printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
- apic_set_reg(apic, APIC_TDCR, val);
- update_divide_count(apic);
- break;
-
- default:
- apic_debug("Local APIC Write to read-only register %x\n",
- offset);
- break;
- }
-
-}
-
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- int ret = 0;
-
-
- if (apic_hw_enabled(apic) &&
- (addr >= apic->base_address) &&
- (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
- ret = 1;
-
- return ret;
-}
-
-void kvm_free_apic(struct kvm_lapic *apic)
-{
- if (!apic)
- return;
-
- hrtimer_cancel(&apic->timer.dev);
-
- if (apic->regs_page) {
- __free_page(apic->regs_page);
- apic->regs_page = 0;
- }
-
- kfree(apic);
-}
-
-/*
- *----------------------------------------------------------------------
- * LAPIC interface
- *----------------------------------------------------------------------
- */
-
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
-
- if (!apic)
- return;
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
-}
-
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
- u64 tpr;
-
- if (!apic)
- return 0;
- tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
-
- return (tpr & 0xf0) >> 4;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
-
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
-
- if (!apic) {
- value |= MSR_IA32_APICBASE_BSP;
- vcpu->apic_base = value;
- return;
- }
- if (apic->vcpu->vcpu_id)
- value &= ~MSR_IA32_APICBASE_BSP;
-
- vcpu->apic_base = value;
- apic->base_address = apic->vcpu->apic_base &
- MSR_IA32_APICBASE_BASE;
-
- /* with FSB delivery interrupt, we can restart APIC functionality */
- apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
- "0x%lx.\n", apic->apic_base, apic->base_address);
-
-}
-
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
-{
- return vcpu->apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic;
- int i;
-
- apic_debug("%s\n", __FUNCTION__);
-
- ASSERT(vcpu);
- apic = vcpu->apic;
- ASSERT(apic != NULL);
-
- /* Stop the timer in case it's a reset to an active apic */
- hrtimer_cancel(&apic->timer.dev);
-
- apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
-
- for (i = 0; i < APIC_LVT_NUM; i++)
- apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
- apic_set_reg(apic, APIC_LVT0,
- SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
-
- apic_set_reg(apic, APIC_DFR, 0xffffffffU);
- apic_set_reg(apic, APIC_SPIV, 0xff);
- apic_set_reg(apic, APIC_TASKPRI, 0);
- apic_set_reg(apic, APIC_LDR, 0);
- apic_set_reg(apic, APIC_ESR, 0);
- apic_set_reg(apic, APIC_ICR, 0);
- apic_set_reg(apic, APIC_ICR2, 0);
- apic_set_reg(apic, APIC_TDCR, 0);
- apic_set_reg(apic, APIC_TMICT, 0);
- for (i = 0; i < 8; i++) {
- apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
- }
- update_divide_count(apic);
- atomic_set(&apic->timer.pending, 0);
- if (vcpu->vcpu_id == 0)
- vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
- apic_update_ppr(apic);
-
- apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
- "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
- vcpu, kvm_apic_id(apic),
- vcpu->apic_base, apic->base_address);
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
- int ret = 0;
-
- if (!apic)
- return 0;
- ret = apic_enabled(apic);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
-
-/*
- *----------------------------------------------------------------------
- * timer interface
- *----------------------------------------------------------------------
- */
-
-/* TODO: make sure __apic_timer_fn runs in current pCPU */
-static int __apic_timer_fn(struct kvm_lapic *apic)
-{
- int result = 0;
- wait_queue_head_t *q = &apic->vcpu->wq;
-
- atomic_inc(&apic->timer.pending);
- if (waitqueue_active(q))
- {
- apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
- wake_up_interruptible(q);
- }
- if (apic_lvtt_period(apic)) {
- result = 1;
- apic->timer.dev.expires = ktime_add_ns(
- apic->timer.dev.expires,
- apic->timer.period);
- }
- return result;
-}
-
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
-{
- int vector;
-
- vector = apic_lvt_vector(apic, APIC_LVTT);
- return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
-}
-
-static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
-{
- struct kvm_lapic *apic;
- int restart_timer = 0;
-
- apic = container_of(data, struct kvm_lapic, timer.dev);
-
- restart_timer = __apic_timer_fn(apic);
-
- if (restart_timer)
- return HRTIMER_RESTART;
- else
- return HRTIMER_NORESTART;
-}
-
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic;
-
- ASSERT(vcpu != NULL);
- apic_debug("apic_init %d\n", vcpu->vcpu_id);
-
- apic = kzalloc(sizeof(*apic), GFP_KERNEL);
- if (!apic)
- goto nomem;
-
- vcpu->apic = apic;
-
- apic->regs_page = alloc_page(GFP_KERNEL);
- if (apic->regs_page == NULL) {
- printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
- vcpu->vcpu_id);
- goto nomem;
- }
- apic->regs = page_address(apic->regs_page);
- memset(apic->regs, 0, PAGE_SIZE);
- apic->vcpu = vcpu;
-
- hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- apic->timer.dev.function = apic_timer_fn;
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
-
- kvm_lapic_reset(vcpu);
- apic->dev.read = apic_mmio_read;
- apic->dev.write = apic_mmio_write;
- apic->dev.in_range = apic_mmio_range;
- apic->dev.private = apic;
-
- return 0;
-nomem:
- kvm_free_apic(apic);
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
-
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->apic;
- int highest_irr;
-
- if (!apic || !apic_enabled(apic))
- return -1;
-
- apic_update_ppr(apic);
- highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
- return -1;
- return highest_irr;
-}
-
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
-{
- u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
- int r = 0;
-
- if (vcpu->vcpu_id == 0) {
- if (!apic_hw_enabled(vcpu->apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- }
- return r;
-}
-
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->apic;
-
- if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
- atomic_read(&apic->timer.pending) > 0) {
- if (__inject_apic_timer_irq(apic))
- atomic_dec(&apic->timer.pending);
- }
-}
-
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
- struct kvm_lapic *apic = vcpu->apic;
-
- if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
- apic->timer.last_update = ktime_add_ns(
- apic->timer.last_update,
- apic->timer.period);
-}
-
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
-{
- int vector = kvm_apic_has_interrupt(vcpu);
- struct kvm_lapic *apic = vcpu->apic;
-
- if (vector == -1)
- return -1;
-
- apic_set_vector(vector, apic->regs + APIC_ISR);
- apic_update_ppr(apic);
- apic_clear_irr(vector, apic);
- return vector;
-}
-
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->apic;
-
- apic->base_address = vcpu->apic_base &
- MSR_IA32_APICBASE_BASE;
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
- apic_update_ppr(apic);
- hrtimer_cancel(&apic->timer.dev);
- update_divide_count(apic);
- start_apic_timer(apic);
-}
-
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->apic;
- struct hrtimer *timer;
-
- if (!apic)
- return;
-
- timer = &apic->timer.dev;
- if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
-}
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index feb5ac986c5..00000000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "vmx.h"
-#include "kvm.h"
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-
-#undef MMU_DEBUG
-
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#if defined(MMU_DEBUG) || defined(AUDIT)
-static int dbg = 1;
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
-#endif
-
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
-#define PT_WRITABLE_SHIFT 1
-
-#define PT_PRESENT_MASK (1ULL << 0)
-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
-#define PT_PWT_MASK (1ULL << 3)
-#define PT_PCD_MASK (1ULL << 4)
-#define PT_ACCESSED_MASK (1ULL << 5)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
-#define PT_PAT_MASK (1ULL << 7)
-#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_MASK (1ULL << 63)
-
-#define PT_PAT_SHIFT 7
-#define PT_DIR_PAT_SHIFT 12
-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
-
-#define PT64_LEVEL_MASK(level) \
- (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
-#define PT64_INDEX(address, level)\
- (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
- ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
-
-#define PT32_LEVEL_MASK(level) \
- (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
-
-#define PT32_INDEX(address, level)\
- (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-
-
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT64_ROOT_LEVEL 4
-#define PT32_ROOT_LEVEL 2
-#define PT32E_ROOT_LEVEL 3
-
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
-#define RMAP_EXT 4
-
-struct kvm_rmap_desc {
- u64 *shadow_ptes[RMAP_EXT];
- struct kvm_rmap_desc *more;
-};
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-
-static int is_write_protection(struct kvm_vcpu *vcpu)
-{
- return vcpu->cr0 & X86_CR0_WP;
-}
-
-static int is_cpuid_PSE36(void)
-{
- return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->shadow_efer & EFER_NX;
-}
-
-static int is_present_pte(unsigned long pte)
-{
- return pte & PT_PRESENT_MASK;
-}
-
-static int is_writeble_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-static int is_io_pte(unsigned long pte)
-{
- return pte & PT_SHADOW_IO_MARK;
-}
-
-static int is_rmap_pte(u64 pte)
-{
- return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
- == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
-}
-
-static void set_shadow_pte(u64 *sptep, u64 spte)
-{
-#ifdef CONFIG_X86_64
- set_64bit((unsigned long *)sptep, spte);
-#else
- set_64bit((unsigned long long *)sptep, spte);
-#endif
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
-{
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
-{
- struct page *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- set_page_private(page, 0);
- cache->objects[cache->nobjs++] = page_address(page);
- }
- return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
- int r;
-
- kvm_mmu_free_some_pages(vcpu);
- r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
- pte_chain_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
- rmap_desc_cache, 1);
- if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
- mmu_page_header_cache, 4);
-out:
- return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
- mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
- mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
- size_t size)
-{
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- memset(p, 0, size);
- return p;
-}
-
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
- sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
- kfree(pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
- sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
- kfree(rd);
-}
-
-/*
- * Reverse mapping data structures:
- *
- * If page->private bit zero is zero, then page->private points to the
- * shadow page table entry that points to page_address(page).
- *
- * If page->private bit zero is one, (then page->private & ~1) points
- * to a struct kvm_rmap_desc containing more mappings.
- */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
-{
- struct page *page;
- struct kvm_rmap_desc *desc;
- int i;
-
- if (!is_rmap_pte(*spte))
- return;
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
- rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- set_page_private(page,(unsigned long)spte);
- } else if (!(page_private(page) & 1)) {
- rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
- desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)page_private(page);
- desc->shadow_ptes[1] = spte;
- set_page_private(page,(unsigned long)desc | 1);
- } else {
- rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
- desc = desc->more;
- if (desc->shadow_ptes[RMAP_EXT-1]) {
- desc->more = mmu_alloc_rmap_desc(vcpu);
- desc = desc->more;
- }
- for (i = 0; desc->shadow_ptes[i]; ++i)
- ;
- desc->shadow_ptes[i] = spte;
- }
-}
-
-static void rmap_desc_remove_entry(struct page *page,
- struct kvm_rmap_desc *desc,
- int i,
- struct kvm_rmap_desc *prev_desc)
-{
- int j;
-
- for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
- ;
- desc->shadow_ptes[i] = desc->shadow_ptes[j];
- desc->shadow_ptes[j] = NULL;
- if (j != 0)
- return;
- if (!prev_desc && !desc->more)
- set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
- else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- set_page_private(page,(unsigned long)desc->more | 1);
- mmu_free_rmap_desc(desc);
-}
-
-static void rmap_remove(u64 *spte)
-{
- struct page *page;
- struct kvm_rmap_desc *desc;
- struct kvm_rmap_desc *prev_desc;
- int i;
-
- if (!is_rmap_pte(*spte))
- return;
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- if (!page_private(page)) {
- printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
- BUG();
- } else if (!(page_private(page) & 1)) {
- rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
- if ((u64 *)page_private(page) != spte) {
- printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
- spte, *spte);
- BUG();
- }
- set_page_private(page,0);
- } else {
- rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- prev_desc = NULL;
- while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
- if (desc->shadow_ptes[i] == spte) {
- rmap_desc_remove_entry(page,
- desc, i,
- prev_desc);
- return;
- }
- prev_desc = desc;
- desc = desc->more;
- }
- BUG();
- }
-}
-
-static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
-{
- struct kvm *kvm = vcpu->kvm;
- struct page *page;
- struct kvm_rmap_desc *desc;
- u64 *spte;
-
- page = gfn_to_page(kvm, gfn);
- BUG_ON(!page);
-
- while (page_private(page)) {
- if (!(page_private(page) & 1))
- spte = (u64 *)page_private(page);
- else {
- desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
- spte = desc->shadow_ptes[0];
- }
- BUG_ON(!spte);
- BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
- != page_to_pfn(page));
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON(!(*spte & PT_WRITABLE_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- rmap_remove(spte);
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
- u64 *pos;
- u64 *end;
-
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if (*pos != 0) {
- printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
- pos, *pos);
- return 0;
- }
- return 1;
-}
-#endif
-
-static void kvm_mmu_free_page(struct kvm *kvm,
- struct kvm_mmu_page *page_head)
-{
- ASSERT(is_empty_shadow_page(page_head->spt));
- list_del(&page_head->link);
- __free_page(virt_to_page(page_head->spt));
- kfree(page_head);
- ++kvm->n_free_mmu_pages;
-}
-
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
- return gfn;
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
-{
- struct kvm_mmu_page *page;
-
- if (!vcpu->kvm->n_free_mmu_pages)
- return NULL;
-
- page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
- sizeof *page);
- page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
- set_page_private(virt_to_page(page->spt), (unsigned long)page);
- list_add(&page->link, &vcpu->kvm->active_mmu_pages);
- ASSERT(is_empty_shadow_page(page->spt));
- page->slot_bitmap = 0;
- page->multimapped = 0;
- page->parent_pte = parent_pte;
- --vcpu->kvm->n_free_mmu_pages;
- return page;
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page, u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!parent_pte)
- return;
- if (!page->multimapped) {
- u64 *old = page->parent_pte;
-
- if (!old) {
- page->parent_pte = parent_pte;
- return;
- }
- page->multimapped = 1;
- pte_chain = mmu_alloc_pte_chain(vcpu);
- INIT_HLIST_HEAD(&page->parent_ptes);
- hlist_add_head(&pte_chain->link, &page->parent_ptes);
- pte_chain->parent_ptes[0] = old;
- }
- hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
- if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
- continue;
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
- if (!pte_chain->parent_ptes[i]) {
- pte_chain->parent_ptes[i] = parent_pte;
- return;
- }
- }
- pte_chain = mmu_alloc_pte_chain(vcpu);
- BUG_ON(!pte_chain);
- hlist_add_head(&pte_chain->link, &page->parent_ptes);
- pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
- u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!page->multimapped) {
- BUG_ON(page->parent_pte != parent_pte);
- page->parent_pte = NULL;
- return;
- }
- hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- if (pte_chain->parent_ptes[i] != parent_pte)
- continue;
- while (i + 1 < NR_PTE_CHAIN_ENTRIES
- && pte_chain->parent_ptes[i + 1]) {
- pte_chain->parent_ptes[i]
- = pte_chain->parent_ptes[i + 1];
- ++i;
- }
- pte_chain->parent_ptes[i] = NULL;
- if (i == 0) {
- hlist_del(&pte_chain->link);
- mmu_free_pte_chain(pte_chain);
- if (hlist_empty(&page->parent_ptes)) {
- page->multimapped = 0;
- page->parent_pte = NULL;
- }
- }
- return;
- }
- BUG();
-}
-
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
- gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *page;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry(page, node, bucket, hash_link)
- if (page->gfn == gfn && !page->role.metaphysical) {
- pgprintk("%s: found role %x\n",
- __FUNCTION__, page->role.word);
- return page;
- }
- return NULL;
-}
-
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- gva_t gaddr,
- unsigned level,
- int metaphysical,
- unsigned hugepage_access,
- u64 *parent_pte)
-{
- union kvm_mmu_page_role role;
- unsigned index;
- unsigned quadrant;
- struct hlist_head *bucket;
- struct kvm_mmu_page *page;
- struct hlist_node *node;
-
- role.word = 0;
- role.glevels = vcpu->mmu.root_level;
- role.level = level;
- role.metaphysical = metaphysical;
- role.hugepage_access = hugepage_access;
- if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
- quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
- quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
- role.quadrant = quadrant;
- }
- pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
- gfn, role.word);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry(page, node, bucket, hash_link)
- if (page->gfn == gfn && page->role.word == role.word) {
- mmu_page_add_parent_pte(vcpu, page, parent_pte);
- pgprintk("%s: found\n", __FUNCTION__);
- return page;
- }
- page = kvm_mmu_alloc_page(vcpu, parent_pte);
- if (!page)
- return page;
- pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
- page->gfn = gfn;
- page->role = role;
- hlist_add_head(&page->hash_link, bucket);
- if (!metaphysical)
- rmap_write_protect(vcpu, gfn);
- return page;
-}
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *page)
-{
- unsigned i;
- u64 *pt;
- u64 ent;
-
- pt = page->spt;
-
- if (page->role.level == PT_PAGE_TABLE_LEVEL) {
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] & PT_PRESENT_MASK)
- rmap_remove(&pt[i]);
- pt[i] = 0;
- }
- kvm_flush_remote_tlbs(kvm);
- return;
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- ent = pt[i];
-
- pt[i] = 0;
- if (!(ent & PT_PRESENT_MASK))
- continue;
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
- }
- kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *page,
- u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(page, parent_pte);
-}
-
-static void kvm_mmu_zap_page(struct kvm *kvm,
- struct kvm_mmu_page *page)
-{
- u64 *parent_pte;
-
- while (page->multimapped || page->parent_pte) {
- if (!page->multimapped)
- parent_pte = page->parent_pte;
- else {
- struct kvm_pte_chain *chain;
-
- chain = container_of(page->parent_ptes.first,
- struct kvm_pte_chain, link);
- parent_pte = chain->parent_ptes[0];
- }
- BUG_ON(!parent_pte);
- kvm_mmu_put_page(page, parent_pte);
- set_shadow_pte(parent_pte, 0);
- }
- kvm_mmu_page_unlink_children(kvm, page);
- if (!page->root_count) {
- hlist_del(&page->hash_link);
- kvm_mmu_free_page(kvm, page);
- } else
- list_move(&page->link, &kvm->active_mmu_pages);
-}
-
-static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *page;
- struct hlist_node *node, *n;
- int r;
-
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
- r = 0;
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
- if (page->gfn == gfn && !page->role.metaphysical) {
- pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
- page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
- r = 1;
- }
- return r;
-}
-
-static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mmu_page *page;
-
- while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
- pgprintk("%s: zap %lx %x\n",
- __FUNCTION__, gfn, page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
- }
-}
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
-{
- int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
- struct kvm_mmu_page *page_head = page_header(__pa(pte));
-
- __set_bit(slot, &page_head->slot_bitmap);
-}
-
-hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
- return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
-}
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- struct page *page;
-
- ASSERT((gpa & HPA_ERR_MASK) == 0);
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!page)
- return gpa | HPA_ERR_MASK;
- return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
- | (gpa & (PAGE_SIZE-1));
-}
-
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
- if (gpa == UNMAPPED_GVA)
- return UNMAPPED_GVA;
- return gpa_to_hpa(vcpu, gpa);
-}
-
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
- if (gpa == UNMAPPED_GVA)
- return NULL;
- return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
-{
- int level = PT32E_ROOT_LEVEL;
- hpa_t table_addr = vcpu->mmu.root_hpa;
-
- for (; ; level--) {
- u32 index = PT64_INDEX(v, level);
- u64 *table;
- u64 pte;
-
- ASSERT(VALID_PAGE(table_addr));
- table = __va(table_addr);
-
- if (level == 1) {
- pte = table[index];
- if (is_present_pte(pte) && is_writeble_pte(pte))
- return 0;
- mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
- page_header_update_slot(vcpu->kvm, table, v);
- table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- PT_USER_MASK;
- rmap_add(vcpu, &table[index]);
- return 0;
- }
-
- if (table[index] == 0) {
- struct kvm_mmu_page *new_table;
- gfn_t pseudo_gfn;
-
- pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
- v, level - 1,
- 1, 0, &table[index]);
- if (!new_table) {
- pgprintk("nonpaging_map: ENOMEM\n");
- return -ENOMEM;
- }
-
- table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- }
- table_addr = table[index] & PT64_BASE_ADDR_MASK;
- }
-}
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *page;
-
- if (!VALID_PAGE(vcpu->mmu.root_hpa))
- return;
-#ifdef CONFIG_X86_64
- if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->mmu.root_hpa;
-
- page = page_header(root);
- --page->root_count;
- vcpu->mmu.root_hpa = INVALID_PAGE;
- return;
- }
-#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- page = page_header(root);
- --page->root_count;
- }
- vcpu->mmu.pae_root[i] = INVALID_PAGE;
- }
- vcpu->mmu.root_hpa = INVALID_PAGE;
-}
-
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- gfn_t root_gfn;
- struct kvm_mmu_page *page;
-
- root_gfn = vcpu->cr3 >> PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
- if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->mmu.root_hpa;
-
- ASSERT(!VALID_PAGE(root));
- page = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, 0, 0, NULL);
- root = __pa(page->spt);
- ++page->root_count;
- vcpu->mmu.root_hpa = root;
- return;
- }
-#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->mmu.pae_root[i];
-
- ASSERT(!VALID_PAGE(root));
- if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
- if (!is_present_pte(vcpu->pdptrs[i])) {
- vcpu->mmu.pae_root[i] = 0;
- continue;
- }
- root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
- } else if (vcpu->mmu.root_level == 0)
- root_gfn = 0;
- page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, !is_paging(vcpu),
- 0, NULL);
- root = __pa(page->spt);
- ++page->root_count;
- vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
- }
- vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
- return vaddr;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
-{
- gpa_t addr = gva;
- hpa_t paddr;
- int r;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
-
-
- paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
-
- if (is_error_hpa(paddr))
- return 1;
-
- return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->mmu;
-
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = nonpaging_page_fault;
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->free = nonpaging_free;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
- pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
- mmu_free_roots(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- u64 addr,
- u32 err_code)
-{
- kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
- nonpaging_free(vcpu);
-}
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-{
- struct kvm_mmu *context = &vcpu->mmu;
-
- ASSERT(is_pae(vcpu));
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging64_page_fault;
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->free = paging_free;
- context->root_level = level;
- context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu)
-{
- return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->mmu;
-
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging32_page_fault;
- context->gva_to_gpa = paging32_gva_to_gpa;
- context->free = paging_free;
- context->root_level = PT32_ROOT_LEVEL;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
-{
- return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
- if (!is_paging(vcpu))
- return nonpaging_init_context(vcpu);
- else if (is_long_mode(vcpu))
- return paging64_init_context(vcpu);
- else if (is_pae(vcpu))
- return paging32E_init_context(vcpu);
- else
- return paging32_init_context(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- if (VALID_PAGE(vcpu->mmu.root_hpa)) {
- vcpu->mmu.free(vcpu);
- vcpu->mmu.root_hpa = INVALID_PAGE;
- }
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
- destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
- int r;
-
- mutex_lock(&vcpu->kvm->lock);
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
- mmu_alloc_roots(vcpu);
- kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
-out:
- mutex_unlock(&vcpu->kvm->lock);
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page,
- u64 *spte)
-{
- u64 pte;
- struct kvm_mmu_page *child;
-
- pte = *spte;
- if (is_present_pte(pte)) {
- if (page->role.level == PT_PAGE_TABLE_LEVEL)
- rmap_remove(spte);
- else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, spte);
- }
- }
- set_shadow_pte(spte, 0);
- kvm_flush_remote_tlbs(vcpu->kvm);
-}
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page,
- u64 *spte,
- const void *new, int bytes)
-{
- if (page->role.level != PT_PAGE_TABLE_LEVEL)
- return;
-
- if (page->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, page, spte, new, bytes);
- else
- paging64_update_pte(vcpu, page, spte, new, bytes);
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
-{
- gfn_t gfn = gpa >> PAGE_SHIFT;
- struct kvm_mmu_page *page;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
- u64 *spte;
- unsigned offset = offset_in_page(gpa);
- unsigned pte_size;
- unsigned page_offset;
- unsigned misaligned;
- unsigned quadrant;
- int level;
- int flooded = 0;
- int npte;
-
- pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
- if (gfn == vcpu->last_pt_write_gfn) {
- ++vcpu->last_pt_write_count;
- if (vcpu->last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->last_pt_write_gfn = gfn;
- vcpu->last_pt_write_count = 1;
- }
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
- bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
- if (page->gfn != gfn || page->role.metaphysical)
- continue;
- pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
- misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
- misaligned |= bytes < 4;
- if (misaligned || flooded) {
- /*
- * Misaligned accesses are too much trouble to fix
- * up; also, they usually indicate a page is not used
- * as a page table.
- *
- * If we're seeing too many writes to a page,
- * it may no longer be a page table, or we may be
- * forking, in which case it is better to unmap the
- * page.
- */
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, page->role.word);
- kvm_mmu_zap_page(vcpu->kvm, page);
- continue;
- }
- page_offset = offset;
- level = page->role.level;
- npte = 1;
- if (page->role.glevels == PT32_ROOT_LEVEL) {
- page_offset <<= 1; /* 32->64 */
- /*
- * A 32-bit pde maps 4MB while the shadow pdes map
- * only 2MB. So we need to double the offset again
- * and zap two pdes instead of one.
- */
- if (level == PT32_ROOT_LEVEL) {
- page_offset &= ~7; /* kill rounding error */
- page_offset <<= 1;
- npte = 2;
- }
- quadrant = page_offset >> PAGE_SHIFT;
- page_offset &= ~PAGE_MASK;
- if (quadrant != page->role.quadrant)
- continue;
- }
- spte = &page->spt[page_offset / sizeof(*spte)];
- while (npte--) {
- mmu_pte_write_zap_pte(vcpu, page, spte);
- mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
- ++spte;
- }
- }
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
- return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
-}
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
- struct kvm_mmu_page *page;
-
- page = container_of(vcpu->kvm->active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, page);
- }
-}
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *page;
-
- while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
- page = container_of(vcpu->kvm->active_mmu_pages.next,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, page);
- }
- free_page((unsigned long)vcpu->mmu.pae_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int i;
-
- ASSERT(vcpu);
-
- vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
-
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- goto error_1;
- vcpu->mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->mmu.pae_root[i] = INVALID_PAGE;
-
- return 0;
-
-error_1:
- free_mmu_pages(vcpu);
- return -ENOMEM;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
- return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
- return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
-
- destroy_kvm_mmu(vcpu);
- free_mmu_pages(vcpu);
- mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
- struct kvm_mmu_page *page;
-
- list_for_each_entry(page, &kvm->active_mmu_pages, link) {
- int i;
- u64 *pt;
-
- if (!test_bit(slot, &page->slot_bitmap))
- continue;
-
- pt = page->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- /* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK) {
- rmap_remove(&pt[i]);
- pt[i] &= ~PT_WRITABLE_MASK;
- }
- }
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
- struct kvm_mmu_page *page, *node;
-
- list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
- kvm_mmu_zap_page(kvm, page);
-
- kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_module_exit(void)
-{
- if (pte_chain_cache)
- kmem_cache_destroy(pte_chain_cache);
- if (rmap_desc_cache)
- kmem_cache_destroy(rmap_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
-}
-
-int kvm_mmu_module_init(void)
-{
- pte_chain_cache = kmem_cache_create("kvm_pte_chain",
- sizeof(struct kvm_pte_chain),
- 0, 0, NULL);
- if (!pte_chain_cache)
- goto nomem;
- rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
- sizeof(struct kvm_rmap_desc),
- 0, 0, NULL);
- if (!rmap_desc_cache)
- goto nomem;
-
- mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
- sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
- if (!mmu_page_header_cache)
- goto nomem;
-
- return 0;
-
-nomem:
- kvm_mmu_module_exit();
- return -ENOMEM;
-}
-
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
- gva = (long long)(gva << 16) >> 16;
-#endif
- return gva;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 ent = pt[i];
-
- if (!(ent & PT_PRESENT_MASK))
- continue;
-
- va = canonicalize(va);
- if (level > 1)
- audit_mappings_page(vcpu, ent, va, level - 1);
- else {
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
- hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
- if ((ent & PT_PRESENT_MASK)
- && (ent & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "audit error: (%s) levels %d"
- " gva %lx gpa %llx hpa %llx ent %llx\n",
- audit_msg, vcpu->mmu.root_level,
- va, gpa, hpa, ent);
- }
- }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->mmu.pae_root[i],
- i << 30,
- 2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
- int nmaps = 0;
- int i, j, k;
-
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
- struct kvm_rmap_desc *d;
-
- for (j = 0; j < m->npages; ++j) {
- struct page *page = m->phys_mem[j];
-
- if (!page->private)
- continue;
- if (!(page->private & 1)) {
- ++nmaps;
- continue;
- }
- d = (struct kvm_rmap_desc *)(page->private & ~1ul);
- while (d) {
- for (k = 0; k < RMAP_EXT; ++k)
- if (d->shadow_ptes[k])
- ++nmaps;
- else
- break;
- d = d->more;
- }
- }
- }
- return nmaps;
-}
-
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
-{
- int nmaps = 0;
- struct kvm_mmu_page *page;
- int i;
-
- list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
- u64 *pt = page->spt;
-
- if (page->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = pt[i];
-
- if (!(ent & PT_PRESENT_MASK))
- continue;
- if (!(ent & PT_WRITABLE_MASK))
- continue;
- ++nmaps;
- }
- }
- return nmaps;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
- int n_rmap = count_rmaps(vcpu);
- int n_actual = count_writable_mappings(vcpu);
-
- if (n_rmap != n_actual)
- printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __FUNCTION__, audit_msg, n_rmap, n_actual);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *page;
-
- list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
- hfn_t hfn;
- struct page *pg;
-
- if (page->role.metaphysical)
- continue;
-
- hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
- >> PAGE_SHIFT;
- pg = pfn_to_page(hfn);
- if (pg->private)
- printk(KERN_ERR "%s: (%s) shadow page has writable"
- " mappings: gfn %lx role %x\n",
- __FUNCTION__, audit_msg, page->gfn,
- page->role.word);
- }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
- int olddbg = dbg;
-
- dbg = 0;
- audit_msg = msg;
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
- audit_mappings(vcpu);
- dbg = olddbg;
-}
-
-#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 6b094b44f8f..00000000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
- #define pt_element_t u64
- #define guest_walker guest_walker64
- #define FNAME(name) paging##64_##name
- #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
- #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
- #ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
- #else
- #define PT_MAX_FULL_LEVELS 2
- #endif
-#elif PTTYPE == 32
- #define pt_element_t u32
- #define guest_walker guest_walker32
- #define FNAME(name) paging##32_##name
- #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
- #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
- #define PT_MAX_FULL_LEVELS 2
-#else
- #error Invalid PTTYPE value
-#endif
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
- int level;
- gfn_t table_gfn[PT_MAX_FULL_LEVELS];
- pt_element_t *table;
- pt_element_t pte;
- pt_element_t *ptep;
- struct page *page;
- int index;
- pt_element_t inherited_ar;
- gfn_t gfn;
- u32 error_code;
-};
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- int write_fault, int user_fault, int fetch_fault)
-{
- hpa_t hpa;
- struct kvm_memory_slot *slot;
- pt_element_t *ptep;
- pt_element_t root;
- gfn_t table_gfn;
-
- pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
- walker->level = vcpu->mmu.root_level;
- walker->table = NULL;
- walker->page = NULL;
- walker->ptep = NULL;
- root = vcpu->cr3;
-#if PTTYPE == 64
- if (!is_long_mode(vcpu)) {
- walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
- root = *walker->ptep;
- walker->pte = root;
- if (!(root & PT_PRESENT_MASK))
- goto not_present;
- --walker->level;
- }
-#endif
- table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- walker->table_gfn[walker->level - 1] = table_gfn;
- pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
- walker->level - 1, table_gfn);
- slot = gfn_to_memslot(vcpu->kvm, table_gfn);
- hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
- walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
- walker->table = kmap_atomic(walker->page, KM_USER0);
-
- ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-
- walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
-
- for (;;) {
- int index = PT_INDEX(addr, walker->level);
- hpa_t paddr;
-
- ptep = &walker->table[index];
- walker->index = index;
- ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
- ((unsigned long)ptep & PAGE_MASK));
-
- if (!is_present_pte(*ptep))
- goto not_present;
-
- if (write_fault && !is_writeble_pte(*ptep))
- if (user_fault || is_write_protection(vcpu))
- goto access_error;
-
- if (user_fault && !(*ptep & PT_USER_MASK))
- goto access_error;
-
-#if PTTYPE == 64
- if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
- goto access_error;
-#endif
-
- if (!(*ptep & PT_ACCESSED_MASK)) {
- mark_page_dirty(vcpu->kvm, table_gfn);
- *ptep |= PT_ACCESSED_MASK;
- }
-
- if (walker->level == PT_PAGE_TABLE_LEVEL) {
- walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- break;
- }
-
- if (walker->level == PT_DIRECTORY_LEVEL
- && (*ptep & PT_PAGE_SIZE_MASK)
- && (PTTYPE == 64 || is_pse(vcpu))) {
- walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
- break;
- }
-
- walker->inherited_ar &= walker->table[index];
- table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
- kunmap_atomic(walker->table, KM_USER0);
- paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
- walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
- walker->table = kmap_atomic(walker->page, KM_USER0);
- --walker->level;
- walker->table_gfn[walker->level - 1 ] = table_gfn;
- pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
- walker->level - 1, table_gfn);
- }
- walker->pte = *ptep;
- if (walker->page)
- walker->ptep = NULL;
- if (walker->table)
- kunmap_atomic(walker->table, KM_USER0);
- pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
- return 1;
-
-not_present:
- walker->error_code = 0;
- goto err;
-
-access_error:
- walker->error_code = PFERR_PRESENT_MASK;
-
-err:
- if (write_fault)
- walker->error_code |= PFERR_WRITE_MASK;
- if (user_fault)
- walker->error_code |= PFERR_USER_MASK;
- if (fetch_fault)
- walker->error_code |= PFERR_FETCH_MASK;
- if (walker->table)
- kunmap_atomic(walker->table, KM_USER0);
- return 0;
-}
-
-static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
- struct guest_walker *walker)
-{
- mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
-}
-
-static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
- u64 *shadow_pte,
- gpa_t gaddr,
- pt_element_t gpte,
- u64 access_bits,
- int user_fault,
- int write_fault,
- int *ptwrite,
- struct guest_walker *walker,
- gfn_t gfn)
-{
- hpa_t paddr;
- int dirty = gpte & PT_DIRTY_MASK;
- u64 spte = *shadow_pte;
- int was_rmapped = is_rmap_pte(spte);
-
- pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
- " user_fault %d gfn %lx\n",
- __FUNCTION__, spte, (u64)gpte, access_bits,
- write_fault, user_fault, gfn);
-
- if (write_fault && !dirty) {
- pt_element_t *guest_ent, *tmp = NULL;
-
- if (walker->ptep)
- guest_ent = walker->ptep;
- else {
- tmp = kmap_atomic(walker->page, KM_USER0);
- guest_ent = &tmp[walker->index];
- }
-
- *guest_ent |= PT_DIRTY_MASK;
- if (!walker->ptep)
- kunmap_atomic(tmp, KM_USER0);
- dirty = 1;
- FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
- }
-
- spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
- spte |= gpte & PT64_NX_MASK;
- if (!dirty)
- access_bits &= ~PT_WRITABLE_MASK;
-
- paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
- spte |= PT_PRESENT_MASK;
- if (access_bits & PT_USER_MASK)
- spte |= PT_USER_MASK;
-
- if (is_error_hpa(paddr)) {
- spte |= gaddr;
- spte |= PT_SHADOW_IO_MARK;
- spte &= ~PT_PRESENT_MASK;
- set_shadow_pte(shadow_pte, spte);
- return;
- }
-
- spte |= paddr;
-
- if ((access_bits & PT_WRITABLE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- struct kvm_mmu_page *shadow;
-
- spte |= PT_WRITABLE_MASK;
- if (user_fault) {
- mmu_unshadow(vcpu, gfn);
- goto unshadowed;
- }
-
- shadow = kvm_mmu_lookup_page(vcpu, gfn);
- if (shadow) {
- pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, gfn);
- access_bits &= ~PT_WRITABLE_MASK;
- if (is_writeble_pte(spte)) {
- spte &= ~PT_WRITABLE_MASK;
- kvm_x86_ops->tlb_flush(vcpu);
- }
- if (write_fault)
- *ptwrite = 1;
- }
- }
-
-unshadowed:
-
- if (access_bits & PT_WRITABLE_MASK)
- mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
-
- set_shadow_pte(shadow_pte, spte);
- page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
- if (!was_rmapped)
- rmap_add(vcpu, shadow_pte);
-}
-
-static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
- u64 *shadow_pte, u64 access_bits,
- int user_fault, int write_fault, int *ptwrite,
- struct guest_walker *walker, gfn_t gfn)
-{
- access_bits &= gpte;
- FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
- gpte, access_bits, user_fault, write_fault,
- ptwrite, walker, gfn);
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
- u64 *spte, const void *pte, int bytes)
-{
- pt_element_t gpte;
-
- if (bytes < sizeof(pt_element_t))
- return;
- gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
- return;
- pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
- FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
- 0, NULL, NULL,
- (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
-}
-
-static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
- u64 *shadow_pte, u64 access_bits,
- int user_fault, int write_fault, int *ptwrite,
- struct guest_walker *walker, gfn_t gfn)
-{
- gpa_t gaddr;
-
- access_bits &= gpde;
- gaddr = (gpa_t)gfn << PAGE_SHIFT;
- if (PTTYPE == 32 && is_cpuid_PSE36())
- gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
- (32 - PT32_DIR_PSE36_SHIFT);
- FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
- gpde, access_bits, user_fault, write_fault,
- ptwrite, walker, gfn);
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *walker,
- int user_fault, int write_fault, int *ptwrite)
-{
- hpa_t shadow_addr;
- int level;
- u64 *shadow_ent;
- u64 *prev_shadow_ent = NULL;
-
- if (!is_present_pte(walker->pte))
- return NULL;
-
- shadow_addr = vcpu->mmu.root_hpa;
- level = vcpu->mmu.shadow_root_level;
- if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
- shadow_addr &= PT64_BASE_ADDR_MASK;
- --level;
- }
-
- for (; ; level--) {
- u32 index = SHADOW_PT_INDEX(addr, level);
- struct kvm_mmu_page *shadow_page;
- u64 shadow_pte;
- int metaphysical;
- gfn_t table_gfn;
- unsigned hugepage_access = 0;
-
- shadow_ent = ((u64 *)__va(shadow_addr)) + index;
- if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
- if (level == PT_PAGE_TABLE_LEVEL)
- break;
- shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
- prev_shadow_ent = shadow_ent;
- continue;
- }
-
- if (level == PT_PAGE_TABLE_LEVEL)
- break;
-
- if (level - 1 == PT_PAGE_TABLE_LEVEL
- && walker->level == PT_DIRECTORY_LEVEL) {
- metaphysical = 1;
- hugepage_access = walker->pte;
- hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
- if (walker->pte & PT64_NX_MASK)
- hugepage_access |= (1 << 2);
- hugepage_access >>= PT_WRITABLE_SHIFT;
- table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- } else {
- metaphysical = 0;
- table_gfn = walker->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- metaphysical, hugepage_access,
- shadow_ent);
- shadow_addr = __pa(shadow_page->spt);
- shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *shadow_ent = shadow_pte;
- prev_shadow_ent = shadow_ent;
- }
-
- if (walker->level == PT_DIRECTORY_LEVEL) {
- FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
- walker->inherited_ar, user_fault, write_fault,
- ptwrite, walker, walker->gfn);
- } else {
- ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
- FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
- walker->inherited_ar, user_fault, write_fault,
- ptwrite, walker, walker->gfn);
- }
- return shadow_ent;
-}
-
-/*
- * Page fault handler. There are several causes for a page fault:
- * - there is no shadow pte for the guest pte
- * - write access through a shadow pte marked read only so that we can set
- * the dirty bit
- * - write access to a shadow pte marked read only so we can update the page
- * dirty bitmap, when userspace requests it
- * - mmio access; in this case we will never install a present shadow pte
- * - normal guest page fault due to the guest pte marked not present, not
- * writable, or not executable
- *
- * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- * a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
- u32 error_code)
-{
- int write_fault = error_code & PFERR_WRITE_MASK;
- int user_fault = error_code & PFERR_USER_MASK;
- int fetch_fault = error_code & PFERR_FETCH_MASK;
- struct guest_walker walker;
- u64 *shadow_pte;
- int write_pt = 0;
- int r;
-
- pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
- kvm_mmu_audit(vcpu, "pre page fault");
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- /*
- * Look up the shadow pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
- fetch_fault);
-
- /*
- * The page is not mapped by the guest. Let the guest handle it.
- */
- if (!r) {
- pgprintk("%s: guest page fault\n", __FUNCTION__);
- inject_page_fault(vcpu, addr, walker.error_code);
- vcpu->last_pt_write_count = 0; /* reset fork detector */
- return 0;
- }
-
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- &write_pt);
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
- shadow_pte, *shadow_pte, write_pt);
-
- if (!write_pt)
- vcpu->last_pt_write_count = 0; /* reset fork detector */
-
- /*
- * mmio: emulate if accessible, otherwise its a guest fault.
- */
- if (is_io_pte(*shadow_pte))
- return 1;
-
- ++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
- return write_pt;
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-
- if (r) {
- gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
- gpa |= vaddr & ~PAGE_MASK;
- }
-
- return gpa;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef SHADOW_PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
-#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
deleted file mode 100644
index 71fdf458619..00000000000
--- a/drivers/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,17 +0,0 @@
-struct segment_descriptor {
- u16 limit_low;
- u16 base_low;
- u8 base_mid;
- u8 type : 4;
- u8 system : 1;
- u8 dpl : 2;
- u8 present : 1;
- u8 limit_high : 4;
- u8 avl : 1;
- u8 long_mode : 1;
- u8 default_op : 1;
- u8 granularity : 1;
- u8 base_high;
-} __attribute__((packed));
-
-
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
deleted file mode 100644
index 4e04e49a2f1..00000000000
--- a/drivers/kvm/svm.c
+++ /dev/null
@@ -1,1754 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm_svm.h"
-#include "x86_emulate.h"
-#include "irq.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
-#define DR7_GD_MASK (1 << 13)
-#define DR6_BD_MASK (1 << 13)
-
-#define SEG_TYPE_LDT 2
-#define SEG_TYPE_BUSY_TSS16 3
-
-#define KVM_EFER_LMA (1 << 10)
-#define KVM_EFER_LME (1 << 8)
-
-#define SVM_FEATURE_NPT (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
-
-static void kvm_reput_irq(struct vcpu_svm *svm);
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-unsigned long iopm_base;
-unsigned long msrpm_base;
-
-struct kvm_ldttss_desc {
- u16 limit0;
- u16 base0;
- unsigned base1 : 8, type : 5, dpl : 2, p : 1;
- unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
- u32 base3;
- u32 zero1;
-} __attribute__((packed));
-
-struct svm_cpu_data {
- int cpu;
-
- u64 asid_generation;
- u32 max_asid;
- u32 next_asid;
- struct kvm_ldttss_desc *tss_desc;
-
- struct page *save_area;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
-
-struct svm_init_data {
- int cpu;
- int r;
-};
-
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-#define MAX_INST_SIZE 15
-
-static inline u32 svm_has(u32 feat)
-{
- return svm_features & feat;
-}
-
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
- return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
- set_bit(irq, vcpu->irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
-}
-
-static inline void clgi(void)
-{
- asm volatile (SVM_CLGI);
-}
-
-static inline void stgi(void)
-{
- asm volatile (SVM_STGI);
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
- asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
-}
-
-static inline unsigned long kvm_read_cr2(void)
-{
- unsigned long cr2;
-
- asm volatile ("mov %%cr2, %0" : "=r" (cr2));
- return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
- asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
-static inline unsigned long read_dr6(void)
-{
- unsigned long dr6;
-
- asm volatile ("mov %%dr6, %0" : "=r" (dr6));
- return dr6;
-}
-
-static inline void write_dr6(unsigned long val)
-{
- asm volatile ("mov %0, %%dr6" :: "r" (val));
-}
-
-static inline unsigned long read_dr7(void)
-{
- unsigned long dr7;
-
- asm volatile ("mov %%dr7, %0" : "=r" (dr7));
- return dr7;
-}
-
-static inline void write_dr7(unsigned long val)
-{
- asm volatile ("mov %0, %%dr7" :: "r" (val));
-}
-
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
- to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- if (!(efer & KVM_EFER_LMA))
- efer &= ~KVM_EFER_LME;
-
- to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
- vcpu->shadow_efer = efer;
-}
-
-static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_VALID_ERR |
- SVM_EVTINJ_TYPE_EXEPT |
- GP_VECTOR;
- svm->vmcb->control.event_inj_err = error_code;
-}
-
-static void inject_ud(struct kvm_vcpu *vcpu)
-{
- to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_TYPE_EXEPT |
- UD_VECTOR;
-}
-
-static int is_page_fault(uint32_t info)
-{
- info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
-}
-
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!svm->next_rip) {
- printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
- return;
- }
- if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
- printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
- __FUNCTION__,
- svm->vmcb->save.rip,
- svm->next_rip);
- }
-
- vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
- svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-
- vcpu->interrupt_window_open = 1;
-}
-
-static int has_svm(void)
-{
- uint32_t eax, ebx, ecx, edx;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- printk(KERN_INFO "has_svm: not amd\n");
- return 0;
- }
-
- cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- if (eax < SVM_CPUID_FUNC) {
- printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
- return 0;
- }
-
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
- printk(KERN_DEBUG "has_svm: svm not available\n");
- return 0;
- }
- return 1;
-}
-
-static void svm_hardware_disable(void *garbage)
-{
- struct svm_cpu_data *svm_data
- = per_cpu(svm_data, raw_smp_processor_id());
-
- if (svm_data) {
- uint64_t efer;
-
- wrmsrl(MSR_VM_HSAVE_PA, 0);
- rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
- per_cpu(svm_data, raw_smp_processor_id()) = NULL;
- __free_page(svm_data->save_area);
- kfree(svm_data);
- }
-}
-
-static void svm_hardware_enable(void *garbage)
-{
-
- struct svm_cpu_data *svm_data;
- uint64_t efer;
-#ifdef CONFIG_X86_64
- struct desc_ptr gdt_descr;
-#else
- struct Xgt_desc_struct gdt_descr;
-#endif
- struct desc_struct *gdt;
- int me = raw_smp_processor_id();
-
- if (!has_svm()) {
- printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
- return;
- }
- svm_data = per_cpu(svm_data, me);
-
- if (!svm_data) {
- printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
- me);
- return;
- }
-
- svm_data->asid_generation = 1;
- svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
- svm_data->next_asid = svm_data->max_asid + 1;
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
- asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
- gdt = (struct desc_struct *)gdt_descr.address;
- svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
-
- rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
-
- wrmsrl(MSR_VM_HSAVE_PA,
- page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
-}
-
-static int svm_cpu_init(int cpu)
-{
- struct svm_cpu_data *svm_data;
- int r;
-
- svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
- if (!svm_data)
- return -ENOMEM;
- svm_data->cpu = cpu;
- svm_data->save_area = alloc_page(GFP_KERNEL);
- r = -ENOMEM;
- if (!svm_data->save_area)
- goto err_1;
-
- per_cpu(svm_data, cpu) = svm_data;
-
- return 0;
-
-err_1:
- kfree(svm_data);
- return r;
-
-}
-
-static void set_msr_interception(u32 *msrpm, unsigned msr,
- int read, int write)
-{
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr >= msrpm_ranges[i] &&
- msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
- u32 msr_offset = (i * MSRS_IN_RANGE + msr -
- msrpm_ranges[i]) * 2;
-
- u32 *base = msrpm + (msr_offset / 32);
- u32 msr_shift = msr_offset % 32;
- u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
- *base = (*base & ~(0x3 << msr_shift)) |
- (mask << msr_shift);
- return;
- }
- }
- BUG();
-}
-
-static __init int svm_hardware_setup(void)
-{
- int cpu;
- struct page *iopm_pages;
- struct page *msrpm_pages;
- void *iopm_va, *msrpm_va;
- int r;
-
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
- clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
-
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
-
- r = -ENOMEM;
- if (!msrpm_pages)
- goto err_1;
-
- msrpm_va = page_address(msrpm_pages);
- memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
- msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
- set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
-#endif
- set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
-
- for_each_online_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err_2;
- }
- return 0;
-
-err_2:
- __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
- msrpm_base = 0;
-err_1:
- __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
- iopm_base = 0;
- return r;
-}
-
-static __exit void svm_hardware_unsetup(void)
-{
- __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
- iopm_base = msrpm_base = 0;
-}
-
-static void init_seg(struct vmcb_seg *seg)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
- SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | type;
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static void init_vmcb(struct vmcb *vmcb)
-{
- struct vmcb_control_area *control = &vmcb->control;
- struct vmcb_save_area *save = &vmcb->save;
-
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
-
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
-
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK;
-
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_exceptions = 1 << PF_VECTOR;
-
-
- control->intercept = (1ULL << INTERCEPT_INTR) |
- (1ULL << INTERCEPT_NMI) |
- (1ULL << INTERCEPT_SMI) |
- /*
- * selective cr0 intercept bug?
- * 0: 0f 22 d8 mov %eax,%cr3
- * 3: 0f 20 c0 mov %cr0,%eax
- * 6: 0d 00 00 00 80 or $0x80000000,%eax
- * b: 0f 22 c0 mov %eax,%cr0
- * set cr3 ->interception
- * get cr0 ->interception
- * set cr0 -> no interception
- */
- /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
- (1ULL << INTERCEPT_CPUID) |
- (1ULL << INTERCEPT_INVD) |
- (1ULL << INTERCEPT_HLT) |
- (1ULL << INTERCEPT_INVLPGA) |
- (1ULL << INTERCEPT_IOIO_PROT) |
- (1ULL << INTERCEPT_MSR_PROT) |
- (1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_SHUTDOWN) |
- (1ULL << INTERCEPT_VMRUN) |
- (1ULL << INTERCEPT_VMMCALL) |
- (1ULL << INTERCEPT_VMLOAD) |
- (1ULL << INTERCEPT_VMSAVE) |
- (1ULL << INTERCEPT_STGI) |
- (1ULL << INTERCEPT_CLGI) |
- (1ULL << INTERCEPT_SKINIT) |
- (1ULL << INTERCEPT_WBINVD) |
- (1ULL << INTERCEPT_MONITOR) |
- (1ULL << INTERCEPT_MWAIT);
-
- control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = msrpm_base;
- control->tsc_offset = 0;
- control->int_ctl = V_INTR_MASKING_MASK;
-
- init_seg(&save->es);
- init_seg(&save->ss);
- init_seg(&save->ds);
- init_seg(&save->fs);
- init_seg(&save->gs);
-
- save->cs.selector = 0xf000;
- /* Executable/Readable Code Segment */
- save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
- SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
- save->cs.limit = 0xffff;
- /*
- * cs.base should really be 0xffff0000, but vmx can't handle that, so
- * be consistent with it.
- *
- * Replace when we have real mode working for vmx.
- */
- save->cs.base = 0xf0000;
-
- save->gdtr.limit = 0xffff;
- save->idtr.limit = 0xffff;
-
- init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
- init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
-
- save->efer = MSR_EFER_SVME_MASK;
-
- save->dr6 = 0xffff0ff0;
- save->dr7 = 0x400;
- save->rflags = 2;
- save->rip = 0x0000fff0;
-
- /*
- * cr0 val on cpu init should be 0x60000010, we enable cpu
- * cache by default. the orderly way is to enable cache in bios.
- */
- save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
- save->cr4 = X86_CR4_PAE;
- /* rdx = ?? */
-}
-
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- init_vmcb(svm->vmcb);
-
- if (vcpu->vcpu_id != 0) {
- svm->vmcb->save.rip = 0;
- svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
- svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
- }
-}
-
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
-{
- struct vcpu_svm *svm;
- struct page *page;
- int err;
-
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- if (!svm) {
- err = -ENOMEM;
- goto out;
- }
-
- err = kvm_vcpu_init(&svm->vcpu, kvm, id);
- if (err)
- goto free_svm;
-
- if (irqchip_in_kernel(kvm)) {
- err = kvm_create_lapic(&svm->vcpu);
- if (err < 0)
- goto free_svm;
- }
-
- page = alloc_page(GFP_KERNEL);
- if (!page) {
- err = -ENOMEM;
- goto uninit;
- }
-
- svm->vmcb = page_address(page);
- clear_page(svm->vmcb);
- svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
- svm->asid_generation = 0;
- memset(svm->db_regs, 0, sizeof(svm->db_regs));
- init_vmcb(svm->vmcb);
-
- fx_init(&svm->vcpu);
- svm->vcpu.fpu_active = 1;
- svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (svm->vcpu.vcpu_id == 0)
- svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
-
- return &svm->vcpu;
-
-uninit:
- kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
- kmem_cache_free(kvm_vcpu_cache, svm);
-out:
- return ERR_PTR(err);
-}
-
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, svm);
-}
-
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- if (unlikely(cpu != vcpu->cpu)) {
- u64 tsc_this, delta;
-
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- rdtscll(tsc_this);
- delta = vcpu->host_tsc - tsc_this;
- svm->vmcb->control.tsc_offset += delta;
- vcpu->cpu = cpu;
- kvm_migrate_apic_timer(vcpu);
- }
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-}
-
-static void svm_vcpu_put(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- rdtscll(vcpu->host_tsc);
- kvm_put_guest_fpu(vcpu);
-}
-
-static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->rip;
-}
-
-static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
-{
- return to_svm(vcpu)->vmcb->save.rflags;
-}
-
-static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- to_svm(vcpu)->vmcb->save.rflags = rflags;
-}
-
-static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
- switch (seg) {
- case VCPU_SREG_CS: return &save->cs;
- case VCPU_SREG_DS: return &save->ds;
- case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
- case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
- }
- BUG();
- return NULL;
-}
-
-static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- return s->base;
-}
-
-static void svm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- var->base = s->base;
- var->limit = s->limit;
- var->selector = s->selector;
- var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
- var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
- var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
- var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
- var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
- var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
- var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
- var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
- var->unusable = !var->present;
-}
-
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->limit = svm->vmcb->save.idtr.limit;
- dt->base = svm->vmcb->save.idtr.base;
-}
-
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.idtr.limit = dt->limit;
- svm->vmcb->save.idtr.base = dt->base ;
-}
-
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->limit = svm->vmcb->save.gdtr.limit;
- dt->base = svm->vmcb->save.gdtr.base;
-}
-
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.gdtr.limit = dt->limit;
- svm->vmcb->save.gdtr.base = dt->base ;
-}
-
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
-#ifdef CONFIG_X86_64
- if (vcpu->shadow_efer & KVM_EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->shadow_efer |= KVM_EFER_LMA;
- svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
- }
-
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
- vcpu->shadow_efer &= ~KVM_EFER_LMA;
- svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
- }
- }
-#endif
- if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- vcpu->fpu_active = 1;
- }
-
- vcpu->cr0 = cr0;
- cr0 |= X86_CR0_PG | X86_CR0_WP;
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
-}
-
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- vcpu->cr4 = cr4;
- to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
-}
-
-static void svm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- s->base = var->base;
- s->limit = var->limit;
- s->selector = var->selector;
- if (var->unusable)
- s->attrib = 0;
- else {
- s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
- s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
- s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
- s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
- s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
- s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
- s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
- s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
- }
- if (seg == VCPU_SREG_CS)
- svm->vmcb->save.cpl
- = (svm->vmcb->save.cs.attrib
- >> SVM_SELECTOR_DPL_SHIFT) & 3;
-
-}
-
-/* FIXME:
-
- svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
- svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
-
-*/
-
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
-{
- return -EOPNOTSUPP;
-}
-
-static int svm_get_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 exit_int_info = svm->vmcb->control.exit_int_info;
-
- if (is_external_interrupt(exit_int_info))
- return exit_int_info & SVM_EVTINJ_VEC_MASK;
- return -1;
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
-{
- if (svm_data->next_asid > svm_data->max_asid) {
- ++svm_data->asid_generation;
- svm_data->next_asid = 1;
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
- }
-
- svm->vcpu.cpu = svm_data->cpu;
- svm->asid_generation = svm_data->asid_generation;
- svm->vmcb->control.asid = svm_data->next_asid++;
-}
-
-static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
-{
- return to_svm(vcpu)->db_regs[dr];
-}
-
-static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- *exception = 0;
-
- if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
- svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
- svm->vmcb->save.dr6 |= DR6_BD_MASK;
- *exception = DB_VECTOR;
- return;
- }
-
- switch (dr) {
- case 0 ... 3:
- svm->db_regs[dr] = value;
- return;
- case 4 ... 5:
- if (vcpu->cr4 & X86_CR4_DE) {
- *exception = UD_VECTOR;
- return;
- }
- case 7: {
- if (value & ~((1ULL << 32) - 1)) {
- *exception = GP_VECTOR;
- return;
- }
- svm->vmcb->save.dr7 = value;
- return;
- }
- default:
- printk(KERN_DEBUG "%s: unexpected dr %u\n",
- __FUNCTION__, dr);
- *exception = UD_VECTOR;
- return;
- }
-}
-
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u32 exit_int_info = svm->vmcb->control.exit_int_info;
- struct kvm *kvm = svm->vcpu.kvm;
- u64 fault_address;
- u32 error_code;
- enum emulation_result er;
- int r;
-
- if (!irqchip_in_kernel(kvm) &&
- is_external_interrupt(exit_int_info))
- push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
-
- mutex_lock(&kvm->lock);
-
- fault_address = svm->vmcb->control.exit_info_2;
- error_code = svm->vmcb->control.exit_info_1;
- r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
- if (r < 0) {
- mutex_unlock(&kvm->lock);
- return r;
- }
- if (!r) {
- mutex_unlock(&kvm->lock);
- return 1;
- }
- er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
- error_code);
- mutex_unlock(&kvm->lock);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_DO_MMIO:
- ++svm->vcpu.stat.mmio_exits;
- return 0;
- case EMULATE_FAIL:
- kvm_report_emulation_failure(&svm->vcpu, "pagetable");
- break;
- default:
- BUG();
- }
-
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- return 0;
-}
-
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- if (!(svm->vcpu.cr0 & X86_CR0_TS))
- svm->vmcb->save.cr0 &= ~X86_CR0_TS;
- svm->vcpu.fpu_active = 1;
-
- return 1;
-}
-
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- /*
- * VMCB is undefined after a SHUTDOWN intercept
- * so reinitialize it.
- */
- clear_page(svm->vmcb);
- init_vmcb(svm->vmcb);
-
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
- return 0;
-}
-
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
- int size, down, in, string, rep;
- unsigned port;
-
- ++svm->vcpu.stat.io_exits;
-
- svm->next_rip = svm->vmcb->control.exit_info_2;
-
- string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
- if (string) {
- if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
-
- in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- port = io_info >> 16;
- size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
- rep = (io_info & SVM_IOIO_REP_MASK) != 0;
- down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
-
- return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
-}
-
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- return 1;
-}
-
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->next_rip = svm->vmcb->save.rip + 1;
- skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->next_rip = svm->vmcb->save.rip + 3;
- skip_emulated_instruction(&svm->vcpu);
- return kvm_hypercall(&svm->vcpu, kvm_run);
-}
-
-static int invalid_op_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- inject_ud(&svm->vcpu);
- return 1;
-}
-
-static int task_switch_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- return 0;
-}
-
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->next_rip = svm->vmcb->save.rip + 2;
- kvm_emulate_cpuid(&svm->vcpu);
- return 1;
-}
-
-static int emulate_on_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
- return 1;
-}
-
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
- u64 tsc;
-
- rdtscll(tsc);
- *data = svm->vmcb->control.tsc_offset + tsc;
- break;
- }
- case MSR_K6_STAR:
- *data = svm->vmcb->save.star;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- *data = svm->vmcb->save.lstar;
- break;
- case MSR_CSTAR:
- *data = svm->vmcb->save.cstar;
- break;
- case MSR_KERNEL_GS_BASE:
- *data = svm->vmcb->save.kernel_gs_base;
- break;
- case MSR_SYSCALL_MASK:
- *data = svm->vmcb->save.sfmask;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
- break;
- case MSR_IA32_SYSENTER_EIP:
- *data = svm->vmcb->save.sysenter_eip;
- break;
- case MSR_IA32_SYSENTER_ESP:
- *data = svm->vmcb->save.sysenter_esp;
- break;
- default:
- return kvm_get_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
- u64 data;
-
- if (svm_get_msr(&svm->vcpu, ecx, &data))
- svm_inject_gp(&svm->vcpu, 0);
- else {
- svm->vmcb->save.rax = data & 0xffffffff;
- svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
- svm->next_rip = svm->vmcb->save.rip + 2;
- skip_emulated_instruction(&svm->vcpu);
- }
- return 1;
-}
-
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
- u64 tsc;
-
- rdtscll(tsc);
- svm->vmcb->control.tsc_offset = data - tsc;
- break;
- }
- case MSR_K6_STAR:
- svm->vmcb->save.star = data;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- svm->vmcb->save.lstar = data;
- break;
- case MSR_CSTAR:
- svm->vmcb->save.cstar = data;
- break;
- case MSR_KERNEL_GS_BASE:
- svm->vmcb->save.kernel_gs_base = data;
- break;
- case MSR_SYSCALL_MASK:
- svm->vmcb->save.sfmask = data;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
- break;
- case MSR_IA32_SYSENTER_EIP:
- svm->vmcb->save.sysenter_eip = data;
- break;
- case MSR_IA32_SYSENTER_ESP:
- svm->vmcb->save.sysenter_esp = data;
- break;
- default:
- return kvm_set_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
- u64 data = (svm->vmcb->save.rax & -1u)
- | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
- svm->next_rip = svm->vmcb->save.rip + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data))
- svm_inject_gp(&svm->vcpu, 0);
- else
- skip_emulated_instruction(&svm->vcpu);
- return 1;
-}
-
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm, kvm_run);
- else
- return rdmsr_interception(svm, kvm_run);
-}
-
-static int interrupt_window_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (kvm_run->request_interrupt_window &&
- !svm->vcpu.irq_summary) {
- ++svm->vcpu.stat.irq_window_exits;
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
-
- return 1;
-}
-
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
- struct kvm_run *kvm_run) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- /* for now: */
- [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
- [SVM_EXIT_READ_DR1] = emulate_on_interception,
- [SVM_EXIT_READ_DR2] = emulate_on_interception,
- [SVM_EXIT_READ_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
- [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
- [SVM_EXIT_INTR] = nop_on_interception,
- [SVM_EXIT_NMI] = nop_on_interception,
- [SVM_EXIT_SMI] = nop_on_interception,
- [SVM_EXIT_INIT] = nop_on_interception,
- [SVM_EXIT_VINTR] = interrupt_window_interception,
- /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
- [SVM_EXIT_CPUID] = cpuid_interception,
- [SVM_EXIT_INVD] = emulate_on_interception,
- [SVM_EXIT_HLT] = halt_interception,
- [SVM_EXIT_INVLPG] = emulate_on_interception,
- [SVM_EXIT_INVLPGA] = invalid_op_interception,
- [SVM_EXIT_IOIO] = io_interception,
- [SVM_EXIT_MSR] = msr_interception,
- [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
- [SVM_EXIT_SHUTDOWN] = shutdown_interception,
- [SVM_EXIT_VMRUN] = invalid_op_interception,
- [SVM_EXIT_VMMCALL] = vmmcall_interception,
- [SVM_EXIT_VMLOAD] = invalid_op_interception,
- [SVM_EXIT_VMSAVE] = invalid_op_interception,
- [SVM_EXIT_STGI] = invalid_op_interception,
- [SVM_EXIT_CLGI] = invalid_op_interception,
- [SVM_EXIT_SKINIT] = invalid_op_interception,
- [SVM_EXIT_WBINVD] = emulate_on_interception,
- [SVM_EXIT_MONITOR] = invalid_op_interception,
- [SVM_EXIT_MWAIT] = invalid_op_interception,
-};
-
-
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 exit_code = svm->vmcb->control.exit_code;
-
- kvm_reput_irq(svm);
-
- if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason
- = svm->vmcb->control.exit_code;
- return 0;
- }
-
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
- printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
- "exit_code 0x%x\n",
- __FUNCTION__, svm->vmcb->control.exit_int_info,
- exit_code);
-
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || svm_exit_handlers[exit_code] == 0) {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_code;
- return 0;
- }
-
- return svm_exit_handlers[exit_code](svm, kvm_run);
-}
-
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
- svm_data->tss_desc->type = 9; //available 32/64-bit TSS
- load_TR_desc();
-}
-
-static void pre_svm_run(struct vcpu_svm *svm)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
- if (svm->vcpu.cpu != cpu ||
- svm->asid_generation != svm_data->asid_generation)
- new_asid(svm, svm_data);
-}
-
-
-static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
-{
- struct vmcb_control_area *control;
-
- control = &svm->vmcb->control;
- control->int_vector = irq;
- control->int_ctl &= ~V_INTR_PRIO_MASK;
- control->int_ctl |= V_IRQ_MASK |
- ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-}
-
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm_inject_irq(svm, irq);
-}
-
-static void svm_intr_assist(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- int intr_vector = -1;
-
- kvm_inject_pending_timer_irqs(vcpu);
- if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
- ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
- intr_vector = vmcb->control.exit_int_info &
- SVM_EVTINJ_VEC_MASK;
- vmcb->control.exit_int_info = 0;
- svm_inject_irq(svm, intr_vector);
- return;
- }
-
- if (vmcb->control.int_ctl & V_IRQ_MASK)
- return;
-
- if (!kvm_cpu_has_interrupt(vcpu))
- return;
-
- if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
- (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
- /* unable to deliver irq, set pending irq */
- vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
- svm_inject_irq(svm, 0x0);
- return;
- }
- /* Okay, we can deliver the interrupt: grab it and update PIC state. */
- intr_vector = kvm_cpu_get_interrupt(vcpu);
- svm_inject_irq(svm, intr_vector);
- kvm_timer_intr_post(vcpu, intr_vector);
-}
-
-static void kvm_reput_irq(struct vcpu_svm *svm)
-{
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- if ((control->int_ctl & V_IRQ_MASK)
- && !irqchip_in_kernel(svm->vcpu.kvm)) {
- control->int_ctl &= ~V_IRQ_MASK;
- push_irq(&svm->vcpu, control->int_vector);
- }
-
- svm->vcpu.interrupt_window_open =
- !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
-}
-
-static void svm_do_inject_vector(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
- svm_inject_irq(svm, irq);
-}
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
-
- svm->vcpu.interrupt_window_open =
- (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vmcb->save.rflags & X86_EFLAGS_IF));
-
- if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
- /*
- * If interrupts enabled, and not blocked by sti or mov ss. Good.
- */
- svm_do_inject_vector(svm);
-
- /*
- * Interrupts blocked. Wait for unblock.
- */
- if (!svm->vcpu.interrupt_window_open &&
- (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
- control->intercept |= 1ULL << INTERCEPT_VINTR;
- } else
- control->intercept &= ~(1ULL << INTERCEPT_VINTR);
-}
-
-static void save_db_regs(unsigned long *db_regs)
-{
- asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
- asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
- asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
- asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
-}
-
-static void load_db_regs(unsigned long *db_regs)
-{
- asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
- asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
- asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
- asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
-}
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u16 fs_selector;
- u16 gs_selector;
- u16 ldt_selector;
-
- pre_svm_run(svm);
-
- save_host_msrs(vcpu);
- fs_selector = read_fs();
- gs_selector = read_gs();
- ldt_selector = read_ldt();
- svm->host_cr2 = kvm_read_cr2();
- svm->host_dr6 = read_dr6();
- svm->host_dr7 = read_dr7();
- svm->vmcb->save.cr2 = vcpu->cr2;
-
- if (svm->vmcb->save.dr7 & 0xff) {
- write_dr7(0);
- save_db_regs(svm->host_db_regs);
- load_db_regs(svm->db_regs);
- }
-
- clgi();
-
- local_irq_enable();
-
- asm volatile (
-#ifdef CONFIG_X86_64
- "push %%rbx; push %%rcx; push %%rdx;"
- "push %%rsi; push %%rdi; push %%rbp;"
- "push %%r8; push %%r9; push %%r10; push %%r11;"
- "push %%r12; push %%r13; push %%r14; push %%r15;"
-#else
- "push %%ebx; push %%ecx; push %%edx;"
- "push %%esi; push %%edi; push %%ebp;"
-#endif
-
-#ifdef CONFIG_X86_64
- "mov %c[rbx](%[svm]), %%rbx \n\t"
- "mov %c[rcx](%[svm]), %%rcx \n\t"
- "mov %c[rdx](%[svm]), %%rdx \n\t"
- "mov %c[rsi](%[svm]), %%rsi \n\t"
- "mov %c[rdi](%[svm]), %%rdi \n\t"
- "mov %c[rbp](%[svm]), %%rbp \n\t"
- "mov %c[r8](%[svm]), %%r8 \n\t"
- "mov %c[r9](%[svm]), %%r9 \n\t"
- "mov %c[r10](%[svm]), %%r10 \n\t"
- "mov %c[r11](%[svm]), %%r11 \n\t"
- "mov %c[r12](%[svm]), %%r12 \n\t"
- "mov %c[r13](%[svm]), %%r13 \n\t"
- "mov %c[r14](%[svm]), %%r14 \n\t"
- "mov %c[r15](%[svm]), %%r15 \n\t"
-#else
- "mov %c[rbx](%[svm]), %%ebx \n\t"
- "mov %c[rcx](%[svm]), %%ecx \n\t"
- "mov %c[rdx](%[svm]), %%edx \n\t"
- "mov %c[rsi](%[svm]), %%esi \n\t"
- "mov %c[rdi](%[svm]), %%edi \n\t"
- "mov %c[rbp](%[svm]), %%ebp \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
- /* Enter guest mode */
- "push %%rax \n\t"
- "mov %c[vmcb](%[svm]), %%rax \n\t"
- SVM_VMLOAD "\n\t"
- SVM_VMRUN "\n\t"
- SVM_VMSAVE "\n\t"
- "pop %%rax \n\t"
-#else
- /* Enter guest mode */
- "push %%eax \n\t"
- "mov %c[vmcb](%[svm]), %%eax \n\t"
- SVM_VMLOAD "\n\t"
- SVM_VMRUN "\n\t"
- SVM_VMSAVE "\n\t"
- "pop %%eax \n\t"
-#endif
-
- /* Save guest registers, load host registers */
-#ifdef CONFIG_X86_64
- "mov %%rbx, %c[rbx](%[svm]) \n\t"
- "mov %%rcx, %c[rcx](%[svm]) \n\t"
- "mov %%rdx, %c[rdx](%[svm]) \n\t"
- "mov %%rsi, %c[rsi](%[svm]) \n\t"
- "mov %%rdi, %c[rdi](%[svm]) \n\t"
- "mov %%rbp, %c[rbp](%[svm]) \n\t"
- "mov %%r8, %c[r8](%[svm]) \n\t"
- "mov %%r9, %c[r9](%[svm]) \n\t"
- "mov %%r10, %c[r10](%[svm]) \n\t"
- "mov %%r11, %c[r11](%[svm]) \n\t"
- "mov %%r12, %c[r12](%[svm]) \n\t"
- "mov %%r13, %c[r13](%[svm]) \n\t"
- "mov %%r14, %c[r14](%[svm]) \n\t"
- "mov %%r15, %c[r15](%[svm]) \n\t"
-
- "pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
- "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
- "pop %%rbp; pop %%rdi; pop %%rsi;"
- "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
-#else
- "mov %%ebx, %c[rbx](%[svm]) \n\t"
- "mov %%ecx, %c[rcx](%[svm]) \n\t"
- "mov %%edx, %c[rdx](%[svm]) \n\t"
- "mov %%esi, %c[rsi](%[svm]) \n\t"
- "mov %%edi, %c[rdi](%[svm]) \n\t"
- "mov %%ebp, %c[rbp](%[svm]) \n\t"
-
- "pop %%ebp; pop %%edi; pop %%esi;"
- "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
-#endif
- :
- : [svm]"a"(svm),
- [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
- [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
- ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
- [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
- [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
-#endif
- : "cc", "memory" );
-
- if ((svm->vmcb->save.dr7 & 0xff))
- load_db_regs(svm->host_db_regs);
-
- vcpu->cr2 = svm->vmcb->save.cr2;
-
- write_dr6(svm->host_dr6);
- write_dr7(svm->host_dr7);
- kvm_write_cr2(svm->host_cr2);
-
- load_fs(fs_selector);
- load_gs(gs_selector);
- load_ldt(ldt_selector);
- load_host_msrs(vcpu);
-
- reload_tss(vcpu);
-
- local_irq_disable();
-
- stgi();
-
- svm->next_rip = 0;
-}
-
-static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.cr3 = root;
- force_new_asid(vcpu);
-
- if (vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
- svm->vmcb->save.cr0 |= X86_CR0_TS;
- vcpu->fpu_active = 0;
- }
-}
-
-static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
- unsigned long addr,
- uint32_t err_code)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
-
- ++vcpu->stat.pf_guest;
-
- if (is_page_fault(exit_int_info)) {
-
- svm->vmcb->control.event_inj_err = 0;
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_VALID_ERR |
- SVM_EVTINJ_TYPE_EXEPT |
- DF_VECTOR;
- return;
- }
- vcpu->cr2 = addr;
- svm->vmcb->save.cr2 = addr;
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
- SVM_EVTINJ_VALID_ERR |
- SVM_EVTINJ_TYPE_EXEPT |
- PF_VECTOR;
- svm->vmcb->control.event_inj_err = err_code;
-}
-
-
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
-static void
-svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
- /*
- * Patch in the VMMCALL instruction:
- */
- hypercall[0] = 0x0f;
- hypercall[1] = 0x01;
- hypercall[2] = 0xd9;
- hypercall[3] = 0xc3;
-}
-
-static void svm_check_processor_compat(void *rtn)
-{
- *(int *)rtn = 0;
-}
-
-static struct kvm_x86_ops svm_x86_ops = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
- .hardware_setup = svm_hardware_setup,
- .hardware_unsetup = svm_hardware_unsetup,
- .check_processor_compatibility = svm_check_processor_compat,
- .hardware_enable = svm_hardware_enable,
- .hardware_disable = svm_hardware_disable,
-
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
- .vcpu_reset = svm_vcpu_reset,
-
- .prepare_guest_switch = svm_prepare_guest_switch,
- .vcpu_load = svm_vcpu_load,
- .vcpu_put = svm_vcpu_put,
- .vcpu_decache = svm_vcpu_decache,
-
- .set_guest_debug = svm_guest_debug,
- .get_msr = svm_get_msr,
- .set_msr = svm_set_msr,
- .get_segment_base = svm_get_segment_base,
- .get_segment = svm_get_segment,
- .set_segment = svm_set_segment,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
- .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
- .set_cr0 = svm_set_cr0,
- .set_cr3 = svm_set_cr3,
- .set_cr4 = svm_set_cr4,
- .set_efer = svm_set_efer,
- .get_idt = svm_get_idt,
- .set_idt = svm_set_idt,
- .get_gdt = svm_get_gdt,
- .set_gdt = svm_set_gdt,
- .get_dr = svm_get_dr,
- .set_dr = svm_set_dr,
- .cache_regs = svm_cache_regs,
- .decache_regs = svm_decache_regs,
- .get_rflags = svm_get_rflags,
- .set_rflags = svm_set_rflags,
-
- .tlb_flush = svm_flush_tlb,
- .inject_page_fault = svm_inject_page_fault,
-
- .inject_gp = svm_inject_gp,
-
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .patch_hypercall = svm_patch_hypercall,
- .get_irq = svm_get_irq,
- .set_irq = svm_set_irq,
- .inject_pending_irq = svm_intr_assist,
- .inject_pending_vectors = do_interrupt_requests,
-};
-
-static int __init svm_init(void)
-{
- return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
- THIS_MODULE);
-}
-
-static void __exit svm_exit(void)
-{
- kvm_exit_x86();
-}
-
-module_init(svm_init)
-module_exit(svm_exit)
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
deleted file mode 100644
index 3b1b0f35b6c..00000000000
--- a/drivers/kvm/svm.h
+++ /dev/null
@@ -1,324 +0,0 @@
-#ifndef __SVM_H
-#define __SVM_H
-
-enum {
- INTERCEPT_INTR,
- INTERCEPT_NMI,
- INTERCEPT_SMI,
- INTERCEPT_INIT,
- INTERCEPT_VINTR,
- INTERCEPT_SELECTIVE_CR0,
- INTERCEPT_STORE_IDTR,
- INTERCEPT_STORE_GDTR,
- INTERCEPT_STORE_LDTR,
- INTERCEPT_STORE_TR,
- INTERCEPT_LOAD_IDTR,
- INTERCEPT_LOAD_GDTR,
- INTERCEPT_LOAD_LDTR,
- INTERCEPT_LOAD_TR,
- INTERCEPT_RDTSC,
- INTERCEPT_RDPMC,
- INTERCEPT_PUSHF,
- INTERCEPT_POPF,
- INTERCEPT_CPUID,
- INTERCEPT_RSM,
- INTERCEPT_IRET,
- INTERCEPT_INTn,
- INTERCEPT_INVD,
- INTERCEPT_PAUSE,
- INTERCEPT_HLT,
- INTERCEPT_INVLPG,
- INTERCEPT_INVLPGA,
- INTERCEPT_IOIO_PROT,
- INTERCEPT_MSR_PROT,
- INTERCEPT_TASK_SWITCH,
- INTERCEPT_FERR_FREEZE,
- INTERCEPT_SHUTDOWN,
- INTERCEPT_VMRUN,
- INTERCEPT_VMMCALL,
- INTERCEPT_VMLOAD,
- INTERCEPT_VMSAVE,
- INTERCEPT_STGI,
- INTERCEPT_CLGI,
- INTERCEPT_SKINIT,
- INTERCEPT_RDTSCP,
- INTERCEPT_ICEBP,
- INTERCEPT_WBINVD,
- INTERCEPT_MONITOR,
- INTERCEPT_MWAIT,
- INTERCEPT_MWAIT_COND,
-};
-
-
-struct __attribute__ ((__packed__)) vmcb_control_area {
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
- u32 intercept_exceptions;
- u64 intercept;
- u8 reserved_1[44];
- u64 iopm_base_pa;
- u64 msrpm_base_pa;
- u64 tsc_offset;
- u32 asid;
- u8 tlb_ctl;
- u8 reserved_2[3];
- u32 int_ctl;
- u32 int_vector;
- u32 int_state;
- u8 reserved_3[4];
- u32 exit_code;
- u32 exit_code_hi;
- u64 exit_info_1;
- u64 exit_info_2;
- u32 exit_int_info;
- u32 exit_int_info_err;
- u64 nested_ctl;
- u8 reserved_4[16];
- u32 event_inj;
- u32 event_inj_err;
- u64 nested_cr3;
- u64 lbr_ctl;
- u8 reserved_5[832];
-};
-
-
-#define TLB_CONTROL_DO_NOTHING 0
-#define TLB_CONTROL_FLUSH_ALL_ASID 1
-
-#define V_TPR_MASK 0x0f
-
-#define V_IRQ_SHIFT 8
-#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
-
-#define V_INTR_PRIO_SHIFT 16
-#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
-
-#define V_IGN_TPR_SHIFT 20
-#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
-
-#define V_INTR_MASKING_SHIFT 24
-#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
-
-#define SVM_INTERRUPT_SHADOW_MASK 1
-
-#define SVM_IOIO_STR_SHIFT 2
-#define SVM_IOIO_REP_SHIFT 3
-#define SVM_IOIO_SIZE_SHIFT 4
-#define SVM_IOIO_ASIZE_SHIFT 7
-
-#define SVM_IOIO_TYPE_MASK 1
-#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
-#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
-#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
-#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-
-struct __attribute__ ((__packed__)) vmcb_seg {
- u16 selector;
- u16 attrib;
- u32 limit;
- u64 base;
-};
-
-struct __attribute__ ((__packed__)) vmcb_save_area {
- struct vmcb_seg es;
- struct vmcb_seg cs;
- struct vmcb_seg ss;
- struct vmcb_seg ds;
- struct vmcb_seg fs;
- struct vmcb_seg gs;
- struct vmcb_seg gdtr;
- struct vmcb_seg ldtr;
- struct vmcb_seg idtr;
- struct vmcb_seg tr;
- u8 reserved_1[43];
- u8 cpl;
- u8 reserved_2[4];
- u64 efer;
- u8 reserved_3[112];
- u64 cr4;
- u64 cr3;
- u64 cr0;
- u64 dr7;
- u64 dr6;
- u64 rflags;
- u64 rip;
- u8 reserved_4[88];
- u64 rsp;
- u8 reserved_5[24];
- u64 rax;
- u64 star;
- u64 lstar;
- u64 cstar;
- u64 sfmask;
- u64 kernel_gs_base;
- u64 sysenter_cs;
- u64 sysenter_esp;
- u64 sysenter_eip;
- u64 cr2;
- u8 reserved_6[32];
- u64 g_pat;
- u64 dbgctl;
- u64 br_from;
- u64 br_to;
- u64 last_excp_from;
- u64 last_excp_to;
-};
-
-struct __attribute__ ((__packed__)) vmcb {
- struct vmcb_control_area control;
- struct vmcb_save_area save;
-};
-
-#define SVM_CPUID_FEATURE_SHIFT 2
-#define SVM_CPUID_FUNC 0x8000000a
-
-#define MSR_EFER_SVME_MASK (1ULL << 12)
-#define MSR_VM_CR 0xc0010114
-#define MSR_VM_HSAVE_PA 0xc0010117ULL
-
-#define SVM_VM_CR_SVM_DISABLE 4
-
-#define SVM_SELECTOR_S_SHIFT 4
-#define SVM_SELECTOR_DPL_SHIFT 5
-#define SVM_SELECTOR_P_SHIFT 7
-#define SVM_SELECTOR_AVL_SHIFT 8
-#define SVM_SELECTOR_L_SHIFT 9
-#define SVM_SELECTOR_DB_SHIFT 10
-#define SVM_SELECTOR_G_SHIFT 11
-
-#define SVM_SELECTOR_TYPE_MASK (0xf)
-#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
-#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
-#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
-#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
-#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
-#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
-#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
-
-#define SVM_SELECTOR_WRITE_MASK (1 << 1)
-#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
-#define SVM_SELECTOR_CODE_MASK (1 << 3)
-
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
-
-#define SVM_EVTINJ_VEC_MASK 0xff
-
-#define SVM_EVTINJ_TYPE_SHIFT 8
-#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_VALID (1 << 31)
-#define SVM_EVTINJ_VALID_ERR (1 << 11)
-
-#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
-
-#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
-#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
-#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
-#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
-
-#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
-#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
-
-#define SVM_EXIT_READ_CR0 0x000
-#define SVM_EXIT_READ_CR3 0x003
-#define SVM_EXIT_READ_CR4 0x004
-#define SVM_EXIT_READ_CR8 0x008
-#define SVM_EXIT_WRITE_CR0 0x010
-#define SVM_EXIT_WRITE_CR3 0x013
-#define SVM_EXIT_WRITE_CR4 0x014
-#define SVM_EXIT_WRITE_CR8 0x018
-#define SVM_EXIT_READ_DR0 0x020
-#define SVM_EXIT_READ_DR1 0x021
-#define SVM_EXIT_READ_DR2 0x022
-#define SVM_EXIT_READ_DR3 0x023
-#define SVM_EXIT_READ_DR4 0x024
-#define SVM_EXIT_READ_DR5 0x025
-#define SVM_EXIT_READ_DR6 0x026
-#define SVM_EXIT_READ_DR7 0x027
-#define SVM_EXIT_WRITE_DR0 0x030
-#define SVM_EXIT_WRITE_DR1 0x031
-#define SVM_EXIT_WRITE_DR2 0x032
-#define SVM_EXIT_WRITE_DR3 0x033
-#define SVM_EXIT_WRITE_DR4 0x034
-#define SVM_EXIT_WRITE_DR5 0x035
-#define SVM_EXIT_WRITE_DR6 0x036
-#define SVM_EXIT_WRITE_DR7 0x037
-#define SVM_EXIT_EXCP_BASE 0x040
-#define SVM_EXIT_INTR 0x060
-#define SVM_EXIT_NMI 0x061
-#define SVM_EXIT_SMI 0x062
-#define SVM_EXIT_INIT 0x063
-#define SVM_EXIT_VINTR 0x064
-#define SVM_EXIT_CR0_SEL_WRITE 0x065
-#define SVM_EXIT_IDTR_READ 0x066
-#define SVM_EXIT_GDTR_READ 0x067
-#define SVM_EXIT_LDTR_READ 0x068
-#define SVM_EXIT_TR_READ 0x069
-#define SVM_EXIT_IDTR_WRITE 0x06a
-#define SVM_EXIT_GDTR_WRITE 0x06b
-#define SVM_EXIT_LDTR_WRITE 0x06c
-#define SVM_EXIT_TR_WRITE 0x06d
-#define SVM_EXIT_RDTSC 0x06e
-#define SVM_EXIT_RDPMC 0x06f
-#define SVM_EXIT_PUSHF 0x070
-#define SVM_EXIT_POPF 0x071
-#define SVM_EXIT_CPUID 0x072
-#define SVM_EXIT_RSM 0x073
-#define SVM_EXIT_IRET 0x074
-#define SVM_EXIT_SWINT 0x075
-#define SVM_EXIT_INVD 0x076
-#define SVM_EXIT_PAUSE 0x077
-#define SVM_EXIT_HLT 0x078
-#define SVM_EXIT_INVLPG 0x079
-#define SVM_EXIT_INVLPGA 0x07a
-#define SVM_EXIT_IOIO 0x07b
-#define SVM_EXIT_MSR 0x07c
-#define SVM_EXIT_TASK_SWITCH 0x07d
-#define SVM_EXIT_FERR_FREEZE 0x07e
-#define SVM_EXIT_SHUTDOWN 0x07f
-#define SVM_EXIT_VMRUN 0x080
-#define SVM_EXIT_VMMCALL 0x081
-#define SVM_EXIT_VMLOAD 0x082
-#define SVM_EXIT_VMSAVE 0x083
-#define SVM_EXIT_STGI 0x084
-#define SVM_EXIT_CLGI 0x085
-#define SVM_EXIT_SKINIT 0x086
-#define SVM_EXIT_RDTSCP 0x087
-#define SVM_EXIT_ICEBP 0x088
-#define SVM_EXIT_WBINVD 0x089
-#define SVM_EXIT_MONITOR 0x08a
-#define SVM_EXIT_MWAIT 0x08b
-#define SVM_EXIT_MWAIT_COND 0x08c
-#define SVM_EXIT_NPF 0x400
-
-#define SVM_EXIT_ERR -1
-
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP
-
-#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
-#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
-#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
-#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
-#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
-#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
-
-#endif
-
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
deleted file mode 100644
index bb56ae3f89b..00000000000
--- a/drivers/kvm/vmx.c
+++ /dev/null
@@ -1,2566 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "kvm.h"
-#include "x86_emulate.h"
-#include "irq.h"
-#include "vmx.h"
-#include "segment_descriptor.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-
-#include <asm/io.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-struct vmcs {
- u32 revision_id;
- u32 abort;
- char data[0];
-};
-
-struct vcpu_vmx {
- struct kvm_vcpu vcpu;
- int launched;
- u8 fail;
- struct kvm_msr_entry *guest_msrs;
- struct kvm_msr_entry *host_msrs;
- int nmsrs;
- int save_nmsrs;
- int msr_offset_efer;
-#ifdef CONFIG_X86_64
- int msr_offset_kernel_gs_base;
-#endif
- struct vmcs *vmcs;
- struct {
- int loaded;
- u16 fs_sel, gs_sel, ldt_sel;
- int gs_ldt_reload_needed;
- int fs_reload_needed;
- }host_state;
-
-};
-
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_vmx, vcpu);
-}
-
-static int init_rmode_tss(struct kvm *kvm);
-
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
-static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-
-static struct page *vmx_io_bitmap_a;
-static struct page *vmx_io_bitmap_b;
-
-#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
-
-static struct vmcs_config {
- int size;
- int order;
- u32 revision_id;
- u32 pin_based_exec_ctrl;
- u32 cpu_based_exec_ctrl;
- u32 vmexit_ctrl;
- u32 vmentry_ctrl;
-} vmcs_config;
-
-#define VMX_SEGMENT_FIELD(seg) \
- [VCPU_SREG_##seg] = { \
- .selector = GUEST_##seg##_SELECTOR, \
- .base = GUEST_##seg##_BASE, \
- .limit = GUEST_##seg##_LIMIT, \
- .ar_bytes = GUEST_##seg##_AR_BYTES, \
- }
-
-static struct kvm_vmx_segment_field {
- unsigned selector;
- unsigned base;
- unsigned limit;
- unsigned ar_bytes;
-} kvm_vmx_segment_fields[] = {
- VMX_SEGMENT_FIELD(CS),
- VMX_SEGMENT_FIELD(DS),
- VMX_SEGMENT_FIELD(ES),
- VMX_SEGMENT_FIELD(FS),
- VMX_SEGMENT_FIELD(GS),
- VMX_SEGMENT_FIELD(SS),
- VMX_SEGMENT_FIELD(TR),
- VMX_SEGMENT_FIELD(LDTR),
-};
-
-/*
- * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
- * away by decrementing the array size.
- */
-static const u32 vmx_msr_index[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
-#endif
- MSR_EFER, MSR_K6_STAR,
-};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- rdmsrl(e[i].index, e[i].data);
-}
-
-static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
-{
- return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
-}
-
-static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
-{
- int efer_offset = vmx->msr_offset_efer;
- return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
- msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-}
-
-static inline int is_page_fault(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_no_device(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_external_interrupt(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static inline int cpu_has_vmx_tpr_shadow(void)
-{
- return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
-}
-
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
-{
- return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
-}
-
-static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx->guest_msrs[i].index == msr)
- return i;
- return -1;
-}
-
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- i = __find_msr_index(vmx, msr);
- if (i >= 0)
- return &vmx->guest_msrs[i];
- return NULL;
-}
-
-static void vmcs_clear(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(vmcs);
- u8 error;
-
- asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc", "memory");
- if (error)
- printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
- vmcs, phys_addr);
-}
-
-static void __vcpu_clear(void *arg)
-{
- struct vcpu_vmx *vmx = arg;
- int cpu = raw_smp_processor_id();
-
- if (vmx->vcpu.cpu == cpu)
- vmcs_clear(vmx->vmcs);
- if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
- per_cpu(current_vmcs, cpu) = NULL;
- rdtscll(vmx->vcpu.host_tsc);
-}
-
-static void vcpu_clear(struct vcpu_vmx *vmx)
-{
- if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
- smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
- vmx, 0, 1);
- else
- __vcpu_clear(vmx);
- vmx->launched = 0;
-}
-
-static unsigned long vmcs_readl(unsigned long field)
-{
- unsigned long value;
-
- asm volatile (ASM_VMX_VMREAD_RDX_RAX
- : "=a"(value) : "d"(field) : "cc");
- return value;
-}
-
-static u16 vmcs_read16(unsigned long field)
-{
- return vmcs_readl(field);
-}
-
-static u32 vmcs_read32(unsigned long field)
-{
- return vmcs_readl(field);
-}
-
-static u64 vmcs_read64(unsigned long field)
-{
-#ifdef CONFIG_X86_64
- return vmcs_readl(field);
-#else
- return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
-#endif
-}
-
-static noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
- printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
- field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
- dump_stack();
-}
-
-static void vmcs_writel(unsigned long field, unsigned long value)
-{
- u8 error;
-
- asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
- : "=q"(error) : "a"(value), "d"(field) : "cc" );
- if (unlikely(error))
- vmwrite_error(field, value);
-}
-
-static void vmcs_write16(unsigned long field, u16 value)
-{
- vmcs_writel(field, value);
-}
-
-static void vmcs_write32(unsigned long field, u32 value)
-{
- vmcs_writel(field, value);
-}
-
-static void vmcs_write64(unsigned long field, u64 value)
-{
-#ifdef CONFIG_X86_64
- vmcs_writel(field, value);
-#else
- vmcs_writel(field, value);
- asm volatile ("");
- vmcs_writel(field+1, value >> 32);
-#endif
-}
-
-static void vmcs_clear_bits(unsigned long field, u32 mask)
-{
- vmcs_writel(field, vmcs_readl(field) & ~mask);
-}
-
-static void vmcs_set_bits(unsigned long field, u32 mask)
-{
- vmcs_writel(field, vmcs_readl(field) | mask);
-}
-
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
-{
- u32 eb;
-
- eb = 1u << PF_VECTOR;
- if (!vcpu->fpu_active)
- eb |= 1u << NM_VECTOR;
- if (vcpu->guest_debug.enabled)
- eb |= 1u << 1;
- if (vcpu->rmode.active)
- eb = ~0;
- vmcs_write32(EXCEPTION_BITMAP, eb);
-}
-
-static void reload_tss(void)
-{
-#ifndef CONFIG_X86_64
-
- /*
- * VT restores TR but not its size. Useless.
- */
- struct descriptor_table gdt;
- struct segment_descriptor *descs;
-
- get_gdt(&gdt);
- descs = (void *)gdt.base;
- descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
- load_TR_desc();
-#endif
-}
-
-static void load_transition_efer(struct vcpu_vmx *vmx)
-{
- u64 trans_efer;
- int efer_offset = vmx->msr_offset_efer;
-
- trans_efer = vmx->host_msrs[efer_offset].data;
- trans_efer &= ~EFER_SAVE_RESTORE_BITS;
- trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
- wrmsrl(MSR_EFER, trans_efer);
- vmx->vcpu.stat.efer_reload++;
-}
-
-static void vmx_save_host_state(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->host_state.loaded)
- return;
-
- vmx->host_state.loaded = 1;
- /*
- * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
- * allow segment selectors with cpl > 0 or ti == 1.
- */
- vmx->host_state.ldt_sel = read_ldt();
- vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
- vmx->host_state.fs_sel = read_fs();
- if (!(vmx->host_state.fs_sel & 7)) {
- vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
- vmx->host_state.fs_reload_needed = 0;
- } else {
- vmcs_write16(HOST_FS_SELECTOR, 0);
- vmx->host_state.fs_reload_needed = 1;
- }
- vmx->host_state.gs_sel = read_gs();
- if (!(vmx->host_state.gs_sel & 7))
- vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
- else {
- vmcs_write16(HOST_GS_SELECTOR, 0);
- vmx->host_state.gs_ldt_reload_needed = 1;
- }
-
-#ifdef CONFIG_X86_64
- vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
- vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
-#else
- vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
- vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
-#endif
-
-#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- save_msrs(vmx->host_msrs +
- vmx->msr_offset_kernel_gs_base, 1);
- }
-#endif
- load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- if (msr_efer_need_save_restore(vmx))
- load_transition_efer(vmx);
-}
-
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
-{
- unsigned long flags;
-
- if (!vmx->host_state.loaded)
- return;
-
- vmx->host_state.loaded = 0;
- if (vmx->host_state.fs_reload_needed)
- load_fs(vmx->host_state.fs_sel);
- if (vmx->host_state.gs_ldt_reload_needed) {
- load_ldt(vmx->host_state.ldt_sel);
- /*
- * If we have to reload gs, we must take care to
- * preserve our gs base.
- */
- local_irq_save(flags);
- load_gs(vmx->host_state.gs_sel);
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
-#endif
- local_irq_restore(flags);
- }
- reload_tss();
- save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- load_msrs(vmx->host_msrs, vmx->save_nmsrs);
- if (msr_efer_need_save_restore(vmx))
- load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
-}
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(vmx->vmcs);
- u64 tsc_this, delta;
-
- if (vcpu->cpu != cpu) {
- vcpu_clear(vmx);
- kvm_migrate_apic_timer(vcpu);
- }
-
- if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
- u8 error;
-
- per_cpu(current_vmcs, cpu) = vmx->vmcs;
- asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc");
- if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
- vmx->vmcs, phys_addr);
- }
-
- if (vcpu->cpu != cpu) {
- struct descriptor_table dt;
- unsigned long sysenter_esp;
-
- vcpu->cpu = cpu;
- /*
- * Linux uses per-cpu TSS and GDT, so set these when switching
- * processors.
- */
- vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
- get_gdt(&dt);
- vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
-
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
- /*
- * Make sure the time stamp counter is monotonous.
- */
- rdtscll(tsc_this);
- delta = vcpu->host_tsc - tsc_this;
- vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
- }
-}
-
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
-{
- vmx_load_host_state(to_vmx(vcpu));
- kvm_put_guest_fpu(vcpu);
-}
-
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
- if (vcpu->fpu_active)
- return;
- vcpu->fpu_active = 1;
- vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
- if (vcpu->cr0 & X86_CR0_TS)
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
- update_exception_bitmap(vcpu);
-}
-
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->fpu_active)
- return;
- vcpu->fpu_active = 0;
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
- update_exception_bitmap(vcpu);
-}
-
-static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
-{
- vcpu_clear(to_vmx(vcpu));
-}
-
-static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
-{
- return vmcs_readl(GUEST_RFLAGS);
-}
-
-static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- if (vcpu->rmode.active)
- rflags |= IOPL_MASK | X86_EFLAGS_VM;
- vmcs_writel(GUEST_RFLAGS, rflags);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
- unsigned long rip;
- u32 interruptibility;
-
- rip = vmcs_readl(GUEST_RIP);
- rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs_writel(GUEST_RIP, rip);
-
- /*
- * We emulated an instruction, so temporary interrupt blocking
- * should be removed, if set.
- */
- interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- if (interruptibility & 3)
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- interruptibility & ~3);
- vcpu->interrupt_window_open = 1;
-}
-
-static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
-{
- printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
- vmcs_readl(GUEST_RIP));
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- GP_VECTOR |
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
-}
-
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-#ifdef CONFIG_X86_64
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
- struct kvm_msr_entry tmp;
-
- tmp = vmx->guest_msrs[to];
- vmx->guest_msrs[to] = vmx->guest_msrs[from];
- vmx->guest_msrs[from] = tmp;
- tmp = vmx->host_msrs[to];
- vmx->host_msrs[to] = vmx->host_msrs[from];
- vmx->host_msrs[from] = tmp;
-}
-#endif
-
-/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
- */
-static void setup_msrs(struct vcpu_vmx *vmx)
-{
- int save_nmsrs;
-
- save_nmsrs = 0;
-#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- int index;
-
- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_LSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_CSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- /*
- * MSR_K6_STAR is only needed on long mode guests, and only
- * if efer.sce is enabled.
- */
- index = __find_msr_index(vmx, MSR_K6_STAR);
- if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
- move_msr_up(vmx, index, save_nmsrs++);
- }
-#endif
- vmx->save_nmsrs = save_nmsrs;
-
-#ifdef CONFIG_X86_64
- vmx->msr_offset_kernel_gs_base =
- __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
- vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
-}
-
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset -- 21.3
- */
-static u64 guest_read_tsc(void)
-{
- u64 host_tsc, tsc_offset;
-
- rdtscll(host_tsc);
- tsc_offset = vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
-}
-
-/*
- * writes 'guest_tsc' into guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
- */
-static void guest_write_tsc(u64 guest_tsc)
-{
- u64 host_tsc;
-
- rdtscll(host_tsc);
- vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
-}
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- u64 data;
- struct kvm_msr_entry *msr;
-
- if (!pdata) {
- printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
- return -EINVAL;
- }
-
- switch (msr_index) {
-#ifdef CONFIG_X86_64
- case MSR_FS_BASE:
- data = vmcs_readl(GUEST_FS_BASE);
- break;
- case MSR_GS_BASE:
- data = vmcs_readl(GUEST_GS_BASE);
- break;
- case MSR_EFER:
- return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
- case MSR_IA32_TIME_STAMP_COUNTER:
- data = guest_read_tsc();
- break;
- case MSR_IA32_SYSENTER_CS:
- data = vmcs_read32(GUEST_SYSENTER_CS);
- break;
- case MSR_IA32_SYSENTER_EIP:
- data = vmcs_readl(GUEST_SYSENTER_EIP);
- break;
- case MSR_IA32_SYSENTER_ESP:
- data = vmcs_readl(GUEST_SYSENTER_ESP);
- break;
- default:
- msr = find_msr_entry(to_vmx(vcpu), msr_index);
- if (msr) {
- data = msr->data;
- break;
- }
- return kvm_get_msr_common(vcpu, msr_index, pdata);
- }
-
- *pdata = data;
- return 0;
-}
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr;
- int ret = 0;
-
- switch (msr_index) {
-#ifdef CONFIG_X86_64
- case MSR_EFER:
- ret = kvm_set_msr_common(vcpu, msr_index, data);
- if (vmx->host_state.loaded)
- load_transition_efer(vmx);
- break;
- case MSR_FS_BASE:
- vmcs_writel(GUEST_FS_BASE, data);
- break;
- case MSR_GS_BASE:
- vmcs_writel(GUEST_GS_BASE, data);
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- vmcs_write32(GUEST_SYSENTER_CS, data);
- break;
- case MSR_IA32_SYSENTER_EIP:
- vmcs_writel(GUEST_SYSENTER_EIP, data);
- break;
- case MSR_IA32_SYSENTER_ESP:
- vmcs_writel(GUEST_SYSENTER_ESP, data);
- break;
- case MSR_IA32_TIME_STAMP_COUNTER:
- guest_write_tsc(data);
- break;
- default:
- msr = find_msr_entry(vmx, msr_index);
- if (msr) {
- msr->data = data;
- if (vmx->host_state.loaded)
- load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_index, data);
- }
-
- return ret;
-}
-
-/*
- * Sync the rsp and rip registers into the vcpu structure. This allows
- * registers to be accessed by indexing vcpu->regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
- vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
- vcpu->rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs. Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
-{
- vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
- vmcs_writel(GUEST_RIP, vcpu->rip);
-}
-
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
-{
- unsigned long dr7 = 0x400;
- int old_singlestep;
-
- old_singlestep = vcpu->guest_debug.singlestep;
-
- vcpu->guest_debug.enabled = dbg->enabled;
- if (vcpu->guest_debug.enabled) {
- int i;
-
- dr7 |= 0x200; /* exact */
- for (i = 0; i < 4; ++i) {
- if (!dbg->breakpoints[i].enabled)
- continue;
- vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
- dr7 |= 2 << (i*2); /* global enable */
- dr7 |= 0 << (i*4+16); /* execution breakpoint */
- }
-
- vcpu->guest_debug.singlestep = dbg->singlestep;
- } else
- vcpu->guest_debug.singlestep = 0;
-
- if (old_singlestep && !vcpu->guest_debug.singlestep) {
- unsigned long flags;
-
- flags = vmcs_readl(GUEST_RFLAGS);
- flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- vmcs_writel(GUEST_RFLAGS, flags);
- }
-
- update_exception_bitmap(vcpu);
- vmcs_writel(GUEST_DR7, dr7);
-
- return 0;
-}
-
-static int vmx_get_irq(struct kvm_vcpu *vcpu)
-{
- u32 idtv_info_field;
-
- idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- if (idtv_info_field & INTR_INFO_VALID_MASK) {
- if (is_external_interrupt(idtv_info_field))
- return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
- else
- printk("pending exception: not handled yet\n");
- }
- return -1;
-}
-
-static __init int cpu_has_kvm_support(void)
-{
- unsigned long ecx = cpuid_ecx(1);
- return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- u64 msr;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
- == MSR_IA32_FEATURE_CONTROL_LOCKED;
- /* locked but not enabled */
-}
-
-static void hardware_enable(void *garbage)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
- if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
- != (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
- /* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
- MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
- asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
- : "memory", "cc");
-}
-
-static void hardware_disable(void *garbage)
-{
- asm volatile (ASM_VMX_VMXOFF : : : "cc");
-}
-
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32* result)
-{
- u32 vmx_msr_low, vmx_msr_high;
- u32 ctl = ctl_min | ctl_opt;
-
- rdmsr(msr, vmx_msr_low, vmx_msr_high);
-
- ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
- ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
-
- /* Ensure minimum (required) set of control bits are supported. */
- if (ctl_min & ~ctl)
- return -EIO;
-
- *result = ctl;
- return 0;
-}
-
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
-{
- u32 vmx_msr_low, vmx_msr_high;
- u32 min, opt;
- u32 _pin_based_exec_control = 0;
- u32 _cpu_based_exec_control = 0;
- u32 _vmexit_control = 0;
- u32 _vmentry_control = 0;
-
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = 0;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
- &_pin_based_exec_control) < 0)
- return -EIO;
-
- min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
- CPU_BASED_CR8_LOAD_EXITING |
- CPU_BASED_CR8_STORE_EXITING |
-#endif
- CPU_BASED_USE_IO_BITMAPS |
- CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING;
-#ifdef CONFIG_X86_64
- opt = CPU_BASED_TPR_SHADOW;
-#else
- opt = 0;
-#endif
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
- return -EIO;
-#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
- _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
- ~CPU_BASED_CR8_STORE_EXITING;
-#endif
-
- min = 0;
-#ifdef CONFIG_X86_64
- min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
- opt = 0;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
- &_vmexit_control) < 0)
- return -EIO;
-
- min = opt = 0;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
- &_vmentry_control) < 0)
- return -EIO;
-
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
-
- /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
- return -EIO;
-
-#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
- return -EIO;
-#endif
-
- /* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
- return -EIO;
-
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_config.size);
- vmcs_conf->revision_id = vmx_msr_low;
-
- vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
- vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
- vmcs_conf->vmexit_ctrl = _vmexit_control;
- vmcs_conf->vmentry_ctrl = _vmentry_control;
-
- return 0;
-}
-
-static struct vmcs *alloc_vmcs_cpu(int cpu)
-{
- int node = cpu_to_node(cpu);
- struct page *pages;
- struct vmcs *vmcs;
-
- pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
- if (!pages)
- return NULL;
- vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
- vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
- return vmcs;
-}
-
-static struct vmcs *alloc_vmcs(void)
-{
- return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
-static void free_vmcs(struct vmcs *vmcs)
-{
- free_pages((unsigned long)vmcs, vmcs_config.order);
-}
-
-static void free_kvm_area(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- free_vmcs(per_cpu(vmxarea, cpu));
-}
-
-static __init int alloc_kvm_area(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct vmcs *vmcs;
-
- vmcs = alloc_vmcs_cpu(cpu);
- if (!vmcs) {
- free_kvm_area();
- return -ENOMEM;
- }
-
- per_cpu(vmxarea, cpu) = vmcs;
- }
- return 0;
-}
-
-static __init int hardware_setup(void)
-{
- if (setup_vmcs_config(&vmcs_config) < 0)
- return -EIO;
- return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
- free_kvm_area();
-}
-
-static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
- vmcs_write16(sf->selector, save->selector);
- vmcs_writel(sf->base, save->base);
- vmcs_write32(sf->limit, save->limit);
- vmcs_write32(sf->ar_bytes, save->ar);
- } else {
- u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
- << AR_DPL_SHIFT;
- vmcs_write32(sf->ar_bytes, 0x93 | dpl);
- }
-}
-
-static void enter_pmode(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
-
- vcpu->rmode.active = 0;
-
- vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
-
- flags = vmcs_readl(GUEST_RFLAGS);
- flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
- flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
- vmcs_writel(GUEST_RFLAGS, flags);
-
- vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
- (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
-
- update_exception_bitmap(vcpu);
-
- fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
-
- vmcs_write16(GUEST_SS_SELECTOR, 0);
- vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
- vmcs_write16(GUEST_CS_SELECTOR,
- vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-}
-
-static gva_t rmode_tss_base(struct kvm* kvm)
-{
- gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
- return base_gfn << PAGE_SHIFT;
-}
-
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- save->selector = vmcs_read16(sf->selector);
- save->base = vmcs_readl(sf->base);
- save->limit = vmcs_read32(sf->limit);
- save->ar = vmcs_read32(sf->ar_bytes);
- vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
- vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
-}
-
-static void enter_rmode(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
-
- vcpu->rmode.active = 1;
-
- vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
- vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
- vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
- vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
- vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
- flags = vmcs_readl(GUEST_RFLAGS);
- vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
-
- flags |= IOPL_MASK | X86_EFLAGS_VM;
-
- vmcs_writel(GUEST_RFLAGS, flags);
- vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
- update_exception_bitmap(vcpu);
-
- vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
- vmcs_write32(GUEST_SS_LIMIT, 0xffff);
- vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
-
- vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
- vmcs_writel(GUEST_CS_BASE, 0xf0000);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
-
- fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
-
- kvm_mmu_reset_context(vcpu);
- init_rmode_tss(vcpu->kvm);
-}
-
-#ifdef CONFIG_X86_64
-
-static void enter_lmode(struct kvm_vcpu *vcpu)
-{
- u32 guest_tr_ar;
-
- guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
- if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
- printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
- __FUNCTION__);
- vmcs_write32(GUEST_TR_AR_BYTES,
- (guest_tr_ar & ~AR_TYPE_MASK)
- | AR_TYPE_BUSY_64_TSS);
- }
-
- vcpu->shadow_efer |= EFER_LMA;
-
- find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS)
- | VM_ENTRY_IA32E_MODE);
-}
-
-static void exit_lmode(struct kvm_vcpu *vcpu)
-{
- vcpu->shadow_efer &= ~EFER_LMA;
-
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS)
- & ~VM_ENTRY_IA32E_MODE);
-}
-
-#endif
-
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
- vcpu->cr4 &= KVM_GUEST_CR4_MASK;
- vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
-}
-
-static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- vmx_fpu_deactivate(vcpu);
-
- if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
- enter_pmode(vcpu);
-
- if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
- enter_rmode(vcpu);
-
-#ifdef CONFIG_X86_64
- if (vcpu->shadow_efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
- enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
- exit_lmode(vcpu);
- }
-#endif
-
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0,
- (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
- vcpu->cr0 = cr0;
-
- if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
- vmx_fpu_activate(vcpu);
-}
-
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- vmcs_writel(GUEST_CR3, cr3);
- if (vcpu->cr0 & X86_CR0_PE)
- vmx_fpu_deactivate(vcpu);
-}
-
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- vmcs_writel(CR4_READ_SHADOW, cr4);
- vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
- vcpu->cr4 = cr4;
-}
-
-#ifdef CONFIG_X86_64
-
-static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
- vcpu->shadow_efer = efer;
- if (efer & EFER_LMA) {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) |
- VM_ENTRY_IA32E_MODE);
- msr->data = efer;
-
- } else {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) &
- ~VM_ENTRY_IA32E_MODE);
-
- msr->data = efer & ~EFER_LME;
- }
- setup_msrs(vmx);
-}
-
-#endif
-
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- return vmcs_readl(sf->base);
-}
-
-static void vmx_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- u32 ar;
-
- var->base = vmcs_readl(sf->base);
- var->limit = vmcs_read32(sf->limit);
- var->selector = vmcs_read16(sf->selector);
- ar = vmcs_read32(sf->ar_bytes);
- if (ar & AR_UNUSABLE_MASK)
- ar = 0;
- var->type = ar & 15;
- var->s = (ar >> 4) & 1;
- var->dpl = (ar >> 5) & 3;
- var->present = (ar >> 7) & 1;
- var->avl = (ar >> 12) & 1;
- var->l = (ar >> 13) & 1;
- var->db = (ar >> 14) & 1;
- var->g = (ar >> 15) & 1;
- var->unusable = (ar >> 16) & 1;
-}
-
-static u32 vmx_segment_access_rights(struct kvm_segment *var)
-{
- u32 ar;
-
- if (var->unusable)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
- if (ar == 0) /* a 0 value means unusable */
- ar = AR_UNUSABLE_MASK;
-
- return ar;
-}
-
-static void vmx_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- u32 ar;
-
- if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
- vcpu->rmode.tr.selector = var->selector;
- vcpu->rmode.tr.base = var->base;
- vcpu->rmode.tr.limit = var->limit;
- vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
- return;
- }
- vmcs_writel(sf->base, var->base);
- vmcs_write32(sf->limit, var->limit);
- vmcs_write16(sf->selector, var->selector);
- if (vcpu->rmode.active && var->s) {
- /*
- * Hack real-mode segments into vm86 compatibility.
- */
- if (var->base == 0xffff0000 && var->selector == 0xf000)
- vmcs_writel(sf->base, 0xf0000);
- ar = 0xf3;
- } else
- ar = vmx_segment_access_rights(var);
- vmcs_write32(sf->ar_bytes, ar);
-}
-
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
-
- *db = (ar >> 14) & 1;
- *l = (ar >> 13) & 1;
-}
-
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_IDTR_BASE);
-}
-
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_IDTR_BASE, dt->base);
-}
-
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_GDTR_BASE);
-}
-
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_GDTR_BASE, dt->base);
-}
-
-static int init_rmode_tss(struct kvm* kvm)
-{
- struct page *p1, *p2, *p3;
- gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
- char *page;
-
- p1 = gfn_to_page(kvm, fn++);
- p2 = gfn_to_page(kvm, fn++);
- p3 = gfn_to_page(kvm, fn);
-
- if (!p1 || !p2 || !p3) {
- kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
- return 0;
- }
-
- page = kmap_atomic(p1, KM_USER0);
- clear_page(page);
- *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- kunmap_atomic(page, KM_USER0);
-
- page = kmap_atomic(p2, KM_USER0);
- clear_page(page);
- kunmap_atomic(page, KM_USER0);
-
- page = kmap_atomic(p3, KM_USER0);
- clear_page(page);
- *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
- kunmap_atomic(page, KM_USER0);
-
- return 1;
-}
-
-static void seg_setup(int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- vmcs_write16(sf->selector, 0);
- vmcs_writel(sf->base, 0);
- vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0x93);
-}
-
-/*
- * Sets up the vmcs for emulated real mode.
- */
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
-{
- u32 host_sysenter_cs;
- u32 junk;
- unsigned long a;
- struct descriptor_table dt;
- int i;
- int ret = 0;
- unsigned long kvm_vmx_return;
- u64 msr;
- u32 exec_control;
-
- if (!init_rmode_tss(vmx->vcpu.kvm)) {
- ret = -ENOMEM;
- goto out;
- }
-
- vmx->vcpu.rmode.active = 0;
-
- vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- set_cr8(&vmx->vcpu, 0);
- msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (vmx->vcpu.vcpu_id == 0)
- msr |= MSR_IA32_APICBASE_BSP;
- kvm_set_apic_base(&vmx->vcpu, msr);
-
- fx_init(&vmx->vcpu);
-
- /*
- * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
- * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
- */
- if (vmx->vcpu.vcpu_id == 0) {
- vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_writel(GUEST_CS_BASE, 0x000f0000);
- } else {
- vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
- vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
- }
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-
- seg_setup(VCPU_SREG_DS);
- seg_setup(VCPU_SREG_ES);
- seg_setup(VCPU_SREG_FS);
- seg_setup(VCPU_SREG_GS);
- seg_setup(VCPU_SREG_SS);
-
- vmcs_write16(GUEST_TR_SELECTOR, 0);
- vmcs_writel(GUEST_TR_BASE, 0);
- vmcs_write32(GUEST_TR_LIMIT, 0xffff);
- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
- vmcs_write16(GUEST_LDTR_SELECTOR, 0);
- vmcs_writel(GUEST_LDTR_BASE, 0);
- vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
-
- vmcs_writel(GUEST_RFLAGS, 0x02);
- if (vmx->vcpu.vcpu_id == 0)
- vmcs_writel(GUEST_RIP, 0xfff0);
- else
- vmcs_writel(GUEST_RIP, 0);
- vmcs_writel(GUEST_RSP, 0);
-
- //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
- vmcs_writel(GUEST_DR7, 0x400);
-
- vmcs_writel(GUEST_GDTR_BASE, 0);
- vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
- vmcs_writel(GUEST_IDTR_BASE, 0);
- vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
- vmcs_write32(GUEST_ACTIVITY_STATE, 0);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
- vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-
- /* I/O */
- vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
-
- guest_write_tsc(0);
-
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
- /* Special registers */
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
- /* Control */
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
- vmcs_config.pin_based_exec_ctrl);
-
- exec_control = vmcs_config.cpu_based_exec_ctrl;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
- exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
- exec_control |= CPU_BASED_CR8_STORE_EXITING |
- CPU_BASED_CR8_LOAD_EXITING;
-#endif
- }
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
-
- vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
-
- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
- vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
- vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_FS_BASE, a);
- vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
- rdmsrl(MSR_GS_BASE, a);
- vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
- vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
- vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
-
- vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
-
- get_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
-
- asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
- vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
- rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
- vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
- rdmsrl(MSR_IA32_SYSENTER_ESP, a);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
- rdmsrl(MSR_IA32_SYSENTER_EIP, a);
- vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
-
- for (i = 0; i < NR_VMX_MSR; ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- u64 data;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- data = data_low | ((u64)data_high << 32);
- vmx->host_msrs[j].index = index;
- vmx->host_msrs[j].reserved = 0;
- vmx->host_msrs[j].data = data;
- vmx->guest_msrs[j] = vmx->host_msrs[j];
- ++vmx->nmsrs;
- }
-
- setup_msrs(vmx);
-
- vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
- /* 22.2.1, 20.8.1 */
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
-
-#ifdef CONFIG_X86_64
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- page_to_phys(vmx->vcpu.apic->regs_page));
- vmcs_write32(TPR_THRESHOLD, 0);
-#endif
-
- vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
-
- vmx->vcpu.cr0 = 0x60000010;
- vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
- vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
- vmx_set_efer(&vmx->vcpu, 0);
-#endif
- vmx_fpu_activate(&vmx->vcpu);
- update_exception_bitmap(&vmx->vcpu);
-
- return 0;
-
-out:
- return ret;
-}
-
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- vmx_vcpu_setup(vmx);
-}
-
-static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
-{
- u16 ent[2];
- u16 cs;
- u16 ip;
- unsigned long flags;
- unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
- u16 sp = vmcs_readl(GUEST_RSP);
- u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
-
- if (sp > ss_limit || sp < 6 ) {
- vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
- __FUNCTION__,
- vmcs_readl(GUEST_RSP),
- vmcs_readl(GUEST_SS_BASE),
- vmcs_read32(GUEST_SS_LIMIT));
- return;
- }
-
- if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
- X86EMUL_CONTINUE) {
- vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
- return;
- }
-
- flags = vmcs_readl(GUEST_RFLAGS);
- cs = vmcs_readl(GUEST_CS_BASE) >> 4;
- ip = vmcs_readl(GUEST_RIP);
-
-
- if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
- emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
- emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
- vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
- return;
- }
-
- vmcs_writel(GUEST_RFLAGS, flags &
- ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
- vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
- vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
- vmcs_writel(GUEST_RIP, ent[0]);
- vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
-}
-
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
-{
- if (vcpu->rmode.active) {
- inject_rmode_irq(vcpu, irq);
- return;
- }
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
- vmx_inject_irq(vcpu, irq);
-}
-
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- u32 cpu_based_vm_exec_control;
-
- vcpu->interrupt_window_open =
- ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
-
- if (vcpu->interrupt_window_open &&
- vcpu->irq_summary &&
- !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
- /*
- * If interrupts enabled, and not blocked by sti or mov ss. Good.
- */
- kvm_do_inject_irq(vcpu);
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- if (!vcpu->interrupt_window_open &&
- (vcpu->irq_summary || kvm_run->request_interrupt_window))
- /*
- * Interrupts blocked. Wait for unblock.
- */
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- else
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
-{
- struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-
- set_debugreg(dbg->bp[0], 0);
- set_debugreg(dbg->bp[1], 1);
- set_debugreg(dbg->bp[2], 2);
- set_debugreg(dbg->bp[3], 3);
-
- if (dbg->singlestep) {
- unsigned long flags;
-
- flags = vmcs_readl(GUEST_RFLAGS);
- flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
- vmcs_writel(GUEST_RFLAGS, flags);
- }
-}
-
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
- int vec, u32 err_code)
-{
- if (!vcpu->rmode.active)
- return 0;
-
- /*
- * Instruction with address size override prefix opcode 0x67
- * Cause the #SS fault with 0 error code in VM86 mode.
- */
- if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
- return 1;
- return 0;
-}
-
-static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- u32 intr_info, error_code;
- unsigned long cr2, rip;
- u32 vect_info;
- enum emulation_result er;
- int r;
-
- vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info)) {
- printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
- "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
- }
-
- if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
- int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
- set_bit(irq, vcpu->irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
- }
-
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
- return 1; /* already handled by vmx_vcpu_run() */
-
- if (is_no_device(intr_info)) {
- vmx_fpu_activate(vcpu);
- return 1;
- }
-
- error_code = 0;
- rip = vmcs_readl(GUEST_RIP);
- if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
- error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
- if (is_page_fault(intr_info)) {
- cr2 = vmcs_readl(EXIT_QUALIFICATION);
-
- mutex_lock(&vcpu->kvm->lock);
- r = kvm_mmu_page_fault(vcpu, cr2, error_code);
- if (r < 0) {
- mutex_unlock(&vcpu->kvm->lock);
- return r;
- }
- if (!r) {
- mutex_unlock(&vcpu->kvm->lock);
- return 1;
- }
-
- er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
- mutex_unlock(&vcpu->kvm->lock);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_DO_MMIO:
- ++vcpu->stat.mmio_exits;
- return 0;
- case EMULATE_FAIL:
- kvm_report_emulation_failure(vcpu, "pagetable");
- break;
- default:
- BUG();
- }
- }
-
- if (vcpu->rmode.active &&
- handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
- error_code)) {
- if (vcpu->halt_request) {
- vcpu->halt_request = 0;
- return kvm_emulate_halt(vcpu);
- }
- return 1;
- }
-
- if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- return 0;
- }
- kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
- kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
- kvm_run->ex.error_code = error_code;
- return 0;
-}
-
-static int handle_external_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- ++vcpu->stat.irq_exits;
- return 1;
-}
-
-static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
- return 0;
-}
-
-static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- int size, down, in, string, rep;
- unsigned port;
-
- ++vcpu->stat.io_exits;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- string = (exit_qualification & 16) != 0;
-
- if (string) {
- if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
-
- size = (exit_qualification & 7) + 1;
- in = (exit_qualification & 8) != 0;
- down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
- rep = (exit_qualification & 32) != 0;
- port = exit_qualification >> 16;
-
- return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
-}
-
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
- /*
- * Patch in the VMCALL instruction:
- */
- hypercall[0] = 0x0f;
- hypercall[1] = 0x01;
- hypercall[2] = 0xc1;
- hypercall[3] = 0xc3;
-}
-
-static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- int cr;
- int reg;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- cr = exit_qualification & 15;
- reg = (exit_qualification >> 8) & 15;
- switch ((exit_qualification >> 4) & 3) {
- case 0: /* mov to cr */
- switch (cr) {
- case 0:
- vcpu_load_rsp_rip(vcpu);
- set_cr0(vcpu, vcpu->regs[reg]);
- skip_emulated_instruction(vcpu);
- return 1;
- case 3:
- vcpu_load_rsp_rip(vcpu);
- set_cr3(vcpu, vcpu->regs[reg]);
- skip_emulated_instruction(vcpu);
- return 1;
- case 4:
- vcpu_load_rsp_rip(vcpu);
- set_cr4(vcpu, vcpu->regs[reg]);
- skip_emulated_instruction(vcpu);
- return 1;
- case 8:
- vcpu_load_rsp_rip(vcpu);
- set_cr8(vcpu, vcpu->regs[reg]);
- skip_emulated_instruction(vcpu);
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
- };
- break;
- case 2: /* clts */
- vcpu_load_rsp_rip(vcpu);
- vmx_fpu_deactivate(vcpu);
- vcpu->cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
- vmx_fpu_activate(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
- case 1: /*mov from cr*/
- switch (cr) {
- case 3:
- vcpu_load_rsp_rip(vcpu);
- vcpu->regs[reg] = vcpu->cr3;
- vcpu_put_rsp_rip(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
- case 8:
- vcpu_load_rsp_rip(vcpu);
- vcpu->regs[reg] = get_cr8(vcpu);
- vcpu_put_rsp_rip(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- break;
- case 3: /* lmsw */
- lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
-
- skip_emulated_instruction(vcpu);
- return 1;
- default:
- break;
- }
- kvm_run->exit_reason = 0;
- pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
- (int)(exit_qualification >> 4) & 3, cr);
- return 0;
-}
-
-static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- unsigned long val;
- int dr, reg;
-
- /*
- * FIXME: this code assumes the host is debugging the guest.
- * need to deal with guest debugging itself too.
- */
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- dr = exit_qualification & 7;
- reg = (exit_qualification >> 8) & 15;
- vcpu_load_rsp_rip(vcpu);
- if (exit_qualification & 16) {
- /* mov from dr */
- switch (dr) {
- case 6:
- val = 0xffff0ff0;
- break;
- case 7:
- val = 0x400;
- break;
- default:
- val = 0;
- }
- vcpu->regs[reg] = val;
- } else {
- /* mov to dr */
- }
- vcpu_put_rsp_rip(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- kvm_emulate_cpuid(vcpu);
- return 1;
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- u32 ecx = vcpu->regs[VCPU_REGS_RCX];
- u64 data;
-
- if (vmx_get_msr(vcpu, ecx, &data)) {
- vmx_inject_gp(vcpu, 0);
- return 1;
- }
-
- /* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- u32 ecx = vcpu->regs[VCPU_REGS_RCX];
- u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
-
- if (vmx_set_msr(vcpu, ecx, data) != 0) {
- vmx_inject_gp(vcpu, 0);
- return 1;
- }
-
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- return 1;
-}
-
-static int handle_interrupt_window(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending irq */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (kvm_run->request_interrupt_window &&
- !vcpu->irq_summary) {
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- ++vcpu->stat.irq_window_exits;
- return 0;
- }
- return 1;
-}
-
-static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- skip_emulated_instruction(vcpu);
- return kvm_emulate_halt(vcpu);
-}
-
-static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- skip_emulated_instruction(vcpu);
- return kvm_hypercall(vcpu, kvm_run);
-}
-
-/*
- * The exit handlers return 1 if the exit was handled fully and guest execution
- * may resume. Otherwise they set the kvm_run parameter to indicate what needs
- * to be done to userspace and return 0.
- */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run) = {
- [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
- [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
- [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
- [EXIT_REASON_IO_INSTRUCTION] = handle_io,
- [EXIT_REASON_CR_ACCESS] = handle_cr,
- [EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
- [EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold
-};
-
-static const int kvm_vmx_max_exit_handlers =
- ARRAY_SIZE(kvm_vmx_exit_handlers);
-
-/*
- * The guest has exited. See if we can fix it or if we need userspace
- * assistance.
- */
-static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
- u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (unlikely(vmx->fail)) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason
- = vmcs_read32(VM_INSTRUCTION_ERROR);
- return 0;
- }
-
- if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
- exit_reason != EXIT_REASON_EXCEPTION_NMI )
- printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
- "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
- if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
- else {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_reason;
- }
- return 0;
-}
-
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-}
-
-static void update_tpr_threshold(struct kvm_vcpu *vcpu)
-{
- int max_irr, tpr;
-
- if (!vm_need_tpr_shadow(vcpu->kvm))
- return;
-
- if (!kvm_lapic_enabled(vcpu) ||
- ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
- vmcs_write32(TPR_THRESHOLD, 0);
- return;
- }
-
- tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
- vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
-{
- u32 idtv_info_field, intr_info_field;
- int has_ext_irq, interrupt_window_open;
- int vector;
-
- kvm_inject_pending_timer_irqs(vcpu);
- update_tpr_threshold(vcpu);
-
- has_ext_irq = kvm_cpu_has_interrupt(vcpu);
- intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
- idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- if (intr_info_field & INTR_INFO_VALID_MASK) {
- if (idtv_info_field & INTR_INFO_VALID_MASK) {
- /* TODO: fault when IDT_Vectoring */
- printk(KERN_ERR "Fault when IDT_Vectoring\n");
- }
- if (has_ext_irq)
- enable_irq_window(vcpu);
- return;
- }
- if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
- if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs_read32(IDT_VECTORING_ERROR_CODE));
- if (unlikely(has_ext_irq))
- enable_irq_window(vcpu);
- return;
- }
- if (!has_ext_irq)
- return;
- interrupt_window_open =
- ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
- if (interrupt_window_open) {
- vector = kvm_cpu_get_interrupt(vcpu);
- vmx_inject_irq(vcpu, vector);
- kvm_timer_intr_post(vcpu, vector);
- } else
- enable_irq_window(vcpu);
-}
-
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 intr_info;
-
- /*
- * Loading guest fpu may have cleared host cr0.ts
- */
- vmcs_writel(HOST_CR0, read_cr0());
-
- asm (
- /* Store host registers */
-#ifdef CONFIG_X86_64
- "push %%rax; push %%rbx; push %%rdx;"
- "push %%rsi; push %%rdi; push %%rbp;"
- "push %%r8; push %%r9; push %%r10; push %%r11;"
- "push %%r12; push %%r13; push %%r14; push %%r15;"
- "push %%rcx \n\t"
- ASM_VMX_VMWRITE_RSP_RDX "\n\t"
-#else
- "pusha; push %%ecx \n\t"
- ASM_VMX_VMWRITE_RSP_RDX "\n\t"
-#endif
- /* Check if vmlaunch of vmresume is needed */
- "cmp $0, %1 \n\t"
- /* Load guest registers. Don't clobber flags. */
-#ifdef CONFIG_X86_64
- "mov %c[cr2](%3), %%rax \n\t"
- "mov %%rax, %%cr2 \n\t"
- "mov %c[rax](%3), %%rax \n\t"
- "mov %c[rbx](%3), %%rbx \n\t"
- "mov %c[rdx](%3), %%rdx \n\t"
- "mov %c[rsi](%3), %%rsi \n\t"
- "mov %c[rdi](%3), %%rdi \n\t"
- "mov %c[rbp](%3), %%rbp \n\t"
- "mov %c[r8](%3), %%r8 \n\t"
- "mov %c[r9](%3), %%r9 \n\t"
- "mov %c[r10](%3), %%r10 \n\t"
- "mov %c[r11](%3), %%r11 \n\t"
- "mov %c[r12](%3), %%r12 \n\t"
- "mov %c[r13](%3), %%r13 \n\t"
- "mov %c[r14](%3), %%r14 \n\t"
- "mov %c[r15](%3), %%r15 \n\t"
- "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
-#else
- "mov %c[cr2](%3), %%eax \n\t"
- "mov %%eax, %%cr2 \n\t"
- "mov %c[rax](%3), %%eax \n\t"
- "mov %c[rbx](%3), %%ebx \n\t"
- "mov %c[rdx](%3), %%edx \n\t"
- "mov %c[rsi](%3), %%esi \n\t"
- "mov %c[rdi](%3), %%edi \n\t"
- "mov %c[rbp](%3), %%ebp \n\t"
- "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
-#endif
- /* Enter guest mode */
- "jne .Llaunched \n\t"
- ASM_VMX_VMLAUNCH "\n\t"
- "jmp .Lkvm_vmx_return \n\t"
- ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
- ".Lkvm_vmx_return: "
- /* Save guest registers, load host registers, keep flags */
-#ifdef CONFIG_X86_64
- "xchg %3, (%%rsp) \n\t"
- "mov %%rax, %c[rax](%3) \n\t"
- "mov %%rbx, %c[rbx](%3) \n\t"
- "pushq (%%rsp); popq %c[rcx](%3) \n\t"
- "mov %%rdx, %c[rdx](%3) \n\t"
- "mov %%rsi, %c[rsi](%3) \n\t"
- "mov %%rdi, %c[rdi](%3) \n\t"
- "mov %%rbp, %c[rbp](%3) \n\t"
- "mov %%r8, %c[r8](%3) \n\t"
- "mov %%r9, %c[r9](%3) \n\t"
- "mov %%r10, %c[r10](%3) \n\t"
- "mov %%r11, %c[r11](%3) \n\t"
- "mov %%r12, %c[r12](%3) \n\t"
- "mov %%r13, %c[r13](%3) \n\t"
- "mov %%r14, %c[r14](%3) \n\t"
- "mov %%r15, %c[r15](%3) \n\t"
- "mov %%cr2, %%rax \n\t"
- "mov %%rax, %c[cr2](%3) \n\t"
- "mov (%%rsp), %3 \n\t"
-
- "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
- "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
- "pop %%rbp; pop %%rdi; pop %%rsi;"
- "pop %%rdx; pop %%rbx; pop %%rax \n\t"
-#else
- "xchg %3, (%%esp) \n\t"
- "mov %%eax, %c[rax](%3) \n\t"
- "mov %%ebx, %c[rbx](%3) \n\t"
- "pushl (%%esp); popl %c[rcx](%3) \n\t"
- "mov %%edx, %c[rdx](%3) \n\t"
- "mov %%esi, %c[rsi](%3) \n\t"
- "mov %%edi, %c[rdi](%3) \n\t"
- "mov %%ebp, %c[rbp](%3) \n\t"
- "mov %%cr2, %%eax \n\t"
- "mov %%eax, %c[cr2](%3) \n\t"
- "mov (%%esp), %3 \n\t"
-
- "pop %%ecx; popa \n\t"
-#endif
- "setbe %0 \n\t"
- : "=q" (vmx->fail)
- : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
- "c"(vcpu),
- [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
- [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
- [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
- [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
-#endif
- [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
- : "cc", "memory" );
-
- vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
-
- asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
- vmx->launched = 1;
-
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- /* We need to handle NMIs before interrupts are enabled */
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
- asm("int $2");
-}
-
-static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
- unsigned long addr,
- u32 err_code)
-{
- u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
- ++vcpu->stat.pf_guest;
-
- if (is_page_fault(vect_info)) {
- printk(KERN_DEBUG "inject_page_fault: "
- "double fault 0x%lx @ 0x%lx\n",
- addr, vmcs_readl(GUEST_RIP));
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- DF_VECTOR |
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
- return;
- }
- vcpu->cr2 = addr;
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- PF_VECTOR |
- INTR_TYPE_EXCEPTION |
- INTR_INFO_DELIEVER_CODE_MASK |
- INTR_INFO_VALID_MASK);
-
-}
-
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->vmcs) {
- on_each_cpu(__vcpu_clear, vmx, 0, 1);
- free_vmcs(vmx->vmcs);
- vmx->vmcs = NULL;
- }
-}
-
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- vmx_free_vmcs(vcpu);
- kfree(vmx->host_msrs);
- kfree(vmx->guest_msrs);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
-}
-
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
-{
- int err;
- struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- int cpu;
-
- if (!vmx)
- return ERR_PTR(-ENOMEM);
-
- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
- if (err)
- goto free_vcpu;
-
- if (irqchip_in_kernel(kvm)) {
- err = kvm_create_lapic(&vmx->vcpu);
- if (err < 0)
- goto free_vcpu;
- }
-
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!vmx->guest_msrs) {
- err = -ENOMEM;
- goto uninit_vcpu;
- }
-
- vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!vmx->host_msrs)
- goto free_guest_msrs;
-
- vmx->vmcs = alloc_vmcs();
- if (!vmx->vmcs)
- goto free_msrs;
-
- vmcs_clear(vmx->vmcs);
-
- cpu = get_cpu();
- vmx_vcpu_load(&vmx->vcpu, cpu);
- err = vmx_vcpu_setup(vmx);
- vmx_vcpu_put(&vmx->vcpu);
- put_cpu();
- if (err)
- goto free_vmcs;
-
- return &vmx->vcpu;
-
-free_vmcs:
- free_vmcs(vmx->vmcs);
-free_msrs:
- kfree(vmx->host_msrs);
-free_guest_msrs:
- kfree(vmx->guest_msrs);
-uninit_vcpu:
- kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
- kmem_cache_free(kvm_vcpu_cache, vmx);
- return ERR_PTR(err);
-}
-
-static void __init vmx_check_processor_compat(void *rtn)
-{
- struct vmcs_config vmcs_conf;
-
- *(int *)rtn = 0;
- if (setup_vmcs_config(&vmcs_conf) < 0)
- *(int *)rtn = -EIO;
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- *(int *)rtn = -EIO;
- }
-}
-
-static struct kvm_x86_ops vmx_x86_ops = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .hardware_setup = hardware_setup,
- .hardware_unsetup = hardware_unsetup,
- .check_processor_compatibility = vmx_check_processor_compat,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
-
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_guest_switch = vmx_save_host_state,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
- .vcpu_decache = vmx_vcpu_decache,
-
- .set_guest_debug = set_guest_debug,
- .guest_debug_pre = kvm_guest_debug_pre,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
- .set_cr0 = vmx_set_cr0,
- .set_cr3 = vmx_set_cr3,
- .set_cr4 = vmx_set_cr4,
-#ifdef CONFIG_X86_64
- .set_efer = vmx_set_efer,
-#endif
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .cache_regs = vcpu_load_rsp_rip,
- .decache_regs = vcpu_put_rsp_rip,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
-
- .tlb_flush = vmx_flush_tlb,
- .inject_page_fault = vmx_inject_page_fault,
-
- .inject_gp = vmx_inject_gp,
-
- .run = vmx_vcpu_run,
- .handle_exit = kvm_handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .patch_hypercall = vmx_patch_hypercall,
- .get_irq = vmx_get_irq,
- .set_irq = vmx_inject_irq,
- .inject_pending_irq = vmx_intr_assist,
- .inject_pending_vectors = do_interrupt_requests,
-};
-
-static int __init vmx_init(void)
-{
- void *iova;
- int r;
-
- vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
- if (!vmx_io_bitmap_a)
- return -ENOMEM;
-
- vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
- if (!vmx_io_bitmap_b) {
- r = -ENOMEM;
- goto out;
- }
-
- /*
- * Allow direct access to the PC debug port (it is often used for I/O
- * delays, but the vmexits simply slow things down).
- */
- iova = kmap(vmx_io_bitmap_a);
- memset(iova, 0xff, PAGE_SIZE);
- clear_bit(0x80, iova);
- kunmap(vmx_io_bitmap_a);
-
- iova = kmap(vmx_io_bitmap_b);
- memset(iova, 0xff, PAGE_SIZE);
- kunmap(vmx_io_bitmap_b);
-
- r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
- if (r)
- goto out1;
-
- return 0;
-
-out1:
- __free_page(vmx_io_bitmap_b);
-out:
- __free_page(vmx_io_bitmap_a);
- return r;
-}
-
-static void __exit vmx_exit(void)
-{
- __free_page(vmx_io_bitmap_b);
- __free_page(vmx_io_bitmap_a);
-
- kvm_exit_x86();
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
deleted file mode 100644
index fd4e1466608..00000000000
--- a/drivers/kvm/vmx.h
+++ /dev/null
@@ -1,310 +0,0 @@
-#ifndef VMX_H
-#define VMX_H
-
-/*
- * vmx.h: VMX Architecture related definitions
- * Copyright (c) 2004, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * A few random additions are:
- * Copyright (C) 2006 Qumranet
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- */
-
-#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
-#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
-#define CPU_BASED_HLT_EXITING 0x00000080
-#define CPU_BASED_INVLPG_EXITING 0x00000200
-#define CPU_BASED_MWAIT_EXITING 0x00000400
-#define CPU_BASED_RDPMC_EXITING 0x00000800
-#define CPU_BASED_RDTSC_EXITING 0x00001000
-#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
-#define CPU_BASED_CR8_STORE_EXITING 0x00100000
-#define CPU_BASED_TPR_SHADOW 0x00200000
-#define CPU_BASED_MOV_DR_EXITING 0x00800000
-#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
-#define CPU_BASED_USE_IO_BITMAPS 0x02000000
-#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
-#define CPU_BASED_MONITOR_EXITING 0x20000000
-#define CPU_BASED_PAUSE_EXITING 0x40000000
-#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
-
-#define PIN_BASED_EXT_INTR_MASK 0x00000001
-#define PIN_BASED_NMI_EXITING 0x00000008
-#define PIN_BASED_VIRTUAL_NMIS 0x00000020
-
-#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
-#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
-
-#define VM_ENTRY_IA32E_MODE 0x00000200
-#define VM_ENTRY_SMM 0x00000400
-#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
-
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-
-/* VMCS Encodings */
-enum vmcs_field {
- GUEST_ES_SELECTOR = 0x00000800,
- GUEST_CS_SELECTOR = 0x00000802,
- GUEST_SS_SELECTOR = 0x00000804,
- GUEST_DS_SELECTOR = 0x00000806,
- GUEST_FS_SELECTOR = 0x00000808,
- GUEST_GS_SELECTOR = 0x0000080a,
- GUEST_LDTR_SELECTOR = 0x0000080c,
- GUEST_TR_SELECTOR = 0x0000080e,
- HOST_ES_SELECTOR = 0x00000c00,
- HOST_CS_SELECTOR = 0x00000c02,
- HOST_SS_SELECTOR = 0x00000c04,
- HOST_DS_SELECTOR = 0x00000c06,
- HOST_FS_SELECTOR = 0x00000c08,
- HOST_GS_SELECTOR = 0x00000c0a,
- HOST_TR_SELECTOR = 0x00000c0c,
- IO_BITMAP_A = 0x00002000,
- IO_BITMAP_A_HIGH = 0x00002001,
- IO_BITMAP_B = 0x00002002,
- IO_BITMAP_B_HIGH = 0x00002003,
- MSR_BITMAP = 0x00002004,
- MSR_BITMAP_HIGH = 0x00002005,
- VM_EXIT_MSR_STORE_ADDR = 0x00002006,
- VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
- VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
- VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
- VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
- VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
- TSC_OFFSET = 0x00002010,
- TSC_OFFSET_HIGH = 0x00002011,
- VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
- VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
- VMCS_LINK_POINTER = 0x00002800,
- VMCS_LINK_POINTER_HIGH = 0x00002801,
- GUEST_IA32_DEBUGCTL = 0x00002802,
- GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
- PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
- CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
- EXCEPTION_BITMAP = 0x00004004,
- PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
- PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
- CR3_TARGET_COUNT = 0x0000400a,
- VM_EXIT_CONTROLS = 0x0000400c,
- VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
- VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
- VM_ENTRY_CONTROLS = 0x00004012,
- VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
- VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
- VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
- VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
- TPR_THRESHOLD = 0x0000401c,
- SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
- VM_INSTRUCTION_ERROR = 0x00004400,
- VM_EXIT_REASON = 0x00004402,
- VM_EXIT_INTR_INFO = 0x00004404,
- VM_EXIT_INTR_ERROR_CODE = 0x00004406,
- IDT_VECTORING_INFO_FIELD = 0x00004408,
- IDT_VECTORING_ERROR_CODE = 0x0000440a,
- VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
- VMX_INSTRUCTION_INFO = 0x0000440e,
- GUEST_ES_LIMIT = 0x00004800,
- GUEST_CS_LIMIT = 0x00004802,
- GUEST_SS_LIMIT = 0x00004804,
- GUEST_DS_LIMIT = 0x00004806,
- GUEST_FS_LIMIT = 0x00004808,
- GUEST_GS_LIMIT = 0x0000480a,
- GUEST_LDTR_LIMIT = 0x0000480c,
- GUEST_TR_LIMIT = 0x0000480e,
- GUEST_GDTR_LIMIT = 0x00004810,
- GUEST_IDTR_LIMIT = 0x00004812,
- GUEST_ES_AR_BYTES = 0x00004814,
- GUEST_CS_AR_BYTES = 0x00004816,
- GUEST_SS_AR_BYTES = 0x00004818,
- GUEST_DS_AR_BYTES = 0x0000481a,
- GUEST_FS_AR_BYTES = 0x0000481c,
- GUEST_GS_AR_BYTES = 0x0000481e,
- GUEST_LDTR_AR_BYTES = 0x00004820,
- GUEST_TR_AR_BYTES = 0x00004822,
- GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
- GUEST_ACTIVITY_STATE = 0X00004826,
- GUEST_SYSENTER_CS = 0x0000482A,
- HOST_IA32_SYSENTER_CS = 0x00004c00,
- CR0_GUEST_HOST_MASK = 0x00006000,
- CR4_GUEST_HOST_MASK = 0x00006002,
- CR0_READ_SHADOW = 0x00006004,
- CR4_READ_SHADOW = 0x00006006,
- CR3_TARGET_VALUE0 = 0x00006008,
- CR3_TARGET_VALUE1 = 0x0000600a,
- CR3_TARGET_VALUE2 = 0x0000600c,
- CR3_TARGET_VALUE3 = 0x0000600e,
- EXIT_QUALIFICATION = 0x00006400,
- GUEST_LINEAR_ADDRESS = 0x0000640a,
- GUEST_CR0 = 0x00006800,
- GUEST_CR3 = 0x00006802,
- GUEST_CR4 = 0x00006804,
- GUEST_ES_BASE = 0x00006806,
- GUEST_CS_BASE = 0x00006808,
- GUEST_SS_BASE = 0x0000680a,
- GUEST_DS_BASE = 0x0000680c,
- GUEST_FS_BASE = 0x0000680e,
- GUEST_GS_BASE = 0x00006810,
- GUEST_LDTR_BASE = 0x00006812,
- GUEST_TR_BASE = 0x00006814,
- GUEST_GDTR_BASE = 0x00006816,
- GUEST_IDTR_BASE = 0x00006818,
- GUEST_DR7 = 0x0000681a,
- GUEST_RSP = 0x0000681c,
- GUEST_RIP = 0x0000681e,
- GUEST_RFLAGS = 0x00006820,
- GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
- GUEST_SYSENTER_ESP = 0x00006824,
- GUEST_SYSENTER_EIP = 0x00006826,
- HOST_CR0 = 0x00006c00,
- HOST_CR3 = 0x00006c02,
- HOST_CR4 = 0x00006c04,
- HOST_FS_BASE = 0x00006c06,
- HOST_GS_BASE = 0x00006c08,
- HOST_TR_BASE = 0x00006c0a,
- HOST_GDTR_BASE = 0x00006c0c,
- HOST_IDTR_BASE = 0x00006c0e,
- HOST_IA32_SYSENTER_ESP = 0x00006c10,
- HOST_IA32_SYSENTER_EIP = 0x00006c12,
- HOST_RSP = 0x00006c14,
- HOST_RIP = 0x00006c16,
-};
-
-#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
-
-#define EXIT_REASON_EXCEPTION_NMI 0
-#define EXIT_REASON_EXTERNAL_INTERRUPT 1
-#define EXIT_REASON_TRIPLE_FAULT 2
-
-#define EXIT_REASON_PENDING_INTERRUPT 7
-
-#define EXIT_REASON_TASK_SWITCH 9
-#define EXIT_REASON_CPUID 10
-#define EXIT_REASON_HLT 12
-#define EXIT_REASON_INVLPG 14
-#define EXIT_REASON_RDPMC 15
-#define EXIT_REASON_RDTSC 16
-#define EXIT_REASON_VMCALL 18
-#define EXIT_REASON_VMCLEAR 19
-#define EXIT_REASON_VMLAUNCH 20
-#define EXIT_REASON_VMPTRLD 21
-#define EXIT_REASON_VMPTRST 22
-#define EXIT_REASON_VMREAD 23
-#define EXIT_REASON_VMRESUME 24
-#define EXIT_REASON_VMWRITE 25
-#define EXIT_REASON_VMOFF 26
-#define EXIT_REASON_VMON 27
-#define EXIT_REASON_CR_ACCESS 28
-#define EXIT_REASON_DR_ACCESS 29
-#define EXIT_REASON_IO_INSTRUCTION 30
-#define EXIT_REASON_MSR_READ 31
-#define EXIT_REASON_MSR_WRITE 32
-#define EXIT_REASON_MWAIT_INSTRUCTION 36
-#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
-
-/*
- * Interruption-information format
- */
-#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
-#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
-#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
-#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
-
-#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
-#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
-#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
-
-#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
-#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
-
-/*
- * Exit Qualifications for MOV for Control Register Access
- */
-#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
-#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */
-#define LMSW_SOURCE_DATA_SHIFT 16
-#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
-#define REG_EAX (0 << 8)
-#define REG_ECX (1 << 8)
-#define REG_EDX (2 << 8)
-#define REG_EBX (3 << 8)
-#define REG_ESP (4 << 8)
-#define REG_EBP (5 << 8)
-#define REG_ESI (6 << 8)
-#define REG_EDI (7 << 8)
-#define REG_R8 (8 << 8)
-#define REG_R9 (9 << 8)
-#define REG_R10 (10 << 8)
-#define REG_R11 (11 << 8)
-#define REG_R12 (12 << 8)
-#define REG_R13 (13 << 8)
-#define REG_R14 (14 << 8)
-#define REG_R15 (15 << 8)
-
-/*
- * Exit Qualifications for MOV for Debug Register Access
- */
-#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
-#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
-#define TYPE_MOV_TO_DR (0 << 4)
-#define TYPE_MOV_FROM_DR (1 << 4)
-#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */
-
-
-/* segment AR */
-#define SEGMENT_AR_L_MASK (1 << 13)
-
-#define AR_TYPE_ACCESSES_MASK 1
-#define AR_TYPE_READABLE_MASK (1 << 1)
-#define AR_TYPE_WRITEABLE_MASK (1 << 2)
-#define AR_TYPE_CODE_MASK (1 << 3)
-#define AR_TYPE_MASK 0x0f
-#define AR_TYPE_BUSY_64_TSS 11
-#define AR_TYPE_BUSY_32_TSS 11
-#define AR_TYPE_BUSY_16_TSS 3
-#define AR_TYPE_LDT 2
-
-#define AR_UNUSABLE_MASK (1 << 16)
-#define AR_S_MASK (1 << 4)
-#define AR_P_MASK (1 << 7)
-#define AR_L_MASK (1 << 13)
-#define AR_DB_MASK (1 << 14)
-#define AR_G_MASK (1 << 15)
-#define AR_DPL_SHIFT 5
-#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
-
-#define AR_RESERVD_MASK 0xfffe0f00
-
-#define MSR_IA32_VMX_BASIC 0x480
-#define MSR_IA32_VMX_PINBASED_CTLS 0x481
-#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
-#define MSR_IA32_VMX_EXIT_CTLS 0x483
-#define MSR_IA32_VMX_ENTRY_CTLS 0x484
-#define MSR_IA32_VMX_MISC 0x485
-#define MSR_IA32_VMX_CR0_FIXED0 0x486
-#define MSR_IA32_VMX_CR0_FIXED1 0x487
-#define MSR_IA32_VMX_CR4_FIXED0 0x488
-#define MSR_IA32_VMX_CR4_FIXED1 0x489
-#define MSR_IA32_VMX_VMCS_ENUM 0x48a
-#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
-
-#define MSR_IA32_FEATURE_CONTROL 0x3a
-#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
-
-#endif
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6bf89..00000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
-/******************************************************************************
- * x86_emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- *
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
-#else
-#include "kvm.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include "x86_emulate.h"
-#include <linux/module.h>
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
-#define DstReg (2<<1) /* Register operand. */
-#define DstMem (3<<1) /* Memory operand. */
-#define DstMask (3<<1)
-/* Source operand type. */
-#define SrcNone (0<<3) /* No source operand. */
-#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<3) /* Register operand. */
-#define SrcMem (2<<3) /* Memory operand. */
-#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
-#define SrcImm (5<<3) /* Immediate operand. */
-#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
-#define SrcMask (7<<3)
-/* Generic ModRM decode. */
-#define ModRM (1<<6)
-/* Destination is only written; never read. */
-#define Mov (1<<7)
-#define BitOp (1<<8)
-
-static u8 opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- SrcImmByte, SrcImm, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x38 - 0x3F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x40 - 0x4F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x50 - 0x57 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x58 - 0x5F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x60 - 0x67 */
- 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
- 0, 0, 0, 0,
- /* 0x68 - 0x6F */
- 0, 0, ImplicitOps|Mov, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
- /* 0x70 - 0x77 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x78 - 0x7F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x80 - 0x87 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- /* 0x88 - 0x8F */
- ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
- ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
- ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
- ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps, ImplicitOps,
- /* 0xA8 - 0xAF */
- 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
- ByteOp | ImplicitOps, ImplicitOps,
- /* 0xB0 - 0xBF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xC0 - 0xC7 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- 0, ImplicitOps, 0, 0,
- ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
- /* 0xC8 - 0xCF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xD7 */
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- 0, 0, 0, 0,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xE7 */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE8 - 0xEF */
- ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
- /* 0xF0 - 0xF7 */
- 0, 0, 0, 0,
- ImplicitOps, 0,
- ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- /* 0xF8 - 0xFF */
- 0, 0, 0, 0,
- 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
-};
-
-static u16 twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x48 - 0x4F */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
- /* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
- /* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
- enum { OP_REG, OP_MEM, OP_IMM } type;
- unsigned int bytes;
- unsigned long val, orig_val, *ptr;
-};
-
-/* EFLAGS bit definitions. */
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
-#endif
-
-/*
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
- */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
- "push %"_sav"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pushf; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- "popf; " \
- /* _sav &= ~msk; */ \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",%"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pushf; " \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"w %"_wx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"l %"_lx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
- break; \
- case 8: \
- __emulate_2op_8byte(_op, _src, _dst, \
- _eflags, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- switch ( (_dst).bytes ) \
- { \
- case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"b %"_bx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _by ((_src).val), "i" (EFLAGS_MASK) ); \
- break; \
- default: \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
- do { \
- unsigned long _tmp; \
- \
- switch ( (_dst).bytes ) \
- { \
- case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"b %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- break; \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"w %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"l %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- break; \
- case 8: \
- __emulate_1op_8byte(_op, _dst, _eflags); \
- break; \
- } \
- } while (0)
-
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","4","2") \
- _op"q %"_qx"3,%1; " \
- _POST_EFLAGS("0","4","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
- } while (0)
-
-#define __emulate_1op_8byte(_op, _dst, _eflags) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0","3","2") \
- _op"q %1; " \
- _POST_EFLAGS("0","3","2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK) ); \
- } while (0)
-
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif /* __i386__ */
-
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
- rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
- (_size), ctxt->vcpu); \
- if ( rc != 0 ) \
- goto done; \
- (_eip) += (_size); \
- (_type)_x; \
-})
-
-/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg) \
- ((ad_bytes == sizeof(unsigned long)) ? \
- (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
-#define register_address(base, reg) \
- ((base) + address_mask(reg))
-#define register_address_increment(reg, inc) \
- do { \
- /* signed type ensures sign extension to long */ \
- int _inc = (inc); \
- if ( ad_bytes == sizeof(unsigned long) ) \
- (reg) += _inc; \
- else \
- (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
- (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
- } while (0)
-
-#define JMP_REL(rel) \
- do { \
- register_address_increment(_eip, rel); \
- } while (0)
-
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
- */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
- int highbyte_regs)
-{
- void *p;
-
- p = &regs[modrm_reg];
- if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
- p = (unsigned char *)&regs[modrm_reg & 3] + 1;
- return p;
-}
-
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- void *ptr,
- u16 *size, unsigned long *address, int op_bytes)
-{
- int rc;
-
- if (op_bytes == 2)
- op_bytes = 3;
- *address = 0;
- rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu);
- if (rc)
- return rc;
- rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu);
- return rc;
-}
-
-static int test_cc(unsigned int condition, unsigned int flags)
-{
- int rc = 0;
-
- switch ((condition & 15) >> 1) {
- case 0: /* o */
- rc |= (flags & EFLG_OF);
- break;
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- break;
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- break;
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
- break;
- case 4: /* s */
- rc |= (flags & EFLG_SF);
- break;
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
- break;
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
- break;
- }
-
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
-}
-
-int
-x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
- unsigned d;
- u8 b, sib, twobyte = 0, rex_prefix = 0;
- u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
- unsigned long *override_base = NULL;
- unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
- int rc = 0;
- struct operand src, dst;
- unsigned long cr2 = ctxt->cr2;
- int mode = ctxt->mode;
- unsigned long modrm_ea;
- int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
- int no_wb = 0;
- u64 msr_data;
-
- /* Shadow copy of register state. Committed on successful emulation. */
- unsigned long _regs[NR_VCPU_REGS];
- unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
- unsigned long modrm_val = 0;
-
- memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_PROT16:
- op_bytes = ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- op_bytes = ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- op_bytes = 4;
- ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- /* Legacy prefixes. */
- for (i = 0; i < 8; i++) {
- switch (b = insn_fetch(u8, 1, _eip)) {
- case 0x66: /* operand-size override */
- op_bytes ^= 6; /* switch between 2/4 bytes */
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- ad_bytes ^= 12; /* switch between 4/8 bytes */
- else
- ad_bytes ^= 6; /* switch between 2/4 bytes */
- break;
- case 0x2e: /* CS override */
- override_base = &ctxt->cs_base;
- break;
- case 0x3e: /* DS override */
- override_base = &ctxt->ds_base;
- break;
- case 0x26: /* ES override */
- override_base = &ctxt->es_base;
- break;
- case 0x64: /* FS override */
- override_base = &ctxt->fs_base;
- break;
- case 0x65: /* GS override */
- override_base = &ctxt->gs_base;
- break;
- case 0x36: /* SS override */
- override_base = &ctxt->ss_base;
- break;
- case 0xf0: /* LOCK */
- lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- case 0xf3: /* REP/REPE/REPZ */
- rep_prefix = 1;
- break;
- default:
- goto done_prefixes;
- }
- }
-
-done_prefixes:
-
- /* REX prefix. */
- if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
- rex_prefix = b;
- if (b & 8)
- op_bytes = 8; /* REX.W */
- modrm_reg = (b & 4) << 1; /* REX.R */
- index_reg = (b & 2) << 2; /* REX.X */
- modrm_rm = base_reg = (b & 1) << 3; /* REG.B */
- b = insn_fetch(u8, 1, _eip);
- }
-
- /* Opcode byte(s). */
- d = opcode_table[b];
- if (d == 0) {
- /* Two-byte opcode? */
- if (b == 0x0f) {
- twobyte = 1;
- b = insn_fetch(u8, 1, _eip);
- d = twobyte_table[b];
- }
-
- /* Unrecognised? */
- if (d == 0)
- goto cannot_emulate;
- }
-
- /* ModRM and SIB bytes. */
- if (d & ModRM) {
- modrm = insn_fetch(u8, 1, _eip);
- modrm_mod |= (modrm & 0xc0) >> 6;
- modrm_reg |= (modrm & 0x38) >> 3;
- modrm_rm |= (modrm & 0x07);
- modrm_ea = 0;
- use_modrm_ea = 1;
-
- if (modrm_mod == 3) {
- modrm_val = *(unsigned long *)
- decode_register(modrm_rm, _regs, d & ByteOp);
- goto modrm_done;
- }
-
- if (ad_bytes == 2) {
- unsigned bx = _regs[VCPU_REGS_RBX];
- unsigned bp = _regs[VCPU_REGS_RBP];
- unsigned si = _regs[VCPU_REGS_RSI];
- unsigned di = _regs[VCPU_REGS_RDI];
-
- /* 16-bit ModR/M decode. */
- switch (modrm_mod) {
- case 0:
- if (modrm_rm == 6)
- modrm_ea += insn_fetch(u16, 2, _eip);
- break;
- case 1:
- modrm_ea += insn_fetch(s8, 1, _eip);
- break;
- case 2:
- modrm_ea += insn_fetch(u16, 2, _eip);
- break;
- }
- switch (modrm_rm) {
- case 0:
- modrm_ea += bx + si;
- break;
- case 1:
- modrm_ea += bx + di;
- break;
- case 2:
- modrm_ea += bp + si;
- break;
- case 3:
- modrm_ea += bp + di;
- break;
- case 4:
- modrm_ea += si;
- break;
- case 5:
- modrm_ea += di;
- break;
- case 6:
- if (modrm_mod != 0)
- modrm_ea += bp;
- break;
- case 7:
- modrm_ea += bx;
- break;
- }
- if (modrm_rm == 2 || modrm_rm == 3 ||
- (modrm_rm == 6 && modrm_mod != 0))
- if (!override_base)
- override_base = &ctxt->ss_base;
- modrm_ea = (u16)modrm_ea;
- } else {
- /* 32/64-bit ModR/M decode. */
- switch (modrm_rm) {
- case 4:
- case 12:
- sib = insn_fetch(u8, 1, _eip);
- index_reg |= (sib >> 3) & 7;
- base_reg |= sib & 7;
- scale = sib >> 6;
-
- switch (base_reg) {
- case 5:
- if (modrm_mod != 0)
- modrm_ea += _regs[base_reg];
- else
- modrm_ea += insn_fetch(s32, 4, _eip);
- break;
- default:
- modrm_ea += _regs[base_reg];
- }
- switch (index_reg) {
- case 4:
- break;
- default:
- modrm_ea += _regs[index_reg] << scale;
-
- }
- break;
- case 5:
- if (modrm_mod != 0)
- modrm_ea += _regs[modrm_rm];
- else if (mode == X86EMUL_MODE_PROT64)
- rip_relative = 1;
- break;
- default:
- modrm_ea += _regs[modrm_rm];
- break;
- }
- switch (modrm_mod) {
- case 0:
- if (modrm_rm == 5)
- modrm_ea += insn_fetch(s32, 4, _eip);
- break;
- case 1:
- modrm_ea += insn_fetch(s8, 1, _eip);
- break;
- case 2:
- modrm_ea += insn_fetch(s32, 4, _eip);
- break;
- }
- }
- if (!override_base)
- override_base = &ctxt->ds_base;
- if (mode == X86EMUL_MODE_PROT64 &&
- override_base != &ctxt->fs_base &&
- override_base != &ctxt->gs_base)
- override_base = NULL;
-
- if (override_base)
- modrm_ea += *override_base;
-
- if (rip_relative) {
- modrm_ea += _eip;
- switch (d & SrcMask) {
- case SrcImmByte:
- modrm_ea += 1;
- break;
- case SrcImm:
- if (d & ByteOp)
- modrm_ea += 1;
- else
- if (op_bytes == 8)
- modrm_ea += 4;
- else
- modrm_ea += op_bytes;
- }
- }
- if (ad_bytes != 8)
- modrm_ea = (u32)modrm_ea;
- cr2 = modrm_ea;
- modrm_done:
- ;
- }
-
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- switch (d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- src.type = OP_REG;
- if (d & ByteOp) {
- src.ptr = decode_register(modrm_reg, _regs,
- (rex_prefix == 0));
- src.val = src.orig_val = *(u8 *) src.ptr;
- src.bytes = 1;
- } else {
- src.ptr = decode_register(modrm_reg, _regs, 0);
- switch ((src.bytes = op_bytes)) {
- case 2:
- src.val = src.orig_val = *(u16 *) src.ptr;
- break;
- case 4:
- src.val = src.orig_val = *(u32 *) src.ptr;
- break;
- case 8:
- src.val = src.orig_val = *(u64 *) src.ptr;
- break;
- }
- }
- break;
- case SrcMem16:
- src.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- src.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
- /* Don't fetch the address for invlpg: it could be unmapped. */
- if (twobyte && b == 0x01 && modrm_reg == 7)
- break;
- srcmem_common:
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((d & ModRM) && modrm_mod == 3) {
- src.type = OP_REG;
- break;
- }
- src.type = OP_MEM;
- src.ptr = (unsigned long *)cr2;
- src.val = 0;
- if ((rc = ops->read_emulated((unsigned long)src.ptr,
- &src.val, src.bytes, ctxt->vcpu)) != 0)
- goto done;
- src.orig_val = src.val;
- break;
- case SrcImm:
- src.type = OP_IMM;
- src.ptr = (unsigned long *)_eip;
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
- if (src.bytes == 8)
- src.bytes = 4;
- /* NB. Immediates are sign-extended as necessary. */
- switch (src.bytes) {
- case 1:
- src.val = insn_fetch(s8, 1, _eip);
- break;
- case 2:
- src.val = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- src.val = insn_fetch(s32, 4, _eip);
- break;
- }
- break;
- case SrcImmByte:
- src.type = OP_IMM;
- src.ptr = (unsigned long *)_eip;
- src.bytes = 1;
- src.val = insn_fetch(s8, 1, _eip);
- break;
- }
-
- /* Decode and fetch the destination operand: register or memory. */
- switch (d & DstMask) {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- goto special_insn;
- case DstReg:
- dst.type = OP_REG;
- if ((d & ByteOp)
- && !(twobyte && (b == 0xb6 || b == 0xb7))) {
- dst.ptr = decode_register(modrm_reg, _regs,
- (rex_prefix == 0));
- dst.val = *(u8 *) dst.ptr;
- dst.bytes = 1;
- } else {
- dst.ptr = decode_register(modrm_reg, _regs, 0);
- switch ((dst.bytes = op_bytes)) {
- case 2:
- dst.val = *(u16 *)dst.ptr;
- break;
- case 4:
- dst.val = *(u32 *)dst.ptr;
- break;
- case 8:
- dst.val = *(u64 *)dst.ptr;
- break;
- }
- }
- break;
- case DstMem:
- dst.type = OP_MEM;
- dst.ptr = (unsigned long *)cr2;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.val = 0;
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((d & ModRM) && modrm_mod == 3) {
- dst.type = OP_REG;
- break;
- }
- if (d & BitOp) {
- unsigned long mask = ~(dst.bytes * 8 - 1);
-
- dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
- }
- if (!(d & Mov) && /* optimisation - avoid slow emulated read */
- ((rc = ops->read_emulated((unsigned long)dst.ptr,
- &dst.val, dst.bytes, ctxt->vcpu)) != 0))
- goto done;
- break;
- }
- dst.orig_val = dst.val;
-
- if (twobyte)
- goto twobyte_insn;
-
- switch (b) {
- case 0x00 ... 0x05:
- add: /* add */
- emulate_2op_SrcV("add", src, dst, _eflags);
- break;
- case 0x08 ... 0x0d:
- or: /* or */
- emulate_2op_SrcV("or", src, dst, _eflags);
- break;
- case 0x10 ... 0x15:
- adc: /* adc */
- emulate_2op_SrcV("adc", src, dst, _eflags);
- break;
- case 0x18 ... 0x1d:
- sbb: /* sbb */
- emulate_2op_SrcV("sbb", src, dst, _eflags);
- break;
- case 0x20 ... 0x23:
- and: /* and */
- emulate_2op_SrcV("and", src, dst, _eflags);
- break;
- case 0x24: /* and al imm8 */
- dst.type = OP_REG;
- dst.ptr = &_regs[VCPU_REGS_RAX];
- dst.val = *(u8 *)dst.ptr;
- dst.bytes = 1;
- dst.orig_val = dst.val;
- goto and;
- case 0x25: /* and ax imm16, or eax imm32 */
- dst.type = OP_REG;
- dst.bytes = op_bytes;
- dst.ptr = &_regs[VCPU_REGS_RAX];
- if (op_bytes == 2)
- dst.val = *(u16 *)dst.ptr;
- else
- dst.val = *(u32 *)dst.ptr;
- dst.orig_val = dst.val;
- goto and;
- case 0x28 ... 0x2d:
- sub: /* sub */
- emulate_2op_SrcV("sub", src, dst, _eflags);
- break;
- case 0x30 ... 0x35:
- xor: /* xor */
- emulate_2op_SrcV("xor", src, dst, _eflags);
- break;
- case 0x38 ... 0x3d:
- cmp: /* cmp */
- emulate_2op_SrcV("cmp", src, dst, _eflags);
- break;
- case 0x63: /* movsxd */
- if (mode != X86EMUL_MODE_PROT64)
- goto cannot_emulate;
- dst.val = (s32) src.val;
- break;
- case 0x80 ... 0x83: /* Grp1 */
- switch (modrm_reg) {
- case 0:
- goto add;
- case 1:
- goto or;
- case 2:
- goto adc;
- case 3:
- goto sbb;
- case 4:
- goto and;
- case 5:
- goto sub;
- case 6:
- goto xor;
- case 7:
- goto cmp;
- }
- break;
- case 0x84 ... 0x85:
- test: /* test */
- emulate_2op_SrcV("test", src, dst, _eflags);
- break;
- case 0x86 ... 0x87: /* xchg */
- /* Write back the register source. */
- switch (dst.bytes) {
- case 1:
- *(u8 *) src.ptr = (u8) dst.val;
- break;
- case 2:
- *(u16 *) src.ptr = (u16) dst.val;
- break;
- case 4:
- *src.ptr = (u32) dst.val;
- break; /* 64b reg: zero-extend */
- case 8:
- *src.ptr = dst.val;
- break;
- }
- /*
- * Write back the memory destination with implicit LOCK
- * prefix.
- */
- dst.val = src.val;
- lock_prefix = 1;
- break;
- case 0x88 ... 0x8b: /* mov */
- goto mov;
- case 0x8d: /* lea r16/r32, m */
- dst.val = modrm_val;
- break;
- case 0x8f: /* pop (sole member of Grp1a) */
- /* 64-bit mode: POP always pops a 64-bit operand. */
- if (mode == X86EMUL_MODE_PROT64)
- dst.bytes = 8;
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]),
- &dst.val, dst.bytes, ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
- break;
- case 0xa0 ... 0xa1: /* mov */
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
- dst.val = src.val;
- _eip += ad_bytes; /* skip src displacement */
- break;
- case 0xa2 ... 0xa3: /* mov */
- dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
- _eip += ad_bytes; /* skip dst displacement */
- break;
- case 0xc0 ... 0xc1:
- grp2: /* Grp2 */
- switch (modrm_reg) {
- case 0: /* rol */
- emulate_2op_SrcB("rol", src, dst, _eflags);
- break;
- case 1: /* ror */
- emulate_2op_SrcB("ror", src, dst, _eflags);
- break;
- case 2: /* rcl */
- emulate_2op_SrcB("rcl", src, dst, _eflags);
- break;
- case 3: /* rcr */
- emulate_2op_SrcB("rcr", src, dst, _eflags);
- break;
- case 4: /* sal/shl */
- case 6: /* sal/shl */
- emulate_2op_SrcB("sal", src, dst, _eflags);
- break;
- case 5: /* shr */
- emulate_2op_SrcB("shr", src, dst, _eflags);
- break;
- case 7: /* sar */
- emulate_2op_SrcB("sar", src, dst, _eflags);
- break;
- }
- break;
- case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
- mov:
- dst.val = src.val;
- break;
- case 0xd0 ... 0xd1: /* Grp2 */
- src.val = 1;
- goto grp2;
- case 0xd2 ... 0xd3: /* Grp2 */
- src.val = _regs[VCPU_REGS_RCX];
- goto grp2;
- case 0xf6 ... 0xf7: /* Grp3 */
- switch (modrm_reg) {
- case 0 ... 1: /* test */
- /*
- * Special case in Grp3: test has an immediate
- * source operand.
- */
- src.type = OP_IMM;
- src.ptr = (unsigned long *)_eip;
- src.bytes = (d & ByteOp) ? 1 : op_bytes;
- if (src.bytes == 8)
- src.bytes = 4;
- switch (src.bytes) {
- case 1:
- src.val = insn_fetch(s8, 1, _eip);
- break;
- case 2:
- src.val = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- src.val = insn_fetch(s32, 4, _eip);
- break;
- }
- goto test;
- case 2: /* not */
- dst.val = ~dst.val;
- break;
- case 3: /* neg */
- emulate_1op("neg", dst, _eflags);
- break;
- default:
- goto cannot_emulate;
- }
- break;
- case 0xfe ... 0xff: /* Grp4/Grp5 */
- switch (modrm_reg) {
- case 0: /* inc */
- emulate_1op("inc", dst, _eflags);
- break;
- case 1: /* dec */
- emulate_1op("dec", dst, _eflags);
- break;
- case 4: /* jmp abs */
- if (b == 0xff)
- _eip = dst.val;
- else
- goto cannot_emulate;
- break;
- case 6: /* push */
- /* 64-bit mode: PUSH always pushes a 64-bit operand. */
- if (mode == X86EMUL_MODE_PROT64) {
- dst.bytes = 8;
- if ((rc = ops->read_std((unsigned long)dst.ptr,
- &dst.val, 8,
- ctxt->vcpu)) != 0)
- goto done;
- }
- register_address_increment(_regs[VCPU_REGS_RSP],
- -dst.bytes);
- if ((rc = ops->write_emulated(
- register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]),
- &dst.val, dst.bytes, ctxt->vcpu)) != 0)
- goto done;
- no_wb = 1;
- break;
- default:
- goto cannot_emulate;
- }
- break;
- }
-
-writeback:
- if (!no_wb) {
- switch (dst.type) {
- case OP_REG:
- /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
- switch (dst.bytes) {
- case 1:
- *(u8 *)dst.ptr = (u8)dst.val;
- break;
- case 2:
- *(u16 *)dst.ptr = (u16)dst.val;
- break;
- case 4:
- *dst.ptr = (u32)dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *dst.ptr = dst.val;
- break;
- }
- break;
- case OP_MEM:
- if (lock_prefix)
- rc = ops->cmpxchg_emulated((unsigned long)dst.
- ptr, &dst.orig_val,
- &dst.val, dst.bytes,
- ctxt->vcpu);
- else
- rc = ops->write_emulated((unsigned long)dst.ptr,
- &dst.val, dst.bytes,
- ctxt->vcpu);
- if (rc != 0)
- goto done;
- default:
- break;
- }
- }
-
- /* Commit shadow register state. */
- memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
- ctxt->eflags = _eflags;
- ctxt->vcpu->rip = _eip;
-
-done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-
-special_insn:
- if (twobyte)
- goto twobyte_special_insn;
- switch(b) {
- case 0x50 ... 0x57: /* push reg */
- if (op_bytes == 2)
- src.val = (u16) _regs[b & 0x7];
- else
- src.val = (u32) _regs[b & 0x7];
- dst.type = OP_MEM;
- dst.bytes = op_bytes;
- dst.val = src.val;
- register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
- dst.ptr = (void *) register_address(
- ctxt->ss_base, _regs[VCPU_REGS_RSP]);
- break;
- case 0x58 ... 0x5f: /* pop reg */
- dst.ptr = (unsigned long *)&_regs[b & 0x7];
- pop_instruction:
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
- != 0)
- goto done;
-
- register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
- no_wb = 1; /* Disable writeback. */
- break;
- case 0x6a: /* push imm8 */
- src.val = 0L;
- src.val = insn_fetch(s8, 1, _eip);
- push:
- dst.type = OP_MEM;
- dst.bytes = op_bytes;
- dst.val = src.val;
- register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
- dst.ptr = (void *) register_address(ctxt->ss_base,
- _regs[VCPU_REGS_RSP]);
- break;
- case 0x6c: /* insb */
- case 0x6d: /* insw/insd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
- 1, /* in */
- (d & ByteOp) ? 1 : op_bytes, /* size */
- rep_prefix ?
- address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
- (_eflags & EFLG_DF), /* down */
- register_address(ctxt->es_base,
- _regs[VCPU_REGS_RDI]), /* address */
- rep_prefix,
- _regs[VCPU_REGS_RDX] /* port */
- ) == 0)
- return -1;
- return 0;
- case 0x6e: /* outsb */
- case 0x6f: /* outsw/outsd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
- 0, /* in */
- (d & ByteOp) ? 1 : op_bytes, /* size */
- rep_prefix ?
- address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
- (_eflags & EFLG_DF), /* down */
- register_address(override_base ?
- *override_base : ctxt->ds_base,
- _regs[VCPU_REGS_RSI]), /* address */
- rep_prefix,
- _regs[VCPU_REGS_RDX] /* port */
- ) == 0)
- return -1;
- return 0;
- case 0x70 ... 0x7f: /* jcc (short) */ {
- int rel = insn_fetch(s8, 1, _eip);
-
- if (test_cc(b, _eflags))
- JMP_REL(rel);
- break;
- }
- case 0x9c: /* pushf */
- src.val = (unsigned long) _eflags;
- goto push;
- case 0x9d: /* popf */
- dst.ptr = (unsigned long *) &_eflags;
- goto pop_instruction;
- case 0xc3: /* ret */
- dst.ptr = &_eip;
- goto pop_instruction;
- case 0xf4: /* hlt */
- ctxt->vcpu->halt_request = 1;
- goto done;
- }
- if (rep_prefix) {
- if (_regs[VCPU_REGS_RCX] == 0) {
- ctxt->vcpu->rip = _eip;
- goto done;
- }
- _regs[VCPU_REGS_RCX]--;
- _eip = ctxt->vcpu->rip;
- }
- switch (b) {
- case 0xa4 ... 0xa5: /* movs */
- dst.type = OP_MEM;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.ptr = (unsigned long *)register_address(ctxt->es_base,
- _regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(
- override_base ? *override_base : ctxt->ds_base,
- _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(_regs[VCPU_REGS_RSI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- register_address_increment(_regs[VCPU_REGS_RDI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- break;
- case 0xa6 ... 0xa7: /* cmps */
- DPRINTF("Urk! I don't handle CMPS.\n");
- goto cannot_emulate;
- case 0xaa ... 0xab: /* stos */
- dst.type = OP_MEM;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.ptr = (unsigned long *)cr2;
- dst.val = _regs[VCPU_REGS_RAX];
- register_address_increment(_regs[VCPU_REGS_RDI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- break;
- case 0xac ... 0xad: /* lods */
- dst.type = OP_REG;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
- ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(_regs[VCPU_REGS_RSI],
- (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
- break;
- case 0xae ... 0xaf: /* scas */
- DPRINTF("Urk! I don't handle SCAS.\n");
- goto cannot_emulate;
- case 0xe8: /* call (near) */ {
- long int rel;
- switch (op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, _eip);
- break;
- case 8:
- rel = insn_fetch(s64, 8, _eip);
- break;
- default:
- DPRINTF("Call: Invalid op_bytes\n");
- goto cannot_emulate;
- }
- src.val = (unsigned long) _eip;
- JMP_REL(rel);
- op_bytes = ad_bytes;
- goto push;
- }
- case 0xe9: /* jmp rel */
- case 0xeb: /* jmp rel short */
- JMP_REL(src.val);
- no_wb = 1; /* Disable writeback. */
- break;
-
-
- }
- goto writeback;
-
-twobyte_insn:
- switch (b) {
- case 0x01: /* lgdt, lidt, lmsw */
- /* Disable writeback. */
- no_wb = 1;
- switch (modrm_reg) {
- u16 size;
- unsigned long address;
-
- case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, src.ptr,
- &size, &address, op_bytes);
- if (rc)
- goto done;
- realmode_lgdt(ctxt->vcpu, size, address);
- break;
- case 3: /* lidt */
- rc = read_descriptor(ctxt, ops, src.ptr,
- &size, &address, op_bytes);
- if (rc)
- goto done;
- realmode_lidt(ctxt->vcpu, size, address);
- break;
- case 4: /* smsw */
- if (modrm_mod != 3)
- goto cannot_emulate;
- *(u16 *)&_regs[modrm_rm]
- = realmode_get_cr(ctxt->vcpu, 0);
- break;
- case 6: /* lmsw */
- if (modrm_mod != 3)
- goto cannot_emulate;
- realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
- break;
- case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, cr2);
- break;
- default:
- goto cannot_emulate;
- }
- break;
- case 0x21: /* mov from dr to reg */
- no_wb = 1;
- if (modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
- break;
- case 0x23: /* mov from reg to dr */
- no_wb = 1;
- if (modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
- break;
- case 0x40 ... 0x4f: /* cmov */
- dst.val = dst.orig_val = src.val;
- no_wb = 1;
- /*
- * First, assume we're decoding an even cmov opcode
- * (lsb == 0).
- */
- switch ((b & 15) >> 1) {
- case 0: /* cmovo */
- no_wb = (_eflags & EFLG_OF) ? 0 : 1;
- break;
- case 1: /* cmovb/cmovc/cmovnae */
- no_wb = (_eflags & EFLG_CF) ? 0 : 1;
- break;
- case 2: /* cmovz/cmove */
- no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
- break;
- case 3: /* cmovbe/cmovna */
- no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
- break;
- case 4: /* cmovs */
- no_wb = (_eflags & EFLG_SF) ? 0 : 1;
- break;
- case 5: /* cmovp/cmovpe */
- no_wb = (_eflags & EFLG_PF) ? 0 : 1;
- break;
- case 7: /* cmovle/cmovng */
- no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
- /* fall through */
- case 6: /* cmovl/cmovnge */
- no_wb &= (!(_eflags & EFLG_SF) !=
- !(_eflags & EFLG_OF)) ? 0 : 1;
- break;
- }
- /* Odd cmov opcodes (lsb == 1) have inverted sense. */
- no_wb ^= b & 1;
- break;
- case 0xa3:
- bt: /* bt */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
- break;
- case 0xab:
- bts: /* bts */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
- break;
- case 0xb0 ... 0xb1: /* cmpxchg */
- /*
- * Save real source value, then compare EAX against
- * destination.
- */
- src.orig_val = src.val;
- src.val = _regs[VCPU_REGS_RAX];
- emulate_2op_SrcV("cmp", src, dst, _eflags);
- if (_eflags & EFLG_ZF) {
- /* Success: write back to memory. */
- dst.val = src.orig_val;
- } else {
- /* Failure: write the value we saw to EAX. */
- dst.type = OP_REG;
- dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
- }
- break;
- case 0xb3:
- btr: /* btr */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
- break;
- case 0xb6 ... 0xb7: /* movzx */
- dst.bytes = op_bytes;
- dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
- break;
- case 0xba: /* Grp8 */
- switch (modrm_reg & 3) {
- case 0:
- goto bt;
- case 1:
- goto bts;
- case 2:
- goto btr;
- case 3:
- goto btc;
- }
- break;
- case 0xbb:
- btc: /* btc */
- src.val &= (dst.bytes << 3) - 1; /* only subword offset */
- emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
- break;
- case 0xbe ... 0xbf: /* movsx */
- dst.bytes = op_bytes;
- dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
- break;
- case 0xc3: /* movnti */
- dst.bytes = op_bytes;
- dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
- break;
- }
- goto writeback;
-
-twobyte_special_insn:
- /* Disable writeback. */
- no_wb = 1;
- switch (b) {
- case 0x06:
- emulate_clts(ctxt->vcpu);
- break;
- case 0x08: /* invd */
- break;
- case 0x09: /* wbinvd */
- break;
- case 0x0d: /* GrpP (prefetch) */
- case 0x18: /* Grp16 (prefetch/nop) */
- break;
- case 0x20: /* mov cr, reg */
- if (modrm_mod != 3)
- goto cannot_emulate;
- _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
- break;
- case 0x22: /* mov reg, cr */
- if (modrm_mod != 3)
- goto cannot_emulate;
- realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
- break;
- case 0x30:
- /* wrmsr */
- msr_data = (u32)_regs[VCPU_REGS_RAX]
- | ((u64)_regs[VCPU_REGS_RDX] << 32);
- rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
- if (rc) {
- kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
- }
- rc = X86EMUL_CONTINUE;
- break;
- case 0x32:
- /* rdmsr */
- rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
- if (rc) {
- kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
- } else {
- _regs[VCPU_REGS_RAX] = (u32)msr_data;
- _regs[VCPU_REGS_RDX] = msr_data >> 32;
- }
- rc = X86EMUL_CONTINUE;
- break;
- case 0x80 ... 0x8f: /* jnz rel, etc*/ {
- long int rel;
-
- switch (op_bytes) {
- case 2:
- rel = insn_fetch(s16, 2, _eip);
- break;
- case 4:
- rel = insn_fetch(s32, 4, _eip);
- break;
- case 8:
- rel = insn_fetch(s64, 8, _eip);
- break;
- default:
- DPRINTF("jnz: Invalid op_bytes\n");
- goto cannot_emulate;
- }
- if (test_cc(b, _eflags))
- JMP_REL(rel);
- break;
- }
- case 0xc7: /* Grp9 (cmpxchg8b) */
- {
- u64 old, new;
- if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
- != 0)
- goto done;
- if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
- _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
- _eflags &= ~EFLG_ZF;
- } else {
- new = ((u64)_regs[VCPU_REGS_RCX] << 32)
- | (u32) _regs[VCPU_REGS_RBX];
- if ((rc = ops->cmpxchg_emulated(cr2, &old,
- &new, 8, ctxt->vcpu)) != 0)
- goto done;
- _eflags |= EFLG_ZF;
- }
- break;
- }
- }
- goto writeback;
-
-cannot_emulate:
- DPRINTF("Cannot emulate %02x\n", b);
- return -1;
-}
-
-#ifdef __XEN__
-
-#include <asm/mm.h>
-#include <asm/uaccess.h>
-
-int
-x86_emulate_read_std(unsigned long addr,
- unsigned long *val,
- unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
- unsigned int rc;
-
- *val = 0;
-
- if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
- propagate_page_fault(addr + bytes - rc, 0); /* read fault */
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- return X86EMUL_CONTINUE;
-}
-
-int
-x86_emulate_write_std(unsigned long addr,
- unsigned long val,
- unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
- unsigned int rc;
-
- if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
- propagate_page_fault(addr + bytes - rc, PGERR_write_access);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- return X86EMUL_CONTINUE;
-}
-
-#endif
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
deleted file mode 100644
index 92c73aa7f9a..00000000000
--- a/drivers/kvm/x86_emulate.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/******************************************************************************
- * x86_emulate.h
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __X86_EMULATE_H__
-#define __X86_EMULATE_H__
-
-struct x86_emulate_ctxt;
-
-/*
- * x86_emulate_ops:
- *
- * These operations represent the instruction emulator's interface to memory.
- * There are two categories of operation: those that act on ordinary memory
- * regions (*_std), and those that act on memory regions known to require
- * special treatment or emulation (*_emulated).
- *
- * The emulator assumes that an instruction accesses only one 'emulated memory'
- * location, that this location is the given linear faulting address (cr2), and
- * that this is one of the instruction's data operands. Instruction fetches and
- * stack operations are assumed never to access emulated memory. The emulator
- * automatically deduces which operand of a string-move operation is accessing
- * emulated memory, and assumes that the other operand accesses normal memory.
- *
- * NOTES:
- * 1. The emulator isn't very smart about emulated vs. standard memory.
- * 'Emulated memory' access addresses should be checked for sanity.
- * 'Normal memory' accesses may fault, and the caller must arrange to
- * detect and handle reentrancy into the emulator via recursive faults.
- * Accesses may be unaligned and may cross page boundaries.
- * 2. If the access fails (cannot emulate, or a standard access faults) then
- * it is up to the memop to propagate the fault to the guest VM via
- * some out-of-band mechanism, unknown to the emulator. The memop signals
- * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
- * then immediately bail.
- * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
- * cmpxchg8b_emulated need support 8-byte accesses.
- * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
- */
-/* Access completed successfully: continue emulation as normal. */
-#define X86EMUL_CONTINUE 0
-/* Access is unhandleable: bail from emulation and return error to caller. */
-#define X86EMUL_UNHANDLEABLE 1
-/* Terminate emulation but return success to the caller. */
-#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
-struct x86_emulate_ops {
- /*
- * read_std: Read bytes of standard (non-emulated/special) memory.
- * Used for instruction fetch, stack operations, and others.
- * @addr: [IN ] Linear address from which to read.
- * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
- * @bytes: [IN ] Number of bytes to read from memory.
- */
- int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
-
- /*
- * write_std: Write bytes of standard (non-emulated/special) memory.
- * Used for stack operations, and others.
- * @addr: [IN ] Linear address to which to write.
- * @val: [IN ] Value to write to memory (low-order bytes used as
- * required).
- * @bytes: [IN ] Number of bytes to write to memory.
- */
- int (*write_std)(unsigned long addr, const void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
-
- /*
- * read_emulated: Read bytes from emulated/special memory area.
- * @addr: [IN ] Linear address from which to read.
- * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
- * @bytes: [IN ] Number of bytes to read from memory.
- */
- int (*read_emulated) (unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
- /*
- * write_emulated: Read bytes from emulated/special memory area.
- * @addr: [IN ] Linear address to which to write.
- * @val: [IN ] Value to write to memory (low-order bytes used as
- * required).
- * @bytes: [IN ] Number of bytes to write to memory.
- */
- int (*write_emulated) (unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
- /*
- * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
- * emulated/special memory area.
- * @addr: [IN ] Linear address to access.
- * @old: [IN ] Value expected to be current at @addr.
- * @new: [IN ] Value to write to @addr.
- * @bytes: [IN ] Number of bytes to access using CMPXCHG.
- */
- int (*cmpxchg_emulated) (unsigned long addr,
- const void *old,
- const void *new,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
-};
-
-struct x86_emulate_ctxt {
- /* Register state before/after emulation. */
- struct kvm_vcpu *vcpu;
-
- /* Linear faulting address (if emulating a page-faulting instruction). */
- unsigned long eflags;
- unsigned long cr2;
-
- /* Emulated execution mode, represented by an X86EMUL_MODE value. */
- int mode;
-
- unsigned long cs_base;
- unsigned long ds_base;
- unsigned long es_base;
- unsigned long ss_base;
- unsigned long gs_base;
- unsigned long fs_base;
-};
-
-/* Execution mode, passed to the emulator. */
-#define X86EMUL_MODE_REAL 0 /* Real mode. */
-#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
-#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
-#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
-
-/* Host execution mode. */
-#if defined(__i386__)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
-#elif defined(CONFIG_X86_64)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
-#endif
-
-/*
- * x86_emulate_memop: Emulate an instruction that faulted attempting to
- * read/write a 'special' memory area.
- * Returns -1 on failure, 0 on success.
- */
-int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
-
-#endif /* __X86_EMULATE_H__ */
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cb4c67025d5..7743d73768d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg,
/* This routine copies memory from the Guest. Here we can see how useful the
* kill_lguest() routine we met in the Launcher can be: we return a random
* value (all zeroes) instead of needing to return an error. */
-void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
+void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
{
- if (!lguest_address_ok(lg, addr, bytes)
- || copy_from_user(b, lg->mem_base + addr, bytes) != 0) {
+ if (!lguest_address_ok(cpu->lg, addr, bytes)
+ || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
/* copy_from_user should do this, but as we rely on it... */
memset(b, 0, bytes);
- kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
+ kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
}
}
/* This is the write (copy into guest) version. */
-void __lgwrite(struct lguest *lg, unsigned long addr, const void *b,
+void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
unsigned bytes)
{
- if (!lguest_address_ok(lg, addr, bytes)
- || copy_to_user(lg->mem_base + addr, b, bytes) != 0)
- kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
+ if (!lguest_address_ok(cpu->lg, addr, bytes)
+ || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
+ kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
}
/*:*/
/*H:030 Let's jump straight to the the main loop which runs the Guest.
* Remember, this is called by the Launcher reading /dev/lguest, and we keep
* going around and around until something interesting happens. */
-int run_guest(struct lguest *lg, unsigned long __user *user)
+int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
{
/* We stop running once the Guest is dead. */
- while (!lg->dead) {
+ while (!cpu->lg->dead) {
/* First we run any hypercalls the Guest wants done. */
- if (lg->hcall)
- do_hypercalls(lg);
+ if (cpu->hcall)
+ do_hypercalls(cpu);
/* It's possible the Guest did a NOTIFY hypercall to the
* Launcher, in which case we return from the read() now. */
- if (lg->pending_notify) {
- if (put_user(lg->pending_notify, user))
+ if (cpu->pending_notify) {
+ if (put_user(cpu->pending_notify, user))
return -EFAULT;
- return sizeof(lg->pending_notify);
+ return sizeof(cpu->pending_notify);
}
/* Check for signals */
@@ -195,13 +195,13 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
return -ERESTARTSYS;
/* If Waker set break_out, return to Launcher. */
- if (lg->break_out)
+ if (cpu->break_out)
return -EAGAIN;
/* Check if there are any interrupts which can be delivered
* now: if so, this sets up the hander to be executed when we
* next run the Guest. */
- maybe_do_interrupt(lg);
+ maybe_do_interrupt(cpu);
/* All long-lived kernel loops need to check with this horrible
* thing called the freezer. If the Host is trying to suspend,
@@ -210,12 +210,12 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
/* Just make absolutely sure the Guest is still alive. One of
* those hypercalls could have been fatal, for example. */
- if (lg->dead)
+ if (cpu->lg->dead)
break;
/* If the Guest asked to be stopped, we sleep. The Guest's
* clock timer or LHCALL_BREAK from the Waker will wake us. */
- if (lg->halted) {
+ if (cpu->halted) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
continue;
@@ -226,15 +226,17 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
local_irq_disable();
/* Actually run the Guest until something happens. */
- lguest_arch_run_guest(lg);
+ lguest_arch_run_guest(cpu);
/* Now we're ready to be interrupted or moved to other CPUs */
local_irq_enable();
/* Now we deal with whatever happened to the Guest. */
- lguest_arch_handle_trap(lg);
+ lguest_arch_handle_trap(cpu);
}
+ if (cpu->lg->dead == ERR_PTR(-ERESTART))
+ return -ERESTART;
/* The Guest is dead => "No such file or directory" */
return -ENOENT;
}
@@ -253,7 +255,7 @@ static int __init init(void)
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
if (paravirt_enabled()) {
- printk("lguest is afraid of %s\n", pv_info.name);
+ printk("lguest is afraid of being a guest\n");
return -EPERM;
}
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index b478affe8f9..0f2cb4fd7c6 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -23,13 +23,14 @@
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
+#include <linux/ktime.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "lg.h"
/*H:120 This is the core hypercall routine: where the Guest gets what it wants.
* Or gets killed. Or, in the case of LHCALL_CRASH, both. */
-static void do_hcall(struct lguest *lg, struct hcall_args *args)
+static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
{
switch (args->arg0) {
case LHCALL_FLUSH_ASYNC:
@@ -39,60 +40,62 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
case LHCALL_LGUEST_INIT:
/* You can't get here unless you're already initialized. Don't
* do that. */
- kill_guest(lg, "already have lguest_data");
+ kill_guest(cpu, "already have lguest_data");
break;
- case LHCALL_CRASH: {
- /* Crash is such a trivial hypercall that we do it in four
+ case LHCALL_SHUTDOWN: {
+ /* Shutdown is such a trivial hypercall that we do it in four
* lines right here. */
char msg[128];
/* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored. */
- __lgread(lg, msg, args->arg1, sizeof(msg));
+ __lgread(cpu, msg, args->arg1, sizeof(msg));
msg[sizeof(msg)-1] = '\0';
- kill_guest(lg, "CRASH: %s", msg);
+ kill_guest(cpu, "CRASH: %s", msg);
+ if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
+ cpu->lg->dead = ERR_PTR(-ERESTART);
break;
}
case LHCALL_FLUSH_TLB:
/* FLUSH_TLB comes in two flavors, depending on the
* argument: */
if (args->arg1)
- guest_pagetable_clear_all(lg);
+ guest_pagetable_clear_all(cpu);
else
- guest_pagetable_flush_user(lg);
+ guest_pagetable_flush_user(cpu);
break;
/* All these calls simply pass the arguments through to the right
* routines. */
case LHCALL_NEW_PGTABLE:
- guest_new_pagetable(lg, args->arg1);
+ guest_new_pagetable(cpu, args->arg1);
break;
case LHCALL_SET_STACK:
- guest_set_stack(lg, args->arg1, args->arg2, args->arg3);
+ guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_SET_PTE:
- guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3));
+ guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
break;
case LHCALL_SET_PMD:
- guest_set_pmd(lg, args->arg1, args->arg2);
+ guest_set_pmd(cpu->lg, args->arg1, args->arg2);
break;
case LHCALL_SET_CLOCKEVENT:
- guest_set_clockevent(lg, args->arg1);
+ guest_set_clockevent(cpu, args->arg1);
break;
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
- lg->ts = args->arg1;
+ cpu->ts = args->arg1;
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
- lg->halted = 1;
+ cpu->halted = 1;
break;
case LHCALL_NOTIFY:
- lg->pending_notify = args->arg1;
+ cpu->pending_notify = args->arg1;
break;
default:
/* It should be an architecture-specific hypercall. */
- if (lguest_arch_do_hcall(lg, args))
- kill_guest(lg, "Bad hypercall %li\n", args->arg0);
+ if (lguest_arch_do_hcall(cpu, args))
+ kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
}
}
/*:*/
@@ -104,13 +107,13 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
* Guest put them in the ring, but we also promise the Guest that they will
* happen before any normal hypercall (which is why we check this before
* checking for a normal hcall). */
-static void do_async_hcalls(struct lguest *lg)
+static void do_async_hcalls(struct lg_cpu *cpu)
{
unsigned int i;
u8 st[LHCALL_RING_SIZE];
/* For simplicity, we copy the entire call status array in at once. */
- if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
+ if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
return;
/* We process "struct lguest_data"s hcalls[] ring once. */
@@ -119,7 +122,7 @@ static void do_async_hcalls(struct lguest *lg)
/* We remember where we were up to from last time. This makes
* sure that the hypercalls are done in the order the Guest
* places them in the ring. */
- unsigned int n = lg->next_hcall;
+ unsigned int n = cpu->next_hcall;
/* 0xFF means there's no call here (yet). */
if (st[n] == 0xFF)
@@ -127,65 +130,65 @@ static void do_async_hcalls(struct lguest *lg)
/* OK, we have hypercall. Increment the "next_hcall" cursor,
* and wrap back to 0 if we reach the end. */
- if (++lg->next_hcall == LHCALL_RING_SIZE)
- lg->next_hcall = 0;
+ if (++cpu->next_hcall == LHCALL_RING_SIZE)
+ cpu->next_hcall = 0;
/* Copy the hypercall arguments into a local copy of
* the hcall_args struct. */
- if (copy_from_user(&args, &lg->lguest_data->hcalls[n],
+ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
sizeof(struct hcall_args))) {
- kill_guest(lg, "Fetching async hypercalls");
+ kill_guest(cpu, "Fetching async hypercalls");
break;
}
/* Do the hypercall, same as a normal one. */
- do_hcall(lg, &args);
+ do_hcall(cpu, &args);
/* Mark the hypercall done. */
- if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
- kill_guest(lg, "Writing result for async hypercall");
+ if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
+ kill_guest(cpu, "Writing result for async hypercall");
break;
}
/* Stop doing hypercalls if they want to notify the Launcher:
* it needs to service this first. */
- if (lg->pending_notify)
+ if (cpu->pending_notify)
break;
}
}
/* Last of all, we look at what happens first of all. The very first time the
* Guest makes a hypercall, we end up here to set things up: */
-static void initialize(struct lguest *lg)
+static void initialize(struct lg_cpu *cpu)
{
/* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here. */
- if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
- kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0);
+ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
+ kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
return;
}
- if (lguest_arch_init_hypercalls(lg))
- kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ if (lguest_arch_init_hypercalls(cpu))
+ kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data". */
- if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
- || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
- kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start)
+ || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end))
+ kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/* We write the current time into the Guest's data page once so it can
* set its clock. */
- write_timestamp(lg);
+ write_timestamp(cpu);
/* page_tables.c will also do some setup. */
- page_table_guest_data_init(lg);
+ page_table_guest_data_init(cpu);
/* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the old page might be (read-only) in the Guest
* pagetable. */
- guest_pagetable_clear_all(lg);
+ guest_pagetable_clear_all(cpu);
}
/*H:100
@@ -194,27 +197,27 @@ static void initialize(struct lguest *lg)
* Remember from the Guest, hypercalls come in two flavors: normal and
* asynchronous. This file handles both of types.
*/
-void do_hypercalls(struct lguest *lg)
+void do_hypercalls(struct lg_cpu *cpu)
{
/* Not initialized yet? This hypercall must do it. */
- if (unlikely(!lg->lguest_data)) {
+ if (unlikely(!cpu->lg->lguest_data)) {
/* Set up the "struct lguest_data" */
- initialize(lg);
+ initialize(cpu);
/* Hcall is done. */
- lg->hcall = NULL;
+ cpu->hcall = NULL;
return;
}
/* The Guest has initialized.
*
* Look in the hypercall ring for the async hypercalls: */
- do_async_hcalls(lg);
+ do_async_hcalls(cpu);
/* If we stopped reading the hypercall ring because the Guest did a
* NOTIFY to the Launcher, we want to return now. Otherwise we do
* the hypercall. */
- if (!lg->pending_notify) {
- do_hcall(lg, lg->hcall);
+ if (!cpu->pending_notify) {
+ do_hcall(cpu, cpu->hcall);
/* Tricky point: we reset the hcall pointer to mark the
* hypercall as "done". We use the hcall pointer rather than
* the trap number to indicate a hypercall is pending.
@@ -225,16 +228,17 @@ void do_hypercalls(struct lguest *lg)
* Launcher, the run_guest() loop will exit without running the
* Guest. When it comes back it would try to re-run the
* hypercall. */
- lg->hcall = NULL;
+ cpu->hcall = NULL;
}
}
/* This routine supplies the Guest with time: it's used for wallclock time at
* initial boot and as a rough time source if the TSC isn't available. */
-void write_timestamp(struct lguest *lg)
+void write_timestamp(struct lg_cpu *cpu)
{
struct timespec now;
ktime_get_real_ts(&now);
- if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec)))
- kill_guest(lg, "Writing timestamp");
+ if (copy_to_user(&cpu->lg->lguest_data->time,
+ &now, sizeof(struct timespec)))
+ kill_guest(cpu, "Writing timestamp");
}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 2b66f79c208..32e97c1858e 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi)
/* We need a helper to "push" a value onto the Guest's stack, since that's a
* big part of what delivering an interrupt does. */
-static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
+static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
{
/* Stack grows upwards: move stack then write value. */
*gstack -= 4;
- lgwrite(lg, *gstack, u32, val);
+ lgwrite(cpu, *gstack, u32, val);
}
/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
@@ -60,7 +60,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
* We set up the stack just like the CPU does for a real interrupt, so it's
* identical for the Guest (and the standard "iret" instruction will undo
* it). */
-static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
+static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err)
{
unsigned long gstack, origstack;
u32 eflags, ss, irq_enable;
@@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
/* There are two cases for interrupts: one where the Guest is already
* in the kernel, and a more complex one where the Guest is in
* userspace. We check the privilege level to find out. */
- if ((lg->regs->ss&0x3) != GUEST_PL) {
+ if ((cpu->regs->ss&0x3) != GUEST_PL) {
/* The Guest told us their kernel stack with the SET_STACK
* hypercall: both the virtual address and the segment */
- virtstack = lg->esp1;
- ss = lg->ss1;
+ virtstack = cpu->esp1;
+ ss = cpu->ss1;
- origstack = gstack = guest_pa(lg, virtstack);
+ origstack = gstack = guest_pa(cpu, virtstack);
/* We push the old stack segment and pointer onto the new
* stack: when the Guest does an "iret" back from the interrupt
* handler the CPU will notice they're dropping privilege
* levels and expect these here. */
- push_guest_stack(lg, &gstack, lg->regs->ss);
- push_guest_stack(lg, &gstack, lg->regs->esp);
+ push_guest_stack(cpu, &gstack, cpu->regs->ss);
+ push_guest_stack(cpu, &gstack, cpu->regs->esp);
} else {
/* We're staying on the same Guest (kernel) stack. */
- virtstack = lg->regs->esp;
- ss = lg->regs->ss;
+ virtstack = cpu->regs->esp;
+ ss = cpu->regs->ss;
- origstack = gstack = guest_pa(lg, virtstack);
+ origstack = gstack = guest_pa(cpu, virtstack);
}
/* Remember that we never let the Guest actually disable interrupts, so
* the "Interrupt Flag" bit is always set. We copy that bit from the
* Guest's "irq_enabled" field into the eflags word: we saw the Guest
* copy it back in "lguest_iret". */
- eflags = lg->regs->eflags;
- if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
+ eflags = cpu->regs->eflags;
+ if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
&& !(irq_enable & X86_EFLAGS_IF))
eflags &= ~X86_EFLAGS_IF;
/* An interrupt is expected to push three things on the stack: the old
* "eflags" word, the old code segment, and the old instruction
* pointer. */
- push_guest_stack(lg, &gstack, eflags);
- push_guest_stack(lg, &gstack, lg->regs->cs);
- push_guest_stack(lg, &gstack, lg->regs->eip);
+ push_guest_stack(cpu, &gstack, eflags);
+ push_guest_stack(cpu, &gstack, cpu->regs->cs);
+ push_guest_stack(cpu, &gstack, cpu->regs->eip);
/* For the six traps which supply an error code, we push that, too. */
if (has_err)
- push_guest_stack(lg, &gstack, lg->regs->errcode);
+ push_guest_stack(cpu, &gstack, cpu->regs->errcode);
/* Now we've pushed all the old state, we change the stack, the code
* segment and the address to execute. */
- lg->regs->ss = ss;
- lg->regs->esp = virtstack + (gstack - origstack);
- lg->regs->cs = (__KERNEL_CS|GUEST_PL);
- lg->regs->eip = idt_address(lo, hi);
+ cpu->regs->ss = ss;
+ cpu->regs->esp = virtstack + (gstack - origstack);
+ cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
+ cpu->regs->eip = idt_address(lo, hi);
/* There are two kinds of interrupt handlers: 0xE is an "interrupt
* gate" which expects interrupts to be disabled on entry. */
if (idt_type(lo, hi) == 0xE)
- if (put_user(0, &lg->lguest_data->irq_enabled))
- kill_guest(lg, "Disabling interrupts");
+ if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
+ kill_guest(cpu, "Disabling interrupts");
}
/*H:205
@@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
*
* maybe_do_interrupt() gets called before every entry to the Guest, to see if
* we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lguest *lg)
+void maybe_do_interrupt(struct lg_cpu *cpu)
{
unsigned int irq;
DECLARE_BITMAP(blk, LGUEST_IRQS);
struct desc_struct *idt;
/* If the Guest hasn't even initialized yet, we can do nothing. */
- if (!lg->lguest_data)
+ if (!cpu->lg->lguest_data)
return;
/* Take our "irqs_pending" array and remove any interrupts the Guest
* wants blocked: the result ends up in "blk". */
- if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
+ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
sizeof(blk)))
return;
- bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
+ bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
/* Find the first interrupt. */
irq = find_first_bit(blk, LGUEST_IRQS);
@@ -155,19 +155,20 @@ void maybe_do_interrupt(struct lguest *lg)
/* They may be in the middle of an iret, where they asked us never to
* deliver interrupts. */
- if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
+ if (cpu->regs->eip >= cpu->lg->noirq_start &&
+ (cpu->regs->eip < cpu->lg->noirq_end))
return;
/* If they're halted, interrupts restart them. */
- if (lg->halted) {
+ if (cpu->halted) {
/* Re-enable interrupts. */
- if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
- kill_guest(lg, "Re-enabling interrupts");
- lg->halted = 0;
+ if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
+ kill_guest(cpu, "Re-enabling interrupts");
+ cpu->halted = 0;
} else {
/* Otherwise we check if they have interrupts disabled. */
u32 irq_enabled;
- if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
+ if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
irq_enabled = 0;
if (!irq_enabled)
return;
@@ -176,15 +177,15 @@ void maybe_do_interrupt(struct lguest *lg)
/* Look at the IDT entry the Guest gave us for this interrupt. The
* first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
* over them. */
- idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
+ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
/* If they don't have a handler (yet?), we just ignore it */
if (idt_present(idt->a, idt->b)) {
/* OK, mark it no longer pending and deliver it. */
- clear_bit(irq, lg->irqs_pending);
+ clear_bit(irq, cpu->irqs_pending);
/* set_guest_interrupt() takes the interrupt descriptor and a
* flag to say whether this interrupt pushes an error code onto
* the stack as well: virtual interrupts never do. */
- set_guest_interrupt(lg, idt->a, idt->b, 0);
+ set_guest_interrupt(cpu, idt->a, idt->b, 0);
}
/* Every time we deliver an interrupt, we update the timestamp in the
@@ -192,7 +193,7 @@ void maybe_do_interrupt(struct lguest *lg)
* did this more often, but it can actually be quite slow: doing it
* here is a compromise which means at least it gets updated every
* timer interrupt. */
- write_timestamp(lg);
+ write_timestamp(cpu);
}
/*:*/
@@ -245,19 +246,19 @@ static int has_err(unsigned int trap)
}
/* deliver_trap() returns true if it could deliver the trap. */
-int deliver_trap(struct lguest *lg, unsigned int num)
+int deliver_trap(struct lg_cpu *cpu, unsigned int num)
{
/* Trap numbers are always 8 bit, but we set an impossible trap number
* for traps inside the Switcher, so check that here. */
- if (num >= ARRAY_SIZE(lg->arch.idt))
+ if (num >= ARRAY_SIZE(cpu->arch.idt))
return 0;
/* Early on the Guest hasn't set the IDT entries (or maybe it put a
* bogus one in): if we fail here, the Guest will be killed. */
- if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
+ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
return 0;
- set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,
- has_err(num));
+ set_guest_interrupt(cpu, cpu->arch.idt[num].a,
+ cpu->arch.idt[num].b, has_err(num));
return 1;
}
@@ -309,18 +310,18 @@ static int direct_trap(unsigned int num)
* the Guest.
*
* Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */
-void pin_stack_pages(struct lguest *lg)
+void pin_stack_pages(struct lg_cpu *cpu)
{
unsigned int i;
/* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
* two pages of stack space. */
- for (i = 0; i < lg->stack_pages; i++)
+ for (i = 0; i < cpu->lg->stack_pages; i++)
/* The stack grows *upwards*, so the address we're given is the
* start of the page after the kernel stack. Subtract one to
* get back onto the first stack page, and keep subtracting to
* get to the rest of the stack pages. */
- pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE);
+ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
}
/* Direct traps also mean that we need to know whenever the Guest wants to use
@@ -331,21 +332,21 @@ void pin_stack_pages(struct lguest *lg)
*
* In Linux each process has its own kernel stack, so this happens a lot: we
* change stacks on each context switch. */
-void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
+void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
{
/* You are not allowed have a stack segment with privilege level 0: bad
* Guest! */
if ((seg & 0x3) != GUEST_PL)
- kill_guest(lg, "bad stack segment %i", seg);
+ kill_guest(cpu, "bad stack segment %i", seg);
/* We only expect one or two stack pages. */
if (pages > 2)
- kill_guest(lg, "bad stack pages %u", pages);
+ kill_guest(cpu, "bad stack pages %u", pages);
/* Save where the stack is, and how many pages */
- lg->ss1 = seg;
- lg->esp1 = esp;
- lg->stack_pages = pages;
+ cpu->ss1 = seg;
+ cpu->esp1 = esp;
+ cpu->lg->stack_pages = pages;
/* Make sure the new stack pages are mapped */
- pin_stack_pages(lg);
+ pin_stack_pages(cpu);
}
/* All this reference to mapping stacks leads us neatly into the other complex
@@ -353,7 +354,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
/*H:235 This is the routine which actually checks the Guest's IDT entry and
* transfers it into the entry in "struct lguest": */
-static void set_trap(struct lguest *lg, struct desc_struct *trap,
+static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
unsigned int num, u32 lo, u32 hi)
{
u8 type = idt_type(lo, hi);
@@ -366,7 +367,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
/* We only support interrupt and trap gates. */
if (type != 0xE && type != 0xF)
- kill_guest(lg, "bad IDT type %i", type);
+ kill_guest(cpu, "bad IDT type %i", type);
/* We only copy the handler address, present bit, privilege level and
* type. The privilege level controls where the trap can be triggered
@@ -383,7 +384,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
*
* We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
* LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
-void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
{
/* Guest never handles: NMI, doublefault, spurious interrupt or
* hypercall. We ignore when it tries to set them. */
@@ -392,13 +393,13 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
/* Mark the IDT as changed: next time the Guest runs we'll know we have
* to copy this again. */
- lg->changed |= CHANGED_IDT;
+ cpu->changed |= CHANGED_IDT;
/* Check that the Guest doesn't try to step outside the bounds. */
- if (num >= ARRAY_SIZE(lg->arch.idt))
- kill_guest(lg, "Setting idt entry %u", num);
+ if (num >= ARRAY_SIZE(cpu->arch.idt))
+ kill_guest(cpu, "Setting idt entry %u", num);
else
- set_trap(lg, &lg->arch.idt[num], num, lo, hi);
+ set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
}
/* The default entry for each interrupt points into the Switcher routines which
@@ -434,14 +435,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
* we copy them into the IDT which we've set up for Guests on this CPU, just
* before we run the Guest. This routine does that copy. */
-void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def)
{
unsigned int i;
/* We can simply copy the direct traps, otherwise we use the default
* ones in the Switcher: they will return to the Host. */
- for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
/* If no Guest can ever override this trap, leave it alone. */
if (!direct_trap(i))
continue;
@@ -450,8 +451,8 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
* Interrupt gates (type 14) disable interrupts as they are
* entered, which we never let the Guest do. Not present
* entries (type 0x0) also can't go direct, of course. */
- if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF)
- idt[i] = lg->arch.idt[i];
+ if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF)
+ idt[i] = cpu->arch.idt[i];
else
/* Reset it to the default. */
default_idt_entry(&idt[i], i, def[i]);
@@ -470,13 +471,13 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
* infrastructure to set a callback at that time.
*
* 0 means "turn off the clock". */
-void guest_set_clockevent(struct lguest *lg, unsigned long delta)
+void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
{
ktime_t expires;
if (unlikely(delta == 0)) {
/* Clock event device is shutting down. */
- hrtimer_cancel(&lg->hrt);
+ hrtimer_cancel(&cpu->hrt);
return;
}
@@ -484,25 +485,25 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta)
* all the time between now and the timer interrupt it asked for. This
* is almost always the right thing to do. */
expires = ktime_add_ns(ktime_get_real(), delta);
- hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
+ hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
}
/* This is the function called when the Guest's timer expires. */
static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
{
- struct lguest *lg = container_of(timer, struct lguest, hrt);
+ struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
/* Remember the first interrupt is the timer interrupt. */
- set_bit(0, lg->irqs_pending);
+ set_bit(0, cpu->irqs_pending);
/* If the Guest is actually stopped, we need to wake it up. */
- if (lg->halted)
- wake_up_process(lg->tsk);
+ if (cpu->halted)
+ wake_up_process(cpu->tsk);
return HRTIMER_NORESTART;
}
/* This sets up the timer for this Guest. */
-void init_clockdev(struct lguest *lg)
+void init_clockdev(struct lg_cpu *cpu)
{
- hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- lg->hrt.function = clockdev_fn;
+ hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ cpu->hrt.function = clockdev_fn;
}
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 86924891b5e..2337e1a06f0 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -8,6 +8,7 @@
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/wait.h>
+#include <linux/hrtimer.h>
#include <linux/err.h>
#include <asm/semaphore.h>
@@ -38,58 +39,72 @@ struct lguest_pages
#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
#define CHANGED_ALL 3
-/* The private info the thread maintains about the guest. */
-struct lguest
-{
- /* At end of a page shared mapped over lguest_pages in guest. */
- unsigned long regs_page;
- struct lguest_regs *regs;
- struct lguest_data __user *lguest_data;
+struct lguest;
+
+struct lg_cpu {
+ unsigned int id;
+ struct lguest *lg;
struct task_struct *tsk;
struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
- u32 pfn_limit;
- /* This provides the offset to the base of guest-physical
- * memory in the Launcher. */
- void __user *mem_base;
- unsigned long kernel_address;
+
u32 cr2;
- int halted;
int ts;
- u32 next_hcall;
u32 esp1;
u8 ss1;
+ /* Bitmap of what has changed: see CHANGED_* above. */
+ int changed;
+
+ unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+
+ /* At end of a page shared mapped over lguest_pages in guest. */
+ unsigned long regs_page;
+ struct lguest_regs *regs;
+
+ struct lguest_pages *last_pages;
+
+ int cpu_pgd; /* which pgd this cpu is currently using */
+
/* If a hypercall was asked for, this points to the arguments. */
struct hcall_args *hcall;
+ u32 next_hcall;
+
+ /* Virtual clock device */
+ struct hrtimer hrt;
/* Do we need to stop what we're doing and return to userspace? */
int break_out;
wait_queue_head_t break_wq;
+ int halted;
- /* Bitmap of what has changed: see CHANGED_* above. */
- int changed;
- struct lguest_pages *last_pages;
+ /* Pending virtual interrupts */
+ DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+
+ struct lg_cpu_arch arch;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+ struct lguest_data __user *lguest_data;
+ struct lg_cpu cpus[NR_CPUS];
+ unsigned int nr_cpus;
+
+ u32 pfn_limit;
+ /* This provides the offset to the base of guest-physical
+ * memory in the Launcher. */
+ void __user *mem_base;
+ unsigned long kernel_address;
- /* We keep a small number of these. */
- u32 pgdidx;
struct pgdir pgdirs[4];
unsigned long noirq_start, noirq_end;
- unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
unsigned int stack_pages;
u32 tsc_khz;
/* Dead? */
const char *dead;
-
- struct lguest_arch arch;
-
- /* Virtual clock device */
- struct hrtimer hrt;
-
- /* Pending virtual interrupts */
- DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
};
extern struct mutex lguest_lock;
@@ -97,26 +112,26 @@ extern struct mutex lguest_lock;
/* core.c: */
int lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
-void __lgread(struct lguest *, void *, unsigned long, unsigned);
-void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
+void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
+void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
/*H:035 Using memory-copy operations like that is usually inconvient, so we
* have the following helper macros which read and write a specific type (often
* an unsigned long).
*
* This reads into a variable of the given type then returns that. */
-#define lgread(lg, addr, type) \
- ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; })
+#define lgread(cpu, addr, type) \
+ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(lg, addr, type, val) \
+#define lgwrite(cpu, addr, type, val) \
do { \
typecheck(type, val); \
- __lgwrite((lg), (addr), &(val), sizeof(val)); \
+ __lgwrite((cpu), (addr), &(val), sizeof(val)); \
} while(0)
/* (end of memory access helper routines) :*/
-int run_guest(struct lguest *lg, unsigned long __user *user);
+int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
/* Helper macros to obtain the first 12 or the last 20 bits, this is only the
* first step in the migration to the kernel types. pte_pfn is already defined
@@ -126,52 +141,53 @@ int run_guest(struct lguest *lg, unsigned long __user *user);
#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
/* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lguest *lg);
-int deliver_trap(struct lguest *lg, unsigned int num);
-void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
-void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lguest *lg);
+void maybe_do_interrupt(struct lg_cpu *cpu);
+int deliver_trap(struct lg_cpu *cpu, unsigned int num);
+void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
+ u32 low, u32 hi);
+void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
+void pin_stack_pages(struct lg_cpu *cpu);
void setup_default_idt_entries(struct lguest_ro_state *state,
const unsigned long *def);
-void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
const unsigned long *def);
-void guest_set_clockevent(struct lguest *lg, unsigned long delta);
-void init_clockdev(struct lguest *lg);
+void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
int init_interrupts(void);
void free_interrupts(void);
/* segments.c: */
void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lguest *lg);
-void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num);
-void guest_load_tls(struct lguest *lg, unsigned long tls_array);
-void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
+void setup_guest_gdt(struct lg_cpu *cpu);
+void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num);
+void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
+void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
+void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
/* page_tables.c: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-void guest_pagetable_clear_all(struct lguest *lg);
-void guest_pagetable_flush_user(struct lguest *lg);
-void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
+void guest_pagetable_clear_all(struct lg_cpu *cpu);
+void guest_pagetable_flush_user(struct lg_cpu *cpu);
+void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
-int demand_page(struct lguest *info, unsigned long cr2, int errcode);
-void pin_page(struct lguest *lg, unsigned long vaddr);
-unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
-void page_table_guest_data_init(struct lguest *lg);
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
+int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
+void page_table_guest_data_init(struct lg_cpu *cpu);
/* <arch>/core.c: */
void lguest_arch_host_init(void);
void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lguest *lg);
-void lguest_arch_handle_trap(struct lguest *lg);
-int lguest_arch_init_hypercalls(struct lguest *lg);
-int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lguest *lg, unsigned long start);
+void lguest_arch_run_guest(struct lg_cpu *cpu);
+void lguest_arch_handle_trap(struct lg_cpu *cpu);
+int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
+int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
+void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
/* <arch>/switcher.S: */
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
@@ -181,8 +197,8 @@ int lguest_device_init(void);
void lguest_device_remove(void);
/* hypercalls.c: */
-void do_hypercalls(struct lguest *lg);
-void write_timestamp(struct lguest *lg);
+void do_hypercalls(struct lg_cpu *cpu);
+void write_timestamp(struct lg_cpu *cpu);
/*L:035
* Let's step aside for the moment, to study one important routine that's used
@@ -208,12 +224,12 @@ void write_timestamp(struct lguest *lg);
* Like any macro which uses an "if", it is safely wrapped in a run-once "do {
* } while(0)".
*/
-#define kill_guest(lg, fmt...) \
+#define kill_guest(cpu, fmt...) \
do { \
- if (!(lg)->dead) { \
- (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \
- if (!(lg)->dead) \
- (lg)->dead = ERR_PTR(-ENOMEM); \
+ if (!(cpu)->lg->dead) { \
+ (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
+ if (!(cpu)->lg->dead) \
+ (cpu)->lg->dead = ERR_PTR(-ENOMEM); \
} \
} while(0)
/* (End of aside) :*/
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 3b92a61ba8d..85d42d3d01a 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -6,6 +6,7 @@
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
+#include <linux/sched.h>
#include "lg.h"
/*L:055 When something happens, the Waker process needs a way to stop the
@@ -13,7 +14,7 @@
* LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
* has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
* the Waker. */
-static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
+static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
{
unsigned long on;
@@ -22,21 +23,21 @@ static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
return -EFAULT;
if (on) {
- lg->break_out = 1;
+ cpu->break_out = 1;
/* Pop it out of the Guest (may be running on different CPU) */
- wake_up_process(lg->tsk);
+ wake_up_process(cpu->tsk);
/* Wait for them to reset it */
- return wait_event_interruptible(lg->break_wq, !lg->break_out);
+ return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
} else {
- lg->break_out = 0;
- wake_up(&lg->break_wq);
+ cpu->break_out = 0;
+ wake_up(&cpu->break_wq);
return 0;
}
}
/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
* number to /dev/lguest. */
-static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
+static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
{
unsigned long irq;
@@ -46,7 +47,7 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
return -EINVAL;
/* Next time the Guest runs, the core code will see if it can deliver
* this interrupt. */
- set_bit(irq, lg->irqs_pending);
+ set_bit(irq, cpu->irqs_pending);
return 0;
}
@@ -55,13 +56,21 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
{
struct lguest *lg = file->private_data;
+ struct lg_cpu *cpu;
+ unsigned int cpu_id = *o;
/* You must write LHREQ_INITIALIZE first! */
if (!lg)
return -EINVAL;
+ /* Watch out for arbitrary vcpu indexes! */
+ if (cpu_id >= lg->nr_cpus)
+ return -EINVAL;
+
+ cpu = &lg->cpus[cpu_id];
+
/* If you're not the task which owns the Guest, go away. */
- if (current != lg->tsk)
+ if (current != cpu->tsk)
return -EPERM;
/* If the guest is already dead, we indicate why */
@@ -81,11 +90,53 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
/* If we returned from read() last time because the Guest notified,
* clear the flag. */
- if (lg->pending_notify)
- lg->pending_notify = 0;
+ if (cpu->pending_notify)
+ cpu->pending_notify = 0;
/* Run the Guest until something interesting happens. */
- return run_guest(lg, (unsigned long __user *)user);
+ return run_guest(cpu, (unsigned long __user *)user);
+}
+
+static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
+{
+ if (id >= NR_CPUS)
+ return -EINVAL;
+
+ cpu->id = id;
+ cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+ cpu->lg->nr_cpus++;
+ init_clockdev(cpu);
+
+ /* We need a complete page for the Guest registers: they are accessible
+ * to the Guest and we can only grant it access to whole pages. */
+ cpu->regs_page = get_zeroed_page(GFP_KERNEL);
+ if (!cpu->regs_page)
+ return -ENOMEM;
+
+ /* We actually put the registers at the bottom of the page. */
+ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
+
+ /* Now we initialize the Guest's registers, handing it the start
+ * address. */
+ lguest_arch_setup_regs(cpu, start_ip);
+
+ /* Initialize the queue for the waker to wait on */
+ init_waitqueue_head(&cpu->break_wq);
+
+ /* We keep a pointer to the Launcher task (ie. current task) for when
+ * other Guests want to wake this one (inter-Guest I/O). */
+ cpu->tsk = current;
+
+ /* We need to keep a pointer to the Launcher's memory map, because if
+ * the Launcher dies we need to clean it up. If we don't keep a
+ * reference, it is destroyed before close() is called. */
+ cpu->mm = get_task_mm(cpu->tsk);
+
+ /* We remember which CPU's pages this Guest used last, for optimization
+ * when the same Guest runs on the same CPU twice. */
+ cpu->last_pages = NULL;
+
+ return 0;
}
/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
@@ -134,15 +185,10 @@ static int initialize(struct file *file, const unsigned long __user *input)
lg->mem_base = (void __user *)(long)args[0];
lg->pfn_limit = args[1];
- /* We need a complete page for the Guest registers: they are accessible
- * to the Guest and we can only grant it access to whole pages. */
- lg->regs_page = get_zeroed_page(GFP_KERNEL);
- if (!lg->regs_page) {
- err = -ENOMEM;
+ /* This is the first cpu */
+ err = lg_cpu_start(&lg->cpus[0], 0, args[3]);
+ if (err)
goto release_guest;
- }
- /* We actually put the registers at the bottom of the page. */
- lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
/* Initialize the Guest's shadow page tables, using the toplevel
* address the Launcher gave us. This allocates memory, so can
@@ -151,28 +197,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
if (err)
goto free_regs;
- /* Now we initialize the Guest's registers, handing it the start
- * address. */
- lguest_arch_setup_regs(lg, args[3]);
-
- /* The timer for lguest's clock needs initialization. */
- init_clockdev(lg);
-
- /* We keep a pointer to the Launcher task (ie. current task) for when
- * other Guests want to wake this one (inter-Guest I/O). */
- lg->tsk = current;
- /* We need to keep a pointer to the Launcher's memory map, because if
- * the Launcher dies we need to clean it up. If we don't keep a
- * reference, it is destroyed before close() is called. */
- lg->mm = get_task_mm(lg->tsk);
-
- /* Initialize the queue for the waker to wait on */
- init_waitqueue_head(&lg->break_wq);
-
- /* We remember which CPU's pages this Guest used last, for optimization
- * when the same Guest runs on the same CPU twice. */
- lg->last_pages = NULL;
-
/* We keep our "struct lguest" in the file's private_data. */
file->private_data = lg;
@@ -182,7 +206,8 @@ static int initialize(struct file *file, const unsigned long __user *input)
return sizeof(args);
free_regs:
- free_page(lg->regs_page);
+ /* FIXME: This should be in free_vcpu */
+ free_page(lg->cpus[0].regs_page);
release_guest:
kfree(lg);
unlock:
@@ -202,30 +227,37 @@ static ssize_t write(struct file *file, const char __user *in,
struct lguest *lg = file->private_data;
const unsigned long __user *input = (const unsigned long __user *)in;
unsigned long req;
+ struct lg_cpu *uninitialized_var(cpu);
+ unsigned int cpu_id = *off;
if (get_user(req, input) != 0)
return -EFAULT;
input++;
/* If you haven't initialized, you must do that first. */
- if (req != LHREQ_INITIALIZE && !lg)
- return -EINVAL;
+ if (req != LHREQ_INITIALIZE) {
+ if (!lg || (cpu_id >= lg->nr_cpus))
+ return -EINVAL;
+ cpu = &lg->cpus[cpu_id];
+ if (!cpu)
+ return -EINVAL;
+ }
/* Once the Guest is dead, all you can do is read() why it died. */
if (lg && lg->dead)
return -ENOENT;
/* If you're not the task which owns the Guest, you can only break */
- if (lg && current != lg->tsk && req != LHREQ_BREAK)
+ if (lg && current != cpu->tsk && req != LHREQ_BREAK)
return -EPERM;
switch (req) {
case LHREQ_INITIALIZE:
return initialize(file, input);
case LHREQ_IRQ:
- return user_send_irq(lg, input);
+ return user_send_irq(cpu, input);
case LHREQ_BREAK:
- return break_guest_out(lg, input);
+ return break_guest_out(cpu, input);
default:
return -EINVAL;
}
@@ -241,6 +273,7 @@ static ssize_t write(struct file *file, const char __user *in,
static int close(struct inode *inode, struct file *file)
{
struct lguest *lg = file->private_data;
+ unsigned int i;
/* If we never successfully initialized, there's nothing to clean up */
if (!lg)
@@ -249,19 +282,23 @@ static int close(struct inode *inode, struct file *file)
/* We need the big lock, to protect from inter-guest I/O and other
* Launchers initializing guests. */
mutex_lock(&lguest_lock);
- /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
- hrtimer_cancel(&lg->hrt);
+
/* Free up the shadow page tables for the Guest. */
free_guest_pagetable(lg);
- /* Now all the memory cleanups are done, it's safe to release the
- * Launcher's memory management structure. */
- mmput(lg->mm);
+
+ for (i = 0; i < lg->nr_cpus; i++) {
+ /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
+ hrtimer_cancel(&lg->cpus[i].hrt);
+ /* We can free up the register page we allocated. */
+ free_page(lg->cpus[i].regs_page);
+ /* Now all the memory cleanups are done, it's safe to release
+ * the Launcher's memory management structure. */
+ mmput(lg->cpus[i].mm);
+ }
/* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree(). */
if (!IS_ERR(lg->dead))
kfree(lg->dead);
- /* We can free up the register page we allocated. */
- free_page(lg->regs_page);
/* We clear the entire structure, which also marks it as free for the
* next user. */
memset(lg, 0, sizeof(*lg));
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index fffabb32715..74b4cf2a6c4 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
* page directory entry (PGD) for that address. Since we keep track of several
* page tables, the "i" argument tells us which one we're interested in (it's
* usually the current one). */
-static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
+static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
unsigned int index = pgd_index(vaddr);
/* We kill any Guest trying to touch the Switcher addresses. */
if (index >= SWITCHER_PGD_INDEX) {
- kill_guest(lg, "attempt to access switcher pages");
+ kill_guest(cpu, "attempt to access switcher pages");
index = 0;
}
/* Return a pointer index'th pgd entry for the i'th page table. */
- return &lg->pgdirs[i].pgdir[index];
+ return &cpu->lg->pgdirs[i].pgdir[index];
}
/* This routine then takes the page directory entry returned above, which
* contains the address of the page table entry (PTE) page. It then returns a
* pointer to the PTE entry for the given address. */
-static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
{
pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
/* You should never call this if the PGD entry wasn't valid */
@@ -94,14 +94,13 @@ static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
/* These two functions just like the above two, except they access the Guest
* page tables. Hence they return a Guest address. */
-static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
+static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
{
unsigned int index = vaddr >> (PGDIR_SHIFT);
- return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
+ return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
}
-static unsigned long gpte_addr(struct lguest *lg,
- pgd_t gpgd, unsigned long vaddr)
+static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
{
unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
@@ -138,7 +137,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
* entry can be a little tricky. The flags are (almost) the same, but the
* Guest PTE contains a virtual page number: the CPU needs the real page
* number. */
-static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
+static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
{
unsigned long pfn, base, flags;
@@ -149,7 +148,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
/* The Guest's pages are offset inside the Launcher. */
- base = (unsigned long)lg->mem_base / PAGE_SIZE;
+ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
/* We need a temporary "unsigned long" variable to hold the answer from
* get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
@@ -157,7 +156,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
* page, given the virtual number. */
pfn = get_pfn(base + pte_pfn(gpte), write);
if (pfn == -1UL) {
- kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
+ kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
/* When we destroy the Guest, we'll go through the shadow page
* tables and release_pte() them. Make sure we don't think
* this one is valid! */
@@ -177,17 +176,18 @@ static void release_pte(pte_t pte)
}
/*:*/
-static void check_gpte(struct lguest *lg, pte_t gpte)
+static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
- || pte_pfn(gpte) >= lg->pfn_limit)
- kill_guest(lg, "bad page table entry");
+ || pte_pfn(gpte) >= cpu->lg->pfn_limit)
+ kill_guest(cpu, "bad page table entry");
}
-static void check_gpgd(struct lguest *lg, pgd_t gpgd)
+static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
- if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
- kill_guest(lg, "bad page directory entry");
+ if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
+ (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+ kill_guest(cpu, "bad page directory entry");
}
/*H:330
@@ -200,7 +200,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
*
* If we fixed up the fault (ie. we mapped the address), this routine returns
* true. Otherwise, it was a real fault and we need to tell the Guest. */
-int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
+int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
pgd_t gpgd;
pgd_t *spgd;
@@ -209,24 +209,24 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
pte_t *spte;
/* First step: get the top-level Guest page table entry. */
- gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
return 0;
/* Now look at the matching shadow entry. */
- spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
/* This is not really the Guest's fault, but killing it is
* simple for this corner case. */
if (!ptepage) {
- kill_guest(lg, "out of memory allocating pte page");
+ kill_guest(cpu, "out of memory allocating pte page");
return 0;
}
/* We check that the Guest pgd is OK. */
- check_gpgd(lg, gpgd);
+ check_gpgd(cpu, gpgd);
/* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated. */
*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
@@ -234,8 +234,8 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
/* OK, now we look at the lower level in the Guest page table: keep its
* address, because we might update it later. */
- gpte_ptr = gpte_addr(lg, gpgd, vaddr);
- gpte = lgread(lg, gpte_ptr, pte_t);
+ gpte_ptr = gpte_addr(gpgd, vaddr);
+ gpte = lgread(cpu, gpte_ptr, pte_t);
/* If this page isn't in the Guest page tables, we can't page it in. */
if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -252,7 +252,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
/* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary). */
- check_gpte(lg, gpte);
+ check_gpte(cpu, gpte);
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte);
@@ -260,7 +260,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
- spte = spte_addr(lg, *spgd, vaddr);
+ spte = spte_addr(*spgd, vaddr);
/* If there was a valid shadow PTE entry here before, we release it.
* This can happen with a write to a previously read-only entry. */
release_pte(*spte);
@@ -268,17 +268,17 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
/* If this is a write, we insist that the Guest page is writable (the
* final arg to gpte_to_spte()). */
if (pte_dirty(gpte))
- *spte = gpte_to_spte(lg, gpte, 1);
+ *spte = gpte_to_spte(cpu, gpte, 1);
else
/* If this is a read, don't set the "writable" bit in the page
* table entry, even if the Guest says it's writable. That way
* we will come back here when a write does actually occur, so
* we can update the Guest's _PAGE_DIRTY flag. */
- *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
+ *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
/* Finally, we write the Guest PTE entry back: we've set the
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
- lgwrite(lg, gpte_ptr, pte_t, gpte);
+ lgwrite(cpu, gpte_ptr, pte_t, gpte);
/* The fault is fixed, the page table is populated, the mapping
* manipulated, the result returned and the code complete. A small
@@ -297,19 +297,19 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
*
* This is a quick version which answers the question: is this virtual address
* mapped by the shadow page tables, and is it writable? */
-static int page_writable(struct lguest *lg, unsigned long vaddr)
+static int page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
pgd_t *spgd;
unsigned long flags;
/* Look at the current top level entry: is it present? */
- spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
return 0;
/* Check the flags on the pte entry itself: it must be present and
* writable. */
- flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));
+ flags = pte_flags(*(spte_addr(*spgd, vaddr)));
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -317,10 +317,10 @@ static int page_writable(struct lguest *lg, unsigned long vaddr)
/* So, when pin_stack_pages() asks us to pin a page, we check if it's already
* in the page tables, and if not, we call demand_page() with error code 2
* (meaning "write"). */
-void pin_page(struct lguest *lg, unsigned long vaddr)
+void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
- if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
- kill_guest(lg, "bad stack page %#lx", vaddr);
+ if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
+ kill_guest(cpu, "bad stack page %#lx", vaddr);
}
/*H:450 If we chase down the release_pgd() code, it looks like this: */
@@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx)
*
* The Guest has a hypercall to throw away the page tables: it's used when a
* large number of mappings have been changed. */
-void guest_pagetable_flush_user(struct lguest *lg)
+void guest_pagetable_flush_user(struct lg_cpu *cpu)
{
/* Drop the userspace part of the current page table. */
- flush_user_mappings(lg, lg->pgdidx);
+ flush_user_mappings(cpu->lg, cpu->cpu_pgd);
}
/*:*/
/* We walk down the guest page tables to get a guest-physical address */
-unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
{
pgd_t gpgd;
pte_t gpte;
/* First step: get the top-level Guest page table entry. */
- gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- kill_guest(lg, "Bad address %#lx", vaddr);
+ kill_guest(cpu, "Bad address %#lx", vaddr);
- gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
+ gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
if (!(pte_flags(gpte) & _PAGE_PRESENT))
- kill_guest(lg, "Bad address %#lx", vaddr);
+ kill_guest(cpu, "Bad address %#lx", vaddr);
return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}
@@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
/*H:435 And this is us, creating the new page directory. If we really do
* allocate a new one (and so the kernel parts are not there), we set
* blank_pgdir. */
-static unsigned int new_pgdir(struct lguest *lg,
+static unsigned int new_pgdir(struct lg_cpu *cpu,
unsigned long gpgdir,
int *blank_pgdir)
{
@@ -407,22 +407,23 @@ static unsigned int new_pgdir(struct lguest *lg,
/* We pick one entry at random to throw out. Choosing the Least
* Recently Used might be better, but this is easy. */
- next = random32() % ARRAY_SIZE(lg->pgdirs);
+ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
/* If it's never been allocated at all before, try now. */
- if (!lg->pgdirs[next].pgdir) {
- lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!cpu->lg->pgdirs[next].pgdir) {
+ cpu->lg->pgdirs[next].pgdir =
+ (pgd_t *)get_zeroed_page(GFP_KERNEL);
/* If the allocation fails, just keep using the one we have */
- if (!lg->pgdirs[next].pgdir)
- next = lg->pgdidx;
+ if (!cpu->lg->pgdirs[next].pgdir)
+ next = cpu->cpu_pgd;
else
/* This is a blank page, so there are no kernel
* mappings: caller must map the stack! */
*blank_pgdir = 1;
}
/* Record which Guest toplevel this shadows. */
- lg->pgdirs[next].gpgdir = gpgdir;
+ cpu->lg->pgdirs[next].gpgdir = gpgdir;
/* Release all the non-kernel mappings. */
- flush_user_mappings(lg, next);
+ flush_user_mappings(cpu->lg, next);
return next;
}
@@ -432,21 +433,21 @@ static unsigned int new_pgdir(struct lguest *lg,
* Now we've seen all the page table setting and manipulation, let's see what
* what happens when the Guest changes page tables (ie. changes the top-level
* pgdir). This occurs on almost every context switch. */
-void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
{
int newpgdir, repin = 0;
/* Look to see if we have this one already. */
- newpgdir = find_pgdir(lg, pgtable);
+ newpgdir = find_pgdir(cpu->lg, pgtable);
/* If not, we allocate or mug an existing one: if it's a fresh one,
* repin gets set to 1. */
- if (newpgdir == ARRAY_SIZE(lg->pgdirs))
- newpgdir = new_pgdir(lg, pgtable, &repin);
+ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+ newpgdir = new_pgdir(cpu, pgtable, &repin);
/* Change the current pgd index to the new one. */
- lg->pgdidx = newpgdir;
+ cpu->cpu_pgd = newpgdir;
/* If it was completely blank, we map in the Guest kernel stack */
if (repin)
- pin_stack_pages(lg);
+ pin_stack_pages(cpu);
}
/*H:470 Finally, a routine which throws away everything: all PGD entries in all
@@ -468,11 +469,11 @@ static void release_all_pagetables(struct lguest *lg)
* mapping. Since kernel mappings are in every page table, it's easiest to
* throw them all away. This traps the Guest in amber for a while as
* everything faults back in, but it's rare. */
-void guest_pagetable_clear_all(struct lguest *lg)
+void guest_pagetable_clear_all(struct lg_cpu *cpu)
{
- release_all_pagetables(lg);
+ release_all_pagetables(cpu->lg);
/* We need the Guest kernel stack mapped again. */
- pin_stack_pages(lg);
+ pin_stack_pages(cpu);
}
/*:*/
/*M:009 Since we throw away all mappings when a kernel mapping changes, our
@@ -497,24 +498,24 @@ void guest_pagetable_clear_all(struct lguest *lg)
* _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
* they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
*/
-static void do_set_pte(struct lguest *lg, int idx,
+static void do_set_pte(struct lg_cpu *cpu, int idx,
unsigned long vaddr, pte_t gpte)
{
/* Look up the matching shadow page directory entry. */
- pgd_t *spgd = spgd_addr(lg, idx, vaddr);
+ pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
/* If the top level isn't present, there's no entry to update. */
if (pgd_flags(*spgd) & _PAGE_PRESENT) {
/* Otherwise, we start by releasing the existing entry. */
- pte_t *spte = spte_addr(lg, *spgd, vaddr);
+ pte_t *spte = spte_addr(*spgd, vaddr);
release_pte(*spte);
/* If they're setting this entry as dirty or accessed, we might
* as well put that entry they've given us in now. This shaves
* 10% off a copy-on-write micro-benchmark. */
if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
- check_gpte(lg, gpte);
- *spte = gpte_to_spte(lg, gpte,
+ check_gpte(cpu, gpte);
+ *spte = gpte_to_spte(cpu, gpte,
pte_flags(gpte) & _PAGE_DIRTY);
} else
/* Otherwise kill it and we can demand_page() it in
@@ -533,22 +534,22 @@ static void do_set_pte(struct lguest *lg, int idx,
*
* The benefit is that when we have to track a new page table, we can copy keep
* all the kernel mappings. This speeds up context switch immensely. */
-void guest_set_pte(struct lguest *lg,
+void guest_set_pte(struct lg_cpu *cpu,
unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
/* Kernel mappings must be changed on all top levels. Slow, but
* doesn't happen often. */
- if (vaddr >= lg->kernel_address) {
+ if (vaddr >= cpu->lg->kernel_address) {
unsigned int i;
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].pgdir)
- do_set_pte(lg, i, vaddr, gpte);
+ for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
+ if (cpu->lg->pgdirs[i].pgdir)
+ do_set_pte(cpu, i, vaddr, gpte);
} else {
/* Is this page table one we have a shadow for? */
- int pgdir = find_pgdir(lg, gpgdir);
- if (pgdir != ARRAY_SIZE(lg->pgdirs))
+ int pgdir = find_pgdir(cpu->lg, gpgdir);
+ if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
/* If so, do the update. */
- do_set_pte(lg, pgdir, vaddr, gpte);
+ do_set_pte(cpu, pgdir, vaddr, gpte);
}
}
@@ -590,30 +591,32 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
/* We start on the first shadow page table, and give it a blank PGD
* page. */
- lg->pgdidx = 0;
- lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
- lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
- if (!lg->pgdirs[lg->pgdidx].pgdir)
+ lg->pgdirs[0].gpgdir = pgtable;
+ lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!lg->pgdirs[0].pgdir)
return -ENOMEM;
+ lg->cpus[0].cpu_pgd = 0;
return 0;
}
/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lguest *lg)
+void page_table_guest_data_init(struct lg_cpu *cpu)
{
/* We get the kernel address: above this is all kernel memory. */
- if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
+ if (get_user(cpu->lg->kernel_address,
+ &cpu->lg->lguest_data->kernel_address)
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher. */
- || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
- || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir))
- kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
+ || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
+ kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
/* In flush_user_mappings() we loop from 0 to
* "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now. */
- if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
- kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
+ if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
+ kill_guest(cpu, "bad kernel address %#lx",
+ cpu->lg->kernel_address);
}
/* When a Guest dies, our cleanup is fairly simple. */
@@ -634,17 +637,18 @@ void free_guest_pagetable(struct lguest *lg)
* Guest (and not the pages for other CPUs). We have the appropriate PTE pages
* for each CPU already set up, we just need to hook them in now we know which
* Guest is about to run on this CPU. */
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
pgd_t switcher_pgd;
pte_t regs_pte;
+ unsigned long pfn;
/* Make the last PGD entry for this Guest point to the Switcher's PTE
* page for this CPU (with appropriate flags). */
- switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);
+ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
- lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+ cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
/* We also change the Switcher PTE page. When we're running the Guest,
* we want the Guest's "regs" page to appear where the first Switcher
@@ -653,7 +657,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
* CPU's "struct lguest_pages": if we make sure the Guest's register
* page is already mapped there, we don't have to copy them out
* again. */
- regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
+ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
+ regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
}
/*:*/
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 9e189cbec7d..ec6aa3f1c36 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num)
* Protection Fault in the Switcher when it restores a Guest segment register
* which tries to use that entry. Then we kill the Guest for causing such a
* mess: the message will be "unhandled trap 256". */
-static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
+static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
{
unsigned int i;
@@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
/* Segment descriptors contain a privilege level: the Guest is
* sometimes careless and leaves this as 0, even though it's
* running at privilege level 1. If so, we fix it here. */
- if ((lg->arch.gdt[i].b & 0x00006000) == 0)
- lg->arch.gdt[i].b |= (GUEST_PL << 13);
+ if ((cpu->arch.gdt[i].b & 0x00006000) == 0)
+ cpu->arch.gdt[i].b |= (GUEST_PL << 13);
/* Each descriptor has an "accessed" bit. If we don't set it
* now, the CPU will try to set it when the Guest first loads
* that entry into a segment register. But the GDT isn't
* writable by the Guest, so bad things can happen. */
- lg->arch.gdt[i].b |= 0x00000100;
+ cpu->arch.gdt[i].b |= 0x00000100;
}
}
@@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
/* This routine sets up the initial Guest GDT for booting. All entries start
* as 0 (unusable). */
-void setup_guest_gdt(struct lguest *lg)
+void setup_guest_gdt(struct lg_cpu *cpu)
{
/* Start with full 0-4G segments... */
- lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
- lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
/* ...except the Guest is allowed to use them, so set the privilege
* level appropriately in the flags. */
- lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
- lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
}
/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage"
* entries. */
-void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
+void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
unsigned int i;
for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
- gdt[i] = lg->arch.gdt[i];
+ gdt[i] = cpu->arch.gdt[i];
}
/*H:640 When the Guest is run on a different CPU, or the GDT entries have
* changed, copy_gdt() is called to copy the Guest's GDT entries across to this
* CPU's GDT. */
-void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
+void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
{
unsigned int i;
@@ -141,38 +141,38 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
* replaced. See ignored_gdt() above. */
for (i = 0; i < GDT_ENTRIES; i++)
if (!ignored_gdt(i))
- gdt[i] = lg->arch.gdt[i];
+ gdt[i] = cpu->arch.gdt[i];
}
/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
* We copy it from the Guest and tweak the entries. */
-void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
+void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num)
{
/* We assume the Guest has the same number of GDT entries as the
* Host, otherwise we'd have to dynamically allocate the Guest GDT. */
- if (num > ARRAY_SIZE(lg->arch.gdt))
- kill_guest(lg, "too many gdt entries %i", num);
+ if (num > ARRAY_SIZE(cpu->arch.gdt))
+ kill_guest(cpu, "too many gdt entries %i", num);
/* We read the whole thing in, then fix it up. */
- __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0]));
- fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt));
+ __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0]));
+ fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt));
/* Mark that the GDT changed so the core knows it has to copy it again,
* even if the Guest is run on the same CPU. */
- lg->changed |= CHANGED_GDT;
+ cpu->changed |= CHANGED_GDT;
}
/* This is the fast-track version for just changing the three TLS entries.
* Remember that this happens on every context switch, so it's worth
* optimizing. But wouldn't it be neater to have a single hypercall to cover
* both cases? */
-void guest_load_tls(struct lguest *lg, unsigned long gtls)
+void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
{
- struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
+ struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
- __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
- fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
+ __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+ fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
/* Note that just the TLS entries have changed. */
- lg->changed |= CHANGED_GDT_TLS;
+ cpu->changed |= CHANGED_GDT_TLS;
}
/*:*/
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 96d0fd07c57..61f2f8eb8ca 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
(SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
}
-static DEFINE_PER_CPU(struct lguest *, last_guest);
+static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
/*S:010
* We approach the Switcher.
@@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lguest *, last_guest);
* since it last ran. We saw this set in interrupts_and_traps.c and
* segments.c.
*/
-static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
{
/* Copying all this data can be quite expensive. We usually run the
* same Guest we ran last time (and that Guest hasn't run anywhere else
* meanwhile). If that's not the case, we pretend everything in the
* Guest has changed. */
- if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
- __get_cpu_var(last_guest) = lg;
- lg->last_pages = pages;
- lg->changed = CHANGED_ALL;
+ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
+ __get_cpu_var(last_cpu) = cpu;
+ cpu->last_pages = pages;
+ cpu->changed = CHANGED_ALL;
}
/* These copies are pretty cheap, so we do them unconditionally: */
@@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
pages->state.host_cr3 = __pa(current->mm->pgd);
/* Set up the Guest's page tables to see this CPU's pages (and no
* other CPU's pages). */
- map_switcher_in_guest(lg, pages);
+ map_switcher_in_guest(cpu, pages);
/* Set up the two "TSS" members which tell the CPU what stack to use
* for traps which do directly into the Guest (ie. traps at privilege
* level 1). */
- pages->state.guest_tss.esp1 = lg->esp1;
- pages->state.guest_tss.ss1 = lg->ss1;
+ pages->state.guest_tss.esp1 = cpu->esp1;
+ pages->state.guest_tss.ss1 = cpu->ss1;
/* Copy direct-to-Guest trap entries. */
- if (lg->changed & CHANGED_IDT)
- copy_traps(lg, pages->state.guest_idt, default_idt_entries);
+ if (cpu->changed & CHANGED_IDT)
+ copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
/* Copy all GDT entries which the Guest can change. */
- if (lg->changed & CHANGED_GDT)
- copy_gdt(lg, pages->state.guest_gdt);
+ if (cpu->changed & CHANGED_GDT)
+ copy_gdt(cpu, pages->state.guest_gdt);
/* If only the TLS entries have changed, copy them. */
- else if (lg->changed & CHANGED_GDT_TLS)
- copy_gdt_tls(lg, pages->state.guest_gdt);
+ else if (cpu->changed & CHANGED_GDT_TLS)
+ copy_gdt_tls(cpu, pages->state.guest_gdt);
/* Mark the Guest as unchanged for next time. */
- lg->changed = 0;
+ cpu->changed = 0;
}
/* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
{
/* This is a dummy value we need for GCC's sake. */
unsigned int clobber;
/* Copy the guest-specific information into this CPU's "struct
* lguest_pages". */
- copy_in_guest_info(lg, pages);
+ copy_in_guest_info(cpu, pages);
/* Set the trap number to 256 (impossible value). If we fault while
* switching to the Guest (bad segment registers or bug), this will
* cause us to abort the Guest. */
- lg->regs->trapnum = 256;
+ cpu->regs->trapnum = 256;
/* Now: we push the "eflags" register on the stack, then do an "lcall".
* This is how we change from using the kernel code segment to using
@@ -143,7 +143,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
* 0-th argument above, ie "a"). %ebx contains the
* physical address of the Guest's top-level page
* directory. */
- : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
/* We tell gcc that all these registers could change,
* which means we don't have to save and restore them in
* the Switcher. */
@@ -161,12 +161,12 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
* are disabled: we own the CPU. */
-void lguest_arch_run_guest(struct lguest *lg)
+void lguest_arch_run_guest(struct lg_cpu *cpu)
{
/* Remember the awfully-named TS bit? If the Guest has asked to set it
* we set it now, so we can trap and pass that trap to the Guest if it
* uses the FPU. */
- if (lg->ts)
+ if (cpu->ts)
lguest_set_ts();
/* SYSENTER is an optimized way of doing system calls. We can't allow
@@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lguest *lg)
/* Now we actually run the Guest. It will return when something
* interesting happens, and we can examine its registers to see what it
* was doing. */
- run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
+ run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
/* Note that the "regs" pointer contains two extra entries which are
* not really registers: a trap number which says what interrupt or
@@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lguest *lg)
* bad virtual address. We have to grab this now, because once we
* re-enable interrupts an interrupt could fault and thus overwrite
* cr2, or we could even move off to a different CPU. */
- if (lg->regs->trapnum == 14)
- lg->arch.last_pagefault = read_cr2();
+ if (cpu->regs->trapnum == 14)
+ cpu->arch.last_pagefault = read_cr2();
/* Similarly, if we took a trap because the Guest used the FPU,
* we have to restore the FPU it expects to see. */
- else if (lg->regs->trapnum == 7)
+ else if (cpu->regs->trapnum == 7)
math_state_restore();
/* Restore SYSENTER if it's supposed to be on. */
@@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lguest *lg)
* When the Guest uses one of these instructions, we get a trap (General
* Protection Fault) and come here. We see if it's one of those troublesome
* instructions and skip over it. We return true if we did. */
-static int emulate_insn(struct lguest *lg)
+static int emulate_insn(struct lg_cpu *cpu)
{
u8 insn;
unsigned int insnlen = 0, in = 0, shift = 0;
/* The eip contains the *virtual* address of the Guest's instruction:
* guest_pa just subtracts the Guest's page_offset. */
- unsigned long physaddr = guest_pa(lg, lg->regs->eip);
+ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
/* This must be the Guest kernel trying to do something, not userspace!
* The bottom two bits of the CS segment register are the privilege
* level. */
- if ((lg->regs->cs & 3) != GUEST_PL)
+ if ((cpu->regs->cs & 3) != GUEST_PL)
return 0;
/* Decoding x86 instructions is icky. */
- insn = lgread(lg, physaddr, u8);
+ insn = lgread(cpu, physaddr, u8);
/* 0x66 is an "operand prefix". It means it's using the upper 16 bits
of the eax register. */
@@ -237,7 +237,7 @@ static int emulate_insn(struct lguest *lg)
shift = 16;
/* The instruction is 1 byte so far, read the next byte. */
insnlen = 1;
- insn = lgread(lg, physaddr + insnlen, u8);
+ insn = lgread(cpu, physaddr + insnlen, u8);
}
/* We can ignore the lower bit for the moment and decode the 4 opcodes
@@ -268,26 +268,26 @@ static int emulate_insn(struct lguest *lg)
if (in) {
/* Lower bit tells is whether it's a 16 or 32 bit access */
if (insn & 0x1)
- lg->regs->eax = 0xFFFFFFFF;
+ cpu->regs->eax = 0xFFFFFFFF;
else
- lg->regs->eax |= (0xFFFF << shift);
+ cpu->regs->eax |= (0xFFFF << shift);
}
/* Finally, we've "done" the instruction, so move past it. */
- lg->regs->eip += insnlen;
+ cpu->regs->eip += insnlen;
/* Success! */
return 1;
}
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
-void lguest_arch_handle_trap(struct lguest *lg)
+void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
- switch (lg->regs->trapnum) {
+ switch (cpu->regs->trapnum) {
case 13: /* We've intercepted a General Protection Fault. */
/* Check if this was one of those annoying IN or OUT
* instructions which we need to emulate. If so, we just go
* back into the Guest after we've done it. */
- if (lg->regs->errcode == 0) {
- if (emulate_insn(lg))
+ if (cpu->regs->errcode == 0) {
+ if (emulate_insn(cpu))
return;
}
break;
@@ -301,7 +301,8 @@ void lguest_arch_handle_trap(struct lguest *lg)
*
* The errcode tells whether this was a read or a write, and
* whether kernel or userspace code. */
- if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
+ if (demand_page(cpu, cpu->arch.last_pagefault,
+ cpu->regs->errcode))
return;
/* OK, it's really not there (or not OK): the Guest needs to
@@ -311,15 +312,16 @@ void lguest_arch_handle_trap(struct lguest *lg)
* Note that if the Guest were really messed up, this could
* happen before it's done the LHCALL_LGUEST_INIT hypercall, so
* lg->lguest_data could be NULL */
- if (lg->lguest_data &&
- put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
- kill_guest(lg, "Writing cr2");
+ if (cpu->lg->lguest_data &&
+ put_user(cpu->arch.last_pagefault,
+ &cpu->lg->lguest_data->cr2))
+ kill_guest(cpu, "Writing cr2");
break;
case 7: /* We've intercepted a Device Not Available fault. */
/* If the Guest doesn't want to know, we already restored the
* Floating Point Unit, so we just continue without telling
* it. */
- if (!lg->ts)
+ if (!cpu->ts)
return;
break;
case 32 ... 255:
@@ -332,19 +334,19 @@ void lguest_arch_handle_trap(struct lguest *lg)
case LGUEST_TRAP_ENTRY:
/* Our 'struct hcall_args' maps directly over our regs: we set
* up the pointer now to indicate a hypercall is pending. */
- lg->hcall = (struct hcall_args *)lg->regs;
+ cpu->hcall = (struct hcall_args *)cpu->regs;
return;
}
/* We didn't handle the trap, so it needs to go to the Guest. */
- if (!deliver_trap(lg, lg->regs->trapnum))
+ if (!deliver_trap(cpu, cpu->regs->trapnum))
/* If the Guest doesn't have a handler (either it hasn't
* registered any yet, or it's one of the faults we don't let
* it handle), it dies with a cryptic error message. */
- kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
- lg->regs->trapnum, lg->regs->eip,
- lg->regs->trapnum == 14 ? lg->arch.last_pagefault
- : lg->regs->errcode);
+ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
+ cpu->regs->trapnum, cpu->regs->eip,
+ cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
+ : cpu->regs->errcode);
}
/* Now we can look at each of the routines this calls, in increasing order of
@@ -416,7 +418,7 @@ void __init lguest_arch_host_init(void)
/* We know where we want the stack to be when the Guest enters
* the switcher: in pages->regs. The stack grows upwards, so
* we start it at the end of that structure. */
- state->guest_tss.esp0 = (long)(&pages->regs + 1);
+ state->guest_tss.sp0 = (long)(&pages->regs + 1);
/* And this is the GDT entry to use for the stack: we keep a
* couple of special LGUEST entries. */
state->guest_tss.ss0 = LGUEST_DS;
@@ -487,17 +489,17 @@ void __exit lguest_arch_host_fini(void)
/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
-int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
+int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
{
switch (args->arg0) {
case LHCALL_LOAD_GDT:
- load_guest_gdt(lg, args->arg1, args->arg2);
+ load_guest_gdt(cpu, args->arg1, args->arg2);
break;
case LHCALL_LOAD_IDT_ENTRY:
- load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
+ load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
break;
case LHCALL_LOAD_TLS:
- guest_load_tls(lg, args->arg1);
+ guest_load_tls(cpu, args->arg1);
break;
default:
/* Bad Guest. Bad! */
@@ -507,13 +509,14 @@ int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
}
/*H:126 i386-specific hypercall initialization: */
-int lguest_arch_init_hypercalls(struct lguest *lg)
+int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
{
u32 tsc_speed;
/* The pointer to the Guest's "struct lguest_data" is the only
* argument. We check that address now. */
- if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data)))
+ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
+ sizeof(*cpu->lg->lguest_data)))
return -EFAULT;
/* Having checked it, we simply set lg->lguest_data to point straight
@@ -521,7 +524,7 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
* copy_to_user/from_user from now on, instead of lgread/write. I put
* this in to show that I'm not immune to writing stupid
* optimizations. */
- lg->lguest_data = lg->mem_base + lg->hcall->arg1;
+ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
/* We insist that the Time Stamp Counter exist and doesn't change with
* cpu frequency. Some devious chip manufacturers decided that TSC
@@ -534,12 +537,12 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
tsc_speed = tsc_khz;
else
tsc_speed = 0;
- if (put_user(tsc_speed, &lg->lguest_data->tsc_khz))
+ if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
return -EFAULT;
/* The interrupt code might not like the system call vector. */
- if (!check_syscall_vector(lg))
- kill_guest(lg, "bad syscall vector");
+ if (!check_syscall_vector(cpu->lg))
+ kill_guest(cpu, "bad syscall vector");
return 0;
}
@@ -548,9 +551,9 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
*
* Most of the Guest's registers are left alone: we used get_zeroed_page() to
* allocate the structure, so they will be 0. */
-void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
+void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
{
- struct lguest_regs *regs = lg->regs;
+ struct lguest_regs *regs = cpu->regs;
/* There are four "segment" registers which the Guest needs to boot:
* The "code segment" register (cs) refers to the kernel code segment
@@ -577,5 +580,5 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
/* There are a couple of GDT entries the Guest expects when first
* booting. */
- setup_guest_gdt(lg);
+ setup_guest_gdt(cpu);
}
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index af40ff434de..6c575403bd3 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2009,6 +2009,9 @@ config E1000E
To compile this driver as a module, choose M here. The module
will be called e1000e.
+config E1000E_ENABLED
+ def_bool E1000E != n
+
config IP1000
tristate "IP1000 Gigabit Ethernet support"
depends on PCI && EXPERIMENTAL
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 7f5b2ae70d5..8c87940a9ce 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -47,6 +47,12 @@ static const char e1000_copyright[] = "Copyright (c) 1999-2006 Intel Corporation
* Macro expands to...
* {PCI_DEVICE(PCI_VENDOR_ID_INTEL, device_id)}
*/
+#ifdef CONFIG_E1000E_ENABLED
+ #define PCIE(x)
+#else
+ #define PCIE(x) x,
+#endif
+
static struct pci_device_id e1000_pci_tbl[] = {
INTEL_E1000_ETHERNET_DEVICE(0x1000),
INTEL_E1000_ETHERNET_DEVICE(0x1001),
@@ -73,6 +79,14 @@ static struct pci_device_id e1000_pci_tbl[] = {
INTEL_E1000_ETHERNET_DEVICE(0x1026),
INTEL_E1000_ETHERNET_DEVICE(0x1027),
INTEL_E1000_ETHERNET_DEVICE(0x1028),
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1049))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104A))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104B))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104C))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x104D))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x105E))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x105F))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1060))
INTEL_E1000_ETHERNET_DEVICE(0x1075),
INTEL_E1000_ETHERNET_DEVICE(0x1076),
INTEL_E1000_ETHERNET_DEVICE(0x1077),
@@ -81,9 +95,28 @@ static struct pci_device_id e1000_pci_tbl[] = {
INTEL_E1000_ETHERNET_DEVICE(0x107A),
INTEL_E1000_ETHERNET_DEVICE(0x107B),
INTEL_E1000_ETHERNET_DEVICE(0x107C),
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x107D))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x107E))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x107F))
INTEL_E1000_ETHERNET_DEVICE(0x108A),
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x108B))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x108C))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1096))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x1098))
INTEL_E1000_ETHERNET_DEVICE(0x1099),
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x109A))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10A4))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10A5))
INTEL_E1000_ETHERNET_DEVICE(0x10B5),
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10B9))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10BA))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10BB))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10BC))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10C4))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10C5))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10D5))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10D9))
+PCIE( INTEL_E1000_ETHERNET_DEVICE(0x10DA))
/* required last entry */
{0,}
};
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 5dba68fe33f..a8364d81522 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -61,7 +61,7 @@ set_base(gdt[(selname) >> 3], (u32)(address)); \
set_limit(gdt[(selname) >> 3], size); \
} while(0)
-static struct desc_struct bad_bios_desc = { 0, 0x00409200 };
+static struct desc_struct bad_bios_desc;
/*
* At some point we want to use this stack frame pointer to unwind
@@ -477,6 +477,9 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
pnp_bios_callpoint.offset = header->fields.pm16offset;
pnp_bios_callpoint.segment = PNP_CS16;
+ bad_bios_desc.a = 0;
+ bad_bios_desc.b = 0x00409200;
+
set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
_set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
for (i = 0; i < NR_CPUS; i++) {
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index e45f85f7c7e..0dff05840ee 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req)
ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n",
fcp_rsp_iu->fcp_sns_len);
- memcpy(&scpnt->sense_buffer,
+ memcpy(scpnt->sense_buffer,
zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len);
ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE,
- (void *) &scpnt->sense_buffer, sns_len);
+ (void *)scpnt->sense_buffer, sns_len);
}
/* check for overrun */
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 1c244832c6c..b4912d1cee2 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1990,7 +1990,6 @@ static struct scsi_host_template driver_template = {
.max_sectors = TW_MAX_SECTORS,
.cmd_per_lun = TW_MAX_CMDS_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = twa_host_attrs,
.emulated = 1
};
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 59716ebeb10..d0953216221 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -2261,7 +2261,6 @@ static struct scsi_host_template driver_template = {
.max_sectors = TW_MAX_SECTORS,
.cmd_per_lun = TW_MAX_CMDS_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = tw_host_attrs,
.emulated = 1
};
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index ead47c143ce..4d3ebb1af49 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -3575,7 +3575,6 @@ static struct scsi_host_template Bus_Logic_template = {
.unchecked_isa_dma = 1,
.max_sectors = 128,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
/*
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 3e161cd6646..14fc7f39e83 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -345,7 +345,7 @@ config ISCSI_TCP
config SGIWD93_SCSI
tristate "SGI WD93C93 SCSI Driver"
- depends on SGI_IP22 && SCSI
+ depends on SGI_HAS_WD93 && SCSI
help
If you have a Western Digital WD93 SCSI controller on
an SGI MIPS system, say Y. Otherwise, say N.
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index 137d065db3d..6961f78742a 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -1065,7 +1065,6 @@ static struct scsi_host_template driver_template =
.cmd_per_lun = 1 /* commands per lun */,
.unchecked_isa_dma = 1 /* unchecked_isa_dma */,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index d3a6d15fb77..f608d4a1d6d 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -1071,7 +1071,6 @@ static struct scsi_host_template inia100_template = {
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int __devinit inia100_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index 851a7e599c5..f8afa358b6b 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -243,7 +243,6 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
* Search the list of AdapterFibContext addresses on the adapter
* to be sure this is a valid address
*/
- spin_lock_irqsave(&dev->fib_lock, flags);
entry = dev->fib_list.next;
fibctx = NULL;
@@ -252,25 +251,24 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
/*
* Extract the AdapterFibContext from the Input parameters.
*/
- if (fibctx->unique == f.fibctx) { /* We found a winner */
+ if (fibctx->unique == f.fibctx) { /* We found a winner */
break;
}
entry = entry->next;
fibctx = NULL;
}
if (!fibctx) {
- spin_unlock_irqrestore(&dev->fib_lock, flags);
dprintk ((KERN_INFO "Fib Context not found\n"));
return -EINVAL;
}
if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) ||
(fibctx->size != sizeof(struct aac_fib_context))) {
- spin_unlock_irqrestore(&dev->fib_lock, flags);
dprintk ((KERN_INFO "Fib Context corrupt?\n"));
return -EINVAL;
}
status = 0;
+ spin_lock_irqsave(&dev->fib_lock, flags);
/*
* If there are no fibs to send back, then either wait or return
* -EAGAIN
@@ -328,9 +326,7 @@ return_fib:
int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
{
struct fib *fib;
- unsigned long flags;
- spin_lock_irqsave(&dev->fib_lock, flags);
/*
* First free any FIBs that have not been consumed.
*/
@@ -353,7 +349,6 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
* Remove the Context from the AdapterFibContext List
*/
list_del(&fibctx->next);
- spin_unlock_irqrestore(&dev->fib_lock, flags);
/*
* Invalidate context
*/
@@ -419,8 +414,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg)
* @arg: ioctl arguments
*
* This routine returns the driver version.
- * Under Linux, there have been no version incompatibilities, so this is
- * simple!
+ * Under Linux, there have been no version incompatibilities, so this is
+ * simple!
*/
static int check_revision(struct aac_dev *dev, void __user *arg)
@@ -468,7 +463,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
u32 data_dir;
void __user *sg_user[32];
void *sg_list[32];
- u32 sg_indx = 0;
+ u32 sg_indx = 0;
u32 byte_count = 0;
u32 actual_fibsize64, actual_fibsize = 0;
int i;
@@ -522,11 +517,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
// Fix up srb for endian and force some values
srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this
- srbcmd->channel = cpu_to_le32(user_srbcmd->channel);
+ srbcmd->channel = cpu_to_le32(user_srbcmd->channel);
srbcmd->id = cpu_to_le32(user_srbcmd->id);
- srbcmd->lun = cpu_to_le32(user_srbcmd->lun);
- srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout);
- srbcmd->flags = cpu_to_le32(flags);
+ srbcmd->lun = cpu_to_le32(user_srbcmd->lun);
+ srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout);
+ srbcmd->flags = cpu_to_le32(flags);
srbcmd->retry_limit = 0; // Obsolete parameter
srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size);
memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb));
@@ -791,9 +786,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg)
pci_info.bus = dev->pdev->bus->number;
pci_info.slot = PCI_SLOT(dev->pdev->devfn);
- if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
- dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
- return -EFAULT;
+ if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
+ dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
+ return -EFAULT;
}
return 0;
}
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 61be22774e9..0e8267c1e91 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1032,7 +1032,6 @@ static struct scsi_host_template aac_driver_template = {
.cmd_per_lun = AAC_NUM_IO_FIB,
#endif
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.emulated = 1,
};
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index be58a0b097c..7c45d88a205 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -563,7 +563,6 @@ static struct scsi_host_template aha1740_template = {
.sg_tablesize = AHA1740_SCATTER,
.cmd_per_lun = AHA1740_CMDLUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.eh_abort_handler = aha1740_eh_abort_handler,
};
diff --git a/drivers/scsi/aic7xxx/aic79xx.h b/drivers/scsi/aic7xxx/aic79xx.h
index ce638aa6005..2f00467b6b8 100644
--- a/drivers/scsi/aic7xxx/aic79xx.h
+++ b/drivers/scsi/aic7xxx/aic79xx.h
@@ -1340,8 +1340,10 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t);
int ahd_pci_config(struct ahd_softc *,
struct ahd_pci_identity *);
int ahd_pci_test_register_access(struct ahd_softc *);
+#ifdef CONFIG_PM
void ahd_pci_suspend(struct ahd_softc *);
void ahd_pci_resume(struct ahd_softc *);
+#endif
/************************** SCB and SCB queue management **********************/
void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd,
@@ -1352,8 +1354,10 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name);
int ahd_softc_init(struct ahd_softc *);
void ahd_controller_info(struct ahd_softc *ahd, char *buf);
int ahd_init(struct ahd_softc *ahd);
+#ifdef CONFIG_PM
int ahd_suspend(struct ahd_softc *ahd);
void ahd_resume(struct ahd_softc *ahd);
+#endif
int ahd_default_config(struct ahd_softc *ahd);
int ahd_parse_vpddata(struct ahd_softc *ahd,
struct vpd_config *vpd);
@@ -1361,7 +1365,6 @@ int ahd_parse_cfgdata(struct ahd_softc *ahd,
struct seeprom_config *sc);
void ahd_intr_enable(struct ahd_softc *ahd, int enable);
void ahd_pause_and_flushwork(struct ahd_softc *ahd);
-int ahd_suspend(struct ahd_softc *ahd);
void ahd_set_unit(struct ahd_softc *, int);
void ahd_set_name(struct ahd_softc *, char *);
struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx);
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index a7dd8cdda47..ade0fb8fbdb 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -7175,6 +7175,7 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd)
ahd->flags &= ~AHD_ALL_INTERRUPTS;
}
+#ifdef CONFIG_PM
int
ahd_suspend(struct ahd_softc *ahd)
{
@@ -7197,6 +7198,7 @@ ahd_resume(struct ahd_softc *ahd)
ahd_intr_enable(ahd, TRUE);
ahd_restart(ahd);
}
+#endif
/************************** Busy Target Table *********************************/
/*
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 0e4708fd43c..01465479290 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -766,7 +766,6 @@ struct scsi_host_template aic79xx_driver_template = {
.max_sectors = 8192,
.cmd_per_lun = 2,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.slave_alloc = ahd_linux_slave_alloc,
.slave_configure = ahd_linux_slave_configure,
.target_alloc = ahd_linux_target_alloc,
@@ -1922,7 +1921,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd)
struct scsi_sense_data *sense;
sense = (struct scsi_sense_data *)
- &cmd->sense_buffer;
+ cmd->sense_buffer;
if (sense->extra_len >= 5 &&
(sense->add_sense_code == 0x47
|| sense->add_sense_code == 0x48))
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
index 66f0259edb6..4150c8a8fdc 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm_pci.c
@@ -43,17 +43,6 @@
#include "aic79xx_inline.h"
#include "aic79xx_pci.h"
-static int ahd_linux_pci_dev_probe(struct pci_dev *pdev,
- const struct pci_device_id *ent);
-static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd,
- u_long *base, u_long *base2);
-static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd,
- u_long *bus_addr,
- uint8_t __iomem **maddr);
-static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
-static int ahd_linux_pci_dev_resume(struct pci_dev *pdev);
-static void ahd_linux_pci_dev_remove(struct pci_dev *pdev);
-
/* Define the macro locally since it's different for different class of chips.
*/
#define ID(x) \
@@ -85,17 +74,7 @@ static struct pci_device_id ahd_linux_pci_id_table[] = {
MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table);
-static struct pci_driver aic79xx_pci_driver = {
- .name = "aic79xx",
- .probe = ahd_linux_pci_dev_probe,
#ifdef CONFIG_PM
- .suspend = ahd_linux_pci_dev_suspend,
- .resume = ahd_linux_pci_dev_resume,
-#endif
- .remove = ahd_linux_pci_dev_remove,
- .id_table = ahd_linux_pci_id_table
-};
-
static int
ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
@@ -139,6 +118,7 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev)
return rc;
}
+#endif
static void
ahd_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -245,6 +225,17 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return (0);
}
+static struct pci_driver aic79xx_pci_driver = {
+ .name = "aic79xx",
+ .probe = ahd_linux_pci_dev_probe,
+#ifdef CONFIG_PM
+ .suspend = ahd_linux_pci_dev_suspend,
+ .resume = ahd_linux_pci_dev_resume,
+#endif
+ .remove = ahd_linux_pci_dev_remove,
+ .id_table = ahd_linux_pci_id_table
+};
+
int
ahd_linux_pci_init(void)
{
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c
index 7a203a90601..df853676e66 100644
--- a/drivers/scsi/aic7xxx/aic79xx_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_pci.c
@@ -389,6 +389,7 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry)
return error;
}
+#ifdef CONFIG_PM
void
ahd_pci_suspend(struct ahd_softc *ahd)
{
@@ -415,6 +416,7 @@ ahd_pci_resume(struct ahd_softc *ahd)
ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME,
ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1);
}
+#endif
/*
* Perform some simple tests that should catch situations where
diff --git a/drivers/scsi/aic7xxx/aic7xxx.h b/drivers/scsi/aic7xxx/aic7xxx.h
index 3d4e42d9045..c0344e61765 100644
--- a/drivers/scsi/aic7xxx/aic7xxx.h
+++ b/drivers/scsi/aic7xxx/aic7xxx.h
@@ -1143,7 +1143,9 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t);
int ahc_pci_config(struct ahc_softc *,
struct ahc_pci_identity *);
int ahc_pci_test_register_access(struct ahc_softc *);
+#ifdef CONFIG_PM
void ahc_pci_resume(struct ahc_softc *ahc);
+#endif
/*************************** EISA/VL Front End ********************************/
struct aic7770_identity *aic7770_find_device(uint32_t);
@@ -1170,8 +1172,10 @@ int ahc_chip_init(struct ahc_softc *ahc);
int ahc_init(struct ahc_softc *ahc);
void ahc_intr_enable(struct ahc_softc *ahc, int enable);
void ahc_pause_and_flushwork(struct ahc_softc *ahc);
+#ifdef CONFIG_PM
int ahc_suspend(struct ahc_softc *ahc);
int ahc_resume(struct ahc_softc *ahc);
+#endif
void ahc_set_unit(struct ahc_softc *, int);
void ahc_set_name(struct ahc_softc *, char *);
void ahc_alloc_scbs(struct ahc_softc *ahc);
diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index f350b5e89e7..6d2ae641273 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c
@@ -5078,6 +5078,7 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc)
ahc->flags &= ~AHC_ALL_INTERRUPTS;
}
+#ifdef CONFIG_PM
int
ahc_suspend(struct ahc_softc *ahc)
{
@@ -5113,7 +5114,7 @@ ahc_resume(struct ahc_softc *ahc)
ahc_restart(ahc);
return (0);
}
-
+#endif
/************************** Busy Target Table *********************************/
/*
* Return the untagged transaction id for a given target/channel lun.
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index e310e414067..99a3b33a323 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -747,7 +747,6 @@ struct scsi_host_template aic7xxx_driver_template = {
.max_sectors = 8192,
.cmd_per_lun = 2,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.slave_alloc = ahc_linux_slave_alloc,
.slave_configure = ahc_linux_slave_configure,
.target_alloc = ahc_linux_target_alloc,
@@ -1658,9 +1657,12 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb)
untagged_q = &(ahc->untagged_queues[target_offset]);
TAILQ_REMOVE(untagged_q, scb, links.tqe);
BUG_ON(!TAILQ_EMPTY(untagged_q));
- }
-
- if ((scb->flags & SCB_ACTIVE) == 0) {
+ } else if ((scb->flags & SCB_ACTIVE) == 0) {
+ /*
+ * Transactions aborted from the untagged queue may
+ * not have been dispatched to the controller, so
+ * only check the SCB_ACTIVE flag for tagged transactions.
+ */
printf("SCB %d done'd twice\n", scb->hscb->tag);
ahc_dump_card_state(ahc);
panic("Stopping for safety");
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
index 4488946cff2..dd6e21d6f1d 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
@@ -42,17 +42,6 @@
#include "aic7xxx_osm.h"
#include "aic7xxx_pci.h"
-static int ahc_linux_pci_dev_probe(struct pci_dev *pdev,
- const struct pci_device_id *ent);
-static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc,
- u_long *base);
-static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc,
- u_long *bus_addr,
- uint8_t __iomem **maddr);
-static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
-static int ahc_linux_pci_dev_resume(struct pci_dev *pdev);
-static void ahc_linux_pci_dev_remove(struct pci_dev *pdev);
-
/* Define the macro locally since it's different for different class of chips.
*/
#define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI)
@@ -132,17 +121,7 @@ static struct pci_device_id ahc_linux_pci_id_table[] = {
MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table);
-static struct pci_driver aic7xxx_pci_driver = {
- .name = "aic7xxx",
- .probe = ahc_linux_pci_dev_probe,
#ifdef CONFIG_PM
- .suspend = ahc_linux_pci_dev_suspend,
- .resume = ahc_linux_pci_dev_resume,
-#endif
- .remove = ahc_linux_pci_dev_remove,
- .id_table = ahc_linux_pci_id_table
-};
-
static int
ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
@@ -182,6 +161,7 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev)
return (ahc_resume(ahc));
}
+#endif
static void
ahc_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -289,6 +269,17 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return (0);
}
+static struct pci_driver aic7xxx_pci_driver = {
+ .name = "aic7xxx",
+ .probe = ahc_linux_pci_dev_probe,
+#ifdef CONFIG_PM
+ .suspend = ahc_linux_pci_dev_suspend,
+ .resume = ahc_linux_pci_dev_resume,
+#endif
+ .remove = ahc_linux_pci_dev_remove,
+ .id_table = ahc_linux_pci_id_table
+};
+
int
ahc_linux_pci_init(void)
{
diff --git a/drivers/scsi/aic7xxx/aic7xxx_pci.c b/drivers/scsi/aic7xxx/aic7xxx_pci.c
index ae35937b805..56848f41e4f 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_pci.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_pci.c
@@ -2020,6 +2020,7 @@ ahc_pci_chip_init(struct ahc_softc *ahc)
return (ahc_chip_init(ahc));
}
+#ifdef CONFIG_PM
void
ahc_pci_resume(struct ahc_softc *ahc)
{
@@ -2051,6 +2052,7 @@ ahc_pci_resume(struct ahc_softc *ahc)
ahc_release_seeprom(&sd);
}
}
+#endif
static int
ahc_aic785X_setup(struct ahc_softc *ahc)
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index bcb0b870320..3bfd9296bbf 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -11141,7 +11141,6 @@ static struct scsi_host_template driver_template = {
.max_sectors = 2048,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index d80dba913a7..f4a202e8df2 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -122,7 +122,6 @@ static struct scsi_host_template arcmsr_scsi_host_template = {
.max_sectors = ARCMSR_MAX_XFER_SECTORS,
.cmd_per_lun = ARCMSR_MAX_CMD_PERLUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = arcmsr_host_attrs,
};
#ifdef CONFIG_SCSI_ARCMSR_AER
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index f93c73c0ba5..22ef3716e78 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -4763,7 +4763,6 @@ static struct scsi_host_template dc395x_driver_template = {
.eh_bus_reset_handler = dc395x_eh_bus_reset,
.unchecked_isa_dma = 0,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index 19cce125124..c9dd8392aab 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -3340,7 +3340,6 @@ static struct scsi_host_template driver_template = {
.this_id = 7,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index 05163cefec1..8be3d76656f 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -524,7 +524,6 @@ static struct scsi_host_template driver_template = {
.this_id = 7,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5ea1f986220..880c78bff0e 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -342,7 +342,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->use_clustering = sht->use_clustering;
shost->ordered_tag = sht->ordered_tag;
shost->active_mode = sht->supported_mode;
- shost->use_sg_chaining = sht->use_sg_chaining;
if (sht->supported_mode == MODE_UNKNOWN)
/* means we didn't set it ... default to INITIATOR */
diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c
index e7b2f3575ce..ff149ad6bc4 100644
--- a/drivers/scsi/hptiop.c
+++ b/drivers/scsi/hptiop.c
@@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag,
scsi_set_resid(scp,
scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length));
scp->result = SAM_STAT_CHECK_CONDITION;
- memcpy(&scp->sense_buffer, &req->sg_list,
+ memcpy(scp->sense_buffer, &req->sg_list,
min_t(size_t, SCSI_SENSE_BUFFERSIZE,
le32_to_cpu(req->dataxfer_length)));
break;
@@ -906,7 +906,6 @@ static struct scsi_host_template driver_template = {
.unchecked_isa_dma = 0,
.emulated = 0,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.proc_name = driver_name,
.shost_attrs = hptiop_attrs,
.this_id = -1,
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index db004a45073..4d15a62914e 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -1501,7 +1501,6 @@ static struct scsi_host_template ibmmca_driver_template = {
.sg_tablesize = 16,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int ibmmca_probe(struct device *dev)
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 30819012898..78d46a900bb 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -1600,7 +1600,6 @@ static struct scsi_host_template driver_template = {
.this_id = -1,
.sg_tablesize = SG_ALL,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = ibmvscsi_attrs,
};
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index a10a5c74b48..0cc8868ea35 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = {
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int initio_probe_one(struct pci_dev *pdev,
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index e5be5fd4ef5..b6f99dfbb03 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -1933,7 +1933,6 @@ static struct scsi_host_template iscsi_sht = {
.eh_device_reset_handler= iscsi_eh_device_reset,
.eh_host_reset_handler = iscsi_eh_host_reset,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.slave_configure = iscsi_tcp_slave_configure,
.proc_name = "iscsi_tcp",
.this_id = -1,
diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c
index 5cff0204227..6d6a76e65a6 100644
--- a/drivers/scsi/libsrp.c
+++ b/drivers/scsi/libsrp.c
@@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info,
sc->SCp.ptr = info;
memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE);
- sc->request_bufflen = len;
- sc->request_buffer = (void *) (unsigned long) addr;
+ sc->sdb.length = len;
+ sc->sdb.table.sgl = (void *) (unsigned long) addr;
sc->tag = tag;
err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun,
cmd->tag);
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 6483c62730b..fc5c3a42b05 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1459,7 +1459,6 @@ struct scsi_host_template lpfc_template = {
.scan_finished = lpfc_scan_finished,
.this_id = -1,
.sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.cmd_per_lun = LPFC_CMD_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
.shost_attrs = lpfc_hba_attrs,
@@ -1482,7 +1481,6 @@ struct scsi_host_template lpfc_vport_template = {
.sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT,
.cmd_per_lun = LPFC_CMD_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = lpfc_vport_attrs,
.max_sectors = 0xFFFF,
};
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c
index a035001f443..b12ad7c7c67 100644
--- a/drivers/scsi/mac53c94.c
+++ b/drivers/scsi/mac53c94.c
@@ -402,7 +402,6 @@ static struct scsi_host_template mac53c94_template = {
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 765c24d2bc3..4d59ae8491a 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -4490,7 +4490,6 @@ static struct scsi_host_template megaraid_template = {
.sg_tablesize = MAX_SGLIST,
.cmd_per_lun = DEF_CMD_PER_LUN,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.eh_abort_handler = megaraid_abort,
.eh_device_reset_handler = megaraid_reset,
.eh_bus_reset_handler = megaraid_reset,
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 24e32e446e7..6db77c00e3e 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -361,7 +361,6 @@ static struct scsi_host_template megaraid_template_g = {
.eh_host_reset_handler = megaraid_reset_handler,
.change_queue_depth = megaraid_change_queue_depth,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sdev_attrs = megaraid_sdev_attrs,
.shost_attrs = megaraid_shost_attrs,
};
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index d7ec921865c..672c759ac24 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -1192,7 +1192,6 @@ static struct scsi_host_template megasas_template = {
.eh_timed_out = megasas_reset_timer,
.bios_param = megasas_bios_param,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
/**
diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c
index 7470ff39ab2..651d09b08f2 100644
--- a/drivers/scsi/mesh.c
+++ b/drivers/scsi/mesh.c
@@ -1843,7 +1843,6 @@ static struct scsi_host_template mesh_template = {
.sg_tablesize = SG_ALL,
.cmd_per_lun = 2,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match)
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index c02771aa6c9..c5ebf018b37 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp)
sizeof(cp->sense_buf)));
if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) {
- u_char * p = (u_char*) & cmd->sense_buffer;
+ u_char *p = cmd->sense_buffer;
int i;
PRINT_ADDR(cmd, "sense data:");
for (i=0; i<14; i++) printk (" %x", *p++);
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index 28161dc95e0..7fed3537215 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -281,7 +281,6 @@ static struct scsi_host_template nsp32_template = {
.cmd_per_lun = 1,
.this_id = NSP32_HOST_SCSIID,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.eh_abort_handler = nsp32_eh_abort,
.eh_bus_reset_handler = nsp32_eh_bus_reset,
.eh_host_reset_handler = nsp32_eh_host_reset,
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 969b9387a0c..3454a571474 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -692,7 +692,6 @@ static struct scsi_host_template sym53c500_driver_template = {
.sg_tablesize = 32,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.shost_attrs = SYM53C500_shost_attrs
};
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index c94906abfee..68c0d09ffe7 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4204,7 +4204,6 @@ static struct scsi_host_template qla1280_driver_template = {
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index aba1e6d4806..3954ed2d7b5 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -131,7 +131,6 @@ static struct scsi_host_template qla2x00_driver_template = {
.this_id = -1,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sg_tablesize = SG_ALL,
/*
@@ -163,7 +162,6 @@ struct scsi_host_template qla24xx_driver_template = {
.this_id = -1,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sg_tablesize = SG_ALL,
.max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index d3f86646cb0..2e2b9fedffc 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -94,7 +94,6 @@ static struct scsi_host_template qla4xxx_driver_template = {
.this_id = -1,
.cmd_per_lun = 3,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.sg_tablesize = SG_ALL,
.max_sectors = 0xFFFF,
diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c
index 1769f965eed..1e874f1fb5c 100644
--- a/drivers/scsi/qlogicfas.c
+++ b/drivers/scsi/qlogicfas.c
@@ -197,7 +197,6 @@ static struct scsi_host_template qlogicfas_driver_template = {
.sg_tablesize = SG_ALL,
.cmd_per_lun = 1,
.use_clustering = DISABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static __init int qlogicfas_init(void)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1a9fba6a9f9..b35d19472ca 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
"Notifying upper driver of completion "
"(result %x)\n", cmd->result));
- good_bytes = cmd->request_bufflen;
+ good_bytes = scsi_bufflen(cmd);
if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
drv = scsi_cmd_to_driver(cmd);
if (drv->done)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 82c06f0a9d0..1541c174937 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -280,6 +280,8 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba,
unsigned int num, struct sdebug_dev_info * devip);
static int resp_report_luns(struct scsi_cmnd * SCpnt,
struct sdebug_dev_info * devip);
+static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
+ unsigned int num, struct sdebug_dev_info *devip);
static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
int arr_len);
static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
@@ -311,12 +313,48 @@ static void sdebug_max_tgts_luns(void);
static struct device pseudo_primary;
static struct bus_type pseudo_lld_bus;
+static void get_data_transfer_info(unsigned char *cmd,
+ unsigned long long *lba, unsigned int *num)
+{
+ int i;
+
+ switch (*cmd) {
+ case WRITE_16:
+ case READ_16:
+ for (*lba = 0, i = 0; i < 8; ++i) {
+ if (i > 0)
+ *lba <<= 8;
+ *lba += cmd[2 + i];
+ }
+ *num = cmd[13] + (cmd[12] << 8) +
+ (cmd[11] << 16) + (cmd[10] << 24);
+ break;
+ case WRITE_12:
+ case READ_12:
+ *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
+ *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
+ break;
+ case WRITE_10:
+ case READ_10:
+ case XDWRITEREAD_10:
+ *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
+ *num = cmd[8] + (cmd[7] << 8);
+ break;
+ case WRITE_6:
+ case READ_6:
+ *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16);
+ *num = (0 == cmd[4]) ? 256 : cmd[4];
+ break;
+ default:
+ break;
+ }
+}
static
int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
{
unsigned char *cmd = (unsigned char *) SCpnt->cmnd;
- int len, k, j;
+ int len, k;
unsigned int num;
unsigned long long lba;
int errsts = 0;
@@ -452,28 +490,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
break;
if (scsi_debug_fake_rw)
break;
- if ((*cmd) == READ_16) {
- for (lba = 0, j = 0; j < 8; ++j) {
- if (j > 0)
- lba <<= 8;
- lba += cmd[2 + j];
- }
- num = cmd[13] + (cmd[12] << 8) +
- (cmd[11] << 16) + (cmd[10] << 24);
- } else if ((*cmd) == READ_12) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[9] + (cmd[8] << 8) +
- (cmd[7] << 16) + (cmd[6] << 24);
- } else if ((*cmd) == READ_10) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[8] + (cmd[7] << 8);
- } else { /* READ (6) */
- lba = cmd[3] + (cmd[2] << 8) +
- ((cmd[1] & 0x1f) << 16);
- num = (0 == cmd[4]) ? 256 : cmd[4];
- }
+ get_data_transfer_info(cmd, &lba, &num);
errsts = resp_read(SCpnt, lba, num, devip);
if (inj_recovered && (0 == errsts)) {
mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -500,28 +517,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
break;
if (scsi_debug_fake_rw)
break;
- if ((*cmd) == WRITE_16) {
- for (lba = 0, j = 0; j < 8; ++j) {
- if (j > 0)
- lba <<= 8;
- lba += cmd[2 + j];
- }
- num = cmd[13] + (cmd[12] << 8) +
- (cmd[11] << 16) + (cmd[10] << 24);
- } else if ((*cmd) == WRITE_12) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[9] + (cmd[8] << 8) +
- (cmd[7] << 16) + (cmd[6] << 24);
- } else if ((*cmd) == WRITE_10) {
- lba = cmd[5] + (cmd[4] << 8) +
- (cmd[3] << 16) + (cmd[2] << 24);
- num = cmd[8] + (cmd[7] << 8);
- } else { /* WRITE (6) */
- lba = cmd[3] + (cmd[2] << 8) +
- ((cmd[1] & 0x1f) << 16);
- num = (0 == cmd[4]) ? 256 : cmd[4];
- }
+ get_data_transfer_info(cmd, &lba, &num);
errsts = resp_write(SCpnt, lba, num, devip);
if (inj_recovered && (0 == errsts)) {
mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -549,6 +545,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
case WRITE_BUFFER:
errsts = check_readiness(SCpnt, 1, devip);
break;
+ case XDWRITEREAD_10:
+ if (!scsi_bidi_cmnd(SCpnt)) {
+ mk_sense_buffer(devip, ILLEGAL_REQUEST,
+ INVALID_FIELD_IN_CDB, 0);
+ errsts = check_condition_result;
+ break;
+ }
+
+ errsts = check_readiness(SCpnt, 0, devip);
+ if (errsts)
+ break;
+ if (scsi_debug_fake_rw)
+ break;
+ get_data_transfer_info(cmd, &lba, &num);
+ errsts = resp_read(SCpnt, lba, num, devip);
+ if (errsts)
+ break;
+ errsts = resp_write(SCpnt, lba, num, devip);
+ if (errsts)
+ break;
+ errsts = resp_xdwriteread(SCpnt, lba, num, devip);
+ break;
default:
if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
printk(KERN_INFO "scsi_debug: Opcode: 0x%x not "
@@ -601,18 +619,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
int k, req_len, act_len, len, active;
void * kaddr;
void * kaddr_off;
- struct scatterlist * sg;
+ struct scatterlist *sg;
+ struct scsi_data_buffer *sdb = scsi_in(scp);
- if (0 == scsi_bufflen(scp))
+ if (!sdb->length)
return 0;
- if (NULL == scsi_sglist(scp))
+ if (!sdb->table.sgl)
return (DID_ERROR << 16);
- if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) ||
- (scp->sc_data_direction == DMA_FROM_DEVICE)))
+ if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
return (DID_ERROR << 16);
active = 1;
req_len = act_len = 0;
- scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
+ for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) {
if (active) {
kaddr = (unsigned char *)
kmap_atomic(sg_page(sg), KM_USER0);
@@ -630,10 +648,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
}
req_len += sg->length;
}
- if (scsi_get_resid(scp))
- scsi_set_resid(scp, scsi_get_resid(scp) - act_len);
+ if (sdb->resid)
+ sdb->resid -= act_len;
else
- scsi_set_resid(scp, req_len - act_len);
+ sdb->resid = req_len - act_len;
return 0;
}
@@ -650,8 +668,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
return 0;
if (NULL == scsi_sglist(scp))
return -1;
- if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) ||
- (scp->sc_data_direction == DMA_TO_DEVICE)))
+ if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
return -1;
req_len = fin = 0;
scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
@@ -1956,6 +1973,50 @@ static int resp_report_luns(struct scsi_cmnd * scp,
min((int)alloc_len, SDEBUG_RLUN_ARR_SZ));
}
+static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
+ unsigned int num, struct sdebug_dev_info *devip)
+{
+ int i, j, ret = -1;
+ unsigned char *kaddr, *buf;
+ unsigned int offset;
+ struct scatterlist *sg;
+ struct scsi_data_buffer *sdb = scsi_in(scp);
+
+ /* better not to use temporary buffer. */
+ buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC);
+ if (!buf)
+ return ret;
+
+ offset = 0;
+ scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) {
+ kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
+ if (!kaddr)
+ goto out;
+
+ memcpy(buf + offset, kaddr + sg->offset, sg->length);
+ offset += sg->length;
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+
+ offset = 0;
+ for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) {
+ kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
+ if (!kaddr)
+ goto out;
+
+ for (j = 0; j < sg->length; j++)
+ *(kaddr + sg->offset + j) ^= *(buf + offset + j);
+
+ offset += sg->length;
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ ret = 0;
+out:
+ kfree(buf);
+
+ return ret;
+}
+
/* When timer goes off this function is called. */
static void timer_intr_handler(unsigned long indx)
{
@@ -1989,6 +2050,7 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp)
if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n",
sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
+ set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags);
return 0;
}
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 547e85aa414..045a0868fc7 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -617,29 +617,27 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
ses->cmd_len = scmd->cmd_len;
memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd));
ses->data_direction = scmd->sc_data_direction;
- ses->bufflen = scmd->request_bufflen;
- ses->buffer = scmd->request_buffer;
- ses->use_sg = scmd->use_sg;
- ses->resid = scmd->resid;
+ ses->sdb = scmd->sdb;
+ ses->next_rq = scmd->request->next_rq;
ses->result = scmd->result;
+ memset(&scmd->sdb, 0, sizeof(scmd->sdb));
+ scmd->request->next_rq = NULL;
+
if (sense_bytes) {
- scmd->request_bufflen = min_t(unsigned,
- SCSI_SENSE_BUFFERSIZE, sense_bytes);
+ scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE,
+ sense_bytes);
sg_init_one(&ses->sense_sgl, scmd->sense_buffer,
- scmd->request_bufflen);
- scmd->request_buffer = &ses->sense_sgl;
+ scmd->sdb.length);
+ scmd->sdb.table.sgl = &ses->sense_sgl;
scmd->sc_data_direction = DMA_FROM_DEVICE;
- scmd->use_sg = 1;
+ scmd->sdb.table.nents = 1;
memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
scmd->cmnd[0] = REQUEST_SENSE;
- scmd->cmnd[4] = scmd->request_bufflen;
+ scmd->cmnd[4] = scmd->sdb.length;
scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
} else {
- scmd->request_buffer = NULL;
- scmd->request_bufflen = 0;
scmd->sc_data_direction = DMA_NONE;
- scmd->use_sg = 0;
if (cmnd) {
memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
memcpy(scmd->cmnd, cmnd, cmnd_size);
@@ -676,10 +674,8 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses)
scmd->cmd_len = ses->cmd_len;
memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd));
scmd->sc_data_direction = ses->data_direction;
- scmd->request_bufflen = ses->bufflen;
- scmd->request_buffer = ses->buffer;
- scmd->use_sg = ses->use_sg;
- scmd->resid = ses->resid;
+ scmd->sdb = ses->sdb;
+ scmd->request->next_rq = ses->next_rq;
scmd->result = ses->result;
}
EXPORT_SYMBOL(scsi_eh_restore_cmnd);
@@ -1700,8 +1696,7 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
scmd->scsi_done = scsi_reset_provider_done_command;
- scmd->request_buffer = NULL;
- scmd->request_bufflen = 0;
+ memset(&scmd->sdb, 0, sizeof(scmd->sdb));
scmd->cmd_len = 0;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7c4c889c522..b12fb310e39 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -8,6 +8,7 @@
*/
#include <linux/bio.h>
+#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/kernel.h>
@@ -34,13 +35,6 @@
#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools)
#define SG_MEMPOOL_SIZE 2
-/*
- * The maximum number of SG segments that we will put inside a scatterlist
- * (unless chaining is used). Should ideally fit inside a single page, to
- * avoid a higher order allocation.
- */
-#define SCSI_MAX_SG_SEGMENTS 128
-
struct scsi_host_sg_pool {
size_t size;
char *name;
@@ -48,22 +42,31 @@ struct scsi_host_sg_pool {
mempool_t *pool;
};
-#define SP(x) { x, "sgpool-" #x }
+#define SP(x) { x, "sgpool-" __stringify(x) }
+#if (SCSI_MAX_SG_SEGMENTS < 32)
+#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater)
+#endif
static struct scsi_host_sg_pool scsi_sg_pools[] = {
SP(8),
SP(16),
-#if (SCSI_MAX_SG_SEGMENTS > 16)
- SP(32),
#if (SCSI_MAX_SG_SEGMENTS > 32)
- SP(64),
+ SP(32),
#if (SCSI_MAX_SG_SEGMENTS > 64)
+ SP(64),
+#if (SCSI_MAX_SG_SEGMENTS > 128)
SP(128),
+#if (SCSI_MAX_SG_SEGMENTS > 256)
+#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX)
+#endif
#endif
#endif
#endif
+ SP(SCSI_MAX_SG_SEGMENTS)
};
#undef SP
+static struct kmem_cache *scsi_bidi_sdb_cache;
+
static void scsi_run_queue(struct request_queue *q);
/*
@@ -440,7 +443,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async);
static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
{
cmd->serial_number = 0;
- cmd->resid = 0;
+ scsi_set_resid(cmd, 0);
memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
if (cmd->cmd_len == 0)
cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]);
@@ -690,42 +693,16 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
return NULL;
}
-/*
- * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
- * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
- */
-#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048
-
static inline unsigned int scsi_sgtable_index(unsigned short nents)
{
unsigned int index;
- switch (nents) {
- case 1 ... 8:
+ BUG_ON(nents > SCSI_MAX_SG_SEGMENTS);
+
+ if (nents <= 8)
index = 0;
- break;
- case 9 ... 16:
- index = 1;
- break;
-#if (SCSI_MAX_SG_SEGMENTS > 16)
- case 17 ... 32:
- index = 2;
- break;
-#if (SCSI_MAX_SG_SEGMENTS > 32)
- case 33 ... 64:
- index = 3;
- break;
-#if (SCSI_MAX_SG_SEGMENTS > 64)
- case 65 ... 128:
- index = 4;
- break;
-#endif
-#endif
-#endif
- default:
- printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
- BUG();
- }
+ else
+ index = get_count_order(nents) - 3;
return index;
}
@@ -746,31 +723,27 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask)
return mempool_alloc(sgp->pool, gfp_mask);
}
-int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents,
+ gfp_t gfp_mask)
{
int ret;
- BUG_ON(!cmd->use_sg);
+ BUG_ON(!nents);
- ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg,
- SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc);
+ ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS,
+ gfp_mask, scsi_sg_alloc);
if (unlikely(ret))
- __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS,
+ __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS,
scsi_sg_free);
- cmd->request_buffer = cmd->sg_table.sgl;
return ret;
}
-EXPORT_SYMBOL(scsi_alloc_sgtable);
-
-void scsi_free_sgtable(struct scsi_cmnd *cmd)
+static void scsi_free_sgtable(struct scsi_data_buffer *sdb)
{
- __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
+ __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
}
-EXPORT_SYMBOL(scsi_free_sgtable);
-
/*
* Function: scsi_release_buffers()
*
@@ -788,17 +761,49 @@ EXPORT_SYMBOL(scsi_free_sgtable);
* the scatter-gather table, and potentially any bounce
* buffers.
*/
-static void scsi_release_buffers(struct scsi_cmnd *cmd)
+void scsi_release_buffers(struct scsi_cmnd *cmd)
+{
+ if (cmd->sdb.table.nents)
+ scsi_free_sgtable(&cmd->sdb);
+
+ memset(&cmd->sdb, 0, sizeof(cmd->sdb));
+
+ if (scsi_bidi_cmnd(cmd)) {
+ struct scsi_data_buffer *bidi_sdb =
+ cmd->request->next_rq->special;
+ scsi_free_sgtable(bidi_sdb);
+ kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb);
+ cmd->request->next_rq->special = NULL;
+ }
+}
+EXPORT_SYMBOL(scsi_release_buffers);
+
+/*
+ * Bidi commands Must be complete as a whole, both sides at once.
+ * If part of the bytes were written and lld returned
+ * scsi_in()->resid and/or scsi_out()->resid this information will be left
+ * in req->data_len and req->next_rq->data_len. The upper-layer driver can
+ * decide what to do with this information.
+ */
+void scsi_end_bidi_request(struct scsi_cmnd *cmd)
{
- if (cmd->use_sg)
- scsi_free_sgtable(cmd);
+ struct request *req = cmd->request;
+ unsigned int dlen = req->data_len;
+ unsigned int next_dlen = req->next_rq->data_len;
+
+ req->data_len = scsi_out(cmd)->resid;
+ req->next_rq->data_len = scsi_in(cmd)->resid;
+
+ /* The req and req->next_rq have not been completed */
+ BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
+
+ scsi_release_buffers(cmd);
/*
- * Zero these out. They now point to freed memory, and it is
- * dangerous to hang onto the pointers.
+ * This will goose the queue request function at the end, so we don't
+ * need to worry about launching another command.
*/
- cmd->request_buffer = NULL;
- cmd->request_bufflen = 0;
+ scsi_next_command(cmd);
}
/*
@@ -832,7 +837,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
{
int result = cmd->result;
- int this_count = cmd->request_bufflen;
+ int this_count = scsi_bufflen(cmd);
struct request_queue *q = cmd->device->request_queue;
struct request *req = cmd->request;
int clear_errors = 1;
@@ -840,8 +845,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
int sense_valid = 0;
int sense_deferred = 0;
- scsi_release_buffers(cmd);
-
if (result) {
sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
if (sense_valid)
@@ -864,9 +867,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
req->sense_len = len;
}
}
- req->data_len = cmd->resid;
+ if (scsi_bidi_cmnd(cmd)) {
+ /* will also release_buffers */
+ scsi_end_bidi_request(cmd);
+ return;
+ }
+ req->data_len = scsi_get_resid(cmd);
}
+ BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
+ scsi_release_buffers(cmd);
+
/*
* Next deal with any sectors which we were able to correctly
* handle.
@@ -874,7 +885,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
"%d bytes done.\n",
req->nr_sectors, good_bytes));
- SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
if (clear_errors)
req->errors = 0;
@@ -991,52 +1001,80 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
scsi_end_request(cmd, -EIO, this_count, !result);
}
-/*
- * Function: scsi_init_io()
- *
- * Purpose: SCSI I/O initialize function.
- *
- * Arguments: cmd - Command descriptor we wish to initialize
- *
- * Returns: 0 on success
- * BLKPREP_DEFER if the failure is retryable
- */
-static int scsi_init_io(struct scsi_cmnd *cmd)
+static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
+ gfp_t gfp_mask)
{
- struct request *req = cmd->request;
- int count;
-
- /*
- * We used to not use scatter-gather for single segment request,
- * but now we do (it makes highmem I/O easier to support without
- * kmapping pages)
- */
- cmd->use_sg = req->nr_phys_segments;
+ int count;
/*
* If sg table allocation fails, requeue request later.
*/
- if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) {
- scsi_unprep_request(req);
+ if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
+ gfp_mask))) {
return BLKPREP_DEFER;
}
req->buffer = NULL;
if (blk_pc_request(req))
- cmd->request_bufflen = req->data_len;
+ sdb->length = req->data_len;
else
- cmd->request_bufflen = req->nr_sectors << 9;
+ sdb->length = req->nr_sectors << 9;
/*
* Next, walk the list, and fill in the addresses and sizes of
* each segment.
*/
- count = blk_rq_map_sg(req->q, req, cmd->request_buffer);
- BUG_ON(count > cmd->use_sg);
- cmd->use_sg = count;
+ count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
+ BUG_ON(count > sdb->table.nents);
+ sdb->table.nents = count;
return BLKPREP_OK;
}
+/*
+ * Function: scsi_init_io()
+ *
+ * Purpose: SCSI I/O initialize function.
+ *
+ * Arguments: cmd - Command descriptor we wish to initialize
+ *
+ * Returns: 0 on success
+ * BLKPREP_DEFER if the failure is retryable
+ * BLKPREP_KILL if the failure is fatal
+ */
+int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+{
+ int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
+ if (error)
+ goto err_exit;
+
+ if (blk_bidi_rq(cmd->request)) {
+ struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
+ scsi_bidi_sdb_cache, GFP_ATOMIC);
+ if (!bidi_sdb) {
+ error = BLKPREP_DEFER;
+ goto err_exit;
+ }
+
+ cmd->request->next_rq->special = bidi_sdb;
+ error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
+ GFP_ATOMIC);
+ if (error)
+ goto err_exit;
+ }
+
+ return BLKPREP_OK ;
+
+err_exit:
+ scsi_release_buffers(cmd);
+ if (error == BLKPREP_KILL)
+ scsi_put_command(cmd);
+ else /* BLKPREP_DEFER */
+ scsi_unprep_request(cmd->request);
+
+ return error;
+}
+EXPORT_SYMBOL(scsi_init_io);
+
static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
struct request *req)
{
@@ -1081,16 +1119,14 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
BUG_ON(!req->nr_phys_segments);
- ret = scsi_init_io(cmd);
+ ret = scsi_init_io(cmd, GFP_ATOMIC);
if (unlikely(ret))
return ret;
} else {
BUG_ON(req->data_len);
BUG_ON(req->data);
- cmd->request_bufflen = 0;
- cmd->request_buffer = NULL;
- cmd->use_sg = 0;
+ memset(&cmd->sdb, 0, sizeof(cmd->sdb));
req->buffer = NULL;
}
@@ -1132,7 +1168,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
if (unlikely(!cmd))
return BLKPREP_DEFER;
- return scsi_init_io(cmd);
+ return scsi_init_io(cmd, GFP_ATOMIC);
}
EXPORT_SYMBOL(scsi_setup_fs_cmnd);
@@ -1542,20 +1578,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
* this limit is imposed by hardware restrictions
*/
blk_queue_max_hw_segments(q, shost->sg_tablesize);
-
- /*
- * In the future, sg chaining support will be mandatory and this
- * ifdef can then go away. Right now we don't have all archs
- * converted, so better keep it safe.
- */
-#ifdef ARCH_HAS_SG_CHAIN
- if (shost->use_sg_chaining)
- blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
- else
- blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
-#else
- blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
-#endif
+ blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
blk_queue_max_sectors(q, shost->max_sectors);
blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
@@ -1654,6 +1677,14 @@ int __init scsi_init_queue(void)
return -ENOMEM;
}
+ scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb",
+ sizeof(struct scsi_data_buffer),
+ 0, 0, NULL);
+ if (!scsi_bidi_sdb_cache) {
+ printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n");
+ goto cleanup_io_context;
+ }
+
for (i = 0; i < SG_MEMPOOL_NR; i++) {
struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
int size = sgp->size * sizeof(struct scatterlist);
@@ -1663,6 +1694,7 @@ int __init scsi_init_queue(void)
if (!sgp->slab) {
printk(KERN_ERR "SCSI: can't init sg slab %s\n",
sgp->name);
+ goto cleanup_bidi_sdb;
}
sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
@@ -1670,10 +1702,25 @@ int __init scsi_init_queue(void)
if (!sgp->pool) {
printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
sgp->name);
+ goto cleanup_bidi_sdb;
}
}
return 0;
+
+cleanup_bidi_sdb:
+ for (i = 0; i < SG_MEMPOOL_NR; i++) {
+ struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
+ if (sgp->pool)
+ mempool_destroy(sgp->pool);
+ if (sgp->slab)
+ kmem_cache_destroy(sgp->slab);
+ }
+ kmem_cache_destroy(scsi_bidi_sdb_cache);
+cleanup_io_context:
+ kmem_cache_destroy(scsi_io_context_cache);
+
+ return -ENOMEM;
}
void scsi_exit_queue(void)
@@ -1681,6 +1728,7 @@ void scsi_exit_queue(void)
int i;
kmem_cache_destroy(scsi_io_context_cache);
+ kmem_cache_destroy(scsi_bidi_sdb_cache);
for (i = 0; i < SG_MEMPOOL_NR; i++) {
struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 01e03f3f6ff..91630baea53 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -331,8 +331,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd)
scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag);
- if (scsi_sglist(cmd))
- scsi_free_sgtable(cmd);
+ scsi_release_buffers(cmd);
queue_work(scsi_tgtd, &tcmd->work);
}
@@ -353,25 +352,6 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd)
return 0;
}
-static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask)
-{
- struct request *rq = cmd->request;
- int count;
-
- cmd->use_sg = rq->nr_phys_segments;
- if (scsi_alloc_sgtable(cmd, gfp_mask))
- return -ENOMEM;
-
- cmd->request_bufflen = rq->data_len;
-
- dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd),
- rq_data_dir(rq));
- count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd));
- BUG_ON(count > cmd->use_sg);
- cmd->use_sg = count;
- return 0;
-}
-
/* TODO: test this crap and replace bio_map_user with new interface maybe */
static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
unsigned long uaddr, unsigned int len, int rw)
@@ -397,9 +377,11 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
}
tcmd->bio = rq->bio;
- err = scsi_tgt_init_cmd(cmd, GFP_KERNEL);
- if (err)
+ err = scsi_init_io(cmd, GFP_KERNEL);
+ if (err) {
+ scsi_release_buffers(cmd);
goto unmap_rq;
+ }
return 0;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 24eba3118b5..51a5557f42d 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
SCpnt->cmnd[4] = (unsigned char) this_count;
SCpnt->cmnd[5] = 0;
}
- SCpnt->request_bufflen = this_count * sdp->sector_size;
+ SCpnt->sdb.length = this_count * sdp->sector_size;
/*
* We shouldn't disconnect in the middle of a sector, so with a dumb
@@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = {
static int sd_done(struct scsi_cmnd *SCpnt)
{
int result = SCpnt->result;
- unsigned int xfer_size = SCpnt->request_bufflen;
+ unsigned int xfer_size = scsi_bufflen(SCpnt);
unsigned int good_bytes = result ? 0 : xfer_size;
u64 start_lba = SCpnt->request->sector;
u64 bad_lba;
diff --git a/drivers/scsi/sgiwd93.c b/drivers/scsi/sgiwd93.c
index d4ebe8c67ba..26cfc56c709 100644
--- a/drivers/scsi/sgiwd93.c
+++ b/drivers/scsi/sgiwd93.c
@@ -33,10 +33,9 @@
struct ip22_hostdata {
struct WD33C93_hostdata wh;
- struct hpc_data {
- dma_addr_t dma;
- void *cpu;
- } hd;
+ dma_addr_t dma;
+ void *cpu;
+ struct device *dev;
};
#define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata))
@@ -46,6 +45,11 @@ struct hpc_chunk {
u32 _padding; /* align to quadword boundary */
};
+/* space for hpc dma descriptors */
+#define HPC_DMA_SIZE PAGE_SIZE
+
+#define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
+
static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
{
struct Scsi_Host * host = dev_id;
@@ -59,15 +63,17 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
}
static inline
-void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
+void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din)
{
unsigned long len = cmd->SCp.this_residual;
void *addr = cmd->SCp.ptr;
dma_addr_t physaddr;
unsigned long count;
+ struct hpc_chunk *hcp;
- physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction);
+ physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din));
cmd->SCp.dma_handle = physaddr;
+ hcp = hd->cpu;
while (len) {
/*
@@ -89,6 +95,9 @@ void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
*/
hcp->desc.pbuf = 0;
hcp->desc.cntinfo = HPCDMA_EOX;
+ dma_cache_sync(hd->dev, hd->cpu,
+ (unsigned long)(hcp + 1) - (unsigned long)hd->cpu,
+ DMA_TO_DEVICE);
}
static int dma_setup(struct scsi_cmnd *cmd, int datainp)
@@ -96,9 +105,8 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host);
struct hpc3_scsiregs *hregs =
(struct hpc3_scsiregs *) cmd->device->host->base;
- struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu;
- pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp);
+ pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu);
hdata->wh.dma_dir = datainp;
@@ -111,12 +119,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0)
return 1;
- fill_hpc_entries(hcp, cmd, datainp);
+ fill_hpc_entries(hdata, cmd, datainp);
pr_debug(" HPCGO\n");
/* Start up the HPC. */
- hregs->ndptr = hdata->hd.dma;
+ hregs->ndptr = hdata->dma;
if (datainp)
hregs->ctrl = HPC3_SCTRL_ACTIVE;
else
@@ -134,6 +142,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
if (!SCpnt)
return;
+ if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0)
+ return;
+
hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base;
pr_debug("dma_stop: status<%d> ", status);
@@ -145,8 +156,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
barrier();
}
hregs->ctrl = 0;
- dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual,
- SCpnt->sc_data_direction);
+ dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle,
+ SCpnt->SCp.this_residual,
+ DMA_DIR(hdata->wh.dma_dir));
pr_debug("\n");
}
@@ -161,22 +173,23 @@ void sgiwd93_reset(unsigned long base)
}
EXPORT_SYMBOL_GPL(sgiwd93_reset);
-static inline void init_hpc_chain(struct hpc_data *hd)
+static inline void init_hpc_chain(struct ip22_hostdata *hdata)
{
- struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu;
- struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma;
+ struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu;
+ dma_addr_t dma = hdata->dma;
unsigned long start, end;
start = (unsigned long) hcp;
- end = start + PAGE_SIZE;
+ end = start + HPC_DMA_SIZE;
while (start < end) {
- hcp->desc.pnext = (u32) (dma + 1);
+ hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk));
hcp->desc.cntinfo = HPCDMA_EOX;
- hcp++; dma++;
+ hcp++;
+ dma += sizeof(struct hpc_chunk);
start += sizeof(struct hpc_chunk);
};
hcp--;
- hcp->desc.pnext = hd->dma;
+ hcp->desc.pnext = hdata->dma;
}
static int sgiwd93_bus_reset(struct scsi_cmnd *cmd)
@@ -235,16 +248,17 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
host->irq = irq;
hdata = host_to_hostdata(host);
- hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
- &hdata->hd.dma, GFP_KERNEL);
- if (!hdata->hd.cpu) {
+ hdata->dev = &pdev->dev;
+ hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE,
+ &hdata->dma, GFP_KERNEL);
+ if (!hdata->cpu) {
printk(KERN_WARNING "sgiwd93: Could not allocate memory for "
"host %d buffer.\n", unit);
err = -ENOMEM;
goto out_put;
}
- init_hpc_chain(&hdata->hd);
+ init_hpc_chain(hdata);
regs.SASR = wdregs + 3;
regs.SCMD = wdregs + 7;
@@ -274,7 +288,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
out_irq:
free_irq(irq, host);
out_free:
- dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma);
+ dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
out_put:
scsi_host_put(host);
out:
@@ -290,7 +304,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev)
scsi_remove_host(host);
free_irq(pd->irq, host);
- dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma);
+ dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
scsi_host_put(host);
}
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 1fcee16fa36..50ba4925020 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -231,7 +231,7 @@ out:
static int sr_done(struct scsi_cmnd *SCpnt)
{
int result = SCpnt->result;
- int this_count = SCpnt->request_bufflen;
+ int this_count = scsi_bufflen(SCpnt);
int good_bytes = (result == 0 ? this_count : 0);
int block_sectors = 0;
long error_sector;
@@ -379,17 +379,18 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
}
{
- struct scatterlist *sg = SCpnt->request_buffer;
- int i, size = 0;
- for (i = 0; i < SCpnt->use_sg; i++)
- size += sg[i].length;
+ struct scatterlist *sg;
+ int i, size = 0, sg_count = scsi_sg_count(SCpnt);
- if (size != SCpnt->request_bufflen && SCpnt->use_sg) {
+ scsi_for_each_sg(SCpnt, sg, sg_count, i)
+ size += sg->length;
+
+ if (size != scsi_bufflen(SCpnt)) {
scmd_printk(KERN_ERR, SCpnt,
"mismatch count %d, bytes %d\n",
- size, SCpnt->request_bufflen);
- if (SCpnt->request_bufflen > size)
- SCpnt->request_bufflen = size;
+ size, scsi_bufflen(SCpnt));
+ if (scsi_bufflen(SCpnt) > size)
+ SCpnt->sdb.length = size;
}
}
@@ -397,12 +398,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
* request doesn't start on hw block boundary, add scatter pads
*/
if (((unsigned int)rq->sector % (s_size >> 9)) ||
- (SCpnt->request_bufflen % s_size)) {
+ (scsi_bufflen(SCpnt) % s_size)) {
scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
goto out;
}
- this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9);
+ this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
@@ -416,7 +417,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
if (this_count > 0xffff) {
this_count = 0xffff;
- SCpnt->request_bufflen = this_count * s_size;
+ SCpnt->sdb.length = this_count * s_size;
}
SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c
index e3fab3a6aed..72f6d801535 100644
--- a/drivers/scsi/stex.c
+++ b/drivers/scsi/stex.c
@@ -1123,7 +1123,6 @@ static struct scsi_host_template driver_template = {
.this_id = -1,
.sg_tablesize = ST_MAX_SG,
.cmd_per_lun = ST_CMD_PER_LUN,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
static int stex_set_dma_mask(struct pci_dev * pdev)
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 1f6fd168033..6325901e509 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -840,6 +840,5 @@ static struct scsi_host_template driver_template = {
.cmd_per_lun = 1,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index 21e926dcdab..d39107b7669 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid)
/*
* Bounce back the sense data to user.
*/
- memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
+ memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
memcpy(cmd->sense_buffer, cp->sns_bbuf,
min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN));
#if 0
@@ -1681,7 +1681,6 @@ static struct scsi_host_template sym2_template = {
.eh_host_reset_handler = sym53c8xx_eh_host_reset_handler,
.this_id = 7,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
.max_sectors = 0xFFFF,
#ifdef SYM_LINUX_PROC_INFO_SUPPORT
.proc_info = sym53c8xx_proc_info,
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c
index 4bc5407f969..662c00451be 100644
--- a/drivers/scsi/u14-34f.c
+++ b/drivers/scsi/u14-34f.c
@@ -451,7 +451,6 @@ static struct scsi_host_template driver_template = {
.this_id = 7,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c
index 75eca6b22db..f385dce8dfb 100644
--- a/drivers/scsi/ultrastor.c
+++ b/drivers/scsi/ultrastor.c
@@ -1204,6 +1204,5 @@ static struct scsi_host_template driver_template = {
.cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c
index b4304ae7852..c975c01b3a0 100644
--- a/drivers/scsi/wd7000.c
+++ b/drivers/scsi/wd7000.c
@@ -1671,7 +1671,6 @@ static struct scsi_host_template driver_template = {
.cmd_per_lun = 1,
.unchecked_isa_dma = 1,
.use_clustering = ENABLE_CLUSTERING,
- .use_sg_chaining = ENABLE_SG_CHAINING,
};
#include "scsi_module.c"
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c
index 178e8c2a8a2..0db488624ab 100644
--- a/drivers/usb/storage/isd200.c
+++ b/drivers/usb/storage/isd200.c
@@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info,
sg_init_one(&info->sg, buff, bufflen);
srb->sc_data_direction = dir;
- srb->request_buffer = buff ? &info->sg : NULL;
- srb->request_bufflen = bufflen;
- srb->use_sg = buff ? 1 : 0;
+ srb->sdb.table.sgl = buff ? &info->sg : NULL;
+ srb->sdb.length = bufflen;
+ srb->sdb.table.nents = buff ? 1 : 0;
}
static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen)
{
- srb->request_bufflen = bufflen;
+ srb->sdb.length = bufflen;
}
diff --git a/drivers/video/vermilion/vermilion.c b/drivers/video/vermilion/vermilion.c
index c31f549ebea..1c656667b93 100644
--- a/drivers/video/vermilion/vermilion.c
+++ b/drivers/video/vermilion/vermilion.c
@@ -88,9 +88,7 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order,
{
gfp_t flags;
unsigned long i;
- pgprot_t wc_pageprot;
- wc_pageprot = PAGE_KERNEL_NOCACHE;
max_order++;
do {
/*
@@ -126,14 +124,8 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order,
/*
* Change caching policy of the linear kernel map to avoid
* mapping type conflicts with user-space mappings.
- * The first global_flush_tlb() is really only there to do a global
- * wbinvd().
*/
-
- global_flush_tlb();
- change_page_attr(virt_to_page(va->logical), va->size >> PAGE_SHIFT,
- wc_pageprot);
- global_flush_tlb();
+ set_pages_uc(virt_to_page(va->logical), va->size >> PAGE_SHIFT);
printk(KERN_DEBUG MODULE_NAME
": Allocated %ld bytes vram area at 0x%08lx\n",
@@ -157,9 +149,8 @@ static void vmlfb_free_vram_area(struct vram_area *va)
* Reset the linear kernel map caching policy.
*/
- change_page_attr(virt_to_page(va->logical),
- va->size >> PAGE_SHIFT, PAGE_KERNEL);
- global_flush_tlb();
+ set_pages_wb(virt_to_page(va->logical),
+ va->size >> PAGE_SHIFT);
/*
* Decrease the usage count on the pages we've used