summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/Kconfig13
-rw-r--r--arch/alpha/include/asm/local.h17
-rw-r--r--arch/arm/configs/rx51_defconfig11
-rw-r--r--arch/arm/include/asm/hardware/iop3xx-adma.h12
-rw-r--r--arch/arm/mach-mx2/devices.c80
-rw-r--r--arch/arm/mach-mx2/devices.h1
-rw-r--r--arch/arm/mach-u300/include/mach/coh901318.h2
-rw-r--r--arch/arm/plat-mxc/include/mach/mx21-usbhost.h38
-rw-r--r--arch/avr32/mach-at32ap/at32ap700x.c7
-rw-r--r--arch/blackfin/mach-common/entry.S4
-rw-r--r--arch/cris/arch-v10/kernel/entry.S2
-rw-r--r--arch/cris/arch-v32/mm/mmu.S2
-rw-r--r--arch/ia64/include/asm/percpu.h4
-rw-r--r--arch/ia64/include/asm/xen/events.h4
-rw-r--r--arch/ia64/kernel/acpi.c4
-rw-r--r--arch/ia64/kernel/ia64_ksyms.c4
-rw-r--r--arch/ia64/kvm/Kconfig1
-rw-r--r--arch/ia64/kvm/kvm-ia64.c50
-rw-r--r--arch/ia64/kvm/kvm_fw.c28
-rw-r--r--arch/ia64/kvm/mmio.c4
-rw-r--r--arch/ia64/kvm/vcpu.c4
-rw-r--r--arch/ia64/mm/discontig.c2
-rw-r--r--arch/m32r/include/asm/local.h25
-rw-r--r--arch/microblaze/include/asm/entry.h2
-rw-r--r--arch/mips/include/asm/local.h25
-rw-r--r--arch/parisc/lib/fixup.S8
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h6
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h11
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64_asm.h18
-rw-r--r--arch/powerpc/include/asm/kvm_e500.h3
-rw-r--r--arch/powerpc/include/asm/kvm_host.h23
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h83
-rw-r--r--arch/powerpc/include/asm/local.h25
-rw-r--r--arch/powerpc/include/asm/paca.h5
-rw-r--r--arch/powerpc/include/asm/reg.h4
-rw-r--r--arch/powerpc/kernel/asm-offsets.c33
-rw-r--r--arch/powerpc/kernel/ppc_ksyms.c1
-rw-r--r--arch/powerpc/kvm/44x_emulate.c25
-rw-r--r--arch/powerpc/kvm/44x_tlb.c20
-rw-r--r--arch/powerpc/kvm/Kconfig1
-rw-r--r--arch/powerpc/kvm/book3s.c309
-rw-r--r--arch/powerpc/kvm/book3s_64_emulate.c77
-rw-r--r--arch/powerpc/kvm/book3s_64_exports.c8
-rw-r--r--arch/powerpc/kvm/book3s_64_interrupts.S336
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c10
-rw-r--r--arch/powerpc/kvm/book3s_64_rmhandlers.S119
-rw-r--r--arch/powerpc/kvm/book3s_64_slb.S160
-rw-r--r--arch/powerpc/kvm/booke.c87
-rw-r--r--arch/powerpc/kvm/booke_emulate.c107
-rw-r--r--arch/powerpc/kvm/e500.c6
-rw-r--r--arch/powerpc/kvm/e500_emulate.c93
-rw-r--r--arch/powerpc/kvm/e500_tlb.c10
-rw-r--r--arch/powerpc/kvm/emulate.c118
-rw-r--r--arch/powerpc/kvm/powerpc.c40
-rw-r--r--arch/s390/hypfs/inode.c42
-rw-r--r--arch/s390/kvm/kvm-s390.c26
-rw-r--r--arch/s390/kvm/kvm-s390.h10
-rw-r--r--arch/sparc/configs/sparc32_defconfig56
-rw-r--r--arch/sparc/configs/sparc64_defconfig34
-rw-r--r--arch/sparc/include/asm/io_32.h4
-rw-r--r--arch/sparc/include/asm/io_64.h4
-rw-r--r--arch/sparc/include/asm/perfctr.h4
-rw-r--r--arch/sparc/include/asm/system_64.h15
-rw-r--r--arch/sparc/include/asm/thread_info_64.h25
-rw-r--r--arch/sparc/kernel/entry.h1
-rw-r--r--arch/sparc/kernel/nmi.c7
-rw-r--r--arch/sparc/kernel/process_64.c23
-rw-r--r--arch/sparc/kernel/rtrap_64.S62
-rw-r--r--arch/sparc/kernel/sys32.S1
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c104
-rw-r--r--arch/sparc/kernel/syscalls.S23
-rw-r--r--arch/sparc/kernel/systbls.h2
-rw-r--r--arch/sparc/kernel/systbls_64.S4
-rw-r--r--arch/sparc/kernel/traps_64.c9
-rw-r--r--arch/sparc/prom/p1275.c12
-rw-r--r--arch/um/drivers/mconsole_kern.c2
-rw-r--r--arch/x86/Kconfig16
-rw-r--r--arch/x86/ia32/ia32_aout.c1
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/alternative.h4
-rw-r--r--arch/x86/include/asm/e820.h5
-rw-r--r--arch/x86/include/asm/highmem.h4
-rw-r--r--arch/x86/include/asm/hyperv.h186
-rw-r--r--arch/x86/include/asm/i8259.h2
-rw-r--r--arch/x86/include/asm/io_apic.h1
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irq_vectors.h48
-rw-r--r--arch/x86/include/asm/kprobes.h31
-rw-r--r--arch/x86/include/asm/kvm_emulate.h17
-rw-r--r--arch/x86/include/asm/kvm_host.h60
-rw-r--r--arch/x86/include/asm/kvm_para.h1
-rw-r--r--arch/x86/include/asm/local.h37
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/paravirt_types.h4
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/pci_64.h2
-rw-r--r--arch/x86/include/asm/percpu.h119
-rw-r--r--arch/x86/include/asm/pgtable_32.h4
-rw-r--r--arch/x86/include/asm/proto.h10
-rw-r--r--arch/x86/include/asm/svm.h2
-rw-r--r--arch/x86/include/asm/system.h8
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/kernel/acpi/boot.c9
-rw-r--r--arch/x86/kernel/alternative.c60
-rw-r--r--arch/x86/kernel/apic/io_apic.c258
-rw-r--r--arch/x86/kernel/apic/nmi.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c208
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c1
-rw-r--r--arch/x86/kernel/e820.c349
-rw-r--r--arch/x86/kernel/head32.c10
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/i8259.c30
-rw-r--r--arch/x86/kernel/irqinit.c35
-rw-r--r--arch/x86/kernel/kprobes.c609
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c7
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-dma.c13
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/setup.c10
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/smpboot.c13
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/visws_quirks.c6
-rw-r--r--arch/x86/kernel/vmi_32.c35
-rw-r--r--arch/x86/kernel/vmiclock_32.c6
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kernel/vsyscall_64.c3
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/emulate.c440
-rw-r--r--arch/x86/kvm/i8254.c23
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c46
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h31
-rw-r--r--arch/x86/kvm/lapic.c31
-rw-r--r--arch/x86/kvm/lapic.h8
-rw-r--r--arch/x86/kvm/mmu.c137
-rw-r--r--arch/x86/kvm/mmu.h35
-rw-r--r--arch/x86/kvm/paging_tmpl.h13
-rw-r--r--arch/x86/kvm/svm.c237
-rw-r--r--arch/x86/kvm/trace.h59
-rw-r--r--arch/x86/kvm/vmx.c396
-rw-r--r--arch/x86/kvm/x86.c1098
-rw-r--r--arch/x86/kvm/x86.h30
-rw-r--r--arch/x86/mm/init_32.c7
-rw-r--r--arch/x86/mm/init_64.c9
-rw-r--r--arch/x86/mm/numa_32.c3
-rw-r--r--arch/x86/mm/numa_64.c97
-rw-r--r--arch/x86/pci/Makefile3
-rw-r--r--arch/x86/pci/amd_bus.c127
-rw-r--r--arch/x86/pci/bus_numa.c25
-rw-r--r--arch/x86/pci/bus_numa.h9
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/xen/enlighten.c7
-rw-r--r--arch/x86/xen/mmu.c21
-rw-r--r--arch/x86/xen/xen-asm_32.S4
156 files changed, 4696 insertions, 3058 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 215e46073c4..e5eb1337a53 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -41,6 +41,17 @@ config KPROBES
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+config OPTPROBES
+ bool "Kprobes jump optimization support (EXPERIMENTAL)"
+ default y
+ depends on KPROBES
+ depends on !PREEMPT
+ depends on HAVE_OPTPROBES
+ select KALLSYMS_ALL
+ help
+ This option will allow kprobes to optimize breakpoint to
+ a jump for reducing its overhead.
+
config HAVE_EFFICIENT_UNALIGNED_ACCESS
bool
help
@@ -83,6 +94,8 @@ config HAVE_KPROBES
config HAVE_KRETPROBES
bool
+config HAVE_OPTPROBES
+ bool
#
# An arch should select this if it provides all these things:
#
diff --git a/arch/alpha/include/asm/local.h b/arch/alpha/include/asm/local.h
index 6ad3ea69642..b9e3e331837 100644
--- a/arch/alpha/include/asm/local.h
+++ b/arch/alpha/include/asm/local.h
@@ -98,21 +98,4 @@ static __inline__ long local_sub_return(long i, local_t * l)
#define __local_add(i,l) ((l)->a.counter+=(i))
#define __local_sub(i,l) ((l)->a.counter-=(i))
-/* Use these for per-cpu local_t variables: on some archs they are
- * much more efficient than these naive implementations. Note they take
- * a variable, not an address.
- */
-#define cpu_local_read(l) local_read(&__get_cpu_var(l))
-#define cpu_local_set(l, i) local_set(&__get_cpu_var(l), (i))
-
-#define cpu_local_inc(l) local_inc(&__get_cpu_var(l))
-#define cpu_local_dec(l) local_dec(&__get_cpu_var(l))
-#define cpu_local_add(i, l) local_add((i), &__get_cpu_var(l))
-#define cpu_local_sub(i, l) local_sub((i), &__get_cpu_var(l))
-
-#define __cpu_local_inc(l) __local_inc(&__get_cpu_var(l))
-#define __cpu_local_dec(l) __local_dec(&__get_cpu_var(l))
-#define __cpu_local_add(i, l) __local_add((i), &__get_cpu_var(l))
-#define __cpu_local_sub(i, l) __local_sub((i), &__get_cpu_var(l))
-
#endif /* _ALPHA_LOCAL_H */
diff --git a/arch/arm/configs/rx51_defconfig b/arch/arm/configs/rx51_defconfig
index 426ae948aef..193bd334fbb 100644
--- a/arch/arm/configs/rx51_defconfig
+++ b/arch/arm/configs/rx51_defconfig
@@ -445,6 +445,8 @@ CONFIG_IP_NF_FILTER=m
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
+CONFIG_PHONET=y
+# CONFIG_IEEE802154 is not set
# CONFIG_NET_SCHED is not set
# CONFIG_DCB is not set
@@ -1325,27 +1327,34 @@ CONFIG_USB_GADGET_SELECTED=y
# CONFIG_USB_GADGET_LH7A40X is not set
# CONFIG_USB_GADGET_OMAP is not set
# CONFIG_USB_GADGET_PXA25X is not set
+# CONFIG_USB_GADGET_R8A66597 is not set
# CONFIG_USB_GADGET_PXA27X is not set
-# CONFIG_USB_GADGET_S3C2410 is not set
+# CONFIG_USB_GADGET_S3C_HSOTG is not set
# CONFIG_USB_GADGET_IMX is not set
+# CONFIG_USB_GADGET_S3C2410 is not set
# CONFIG_USB_GADGET_M66592 is not set
# CONFIG_USB_GADGET_AMD5536UDC is not set
# CONFIG_USB_GADGET_FSL_QE is not set
# CONFIG_USB_GADGET_CI13XXX is not set
# CONFIG_USB_GADGET_NET2280 is not set
# CONFIG_USB_GADGET_GOKU is not set
+# CONFIG_USB_GADGET_LANGWELL is not set
# CONFIG_USB_GADGET_DUMMY_HCD is not set
CONFIG_USB_GADGET_DUALSPEED=y
CONFIG_USB_ZERO=m
# CONFIG_USB_ZERO_HNPTEST is not set
+# CONFIG_USB_AUDIO is not set
# CONFIG_USB_ETH is not set
# CONFIG_USB_GADGETFS is not set
CONFIG_USB_FILE_STORAGE=m
# CONFIG_USB_FILE_STORAGE_TEST is not set
+# CONFIG_USB_MASS_STORAGE is not set
# CONFIG_USB_G_SERIAL is not set
# CONFIG_USB_MIDI_GADGET is not set
# CONFIG_USB_G_PRINTER is not set
# CONFIG_USB_CDC_COMPOSITE is not set
+CONFIG_USB_G_NOKIA=m
+# CONFIG_USB_G_MULTI is not set
#
# OTG and related infrastructure
diff --git a/arch/arm/include/asm/hardware/iop3xx-adma.h b/arch/arm/include/asm/hardware/iop3xx-adma.h
index 1a8c7279a28..9b28f1243bd 100644
--- a/arch/arm/include/asm/hardware/iop3xx-adma.h
+++ b/arch/arm/include/asm/hardware/iop3xx-adma.h
@@ -366,8 +366,7 @@ static inline int iop_chan_xor_slot_count(size_t len, int src_cnt,
slot_cnt += *slots_per_op;
}
- if (len)
- slot_cnt += *slots_per_op;
+ slot_cnt += *slots_per_op;
return slot_cnt;
}
@@ -389,8 +388,7 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt,
slot_cnt += *slots_per_op;
}
- if (len)
- slot_cnt += *slots_per_op;
+ slot_cnt += *slots_per_op;
return slot_cnt;
}
@@ -737,10 +735,8 @@ iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
i += slots_per_op;
} while (len > IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT);
- if (len) {
- iter = iop_hw_desc_slot_idx(hw_desc, i);
- iter->byte_count = len;
- }
+ iter = iop_hw_desc_slot_idx(hw_desc, i);
+ iter->byte_count = len;
}
}
diff --git a/arch/arm/mach-mx2/devices.c b/arch/arm/mach-mx2/devices.c
index 3d398ce09b3..3956d82b7c4 100644
--- a/arch/arm/mach-mx2/devices.c
+++ b/arch/arm/mach-mx2/devices.c
@@ -31,6 +31,7 @@
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/gpio.h>
+#include <linux/dma-mapping.h>
#include <mach/irqs.h>
#include <mach/hardware.h>
@@ -292,7 +293,7 @@ struct platform_device mxc_fb_device = {
.num_resources = ARRAY_SIZE(mxc_fb),
.resource = mxc_fb,
.dev = {
- .coherent_dma_mask = 0xFFFFFFFF,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
},
};
@@ -395,17 +396,17 @@ static struct resource mxc_sdhc1_resources[] = {
},
};
-static u64 mxc_sdhc1_dmamask = 0xffffffffUL;
+static u64 mxc_sdhc1_dmamask = DMA_BIT_MASK(32);
struct platform_device mxc_sdhc_device0 = {
- .name = "mxc-mmc",
- .id = 0,
- .dev = {
- .dma_mask = &mxc_sdhc1_dmamask,
- .coherent_dma_mask = 0xffffffff,
- },
- .num_resources = ARRAY_SIZE(mxc_sdhc1_resources),
- .resource = mxc_sdhc1_resources,
+ .name = "mxc-mmc",
+ .id = 0,
+ .dev = {
+ .dma_mask = &mxc_sdhc1_dmamask,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
+ },
+ .num_resources = ARRAY_SIZE(mxc_sdhc1_resources),
+ .resource = mxc_sdhc1_resources,
};
static struct resource mxc_sdhc2_resources[] = {
@@ -424,17 +425,17 @@ static struct resource mxc_sdhc2_resources[] = {
},
};
-static u64 mxc_sdhc2_dmamask = 0xffffffffUL;
+static u64 mxc_sdhc2_dmamask = DMA_BIT_MASK(32);
struct platform_device mxc_sdhc_device1 = {
- .name = "mxc-mmc",
- .id = 1,
- .dev = {
- .dma_mask = &mxc_sdhc2_dmamask,
- .coherent_dma_mask = 0xffffffff,
- },
- .num_resources = ARRAY_SIZE(mxc_sdhc2_resources),
- .resource = mxc_sdhc2_resources,
+ .name = "mxc-mmc",
+ .id = 1,
+ .dev = {
+ .dma_mask = &mxc_sdhc2_dmamask,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
+ },
+ .num_resources = ARRAY_SIZE(mxc_sdhc2_resources),
+ .resource = mxc_sdhc2_resources,
};
#ifdef CONFIG_MACH_MX27
@@ -450,7 +451,7 @@ static struct resource otg_resources[] = {
},
};
-static u64 otg_dmamask = 0xffffffffUL;
+static u64 otg_dmamask = DMA_BIT_MASK(32);
/* OTG gadget device */
struct platform_device mxc_otg_udc_device = {
@@ -458,7 +459,7 @@ struct platform_device mxc_otg_udc_device = {
.id = -1,
.dev = {
.dma_mask = &otg_dmamask,
- .coherent_dma_mask = 0xffffffffUL,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
},
.resource = otg_resources,
.num_resources = ARRAY_SIZE(otg_resources),
@@ -469,7 +470,7 @@ struct platform_device mxc_otg_host = {
.name = "mxc-ehci",
.id = 0,
.dev = {
- .coherent_dma_mask = 0xffffffff,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
.dma_mask = &otg_dmamask,
},
.resource = otg_resources,
@@ -478,7 +479,7 @@ struct platform_device mxc_otg_host = {
/* USB host 1 */
-static u64 usbh1_dmamask = 0xffffffffUL;
+static u64 usbh1_dmamask = DMA_BIT_MASK(32);
static struct resource mxc_usbh1_resources[] = {
{
@@ -496,7 +497,7 @@ struct platform_device mxc_usbh1 = {
.name = "mxc-ehci",
.id = 1,
.dev = {
- .coherent_dma_mask = 0xffffffff,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
.dma_mask = &usbh1_dmamask,
},
.resource = mxc_usbh1_resources,
@@ -504,7 +505,7 @@ struct platform_device mxc_usbh1 = {
};
/* USB host 2 */
-static u64 usbh2_dmamask = 0xffffffffUL;
+static u64 usbh2_dmamask = DMA_BIT_MASK(32);
static struct resource mxc_usbh2_resources[] = {
{
@@ -522,7 +523,7 @@ struct platform_device mxc_usbh2 = {
.name = "mxc-ehci",
.id = 2,
.dev = {
- .coherent_dma_mask = 0xffffffff,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
.dma_mask = &usbh2_dmamask,
},
.resource = mxc_usbh2_resources,
@@ -642,3 +643,30 @@ int __init mxc_register_gpios(void)
{
return mxc_gpio_init(imx_gpio_ports, ARRAY_SIZE(imx_gpio_ports));
}
+
+#ifdef CONFIG_MACH_MX21
+static struct resource mx21_usbhc_resources[] = {
+ {
+ .start = USBOTG_BASE_ADDR,
+ .end = USBOTG_BASE_ADDR + 0x1FFF,
+ .flags = IORESOURCE_MEM,
+ },
+ {
+ .start = MXC_INT_USBHOST,
+ .end = MXC_INT_USBHOST,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+struct platform_device mx21_usbhc_device = {
+ .name = "imx21-hcd",
+ .id = 0,
+ .dev = {
+ .dma_mask = &mx21_usbhc_device.dev.coherent_dma_mask,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
+ },
+ .num_resources = ARRAY_SIZE(mx21_usbhc_resources),
+ .resource = mx21_usbhc_resources,
+};
+#endif
+
diff --git a/arch/arm/mach-mx2/devices.h b/arch/arm/mach-mx2/devices.h
index 97306aa18f1..f12694b0736 100644
--- a/arch/arm/mach-mx2/devices.h
+++ b/arch/arm/mach-mx2/devices.h
@@ -26,5 +26,6 @@ extern struct platform_device mxc_usbh2;
extern struct platform_device mxc_spi_device0;
extern struct platform_device mxc_spi_device1;
extern struct platform_device mxc_spi_device2;
+extern struct platform_device mx21_usbhc_device;
extern struct platform_device imx_ssi_device0;
extern struct platform_device imx_ssi_device1;
diff --git a/arch/arm/mach-u300/include/mach/coh901318.h b/arch/arm/mach-u300/include/mach/coh901318.h
index f4cfee9c7d2..b8155b4e5ff 100644
--- a/arch/arm/mach-u300/include/mach/coh901318.h
+++ b/arch/arm/mach-u300/include/mach/coh901318.h
@@ -53,7 +53,7 @@ struct coh901318_params {
* struct coh_dma_channel - dma channel base
* @name: ascii name of dma channel
* @number: channel id number
- * @desc_nbr_max: number of preallocated descriptortors
+ * @desc_nbr_max: number of preallocated descriptors
* @priority_high: prio of channel, 0 low otherwise high.
* @param: configuration parameters
* @dev_addr: physical address of periphal connected to channel
diff --git a/arch/arm/plat-mxc/include/mach/mx21-usbhost.h b/arch/arm/plat-mxc/include/mach/mx21-usbhost.h
new file mode 100644
index 00000000000..22d0b596262
--- /dev/null
+++ b/arch/arm/plat-mxc/include/mach/mx21-usbhost.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2009 Martin Fuzzey <mfuzzey@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_ARCH_MX21_USBH
+#define __ASM_ARCH_MX21_USBH
+
+enum mx21_usbh_xcvr {
+ /* Values below as used by hardware (HWMODE register) */
+ MX21_USBXCVR_TXDIF_RXDIF = 0,
+ MX21_USBXCVR_TXDIF_RXSE = 1,
+ MX21_USBXCVR_TXSE_RXDIF = 2,
+ MX21_USBXCVR_TXSE_RXSE = 3,
+};
+
+struct mx21_usbh_platform_data {
+ enum mx21_usbh_xcvr host_xcvr; /* tranceiver mode host 1,2 ports */
+ enum mx21_usbh_xcvr otg_xcvr; /* tranceiver mode otg (as host) port */
+ u16 enable_host1:1,
+ enable_host2:1,
+ enable_otg_host:1, /* enable "OTG" port (as host) */
+ host1_xcverless:1, /* traceiverless host1 port */
+ host1_txenoe:1, /* output enable host1 transmit enable */
+ otg_ext_xcvr:1, /* external tranceiver for OTG port */
+ unused:10;
+};
+
+#endif /* __ASM_ARCH_MX21_USBH */
diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c
index b13d1879e51..3a4bc1a1843 100644
--- a/arch/avr32/mach-at32ap/at32ap700x.c
+++ b/arch/avr32/mach-at32ap/at32ap700x.c
@@ -1770,10 +1770,13 @@ at32_add_device_usba(unsigned int id, struct usba_platform_data *data)
ARRAY_SIZE(usba0_resource)))
goto out_free_pdev;
- if (data)
+ if (data) {
usba_data.pdata.vbus_pin = data->vbus_pin;
- else
+ usba_data.pdata.vbus_pin_inverted = data->vbus_pin_inverted;
+ } else {
usba_data.pdata.vbus_pin = -EINVAL;
+ usba_data.pdata.vbus_pin_inverted = -EINVAL;
+ }
data = &usba_data.pdata;
data->num_ep = ARRAY_SIZE(at32_usba_ep);
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index b0ed0b487ff..01b2f58dfb9 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -816,8 +816,8 @@ ENDPROC(_resume)
ENTRY(_ret_from_exception)
#ifdef CONFIG_IPIPE
- p2.l = _per_cpu__ipipe_percpu_domain;
- p2.h = _per_cpu__ipipe_percpu_domain;
+ p2.l = _ipipe_percpu_domain;
+ p2.h = _ipipe_percpu_domain;
r0.l = _ipipe_root;
r0.h = _ipipe_root;
r2 = [p2];
diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S
index 2c18d08cd91..c52bef39e25 100644
--- a/arch/cris/arch-v10/kernel/entry.S
+++ b/arch/cris/arch-v10/kernel/entry.S
@@ -358,7 +358,7 @@ mmu_bus_fault:
1: btstq 12, $r1 ; Refill?
bpl 2f
lsrq 24, $r1 ; Get PGD index (bit 24-31)
- move.d [per_cpu__current_pgd], $r0 ; PGD for the current process
+ move.d [current_pgd], $r0 ; PGD for the current process
move.d [$r0+$r1.d], $r0 ; Get PMD
beq 2f
nop
diff --git a/arch/cris/arch-v32/mm/mmu.S b/arch/cris/arch-v32/mm/mmu.S
index 2238d154bde..f125d912e14 100644
--- a/arch/cris/arch-v32/mm/mmu.S
+++ b/arch/cris/arch-v32/mm/mmu.S
@@ -115,7 +115,7 @@
#ifdef CONFIG_SMP
move $s7, $acr ; PGD
#else
- move.d per_cpu__current_pgd, $acr ; PGD
+ move.d current_pgd, $acr ; PGD
#endif
; Look up PMD in PGD
lsrq 24, $r0 ; Get PMD index into PGD (bit 24-31)
diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h
index 30cf46534dd..f7c00a5e0e2 100644
--- a/arch/ia64/include/asm/percpu.h
+++ b/arch/ia64/include/asm/percpu.h
@@ -9,7 +9,7 @@
#define PERCPU_ENOUGH_ROOM PERCPU_PAGE_SIZE
#ifdef __ASSEMBLY__
-# define THIS_CPU(var) (per_cpu__##var) /* use this to mark accesses to per-CPU variables... */
+# define THIS_CPU(var) (var) /* use this to mark accesses to per-CPU variables... */
#else /* !__ASSEMBLY__ */
@@ -39,7 +39,7 @@ extern void *per_cpu_init(void);
* On the positive side, using __ia64_per_cpu_var() instead of __get_cpu_var() is slightly
* more efficient.
*/
-#define __ia64_per_cpu_var(var) per_cpu__##var
+#define __ia64_per_cpu_var(var) var
#include <asm-generic/percpu.h>
diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h
index b8370c8b619..baa74c82aa7 100644
--- a/arch/ia64/include/asm/xen/events.h
+++ b/arch/ia64/include/asm/xen/events.h
@@ -36,10 +36,6 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
return !(ia64_psr(regs)->i);
}
-static inline void handle_irq(int irq, struct pt_regs *regs)
-{
- __do_IRQ(irq);
-}
#define irq_ctx_init(cpu) do { } while (0)
#endif /* _ASM_IA64_XEN_EVENTS_H */
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index c16fb03037d..a7ca07f3754 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -852,8 +852,8 @@ __init void prefill_possible_map(void)
possible = available_cpus + additional_cpus;
- if (possible > NR_CPUS)
- possible = NR_CPUS;
+ if (possible > nr_cpu_ids)
+ possible = nr_cpu_ids;
printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
possible, max((possible - available_cpus), 0));
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index 461b99902bf..7f4a0ed2415 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -30,9 +30,9 @@ EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic
#endif
#include <asm/processor.h>
-EXPORT_SYMBOL(per_cpu__ia64_cpu_info);
+EXPORT_SYMBOL(ia64_cpu_info);
#ifdef CONFIG_SMP
-EXPORT_SYMBOL(per_cpu__local_per_cpu_offset);
+EXPORT_SYMBOL(local_per_cpu_offset);
#endif
#include <asm/uaccess.h>
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 01c75797119..fa4d1e59deb 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -26,6 +26,7 @@ config KVM
select ANON_INODES
select HAVE_KVM_IRQCHIP
select KVM_APIC_ARCHITECTURE
+ select KVM_MMIO
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5fdeec5fddc..26e0e089bfe 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -241,10 +241,10 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 0;
mmio:
if (p->dir)
- r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr,
+ r = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, p->addr,
p->size, &p->data);
else
- r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr,
+ r = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, p->addr,
p->size, &p->data);
if (r)
printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr);
@@ -636,12 +636,9 @@ static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu)
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
union context *host_ctx, *guest_ctx;
- int r;
+ int r, idx;
- /*
- * down_read() may sleep and return with interrupts enabled
- */
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
again:
if (signal_pending(current)) {
@@ -663,7 +660,7 @@ again:
if (r < 0)
goto vcpu_run_fail;
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
kvm_guest_enter();
/*
@@ -687,7 +684,7 @@ again:
kvm_guest_exit();
preempt_enable();
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_handle_exit(kvm_run, vcpu);
@@ -697,10 +694,10 @@ again:
}
out:
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (r > 0) {
kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
goto again;
}
@@ -971,7 +968,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = kvm_setup_default_irq_routing(kvm);
if (r) {
- kfree(kvm->arch.vioapic);
+ kvm_ioapic_destroy(kvm);
goto out;
}
break;
@@ -1377,12 +1374,14 @@ static void free_kvm(struct kvm *kvm)
static void kvm_release_vm_pages(struct kvm *kvm)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i, j;
unsigned long base_gfn;
- for (i = 0; i < kvm->nmemslots; i++) {
- memslot = &kvm->memslots[i];
+ slots = rcu_dereference(kvm->memslots);
+ for (i = 0; i < slots->nmemslots; i++) {
+ memslot = &slots->memslots[i];
base_gfn = memslot->base_gfn;
for (j = 0; j < memslot->npages; j++) {
@@ -1405,6 +1404,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vioapic);
kvm_release_vm_pages(kvm);
kvm_free_physmem(kvm);
+ cleanup_srcu_struct(&kvm->srcu);
free_kvm(kvm);
}
@@ -1576,15 +1576,15 @@ out:
return r;
}
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
int user_alloc)
{
unsigned long i;
unsigned long pfn;
- int npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+ int npages = memslot->npages;
unsigned long base_gfn = memslot->base_gfn;
if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT))
@@ -1608,6 +1608,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
return 0;
}
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ return;
+}
+
void kvm_arch_flush_shadow(struct kvm *kvm)
{
kvm_flush_remote_tlbs(kvm);
@@ -1802,7 +1810,7 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_MEMORY_SLOTS)
goto out;
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
@@ -1827,6 +1835,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
+ mutex_lock(&kvm->slots_lock);
spin_lock(&kvm->arch.dirty_log_lock);
r = kvm_ia64_sync_dirty_log(kvm, log);
@@ -1840,12 +1849,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
kvm_flush_remote_tlbs(kvm);
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
memset(memslot->dirty_bitmap, 0, n);
}
r = 0;
out:
+ mutex_unlock(&kvm->slots_lock);
spin_unlock(&kvm->arch.dirty_log_lock);
return r;
}
diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c
index e4b82319881..cb548ee9fca 100644
--- a/arch/ia64/kvm/kvm_fw.c
+++ b/arch/ia64/kvm/kvm_fw.c
@@ -75,7 +75,7 @@ static void set_pal_result(struct kvm_vcpu *vcpu,
struct exit_ctl_data *p;
p = kvm_get_exit_data(vcpu);
- if (p && p->exit_reason == EXIT_REASON_PAL_CALL) {
+ if (p->exit_reason == EXIT_REASON_PAL_CALL) {
p->u.pal_data.ret = result;
return ;
}
@@ -87,7 +87,7 @@ static void set_sal_result(struct kvm_vcpu *vcpu,
struct exit_ctl_data *p;
p = kvm_get_exit_data(vcpu);
- if (p && p->exit_reason == EXIT_REASON_SAL_CALL) {
+ if (p->exit_reason == EXIT_REASON_SAL_CALL) {
p->u.sal_data.ret = result;
return ;
}
@@ -322,7 +322,7 @@ static u64 kvm_get_pal_call_index(struct kvm_vcpu *vcpu)
struct exit_ctl_data *p;
p = kvm_get_exit_data(vcpu);
- if (p && (p->exit_reason == EXIT_REASON_PAL_CALL))
+ if (p->exit_reason == EXIT_REASON_PAL_CALL)
index = p->u.pal_data.gr28;
return index;
@@ -646,18 +646,16 @@ static void kvm_get_sal_call_data(struct kvm_vcpu *vcpu, u64 *in0, u64 *in1,
p = kvm_get_exit_data(vcpu);
- if (p) {
- if (p->exit_reason == EXIT_REASON_SAL_CALL) {
- *in0 = p->u.sal_data.in0;
- *in1 = p->u.sal_data.in1;
- *in2 = p->u.sal_data.in2;
- *in3 = p->u.sal_data.in3;
- *in4 = p->u.sal_data.in4;
- *in5 = p->u.sal_data.in5;
- *in6 = p->u.sal_data.in6;
- *in7 = p->u.sal_data.in7;
- return ;
- }
+ if (p->exit_reason == EXIT_REASON_SAL_CALL) {
+ *in0 = p->u.sal_data.in0;
+ *in1 = p->u.sal_data.in1;
+ *in2 = p->u.sal_data.in2;
+ *in3 = p->u.sal_data.in3;
+ *in4 = p->u.sal_data.in4;
+ *in5 = p->u.sal_data.in5;
+ *in6 = p->u.sal_data.in6;
+ *in7 = p->u.sal_data.in7;
+ return ;
}
*in0 = 0;
}
diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c
index 9bf55afd08d..fb8f9f59a1e 100644
--- a/arch/ia64/kvm/mmio.c
+++ b/arch/ia64/kvm/mmio.c
@@ -316,8 +316,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
return;
} else {
inst_type = -1;
- panic_vm(vcpu, "Unsupported MMIO access instruction! \
- Bunld[0]=0x%lx, Bundle[1]=0x%lx\n",
+ panic_vm(vcpu, "Unsupported MMIO access instruction! "
+ "Bunld[0]=0x%lx, Bundle[1]=0x%lx\n",
bundle.i64[0], bundle.i64[1]);
}
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index dce75b70cdd..958815c9787 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -1639,8 +1639,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val)
* Otherwise panic
*/
if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM))
- panic_vm(vcpu, "Only support guests with vpsr.pk =0 \
- & vpsr.is=0\n");
+ panic_vm(vcpu, "Only support guests with vpsr.pk =0 "
+ "& vpsr.is=0\n");
/*
* For those IA64_PSR bits: id/da/dd/ss/ed/ia
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 19c4b2195dc..8d586d1e251 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -459,7 +459,7 @@ static void __init initialize_pernode_data(void)
cpu = 0;
node = node_cpuid[cpu].nid;
cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
- ((char *)&per_cpu__ia64_cpu_info - __per_cpu_start));
+ ((char *)&ia64_cpu_info - __per_cpu_start));
cpu0_cpu_info->node_data = mem_data[node].node_data;
}
#endif /* CONFIG_SMP */
diff --git a/arch/m32r/include/asm/local.h b/arch/m32r/include/asm/local.h
index 22256d13863..734bca87018 100644
--- a/arch/m32r/include/asm/local.h
+++ b/arch/m32r/include/asm/local.h
@@ -338,29 +338,4 @@ static inline void local_set_mask(unsigned long mask, local_t *addr)
* a variable, not an address.
*/
-/* Need to disable preemption for the cpu local counters otherwise we could
- still access a variable of a previous CPU in a non local way. */
-#define cpu_local_wrap_v(l) \
- ({ local_t res__; \
- preempt_disable(); \
- res__ = (l); \
- preempt_enable(); \
- res__; })
-#define cpu_local_wrap(l) \
- ({ preempt_disable(); \
- l; \
- preempt_enable(); }) \
-
-#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
-#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
-#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var(l)))
-#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var(l)))
-#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
-#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
-
-#define __cpu_local_inc(l) cpu_local_inc(l)
-#define __cpu_local_dec(l) cpu_local_dec(l)
-#define __cpu_local_add(i, l) cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
-
#endif /* __M32R_LOCAL_H */
diff --git a/arch/microblaze/include/asm/entry.h b/arch/microblaze/include/asm/entry.h
index 61abbd23264..ec89f2ad0fe 100644
--- a/arch/microblaze/include/asm/entry.h
+++ b/arch/microblaze/include/asm/entry.h
@@ -21,7 +21,7 @@
* places
*/
-#define PER_CPU(var) per_cpu__##var
+#define PER_CPU(var) var
# ifndef __ASSEMBLY__
DECLARE_PER_CPU(unsigned int, KSP); /* Saved kernel stack pointer */
diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h
index 361f4f16c30..bdcdef02d14 100644
--- a/arch/mips/include/asm/local.h
+++ b/arch/mips/include/asm/local.h
@@ -193,29 +193,4 @@ static __inline__ long local_sub_return(long i, local_t * l)
#define __local_add(i, l) ((l)->a.counter+=(i))
#define __local_sub(i, l) ((l)->a.counter-=(i))
-/* Need to disable preemption for the cpu local counters otherwise we could
- still access a variable of a previous CPU in a non atomic way. */
-#define cpu_local_wrap_v(l) \
- ({ local_t res__; \
- preempt_disable(); \
- res__ = (l); \
- preempt_enable(); \
- res__; })
-#define cpu_local_wrap(l) \
- ({ preempt_disable(); \
- l; \
- preempt_enable(); }) \
-
-#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
-#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
-#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var(l)))
-#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var(l)))
-#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
-#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
-
-#define __cpu_local_inc(l) cpu_local_inc(l)
-#define __cpu_local_dec(l) cpu_local_dec(l)
-#define __cpu_local_add(i, l) cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
-
#endif /* _ARCH_MIPS_LOCAL_H */
diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S
index d172d4245cd..f8c45cc2947 100644
--- a/arch/parisc/lib/fixup.S
+++ b/arch/parisc/lib/fixup.S
@@ -36,8 +36,8 @@
#endif
/* t2 = &__per_cpu_offset[smp_processor_id()]; */
LDREGX \t2(\t1),\t2
- addil LT%per_cpu__exception_data,%r27
- LDREG RT%per_cpu__exception_data(%r1),\t1
+ addil LT%exception_data,%r27
+ LDREG RT%exception_data(%r1),\t1
/* t1 = &__get_cpu_var(exception_data) */
add,l \t1,\t2,\t1
/* t1 = t1->fault_ip */
@@ -46,8 +46,8 @@
#else
.macro get_fault_ip t1 t2
/* t1 = &__get_cpu_var(exception_data) */
- addil LT%per_cpu__exception_data,%r27
- LDREG RT%per_cpu__exception_data(%r1),\t2
+ addil LT%exception_data,%r27
+ LDREG RT%exception_data(%r1),\t2
/* t1 = t2->fault_ip */
LDREG EXCDATA_IP(\t2), \t1
.endm
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index af2abe74f54..aadf2dd6f84 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -97,4 +97,10 @@
#define RESUME_HOST RESUME_FLAG_HOST
#define RESUME_HOST_NV (RESUME_FLAG_HOST|RESUME_FLAG_NV)
+#define KVM_GUEST_MODE_NONE 0
+#define KVM_GUEST_MODE_GUEST 1
+#define KVM_GUEST_MODE_SKIP 2
+
+#define KVM_INST_FETCH_FAILED -1
+
#endif /* __POWERPC_KVM_ASM_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 74b7369770d..db7db0a9696 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -22,7 +22,7 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
-#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_64_asm.h>
struct kvmppc_slb {
u64 esid;
@@ -33,7 +33,8 @@ struct kvmppc_slb {
bool Ks;
bool Kp;
bool nx;
- bool large;
+ bool large; /* PTEs are 16MB */
+ bool tb; /* 1TB segment */
bool class;
};
@@ -69,6 +70,7 @@ struct kvmppc_sid_map {
struct kvmppc_vcpu_book3s {
struct kvm_vcpu vcpu;
+ struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
struct kvmppc_sid_map sid_map[SID_MAP_NUM];
struct kvmppc_slb slb[64];
struct {
@@ -89,6 +91,7 @@ struct kvmppc_vcpu_book3s {
u64 vsid_next;
u64 vsid_max;
int context_id;
+ ulong prog_flags; /* flags to inject when giving a 700 trap */
};
#define CONTEXT_HOST 0
@@ -119,6 +122,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
extern u32 kvmppc_trampoline_lowmem;
extern u32 kvmppc_trampoline_enter;
+extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_load_up_fpu(void);
+extern void kvmppc_load_up_altivec(void);
+extern void kvmppc_load_up_vsx(void);
static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
{
diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
index 2e06ee8184e..183461b4840 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
@@ -20,6 +20,8 @@
#ifndef __ASM_KVM_BOOK3S_ASM_H__
#define __ASM_KVM_BOOK3S_ASM_H__
+#ifdef __ASSEMBLY__
+
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
#include <asm/kvm_asm.h>
@@ -55,4 +57,20 @@ kvmppc_resume_\intno:
#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
+#else /*__ASSEMBLY__ */
+
+struct kvmppc_book3s_shadow_vcpu {
+ ulong gpr[14];
+ u32 cr;
+ u32 xer;
+ ulong host_r1;
+ ulong host_r2;
+ ulong handler;
+ ulong scratch0;
+ ulong scratch1;
+ ulong vmhandler;
+};
+
+#endif /*__ASSEMBLY__ */
+
#endif /* __ASM_KVM_BOOK3S_ASM_H__ */
diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
index 9d497ce4972..7fea26fffb2 100644
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -52,9 +52,12 @@ struct kvmppc_vcpu_e500 {
u32 mas5;
u32 mas6;
u32 mas7;
+ u32 l1csr0;
u32 l1csr1;
u32 hid0;
u32 hid1;
+ u32 tlb0cfg;
+ u32 tlb1cfg;
struct kvm_vcpu vcpu;
};
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1201f62d0d7..5e5bae7e152 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -167,23 +167,40 @@ struct kvm_vcpu_arch {
ulong trampoline_lowmem;
ulong trampoline_enter;
ulong highmem_handler;
+ ulong rmcall;
ulong host_paca_phys;
struct kvmppc_mmu mmu;
#endif
- u64 fpr[32];
ulong gpr[32];
+ u64 fpr[32];
+ u32 fpscr;
+
+#ifdef CONFIG_ALTIVEC
+ vector128 vr[32];
+ vector128 vscr;
+#endif
+
+#ifdef CONFIG_VSX
+ u64 vsr[32];
+#endif
+
ulong pc;
- u32 cr;
ulong ctr;
ulong lr;
+
+#ifdef CONFIG_BOOKE
ulong xer;
+ u32 cr;
+#endif
ulong msr;
#ifdef CONFIG_PPC64
ulong shadow_msr;
+ ulong shadow_srr1;
ulong hflags;
+ ulong guest_owned_ext;
#endif
u32 mmucr;
ulong sprg0;
@@ -242,6 +259,8 @@ struct kvm_vcpu_arch {
#endif
ulong fault_dear;
ulong fault_esr;
+ ulong queued_dear;
+ ulong queued_esr;
gpa_t paddr_accessed;
u8 io_gpr; /* GPR used as IO source/target */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 269ee46ab02..e2642829e43 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -28,6 +28,9 @@
#include <linux/types.h>
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/kvm_book3s.h>
+#endif
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -80,8 +83,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu);
extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
-extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq);
@@ -95,4 +99,81 @@ extern void kvmppc_booke_exit(void);
extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_PPC_BOOK3S
+
+/* We assume we're always acting on the current vcpu */
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+ if ( num < 14 ) {
+ get_paca()->shadow_vcpu.gpr[num] = val;
+ to_book3s(vcpu)->shadow_vcpu.gpr[num] = val;
+ } else
+ vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+ if ( num < 14 )
+ return get_paca()->shadow_vcpu.gpr[num];
+ else
+ return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+ get_paca()->shadow_vcpu.cr = val;
+ to_book3s(vcpu)->shadow_vcpu.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+ return get_paca()->shadow_vcpu.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+ get_paca()->shadow_vcpu.xer = val;
+ to_book3s(vcpu)->shadow_vcpu.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+ return get_paca()->shadow_vcpu.xer;
+}
+
+#else
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+ vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+ return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+ vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+ vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.xer;
+}
+
+#endif
+
#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h
index ce58c80e1bc..c2410af6bfd 100644
--- a/arch/powerpc/include/asm/local.h
+++ b/arch/powerpc/include/asm/local.h
@@ -172,29 +172,4 @@ static __inline__ long local_dec_if_positive(local_t *l)
#define __local_add(i,l) ((l)->a.counter+=(i))
#define __local_sub(i,l) ((l)->a.counter-=(i))
-/* Need to disable preemption for the cpu local counters otherwise we could
- still access a variable of a previous CPU in a non atomic way. */
-#define cpu_local_wrap_v(l) \
- ({ local_t res__; \
- preempt_disable(); \
- res__ = (l); \
- preempt_enable(); \
- res__; })
-#define cpu_local_wrap(l) \
- ({ preempt_disable(); \
- l; \
- preempt_enable(); }) \
-
-#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
-#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
-#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var(l)))
-#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var(l)))
-#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
-#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
-
-#define __cpu_local_inc(l) cpu_local_inc(l)
-#define __cpu_local_dec(l) cpu_local_dec(l)
-#define __cpu_local_add(i, l) cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
-
#endif /* _ARCH_POWERPC_LOCAL_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 5e9b4ef7141..d8a693109c8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -19,6 +19,9 @@
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/exception-64e.h>
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64_asm.h>
+#endif
register struct paca_struct *local_paca asm("r13");
@@ -135,6 +138,8 @@ struct paca_struct {
u64 esid;
u64 vsid;
} kvm_slb[64]; /* guest SLB */
+ /* We use this to store guest state in */
+ struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
u8 kvm_slb_max; /* highest used guest slb entry */
u8 kvm_in_guest; /* are we inside the guest? */
#endif
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index bc8dd53f718..5572e86223f 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -426,6 +426,10 @@
#define SRR1_WAKEMT 0x00280000 /* mtctrl */
#define SRR1_WAKEDEC 0x00180000 /* Decrementer interrupt */
#define SRR1_WAKETHERM 0x00100000 /* Thermal management interrupt */
+#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */
+#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */
+#define SRR1_PROGTRAP 0x00020000 /* Trap */
+#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */
#define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */
#define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index a6c2b63227b..957ceb7059c 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -194,6 +194,30 @@ int main(void)
DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest));
DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb));
DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max));
+ DEFINE(PACA_KVM_CR, offsetof(struct paca_struct, shadow_vcpu.cr));
+ DEFINE(PACA_KVM_XER, offsetof(struct paca_struct, shadow_vcpu.xer));
+ DEFINE(PACA_KVM_R0, offsetof(struct paca_struct, shadow_vcpu.gpr[0]));
+ DEFINE(PACA_KVM_R1, offsetof(struct paca_struct, shadow_vcpu.gpr[1]));
+ DEFINE(PACA_KVM_R2, offsetof(struct paca_struct, shadow_vcpu.gpr[2]));
+ DEFINE(PACA_KVM_R3, offsetof(struct paca_struct, shadow_vcpu.gpr[3]));
+ DEFINE(PACA_KVM_R4, offsetof(struct paca_struct, shadow_vcpu.gpr[4]));
+ DEFINE(PACA_KVM_R5, offsetof(struct paca_struct, shadow_vcpu.gpr[5]));
+ DEFINE(PACA_KVM_R6, offsetof(struct paca_struct, shadow_vcpu.gpr[6]));
+ DEFINE(PACA_KVM_R7, offsetof(struct paca_struct, shadow_vcpu.gpr[7]));
+ DEFINE(PACA_KVM_R8, offsetof(struct paca_struct, shadow_vcpu.gpr[8]));
+ DEFINE(PACA_KVM_R9, offsetof(struct paca_struct, shadow_vcpu.gpr[9]));
+ DEFINE(PACA_KVM_R10, offsetof(struct paca_struct, shadow_vcpu.gpr[10]));
+ DEFINE(PACA_KVM_R11, offsetof(struct paca_struct, shadow_vcpu.gpr[11]));
+ DEFINE(PACA_KVM_R12, offsetof(struct paca_struct, shadow_vcpu.gpr[12]));
+ DEFINE(PACA_KVM_R13, offsetof(struct paca_struct, shadow_vcpu.gpr[13]));
+ DEFINE(PACA_KVM_HOST_R1, offsetof(struct paca_struct, shadow_vcpu.host_r1));
+ DEFINE(PACA_KVM_HOST_R2, offsetof(struct paca_struct, shadow_vcpu.host_r2));
+ DEFINE(PACA_KVM_VMHANDLER, offsetof(struct paca_struct,
+ shadow_vcpu.vmhandler));
+ DEFINE(PACA_KVM_SCRATCH0, offsetof(struct paca_struct,
+ shadow_vcpu.scratch0));
+ DEFINE(PACA_KVM_SCRATCH1, offsetof(struct paca_struct,
+ shadow_vcpu.scratch1));
#endif
#endif /* CONFIG_PPC64 */
@@ -389,8 +413,6 @@ int main(void)
DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
- DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
- DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
@@ -411,11 +433,16 @@ int main(void)
DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2));
DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+ DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
+ DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
-#endif
+#else
+ DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+ DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+#endif /* CONFIG_PPC64 */
#endif
#ifdef CONFIG_44x
DEFINE(PGD_T_LOG2, PGD_T_LOG2);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 425451453e9..ab3e392ac63 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -107,6 +107,7 @@ EXPORT_SYMBOL(giveup_altivec);
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
EXPORT_SYMBOL(giveup_vsx);
+EXPORT_SYMBOL_GPL(__giveup_vsx);
#endif /* CONFIG_VSX */
#ifdef CONFIG_SPE
EXPORT_SYMBOL(giveup_spe);
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 61af58fcece..65ea083a5b2 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -65,13 +65,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
switch (dcrn) {
case DCRN_CPR0_CONFIG_ADDR:
- vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
break;
case DCRN_CPR0_CONFIG_DATA:
local_irq_disable();
mtdcr(DCRN_CPR0_CONFIG_ADDR,
vcpu->arch.cpr0_cfgaddr);
- vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
+ kvmppc_set_gpr(vcpu, rt,
+ mfdcr(DCRN_CPR0_CONFIG_DATA));
local_irq_enable();
break;
default:
@@ -93,11 +94,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* emulate some access in kernel */
switch (dcrn) {
case DCRN_CPR0_CONFIG_ADDR:
- vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
+ vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
break;
default:
run->dcr.dcrn = dcrn;
- run->dcr.data = vcpu->arch.gpr[rs];
+ run->dcr.data = kvmppc_get_gpr(vcpu, rs);
run->dcr.is_write = 1;
vcpu->arch.dcr_needed = 1;
kvmppc_account_exit(vcpu, DCR_EXITS);
@@ -146,13 +147,13 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
switch (sprn) {
case SPRN_PID:
- kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
+ kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break;
case SPRN_MMUCR:
- vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_CCR0:
- vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_CCR1:
- vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break;
default:
emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
}
@@ -167,13 +168,13 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_PID:
- vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break;
case SPRN_MMUCR:
- vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break;
case SPRN_CCR0:
- vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break;
case SPRN_CCR1:
- vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break;
default:
emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
}
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index ff3cb63b811..2570fcc7665 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -439,7 +439,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
struct kvmppc_44x_tlbe *tlbe;
unsigned int gtlb_index;
- gtlb_index = vcpu->arch.gpr[ra];
+ gtlb_index = kvmppc_get_gpr(vcpu, ra);
if (gtlb_index > KVM44x_GUEST_TLB_SIZE) {
printk("%s: index %d\n", __func__, gtlb_index);
kvmppc_dump_vcpu(vcpu);
@@ -455,15 +455,15 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
switch (ws) {
case PPC44x_TLB_PAGEID:
tlbe->tid = get_mmucr_stid(vcpu);
- tlbe->word0 = vcpu->arch.gpr[rs];
+ tlbe->word0 = kvmppc_get_gpr(vcpu, rs);
break;
case PPC44x_TLB_XLAT:
- tlbe->word1 = vcpu->arch.gpr[rs];
+ tlbe->word1 = kvmppc_get_gpr(vcpu, rs);
break;
case PPC44x_TLB_ATTRIB:
- tlbe->word2 = vcpu->arch.gpr[rs];
+ tlbe->word2 = kvmppc_get_gpr(vcpu, rs);
break;
default:
@@ -500,18 +500,20 @@ int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
unsigned int as = get_mmucr_sts(vcpu);
unsigned int pid = get_mmucr_stid(vcpu);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
if (rc) {
+ u32 cr = kvmppc_get_cr(vcpu);
+
if (gtlb_index < 0)
- vcpu->arch.cr &= ~0x20000000;
+ kvmppc_set_cr(vcpu, cr & ~0x20000000);
else
- vcpu->arch.cr |= 0x20000000;
+ kvmppc_set_cr(vcpu, cr | 0x20000000);
}
- vcpu->arch.gpr[rt] = gtlb_index;
+ kvmppc_set_gpr(vcpu, rt, gtlb_index);
kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
return EMULATE_DONE;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index fe037fdaf1b..60624cc9f4d 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
bool
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select KVM_MMIO
config KVM_BOOK3S_64_HANDLER
bool
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 3e294bd9b8c..9a271f0929c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -33,12 +33,9 @@
/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
+/* #define DEBUG_EXT */
-/* Without AGGRESSIVE_DEC we only fire off a DEC interrupt when DEC turns 0.
- * When set, we retrigger a DEC interrupt after that if DEC <= 0.
- * PPC32 Linux runs faster without AGGRESSIVE_DEC, PPC64 Linux requires it. */
-
-/* #define AGGRESSIVE_DEC */
+static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exits", VCPU_STAT(sum_exits) },
@@ -72,16 +69,24 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb));
+ memcpy(&get_paca()->shadow_vcpu, &to_book3s(vcpu)->shadow_vcpu,
+ sizeof(get_paca()->shadow_vcpu));
get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max;
}
void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
{
memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb));
+ memcpy(&to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
+ sizeof(get_paca()->shadow_vcpu));
to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max;
+
+ kvmppc_giveup_ext(vcpu, MSR_FP);
+ kvmppc_giveup_ext(vcpu, MSR_VEC);
+ kvmppc_giveup_ext(vcpu, MSR_VSX);
}
-#if defined(AGGRESSIVE_DEC) || defined(EXIT_DEBUG)
+#if defined(EXIT_DEBUG)
static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
{
u64 jd = mftb() - vcpu->arch.dec_jiffies;
@@ -89,6 +94,23 @@ static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
}
#endif
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.shadow_msr = vcpu->arch.msr;
+ /* Guest MSR values */
+ vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE |
+ MSR_BE | MSR_DE;
+ /* Process MSR values */
+ vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR |
+ MSR_EE;
+ /* External providers the guest reserved */
+ vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext);
+ /* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+ vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV;
+#endif
+}
+
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
{
ulong old_msr = vcpu->arch.msr;
@@ -96,12 +118,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
#ifdef EXIT_DEBUG
printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
#endif
+
msr &= to_book3s(vcpu)->msr_mask;
vcpu->arch.msr = msr;
- vcpu->arch.shadow_msr = msr | MSR_USER32;
- vcpu->arch.shadow_msr &= ( MSR_VEC | MSR_VSX | MSR_FP | MSR_FE0 |
- MSR_USER64 | MSR_SE | MSR_BE | MSR_DE |
- MSR_FE1);
+ kvmppc_recalc_shadow_msr(vcpu);
if (msr & (MSR_WE|MSR_POW)) {
if (!vcpu->arch.pending_exceptions) {
@@ -125,11 +145,10 @@ void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
vcpu->arch.mmu.reset_msr(vcpu);
}
-void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
+static int kvmppc_book3s_vec2irqprio(unsigned int vec)
{
unsigned int prio;
- vcpu->stat.queue_intr++;
switch (vec) {
case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET; break;
case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK; break;
@@ -149,15 +168,31 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
default: prio = BOOK3S_IRQPRIO_MAX; break;
}
- set_bit(prio, &vcpu->arch.pending_exceptions);
+ return prio;
+}
+
+static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+ unsigned int vec)
+{
+ clear_bit(kvmppc_book3s_vec2irqprio(vec),
+ &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
+{
+ vcpu->stat.queue_intr++;
+
+ set_bit(kvmppc_book3s_vec2irqprio(vec),
+ &vcpu->arch.pending_exceptions);
#ifdef EXIT_DEBUG
printk(KERN_INFO "Queueing interrupt %x\n", vec);
#endif
}
-void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
{
+ to_book3s(vcpu)->prog_flags = flags;
kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM);
}
@@ -171,6 +206,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions);
}
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+ kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
+}
+
void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
@@ -181,6 +221,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
{
int deliver = 1;
int vec = 0;
+ ulong flags = 0ULL;
switch (priority) {
case BOOK3S_IRQPRIO_DECREMENTER:
@@ -214,6 +255,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
break;
case BOOK3S_IRQPRIO_PROGRAM:
vec = BOOK3S_INTERRUPT_PROGRAM;
+ flags = to_book3s(vcpu)->prog_flags;
break;
case BOOK3S_IRQPRIO_VSX:
vec = BOOK3S_INTERRUPT_VSX;
@@ -244,7 +286,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
#endif
if (deliver)
- kvmppc_inject_interrupt(vcpu, vec, 0ULL);
+ kvmppc_inject_interrupt(vcpu, vec, flags);
return deliver;
}
@@ -254,21 +296,15 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
unsigned long *pending = &vcpu->arch.pending_exceptions;
unsigned int priority;
- /* XXX be more clever here - no need to mftb() on every entry */
- /* Issue DEC again if it's still active */
-#ifdef AGGRESSIVE_DEC
- if (vcpu->arch.msr & MSR_EE)
- if (kvmppc_get_dec(vcpu) & 0x80000000)
- kvmppc_core_queue_dec(vcpu);
-#endif
-
#ifdef EXIT_DEBUG
if (vcpu->arch.pending_exceptions)
printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions);
#endif
priority = __ffs(*pending);
while (priority <= (sizeof(unsigned int) * 8)) {
- if (kvmppc_book3s_irqprio_deliver(vcpu, priority)) {
+ if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
+ (priority != BOOK3S_IRQPRIO_DECREMENTER)) {
+ /* DEC interrupts get cleared by mtdec */
clear_bit(priority, &vcpu->arch.pending_exceptions);
break;
}
@@ -503,14 +539,14 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Page not found in guest PTE entries */
vcpu->arch.dear = vcpu->arch.fault_dear;
to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr;
- vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL);
+ vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL);
kvmppc_book3s_queue_irqprio(vcpu, vec);
} else if (page_found == -EPERM) {
/* Storage protection */
vcpu->arch.dear = vcpu->arch.fault_dear;
to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE;
to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT;
- vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL);
+ vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL);
kvmppc_book3s_queue_irqprio(vcpu, vec);
} else if (page_found == -EINVAL) {
/* Page not found in guest SLB */
@@ -532,13 +568,122 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = kvmppc_emulate_mmio(run, vcpu);
if ( r == RESUME_HOST_NV )
r = RESUME_HOST;
- if ( r == RESUME_GUEST_NV )
- r = RESUME_GUEST;
}
return r;
}
+static inline int get_fpr_index(int i)
+{
+#ifdef CONFIG_VSX
+ i *= 2;
+#endif
+ return i;
+}
+
+/* Give up external provider (FPU, Altivec, VSX) */
+static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+{
+ struct thread_struct *t = &current->thread;
+ u64 *vcpu_fpr = vcpu->arch.fpr;
+ u64 *vcpu_vsx = vcpu->arch.vsr;
+ u64 *thread_fpr = (u64*)t->fpr;
+ int i;
+
+ if (!(vcpu->arch.guest_owned_ext & msr))
+ return;
+
+#ifdef DEBUG_EXT
+ printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
+#endif
+
+ switch (msr) {
+ case MSR_FP:
+ giveup_fpu(current);
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+ vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
+
+ vcpu->arch.fpscr = t->fpscr.val;
+ break;
+ case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+ giveup_altivec(current);
+ memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
+ vcpu->arch.vscr = t->vscr;
+#endif
+ break;
+ case MSR_VSX:
+#ifdef CONFIG_VSX
+ __giveup_vsx(current);
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+ vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ vcpu->arch.guest_owned_ext &= ~msr;
+ current->thread.regs->msr &= ~msr;
+ kvmppc_recalc_shadow_msr(vcpu);
+}
+
+/* Handle external providers (FPU, Altivec, VSX) */
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+ ulong msr)
+{
+ struct thread_struct *t = &current->thread;
+ u64 *vcpu_fpr = vcpu->arch.fpr;
+ u64 *vcpu_vsx = vcpu->arch.vsr;
+ u64 *thread_fpr = (u64*)t->fpr;
+ int i;
+
+ if (!(vcpu->arch.msr & msr)) {
+ kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ return RESUME_GUEST;
+ }
+
+#ifdef DEBUG_EXT
+ printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
+#endif
+
+ current->thread.regs->msr |= msr;
+
+ switch (msr) {
+ case MSR_FP:
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+ thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
+
+ t->fpscr.val = vcpu->arch.fpscr;
+ t->fpexc_mode = 0;
+ kvmppc_load_up_fpu();
+ break;
+ case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+ memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
+ t->vscr = vcpu->arch.vscr;
+ t->vrsave = -1;
+ kvmppc_load_up_altivec();
+#endif
+ break;
+ case MSR_VSX:
+#ifdef CONFIG_VSX
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+ thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
+ kvmppc_load_up_vsx();
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ vcpu->arch.guest_owned_ext |= msr;
+
+ kvmppc_recalc_shadow_msr(vcpu);
+
+ return RESUME_GUEST;
+}
+
int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
@@ -563,7 +708,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_INST_STORAGE:
vcpu->stat.pf_instruc++;
/* only care about PTEG not found errors, but leave NX alone */
- if (vcpu->arch.shadow_msr & 0x40000000) {
+ if (vcpu->arch.shadow_srr1 & 0x40000000) {
r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr);
vcpu->stat.sp_instruc++;
} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
@@ -575,7 +720,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
} else {
- vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x58000000);
+ vcpu->arch.msr |= vcpu->arch.shadow_srr1 & 0x58000000;
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
r = RESUME_GUEST;
@@ -621,6 +766,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_PROGRAM:
{
enum emulation_result er;
+ ulong flags;
+
+ flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
if (vcpu->arch.msr & MSR_PR) {
#ifdef EXIT_DEBUG
@@ -628,7 +776,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
#endif
if ((vcpu->arch.last_inst & 0xff0007ff) !=
(INS_DCBZ & 0xfffffff7)) {
- kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ kvmppc_core_queue_program(vcpu, flags);
r = RESUME_GUEST;
break;
}
@@ -638,12 +786,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
er = kvmppc_emulate_instruction(run, vcpu);
switch (er) {
case EMULATE_DONE:
- r = RESUME_GUEST;
+ r = RESUME_GUEST_NV;
break;
case EMULATE_FAIL:
printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
__func__, vcpu->arch.pc, vcpu->arch.last_inst);
- kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+ kvmppc_core_queue_program(vcpu, flags);
r = RESUME_GUEST;
break;
default:
@@ -653,23 +801,30 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
case BOOK3S_INTERRUPT_SYSCALL:
#ifdef EXIT_DEBUG
- printk(KERN_INFO "Syscall Nr %d\n", (int)vcpu->arch.gpr[0]);
+ printk(KERN_INFO "Syscall Nr %d\n", (int)kvmppc_get_gpr(vcpu, 0));
#endif
vcpu->stat.syscall_exits++;
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
r = RESUME_GUEST;
break;
- case BOOK3S_INTERRUPT_MACHINE_CHECK:
case BOOK3S_INTERRUPT_FP_UNAVAIL:
- case BOOK3S_INTERRUPT_TRACE:
+ r = kvmppc_handle_ext(vcpu, exit_nr, MSR_FP);
+ break;
case BOOK3S_INTERRUPT_ALTIVEC:
+ r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VEC);
+ break;
case BOOK3S_INTERRUPT_VSX:
+ r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VSX);
+ break;
+ case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ case BOOK3S_INTERRUPT_TRACE:
kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
r = RESUME_GUEST;
break;
default:
/* Ugh - bork here! What did we get? */
- printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, vcpu->arch.pc, vcpu->arch.shadow_msr);
+ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
+ exit_nr, vcpu->arch.pc, vcpu->arch.shadow_srr1);
r = RESUME_HOST;
BUG();
break;
@@ -712,10 +867,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
regs->pc = vcpu->arch.pc;
- regs->cr = vcpu->arch.cr;
+ regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = vcpu->arch.ctr;
regs->lr = vcpu->arch.lr;
- regs->xer = vcpu->arch.xer;
+ regs->xer = kvmppc_get_xer(vcpu);
regs->msr = vcpu->arch.msr;
regs->srr0 = vcpu->arch.srr0;
regs->srr1 = vcpu->arch.srr1;
@@ -729,7 +884,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg7 = vcpu->arch.sprg6;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
- regs->gpr[i] = vcpu->arch.gpr[i];
+ regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
return 0;
}
@@ -739,10 +894,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
vcpu->arch.pc = regs->pc;
- vcpu->arch.cr = regs->cr;
+ kvmppc_set_cr(vcpu, regs->cr);
vcpu->arch.ctr = regs->ctr;
vcpu->arch.lr = regs->lr;
- vcpu->arch.xer = regs->xer;
+ kvmppc_set_xer(vcpu, regs->xer);
kvmppc_set_msr(vcpu, regs->msr);
vcpu->arch.srr0 = regs->srr0;
vcpu->arch.srr1 = regs->srr1;
@@ -754,8 +909,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.sprg6 = regs->sprg5;
vcpu->arch.sprg7 = regs->sprg6;
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++)
- vcpu->arch.gpr[i] = regs->gpr[i];
+ for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+ kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
return 0;
}
@@ -850,7 +1005,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
int is_dirty = 0;
int r, n;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
r = kvm_get_dirty_log(kvm, log, &is_dirty);
if (r)
@@ -858,7 +1013,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
ga = memslot->base_gfn << PAGE_SHIFT;
ga_end = ga + (memslot->npages << PAGE_SHIFT);
@@ -872,7 +1027,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
r = 0;
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -910,6 +1065,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+ vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
vcpu->arch.shadow_msr = MSR_USER64;
@@ -943,6 +1099,10 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
int ret;
+ struct thread_struct ext_bkp;
+ bool save_vec = current->thread.used_vr;
+ bool save_vsx = current->thread.used_vsr;
+ ulong ext_msr;
/* No need to go into the guest when all we do is going out */
if (signal_pending(current)) {
@@ -950,6 +1110,35 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return -EINTR;
}
+ /* Save FPU state in stack */
+ if (current->thread.regs->msr & MSR_FP)
+ giveup_fpu(current);
+ memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr));
+ ext_bkp.fpscr = current->thread.fpscr;
+ ext_bkp.fpexc_mode = current->thread.fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+ /* Save Altivec state in stack */
+ if (save_vec) {
+ if (current->thread.regs->msr & MSR_VEC)
+ giveup_altivec(current);
+ memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr));
+ ext_bkp.vscr = current->thread.vscr;
+ ext_bkp.vrsave = current->thread.vrsave;
+ }
+ ext_bkp.used_vr = current->thread.used_vr;
+#endif
+
+#ifdef CONFIG_VSX
+ /* Save VSX state in stack */
+ if (save_vsx && (current->thread.regs->msr & MSR_VSX))
+ __giveup_vsx(current);
+ ext_bkp.used_vsr = current->thread.used_vsr;
+#endif
+
+ /* Remember the MSR with disabled extensions */
+ ext_msr = current->thread.regs->msr;
+
/* XXX we get called with irq disabled - change that! */
local_irq_enable();
@@ -957,6 +1146,32 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
local_irq_disable();
+ current->thread.regs->msr = ext_msr;
+
+ /* Make sure we save the guest FPU/Altivec/VSX state */
+ kvmppc_giveup_ext(vcpu, MSR_FP);
+ kvmppc_giveup_ext(vcpu, MSR_VEC);
+ kvmppc_giveup_ext(vcpu, MSR_VSX);
+
+ /* Restore FPU state from stack */
+ memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr));
+ current->thread.fpscr = ext_bkp.fpscr;
+ current->thread.fpexc_mode = ext_bkp.fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+ /* Restore Altivec state from stack */
+ if (save_vec && current->thread.used_vr) {
+ memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr));
+ current->thread.vscr = ext_bkp.vscr;
+ current->thread.vrsave= ext_bkp.vrsave;
+ }
+ current->thread.used_vr = ext_bkp.used_vr;
+#endif
+
+#ifdef CONFIG_VSX
+ current->thread.used_vsr = ext_bkp.used_vsr;
+#endif
+
return ret;
}
diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c
index 1027eac6d47..2b0ee7e040c 100644
--- a/arch/powerpc/kvm/book3s_64_emulate.c
+++ b/arch/powerpc/kvm/book3s_64_emulate.c
@@ -65,11 +65,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
case 31:
switch (get_xop(inst)) {
case OP_31_XOP_MFMSR:
- vcpu->arch.gpr[get_rt(inst)] = vcpu->arch.msr;
+ kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr);
break;
case OP_31_XOP_MTMSRD:
{
- ulong rs = vcpu->arch.gpr[get_rs(inst)];
+ ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst));
if (inst & 0x10000) {
vcpu->arch.msr &= ~(MSR_RI | MSR_EE);
vcpu->arch.msr |= rs & (MSR_RI | MSR_EE);
@@ -78,30 +78,30 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
}
case OP_31_XOP_MTMSR:
- kvmppc_set_msr(vcpu, vcpu->arch.gpr[get_rs(inst)]);
+ kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst)));
break;
case OP_31_XOP_MFSRIN:
{
int srnum;
- srnum = (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf;
+ srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf;
if (vcpu->arch.mmu.mfsrin) {
u32 sr;
sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
- vcpu->arch.gpr[get_rt(inst)] = sr;
+ kvmppc_set_gpr(vcpu, get_rt(inst), sr);
}
break;
}
case OP_31_XOP_MTSRIN:
vcpu->arch.mmu.mtsrin(vcpu,
- (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf,
- vcpu->arch.gpr[get_rs(inst)]);
+ (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf,
+ kvmppc_get_gpr(vcpu, get_rs(inst)));
break;
case OP_31_XOP_TLBIE:
case OP_31_XOP_TLBIEL:
{
bool large = (inst & 0x00200000) ? true : false;
- ulong addr = vcpu->arch.gpr[get_rb(inst)];
+ ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst));
vcpu->arch.mmu.tlbie(vcpu, addr, large);
break;
}
@@ -111,14 +111,16 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (!vcpu->arch.mmu.slbmte)
return EMULATE_FAIL;
- vcpu->arch.mmu.slbmte(vcpu, vcpu->arch.gpr[get_rs(inst)],
- vcpu->arch.gpr[get_rb(inst)]);
+ vcpu->arch.mmu.slbmte(vcpu,
+ kvmppc_get_gpr(vcpu, get_rs(inst)),
+ kvmppc_get_gpr(vcpu, get_rb(inst)));
break;
case OP_31_XOP_SLBIE:
if (!vcpu->arch.mmu.slbie)
return EMULATE_FAIL;
- vcpu->arch.mmu.slbie(vcpu, vcpu->arch.gpr[get_rb(inst)]);
+ vcpu->arch.mmu.slbie(vcpu,
+ kvmppc_get_gpr(vcpu, get_rb(inst)));
break;
case OP_31_XOP_SLBIA:
if (!vcpu->arch.mmu.slbia)
@@ -132,9 +134,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
} else {
ulong t, rb;
- rb = vcpu->arch.gpr[get_rb(inst)];
+ rb = kvmppc_get_gpr(vcpu, get_rb(inst));
t = vcpu->arch.mmu.slbmfee(vcpu, rb);
- vcpu->arch.gpr[get_rt(inst)] = t;
+ kvmppc_set_gpr(vcpu, get_rt(inst), t);
}
break;
case OP_31_XOP_SLBMFEV:
@@ -143,20 +145,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
} else {
ulong t, rb;
- rb = vcpu->arch.gpr[get_rb(inst)];
+ rb = kvmppc_get_gpr(vcpu, get_rb(inst));
t = vcpu->arch.mmu.slbmfev(vcpu, rb);
- vcpu->arch.gpr[get_rt(inst)] = t;
+ kvmppc_set_gpr(vcpu, get_rt(inst), t);
}
break;
case OP_31_XOP_DCBZ:
{
- ulong rb = vcpu->arch.gpr[get_rb(inst)];
+ ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst));
ulong ra = 0;
ulong addr;
u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
if (get_ra(inst))
- ra = vcpu->arch.gpr[get_ra(inst)];
+ ra = kvmppc_get_gpr(vcpu, get_ra(inst));
addr = (ra + rb) & ~31ULL;
if (!(vcpu->arch.msr & MSR_SF))
@@ -233,43 +235,44 @@ static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
int emulated = EMULATE_DONE;
+ ulong spr_val = kvmppc_get_gpr(vcpu, rs);
switch (sprn) {
case SPRN_SDR1:
- to_book3s(vcpu)->sdr1 = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->sdr1 = spr_val;
break;
case SPRN_DSISR:
- to_book3s(vcpu)->dsisr = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->dsisr = spr_val;
break;
case SPRN_DAR:
- vcpu->arch.dear = vcpu->arch.gpr[rs];
+ vcpu->arch.dear = spr_val;
break;
case SPRN_HIOR:
- to_book3s(vcpu)->hior = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hior = spr_val;
break;
case SPRN_IBAT0U ... SPRN_IBAT3L:
case SPRN_IBAT4U ... SPRN_IBAT7L:
case SPRN_DBAT0U ... SPRN_DBAT3L:
case SPRN_DBAT4U ... SPRN_DBAT7L:
- kvmppc_write_bat(vcpu, sprn, (u32)vcpu->arch.gpr[rs]);
+ kvmppc_write_bat(vcpu, sprn, (u32)spr_val);
/* BAT writes happen so rarely that we're ok to flush
* everything here */
kvmppc_mmu_pte_flush(vcpu, 0, 0);
break;
case SPRN_HID0:
- to_book3s(vcpu)->hid[0] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[0] = spr_val;
break;
case SPRN_HID1:
- to_book3s(vcpu)->hid[1] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[1] = spr_val;
break;
case SPRN_HID2:
- to_book3s(vcpu)->hid[2] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[2] = spr_val;
break;
case SPRN_HID4:
- to_book3s(vcpu)->hid[4] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[4] = spr_val;
break;
case SPRN_HID5:
- to_book3s(vcpu)->hid[5] = vcpu->arch.gpr[rs];
+ to_book3s(vcpu)->hid[5] = spr_val;
/* guest HID5 set can change is_dcbz32 */
if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
(mfmsr() & MSR_HV))
@@ -299,38 +302,38 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_SDR1:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->sdr1;
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
break;
case SPRN_DSISR:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->dsisr;
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr);
break;
case SPRN_DAR:
- vcpu->arch.gpr[rt] = vcpu->arch.dear;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear);
break;
case SPRN_HIOR:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hior;
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior);
break;
case SPRN_HID0:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[0];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]);
break;
case SPRN_HID1:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[1];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]);
break;
case SPRN_HID2:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[2];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]);
break;
case SPRN_HID4:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[4];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]);
break;
case SPRN_HID5:
- vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[5];
+ kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
break;
case SPRN_THRM1:
case SPRN_THRM2:
case SPRN_THRM3:
case SPRN_CTRLF:
case SPRN_CTRLT:
- vcpu->arch.gpr[rt] = 0;
+ kvmppc_set_gpr(vcpu, rt, 0);
break;
default:
printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_64_exports.c
index 5b2db38ed86..1dd5a1ddfd0 100644
--- a/arch/powerpc/kvm/book3s_64_exports.c
+++ b/arch/powerpc/kvm/book3s_64_exports.c
@@ -22,3 +22,11 @@
EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter);
EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem);
+EXPORT_SYMBOL_GPL(kvmppc_rmcall);
+EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
+#ifdef CONFIG_ALTIVEC
+EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
+#endif
+#ifdef CONFIG_VSX
+EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
+#endif
diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S
index 7b55d8094c8..c1584d0cbce 100644
--- a/arch/powerpc/kvm/book3s_64_interrupts.S
+++ b/arch/powerpc/kvm/book3s_64_interrupts.S
@@ -28,11 +28,6 @@
#define ULONG_SIZE 8
#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE))
-.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg
- ld \tmp_reg, (PACA_EXMC+\offset)(r13)
- std \tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg)
-.endm
-
.macro DISABLE_INTERRUPTS
mfmsr r0
rldicl r0,r0,48,1
@@ -40,6 +35,26 @@
mtmsrd r0,1
.endm
+#define VCPU_LOAD_NVGPRS(vcpu) \
+ ld r14, VCPU_GPR(r14)(vcpu); \
+ ld r15, VCPU_GPR(r15)(vcpu); \
+ ld r16, VCPU_GPR(r16)(vcpu); \
+ ld r17, VCPU_GPR(r17)(vcpu); \
+ ld r18, VCPU_GPR(r18)(vcpu); \
+ ld r19, VCPU_GPR(r19)(vcpu); \
+ ld r20, VCPU_GPR(r20)(vcpu); \
+ ld r21, VCPU_GPR(r21)(vcpu); \
+ ld r22, VCPU_GPR(r22)(vcpu); \
+ ld r23, VCPU_GPR(r23)(vcpu); \
+ ld r24, VCPU_GPR(r24)(vcpu); \
+ ld r25, VCPU_GPR(r25)(vcpu); \
+ ld r26, VCPU_GPR(r26)(vcpu); \
+ ld r27, VCPU_GPR(r27)(vcpu); \
+ ld r28, VCPU_GPR(r28)(vcpu); \
+ ld r29, VCPU_GPR(r29)(vcpu); \
+ ld r30, VCPU_GPR(r30)(vcpu); \
+ ld r31, VCPU_GPR(r31)(vcpu); \
+
/*****************************************************************************
* *
* Guest entry / exit code that is in kernel module memory (highmem) *
@@ -67,61 +82,32 @@ kvm_start_entry:
SAVE_NVGPRS(r1)
/* Save LR */
- mflr r14
- std r14, _LINK(r1)
-
-/* XXX optimize non-volatile loading away */
-kvm_start_lightweight:
+ std r0, _LINK(r1)
- DISABLE_INTERRUPTS
+ /* Load non-volatile guest state from the vcpu */
+ VCPU_LOAD_NVGPRS(r4)
/* Save R1/R2 in the PACA */
- std r1, PACAR1(r13)
- std r2, (PACA_EXMC+EX_SRR0)(r13)
+ std r1, PACA_KVM_HOST_R1(r13)
+ std r2, PACA_KVM_HOST_R2(r13)
+
+ /* XXX swap in/out on load? */
ld r3, VCPU_HIGHMEM_HANDLER(r4)
- std r3, PACASAVEDMSR(r13)
+ std r3, PACA_KVM_VMHANDLER(r13)
- /* Load non-volatile guest state from the vcpu */
- ld r14, VCPU_GPR(r14)(r4)
- ld r15, VCPU_GPR(r15)(r4)
- ld r16, VCPU_GPR(r16)(r4)
- ld r17, VCPU_GPR(r17)(r4)
- ld r18, VCPU_GPR(r18)(r4)
- ld r19, VCPU_GPR(r19)(r4)
- ld r20, VCPU_GPR(r20)(r4)
- ld r21, VCPU_GPR(r21)(r4)
- ld r22, VCPU_GPR(r22)(r4)
- ld r23, VCPU_GPR(r23)(r4)
- ld r24, VCPU_GPR(r24)(r4)
- ld r25, VCPU_GPR(r25)(r4)
- ld r26, VCPU_GPR(r26)(r4)
- ld r27, VCPU_GPR(r27)(r4)
- ld r28, VCPU_GPR(r28)(r4)
- ld r29, VCPU_GPR(r29)(r4)
- ld r30, VCPU_GPR(r30)(r4)
- ld r31, VCPU_GPR(r31)(r4)
+kvm_start_lightweight:
ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */
ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
- ld r3, VCPU_TRAMPOLINE_ENTER(r4)
- mtsrr0 r3
-
- LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
- mtsrr1 r3
-
- /* Load guest state in the respective registers */
- lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */
- stw r3, (PACA_EXMC + EX_CCR)(r13)
-
- ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */
- mtctr r3 /* CTR = r3 */
+ /* Load some guest state in the respective registers */
+ ld r5, VCPU_CTR(r4) /* r5 = vcpu->arch.ctr */
+ /* will be swapped in by rmcall */
ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */
mtlr r3 /* LR = r3 */
- ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */
- std r3, (PACA_EXMC + EX_R3)(r13)
+ DISABLE_INTERRUPTS
/* Some guests may need to have dcbz set to 32 byte length.
*
@@ -141,36 +127,15 @@ kvm_start_lightweight:
mtspr SPRN_HID5,r3
no_dcbz32_on:
- /* Load guest GPRs */
-
- ld r3, VCPU_GPR(r9)(r4)
- std r3, (PACA_EXMC + EX_R9)(r13)
- ld r3, VCPU_GPR(r10)(r4)
- std r3, (PACA_EXMC + EX_R10)(r13)
- ld r3, VCPU_GPR(r11)(r4)
- std r3, (PACA_EXMC + EX_R11)(r13)
- ld r3, VCPU_GPR(r12)(r4)
- std r3, (PACA_EXMC + EX_R12)(r13)
- ld r3, VCPU_GPR(r13)(r4)
- std r3, (PACA_EXMC + EX_R13)(r13)
-
- ld r0, VCPU_GPR(r0)(r4)
- ld r1, VCPU_GPR(r1)(r4)
- ld r2, VCPU_GPR(r2)(r4)
- ld r3, VCPU_GPR(r3)(r4)
- ld r5, VCPU_GPR(r5)(r4)
- ld r6, VCPU_GPR(r6)(r4)
- ld r7, VCPU_GPR(r7)(r4)
- ld r8, VCPU_GPR(r8)(r4)
- ld r4, VCPU_GPR(r4)(r4)
-
- /* This sets the Magic value for the trampoline */
-
- li r11, 1
- stb r11, PACA_KVM_IN_GUEST(r13)
+
+ ld r6, VCPU_RMCALL(r4)
+ mtctr r6
+
+ ld r3, VCPU_TRAMPOLINE_ENTER(r4)
+ LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR))
/* Jump to SLB patching handlder and into our guest */
- RFI
+ bctr
/*
* This is the handler in module memory. It gets jumped at from the
@@ -184,125 +149,70 @@ kvmppc_handler_highmem:
/*
* Register usage at this point:
*
- * R00 = guest R13
- * R01 = host R1
- * R02 = host R2
- * R10 = guest PC
- * R11 = guest MSR
- * R12 = exit handler id
- * R13 = PACA
- * PACA.exmc.R9 = guest R1
- * PACA.exmc.R10 = guest R10
- * PACA.exmc.R11 = guest R11
- * PACA.exmc.R12 = guest R12
- * PACA.exmc.R13 = guest R2
- * PACA.exmc.DAR = guest DAR
- * PACA.exmc.DSISR = guest DSISR
- * PACA.exmc.LR = guest instruction
- * PACA.exmc.CCR = guest CR
- * PACA.exmc.SRR0 = guest R0
+ * R0 = guest last inst
+ * R1 = host R1
+ * R2 = host R2
+ * R3 = guest PC
+ * R4 = guest MSR
+ * R5 = guest DAR
+ * R6 = guest DSISR
+ * R13 = PACA
+ * PACA.KVM.* = guest *
*
*/
- std r3, (PACA_EXMC+EX_R3)(r13)
+ /* R7 = vcpu */
+ ld r7, GPR4(r1)
- /* save the exit id in R3 */
- mr r3, r12
+ /* Now save the guest state */
- /* R12 = vcpu */
- ld r12, GPR4(r1)
+ stw r0, VCPU_LAST_INST(r7)
- /* Now save the guest state */
+ std r3, VCPU_PC(r7)
+ std r4, VCPU_SHADOW_SRR1(r7)
+ std r5, VCPU_FAULT_DEAR(r7)
+ std r6, VCPU_FAULT_DSISR(r7)
- std r0, VCPU_GPR(r13)(r12)
- std r4, VCPU_GPR(r4)(r12)
- std r5, VCPU_GPR(r5)(r12)
- std r6, VCPU_GPR(r6)(r12)
- std r7, VCPU_GPR(r7)(r12)
- std r8, VCPU_GPR(r8)(r12)
- std r9, VCPU_GPR(r9)(r12)
-
- /* get registers from PACA */
- mfpaca r5, r0, EX_SRR0, r12
- mfpaca r5, r3, EX_R3, r12
- mfpaca r5, r1, EX_R9, r12
- mfpaca r5, r10, EX_R10, r12
- mfpaca r5, r11, EX_R11, r12
- mfpaca r5, r12, EX_R12, r12
- mfpaca r5, r2, EX_R13, r12
-
- lwz r5, (PACA_EXMC+EX_LR)(r13)
- stw r5, VCPU_LAST_INST(r12)
-
- lwz r5, (PACA_EXMC+EX_CCR)(r13)
- stw r5, VCPU_CR(r12)
-
- ld r5, VCPU_HFLAGS(r12)
+ ld r5, VCPU_HFLAGS(r7)
rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */
beq no_dcbz32_off
+ li r4, 0
mfspr r5,SPRN_HID5
- rldimi r5,r5,6,56
+ rldimi r5,r4,6,56
mtspr SPRN_HID5,r5
no_dcbz32_off:
- /* XXX maybe skip on lightweight? */
- std r14, VCPU_GPR(r14)(r12)
- std r15, VCPU_GPR(r15)(r12)
- std r16, VCPU_GPR(r16)(r12)
- std r17, VCPU_GPR(r17)(r12)
- std r18, VCPU_GPR(r18)(r12)
- std r19, VCPU_GPR(r19)(r12)
- std r20, VCPU_GPR(r20)(r12)
- std r21, VCPU_GPR(r21)(r12)
- std r22, VCPU_GPR(r22)(r12)
- std r23, VCPU_GPR(r23)(r12)
- std r24, VCPU_GPR(r24)(r12)
- std r25, VCPU_GPR(r25)(r12)
- std r26, VCPU_GPR(r26)(r12)
- std r27, VCPU_GPR(r27)(r12)
- std r28, VCPU_GPR(r28)(r12)
- std r29, VCPU_GPR(r29)(r12)
- std r30, VCPU_GPR(r30)(r12)
- std r31, VCPU_GPR(r31)(r12)
-
- /* Restore non-volatile host registers (r14 - r31) */
- REST_NVGPRS(r1)
-
- /* Save guest PC (R10) */
- std r10, VCPU_PC(r12)
-
- /* Save guest msr (R11) */
- std r11, VCPU_SHADOW_MSR(r12)
-
- /* Save guest CTR (in R12) */
+ std r14, VCPU_GPR(r14)(r7)
+ std r15, VCPU_GPR(r15)(r7)
+ std r16, VCPU_GPR(r16)(r7)
+ std r17, VCPU_GPR(r17)(r7)
+ std r18, VCPU_GPR(r18)(r7)
+ std r19, VCPU_GPR(r19)(r7)
+ std r20, VCPU_GPR(r20)(r7)
+ std r21, VCPU_GPR(r21)(r7)
+ std r22, VCPU_GPR(r22)(r7)
+ std r23, VCPU_GPR(r23)(r7)
+ std r24, VCPU_GPR(r24)(r7)
+ std r25, VCPU_GPR(r25)(r7)
+ std r26, VCPU_GPR(r26)(r7)
+ std r27, VCPU_GPR(r27)(r7)
+ std r28, VCPU_GPR(r28)(r7)
+ std r29, VCPU_GPR(r29)(r7)
+ std r30, VCPU_GPR(r30)(r7)
+ std r31, VCPU_GPR(r31)(r7)
+
+ /* Save guest CTR */
mfctr r5
- std r5, VCPU_CTR(r12)
+ std r5, VCPU_CTR(r7)
/* Save guest LR */
mflr r5
- std r5, VCPU_LR(r12)
-
- /* Save guest XER */
- mfxer r5
- std r5, VCPU_XER(r12)
-
- /* Save guest DAR */
- ld r5, (PACA_EXMC+EX_DAR)(r13)
- std r5, VCPU_FAULT_DEAR(r12)
-
- /* Save guest DSISR */
- lwz r5, (PACA_EXMC+EX_DSISR)(r13)
- std r5, VCPU_FAULT_DSISR(r12)
+ std r5, VCPU_LR(r7)
/* Restore host msr -> SRR1 */
- ld r7, VCPU_HOST_MSR(r12)
- mtsrr1 r7
-
- /* Restore host IP -> SRR0 */
- ld r6, VCPU_HOST_RETIP(r12)
- mtsrr0 r6
+ ld r6, VCPU_HOST_MSR(r7)
/*
* For some interrupts, we need to call the real Linux
@@ -314,13 +224,14 @@ no_dcbz32_off:
* r3 = address of interrupt handler (exit reason)
*/
- cmpwi r3, BOOK3S_INTERRUPT_EXTERNAL
+ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq call_linux_handler
- cmpwi r3, BOOK3S_INTERRUPT_DECREMENTER
+ cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER
beq call_linux_handler
- /* Back to Interruptable Mode! (goto kvm_return_point) */
- RFI
+ /* Back to EE=1 */
+ mtmsr r6
+ b kvm_return_point
call_linux_handler:
@@ -333,16 +244,22 @@ call_linux_handler:
* interrupt handler!
*
* R3 still contains the exit code,
- * R6 VCPU_HOST_RETIP and
- * R7 VCPU_HOST_MSR
+ * R5 VCPU_HOST_RETIP and
+ * R6 VCPU_HOST_MSR
*/
- mtlr r3
+ /* Restore host IP -> SRR0 */
+ ld r5, VCPU_HOST_RETIP(r7)
+
+ /* XXX Better move to a safe function?
+ * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */
- ld r5, VCPU_TRAMPOLINE_LOWMEM(r12)
- mtsrr0 r5
- LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR))
- mtsrr1 r5
+ mtlr r12
+
+ ld r4, VCPU_TRAMPOLINE_LOWMEM(r7)
+ mtsrr0 r4
+ LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+ mtsrr1 r3
RFI
@@ -351,42 +268,51 @@ kvm_return_point:
/* Jump back to lightweight entry if we're supposed to */
/* go back into the guest */
- mr r5, r3
+
+ /* Pass the exit number as 3rd argument to kvmppc_handle_exit */
+ mr r5, r12
+
/* Restore r3 (kvm_run) and r4 (vcpu) */
REST_2GPRS(3, r1)
bl KVMPPC_HANDLE_EXIT
-#if 0 /* XXX get lightweight exits back */
+ /* If RESUME_GUEST, get back in the loop */
cmpwi r3, RESUME_GUEST
- bne kvm_exit_heavyweight
+ beq kvm_loop_lightweight
- /* put VCPU and KVM_RUN back into place and roll again! */
- REST_2GPRS(3, r1)
- b kvm_start_lightweight
+ cmpwi r3, RESUME_GUEST_NV
+ beq kvm_loop_heavyweight
-kvm_exit_heavyweight:
- /* Restore non-volatile host registers */
- ld r14, _LINK(r1)
- mtlr r14
- REST_NVGPRS(r1)
+kvm_exit_loop:
- addi r1, r1, SWITCH_FRAME_SIZE
-#else
ld r4, _LINK(r1)
mtlr r4
- cmpwi r3, RESUME_GUEST
- bne kvm_exit_heavyweight
+ /* Restore non-volatile host registers (r14 - r31) */
+ REST_NVGPRS(r1)
+
+ addi r1, r1, SWITCH_FRAME_SIZE
+ blr
+
+kvm_loop_heavyweight:
+
+ ld r4, _LINK(r1)
+ std r4, (16 + SWITCH_FRAME_SIZE)(r1)
+ /* Load vcpu and cpu_run */
REST_2GPRS(3, r1)
- addi r1, r1, SWITCH_FRAME_SIZE
+ /* Load non-volatile guest state from the vcpu */
+ VCPU_LOAD_NVGPRS(r4)
- b kvm_start_entry
+ /* Jump back into the beginning of this function */
+ b kvm_start_lightweight
-kvm_exit_heavyweight:
+kvm_loop_lightweight:
- addi r1, r1, SWITCH_FRAME_SIZE
-#endif
+ /* We'll need the vcpu pointer */
+ REST_GPR(4, r1)
+
+ /* Jump back into the beginning of this function */
+ b kvm_start_lightweight
- blr
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index e4beeb371a7..512dcff7755 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -54,7 +54,7 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
if (!vcpu_book3s->slb[i].valid)
continue;
- if (vcpu_book3s->slb[i].large)
+ if (vcpu_book3s->slb[i].tb)
cmp_esid = esid_1t;
if (vcpu_book3s->slb[i].esid == cmp_esid)
@@ -65,9 +65,10 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
eaddr, esid, esid_1t);
for (i = 0; i < vcpu_book3s->slb_nr; i++) {
if (vcpu_book3s->slb[i].vsid)
- dprintk(" %d: %c%c %llx %llx\n", i,
+ dprintk(" %d: %c%c%c %llx %llx\n", i,
vcpu_book3s->slb[i].valid ? 'v' : ' ',
vcpu_book3s->slb[i].large ? 'l' : ' ',
+ vcpu_book3s->slb[i].tb ? 't' : ' ',
vcpu_book3s->slb[i].esid,
vcpu_book3s->slb[i].vsid);
}
@@ -84,7 +85,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
if (!slb)
return 0;
- if (slb->large)
+ if (slb->tb)
return (((u64)eaddr >> 12) & 0xfffffff) |
(((u64)slb->vsid) << 28);
@@ -309,7 +310,8 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
slbe = &vcpu_book3s->slb[slb_nr];
slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
- slbe->esid = slbe->large ? esid_1t : esid;
+ slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0;
+ slbe->esid = slbe->tb ? esid_1t : esid;
slbe->vsid = rs >> 12;
slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0;
diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_64_rmhandlers.S
index fb7dd2e9ac8..c83c60ad96c 100644
--- a/arch/powerpc/kvm/book3s_64_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_64_rmhandlers.S
@@ -45,36 +45,25 @@ kvmppc_trampoline_\intno:
* To distinguish, we check a magic byte in the PACA
*/
mfspr r13, SPRN_SPRG_PACA /* r13 = PACA */
- std r12, (PACA_EXMC + EX_R12)(r13)
+ std r12, PACA_KVM_SCRATCH0(r13)
mfcr r12
- stw r12, (PACA_EXMC + EX_CCR)(r13)
+ stw r12, PACA_KVM_SCRATCH1(r13)
lbz r12, PACA_KVM_IN_GUEST(r13)
- cmpwi r12, 0
+ cmpwi r12, KVM_GUEST_MODE_NONE
bne ..kvmppc_handler_hasmagic_\intno
/* No KVM guest? Then jump back to the Linux handler! */
- lwz r12, (PACA_EXMC + EX_CCR)(r13)
+ lwz r12, PACA_KVM_SCRATCH1(r13)
mtcr r12
- ld r12, (PACA_EXMC + EX_R12)(r13)
+ ld r12, PACA_KVM_SCRATCH0(r13)
mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */
b kvmppc_resume_\intno /* Get back original handler */
/* Now we know we're handling a KVM guest */
..kvmppc_handler_hasmagic_\intno:
- /* Unset guest state */
- li r12, 0
- stb r12, PACA_KVM_IN_GUEST(r13)
- std r1, (PACA_EXMC+EX_R9)(r13)
- std r10, (PACA_EXMC+EX_R10)(r13)
- std r11, (PACA_EXMC+EX_R11)(r13)
- std r2, (PACA_EXMC+EX_R13)(r13)
-
- mfsrr0 r10
- mfsrr1 r11
-
- /* Restore R1/R2 so we can handle faults */
- ld r1, PACAR1(r13)
- ld r2, (PACA_EXMC+EX_SRR0)(r13)
+ /* Should we just skip the faulting instruction? */
+ cmpwi r12, KVM_GUEST_MODE_SKIP
+ beq kvmppc_handler_skip_ins
/* Let's store which interrupt we're handling */
li r12, \intno
@@ -102,23 +91,107 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC
INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX
/*
+ * Bring us back to the faulting code, but skip the
+ * faulting instruction.
+ *
+ * This is a generic exit path from the interrupt
+ * trampolines above.
+ *
+ * Input Registers:
+ *
+ * R12 = free
+ * R13 = PACA
+ * PACA.KVM.SCRATCH0 = guest R12
+ * PACA.KVM.SCRATCH1 = guest CR
+ * SPRG_SCRATCH0 = guest R13
+ *
+ */
+kvmppc_handler_skip_ins:
+
+ /* Patch the IP to the next instruction */
+ mfsrr0 r12
+ addi r12, r12, 4
+ mtsrr0 r12
+
+ /* Clean up all state */
+ lwz r12, PACA_KVM_SCRATCH1(r13)
+ mtcr r12
+ ld r12, PACA_KVM_SCRATCH0(r13)
+ mfspr r13, SPRN_SPRG_SCRATCH0
+
+ /* And get back into the code */
+ RFI
+
+/*
* This trampoline brings us back to a real mode handler
*
* Input Registers:
*
- * R6 = SRR0
- * R7 = SRR1
+ * R5 = SRR0
+ * R6 = SRR1
* LR = real-mode IP
*
*/
.global kvmppc_handler_lowmem_trampoline
kvmppc_handler_lowmem_trampoline:
- mtsrr0 r6
- mtsrr1 r7
+ mtsrr0 r5
+ mtsrr1 r6
blr
kvmppc_handler_lowmem_trampoline_end:
+/*
+ * Call a function in real mode
+ *
+ * Input Registers:
+ *
+ * R3 = function
+ * R4 = MSR
+ * R5 = CTR
+ *
+ */
+_GLOBAL(kvmppc_rmcall)
+ mtmsr r4 /* Disable relocation, so mtsrr
+ doesn't get interrupted */
+ mtctr r5
+ mtsrr0 r3
+ mtsrr1 r4
+ RFI
+
+/*
+ * Activate current's external feature (FPU/Altivec/VSX)
+ */
+#define define_load_up(what) \
+ \
+_GLOBAL(kvmppc_load_up_ ## what); \
+ subi r1, r1, INT_FRAME_SIZE; \
+ mflr r3; \
+ std r3, _LINK(r1); \
+ mfmsr r4; \
+ std r31, GPR3(r1); \
+ mr r31, r4; \
+ li r5, MSR_DR; \
+ oris r5, r5, MSR_EE@h; \
+ andc r4, r4, r5; \
+ mtmsr r4; \
+ \
+ bl .load_up_ ## what; \
+ \
+ mtmsr r31; \
+ ld r3, _LINK(r1); \
+ ld r31, GPR3(r1); \
+ addi r1, r1, INT_FRAME_SIZE; \
+ mtlr r3; \
+ blr
+
+define_load_up(fpu)
+#ifdef CONFIG_ALTIVEC
+define_load_up(altivec)
+#endif
+#ifdef CONFIG_VSX
+define_load_up(vsx)
+#endif
+
.global kvmppc_trampoline_lowmem
kvmppc_trampoline_lowmem:
.long kvmppc_handler_lowmem_trampoline - _stext
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index ecd237a03fd..35b76272218 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -31,7 +31,7 @@
#define REBOLT_SLB_ENTRY(num) \
ld r10, SHADOW_SLB_ESID(num)(r11); \
cmpdi r10, 0; \
- beq slb_exit_skip_1; \
+ beq slb_exit_skip_ ## num; \
oris r10, r10, SLB_ESID_V@h; \
ld r9, SHADOW_SLB_VSID(num)(r11); \
slbmte r9, r10; \
@@ -51,23 +51,21 @@ kvmppc_handler_trampoline_enter:
*
* MSR = ~IR|DR
* R13 = PACA
+ * R1 = host R1
+ * R2 = host R2
* R9 = guest IP
* R10 = guest MSR
- * R11 = free
- * R12 = free
- * PACA[PACA_EXMC + EX_R9] = guest R9
- * PACA[PACA_EXMC + EX_R10] = guest R10
- * PACA[PACA_EXMC + EX_R11] = guest R11
- * PACA[PACA_EXMC + EX_R12] = guest R12
- * PACA[PACA_EXMC + EX_R13] = guest R13
- * PACA[PACA_EXMC + EX_CCR] = guest CR
- * PACA[PACA_EXMC + EX_R3] = guest XER
+ * all other GPRS = free
+ * PACA[KVM_CR] = guest CR
+ * PACA[KVM_XER] = guest XER
*/
mtsrr0 r9
mtsrr1 r10
- mtspr SPRN_SPRG_SCRATCH0, r0
+ /* Activate guest mode, so faults get handled by KVM */
+ li r11, KVM_GUEST_MODE_GUEST
+ stb r11, PACA_KVM_IN_GUEST(r13)
/* Remove LPAR shadow entries */
@@ -131,20 +129,27 @@ slb_do_enter:
/* Enter guest */
- mfspr r0, SPRN_SPRG_SCRATCH0
-
- ld r9, (PACA_EXMC+EX_R9)(r13)
- ld r10, (PACA_EXMC+EX_R10)(r13)
- ld r12, (PACA_EXMC+EX_R12)(r13)
-
- lwz r11, (PACA_EXMC+EX_CCR)(r13)
+ ld r0, (PACA_KVM_R0)(r13)
+ ld r1, (PACA_KVM_R1)(r13)
+ ld r2, (PACA_KVM_R2)(r13)
+ ld r3, (PACA_KVM_R3)(r13)
+ ld r4, (PACA_KVM_R4)(r13)
+ ld r5, (PACA_KVM_R5)(r13)
+ ld r6, (PACA_KVM_R6)(r13)
+ ld r7, (PACA_KVM_R7)(r13)
+ ld r8, (PACA_KVM_R8)(r13)
+ ld r9, (PACA_KVM_R9)(r13)
+ ld r10, (PACA_KVM_R10)(r13)
+ ld r12, (PACA_KVM_R12)(r13)
+
+ lwz r11, (PACA_KVM_CR)(r13)
mtcr r11
- ld r11, (PACA_EXMC+EX_R3)(r13)
+ ld r11, (PACA_KVM_XER)(r13)
mtxer r11
- ld r11, (PACA_EXMC+EX_R11)(r13)
- ld r13, (PACA_EXMC+EX_R13)(r13)
+ ld r11, (PACA_KVM_R11)(r13)
+ ld r13, (PACA_KVM_R13)(r13)
RFI
kvmppc_handler_trampoline_enter_end:
@@ -162,28 +167,54 @@ kvmppc_handler_trampoline_exit:
/* Register usage at this point:
*
- * SPRG_SCRATCH0 = guest R13
- * R01 = host R1
- * R02 = host R2
- * R10 = guest PC
- * R11 = guest MSR
- * R12 = exit handler id
- * R13 = PACA
- * PACA.exmc.CCR = guest CR
- * PACA.exmc.R9 = guest R1
- * PACA.exmc.R10 = guest R10
- * PACA.exmc.R11 = guest R11
- * PACA.exmc.R12 = guest R12
- * PACA.exmc.R13 = guest R2
+ * SPRG_SCRATCH0 = guest R13
+ * R12 = exit handler id
+ * R13 = PACA
+ * PACA.KVM.SCRATCH0 = guest R12
+ * PACA.KVM.SCRATCH1 = guest CR
*
*/
/* Save registers */
- std r0, (PACA_EXMC+EX_SRR0)(r13)
- std r9, (PACA_EXMC+EX_R3)(r13)
- std r10, (PACA_EXMC+EX_LR)(r13)
- std r11, (PACA_EXMC+EX_DAR)(r13)
+ std r0, PACA_KVM_R0(r13)
+ std r1, PACA_KVM_R1(r13)
+ std r2, PACA_KVM_R2(r13)
+ std r3, PACA_KVM_R3(r13)
+ std r4, PACA_KVM_R4(r13)
+ std r5, PACA_KVM_R5(r13)
+ std r6, PACA_KVM_R6(r13)
+ std r7, PACA_KVM_R7(r13)
+ std r8, PACA_KVM_R8(r13)
+ std r9, PACA_KVM_R9(r13)
+ std r10, PACA_KVM_R10(r13)
+ std r11, PACA_KVM_R11(r13)
+
+ /* Restore R1/R2 so we can handle faults */
+ ld r1, PACA_KVM_HOST_R1(r13)
+ ld r2, PACA_KVM_HOST_R2(r13)
+
+ /* Save guest PC and MSR in GPRs */
+ mfsrr0 r3
+ mfsrr1 r4
+
+ /* Get scratch'ed off registers */
+ mfspr r9, SPRN_SPRG_SCRATCH0
+ std r9, PACA_KVM_R13(r13)
+
+ ld r8, PACA_KVM_SCRATCH0(r13)
+ std r8, PACA_KVM_R12(r13)
+
+ lwz r7, PACA_KVM_SCRATCH1(r13)
+ stw r7, PACA_KVM_CR(r13)
+
+ /* Save more register state */
+
+ mfxer r6
+ stw r6, PACA_KVM_XER(r13)
+
+ mfdar r5
+ mfdsisr r6
/*
* In order for us to easily get the last instruction,
@@ -202,17 +233,28 @@ kvmppc_handler_trampoline_exit:
ld_last_inst:
/* Save off the guest instruction we're at */
+
+ /* Set guest mode to 'jump over instruction' so if lwz faults
+ * we'll just continue at the next IP. */
+ li r9, KVM_GUEST_MODE_SKIP
+ stb r9, PACA_KVM_IN_GUEST(r13)
+
/* 1) enable paging for data */
mfmsr r9
ori r11, r9, MSR_DR /* Enable paging for data */
mtmsr r11
/* 2) fetch the instruction */
- lwz r0, 0(r10)
+ li r0, KVM_INST_FETCH_FAILED /* In case lwz faults */
+ lwz r0, 0(r3)
/* 3) disable paging again */
mtmsr r9
no_ld_last_inst:
+ /* Unset guest mode */
+ li r9, KVM_GUEST_MODE_NONE
+ stb r9, PACA_KVM_IN_GUEST(r13)
+
/* Restore bolted entries from the shadow and fix it along the way */
/* We don't store anything in entry 0, so we don't need to take care of it */
@@ -233,29 +275,27 @@ no_ld_last_inst:
slb_do_exit:
- /* Restore registers */
-
- ld r11, (PACA_EXMC+EX_DAR)(r13)
- ld r10, (PACA_EXMC+EX_LR)(r13)
- ld r9, (PACA_EXMC+EX_R3)(r13)
-
- /* Save last inst */
- stw r0, (PACA_EXMC+EX_LR)(r13)
-
- /* Save DAR and DSISR before going to paged mode */
- mfdar r0
- std r0, (PACA_EXMC+EX_DAR)(r13)
- mfdsisr r0
- stw r0, (PACA_EXMC+EX_DSISR)(r13)
+ /* Register usage at this point:
+ *
+ * R0 = guest last inst
+ * R1 = host R1
+ * R2 = host R2
+ * R3 = guest PC
+ * R4 = guest MSR
+ * R5 = guest DAR
+ * R6 = guest DSISR
+ * R12 = exit handler id
+ * R13 = PACA
+ * PACA.KVM.* = guest *
+ *
+ */
/* RFI into the highmem handler */
- mfmsr r0
- ori r0, r0, MSR_IR|MSR_DR|MSR_RI /* Enable paging */
- mtsrr1 r0
- ld r0, PACASAVEDMSR(r13) /* Highmem handler address */
- mtsrr0 r0
-
- mfspr r0, SPRN_SPRG_SCRATCH0
+ mfmsr r7
+ ori r7, r7, MSR_IR|MSR_DR|MSR_RI /* Enable paging */
+ mtsrr1 r7
+ ld r8, PACA_KVM_VMHANDLER(r13) /* Highmem handler address */
+ mtsrr0 r8
RFI
kvmppc_handler_trampoline_exit_end:
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 06f5a9ecc42..4d686cc6b26 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -69,10 +69,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
for (i = 0; i < 32; i += 4) {
printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i,
- vcpu->arch.gpr[i],
- vcpu->arch.gpr[i+1],
- vcpu->arch.gpr[i+2],
- vcpu->arch.gpr[i+3]);
+ kvmppc_get_gpr(vcpu, i),
+ kvmppc_get_gpr(vcpu, i+1),
+ kvmppc_get_gpr(vcpu, i+2),
+ kvmppc_get_gpr(vcpu, i+3));
}
}
@@ -82,8 +82,32 @@ static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
set_bit(priority, &vcpu->arch.pending_exceptions);
}
-void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
+static void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu,
+ ulong dear_flags, ulong esr_flags)
{
+ vcpu->arch.queued_dear = dear_flags;
+ vcpu->arch.queued_esr = esr_flags;
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
+}
+
+static void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu,
+ ulong dear_flags, ulong esr_flags)
+{
+ vcpu->arch.queued_dear = dear_flags;
+ vcpu->arch.queued_esr = esr_flags;
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
+}
+
+static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu,
+ ulong esr_flags)
+{
+ vcpu->arch.queued_esr = esr_flags;
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
+}
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
+{
+ vcpu->arch.queued_esr = esr_flags;
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
}
@@ -97,6 +121,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
}
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+ clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+
void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
@@ -109,14 +138,19 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
{
int allowed = 0;
ulong msr_mask;
+ bool update_esr = false, update_dear = false;
switch (priority) {
- case BOOKE_IRQPRIO_PROGRAM:
case BOOKE_IRQPRIO_DTLB_MISS:
- case BOOKE_IRQPRIO_ITLB_MISS:
- case BOOKE_IRQPRIO_SYSCALL:
case BOOKE_IRQPRIO_DATA_STORAGE:
+ update_dear = true;
+ /* fall through */
case BOOKE_IRQPRIO_INST_STORAGE:
+ case BOOKE_IRQPRIO_PROGRAM:
+ update_esr = true;
+ /* fall through */
+ case BOOKE_IRQPRIO_ITLB_MISS:
+ case BOOKE_IRQPRIO_SYSCALL:
case BOOKE_IRQPRIO_FP_UNAVAIL:
case BOOKE_IRQPRIO_SPE_UNAVAIL:
case BOOKE_IRQPRIO_SPE_FP_DATA:
@@ -151,6 +185,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
vcpu->arch.srr0 = vcpu->arch.pc;
vcpu->arch.srr1 = vcpu->arch.msr;
vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
+ if (update_esr == true)
+ vcpu->arch.esr = vcpu->arch.queued_esr;
+ if (update_dear == true)
+ vcpu->arch.dear = vcpu->arch.queued_dear;
kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask);
clear_bit(priority, &vcpu->arch.pending_exceptions);
@@ -223,8 +261,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (vcpu->arch.msr & MSR_PR) {
/* Program traps generated by user-level software must be handled
* by the guest kernel. */
- vcpu->arch.esr = vcpu->arch.fault_esr;
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
+ kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
r = RESUME_GUEST;
kvmppc_account_exit(vcpu, USR_PR_INST);
break;
@@ -280,16 +317,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case BOOKE_INTERRUPT_DATA_STORAGE:
- vcpu->arch.dear = vcpu->arch.fault_dear;
- vcpu->arch.esr = vcpu->arch.fault_esr;
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
+ kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear,
+ vcpu->arch.fault_esr);
kvmppc_account_exit(vcpu, DSI_EXITS);
r = RESUME_GUEST;
break;
case BOOKE_INTERRUPT_INST_STORAGE:
- vcpu->arch.esr = vcpu->arch.fault_esr;
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
+ kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr);
kvmppc_account_exit(vcpu, ISI_EXITS);
r = RESUME_GUEST;
break;
@@ -310,9 +345,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
if (gtlb_index < 0) {
/* The guest didn't have a mapping for it. */
- kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
- vcpu->arch.dear = vcpu->arch.fault_dear;
- vcpu->arch.esr = vcpu->arch.fault_esr;
+ kvmppc_core_queue_dtlb_miss(vcpu,
+ vcpu->arch.fault_dear,
+ vcpu->arch.fault_esr);
kvmppc_mmu_dtlb_miss(vcpu);
kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
r = RESUME_GUEST;
@@ -426,7 +461,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
vcpu->arch.pc = 0;
vcpu->arch.msr = 0;
- vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
+ kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
vcpu->arch.shadow_pid = 1;
@@ -444,10 +479,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
regs->pc = vcpu->arch.pc;
- regs->cr = vcpu->arch.cr;
+ regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = vcpu->arch.ctr;
regs->lr = vcpu->arch.lr;
- regs->xer = vcpu->arch.xer;
+ regs->xer = kvmppc_get_xer(vcpu);
regs->msr = vcpu->arch.msr;
regs->srr0 = vcpu->arch.srr0;
regs->srr1 = vcpu->arch.srr1;
@@ -461,7 +496,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg7 = vcpu->arch.sprg6;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
- regs->gpr[i] = vcpu->arch.gpr[i];
+ regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
return 0;
}
@@ -471,10 +506,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int i;
vcpu->arch.pc = regs->pc;
- vcpu->arch.cr = regs->cr;
+ kvmppc_set_cr(vcpu, regs->cr);
vcpu->arch.ctr = regs->ctr;
vcpu->arch.lr = regs->lr;
- vcpu->arch.xer = regs->xer;
+ kvmppc_set_xer(vcpu, regs->xer);
kvmppc_set_msr(vcpu, regs->msr);
vcpu->arch.srr0 = regs->srr0;
vcpu->arch.srr1 = regs->srr1;
@@ -486,8 +521,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.sprg6 = regs->sprg5;
vcpu->arch.sprg7 = regs->sprg6;
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++)
- vcpu->arch.gpr[i] = regs->gpr[i];
+ for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+ kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
return 0;
}
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index aebc65e93f4..cbc790ee192 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -62,20 +62,20 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
case OP_31_XOP_MFMSR:
rt = get_rt(inst);
- vcpu->arch.gpr[rt] = vcpu->arch.msr;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr);
kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
break;
case OP_31_XOP_MTMSR:
rs = get_rs(inst);
kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
- kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
+ kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
break;
case OP_31_XOP_WRTEE:
rs = get_rs(inst);
vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
- | (vcpu->arch.gpr[rs] & MSR_EE);
+ | (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
break;
@@ -101,22 +101,23 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
int emulated = EMULATE_DONE;
+ ulong spr_val = kvmppc_get_gpr(vcpu, rs);
switch (sprn) {
case SPRN_DEAR:
- vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dear = spr_val; break;
case SPRN_ESR:
- vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.esr = spr_val; break;
case SPRN_DBCR0:
- vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dbcr0 = spr_val; break;
case SPRN_DBCR1:
- vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dbcr1 = spr_val; break;
case SPRN_DBSR:
- vcpu->arch.dbsr &= ~vcpu->arch.gpr[rs]; break;
+ vcpu->arch.dbsr &= ~spr_val; break;
case SPRN_TSR:
- vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
+ vcpu->arch.tsr &= ~spr_val; break;
case SPRN_TCR:
- vcpu->arch.tcr = vcpu->arch.gpr[rs];
+ vcpu->arch.tcr = spr_val;
kvmppc_emulate_dec(vcpu);
break;
@@ -124,64 +125,64 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
* loaded into the real SPRGs when resuming the
* guest. */
case SPRN_SPRG4:
- vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg4 = spr_val; break;
case SPRN_SPRG5:
- vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg5 = spr_val; break;
case SPRN_SPRG6:
- vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg6 = spr_val; break;
case SPRN_SPRG7:
- vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg7 = spr_val; break;
case SPRN_IVPR:
- vcpu->arch.ivpr = vcpu->arch.gpr[rs];
+ vcpu->arch.ivpr = spr_val;
break;
case SPRN_IVOR0:
- vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val;
break;
case SPRN_IVOR1:
- vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val;
break;
case SPRN_IVOR2:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val;
break;
case SPRN_IVOR3:
- vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val;
break;
case SPRN_IVOR4:
- vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val;
break;
case SPRN_IVOR5:
- vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val;
break;
case SPRN_IVOR6:
- vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val;
break;
case SPRN_IVOR7:
- vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val;
break;
case SPRN_IVOR8:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val;
break;
case SPRN_IVOR9:
- vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val;
break;
case SPRN_IVOR10:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val;
break;
case SPRN_IVOR11:
- vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val;
break;
case SPRN_IVOR12:
- vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val;
break;
case SPRN_IVOR13:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val;
break;
case SPRN_IVOR14:
- vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val;
break;
case SPRN_IVOR15:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
break;
default:
@@ -197,65 +198,65 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_IVPR:
- vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break;
case SPRN_DEAR:
- vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break;
case SPRN_ESR:
- vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break;
case SPRN_DBCR0:
- vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break;
case SPRN_DBCR1:
- vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break;
case SPRN_DBSR:
- vcpu->arch.gpr[rt] = vcpu->arch.dbsr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break;
case SPRN_IVOR0:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]);
break;
case SPRN_IVOR1:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]);
break;
case SPRN_IVOR2:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
break;
case SPRN_IVOR3:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]);
break;
case SPRN_IVOR4:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]);
break;
case SPRN_IVOR5:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]);
break;
case SPRN_IVOR6:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]);
break;
case SPRN_IVOR7:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]);
break;
case SPRN_IVOR8:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
break;
case SPRN_IVOR9:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]);
break;
case SPRN_IVOR10:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]);
break;
case SPRN_IVOR11:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]);
break;
case SPRN_IVOR12:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]);
break;
case SPRN_IVOR13:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]);
break;
case SPRN_IVOR14:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]);
break;
case SPRN_IVOR15:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]);
break;
default:
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 64949eef43f..efa1198940a 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -60,6 +60,12 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
kvmppc_e500_tlb_setup(vcpu_e500);
+ /* Registers init */
+ vcpu->arch.pvr = mfspr(SPRN_PVR);
+
+ /* Since booke kvm only support one core, update all vcpus' PIR to 0 */
+ vcpu->vcpu_id = 0;
+
return 0;
}
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index be95b8d8e3b..8e3edfbc963 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -74,54 +74,59 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
int emulated = EMULATE_DONE;
+ ulong spr_val = kvmppc_get_gpr(vcpu, rs);
switch (sprn) {
case SPRN_PID:
vcpu_e500->pid[0] = vcpu->arch.shadow_pid =
- vcpu->arch.pid = vcpu->arch.gpr[rs];
+ vcpu->arch.pid = spr_val;
break;
case SPRN_PID1:
- vcpu_e500->pid[1] = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->pid[1] = spr_val; break;
case SPRN_PID2:
- vcpu_e500->pid[2] = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->pid[2] = spr_val; break;
case SPRN_MAS0:
- vcpu_e500->mas0 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas0 = spr_val; break;
case SPRN_MAS1:
- vcpu_e500->mas1 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas1 = spr_val; break;
case SPRN_MAS2:
- vcpu_e500->mas2 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas2 = spr_val; break;
case SPRN_MAS3:
- vcpu_e500->mas3 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas3 = spr_val; break;
case SPRN_MAS4:
- vcpu_e500->mas4 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas4 = spr_val; break;
case SPRN_MAS6:
- vcpu_e500->mas6 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas6 = spr_val; break;
case SPRN_MAS7:
- vcpu_e500->mas7 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->mas7 = spr_val; break;
+ case SPRN_L1CSR0:
+ vcpu_e500->l1csr0 = spr_val;
+ vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
+ break;
case SPRN_L1CSR1:
- vcpu_e500->l1csr1 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->l1csr1 = spr_val; break;
case SPRN_HID0:
- vcpu_e500->hid0 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->hid0 = spr_val; break;
case SPRN_HID1:
- vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break;
+ vcpu_e500->hid1 = spr_val; break;
case SPRN_MMUCSR0:
emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500,
- vcpu->arch.gpr[rs]);
+ spr_val);
break;
/* extra exceptions */
case SPRN_IVOR32:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val;
break;
case SPRN_IVOR33:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val;
break;
case SPRN_IVOR34:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val;
break;
case SPRN_IVOR35:
- vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = vcpu->arch.gpr[rs];
+ vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val;
break;
default:
@@ -138,63 +143,57 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
switch (sprn) {
case SPRN_PID:
- vcpu->arch.gpr[rt] = vcpu_e500->pid[0]; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break;
case SPRN_PID1:
- vcpu->arch.gpr[rt] = vcpu_e500->pid[1]; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break;
case SPRN_PID2:
- vcpu->arch.gpr[rt] = vcpu_e500->pid[2]; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break;
case SPRN_MAS0:
- vcpu->arch.gpr[rt] = vcpu_e500->mas0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break;
case SPRN_MAS1:
- vcpu->arch.gpr[rt] = vcpu_e500->mas1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break;
case SPRN_MAS2:
- vcpu->arch.gpr[rt] = vcpu_e500->mas2; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break;
case SPRN_MAS3:
- vcpu->arch.gpr[rt] = vcpu_e500->mas3; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break;
case SPRN_MAS4:
- vcpu->arch.gpr[rt] = vcpu_e500->mas4; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break;
case SPRN_MAS6:
- vcpu->arch.gpr[rt] = vcpu_e500->mas6; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break;
case SPRN_MAS7:
- vcpu->arch.gpr[rt] = vcpu_e500->mas7; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break;
case SPRN_TLB0CFG:
- vcpu->arch.gpr[rt] = mfspr(SPRN_TLB0CFG);
- vcpu->arch.gpr[rt] &= ~0xfffUL;
- vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[0];
- break;
-
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break;
case SPRN_TLB1CFG:
- vcpu->arch.gpr[rt] = mfspr(SPRN_TLB1CFG);
- vcpu->arch.gpr[rt] &= ~0xfffUL;
- vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[1];
- break;
-
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break;
+ case SPRN_L1CSR0:
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break;
case SPRN_L1CSR1:
- vcpu->arch.gpr[rt] = vcpu_e500->l1csr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break;
case SPRN_HID0:
- vcpu->arch.gpr[rt] = vcpu_e500->hid0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break;
case SPRN_HID1:
- vcpu->arch.gpr[rt] = vcpu_e500->hid1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break;
case SPRN_MMUCSR0:
- vcpu->arch.gpr[rt] = 0; break;
+ kvmppc_set_gpr(vcpu, rt, 0); break;
case SPRN_MMUCFG:
- vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break;
+ kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break;
/* extra exceptions */
case SPRN_IVOR32:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]);
break;
case SPRN_IVOR33:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]);
break;
case SPRN_IVOR34:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]);
break;
case SPRN_IVOR35:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]);
break;
default:
emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index fb1e1dc11ba..0d772e6b631 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -417,7 +417,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
int esel, tlbsel;
gva_t ea;
- ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb];
+ ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb);
ia = (ea >> 2) & 0x1;
@@ -470,7 +470,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
struct tlbe *gtlbe = NULL;
gva_t ea;
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
for (tlbsel = 0; tlbsel < 2; tlbsel++) {
esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
@@ -728,6 +728,12 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
if (vcpu_e500->shadow_pages[1] == NULL)
goto err_out_page0;
+ /* Init TLB configuration register */
+ vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
+ vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0];
+ vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
+ vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1];
+
return 0;
err_out_page0:
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 4a9ac6640fa..cb72a65f4ec 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -83,6 +83,9 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
pr_debug("mtDEC: %x\n", vcpu->arch.dec);
#ifdef CONFIG_PPC64
+ /* mtdec lowers the interrupt line when positive. */
+ kvmppc_core_dequeue_dec(vcpu);
+
/* POWER4+ triggers a dec interrupt if the value is < 0 */
if (vcpu->arch.dec & 0x80000000) {
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
@@ -140,14 +143,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst));
+ /* Try again next time */
+ if (inst == KVM_INST_FETCH_FAILED)
+ return EMULATE_DONE;
+
switch (get_op(inst)) {
case OP_TRAP:
#ifdef CONFIG_PPC64
case OP_TRAP_64:
+ kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
#else
- vcpu->arch.esr |= ESR_PTR;
+ kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR);
#endif
- kvmppc_core_queue_program(vcpu);
advance = 0;
break;
@@ -167,14 +174,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case OP_31_XOP_STWX:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
4, 1);
break;
case OP_31_XOP_STBX:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
break;
@@ -183,14 +190,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rb = get_rb(inst);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
- vcpu->arch.gpr[rs] = ea;
+ kvmppc_set_gpr(vcpu, rs, ea);
break;
case OP_31_XOP_LHZX:
@@ -203,12 +210,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rb = get_rb(inst);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
- vcpu->arch.gpr[ra] = ea;
+ kvmppc_set_gpr(vcpu, ra, ea);
break;
case OP_31_XOP_MFSPR:
@@ -217,47 +224,49 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
switch (sprn) {
case SPRN_SRR0:
- vcpu->arch.gpr[rt] = vcpu->arch.srr0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break;
case SPRN_SRR1:
- vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break;
case SPRN_PVR:
- vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break;
case SPRN_PIR:
- vcpu->arch.gpr[rt] = vcpu->vcpu_id; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break;
case SPRN_MSSSR0:
- vcpu->arch.gpr[rt] = 0; break;
+ kvmppc_set_gpr(vcpu, rt, 0); break;
/* Note: mftb and TBRL/TBWL are user-accessible, so
* the guest can always access the real TB anyways.
* In fact, we probably will never see these traps. */
case SPRN_TBWL:
- vcpu->arch.gpr[rt] = get_tb() >> 32; break;
+ kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break;
case SPRN_TBWU:
- vcpu->arch.gpr[rt] = get_tb(); break;
+ kvmppc_set_gpr(vcpu, rt, get_tb()); break;
case SPRN_SPRG0:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break;
case SPRN_SPRG1:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg1; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break;
case SPRN_SPRG2:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg2; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break;
case SPRN_SPRG3:
- vcpu->arch.gpr[rt] = vcpu->arch.sprg3; break;
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break;
/* Note: SPRG4-7 are user-readable, so we don't get
* a trap. */
case SPRN_DEC:
{
u64 jd = get_tb() - vcpu->arch.dec_jiffies;
- vcpu->arch.gpr[rt] = vcpu->arch.dec - jd;
- pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", vcpu->arch.dec, jd, vcpu->arch.gpr[rt]);
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd);
+ pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n",
+ vcpu->arch.dec, jd,
+ kvmppc_get_gpr(vcpu, rt));
break;
}
default:
emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt);
if (emulated == EMULATE_FAIL) {
printk("mfspr: unknown spr %x\n", sprn);
- vcpu->arch.gpr[rt] = 0;
+ kvmppc_set_gpr(vcpu, rt, 0);
}
break;
}
@@ -269,7 +278,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rb = get_rb(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
break;
@@ -278,14 +287,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rb = get_rb(inst);
- ea = vcpu->arch.gpr[rb];
+ ea = kvmppc_get_gpr(vcpu, rb);
if (ra)
- ea += vcpu->arch.gpr[ra];
+ ea += kvmppc_get_gpr(vcpu, ra);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
- vcpu->arch.gpr[ra] = ea;
+ kvmppc_set_gpr(vcpu, ra, ea);
break;
case OP_31_XOP_MTSPR:
@@ -293,9 +302,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rs = get_rs(inst);
switch (sprn) {
case SPRN_SRR0:
- vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SRR1:
- vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break;
/* XXX We need to context-switch the timebase for
* watchdog and FIT. */
@@ -305,18 +314,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case SPRN_MSSSR0: break;
case SPRN_DEC:
- vcpu->arch.dec = vcpu->arch.gpr[rs];
+ vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs);
kvmppc_emulate_dec(vcpu);
break;
case SPRN_SPRG0:
- vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SPRG1:
- vcpu->arch.sprg1 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SPRG2:
- vcpu->arch.sprg2 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break;
case SPRN_SPRG3:
- vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break;
+ vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break;
default:
emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
@@ -348,7 +357,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rb = get_rb(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
4, 0);
break;
@@ -363,7 +372,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
rb = get_rb(inst);
emulated = kvmppc_handle_store(run, vcpu,
- vcpu->arch.gpr[rs],
+ kvmppc_get_gpr(vcpu, rs),
2, 0);
break;
@@ -382,7 +391,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_LBZ:
@@ -394,35 +403,39 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_STW:
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
4, 1);
break;
case OP_STWU:
ra = get_ra(inst);
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
4, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_STB:
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
break;
case OP_STBU:
ra = get_ra(inst);
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
1, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_LHZ:
@@ -434,21 +447,23 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
case OP_STH:
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
break;
case OP_STHU:
ra = get_ra(inst);
rs = get_rs(inst);
- emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
+ emulated = kvmppc_handle_store(run, vcpu,
+ kvmppc_get_gpr(vcpu, rs),
2, 1);
- vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
+ kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
break;
default:
@@ -461,6 +476,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
advance = 0;
printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
"(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
+ kvmppc_core_queue_program(vcpu, 0);
}
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index f06cf93b178..51aedd7f16b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -137,6 +137,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvmppc_free_vcpus(kvm);
kvm_free_physmem(kvm);
+ cleanup_srcu_struct(&kvm->srcu);
kfree(kvm);
}
@@ -165,14 +166,24 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -EINVAL;
}
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot old,
- int user_alloc)
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
{
return 0;
}
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ return;
+}
+
+
void kvm_arch_flush_shadow(struct kvm *kvm)
{
}
@@ -260,34 +271,35 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
- ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
- *gpr = run->dcr.data;
+ kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, run->dcr.data);
}
static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
- ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
+ ulong gpr;
- if (run->mmio.len > sizeof(*gpr)) {
+ if (run->mmio.len > sizeof(gpr)) {
printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
return;
}
if (vcpu->arch.mmio_is_bigendian) {
switch (run->mmio.len) {
- case 4: *gpr = *(u32 *)run->mmio.data; break;
- case 2: *gpr = *(u16 *)run->mmio.data; break;
- case 1: *gpr = *(u8 *)run->mmio.data; break;
+ case 4: gpr = *(u32 *)run->mmio.data; break;
+ case 2: gpr = *(u16 *)run->mmio.data; break;
+ case 1: gpr = *(u8 *)run->mmio.data; break;
}
} else {
/* Convert BE data from userland back to LE. */
switch (run->mmio.len) {
- case 4: *gpr = ld_le32((u32 *)run->mmio.data); break;
- case 2: *gpr = ld_le16((u16 *)run->mmio.data); break;
- case 1: *gpr = *(u8 *)run->mmio.data; break;
+ case 4: gpr = ld_le32((u32 *)run->mmio.data); break;
+ case 2: gpr = ld_le16((u16 *)run->mmio.data); break;
+ case 1: gpr = *(u8 *)run->mmio.data; break;
}
}
+
+ kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
}
int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 341aff2687a..cd128b07bed 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -288,46 +288,30 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = HYPFS_MAGIC;
sb->s_op = &hypfs_s_ops;
- if (hypfs_parse_options(data, sb)) {
- rc = -EINVAL;
- goto err_alloc;
- }
+ if (hypfs_parse_options(data, sb))
+ return -EINVAL;
root_inode = hypfs_make_inode(sb, S_IFDIR | 0755);
- if (!root_inode) {
- rc = -ENOMEM;
- goto err_alloc;
- }
+ if (!root_inode)
+ return -ENOMEM;
root_inode->i_op = &simple_dir_inode_operations;
root_inode->i_fop = &simple_dir_operations;
- root_dentry = d_alloc_root(root_inode);
+ sb->s_root = root_dentry = d_alloc_root(root_inode);
if (!root_dentry) {
iput(root_inode);
- rc = -ENOMEM;
- goto err_alloc;
+ return -ENOMEM;
}
if (MACHINE_IS_VM)
rc = hypfs_vm_create_files(sb, root_dentry);
else
rc = hypfs_diag_create_files(sb, root_dentry);
if (rc)
- goto err_tree;
+ return rc;
sbi->update_file = hypfs_create_update_file(sb, root_dentry);
- if (IS_ERR(sbi->update_file)) {
- rc = PTR_ERR(sbi->update_file);
- goto err_tree;
- }
+ if (IS_ERR(sbi->update_file))
+ return PTR_ERR(sbi->update_file);
hypfs_update_update(sb);
- sb->s_root = root_dentry;
pr_info("Hypervisor filesystem mounted\n");
return 0;
-
-err_tree:
- hypfs_delete_tree(root_dentry);
- d_genocide(root_dentry);
- dput(root_dentry);
-err_alloc:
- kfree(sbi);
- return rc;
}
static int hypfs_get_super(struct file_system_type *fst, int flags,
@@ -340,12 +324,12 @@ static void hypfs_kill_super(struct super_block *sb)
{
struct hypfs_sb_info *sb_info = sb->s_fs_info;
- if (sb->s_root) {
+ if (sb->s_root)
hypfs_delete_tree(sb->s_root);
+ if (sb_info->update_file)
hypfs_remove(sb_info->update_file);
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
- }
+ kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
kill_litter_super(sb);
}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3fa0a10e466..49292869a5c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -242,6 +242,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_free_physmem(kvm);
free_page((unsigned long)(kvm->arch.sca));
debug_unregister(kvm->arch.dbf);
+ cleanup_srcu_struct(&kvm->srcu);
kfree(kvm);
}
@@ -690,14 +691,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
/* Section: memory related */
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot old,
- int user_alloc)
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
+ int user_alloc)
{
- int i;
- struct kvm_vcpu *vcpu;
-
/* A few sanity checks. We can have exactly one memory slot which has
to start at guest virtual zero and which has to be located at a
page boundary in userland and which has to end at a page boundary.
@@ -720,14 +719,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!user_alloc)
return -EINVAL;
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
/* request update of sie control block for all available vcpus */
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
continue;
kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP);
}
-
- return 0;
}
void kvm_arch_flush_shadow(struct kvm *kvm)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 06cce8285ba..60f09ab3672 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -67,10 +67,14 @@ static inline long kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu)
static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
{
+ int idx;
struct kvm_memory_slot *mem;
+ struct kvm_memslots *memslots;
- down_read(&vcpu->kvm->slots_lock);
- mem = &vcpu->kvm->memslots[0];
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ memslots = rcu_dereference(vcpu->kvm->memslots);
+
+ mem = &memslots->memslots[0];
vcpu->arch.sie_block->gmsor = mem->userspace_addr;
vcpu->arch.sie_block->gmslm =
@@ -78,7 +82,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
(mem->npages << PAGE_SHIFT) +
VIRTIODESCSPACE - 1ul;
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
/* implemented in priv.c */
diff --git a/arch/sparc/configs/sparc32_defconfig b/arch/sparc/configs/sparc32_defconfig
index 99a1f191497..6a8d078070c 100644
--- a/arch/sparc/configs/sparc32_defconfig
+++ b/arch/sparc/configs/sparc32_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.33-rc2
-# Mon Jan 11 23:20:31 2010
+# Linux kernel version: 2.6.33
+# Wed Mar 3 02:52:23 2010
#
# CONFIG_64BIT is not set
CONFIG_SPARC=y
@@ -9,6 +9,8 @@ CONFIG_SPARC32=y
# CONFIG_SPARC64 is not set
CONFIG_ARCH_DEFCONFIG="arch/sparc/configs/sparc32_defconfig"
CONFIG_BITS=32
+CONFIG_GENERIC_TIME=y
+CONFIG_ARCH_USES_GETTIMEOFFSET=y
CONFIG_AUDIT_ARCH=y
CONFIG_MMU=y
CONFIG_HIGHMEM=y
@@ -48,11 +50,6 @@ CONFIG_RCU_FANOUT=32
# CONFIG_TREE_RCU_TRACE is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_GROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_USER_SCHED=y
-# CONFIG_CGROUP_SCHED is not set
# CONFIG_CGROUPS is not set
CONFIG_SYSFS_DEPRECATED=y
CONFIG_SYSFS_DEPRECATED_V2=y
@@ -68,6 +65,7 @@ CONFIG_INITRAMFS_SOURCE=""
CONFIG_RD_GZIP=y
CONFIG_RD_BZIP2=y
CONFIG_RD_LZMA=y
+CONFIG_RD_LZO=y
# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
CONFIG_SYSCTL=y
CONFIG_ANON_INODES=y
@@ -211,7 +209,6 @@ CONFIG_SBUSCHAR=y
CONFIG_PCI=y
CONFIG_PCI_SYSCALL=y
# CONFIG_ARCH_SUPPORTS_MSI is not set
-CONFIG_PCI_LEGACY=y
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
# CONFIG_PCI_IOV is not set
@@ -232,7 +229,6 @@ CONFIG_NET=y
# Networking options
#
CONFIG_PACKET=y
-# CONFIG_PACKET_MMAP is not set
CONFIG_UNIX=y
CONFIG_XFRM=y
CONFIG_XFRM_USER=m
@@ -379,11 +375,13 @@ CONFIG_MISC_DEVICES=y
# CONFIG_TIFM_CORE is not set
# CONFIG_ENCLOSURE_SERVICES is not set
# CONFIG_HP_ILO is not set
+# CONFIG_TI_DAC7512 is not set
# CONFIG_C2PORT is not set
#
# EEPROM support
#
+# CONFIG_EEPROM_AT25 is not set
# CONFIG_EEPROM_93CX6 is not set
# CONFIG_CB710_CORE is not set
CONFIG_HAVE_IDE=y
@@ -507,7 +505,9 @@ CONFIG_SUNQE=m
# CONFIG_SUNGEM is not set
# CONFIG_CASSINI is not set
# CONFIG_NET_VENDOR_3COM is not set
+# CONFIG_ENC28J60 is not set
# CONFIG_ETHOC is not set
+# CONFIG_GRETH is not set
# CONFIG_DNET is not set
# CONFIG_NET_TULIP is not set
# CONFIG_HP100 is not set
@@ -521,6 +521,7 @@ CONFIG_SUNQE=m
# CONFIG_NET_PCI is not set
# CONFIG_B44 is not set
# CONFIG_KS8842 is not set
+# CONFIG_KS8851 is not set
# CONFIG_KS8851_MLL is not set
# CONFIG_ATL2 is not set
CONFIG_NETDEV_1000=y
@@ -563,6 +564,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_MLX4_CORE is not set
# CONFIG_TEHUTI is not set
# CONFIG_BNX2X is not set
+# CONFIG_QLCNIC is not set
# CONFIG_QLGE is not set
# CONFIG_SFC is not set
# CONFIG_BE2NET is not set
@@ -665,6 +667,7 @@ CONFIG_DEVKMEM=y
#
# Non-8250 serial port support
#
+# CONFIG_SERIAL_MAX3100 is not set
CONFIG_SERIAL_SUNCORE=y
CONFIG_SERIAL_SUNZILOG=y
CONFIG_SERIAL_SUNZILOG_CONSOLE=y
@@ -689,7 +692,23 @@ CONFIG_HW_RANDOM=m
# CONFIG_TCG_TPM is not set
CONFIG_DEVPORT=y
# CONFIG_I2C is not set
-# CONFIG_SPI is not set
+CONFIG_SPI=y
+# CONFIG_SPI_DEBUG is not set
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_BITBANG=m
+CONFIG_SPI_XILINX=m
+CONFIG_SPI_XILINX_PLTFM=m
+# CONFIG_SPI_DESIGNWARE is not set
+
+#
+# SPI Protocol Masters
+#
+# CONFIG_SPI_SPIDEV is not set
+# CONFIG_SPI_TLE62X0 is not set
#
# PPS support
@@ -706,10 +725,13 @@ CONFIG_HWMON=y
#
# Native drivers
#
+# CONFIG_SENSORS_ADCXX is not set
# CONFIG_SENSORS_I5K_AMB is not set
# CONFIG_SENSORS_F71805F is not set
# CONFIG_SENSORS_F71882FG is not set
# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_LM70 is not set
+# CONFIG_SENSORS_MAX1111 is not set
# CONFIG_SENSORS_PC87360 is not set
# CONFIG_SENSORS_PC87427 is not set
# CONFIG_SENSORS_SIS5595 is not set
@@ -720,6 +742,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_VT8231 is not set
# CONFIG_SENSORS_W83627HF is not set
# CONFIG_SENSORS_W83627EHF is not set
+# CONFIG_SENSORS_LIS3_SPI is not set
# CONFIG_THERMAL is not set
# CONFIG_WATCHDOG is not set
CONFIG_SSB_POSSIBLE=y
@@ -736,6 +759,8 @@ CONFIG_SSB_POSSIBLE=y
# CONFIG_MFD_SM501 is not set
# CONFIG_HTC_PASIC3 is not set
# CONFIG_MFD_TMIO is not set
+# CONFIG_MFD_MC13783 is not set
+# CONFIG_AB4500_CORE is not set
# CONFIG_REGULATOR is not set
# CONFIG_MEDIA_SUPPORT is not set
@@ -743,6 +768,7 @@ CONFIG_SSB_POSSIBLE=y
# Graphics support
#
CONFIG_VGA_ARB=y
+CONFIG_VGA_ARB_MAX_GPUS=16
# CONFIG_VGASTATE is not set
# CONFIG_VIDEO_OUTPUT_CONTROL is not set
# CONFIG_FB is not set
@@ -808,6 +834,14 @@ CONFIG_RTC_INTF_DEV=y
#
# SPI RTC drivers
#
+# CONFIG_RTC_DRV_M41T94 is not set
+# CONFIG_RTC_DRV_DS1305 is not set
+# CONFIG_RTC_DRV_DS1390 is not set
+# CONFIG_RTC_DRV_MAX6902 is not set
+# CONFIG_RTC_DRV_R9701 is not set
+# CONFIG_RTC_DRV_RS5C348 is not set
+# CONFIG_RTC_DRV_DS3234 is not set
+# CONFIG_RTC_DRV_PCF2123 is not set
#
# Platform RTC drivers
@@ -1180,9 +1214,11 @@ CONFIG_CRC32=y
CONFIG_LIBCRC32C=m
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=y
+CONFIG_LZO_DECOMPRESS=y
CONFIG_DECOMPRESS_GZIP=y
CONFIG_DECOMPRESS_BZIP2=y
CONFIG_DECOMPRESS_LZMA=y
+CONFIG_DECOMPRESS_LZO=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index 41c5a56aa6f..56e3163673e 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.33-rc2
-# Wed Jan 20 16:31:47 2010
+# Linux kernel version: 2.6.33
+# Wed Mar 3 02:54:29 2010
#
CONFIG_64BIT=y
CONFIG_SPARC=y
@@ -55,14 +55,10 @@ CONFIG_TREE_RCU=y
# CONFIG_RCU_TRACE is not set
CONFIG_RCU_FANOUT=64
# CONFIG_RCU_FANOUT_EXACT is not set
+# CONFIG_RCU_FAST_NO_HZ is not set
# CONFIG_TREE_RCU_TRACE is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=18
-CONFIG_GROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_USER_SCHED=y
-# CONFIG_CGROUP_SCHED is not set
# CONFIG_CGROUPS is not set
# CONFIG_SYSFS_DEPRECATED_V2 is not set
CONFIG_RELAY=y
@@ -77,6 +73,7 @@ CONFIG_INITRAMFS_SOURCE=""
CONFIG_RD_GZIP=y
CONFIG_RD_BZIP2=y
CONFIG_RD_LZMA=y
+CONFIG_RD_LZO=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
CONFIG_ANON_INODES=y
@@ -105,7 +102,6 @@ CONFIG_PERF_USE_VMALLOC=y
# Kernel Performance Events And Counters
#
CONFIG_PERF_EVENTS=y
-CONFIG_EVENT_PROFILE=y
CONFIG_PERF_COUNTERS=y
# CONFIG_DEBUG_PERF_USE_VMALLOC is not set
CONFIG_VM_EVENT_COUNTERS=y
@@ -266,7 +262,6 @@ CONFIG_PCI_DOMAINS=y
CONFIG_PCI_SYSCALL=y
CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
-# CONFIG_PCI_LEGACY is not set
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
# CONFIG_PCI_IOV is not set
@@ -290,7 +285,6 @@ CONFIG_NET=y
# Networking options
#
CONFIG_PACKET=y
-CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
CONFIG_XFRM=y
CONFIG_XFRM_USER=m
@@ -425,10 +419,6 @@ CONFIG_BLK_DEV=y
# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_CRYPTOLOOP=m
-
-#
-# DRBD disabled because PROC_FS, INET or CONNECTOR not selected
-#
# CONFIG_BLK_DEV_DRBD is not set
CONFIG_BLK_DEV_NBD=m
# CONFIG_BLK_DEV_SX8 is not set
@@ -677,6 +667,7 @@ CONFIG_SUNGEM=m
CONFIG_SUNVNET=m
# CONFIG_NET_VENDOR_3COM is not set
# CONFIG_ETHOC is not set
+# CONFIG_GRETH is not set
# CONFIG_DNET is not set
# CONFIG_NET_TULIP is not set
# CONFIG_HP100 is not set
@@ -691,6 +682,7 @@ CONFIG_NET_PCI=y
# CONFIG_PCNET32 is not set
# CONFIG_AMD8111_ETH is not set
# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_KSZ884X_PCI is not set
# CONFIG_B44 is not set
# CONFIG_FORCEDETH is not set
# CONFIG_E100 is not set
@@ -741,6 +733,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_CHELSIO_T3 is not set
# CONFIG_ENIC is not set
# CONFIG_IXGBE is not set
+# CONFIG_IXGBEVF is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
# CONFIG_VXGE is not set
@@ -751,6 +744,7 @@ CONFIG_NIU=m
# CONFIG_MLX4_CORE is not set
# CONFIG_TEHUTI is not set
# CONFIG_BNX2X is not set
+# CONFIG_QLCNIC is not set
# CONFIG_QLGE is not set
# CONFIG_SFC is not set
# CONFIG_BE2NET is not set
@@ -1028,6 +1022,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_SMSC47M192 is not set
# CONFIG_SENSORS_SMSC47B397 is not set
# CONFIG_SENSORS_ADS7828 is not set
+# CONFIG_SENSORS_AMC6821 is not set
# CONFIG_SENSORS_THMC50 is not set
# CONFIG_SENSORS_TMP401 is not set
# CONFIG_SENSORS_TMP421 is not set
@@ -1076,6 +1071,7 @@ CONFIG_SSB_POSSIBLE=y
# Graphics support
#
CONFIG_VGA_ARB=y
+CONFIG_VGA_ARB_MAX_GPUS=16
# CONFIG_DRM is not set
# CONFIG_VGASTATE is not set
# CONFIG_VIDEO_OUTPUT_CONTROL is not set
@@ -1279,6 +1275,7 @@ CONFIG_SND_ALI5451=m
# CONFIG_SND_YMFPCI is not set
CONFIG_SND_USB=y
# CONFIG_SND_USB_AUDIO is not set
+# CONFIG_SND_USB_UA101 is not set
# CONFIG_SND_USB_CAIAQ is not set
CONFIG_SND_SPARC=y
# CONFIG_SND_SUN_AMD7930 is not set
@@ -1301,6 +1298,7 @@ CONFIG_USB_HIDDEV=y
#
# Special HID drivers
#
+# CONFIG_HID_3M_PCT is not set
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
@@ -1317,14 +1315,19 @@ CONFIG_HID_KENSINGTON=y
CONFIG_HID_LOGITECH=y
# CONFIG_LOGITECH_FF is not set
# CONFIG_LOGIRUMBLEPAD2_FF is not set
+# CONFIG_LOGIG940_FF is not set
CONFIG_HID_MICROSOFT=y
+# CONFIG_HID_MOSART is not set
CONFIG_HID_MONTEREY=y
CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
CONFIG_HID_PANTHERLORD=y
# CONFIG_PANTHERLORD_FF is not set
CONFIG_HID_PETALYNX=y
+# CONFIG_HID_QUANTA is not set
CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
+# CONFIG_HID_STANTUM is not set
CONFIG_HID_SUNPLUS=y
CONFIG_HID_GREENASIA=y
# CONFIG_GREENASIA_FF is not set
@@ -1807,6 +1810,7 @@ CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
CONFIG_CRYPTO_GF128MUL=m
CONFIG_CRYPTO_NULL=m
+# CONFIG_CRYPTO_PCRYPT is not set
CONFIG_CRYPTO_WORKQUEUE=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=y
@@ -1904,9 +1908,11 @@ CONFIG_CRC32=y
CONFIG_LIBCRC32C=m
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=y
+CONFIG_LZO_DECOMPRESS=y
CONFIG_DECOMPRESS_GZIP=y
CONFIG_DECOMPRESS_BZIP2=y
CONFIG_DECOMPRESS_LZMA=y
+CONFIG_DECOMPRESS_LZO=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h
index 679c7504625..2889574608d 100644
--- a/arch/sparc/include/asm/io_32.h
+++ b/arch/sparc/include/asm/io_32.h
@@ -249,10 +249,14 @@ extern void iounmap(volatile void __iomem *addr);
#define ioread8(X) readb(X)
#define ioread16(X) readw(X)
+#define ioread16be(X) __raw_readw(X)
#define ioread32(X) readl(X)
+#define ioread32be(X) __raw_readl(X)
#define iowrite8(val,X) writeb(val,X)
#define iowrite16(val,X) writew(val,X)
+#define iowrite16be(val,X) __raw_writew(val,X)
#define iowrite32(val,X) writel(val,X)
+#define iowrite32be(val,X) __raw_writel(val,X)
static inline void ioread8_rep(void __iomem *port, void *buf, unsigned long count)
{
diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h
index 4aee21dc9c6..9517d063c79 100644
--- a/arch/sparc/include/asm/io_64.h
+++ b/arch/sparc/include/asm/io_64.h
@@ -468,10 +468,14 @@ static inline void iounmap(volatile void __iomem *addr)
#define ioread8(X) readb(X)
#define ioread16(X) readw(X)
+#define ioread16be(X) __raw_readw(X)
#define ioread32(X) readl(X)
+#define ioread32be(X) __raw_readl(X)
#define iowrite8(val,X) writeb(val,X)
#define iowrite16(val,X) writew(val,X)
+#define iowrite16be(val,X) __raw_writew(val,X)
#define iowrite32(val,X) writel(val,X)
+#define iowrite32be(val,X) __raw_writel(val,X)
/* Create a virtual mapping cookie for an IO port range */
extern void __iomem *ioport_map(unsigned long port, unsigned int nr);
diff --git a/arch/sparc/include/asm/perfctr.h b/arch/sparc/include/asm/perfctr.h
index 836873002b7..8d8720a8770 100644
--- a/arch/sparc/include/asm/perfctr.h
+++ b/arch/sparc/include/asm/perfctr.h
@@ -10,8 +10,8 @@
* from enumeration below. The meaning of further arguments
* are determined by the operation code.
*
- * int sys_perfctr(int opcode, unsigned long arg0,
- * unsigned long arg1, unsigned long arg2)
+ * NOTE: This system call is no longer provided, use the perf_events
+ * infrastructure.
*
* Pointers which are passed by the user are pointers to 64-bit
* integers.
diff --git a/arch/sparc/include/asm/system_64.h b/arch/sparc/include/asm/system_64.h
index d47a98e6697..d24cfe16afc 100644
--- a/arch/sparc/include/asm/system_64.h
+++ b/arch/sparc/include/asm/system_64.h
@@ -143,15 +143,7 @@ do { \
* and 2 stores in this critical code path. -DaveM
*/
#define switch_to(prev, next, last) \
-do { if (test_thread_flag(TIF_PERFCTR)) { \
- unsigned long __tmp; \
- read_pcr(__tmp); \
- current_thread_info()->pcr_reg = __tmp; \
- read_pic(__tmp); \
- current_thread_info()->kernel_cntd0 += (unsigned int)(__tmp);\
- current_thread_info()->kernel_cntd1 += ((__tmp) >> 32); \
- } \
- flush_tlb_pending(); \
+do { flush_tlb_pending(); \
save_and_clear_fpu(); \
/* If you are tempted to conditionalize the following */ \
/* so that ASI is only written if it changes, think again. */ \
@@ -197,11 +189,6 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \
"l1", "l2", "l3", "l4", "l5", "l6", "l7", \
"i0", "i1", "i2", "i3", "i4", "i5", \
"o0", "o1", "o2", "o3", "o4", "o5", "o7"); \
- /* If you fuck with this, update ret_from_syscall code too. */ \
- if (test_thread_flag(TIF_PERFCTR)) { \
- write_pcr(current_thread_info()->pcr_reg); \
- reset_pic(); \
- } \
} while(0)
static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 39be9f256e5..9e2d9447f2a 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -58,11 +58,6 @@ struct thread_info {
unsigned long gsr[7];
unsigned long xfsr[7];
- __u64 __user *user_cntd0;
- __u64 __user *user_cntd1;
- __u64 kernel_cntd0, kernel_cntd1;
- __u64 pcr_reg;
-
struct restart_block restart_block;
struct pt_regs *kern_una_regs;
@@ -96,15 +91,10 @@ struct thread_info {
#define TI_RWIN_SPTRS 0x000003c8
#define TI_GSR 0x00000400
#define TI_XFSR 0x00000438
-#define TI_USER_CNTD0 0x00000470
-#define TI_USER_CNTD1 0x00000478
-#define TI_KERN_CNTD0 0x00000480
-#define TI_KERN_CNTD1 0x00000488
-#define TI_PCR 0x00000490
-#define TI_RESTART_BLOCK 0x00000498
-#define TI_KUNA_REGS 0x000004c8
-#define TI_KUNA_INSN 0x000004d0
-#define TI_FPREGS 0x00000500
+#define TI_RESTART_BLOCK 0x00000470
+#define TI_KUNA_REGS 0x000004a0
+#define TI_KUNA_INSN 0x000004a8
+#define TI_FPREGS 0x000004c0
/* We embed this in the uppermost byte of thread_info->flags */
#define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */
@@ -199,7 +189,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
*
* On trap return we need to test several values:
*
- * user: need_resched, notify_resume, sigpending, wsaved, perfctr
+ * user: need_resched, notify_resume, sigpending, wsaved
* kernel: fpdepth
*
* So to check for work in the kernel case we simply load the fpdepth
@@ -220,7 +210,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
-#define TIF_PERFCTR 4 /* performance counters active */
+/* flag bit 4 is available */
#define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */
/* flag bit 6 is available */
#define TIF_32BIT 7 /* 32-bit binary */
@@ -241,7 +231,6 @@ register struct thread_info *current_thread_info_reg asm("g6");
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
-#define _TIF_PERFCTR (1<<TIF_PERFCTR)
#define _TIF_UNALIGNED (1<<TIF_UNALIGNED)
#define _TIF_32BIT (1<<TIF_32BIT)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
@@ -252,7 +241,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
#define _TIF_USER_WORK_MASK ((0xff << TI_FLAG_WSAVED_SHIFT) | \
_TIF_DO_NOTIFY_RESUME_MASK | \
- _TIF_NEED_RESCHED | _TIF_PERFCTR)
+ _TIF_NEED_RESCHED)
#define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING)
/*
diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h
index 4f53a2395ac..c011b932bb1 100644
--- a/arch/sparc/kernel/entry.h
+++ b/arch/sparc/kernel/entry.h
@@ -48,7 +48,6 @@ extern void __init boot_cpu_id_too_large(int cpu);
extern unsigned int dcache_parity_tl1_occurred;
extern unsigned int icache_parity_tl1_occurred;
-extern asmlinkage void update_perfctrs(void);
extern asmlinkage void sparc_breakpoint(struct pt_regs *regs);
extern void timer_interrupt(int irq, struct pt_regs *regs);
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index d242a734054..b287b62c7ea 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -21,7 +21,6 @@
#include <asm/perf_event.h>
#include <asm/ptrace.h>
-#include <asm/local.h>
#include <asm/pcr.h>
/* We don't have a real NMI on sparc64, but we can fake one
@@ -113,13 +112,13 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
touched = 1;
}
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- __this_cpu_inc(per_cpu_var(alert_counter));
- if (__this_cpu_read(per_cpu_var(alert_counter)) == 30 * nmi_hz)
+ __this_cpu_inc(alert_counter);
+ if (__this_cpu_read(alert_counter) == 30 * nmi_hz)
die_nmi("BUG: NMI Watchdog detected LOCKUP",
regs, panic_on_timeout);
} else {
__get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(per_cpu_var(alert_counter), 0);
+ __this_cpu_write(alert_counter, 0);
}
if (__get_cpu_var(wd_enabled)) {
write_pic(picl_value(nmi_hz));
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index cb70476bd8f..a5cf3864b31 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -352,12 +352,6 @@ void exit_thread(void)
else
t->utraps[0]--;
}
-
- if (test_and_clear_thread_flag(TIF_PERFCTR)) {
- t->user_cntd0 = t->user_cntd1 = NULL;
- t->pcr_reg = 0;
- write_pcr(0);
- }
}
void flush_thread(void)
@@ -371,13 +365,6 @@ void flush_thread(void)
set_thread_wsaved(0);
- /* Turn off performance counters if on. */
- if (test_and_clear_thread_flag(TIF_PERFCTR)) {
- t->user_cntd0 = t->user_cntd1 = NULL;
- t->pcr_reg = 0;
- write_pcr(0);
- }
-
/* Clear FPU register state. */
t->fpsaved[0] = 0;
@@ -591,16 +578,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
t->kregs->u_regs[UREG_FP] =
((unsigned long) child_sf) - STACK_BIAS;
- /* Special case, if we are spawning a kernel thread from
- * a userspace task (usermode helper, NFS or similar), we
- * must disable performance counters in the child because
- * the address space and protection realm are changing.
- */
- if (t->flags & _TIF_PERFCTR) {
- t->user_cntd0 = t->user_cntd1 = NULL;
- t->pcr_reg = 0;
- t->flags &= ~_TIF_PERFCTR;
- }
t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT);
t->kregs->u_regs[UREG_G6] = (unsigned long) t;
t->kregs->u_regs[UREG_G4] = (unsigned long) t->task;
diff --git a/arch/sparc/kernel/rtrap_64.S b/arch/sparc/kernel/rtrap_64.S
index fd3cee4d117..83f1873c6c1 100644
--- a/arch/sparc/kernel/rtrap_64.S
+++ b/arch/sparc/kernel/rtrap_64.S
@@ -65,48 +65,6 @@ __handle_user_windows:
ba,pt %xcc, __handle_user_windows_continue
andn %l1, %l4, %l1
-__handle_perfctrs:
- call update_perfctrs
- wrpr %g0, RTRAP_PSTATE, %pstate
- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
- ldub [%g6 + TI_WSAVED], %o2
- brz,pt %o2, 1f
- nop
- /* Redo userwin+sched+sig checks */
- call fault_in_user_windows
-
- wrpr %g0, RTRAP_PSTATE, %pstate
- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
- ldx [%g6 + TI_FLAGS], %l0
- andcc %l0, _TIF_NEED_RESCHED, %g0
- be,pt %xcc, 1f
-
- nop
- call schedule
- wrpr %g0, RTRAP_PSTATE, %pstate
- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
- ldx [%g6 + TI_FLAGS], %l0
-1: andcc %l0, _TIF_DO_NOTIFY_RESUME_MASK, %g0
-
- be,pt %xcc, __handle_perfctrs_continue
- sethi %hi(TSTATE_PEF), %o0
- mov %l5, %o1
- add %sp, PTREGS_OFF, %o0
- mov %l0, %o2
- call do_notify_resume
-
- wrpr %g0, RTRAP_PSTATE, %pstate
- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
- /* Signal delivery can modify pt_regs tstate, so we must
- * reload it.
- */
- ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
- sethi %hi(0xf << 20), %l4
- and %l1, %l4, %l4
- andn %l1, %l4, %l1
- ba,pt %xcc, __handle_perfctrs_continue
-
- sethi %hi(TSTATE_PEF), %o0
__handle_userfpu:
rd %fprs, %l5
andcc %l5, FPRS_FEF, %g0
@@ -149,11 +107,11 @@ rtrap_nmi: ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
rtrap_irq:
rtrap:
#ifndef CONFIG_SMP
- sethi %hi(per_cpu____cpu_data), %l0
- lduw [%l0 + %lo(per_cpu____cpu_data)], %l1
+ sethi %hi(__cpu_data), %l0
+ lduw [%l0 + %lo(__cpu_data)], %l1
#else
- sethi %hi(per_cpu____cpu_data), %l0
- or %l0, %lo(per_cpu____cpu_data), %l0
+ sethi %hi(__cpu_data), %l0
+ or %l0, %lo(__cpu_data), %l0
lduw [%l0 + %g5], %l1
#endif
cmp %l1, 0
@@ -191,9 +149,9 @@ rtrap_no_irq_enable:
* take until the next local IRQ before the signal/resched
* event would be handled.
*
- * This also means that if we have to deal with performance
- * counters or user windows, we have to redo all of these
- * sched+signal checks with IRQs disabled.
+ * This also means that if we have to deal with user
+ * windows, we have to redo all of these sched+signal checks
+ * with IRQs disabled.
*/
to_user: wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
wrpr 0, %pil
@@ -214,12 +172,8 @@ __handle_signal_continue:
brnz,pn %o2, __handle_user_windows
nop
__handle_user_windows_continue:
- ldx [%g6 + TI_FLAGS], %l5
- andcc %l5, _TIF_PERFCTR, %g0
sethi %hi(TSTATE_PEF), %o0
- bne,pn %xcc, __handle_perfctrs
-__handle_perfctrs_continue:
- andcc %l1, %o0, %g0
+ andcc %l1, %o0, %g0
/* This fpdepth clear is necessary for non-syscall rtraps only */
user_nowork:
diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S
index e7061138c98..46a76ba3fb4 100644
--- a/arch/sparc/kernel/sys32.S
+++ b/arch/sparc/kernel/sys32.S
@@ -51,7 +51,6 @@ SIGN1(sys32_exit_group, sys_exit_group, %o0)
SIGN1(sys32_wait4, compat_sys_wait4, %o2)
SIGN1(sys32_creat, sys_creat, %o1)
SIGN1(sys32_mknod, sys_mknod, %o1)
-SIGN1(sys32_perfctr, sys_perfctr, %o0)
SIGN1(sys32_umount, sys_umount, %o1)
SIGN1(sys32_signal, sys_signal, %o0)
SIGN1(sys32_access, sys_access, %o1)
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index d77f5431694..cb1bef6f14b 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -27,7 +27,6 @@
#include <asm/uaccess.h>
#include <asm/utrap.h>
-#include <asm/perfctr.h>
#include <asm/unistd.h>
#include "entry.h"
@@ -766,109 +765,6 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act,
return ret;
}
-/* Invoked by rtrap code to update performance counters in
- * user space.
- */
-asmlinkage void update_perfctrs(void)
-{
- unsigned long pic, tmp;
-
- read_pic(pic);
- tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic);
- __put_user(tmp, current_thread_info()->user_cntd0);
- tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32));
- __put_user(tmp, current_thread_info()->user_cntd1);
- reset_pic();
-}
-
-SYSCALL_DEFINE4(perfctr, int, opcode, unsigned long, arg0,
- unsigned long, arg1, unsigned long, arg2)
-{
- int err = 0;
-
- switch(opcode) {
- case PERFCTR_ON:
- current_thread_info()->pcr_reg = arg2;
- current_thread_info()->user_cntd0 = (u64 __user *) arg0;
- current_thread_info()->user_cntd1 = (u64 __user *) arg1;
- current_thread_info()->kernel_cntd0 =
- current_thread_info()->kernel_cntd1 = 0;
- write_pcr(arg2);
- reset_pic();
- set_thread_flag(TIF_PERFCTR);
- break;
-
- case PERFCTR_OFF:
- err = -EINVAL;
- if (test_thread_flag(TIF_PERFCTR)) {
- current_thread_info()->user_cntd0 =
- current_thread_info()->user_cntd1 = NULL;
- current_thread_info()->pcr_reg = 0;
- write_pcr(0);
- clear_thread_flag(TIF_PERFCTR);
- err = 0;
- }
- break;
-
- case PERFCTR_READ: {
- unsigned long pic, tmp;
-
- if (!test_thread_flag(TIF_PERFCTR)) {
- err = -EINVAL;
- break;
- }
- read_pic(pic);
- tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic);
- err |= __put_user(tmp, current_thread_info()->user_cntd0);
- tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32));
- err |= __put_user(tmp, current_thread_info()->user_cntd1);
- reset_pic();
- break;
- }
-
- case PERFCTR_CLRPIC:
- if (!test_thread_flag(TIF_PERFCTR)) {
- err = -EINVAL;
- break;
- }
- current_thread_info()->kernel_cntd0 =
- current_thread_info()->kernel_cntd1 = 0;
- reset_pic();
- break;
-
- case PERFCTR_SETPCR: {
- u64 __user *user_pcr = (u64 __user *)arg0;
-
- if (!test_thread_flag(TIF_PERFCTR)) {
- err = -EINVAL;
- break;
- }
- err |= __get_user(current_thread_info()->pcr_reg, user_pcr);
- write_pcr(current_thread_info()->pcr_reg);
- current_thread_info()->kernel_cntd0 =
- current_thread_info()->kernel_cntd1 = 0;
- reset_pic();
- break;
- }
-
- case PERFCTR_GETPCR: {
- u64 __user *user_pcr = (u64 __user *)arg0;
-
- if (!test_thread_flag(TIF_PERFCTR)) {
- err = -EINVAL;
- break;
- }
- err |= __put_user(current_thread_info()->pcr_reg, user_pcr);
- break;
- }
-
- default:
- err = -EINVAL;
- break;
- };
- return err;
-}
-
/*
* Do a system call from kernel instead of calling sys_execve so we
* end up with proper pt_regs.
diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S
index dc4a458f74d..1d7e274f3f2 100644
--- a/arch/sparc/kernel/syscalls.S
+++ b/arch/sparc/kernel/syscalls.S
@@ -110,31 +110,12 @@ sys_clone:
.globl ret_from_syscall
ret_from_syscall:
- /* Clear current_thread_info()->new_child, and
- * check performance counter stuff too.
- */
+ /* Clear current_thread_info()->new_child. */
stb %g0, [%g6 + TI_NEW_CHILD]
ldx [%g6 + TI_FLAGS], %l0
call schedule_tail
mov %g7, %o0
- andcc %l0, _TIF_PERFCTR, %g0
- be,pt %icc, 1f
- nop
- ldx [%g6 + TI_PCR], %o7
- wr %g0, %o7, %pcr
-
- /* Blackbird errata workaround. See commentary in
- * smp.c:smp_percpu_timer_interrupt() for more
- * information.
- */
- ba,pt %xcc, 99f
- nop
-
- .align 64
-99: wr %g0, %g0, %pic
- rd %pic, %g0
-
-1: ba,pt %xcc, ret_sys_call
+ ba,pt %xcc, ret_sys_call
ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0
.globl sparc_exit
diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h
index d2f999ae2b8..68312fe8da7 100644
--- a/arch/sparc/kernel/systbls.h
+++ b/arch/sparc/kernel/systbls.h
@@ -36,8 +36,6 @@ extern asmlinkage long sys_rt_sigaction(int sig,
struct sigaction __user *oact,
void __user *restorer,
size_t sigsetsize);
-extern asmlinkage long sys_perfctr(int opcode, unsigned long arg0,
- unsigned long arg1, unsigned long arg2);
extern asmlinkage void sparc64_set_context(struct pt_regs *regs);
extern asmlinkage void sparc64_get_context(struct pt_regs *regs);
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index e575b46bd7a..17614251fb6 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -21,7 +21,7 @@ sys_call_table32:
/*0*/ .word sys_restart_syscall, sys32_exit, sys_fork, sys_read, sys_write
/*5*/ .word sys32_open, sys_close, sys32_wait4, sys32_creat, sys_link
/*10*/ .word sys_unlink, sunos_execv, sys_chdir, sys_chown16, sys32_mknod
-/*15*/ .word sys_chmod, sys_lchown16, sys_brk, sys32_perfctr, sys32_lseek
+/*15*/ .word sys_chmod, sys_lchown16, sys_brk, sys_nis_syscall, sys32_lseek
/*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16
/*25*/ .word sys32_vmsplice, compat_sys_ptrace, sys_alarm, sys32_sigaltstack, sys_pause
/*30*/ .word compat_sys_utime, sys_lchown, sys_fchown, sys32_access, sys32_nice
@@ -96,7 +96,7 @@ sys_call_table:
/*0*/ .word sys_restart_syscall, sparc_exit, sys_fork, sys_read, sys_write
/*5*/ .word sys_open, sys_close, sys_wait4, sys_creat, sys_link
/*10*/ .word sys_unlink, sys_nis_syscall, sys_chdir, sys_chown, sys_mknod
-/*15*/ .word sys_chmod, sys_lchown, sys_brk, sys_perfctr, sys_lseek
+/*15*/ .word sys_chmod, sys_lchown, sys_brk, sys_nis_syscall, sys_lseek
/*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid, sys_getuid
/*25*/ .word sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall
/*30*/ .word sys_utime, sys_nis_syscall, sys_nis_syscall, sys_access, sys_nice
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 10f7bb9fc14..bdc05a21908 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2548,15 +2548,6 @@ void __init trap_init(void)
rwbuf_stkptrs) ||
TI_GSR != offsetof(struct thread_info, gsr) ||
TI_XFSR != offsetof(struct thread_info, xfsr) ||
- TI_USER_CNTD0 != offsetof(struct thread_info,
- user_cntd0) ||
- TI_USER_CNTD1 != offsetof(struct thread_info,
- user_cntd1) ||
- TI_KERN_CNTD0 != offsetof(struct thread_info,
- kernel_cntd0) ||
- TI_KERN_CNTD1 != offsetof(struct thread_info,
- kernel_cntd1) ||
- TI_PCR != offsetof(struct thread_info, pcr_reg) ||
TI_PRE_COUNT != offsetof(struct thread_info,
preempt_count) ||
TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
diff --git a/arch/sparc/prom/p1275.c b/arch/sparc/prom/p1275.c
index 4b7c937bba6..2d8b70d397f 100644
--- a/arch/sparc/prom/p1275.c
+++ b/arch/sparc/prom/p1275.c
@@ -32,10 +32,9 @@ extern void prom_cif_interface(void);
extern void prom_cif_callback(void);
/*
- * This provides SMP safety on the p1275buf. prom_callback() drops this lock
- * to allow recursuve acquisition.
+ * This provides SMP safety on the p1275buf.
*/
-DEFINE_SPINLOCK(prom_entry_lock);
+DEFINE_RAW_SPINLOCK(prom_entry_lock);
long p1275_cmd(const char *service, long fmt, ...)
{
@@ -47,7 +46,9 @@ long p1275_cmd(const char *service, long fmt, ...)
p = p1275buf.prom_buffer;
- spin_lock_irqsave(&prom_entry_lock, flags);
+ raw_local_save_flags(flags);
+ raw_local_irq_restore(PIL_NMI);
+ raw_spin_lock(&prom_entry_lock);
p1275buf.prom_args[0] = (unsigned long)p; /* service */
strcpy (p, service);
@@ -139,7 +140,8 @@ long p1275_cmd(const char *service, long fmt, ...)
va_end(list);
x = p1275buf.prom_args [nargs + 3];
- spin_unlock_irqrestore(&prom_entry_lock, flags);
+ raw_spin_unlock(&prom_entry_lock);
+ raw_local_irq_restore(flags);
return x;
}
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 3b3c36601a7..de317d0c329 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req)
goto out;
}
- err = may_open(&nd.path, MAY_READ, FMODE_READ);
+ err = may_open(&nd.path, MAY_READ, O_RDONLY);
if (result) {
mconsole_reply(req, "Failed to open file", 1, 0);
path_put(&nd.path);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0896008f750..f15f37bfbd6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
select HAVE_KRETPROBES
+ select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
@@ -184,6 +185,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config HAVE_EARLY_RES
+ def_bool y
+
config HAVE_INTEL_TXT
def_bool y
depends on EXPERIMENTAL && DMAR && ACPI
@@ -569,6 +573,18 @@ config PARAVIRT_DEBUG
Enable to debug paravirt_ops internals. Specifically, BUG if
a paravirt_op is missing when it is called.
+config NO_BOOTMEM
+ default y
+ bool "Disable Bootmem code"
+ ---help---
+ Use early_res directly instead of bootmem before slab is ready.
+ - allocator (buddy) [generic]
+ - early allocator (bootmem) [generic]
+ - very early allocator (reserve_early*()) [x86]
+ - very very early allocator (early brk model) [x86]
+ So reduce one layer between early allocator to final allocator
+
+
config MEMTEST
bool "Memtest"
---help---
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 9046e4af66c..280c019cfad 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -327,7 +327,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->free_area_cache = TASK_UNMAPPED_BASE;
current->mm->cached_hole_size = 0;
- current->mm->mmap = NULL;
install_exec_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 9f828f87ca3..493092efaa3 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -11,6 +11,7 @@ header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
header-y += hw_breakpoint.h
+header-y += hyperv.h
unifdef-y += e820.h
unifdef-y += ist.h
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f1e253ceba4..b09ec55650b 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -165,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
* invalid instruction possible) or if the instructions are changed from a
* consistent state to another consistent state atomically.
* More care must be taken when modifying code in the SMP case because of
- * Intel's errata.
+ * Intel's errata. text_poke_smp() takes care that errata, but still
+ * doesn't support NMI/MCE handler code modifying.
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 761249e396f..0e22296790d 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -111,11 +111,8 @@ extern unsigned long end_user_pfn;
extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
-extern void reserve_early(u64 start, u64 end, char *name);
-extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
-extern void free_early(u64 start, u64 end);
-extern void early_res_to_bootmem(u64 start, u64 end);
extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+#include <linux/early_res.h>
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae4..a726650fc80 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
-#ifndef CONFIG_PARAVIRT
-#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
-#endif
-
#define flush_cache_kmaps() do { } while (0)
extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
new file mode 100644
index 00000000000..e153a2b3889
--- /dev/null
+++ b/arch/x86/include/asm/hyperv.h
@@ -0,0 +1,186 @@
+#ifndef _ASM_X86_KVM_HYPERV_H
+#define _ASM_X86_KVM_HYPERV_H
+
+#include <linux/types.h>
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
+#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+ /*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available*/
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available*/
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+ /*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
+/* Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to setup pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Declare the various hypercall operations. */
+#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+#endif
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 58d7091eeb1..7ec65b18085 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask;
#define SLAVE_ICW4_DEFAULT 0x01
#define PIC_ICW4_AEOI 2
-extern spinlock_t i8259A_lock;
+extern raw_spinlock_t i8259A_lock;
extern void init_8259A(int auto_eoi);
extern void enable_8259A_irq(unsigned int irq);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7c7c16cde1f..5f61f6e0ffd 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -160,6 +160,7 @@ extern int io_apic_get_redir_entries(int ioapic);
struct io_apic_irq_attr;
extern int io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr);
+void setup_IO_APIC_irq_extra(u32 gsi);
extern int (*ioapic_renumber_irq)(int ioapic, int irq);
extern void ioapic_init_mappings(void);
extern void ioapic_insert_resources(void);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 5458380b6ef..262292729fc 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -48,5 +48,6 @@ extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
extern int vector_used_by_percpu_irq(unsigned int vector);
extern void init_ISA_irqs(void);
+extern int nr_legacy_irqs;
#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4611f085cd4..8767d99c4f6 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -28,28 +28,33 @@
#define MCE_VECTOR 0x12
/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
+ * IDT vectors usable for external interrupt sources start at 0x20.
+ * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
*/
#define FIRST_EXTERNAL_VECTOR 0x20
-
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR 0x80
-# define IA32_SYSCALL_VECTOR 0x80
-#else
-# define IA32_SYSCALL_VECTOR 0x80
-#endif
+/*
+ * We start allocating at 0x21 to spread out vectors evenly between
+ * priority levels. (0x80 is the syscall vector)
+ */
+#define VECTOR_OFFSET_START 1
/*
- * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
- * cleanup after irq migration.
+ * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
+ * triggering cleanup after irq migration. 0x21-0x2f will still be used
+ * for device interrupts.
*/
#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+#define IA32_SYSCALL_VECTOR 0x80
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR 0x80
+#endif
+
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
+ * round up to the next 16-vector boundary
*/
-#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
@@ -120,13 +125,6 @@
*/
#define MCE_SELF_VECTOR 0xeb
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee) we
- * start at 0x31(0x41) to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
-
#define NR_VECTORS 256
#define FPU_IRQ 13
@@ -154,21 +152,21 @@ static inline int invalid_vm86_irq(int irq)
#define NR_IRQS_LEGACY 16
-#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
#ifdef CONFIG_X86_IO_APIC
# ifdef CONFIG_SPARSE_IRQ
+# define CPU_VECTOR_LIMIT (64 * NR_CPUS)
# define NR_IRQS \
(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
(NR_VECTORS + CPU_VECTOR_LIMIT) : \
(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# else
-# if NR_CPUS < MAX_IO_APICS
-# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
-# else
-# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
-# endif
+# define CPU_VECTOR_LIMIT (32 * NR_CPUS)
+# define NR_IRQS \
+ (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \
+ (NR_VECTORS + CPU_VECTOR_LIMIT) : \
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# endif
#else /* !CONFIG_X86_IO_APIC: */
# define NR_IRQS NR_IRQS_LEGACY
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e7..4ffa345a8cc 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
typedef u8 kprobe_opcode_t;
#define BREAKPOINT_INSTRUCTION 0xcc
-#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define RELATIVEJUMP_OPCODE 0xe9
+#define RELATIVEJUMP_SIZE 5
+#define RELATIVECALL_OPCODE 0xe8
+#define RELATIVE_ADDR_SIZE 4
#define MAX_INSN_SIZE 16
#define MAX_STACK_SIZE 64
#define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
#define flush_insn_slot(p) do { } while (0)
+/* optinsn template addresses */
+extern kprobe_opcode_t optprobe_template_entry;
+extern kprobe_opcode_t optprobe_template_val;
+extern kprobe_opcode_t optprobe_template_call;
+extern kprobe_opcode_t optprobe_template_end;
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTINSN_SIZE \
+ (((unsigned long)&optprobe_template_end - \
+ (unsigned long)&optprobe_template_entry) + \
+ MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
+
extern const int kretprobe_blacklist_size;
void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
int boostable;
};
+struct arch_optimized_insn {
+ /* copy of the original instructions */
+ kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+ /* detour code buffer */
+ kprobe_opcode_t *insn;
+ /* the size of instructions copied to detour code buffer */
+ size_t size;
+};
+
+/* Return true (!0) if optinsn is prepared for optimization. */
+static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+ return optinsn->size;
+}
+
struct prev_kprobe {
struct kprobe *kp;
unsigned long status;
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7c18e1230f5..7a6f54fa13b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -54,13 +54,23 @@ struct x86_emulate_ctxt;
struct x86_emulate_ops {
/*
* read_std: Read bytes of standard (non-emulated/special) memory.
- * Used for instruction fetch, stack operations, and others.
+ * Used for descriptor reading.
* @addr: [IN ] Linear address from which to read.
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
/*
* read_emulated: Read bytes from emulated/special memory area.
@@ -74,7 +84,7 @@ struct x86_emulate_ops {
struct kvm_vcpu *vcpu);
/*
- * write_emulated: Read bytes from emulated/special memory area.
+ * write_emulated: Write bytes to emulated/special memory area.
* @addr: [IN ] Linear address to which to write.
* @val: [IN ] Value to write to memory (low-order bytes used as
* required).
@@ -168,6 +178,7 @@ struct x86_emulate_ctxt {
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
+#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4f865e8b854..06d9e79ca37 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,7 +25,7 @@
#include <asm/mtrr.h>
#include <asm/msr-index.h>
-#define KVM_MAX_VCPUS 16
+#define KVM_MAX_VCPUS 64
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
@@ -38,19 +38,6 @@
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
-#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
-#define KVM_GUEST_CR0_MASK \
- (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
-#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_GUEST_CR4_MASK \
- (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
@@ -256,7 +243,8 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
+ u32 *error);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -282,13 +270,15 @@ struct kvm_vcpu_arch {
u32 regs_dirty;
unsigned long cr0;
+ unsigned long cr0_guest_owned_bits;
unsigned long cr2;
unsigned long cr3;
unsigned long cr4;
+ unsigned long cr4_guest_owned_bits;
unsigned long cr8;
u32 hflags;
u64 pdptrs[4]; /* pae */
- u64 shadow_efer;
+ u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
int32_t apic_arb_prio;
@@ -374,17 +364,27 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
u16 singlestep_cs;
unsigned long singlestep_rip;
+ /* fields used by HYPER-V emulation */
+ u64 hv_vapic;
};
struct kvm_mem_alias {
gfn_t base_gfn;
unsigned long npages;
gfn_t target_gfn;
+#define KVM_ALIAS_INVALID 1UL
+ unsigned long flags;
};
-struct kvm_arch{
- int naliases;
+#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+
+struct kvm_mem_aliases {
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+ int naliases;
+};
+
+struct kvm_arch {
+ struct kvm_mem_aliases *aliases;
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -416,6 +416,10 @@ struct kvm_arch{
s64 kvmclock_offset;
struct kvm_xen_hvm_config xen_hvm_config;
+
+ /* fields used by HYPER-V emulation */
+ u64 hv_guest_os_id;
+ u64 hv_hypercall;
};
struct kvm_vm_stat {
@@ -471,6 +475,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ void (*cpuid_update)(struct kvm_vcpu *vcpu);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
@@ -492,6 +497,7 @@ struct kvm_x86_ops {
void (*set_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -501,12 +507,13 @@ struct kvm_x86_ops {
void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
- void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception);
+ int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
+ int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ void (*fpu_activate)(struct kvm_vcpu *vcpu);
+ void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -531,7 +538,8 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- bool (*gb_page_enable)(void);
+ int (*get_lpage_level)(void);
+ bool (*rdtscp_supported)(void);
const struct trace_print_flags *exit_reasons_str;
};
@@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
@@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -666,6 +677,7 @@ void kvm_disable_tdp(void);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
+bool kvm_check_iopl(struct kvm_vcpu *vcpu);
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076a47f..ffae1420e7d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
#define _ASM_X86_KVM_PARA_H
#include <linux/types.h>
+#include <asm/hyperv.h>
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 47b9b6f1905..2e9972468a5 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l)
#define __local_add(i, l) local_add((i), (l))
#define __local_sub(i, l) local_sub((i), (l))
-/* Use these for per-cpu local_t variables: on some archs they are
- * much more efficient than these naive implementations. Note they take
- * a variable, not an address.
- *
- * X86_64: This could be done better if we moved the per cpu data directly
- * after GS.
- */
-
-/* Need to disable preemption for the cpu local counters otherwise we could
- still access a variable of a previous CPU in a non atomic way. */
-#define cpu_local_wrap_v(l) \
-({ \
- local_t res__; \
- preempt_disable(); \
- res__ = (l); \
- preempt_enable(); \
- res__; \
-})
-#define cpu_local_wrap(l) \
-({ \
- preempt_disable(); \
- (l); \
- preempt_enable(); \
-}) \
-
-#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
-#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
-#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
-#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
-#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
-#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
-
-#define __cpu_local_inc(l) cpu_local_inc((l))
-#define __cpu_local_dec(l) cpu_local_dec((l))
-#define __cpu_local_add(i, l) cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
-
#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dd59a85a918..5653f43d90e 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
}
-#ifdef CONFIG_HIGHPTE
-static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
-{
- unsigned long ret;
- ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
- return (void *)ret;
-}
-#endif
-
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b1e70d51e40..db9ef553234 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -304,10 +304,6 @@ struct pv_mmu_ops {
#endif /* PAGETABLE_LEVELS == 4 */
#endif /* PAGETABLE_LEVELS >= 3 */
-#ifdef CONFIG_HIGHPTE
- void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
-#endif
-
struct pv_lazy_ops lazy_mode;
/* dom0 ops */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c201d51..b4a00dd4eed 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -124,6 +124,8 @@ extern void pci_iommu_alloc(void);
#include "pci_64.h"
#endif
+void dma32_reserve_bootmem(void);
+
/* implement the pci_ DMA API in terms of the generic device dma_ one */
#include <asm-generic/pci-dma-compat.h>
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index ae5e40f67da..fe15cfb21b9 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
int reg, int len, u32 value);
-extern void dma32_reserve_bootmem(void);
-
#endif /* __KERNEL__ */
#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0c44196b78a..66a272dfd8b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -25,19 +25,18 @@
*/
#ifdef CONFIG_SMP
#define PER_CPU(var, reg) \
- __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
- lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
+ __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \
+ lea var(reg), reg
+#define PER_CPU_VAR(var) %__percpu_seg:var
#else /* ! SMP */
-#define PER_CPU(var, reg) \
- __percpu_mov_op $per_cpu__##var, reg
-#define PER_CPU_VAR(var) per_cpu__##var
+#define PER_CPU(var, reg) __percpu_mov_op $var, reg
+#define PER_CPU_VAR(var) var
#endif /* SMP */
#ifdef CONFIG_X86_64_SMP
#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
#else
-#define INIT_PER_CPU_VAR(var) per_cpu__##var
+#define INIT_PER_CPU_VAR(var) var
#endif
#else /* ...!ASSEMBLY */
@@ -60,12 +59,12 @@
* There also must be an entry in vmlinux_64.lds.S
*/
#define DECLARE_INIT_PER_CPU(var) \
- extern typeof(per_cpu_var(var)) init_per_cpu_var(var)
+ extern typeof(var) init_per_cpu_var(var)
#ifdef CONFIG_X86_64_SMP
#define init_per_cpu_var(var) init_per_cpu__##var
#else
-#define init_per_cpu_var(var) per_cpu_var(var)
+#define init_per_cpu_var(var) var
#endif
/* For arch-specific code, we can use direct single-insn ops (they
@@ -104,6 +103,64 @@ do { \
} \
} while (0)
+/*
+ * Generate a percpu add to memory instruction and optimize code
+ * if a one is added or subtracted.
+ */
+#define percpu_add_op(var, val) \
+do { \
+ typedef typeof(var) pao_T__; \
+ const int pao_ID__ = (__builtin_constant_p(val) && \
+ ((val) == 1 || (val) == -1)) ? (val) : 0; \
+ if (0) { \
+ pao_T__ pao_tmp__; \
+ pao_tmp__ = (val); \
+ } \
+ switch (sizeof(var)) { \
+ case 1: \
+ if (pao_ID__ == 1) \
+ asm("incb "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decb "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addb %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "qi" ((pao_T__)(val))); \
+ break; \
+ case 2: \
+ if (pao_ID__ == 1) \
+ asm("incw "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decw "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addw %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pao_T__)(val))); \
+ break; \
+ case 4: \
+ if (pao_ID__ == 1) \
+ asm("incl "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decl "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addl %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pao_T__)(val))); \
+ break; \
+ case 8: \
+ if (pao_ID__ == 1) \
+ asm("incq "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decq "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addq %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "re" ((pao_T__)(val))); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+} while (0)
+
#define percpu_from_op(op, var, constraint) \
({ \
typeof(var) pfo_ret__; \
@@ -142,16 +199,14 @@ do { \
* per-thread variables implemented as per-cpu variables and thus
* stable for the duration of the respective task.
*/
-#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \
- "m" (per_cpu__##var))
-#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \
- "p" (&per_cpu__##var))
-#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
-#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
-#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
+#define percpu_read(var) percpu_from_op("mov", var, "m" (var))
+#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var)))
+#define percpu_write(var, val) percpu_to_op("mov", var, val)
+#define percpu_add(var, val) percpu_add_op(var, val)
+#define percpu_sub(var, val) percpu_add_op(var, -(val))
+#define percpu_and(var, val) percpu_to_op("and", var, val)
+#define percpu_or(var, val) percpu_to_op("or", var, val)
+#define percpu_xor(var, val) percpu_to_op("xor", var, val)
#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -160,9 +215,9 @@ do { \
#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
-#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val)
-#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val)
-#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val)
+#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -179,9 +234,9 @@ do { \
#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
-#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val)
-#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val)
-#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val)
+#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -192,9 +247,9 @@ do { \
#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val)
-#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val)
-#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val)
+#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -212,19 +267,19 @@ do { \
#ifdef CONFIG_X86_64
#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
-#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val)
+#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
-#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val)
+#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val)
+#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
@@ -236,7 +291,7 @@ do { \
({ \
int old__; \
asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
- : "=r" (old__), "+m" (per_cpu__##var) \
+ : "=r" (old__), "+m" (var) \
: "dIr" (bit)); \
old__; \
})
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index a2866839650..47339a1ac7b 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
in_irq() ? KM_IRQ_PTE : \
KM_PTE0)
#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \
+ ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
pte_index((address)))
#define pte_offset_map_nested(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
+ ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
pte_index((address)))
#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 4009f6534f5..6f414ed8862 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -23,14 +23,4 @@ extern int reboot_force;
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
-/*
- * This looks more complex than it should be. But we need to
- * get the type for the ~ right in round_down (it needs to be
- * as wide as the result!), and we want to evaluate the macro
- * arguments just once each.
- */
-#define __round_mask(x,y) ((__typeof__(x))((y)-1))
-#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
-#define round_down(x,y) ((x) & ~__round_mask(x,y))
-
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1fecb7e6113..38638cd2fa4 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_ERR -1
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e04740f7a0b..b8fe48ee2ed 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -32,7 +32,7 @@ extern void show_regs_common(void);
"movl %P[task_canary](%[next]), %%ebx\n\t" \
"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
#define __switch_canary_oparam \
- , [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
+ , [stack_canary] "=m" (stack_canary.canary)
#define __switch_canary_iparam \
, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
#else /* CC_STACKPROTECTOR */
@@ -114,7 +114,7 @@ do { \
"movq %P[task_canary](%%rsi),%%r8\n\t" \
"movq %%r8,"__percpu_arg([gs_canary])"\n\t"
#define __switch_canary_oparam \
- , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
+ , [gs_canary] "=m" (irq_stack_union.stack_canary)
#define __switch_canary_iparam \
, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
#else /* CC_STACKPROTECTOR */
@@ -133,7 +133,7 @@ do { \
__switch_canary \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
"movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
"jnz ret_from_fork\n\t" \
RESTORE_CONTEXT \
: "=a" (last) \
@@ -143,7 +143,7 @@ do { \
[ti_flags] "i" (offsetof(struct thread_info, flags)), \
[_tif_fork] "i" (_TIF_FORK), \
[thread_info] "i" (offsetof(struct task_struct, stack)), \
- [current_task] "m" (per_cpu_var(current_task)) \
+ [current_task] "m" (current_task) \
__switch_canary_iparam \
: "memory", "cc" __EXTRA_CLOBBER)
#endif
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b4945419a8..fb9a080740e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -53,6 +53,7 @@
*/
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_RDTSCP 0x00000008
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
@@ -251,6 +252,7 @@ enum vmcs_field {
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
@@ -362,6 +364,7 @@ enum vmcs_field {
#define VMX_EPTP_UC_BIT (1ull << 8)
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
+#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -374,7 +377,7 @@ enum vmcs_field {
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
-#define VMX_EPT_IGMT_BIT (1ull << 6)
+#define VMX_EPT_IPAT_BIT (1ull << 6)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f95703098f8..738fcb60e70 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -447,6 +447,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
{
*irq = gsi;
+
+#ifdef CONFIG_X86_IO_APIC
+ if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
+ setup_IO_APIC_irq_extra(gsi);
+#endif
+
return 0;
}
@@ -474,7 +480,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
}
#endif
- acpi_gsi_to_irq(plat_gsi, &irq);
+ irq = plat_gsi;
+
return irq;
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e6ea0342c8f..3a4bf35c179 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
+#include <linux/stop_machine.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -572,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
local_irq_restore(flags);
return addr;
}
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from immediate value.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+ void *addr;
+ const void *opcode;
+ size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+ struct text_poke_params *tpp = data;
+
+ if (atomic_dec_and_test(&stop_machine_first)) {
+ text_poke(tpp->addr, tpp->opcode, tpp->len);
+ smp_wmb(); /* Make sure other cpus see that this has run */
+ wrote_text = 1;
+ } else {
+ while (!wrote_text)
+ cpu_relax();
+ smp_mb(); /* Load wrote_text before following execution */
+ }
+
+ flush_icache_range((unsigned long)tpp->addr,
+ (unsigned long)tpp->addr + tpp->len);
+ return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. This allows
+ * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
+ * should be allowed, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+ struct text_poke_params tpp;
+
+ tpp.addr = addr;
+ tpp.opcode = opcode;
+ tpp.len = len;
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+ return addr;
+}
+
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6bdd2c7ead7..14862f11cc4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -73,8 +73,8 @@
*/
int sis_apic_bug = -1;
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(vector_lock);
/*
* # of IRQ routing registers
@@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
-/* Number of legacy interrupts */
-static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
/* GSI interrupts */
static int nr_irqs_gsi = NR_IRQS_LEGACY;
@@ -140,27 +138,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg irq_cfgx[] = {
+static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
#else
-static struct irq_cfg irq_cfgx[NR_IRQS] = {
+static struct irq_cfg irq_cfgx[NR_IRQS];
#endif
- [0] = { .vector = IRQ0_VECTOR, },
- [1] = { .vector = IRQ1_VECTOR, },
- [2] = { .vector = IRQ2_VECTOR, },
- [3] = { .vector = IRQ3_VECTOR, },
- [4] = { .vector = IRQ4_VECTOR, },
- [5] = { .vector = IRQ5_VECTOR, },
- [6] = { .vector = IRQ6_VECTOR, },
- [7] = { .vector = IRQ7_VECTOR, },
- [8] = { .vector = IRQ8_VECTOR, },
- [9] = { .vector = IRQ9_VECTOR, },
- [10] = { .vector = IRQ10_VECTOR, },
- [11] = { .vector = IRQ11_VECTOR, },
- [12] = { .vector = IRQ12_VECTOR, },
- [13] = { .vector = IRQ13_VECTOR, },
- [14] = { .vector = IRQ14_VECTOR, },
- [15] = { .vector = IRQ15_VECTOR, },
-};
void __init io_apic_disable_legacy(void)
{
@@ -185,8 +166,14 @@ int __init arch_early_irq_init(void)
desc->chip_data = &cfg[i];
zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
- if (i < nr_legacy_irqs)
- cpumask_setall(cfg[i].domain);
+ /*
+ * For legacy IRQ's, start with assigning irq0 to irq15 to
+ * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
+ */
+ if (i < nr_legacy_irqs) {
+ cfg[i].vector = IRQ0_VECTOR + i;
+ cpumask_set_cpu(0, cfg[i].domain);
+ }
}
return 0;
@@ -406,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
struct irq_pin_list *entry;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
unsigned int reg;
int pin;
@@ -415,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return false;
}
@@ -433,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
union entry_union eu;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return eu.entry;
}
@@ -459,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, e);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -474,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin)
unsigned long flags;
union entry_union eu = { .entry.mask = 1 };
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -604,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
BUG_ON(!cfg);
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__mask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -614,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void mask_IO_APIC_irq(unsigned int irq)
@@ -1140,12 +1127,12 @@ void lock_vector_lock(void)
/* Used to the online set of cpus does not change
* during assign_irq_vector.
*/
- spin_lock(&vector_lock);
+ raw_spin_lock(&vector_lock);
}
void unlock_vector_lock(void)
{
- spin_unlock(&vector_lock);
+ raw_spin_unlock(&vector_lock);
}
static int
@@ -1162,7 +1149,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+ static int current_offset = VECTOR_OFFSET_START % 8;
unsigned int old_vector;
int cpu, err;
cpumask_var_t tmp_mask;
@@ -1198,7 +1186,7 @@ next:
if (vector >= first_system_vector) {
/* If out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
+ vector = FIRST_EXTERNAL_VECTOR + offset;
}
if (unlikely(current_vector == vector))
continue;
@@ -1232,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
int err;
unsigned long flags;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
err = __assign_irq_vector(irq, cfg, mask);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
@@ -1268,11 +1256,16 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
- /* This function must be called with vector_lock held */
int irq, vector;
struct irq_cfg *cfg;
struct irq_desc *desc;
+ /*
+ * vector_lock will make sure that we don't run into irq vector
+ * assignments that might be happening on another cpu in parallel,
+ * while we setup our initial vector to irq mappings.
+ */
+ raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_irq_desc(irq, desc) {
cfg = desc->chip_data;
@@ -1291,6 +1284,7 @@ void __setup_vector_irq(int cpu)
if (!cpumask_test_cpu(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
+ raw_spin_unlock(&vector_lock);
}
static struct irq_chip ioapic_chip;
@@ -1440,6 +1434,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
cfg = desc->chip_data;
+ /*
+ * For legacy irqs, cfg->domain starts with cpu 0 for legacy
+ * controllers like 8259. Now that IO-APIC can handle this irq, update
+ * the cfg->domain.
+ */
+ if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
+ apic->vector_allocation_domain(0, cfg->domain);
+
if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
@@ -1473,7 +1475,7 @@ static struct {
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id = 0, pin, idx, irq;
+ int apic_id, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
@@ -1481,14 +1483,7 @@ static void __init setup_IO_APIC_irqs(void)
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- apic_id = mp_find_ioapic(0);
- if (apic_id < 0)
- apic_id = 0;
- }
-#endif
-
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
idx = find_irq_entry(apic_id, pin, mp_INT);
if (idx == -1) {
@@ -1510,6 +1505,9 @@ static void __init setup_IO_APIC_irqs(void)
irq = pin_2_irq(idx, apic_id, pin);
+ if ((apic_id > 0) && (irq > 16))
+ continue;
+
/*
* Skip the timer IRQ if there's a quirk handler
* installed and if it returns 1:
@@ -1539,6 +1537,56 @@ static void __init setup_IO_APIC_irqs(void)
}
/*
+ * for the gsit that is not in first ioapic
+ * but could not use acpi_register_gsi()
+ * like some special sci in IBM x3330
+ */
+void setup_IO_APIC_irq_extra(u32 gsi)
+{
+ int apic_id = 0, pin, idx, irq;
+ int node = cpu_to_node(boot_cpu_id);
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ apic_id = mp_find_ioapic(gsi);
+ if (apic_id < 0)
+ return;
+
+ pin = mp_find_ioapic_pin(apic_id, gsi);
+ idx = find_irq_entry(apic_id, pin, mp_INT);
+ if (idx == -1)
+ return;
+
+ irq = pin_2_irq(idx, apic_id, pin);
+#ifdef CONFIG_SPARSE_IRQ
+ desc = irq_to_desc(irq);
+ if (desc)
+ return;
+#endif
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ return;
+ }
+
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, apic_id, pin);
+
+ if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
+ pr_debug("Pin %d-%d already programmed\n",
+ mp_ioapics[apic_id].apicid, pin);
+ return;
+ }
+ set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
+}
+
+/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1601,14 +1649,14 @@ __apicdebuginit(void) print_IO_APIC(void)
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
reg_01.raw = io_apic_read(apic, 1);
if (reg_01.bits.version >= 0x10)
reg_02.raw = io_apic_read(apic, 2);
if (reg_01.bits.version >= 0x20)
reg_03.raw = io_apic_read(apic, 3);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1830,7 +1878,7 @@ __apicdebuginit(void) print_PIC(void)
printk(KERN_DEBUG "\nprinting PIC contents\n");
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
v = inb(0xa1) << 8 | inb(0x21);
printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1844,7 +1892,7 @@ __apicdebuginit(void) print_PIC(void)
outb(0x0a,0xa0);
outb(0x0a,0x20);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
@@ -1903,9 +1951,9 @@ void __init enable_IO_APIC(void)
* The number of IO-APIC IRQ registers (== #pins):
*/
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
@@ -2045,9 +2093,9 @@ void __init setup_ioapic_ids_from_mpc(void)
for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
/* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
old_id = mp_ioapics[apic_id].apicid;
@@ -2106,16 +2154,16 @@ void __init setup_ioapic_ids_from_mpc(void)
mp_ioapics[apic_id].apicid);
reg_00.bits.ID = mp_ioapics[apic_id].apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic_id, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
printk("could not set ID!\n");
else
@@ -2198,7 +2246,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
unsigned long flags;
struct irq_cfg *cfg;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
if (irq < nr_legacy_irqs) {
disable_8259A_irq(irq);
if (i8259A_irq_pending(irq))
@@ -2206,7 +2254,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
}
cfg = irq_cfg(irq);
__unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
@@ -2217,9 +2265,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
}
@@ -2312,14 +2360,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
irq = desc->irq;
cfg = desc->chip_data;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
ret = set_desc_affinity(desc, mask, &dest);
if (!ret) {
/* Only the high 8 bits are valid. */
dest = SET_APIC_LOGICAL_ID(dest);
__target_IO_APIC_irq(irq, dest, cfg);
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
}
@@ -2554,9 +2602,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
irq = desc->irq;
cfg = desc->chip_data;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void ack_apic_level(unsigned int irq)
@@ -3138,13 +3186,13 @@ static int ioapic_resume(struct sys_device *dev)
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
reg_00.bits.ID = mp_ioapics[dev->id].apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
@@ -3207,7 +3255,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
if (irq_want < nr_irqs_gsi)
irq_want = nr_irqs_gsi;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
desc_new = irq_to_desc_alloc_node(new, node);
if (!desc_new) {
@@ -3226,14 +3274,11 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
irq = new;
break;
}
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (irq > 0)
+ dynamic_irq_init_keep_chip_data(irq);
- if (irq > 0) {
- dynamic_irq_init(irq);
- /* restore it, in case dynamic_irq_init clear it */
- if (desc_new)
- desc_new->chip_data = cfg_new;
- }
return irq;
}
@@ -3255,20 +3300,13 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
- /* store it, in case dynamic_irq_cleanup clear it */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- dynamic_irq_cleanup(irq);
- /* connect back irq_cfg */
- desc->chip_data = cfg;
+ dynamic_irq_cleanup_keep_chip_data(irq);
free_irte(irq);
- spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, cfg);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ __clear_irq_vector(irq, get_irq_chip_data(irq));
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
}
/*
@@ -3805,9 +3843,9 @@ int __init io_apic_get_redir_entries (int ioapic)
union IO_APIC_reg_01 reg_01;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.entries;
}
@@ -3969,9 +4007,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
if (physids_empty(apic_id_map))
apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
if (apic_id >= get_physical_broadcast()) {
printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -4005,10 +4043,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(ioapic, 0, reg_00.raw);
reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
@@ -4029,9 +4067,9 @@ int __init io_apic_get_version(int ioapic)
union IO_APIC_reg_01 reg_01;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.version;
}
@@ -4063,27 +4101,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
- int pin, ioapic = 0, irq, irq_entry;
+ int pin, ioapic, irq, irq_entry;
struct irq_desc *desc;
const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- ioapic = 0;
- }
-#endif
-
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
irq = pin_2_irq(irq_entry, ioapic, pin);
+ if ((ioapic > 0) && (irq > 16))
+ continue;
+
desc = irq_to_desc(irq);
/*
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 0159a69396c..bd7c96b5e8d 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -416,13 +416,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
/* We can be called before check_nmi_watchdog, hence NULL check. */
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static DEFINE_SPINLOCK(lock); /* Serialise the printks */
+ static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
- spin_lock(&lock);
+ raw_spin_lock(&lock);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
dump_stack();
- spin_unlock(&lock);
+ raw_spin_unlock(&lock);
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
rc = 1;
@@ -438,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
- __this_cpu_inc(per_cpu_var(alert_counter));
- if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz)
+ __this_cpu_inc(alert_counter);
+ if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
/*
* die_nmi will return ONLY if NOTIFY_STOP happens..
*/
@@ -447,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
regs, panic_on_timeout);
} else {
__get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(per_cpu_var(alert_counter), 0);
+ __this_cpu_write(alert_counter, 0);
}
/* see if the nmi watchdog went off */
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e046..06130b52f01 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/kvm_para.h>
+#include <linux/range.h>
#include <asm/processor.h>
#include <asm/e820.h>
@@ -34,11 +34,6 @@
#include "mtrr.h"
-struct res_range {
- unsigned long start;
- unsigned long end;
-};
-
struct var_mtrr_range_state {
unsigned long base_pfn;
unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
/* Should be related to MTRR_VAR_RANGES nums */
#define RANGE_NUM 256
-static struct res_range __initdata range[RANGE_NUM];
+static struct range __initdata range[RANGE_NUM];
static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
-
-static int __init
-add_range(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- /* Out of slots: */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- int i;
-
- /* Try to merge it with old one: */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* Need to add it: */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* Find the new spare: */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
-
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
-}
-
-static int __init clean_sort_range(struct res_range *range, int az)
-{
- int i, j, k = az - 1, nr_range = 0;
-
- for (i = 0; i < k; i++) {
- if (range[i].end)
- continue;
- for (j = k; j > i; j--) {
- if (range[j].end) {
- k = j;
- break;
- }
- }
- if (j == i)
- break;
- range[i].start = range[k].start;
- range[i].end = range[k].end;
- range[k].start = 0;
- range[k].end = 0;
- k--;
- }
- /* count it */
- for (i = 0; i < az; i++) {
- if (!range[i].end) {
- nr_range = i;
- break;
- }
- }
-
- /* sort them */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
- return nr_range;
-}
-
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
/* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
- subtract_range(range, base, base + size - 1);
+ subtract_range(range, RANGE_NUM, base, base + size);
}
if (extra_remove_size)
- subtract_range(range, extra_remove_base,
- extra_remove_base + extra_remove_size - 1);
+ subtract_range(range, RANGE_NUM, extra_remove_base,
+ extra_remove_base + extra_remove_size);
if (debug_print) {
printk(KERN_DEBUG "After UC checking\n");
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
}
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
-
return nr_range;
}
#ifdef CONFIG_MTRR_SANITIZER
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
unsigned long sum = 0;
int i;
for (i = 0; i < nr_range; i++)
- sum += range[i].end + 1 - range[i].start;
+ sum += range[i].end - range[i].start;
return sum;
}
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
/* Write the range: */
for (i = 0; i < nr_range; i++) {
set_var_mtrr_range(&var_state, range[i].start,
- range[i].end - range[i].start + 1);
+ range[i].end - range[i].start);
}
/* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct res_range range_new[RANGE_NUM];
+ static struct range range_new[RANGE_NUM];
unsigned long range_sums_new;
static int nr_range_new;
int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
+ 1ULL<<(20 - PAGE_SHIFT));
/* Sort the ranges: */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ sort_range(range, nr_range);
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
nr_range = 0;
if (mtrr_tom2) {
range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
- if (highest_pfn < range[nr_range].end + 1)
- highest_pfn = range[nr_range].end + 1;
+ range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+ if (highest_pfn < range[nr_range].end)
+ highest_pfn = range[nr_range].end;
nr_range++;
}
nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* Check the holes: */
for (i = 0; i < nr_range - 1; i++) {
- if (range[i].end + 1 < range[i+1].start)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end,
range[i+1].start);
}
/* Check the top: */
i = nr_range - 1;
- if (range[i].end + 1 < end_pfn)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end,
end_pfn);
if (total_trim_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index fe4622e8c83..79556bd9b60 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -145,6 +145,7 @@ struct set_mtrr_data {
/**
* ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a966b753e49..740b440fbd7 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/firmware-map.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/trampoline.h>
/*
* The e820 map is the map that gets modified e.g. with command line parameters
@@ -730,319 +722,44 @@ core_initcall(e820_mark_nvs_memory);
#endif
/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 32
-
-struct early_res {
- u64 start, end;
- char name[16];
- char overlap_ok;
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
-#endif
-
- {}
-};
-
-static int __init find_overlapped_early(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- break;
- }
-
- return i;
-}
-
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
- */
-static void __init drop_range(int i)
-{
- int j;
-
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
-}
-
-/*
- * Split any existing ranges that:
- * 1) are marked 'overlap_ok', and
- * 2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range. Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
+ * Find a free area with specified alignment in a specific range.
*/
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
{
int i;
- struct early_res *r;
- u64 lower_start, lower_end;
- u64 upper_start, upper_end;
- char name[16];
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr;
+ u64 ei_start, ei_last;
- /* Continue past non-overlapping ranges */
- if (end <= r->start || start >= r->end)
+ if (ei->type != E820_RAM)
continue;
- /*
- * Leave non-ok overlaps as is; let caller
- * panic "Overlapping early reservations"
- * when it hits this overlap.
- */
- if (!r->overlap_ok)
- return;
-
- /*
- * We have an ok overlap. We will drop it from the early
- * reservation map, and add back in any non-overlapping
- * portions (lower or upper) as separate, overlap_ok,
- * non-overlapping ranges.
- */
-
- /* 1. Note any non-overlapping (lower or upper) ranges. */
- strncpy(name, r->name, sizeof(name) - 1);
-
- lower_start = lower_end = 0;
- upper_start = upper_end = 0;
- if (r->start < start) {
- lower_start = r->start;
- lower_end = start;
- }
- if (r->end > end) {
- upper_start = end;
- upper_end = r->end;
- }
-
- /* 2. Drop the original ok overlapping range */
- drop_range(i);
-
- i--; /* resume for-loop on copied down entry */
-
- /* 3. Add back in any non-overlapping ranges. */
- if (lower_end)
- reserve_early_overlap_ok(lower_start, lower_end, name);
- if (upper_end)
- reserve_early_overlap_ok(upper_start, upper_end, name);
- }
-}
-
-static void __init __reserve_early(u64 start, u64 end, char *name,
- int overlap_ok)
-{
- int i;
- struct early_res *r;
-
- i = find_overlapped_early(start, end);
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- if (r->end)
- panic("Overlapping early reservations "
- "%llx-%llx %s to %llx-%llx %s\n",
- start, end - 1, name?name:"", r->start,
- r->end - 1, r->name);
- r->start = start;
- r->end = end;
- r->overlap_ok = overlap_ok;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-/*
- * A few early reservtations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first. We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 1);
-}
-
-/*
- * Most early reservations come here.
- *
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
- */
-void __init reserve_early(u64 start, u64 end, char *name)
-{
- if (start >= end)
- return;
-
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 0);
-}
-
-void __init free_early(u64 start, u64 end)
-{
- struct early_res *r;
- int i;
-
- i = find_overlapped_early(start, end);
- r = &early_res[i];
- if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
- panic("free_early on not reserved area: %llx-%llx!",
- start, end - 1);
-
- drop_range(i);
-}
-
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
- int i, count;
- u64 final_start, final_end;
-
- count = 0;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
- count++;
-
- printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count, start, end);
- for (i = 0; i < count; i++) {
- struct early_res *r = &early_res[i];
- printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
- r->start, r->end, r->name);
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end) {
- printk(KERN_CONT "\n");
- continue;
- }
- printk(KERN_CONT " ==> [%010llx - %010llx]\n",
- final_start, final_end);
- reserve_bootmem_generic(final_start, final_end - final_start,
- BOOTMEM_DEFAULT);
- }
-}
+ ei_last = ei->addr + ei->size;
+ ei_start = ei->addr;
+ addr = find_early_area(ei_start, ei_last, start, end,
+ size, align);
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
-{
- int i;
- u64 addr = *addrp;
- int changed = 0;
- struct early_res *r;
-again:
- i = find_overlapped_early(addr, addr + size);
- r = &early_res[i];
- if (i < MAX_EARLY_RES && r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
+ if (addr != -1ULL)
+ return addr;
}
- return changed;
+ return -1ULL;
}
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
{
- int i;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
+ return find_e820_area(start, end, size, align);
}
-/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+u64 __init get_max_mapped(void)
{
- int i;
+ u64 end = max_pfn_mapped;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ end <<= PAGE_SHIFT;
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1ULL;
+ return end;
}
-
/*
* Find next free range after *start
*/
@@ -1052,25 +769,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ u64 addr;
+ u64 ei_start, ei_last;
if (ei->type != E820_RAM)
continue;
- addr = round_up(ei->addr, align);
+
ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
+ ei_start = ei->addr;
+ addr = find_early_area_size(ei_start, ei_last, start,
+ sizep, align);
+
+ if (addr != -1ULL)
+ return addr;
}
return -1ULL;
@@ -1429,6 +1140,8 @@ void __init e820_reserve_resources_late(void)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
+ printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
+ start, end);
reserve_region_with_split(&iomem_resource, start, end,
"RAM buffer");
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94c906..adedeef1ded 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,6 +29,16 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
+#ifdef CONFIG_X86_TRAMPOLINE
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
+ "EX TRAMPOLINE");
+#endif
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7fd318bac59..37c3d4b17d8 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -442,8 +442,8 @@ is386: movl $2,%ecx # set MP
*/
cmpb $0,ready
jne 1f
- movl $per_cpu__gdt_page,%eax
- movl $per_cpu__stack_canary,%ecx
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
shrl $16, %ecx
movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -706,7 +706,7 @@ idt_descr:
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
- .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
+ .long gdt_page /* Overwritten for secondary CPUs */
/*
* The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef8..8c93a84bb62 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,7 +32,7 @@
*/
static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
static void mask_and_ack_8259A(unsigned int);
struct irq_chip i8259A_chip = {
@@ -68,13 +68,13 @@ void disable_8259A_irq(unsigned int irq)
unsigned int mask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask |= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
void enable_8259A_irq(unsigned int irq)
@@ -82,13 +82,13 @@ void enable_8259A_irq(unsigned int irq)
unsigned int mask = ~(1 << irq);
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask &= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
int i8259A_irq_pending(unsigned int irq)
@@ -97,12 +97,12 @@ int i8259A_irq_pending(unsigned int irq)
unsigned long flags;
int ret;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
if (irq < 8)
ret = inb(PIC_MASTER_CMD) & mask;
else
ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return ret;
}
@@ -150,7 +150,7 @@ static void mask_and_ack_8259A(unsigned int irq)
unsigned int irqmask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
/*
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +183,7 @@ handle_real_irq:
outb(cached_master_mask, PIC_MASTER_IMR);
outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
}
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return;
spurious_8259A_irq:
@@ -285,24 +285,24 @@ void mask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
void unmask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
void init_8259A(int auto_eoi)
@@ -311,7 +311,7 @@ void init_8259A(int auto_eoi)
i8259A_auto_eoi = auto_eoi;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +356,5 @@ void init_8259A(int auto_eoi)
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index d5932226614..fce55d53263 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -84,24 +84,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+ [0 ... NR_VECTORS - 1] = -1,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -116,6 +99,9 @@ int vector_used_by_percpu_irq(unsigned int vector)
return 0;
}
+/* Number of legacy interrupts */
+int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
+
void __init init_ISA_irqs(void)
{
int i;
@@ -142,6 +128,19 @@ void __init init_ISA_irqs(void)
void __init init_IRQ(void)
{
+ int i;
+
+ /*
+ * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+ * If these IRQ's are handled by legacy interrupt-controllers like PIC,
+ * then this configuration will likely be static after the boot. If
+ * these IRQ's are handled by more mordern controllers like IO-APIC,
+ * then this vector space can be freed and re-used dynamically as the
+ * irq's migrate etc.
+ */
+ for (i = 0; i < nr_legacy_irqs; i++)
+ per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
+
x86_init.irqs.intr_init();
}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5de9f4a9c3f..b43bbaebe2c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
};
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
+static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
{
- struct __arch_jmp_op {
- char op;
+ struct __arch_relative_insn {
+ u8 op;
s32 raddr;
- } __attribute__((packed)) * jop;
- jop = (struct __arch_jmp_op *)from;
- jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
- jop->op = RELATIVEJUMP_INSTRUCTION;
+ } __attribute__((packed)) *insn;
+
+ insn = (struct __arch_relative_insn *)from;
+ insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+ insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+static void __kprobes synthesize_reljump(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
/*
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
/*
* Basically, kp->ainsn.insn has an original instruction.
* However, RIP-relative instruction can not do single-stepping
- * at different place, fix_riprel() tweaks the displacement of
+ * at different place, __copy_instruction() tweaks the displacement of
* that instruction. In that case, we can't recover the instruction
* from the kp->ainsn.insn.
*
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
}
/*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
* If it does, Return the address of the 32-bit displacement word.
* If not, return null.
* Only applicable to 64-bit x86.
*/
-static void __kprobes fix_riprel(struct kprobe *p)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
{
-#ifdef CONFIG_X86_64
struct insn insn;
- kernel_insn_init(&insn, p->ainsn.insn);
+ int ret;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ kernel_insn_init(&insn, src);
+ if (recover) {
+ insn_get_opcode(&insn);
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf,
+ (unsigned long)src);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ }
+ insn_get_length(&insn);
+ memcpy(dest, insn.kaddr, insn.length);
+
+#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
s64 newdisp;
u8 *disp;
+ kernel_insn_init(&insn, dest);
insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
* extension of the original signed 32-bit displacement would
* have given.
*/
- newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
- (u8 *) p->ainsn.insn;
+ newdisp = (u8 *) src + (s64) insn.displacement.value -
+ (u8 *) dest;
BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
- disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+ disp = (u8 *) dest + insn_offset_displacement(&insn);
*(s32 *) disp = (s32) newdisp;
}
#endif
+ return insn.length;
}
static void __kprobes arch_copy_kprobe(struct kprobe *p)
{
- memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
- fix_riprel(p);
+ /*
+ * Copy an instruction without recovering int3, because it will be
+ * put by another subsystem.
+ */
+ __copy_instruction(p->ainsn.insn, p->addr, 0);
if (can_boost(p->addr))
p->ainsn.boostable = 0;
@@ -406,18 +432,6 @@ static void __kprobes restore_btf(void)
update_debugctlmsr(current->thread.debugctlmsr);
}
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
-}
-
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -429,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
*sara = (unsigned long) &kretprobe_trampoline;
}
+#ifdef CONFIG_OPTPROBES
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs,
+ int reenter);
+#else
+#define setup_detour_execution(p, regs, reenter) (0)
+#endif
+
static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+ struct kprobe_ctlblk *kcb, int reenter)
{
+ if (setup_detour_execution(p, regs, reenter))
+ return;
+
#if !defined(CONFIG_PREEMPT)
if (p->ainsn.boostable == 1 && !p->post_handler) {
/* Boost up -- we can execute copied instructions directly */
- reset_current_kprobe();
+ if (!reenter)
+ reset_current_kprobe();
+ /*
+ * Reentering boosted probe doesn't reset current_kprobe,
+ * nor set current_kprobe, because it doesn't use single
+ * stepping.
+ */
regs->ip = (unsigned long)p->ainsn.insn;
preempt_enable_no_resched();
return;
}
#endif
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_HIT_SS;
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ /* Prepare real single stepping */
+ clear_btf();
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+ /* single step inline if the instruction is an int3 */
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->ip = (unsigned long)p->addr;
+ else
+ regs->ip = (unsigned long)p->ainsn.insn;
}
/*
@@ -456,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
case KPROBE_HIT_ACTIVE:
- save_previous_kprobe(kcb);
- set_current_kprobe(p, regs, kcb);
kprobes_inc_nmissed_count(p);
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_REENTER;
+ setup_singlestep(p, regs, kcb, 1);
break;
case KPROBE_HIT_SS:
/* A probe has been hit in the codepath leading up to, or just
@@ -535,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
* more here.
*/
if (!p->pre_handler || !p->pre_handler(p, regs))
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} else if (kprobe_running()) {
p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} /* else: not a kprobe fault; let the kernel handle it */
@@ -550,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
return 0;
}
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ " pushq %rbp\n" \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n"
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $16, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %ds\n" \
+ " pushl %es\n" \
+ " pushl %eax\n" \
+ " pushl %ebp\n" \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n"
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+ " addl $24, %esp\n"
+#endif
+
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -563,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
/* We don't bother saving the ss register */
" pushq %rsp\n"
" pushfq\n"
- /*
- * Skip cs, ip, orig_ax.
- * trampoline_handler() will plug in these values
- */
- " subq $24, %rsp\n"
- " pushq %rdi\n"
- " pushq %rsi\n"
- " pushq %rdx\n"
- " pushq %rcx\n"
- " pushq %rax\n"
- " pushq %r8\n"
- " pushq %r9\n"
- " pushq %r10\n"
- " pushq %r11\n"
- " pushq %rbx\n"
- " pushq %rbp\n"
- " pushq %r12\n"
- " pushq %r13\n"
- " pushq %r14\n"
- " pushq %r15\n"
+ SAVE_REGS_STRING
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
- " popq %r15\n"
- " popq %r14\n"
- " popq %r13\n"
- " popq %r12\n"
- " popq %rbp\n"
- " popq %rbx\n"
- " popq %r11\n"
- " popq %r10\n"
- " popq %r9\n"
- " popq %r8\n"
- " popq %rax\n"
- " popq %rcx\n"
- " popq %rdx\n"
- " popq %rsi\n"
- " popq %rdi\n"
- /* Skip orig_ax, ip, cs */
- " addq $24, %rsp\n"
+ RESTORE_REGS_STRING
" popfq\n"
#else
" pushf\n"
- /*
- * Skip cs, ip, orig_ax and gs.
- * trampoline_handler() will plug in these values
- */
- " subl $16, %esp\n"
- " pushl %fs\n"
- " pushl %es\n"
- " pushl %ds\n"
- " pushl %eax\n"
- " pushl %ebp\n"
- " pushl %edi\n"
- " pushl %esi\n"
- " pushl %edx\n"
- " pushl %ecx\n"
- " pushl %ebx\n"
+ SAVE_REGS_STRING
" movl %esp, %eax\n"
" call trampoline_handler\n"
/* Move flags to cs */
@@ -629,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
" movl %edx, 52(%esp)\n"
/* Replace saved flags with true return address. */
" movl %eax, 56(%esp)\n"
- " popl %ebx\n"
- " popl %ecx\n"
- " popl %edx\n"
- " popl %esi\n"
- " popl %edi\n"
- " popl %ebp\n"
- " popl %eax\n"
- /* Skip ds, es, fs, gs, orig_ax and ip */
- " addl $24, %esp\n"
+ RESTORE_REGS_STRING
" popf\n"
#endif
" ret\n");
@@ -805,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
* These instructions can be executed directly if it
* jumps back to correct address.
*/
- set_jmp_op((void *)regs->ip,
- (void *)orig_ip + (regs->ip - copy_ip));
+ synthesize_reljump((void *)regs->ip,
+ (void *)orig_ip + (regs->ip - copy_ip));
p->ainsn.boostable = 1;
} else {
p->ainsn.boostable = -1;
@@ -1033,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
+
+#ifdef CONFIG_OPTPROBES
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+static void __kprobes synthesize_relcall(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
+ unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+void __kprobes kprobes_optinsn_template_holder(void)
+{
+ asm volatile (
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry: \n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end: \n");
+}
+
+#define TMPL_MOVE_IDX \
+ ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op,
+ struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __get_cpu_var(current_kprobe) = &op->kp;
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __get_cpu_var(current_kprobe) = NULL;
+ }
+ preempt_enable_no_resched();
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+ int len = 0, ret;
+
+ while (len < RELATIVEJUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, 1);
+ if (!ret || !can_boost(dest + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+ int ret;
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ /* Dummy buffers for lookup_symbol_attrs */
+ static char __dummy_buf[KSYM_NAME_LEN];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVEJUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jumps into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ kernel_insn_init(&insn, (void *)addr);
+ insn_get_opcode(&insn);
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf, addr);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ insn_get_length(&insn);
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disabled(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
+{
+ return ((unsigned long)op->kp.addr <= addr &&
+ (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ if (op->optinsn.insn) {
+ free_optinsn_slot(op->optinsn.insn, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+ u8 *buf;
+ int ret;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ op->optinsn.insn = get_optinsn_slot();
+ if (!op->optinsn.insn)
+ return -ENOMEM;
+
+ /*
+ * Verify if the address gap is in 2GB range, because this uses
+ * a relative jump.
+ */
+ rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ if (abs(rel) > 0x7fffffff)
+ return -ERANGE;
+
+ buf = (u8 *)op->optinsn.insn;
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+ if (ret < 0) {
+ __arch_remove_optimized_kprobe(op, 0);
+ return ret;
+ }
+ op->optinsn.size = ret;
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+ (u8 *)op->kp.addr + op->optinsn.size);
+
+ flush_icache_range((unsigned long) buf,
+ (unsigned long) buf + TMPL_END_IDX +
+ op->optinsn.size + RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+/* Replace a breakpoint (int3) with a relative jump. */
+int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+{
+ unsigned char jmp_code[RELATIVEJUMP_SIZE];
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ jmp_code[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&jmp_code[1]) = rel;
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 buf[RELATIVEJUMP_SIZE];
+
+ /* Set int3 to first byte for kprobes */
+ buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs,
+ int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
+}
+#endif
+
int __init arch_init_kprobes(void)
{
return 0;
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc41..71825806cd4 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dmi.h>
+#include <linux/range.h>
+
#include <asm/pci-direct.h>
#include <linux/sort.h>
#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-struct range {
- u64 start;
- u64 end;
-};
-
static int __cpuinit cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d1631..1db183ed7c0 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = kmap_atomic,
-#endif
-
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e21f61..1aa966c565f 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
static __initdata void *dma32_bootmem_ptr;
static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
@@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(void)
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+#else
+void __init dma32_reserve_bootmem(void)
+{
+}
+static void __init dma32_free_bootmem(void)
+{
+}
+
#endif
void __init pci_iommu_alloc(void)
{
-#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
-#endif
+
if (pci_swiotlb_detect())
goto out;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 704bddcdf64..8e1aac86b50 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -461,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
},
},
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb42109a55b..5d7ba1a449b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -969,15 +969,11 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
+#ifndef CONFIG_NO_BOOTMEM
+ early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+#endif
-#ifdef CONFIG_X86_64
- /*
- * dma32_reserve_bootmem() allocates bootmem which may conflict
- * with the crashkernel command line, so do that after
- * reserve_crashkernel()
- */
dma32_reserve_bootmem();
-#endif
reserve_ibft_region();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e..ef6370b00e7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ u64 start = __pa(ptr);
+ u64 end = start + size;
+ free_early_partial(start, end);
+#else
free_bootmem(__pa(ptr), size);
+#endif
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9b4401115ea..a435c76d714 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -241,6 +241,11 @@ static void __cpuinit smp_callin(void)
map_cpu_to_logical_apicid();
notify_cpu_starting(cpuid);
+
+ /*
+ * Need to setup vector mappings before we enable interrupts.
+ */
+ __setup_vector_irq(smp_processor_id());
/*
* Get our bogomips.
*
@@ -315,7 +320,6 @@ notrace static void __cpuinit start_secondary(void *unused)
*/
ipi_call_lock();
lock_vector_lock();
- __setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
ipi_call_unlock();
@@ -1212,11 +1216,12 @@ __init void prefill_possible_map(void)
total_cpus = max_t(int, possible, num_processors + disabled_cpus);
- if (possible > CONFIG_NR_CPUS) {
+ /* nr_cpu_ids could be reduced via nr_cpus= */
+ if (possible > nr_cpu_ids) {
printk(KERN_WARNING
"%d Processors exceeds NR_CPUS limit of %d\n",
- possible, CONFIG_NR_CPUS);
- possible = CONFIG_NR_CPUS;
+ possible, nr_cpu_ids);
+ possible = nr_cpu_ids;
}
printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed..fb5cc5e14cf 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
* manually to deassert NMI lines for the watchdog if run
* on an 82489DX-based system.
*/
- spin_lock(&i8259A_lock);
+ raw_spin_lock(&i8259A_lock);
outb(0x0c, PIC_MASTER_OCW3);
/* Ack the IRQ; AEOI will end it automatically. */
inb(PIC_MASTER_POLL);
- spin_unlock(&i8259A_lock);
+ raw_spin_unlock(&i8259A_lock);
}
global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 34a279a7471..ab38ce0984f 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -559,7 +559,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
struct irq_desc *desc;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
/* Find out what's interrupting in the PIIX4 master 8259 */
outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +596,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
outb(0x60 + realirq, 0x20);
}
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
desc = irq_to_desc(realirq);
@@ -614,7 +614,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
return IRQ_HANDLED;
out_unlock:
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return IRQ_NONE;
}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c3019..7dd599deca4 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -33,6 +33,7 @@
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
+#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/vmi_time.h>
@@ -266,30 +267,6 @@ static void vmi_nop(void)
{
}
-#ifdef CONFIG_HIGHPTE
-static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- void *va = kmap_atomic(page, type);
-
- /*
- * Internally, the VMI ROM must map virtual addresses to physical
- * addresses for processing MMU updates. By the time MMU updates
- * are issued, this information is typically already lost.
- * Fortunately, the VMI provides a cache of mapping slots for active
- * page tables.
- *
- * We use slot zero for the linear mapping of physical memory, and
- * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
- *
- * args: SLOT VA COUNT PFN
- */
- BUG_ON(type != KM_PTE0 && type != KM_PTE1);
- vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
-
- return va;
-}
-#endif
-
static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +617,12 @@ static inline int __init activate_vmi(void)
u64 reloc;
const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+ /*
+ * Prevent page tables from being allocated in highmem, even if
+ * CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
if (call_vrom_func(vmi_rom, vmi_init) != 0) {
printk(KERN_ERR "VMI ROM failed to initialize!");
return 0;
@@ -778,10 +761,6 @@ static inline int __init activate_vmi(void)
/* Set linear is needed in all cases */
vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-#ifdef CONFIG_HIGHPTE
- if (vmi_ops.set_linear_mapping)
- pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
-#endif
/*
* These MUST always be patched. Don't support indirect jumps
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 74c92bb194d..2f1ca561429 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
static inline unsigned int vmi_get_timer_vector(void)
{
-#ifdef CONFIG_X86_IO_APIC
- return FIRST_DEVICE_VECTOR;
-#else
- return FIRST_EXTERNAL_VECTOR;
-#endif
+ return IRQ0_VECTOR;
}
/** vmi clockchip */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608c..44879df5569 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -341,7 +341,7 @@ SECTIONS
* Per-cpu symbols which need to be offset from __per_cpu_load
* for the boot processor.
*/
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
INIT_PER_CPU(gdt_page);
INIT_PER_CPU(irq_stack_union);
@@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
"kernel image bigger than KERNEL_IMAGE_SIZE");
#ifdef CONFIG_SMP
-. = ASSERT((per_cpu__irq_stack_union == 0),
+. = ASSERT((irq_stack_union == 0),
"irq_stack_union is not at start of per-cpu area");
#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9055e5872ff..1c0c6ab9c60 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -301,7 +301,8 @@ static int __init vsyscall_init(void)
register_sysctl_table(kernel_root_table2);
#endif
on_each_cpu(cpu_vsyscall_init, NULL, 1);
- hotcpu_notifier(cpu_vsyscall_notifier, 0);
+ /* notifier priority > KVM */
+ hotcpu_notifier(cpu_vsyscall_notifier, 30);
return 0;
}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 3c4d0109ad2..970bbd47951 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
select USER_RETURN_NOTIFIER
+ select KVM_MMIO
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7e8faea4651..4dade6ac082 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
#include <linux/module.h>
#include <asm/kvm_emulate.h>
-#include "mmu.h" /* for is_long_mode() */
+#include "x86.h"
/*
* Opcode effective-address decode tables.
@@ -76,6 +76,8 @@
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */
/* Misc flags */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
#define No64 (1<<28)
/* Source 2 operand type */
#define Src2None (0<<29)
@@ -88,39 +90,40 @@
enum {
Group1_80, Group1_81, Group1_82, Group1_83,
Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+ Group8, Group9,
};
static u32 opcode_table[256] = {
/* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, 0,
/* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0,
/* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0,
/* 0x38 - 0x3F */
@@ -156,7 +159,7 @@ static u32 opcode_table[256] = {
Group | Group1_80, Group | Group1_81,
Group | Group1_82, Group | Group1_83,
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -210,7 +213,7 @@ static u32 opcode_table[256] = {
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
- ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
+ ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -218,16 +221,20 @@ static u32 opcode_table[256] = {
static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+ 0, Group | GroupDual | Group7, 0, 0,
+ 0, ImplicitOps, ImplicitOps | Priv, 0,
+ ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
+ 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
/* 0x20 - 0x2F */
- ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0,
+ ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
+ ImplicitOps, ImplicitOps | Priv, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -257,21 +264,23 @@ static u32 twobyte_table[256] = {
DstMem | SrcReg | Src2CL | ModRM, 0, 0,
/* 0xA8 - 0xAF */
ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
DstMem | SrcReg | Src2ImmByte | ModRM,
DstMem | SrcReg | Src2CL | ModRM,
ModRM, 0,
/* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM | BitOp,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+ 0, 0,
+ Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
+ 0, 0, 0, Group | GroupDual | Group9,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xD0 - 0xDF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -283,25 +292,41 @@ static u32 twobyte_table[256] = {
static u32 group_table[] = {
[Group1_80*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM,
[Group1_81*8] =
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM,
[Group1_82*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64,
[Group1_83*8] =
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM,
[Group1A*8] =
DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
[Group3_Byte*8] =
@@ -320,24 +345,39 @@ static u32 group_table[] = {
SrcMem | ModRM | Stack, 0,
SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
[Group7*8] =
- 0, 0, ModRM | SrcMem, ModRM | SrcMem,
+ 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
+ SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
+ [Group8*8] =
+ 0, 0, 0, 0,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
+ [Group9*8] =
+ 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
};
static u32 group2_table[] = {
[Group7*8] =
- SrcNone | ModRM, 0, 0, SrcNone | ModRM,
+ SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
SrcNone | ModRM | DstMem | Mov, 0,
SrcMem16 | ModRM | Mov, 0,
+ [Group9*8] =
+ 0, 0, 0, 0, 0, 0, 0, 0,
};
/* EFLAGS bit definitions. */
+#define EFLG_ID (1<<21)
+#define EFLG_VIP (1<<20)
+#define EFLG_VIF (1<<19)
+#define EFLG_AC (1<<18)
#define EFLG_VM (1<<17)
#define EFLG_RF (1<<16)
+#define EFLG_IOPL (3<<12)
+#define EFLG_NT (1<<14)
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
#define EFLG_IF (1<<9)
+#define EFLG_TF (1<<8)
#define EFLG_SF (1<<7)
#define EFLG_ZF (1<<6)
#define EFLG_AF (1<<4)
@@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
if (linear < fc->start || linear >= fc->end) {
size = min(15UL, PAGE_SIZE - offset_in_page(linear));
- rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+ rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
if (rc)
return rc;
fc->start = linear;
@@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
op_bytes = 3;
*address = 0;
rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu);
+ ctxt->vcpu, NULL);
if (rc)
return rc;
rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu);
+ ctxt->vcpu, NULL);
return rc;
}
@@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
switch (mode) {
case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
case X86EMUL_MODE_PROT16:
def_op_bytes = def_ad_bytes = 2;
break;
@@ -975,7 +1016,7 @@ done_prefixes:
}
if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");;
+ kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
return -1;
}
@@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
rc = ops->read_emulated(register_address(c, ss_base(ctxt),
c->regs[VCPU_REGS_RSP]),
dest, len, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
return rc;
}
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+
+ rc = emulate_pop(ctxt, ops, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= EFLG_IOPL;
+ if (cpl <= iopl)
+ change_mask |= EFLG_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ change_mask |= EFLG_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (EFLG_IOPL | EFLG_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
{
struct decode_cache *c = &ctxt->decode;
@@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
if (rc != 0)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
return rc;
}
@@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
int rc;
rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
(u32) c->regs[VCPU_REGS_RBX];
rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
ctxt->eflags |= EFLG_ZF;
}
@@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
if (rc)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
&c->dst.val,
c->dst.bytes,
ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
break;
case OP_NONE:
@@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
/* syscall is not available in real mode */
- if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
- || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
- return -1;
+ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static int
@@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
struct kvm_segment cs, ss;
u64 msr_data;
- /* inject #UD if LOCK prefix is used */
- if (c->lock_prefix)
- return -1;
-
- /* inject #GP if in real mode or paging is disabled */
- if (ctxt->mode == X86EMUL_MODE_REAL ||
- !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_UNHANDLEABLE;
}
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64)
- return -1;
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
break;
case X86EMUL_MODE_PROT64:
if (msr_data == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
break;
}
@@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
- return 0;
+ return X86EMUL_CONTINUE;
}
static int
@@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
int usermode;
- /* inject #UD if LOCK prefix is used */
- if (c->lock_prefix)
- return -1;
-
- /* inject #GP if in real mode or paging is disabled */
- if (ctxt->mode == X86EMUL_MODE_REAL
- || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
- }
-
- /* sysexit must be called from CPL 0 */
- if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_UNHANDLEABLE;
}
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.selector = (u16)(msr_data + 16);
if ((msr_data & 0xfffc) == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
ss.selector = (u16)(msr_data + 24);
break;
@@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.selector = (u16)(msr_data + 32);
if (msr_data == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
ss.selector = cs.selector + 8;
cs.db = 0;
@@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
- return 0;
+ return X86EMUL_CONTINUE;
+}
+
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+}
+
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ struct kvm_segment tr_seg;
+ int r;
+ u16 io_bitmap_ptr;
+ u8 perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+
+ kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
+ if (tr_seg.unusable)
+ return false;
+ if (tr_seg.limit < 103)
+ return false;
+ r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
+ NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ return false;
+ r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
+ ctxt->vcpu, NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
+
+static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+ return false;
+ return true;
}
int
@@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
saved_eip = c->eip;
+ /* LOCK prefix is allowed only with some instructions */
+ if (c->lock_prefix && !(c->d & Lock)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+
if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
memop = c->modrm_ea;
@@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
&c->src.val,
c->src.bytes,
ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
c->src.orig_val = c->src.val;
}
@@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
c->dst.ptr = (void *)c->dst.ptr +
(c->src.val & mask) / 8;
}
- if (!(c->d & Mov) &&
- /* optimisation - avoid slow emulated read */
- ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0))
- goto done;
+ if (!(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read */
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
}
c->dst.orig_val = c->dst.val;
@@ -1876,7 +2010,12 @@ special_insn:
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
- if (kvm_emulate_pio_string(ctxt->vcpu,
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio_string(ctxt->vcpu,
1,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
@@ -1892,6 +2031,11 @@ special_insn:
return 0;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
if (kvm_emulate_pio_string(ctxt->vcpu,
0,
(c->d & ByteOp) ? 1 : c->op_bytes,
@@ -1978,25 +2122,19 @@ special_insn:
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
- int type_bits;
- int err;
sel = c->src.val;
- if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
- if (c->modrm_reg <= 5) {
- type_bits = (c->modrm_reg == 1) ? 9 : 1;
- err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
- type_bits, c->modrm_reg);
- } else {
- printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
- c->modrm);
- goto cannot_emulate;
+ if (c->modrm_reg == VCPU_SREG_CS ||
+ c->modrm_reg > VCPU_SREG_GS) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
}
- if (err < 0)
- goto cannot_emulate;
+ if (c->modrm_reg == VCPU_SREG_SS)
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
@@ -2025,7 +2163,10 @@ special_insn:
c->dst.type = OP_REG;
c->dst.ptr = (unsigned long *) &ctxt->eflags;
c->dst.bytes = c->op_bytes;
- goto pop_instruction;
+ rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ break;
case 0xa0 ... 0xa1: /* mov */
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
c->dst.val = c->src.val;
@@ -2039,11 +2180,12 @@ special_insn:
c->dst.ptr = (unsigned long *)register_address(c,
es_base(ctxt),
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
&c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0)
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2058,10 +2200,11 @@ special_insn:
c->src.ptr = (unsigned long *)register_address(c,
seg_override_base(ctxt, c),
c->regs[VCPU_REGS_RSI]);
- if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
c->dst.type = OP_NONE; /* Disable writeback. */
@@ -2069,10 +2212,11 @@ special_insn:
c->dst.ptr = (unsigned long *)register_address(c,
es_base(ctxt),
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -2102,12 +2246,13 @@ special_insn:
c->dst.type = OP_REG;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2163,11 +2308,9 @@ special_insn:
case 0xe9: /* jmp rel */
goto jmp;
case 0xea: /* jmp far */
- if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
- VCPU_SREG_CS) < 0) {
- DPRINTF("jmp far: Failed to load CS descriptor\n");
- goto cannot_emulate;
- }
+ if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
+ VCPU_SREG_CS))
+ goto done;
c->eip = c->src.val;
break;
@@ -2185,7 +2328,13 @@ special_insn:
case 0xef: /* out (e/r)ax,dx */
port = c->regs[VCPU_REGS_RDX];
io_dir_in = 0;
- do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
+ do_io:
+ if (!emulator_io_permited(ctxt, ops, port,
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
(c->d & ByteOp) ? 1 : c->op_bytes,
port) != 0) {
c->eip = saved_eip;
@@ -2210,13 +2359,21 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- ctxt->eflags &= ~X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
break;
case 0xfb: /* sti */
- toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
- ctxt->eflags |= X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+ ctxt->eflags |= X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
break;
case 0xfc: /* cld */
ctxt->eflags &= ~EFLG_DF;
@@ -2319,8 +2476,9 @@ twobyte_insn:
}
break;
case 0x05: /* syscall */
- if (emulate_syscall(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_syscall(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
@@ -2391,14 +2549,16 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x34: /* sysenter */
- if (emulate_sysenter(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_sysenter(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
case 0x35: /* sysexit */
- if (emulate_sysexit(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_sysexit(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 15578f180e5..294698b6daf 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- spin_lock(&ps->inject_lock);
+ raw_spin_lock(&ps->inject_lock);
if (atomic_dec_return(&ps->pit_timer.pending) < 0)
atomic_inc(&ps->pit_timer.pending);
ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
+ raw_spin_unlock(&ps->inject_lock);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
.write = speaker_ioport_write,
};
-/* Caller must have writers lock on slots_lock */
+/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
@@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
+ raw_spin_lock_init(&pit->pit_state.inject_lock);
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
- ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
if (ret < 0)
goto fail;
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
- ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
&pit->speaker_dev);
if (ret < 0)
goto fail_unregister;
@@ -660,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
return pit;
fail_unregister:
- __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
fail:
- if (pit->irq_source_id >= 0)
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
return NULL;
@@ -723,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
/* Try to inject pending interrupts when
* last one has been acked.
*/
- spin_lock(&ps->inject_lock);
+ raw_spin_lock(&ps->inject_lock);
if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
ps->irq_ack = 0;
inject = 1;
}
- spin_unlock(&ps->inject_lock);
+ raw_spin_unlock(&ps->inject_lock);
if (inject)
__inject_pit_timer_intr(kvm);
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc0..900d6b0ba7c 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- spinlock_t inject_lock;
+ raw_spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index d057c0cbd24..07771da85de 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
* Other interrupt may be delivered to PIC while lock is dropped but
* it should be safe since PIC state is already updated at this stage.
*/
- spin_unlock(&s->pics_state->lock);
+ raw_spin_unlock(&s->pics_state->lock);
kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
- spin_lock(&s->pics_state->lock);
+ raw_spin_lock(&s->pics_state->lock);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
- spin_lock(&s->lock);
+
+ raw_spin_lock(&s->lock);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
}
/*
@@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
pic_update_irq(s);
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
}
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return ret;
}
@@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return intno;
}
@@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return 0;
}
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return 0;
}
@@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return 0;
}
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return 0;
}
@@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
- spin_lock_init(&s->lock);
+ raw_spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
@@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
* Initialize PIO device
*/
kvm_iodevice_init(&s->dev, &picdev_ops);
- ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+ mutex_unlock(&kvm->slots_lock);
if (ret < 0) {
kfree(s);
return NULL;
@@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
return s;
}
+
+void kvm_destroy_pic(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (vpic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
+ }
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index be399e207d5..34b15915754 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
};
struct kvm_pic {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -75,6 +75,7 @@ struct kvm_pic {
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_destroy_pic(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
void kvm_pic_clear_isr_ack(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a440..cff851cf532 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
#ifndef ASM_KVM_CACHE_REGS_H
#define ASM_KVM_CACHE_REGS_H
+#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
return vcpu->arch.pdptrs[index];
}
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if (tmask & vcpu->arch.cr0_guest_owned_bits)
+ kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+ return vcpu->arch.cr0 & mask;
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr4 & mask;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ba8c045da78..4b224f90087 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
return 0;
}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 low, high = 0;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ if (apic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4a..f5fe32c5eda 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+}
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 89a49fb46a2..741373e8ca7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
*/
#include "mmu.h"
+#include "x86.h"
#include "kvm_cache_regs.h"
#include <linux/kvm_host.h>
@@ -29,6 +30,7 @@
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
+#include <linux/srcu.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644);
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_RSVD_MASK (1U << 3)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
#define RMAP_EXT 4
#define ACC_EXEC_MASK 1
@@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+#include <trace/events/kvm.h>
+
+#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
@@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
static int is_write_protection(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.cr0 & X86_CR0_WP;
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
static int is_cpuid_PSE36(void)
@@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void)
static int is_nx(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.shadow_efer & EFER_NX;
+ return vcpu->arch.efer & EFER_NX;
}
static int is_shadow_present_pte(u64 pte)
@@ -253,7 +248,7 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_writeble_pte(unsigned long pte)
+static int is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
@@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm,
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
- unsigned long page_size = PAGE_SIZE;
- struct vm_area_struct *vma;
- unsigned long addr;
+ unsigned long page_size;
int i, ret = 0;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return PT_PAGE_TABLE_LEVEL;
-
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, addr);
- if (!vma)
- goto out;
-
- page_size = vma_kernel_pagesize(vma);
-
-out:
- up_read(&current->mm->mmap_sem);
+ page_size = kvm_host_page_size(kvm, gfn);
for (i = PT_PAGE_TABLE_LEVEL;
i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -503,8 +484,7 @@ out:
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
- int host_level;
- int level = PT_PAGE_TABLE_LEVEL;
+ int host_level, level, max_level;
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
@@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;
- for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level)
+ max_level = kvm_x86_ops->get_lpage_level() < host_level ?
+ kvm_x86_ops->get_lpage_level() : host_level;
+
+ for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
break;
@@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
pfn = spte_to_pfn(*spte);
if (*spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
- if (is_writeble_pte(*spte))
+ if (is_writable_pte(*spte))
kvm_set_pfn_dirty(pfn);
rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
if (!*rmapp) {
@@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
prev_desc = desc;
desc = desc->more;
}
+ pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
BUG();
}
}
@@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!spte);
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- if (is_writeble_pte(*spte)) {
+ if (is_writable_pte(*spte)) {
__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
@@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writeble_pte(*spte)) {
+ if (is_writable_pte(*spte)) {
rmap_remove(kvm, spte);
--kvm->stat.lpages;
__set_spte(spte, shadow_trap_nonpresent_pte);
@@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- if (is_writeble_pte(*spte))
+ if (is_writable_pte(*spte))
kvm_set_pfn_dirty(spte_to_pfn(*spte));
__set_spte(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
@@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data))
{
int i, j;
+ int ret;
int retval = 0;
+ struct kvm_memslots *slots;
- /*
- * If mmap_sem isn't taken, we can look the memslots with only
- * the mmu_lock by skipping over the slots with userspace_addr == 0.
- */
- for (i = 0; i < kvm->nmemslots; i++) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ slots = rcu_dereference(kvm->memslots);
+
+ for (i = 0; i < slots->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
unsigned long start = memslot->userspace_addr;
unsigned long end;
- /* mmu_lock protects userspace_addr */
- if (!start)
- continue;
-
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- retval |= handler(kvm, &memslot->rmap[gfn_offset],
- data);
+ ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
int idx = gfn_offset;
idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
- retval |= handler(kvm,
+ ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
}
+ trace_kvm_age_page(hva, memslot, ret);
+ retval |= ret;
}
}
@@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
u64 *spte;
int young = 0;
- /* always return old for EPT */
+ /*
+ * Emulate the accessed bit for EPT, by checking if this page has
+ * an EPT mapping, and clearing it if it does. On the next access,
+ * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping
+ * out actively used pages or breaking up actively used hugepages.
+ */
if (!shadow_accessed_mask)
- return 0;
+ return kvm_unmap_rmapp(kvm, rmapp, data);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
- int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+ int slot = memslot_id(kvm, gfn);
struct kvm_mmu_page *sp = page_header(__pa(pte));
__set_bit(slot, sp->slot_bitmap);
@@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
struct page *page;
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
if (gpa == UNMAPPED_GVA)
return NULL;
@@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* is responsibility of mmu_get_page / kvm_sync_page.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writeble_pte(*sptep))
+ if (!can_unsync && is_writable_pte(*sptep))
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte))
+ if (is_writable_pte(spte))
spte &= ~PT_WRITABLE_MASK;
}
}
@@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*sptep);
+ int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
} else {
- if (was_writeble)
+ if (was_writable)
kvm_release_pfn_dirty(pfn);
else
kvm_release_pfn_clean(pfn);
@@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, u32 *error)
{
+ if (error)
+ *error = 0;
return vaddr;
}
@@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
if (tdp_enabled)
return 0;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
*/
page = alloc_page(GFP_KERNEL | __GFP_DMA32);
if (!page)
- goto error_1;
+ return -ENOMEM;
+
vcpu->arch.mmu.pae_root = page_address(page);
for (i = 0; i < 4; ++i)
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
return 0;
-
-error_1:
- free_mmu_pages(vcpu);
- return -ENOMEM;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- int npages;
+ int npages, idx;
- if (!down_read_trylock(&kvm->slots_lock))
- continue;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
npages = kvm->arch.n_alloc_mmu_pages -
kvm->arch.n_free_mmu_pages;
@@ -2952,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
nr_to_scan--;
spin_unlock(&kvm->mmu_lock);
- up_read(&kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
}
if (kvm_freed)
list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
int i;
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
+ struct kvm_memslots *slots;
- for (i = 0; i < kvm->nmemslots; i++)
- nr_pages += kvm->memslots[i].npages;
+ slots = rcu_dereference(kvm->memslots);
+ for (i = 0; i < slots->nmemslots; i++)
+ nr_pages += slots->memslots[i].npages;
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
nr_mmu_pages = max(nr_mmu_pages,
@@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
audit_mappings_page(vcpu, ent, va, level - 1);
else {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
gfn_t gfn = gpa >> PAGE_SHIFT;
pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
static int count_rmaps(struct kvm_vcpu *vcpu)
{
int nmaps = 0;
- int i, j, k;
+ int i, j, k, idx;
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = rcu_dereference(kvm->memslots);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+ struct kvm_memory_slot *m = &slots->memslots[i];
struct kvm_rmap_desc *d;
for (j = 0; j < m->npages; ++j) {
@@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
}
}
}
+ srcu_read_unlock(&kvm->srcu, idx);
return nmaps;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b4..be66759321a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
#define __KVM_X86_MMU_H
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
+#define PFERR_FETCH_MASK (1U << 4)
+
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
return kvm_mmu_load(vcpu);
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- return vcpu->arch.shadow_efer & EFER_LMA;
-#else
- return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr0 & X86_CR0_PG;
-}
-
static inline int is_present_gpte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ede2131a922..81eab9a50e6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -162,7 +162,7 @@ walk:
if (rsvd_fault)
goto access_error;
- if (write_fault && !is_writeble_pte(pte))
+ if (write_fault && !is_writable_pte(pte))
if (user_fault || is_write_protection(vcpu))
goto access_error;
@@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+ u32 *error)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr,
+ !!(access & PFERR_WRITE_MASK),
+ !!(access & PFERR_USER_MASK),
+ !!(access & PFERR_FETCH_MASK));
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- }
+ } else if (error)
+ *error = walker.error_code;
return gpa;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d9b33843c8..52f78dd0301 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.shadow_efer = efer;
+ vcpu->arch.efer = efer;
}
static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ svm->vcpu.fpu_active = 1;
+
control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK;
@@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept_dr_read = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK;
+ INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
+ INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
+ INTERCEPT_DR7_MASK;
control->intercept_dr_write = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
INTERCEPT_DR7_MASK;
control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
+ (1ULL << INTERCEPT_SELECTIVE_CR0) |
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
@@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
(1ULL << INTERCEPT_INVLPG));
control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
- control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
+ control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
+ control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
save->g_pat = 0x0007040600070406ULL;
save->cr3 = 0;
save->cr4 = 0;
@@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
init_vmcb(svm);
fx_init(&svm->vcpu);
- svm->vcpu.fpu_active = 1;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
@@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(cpu != vcpu->cpu)) {
u64 delta;
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- delta = vcpu->arch.host_tsc - native_read_tsc();
- svm->vmcb->control.tsc_offset += delta;
- if (is_nested(svm))
- svm->nested.hsave->control.tsc_offset += delta;
+ if (check_tsc_unstable()) {
+ /*
+ * Make sure that the guest sees a monotonically
+ * increasing TSC.
+ */
+ delta = vcpu->arch.host_tsc - native_read_tsc();
+ svm->vmcb->control.tsc_offset += delta;
+ if (is_nested(svm))
+ svm->nested.hsave->control.tsc_offset += delta;
+ }
vcpu->cpu = cpu;
kvm_migrate_timers(vcpu);
svm->asid_generation = 0;
@@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
svm->vmcb->save.gdtr.base = dt->base ;
}
+static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
+static void update_cr0_intercept(struct vcpu_svm *svm)
+{
+ ulong gcr0 = svm->vcpu.arch.cr0;
+ u64 *hcr0 = &svm->vmcb->save.cr0;
+
+ if (!svm->vcpu.fpu_active)
+ *hcr0 |= SVM_CR0_SELECTIVE_MASK;
+ else
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+
+
+ if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+ svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ } else {
+ svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ }
+}
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer |= EFER_LMA;
+ vcpu->arch.efer |= EFER_LMA;
svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer &= ~EFER_LMA;
+ vcpu->arch.efer &= ~EFER_LMA;
svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
- if (npt_enabled)
- goto set;
+ vcpu->arch.cr0 = cr0;
- if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- vcpu->fpu_active = 1;
- }
+ if (!npt_enabled)
+ cr0 |= X86_CR0_PG | X86_CR0_WP;
- vcpu->arch.cr0 = cr0;
- cr0 |= X86_CR0_PG | X86_CR0_WP;
- if (!vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
+ if (!vcpu->fpu_active)
cr0 |= X86_CR0_TS;
- }
-set:
/*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
@@ -997,6 +1022,7 @@ set:
*/
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
+ update_cr0_intercept(svm);
}
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->vmcb->control.asid = sd->next_asid++;
}
-static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
+static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
{
struct vcpu_svm *svm = to_svm(vcpu);
- unsigned long val;
switch (dr) {
case 0 ... 3:
- val = vcpu->arch.db[dr];
+ *dest = vcpu->arch.db[dr];
break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 6:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr6;
+ *dest = vcpu->arch.dr6;
else
- val = svm->vmcb->save.dr6;
+ *dest = svm->vmcb->save.dr6;
break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 7:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr7;
+ *dest = vcpu->arch.dr7;
else
- val = svm->vmcb->save.dr7;
+ *dest = svm->vmcb->save.dr7;
break;
- default:
- val = 0;
}
- return val;
+ return EMULATE_DONE;
}
-static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception)
+static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
- *exception = 0;
-
switch (dr) {
case 0 ... 3:
vcpu->arch.db[dr] = value;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = value;
- return;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- *exception = UD_VECTOR;
- return;
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 6:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
- return;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 7:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
svm->vmcb->save.dr7 = vcpu->arch.dr7;
vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
}
- return;
- default:
- /* FIXME: Possible case? */
- printk(KERN_DEBUG "%s: unexpected dr %u\n",
- __func__, dr);
- *exception = UD_VECTOR;
- return;
+ break;
}
+
+ return EMULATE_DONE;
}
static int pf_interception(struct vcpu_svm *svm)
@@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm)
return 1;
}
-static int nm_interception(struct vcpu_svm *svm)
+static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
- svm->vmcb->save.cr0 &= ~X86_CR0_TS;
svm->vcpu.fpu_active = 1;
+ update_cr0_intercept(svm);
+}
+static int nm_interception(struct vcpu_svm *svm)
+{
+ svm_fpu_activate(&svm->vcpu);
return 1;
}
@@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm)
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
- if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
+ if (!(svm->vcpu.arch.efer & EFER_SVME)
|| !is_paging(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.ds = vmcb->save.ds;
hsave->save.gdtr = vmcb->save.gdtr;
hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.shadow_efer;
- hsave->save.cr0 = svm->vcpu.arch.cr0;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
hsave->save.rflags = vmcb->save.rflags;
hsave->save.rip = svm->next_rip;
@@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm)
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data;
- if (svm_get_msr(&svm->vcpu, ecx, &data))
+ if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(&svm->vcpu, 0);
- else {
+ } else {
trace_kvm_msr_read(ecx, data);
svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm)
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- trace_kvm_msr_write(ecx, data);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data))
+ if (svm_set_msr(&svm->vcpu, ecx, data)) {
+ trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
- else
+ } else {
+ trace_kvm_msr_write(ecx, data);
skip_emulated_instruction(&svm->vcpu);
+ }
return 1;
}
@@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR3] = emulate_on_interception,
[SVM_EXIT_READ_CR4] = emulate_on_interception,
[SVM_EXIT_READ_CR8] = emulate_on_interception,
- /* for now: */
+ [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
[SVM_EXIT_WRITE_CR0] = emulate_on_interception,
[SVM_EXIT_WRITE_CR3] = emulate_on_interception,
[SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_DR1] = emulate_on_interception,
[SVM_EXIT_READ_DR2] = emulate_on_interception,
[SVM_EXIT_READ_DR3] = emulate_on_interception,
+ [SVM_EXIT_READ_DR4] = emulate_on_interception,
+ [SVM_EXIT_READ_DR5] = emulate_on_interception,
+ [SVM_EXIT_READ_DR6] = emulate_on_interception,
+ [SVM_EXIT_READ_DR7] = emulate_on_interception,
[SVM_EXIT_WRITE_DR0] = emulate_on_interception,
[SVM_EXIT_WRITE_DR1] = emulate_on_interception,
[SVM_EXIT_WRITE_DR2] = emulate_on_interception,
[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
- if (npt_enabled) {
- int mmu_reload = 0;
- if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
- svm_set_cr0(vcpu, svm->vmcb->save.cr0);
- mmu_reload = 1;
- }
+ if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- if (mmu_reload) {
- kvm_mmu_reset_context(vcpu);
- kvm_mmu_load(vcpu);
- }
- }
-
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
svm->vmcb->save.cr3 = root;
force_new_asid(vcpu);
-
- if (vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
- svm->vmcb->save.cr0 |= X86_CR0_TS;
- vcpu->fpu_active = 0;
- }
}
static int is_disabled(void)
@@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return 0;
}
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+{
+}
+
static const struct trace_print_flags svm_exit_reasons_str[] = {
{ SVM_EXIT_READ_CR0, "read_cr0" },
{ SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
{ -1, NULL }
};
-static bool svm_gb_page_enable(void)
+static int svm_get_lpage_level(void)
{
- return true;
+ return PT_PDPE_LEVEL;
+}
+
+static bool svm_rdtscp_supported(void)
+{
+ return false;
+}
+
+static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ update_cr0_intercept(svm);
+ svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
}
static struct kvm_x86_ops svm_x86_ops = {
@@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .fpu_activate = svm_fpu_activate,
+ .fpu_deactivate = svm_fpu_deactivate,
.tlb_flush = svm_flush_tlb,
@@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_mt_mask = svm_get_mt_mask,
.exit_reasons_str = svm_exit_reasons_str,
- .gb_page_enable = svm_gb_page_enable,
+ .get_lpage_level = svm_get_lpage_level,
+
+ .cpuid_update = svm_cpuid_update,
+
+ .rdtscp_supported = svm_rdtscp_supported,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 816e0449db0..6ad30a29f04 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
);
/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
+ __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, code )
+ __field( bool, fast )
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ ),
+
+ TP_fast_assign(
+ __entry->code = code;
+ __entry->fast = fast;
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ ),
+
+ TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
+ __entry->outgpa)
+);
+
+/*
* Tracepoint for PIO.
*/
TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
* Tracepoint for guest MSR access.
*/
TRACE_EVENT(kvm_msr,
- TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
- TP_ARGS(rw, ecx, data),
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
TP_STRUCT__entry(
- __field( unsigned int, rw )
- __field( unsigned int, ecx )
- __field( unsigned long, data )
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
),
TP_fast_assign(
- __entry->rw = rw;
+ __entry->write = write;
__entry->ecx = ecx;
__entry->data = data;
+ __entry->exception = exception;
),
- TP_printk("msr_%s %x = 0x%lx",
- __entry->rw ? "write" : "read",
- __entry->ecx, __entry->data)
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
);
-#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
-#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
/*
* Tracepoint for guest CR access.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4918d6fc92..14873b9f843 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,21 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK \
+ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE)
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_CR4_GUEST_OWNED_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT)
+
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
@@ -136,6 +151,8 @@ struct vcpu_vmx {
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
+
+ bool rdtscp_enabled;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
- MSR_EFER, MSR_K6_STAR,
+ MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
@@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
}
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+}
+
static inline int cpu_has_vmx_invept_individual_addr(void)
{
return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void)
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
- return flexpriority_enabled &&
- (cpu_has_vmx_virtualize_apic_accesses()) &&
- (irqchip_in_kernel(kvm));
+ return flexpriority_enabled && irqchip_in_kernel(kvm);
}
static inline int cpu_has_vmx_vpid(void)
@@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void)
SECONDARY_EXEC_ENABLE_VPID;
}
+static inline int cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDTSCP;
+}
+
static inline int cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
- if (!vcpu->fpu_active)
- eb |= 1u << NM_VECTOR;
- /*
- * Unconditionally intercept #DB so we can maintain dr6 without
- * reading it every exit.
- */
- eb |= 1u << DB_VECTOR;
- if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- eb |= 1u << BP_VECTOR;
- }
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ if (vcpu->fpu_active)
+ eb &= ~(1u << NM_VECTOR);
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer;
u64 ignore_bits;
- guest_efer = vmx->vcpu.arch.shadow_efer;
+ guest_efer = vmx->vcpu.arch.efer;
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
+ ulong cr0;
+
if (vcpu->fpu_active)
return;
vcpu->fpu_active = 1;
- vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
- if (vcpu->arch.cr0 & X86_CR0_TS)
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+ cr0 = vmcs_readl(GUEST_CR0);
+ cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
+ cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
+ vmcs_writel(GUEST_CR0, cr0);
update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
+
static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
- if (!vcpu->fpu_active)
- return;
- vcpu->fpu_active = 0;
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+ vmx_decache_cr0_guest_bits(vcpu);
+ vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = 0;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
}
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}
+static bool vmx_rdtscp_supported(void)
+{
+ return cpu_has_vmx_rdtscp();
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
index = __find_msr_index(vmx, MSR_CSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_TSC_AUX);
+ if (index >= 0 && vmx->rdtscp_enabled)
+ move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
index = __find_msr_index(vmx, MSR_K6_STAR);
- if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
+ if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
@@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_IA32_SYSENTER_ESP:
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
+ case MSR_TSC_AUX:
+ if (!to_vmx(vcpu)->rdtscp_enabled)
+ return 1;
+ /* Otherwise falls through */
default:
vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
@@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vcpu->arch.pat = data;
break;
}
- /* Otherwise falls through to kvm_set_msr_common */
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ break;
+ case MSR_TSC_AUX:
+ if (!vmx->rdtscp_enabled)
+ return 1;
+ /* Check reserved bit, higher 32 bits should be zero */
+ if ((data >> 32) != 0)
+ return 1;
+ /* Otherwise falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
@@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+ SECONDARY_EXEC_RDTSCP;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
static gva_t rmode_tss_base(struct kvm *kvm)
{
if (!kvm->arch.tss_addr) {
- gfn_t base_gfn = kvm->memslots[0].base_gfn +
- kvm->memslots[0].npages - 3;
+ struct kvm_memslots *slots;
+ gfn_t base_gfn;
+
+ slots = rcu_dereference(kvm->memslots);
+ base_gfn = kvm->memslots->memslots[0].base_gfn +
+ kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
return kvm->arch.tss_addr;
@@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
* of this msr depends on is_long_mode().
*/
vmx_load_host_state(to_vmx(vcpu));
- vcpu->arch.shadow_efer = efer;
- if (!msr)
- return;
+ vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
}
- vcpu->arch.shadow_efer |= EFER_LMA;
- vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
+ vcpu->arch.efer |= EFER_LMA;
+ vmx_set_efer(vcpu, vcpu->arch.efer);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vcpu->arch.shadow_efer &= ~EFER_LMA;
+ vcpu->arch.efer &= ~EFER_LMA;
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
- vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+ ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
}
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
}
if (!(cr0 & X86_CR0_WP))
*hw_cr0 &= ~X86_CR0_WP;
}
-static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
- struct kvm_vcpu *vcpu)
-{
- if (!is_paging(vcpu)) {
- *hw_cr4 &= ~X86_CR4_PAE;
- *hw_cr4 |= X86_CR4_PSE;
- } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
- *hw_cr4 &= ~X86_CR4_PAE;
-}
-
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
else
hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
- vmx_fpu_deactivate(vcpu);
-
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (!vcpu->fpu_active)
+ hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
+
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
-
- if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
- vmx_fpu_activate(vcpu);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, guest_cr3);
- if (vcpu->arch.cr0 & X86_CR0_PE)
- vmx_fpu_deactivate(vcpu);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
- if (enable_ept)
- ept_update_paging_mode_cr4(&hw_cr4, vcpu);
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
+ if (!is_protmode(vcpu))
return 0;
if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
/* real mode guest state checks */
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+ if (!is_protmode(vcpu)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.ept_identity_pagetable)
goto out;
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
for (i = 0; i < NR_VMX_MSR; ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
- u64 data;
int j = vmx->nmsrs;
if (rdmsr_safe(index, &data_low, &data_high) < 0)
continue;
if (wrmsr_safe(index, data_low, data_high) < 0)
continue;
- data = data_low | ((u64)data_high << 32);
vmx->guest_msrs[j].index = i;
vmx->guest_msrs[j].data = 0;
vmx->guest_msrs[j].mask = -1ull;
@@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
rdtscll(tsc_this);
@@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
- int ret;
+ int ret, idx;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+ vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
vmx_set_efer(&vmx->vcpu, 0);
vmx_fpu_activate(&vmx->vcpu);
@@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->emulation_required = 0;
out:
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
}
@@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
kvm_queue_exception(vcpu, vec);
return 1;
case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return 0;
/* fall through */
@@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
/* fall through */
case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
@@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu)
};
break;
case 2: /* clts */
- vmx_fpu_deactivate(vcpu);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
- vmx_fpu_activate(vcpu);
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
skip_emulated_instruction(vcpu);
+ vmx_fpu_activate(vcpu);
return 1;
case 1: /*mov from cr*/
switch (cr) {
@@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 3: /* lmsw */
- kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ kvm_lmsw(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
@@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 0;
}
+static int check_dr_alias(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return -1;
+ }
+ return 0;
+}
+
static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
unsigned long val;
int dr, reg;
+ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
if (!kvm_require_cpl(vcpu, 0))
return 1;
dr = vmcs_readl(GUEST_DR7);
@@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
case 0 ... 3:
val = vcpu->arch.db[dr];
break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
case 6:
val = vcpu->arch.dr6;
break;
- case 7:
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
val = vcpu->arch.dr7;
break;
- default:
- val = 0;
}
kvm_register_write(vcpu, reg, val);
} else {
@@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
case 6:
if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
break;
- case 7:
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
@@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
u64 data;
if (vmx_get_msr(vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- trace_kvm_msr_write(ecx, data);
-
if (vmx_set_msr(vcpu, ecx, data) != 0) {
+ trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
}
+ trace_kvm_msr_write(ecx, data);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
}
if (err != EMULATE_DONE) {
- kvm_report_emulation_failure(vcpu, "emulation failure");
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
*/
vmcs_writel(HOST_CR0, read_cr0());
- if (vcpu->arch.switch_db_regs)
- set_debugreg(vcpu->arch.dr6, 6);
-
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
| (1 << VCPU_EXREG_PDPTR));
vcpu->arch.regs_dirty = 0;
- if (vcpu->arch.switch_db_regs)
- get_debugreg(vcpu->arch.dr6, 6);
-
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
@@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* b. VT-d with snooping control feature: snooping control feature of
* VT-d engine can guarantee the cache correctness. Just set it
* to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
+ * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
* consistent with host MTRR
*/
if (is_mmio)
@@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
VMX_EPT_MT_EPTE_SHIFT;
else
ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
- | VMX_EPT_IGMT_BIT;
+ | VMX_EPT_IPAT_BIT;
return ret;
}
+#define _ER(x) { EXIT_REASON_##x, #x }
+
static const struct trace_print_flags vmx_exit_reasons_str[] = {
- { EXIT_REASON_EXCEPTION_NMI, "exception" },
- { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
- { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
- { EXIT_REASON_NMI_WINDOW, "nmi_window" },
- { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
- { EXIT_REASON_CR_ACCESS, "cr_access" },
- { EXIT_REASON_DR_ACCESS, "dr_access" },
- { EXIT_REASON_CPUID, "cpuid" },
- { EXIT_REASON_MSR_READ, "rdmsr" },
- { EXIT_REASON_MSR_WRITE, "wrmsr" },
- { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
- { EXIT_REASON_HLT, "halt" },
- { EXIT_REASON_INVLPG, "invlpg" },
- { EXIT_REASON_VMCALL, "hypercall" },
- { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
- { EXIT_REASON_APIC_ACCESS, "apic_access" },
- { EXIT_REASON_WBINVD, "wbinvd" },
- { EXIT_REASON_TASK_SWITCH, "task_switch" },
- { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
+ _ER(EXCEPTION_NMI),
+ _ER(EXTERNAL_INTERRUPT),
+ _ER(TRIPLE_FAULT),
+ _ER(PENDING_INTERRUPT),
+ _ER(NMI_WINDOW),
+ _ER(TASK_SWITCH),
+ _ER(CPUID),
+ _ER(HLT),
+ _ER(INVLPG),
+ _ER(RDPMC),
+ _ER(RDTSC),
+ _ER(VMCALL),
+ _ER(VMCLEAR),
+ _ER(VMLAUNCH),
+ _ER(VMPTRLD),
+ _ER(VMPTRST),
+ _ER(VMREAD),
+ _ER(VMRESUME),
+ _ER(VMWRITE),
+ _ER(VMOFF),
+ _ER(VMON),
+ _ER(CR_ACCESS),
+ _ER(DR_ACCESS),
+ _ER(IO_INSTRUCTION),
+ _ER(MSR_READ),
+ _ER(MSR_WRITE),
+ _ER(MWAIT_INSTRUCTION),
+ _ER(MONITOR_INSTRUCTION),
+ _ER(PAUSE_INSTRUCTION),
+ _ER(MCE_DURING_VMENTRY),
+ _ER(TPR_BELOW_THRESHOLD),
+ _ER(APIC_ACCESS),
+ _ER(EPT_VIOLATION),
+ _ER(EPT_MISCONFIG),
+ _ER(WBINVD),
{ -1, NULL }
};
-static bool vmx_gb_page_enable(void)
+#undef _ER
+
+static int vmx_get_lpage_level(void)
+{
+ if (enable_ept && !cpu_has_vmx_ept_1g_page())
+ return PT_DIRECTORY_LEVEL;
+ else
+ /* For shadow and EPT supported 1GB page */
+ return PT_PDPE_LEVEL;
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
- return false;
+ struct kvm_cpuid_entry2 *best;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control;
+
+ vmx->rdtscp_enabled = false;
+ if (vmx_rdtscp_supported()) {
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ if (exec_control & SECONDARY_EXEC_RDTSCP) {
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
+ vmx->rdtscp_enabled = true;
+ else {
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ }
+ }
+ }
}
static struct kvm_x86_ops vmx_x86_ops = {
@@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_segment = vmx_set_segment,
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .fpu_activate = vmx_fpu_activate,
+ .fpu_deactivate = vmx_fpu_deactivate,
.tlb_flush = vmx_flush_tlb,
@@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_mt_mask = vmx_get_mt_mask,
.exit_reasons_str = vmx_exit_reasons_str,
- .gb_page_enable = vmx_gb_page_enable,
+ .get_lpage_level = vmx_get_lpage_level,
+
+ .cpuid_update = vmx_cpuid_update,
+
+ .rdtscp_supported = vmx_rdtscp_supported,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1e1bc9d412..e46282a5656 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -38,6 +38,7 @@
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
@@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
struct kvm_shared_msrs_global {
int nr;
- struct kvm_shared_msr {
- u32 msr;
- u64 value;
- } msrs[KVM_NR_SHARED_MSRS];
+ u32 msrs[KVM_NR_SHARED_MSRS];
};
struct kvm_shared_msrs {
struct user_return_notifier urn;
bool registered;
- u64 current_value[KVM_NR_SHARED_MSRS];
+ struct kvm_shared_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_NR_SHARED_MSRS];
};
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
@@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
- struct kvm_shared_msr *global;
struct kvm_shared_msrs *locals
= container_of(urn, struct kvm_shared_msrs, urn);
+ struct kvm_shared_msr_values *values;
for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
- global = &shared_msrs_global.msrs[slot];
- if (global->value != locals->current_value[slot]) {
- wrmsrl(global->msr, global->value);
- locals->current_value[slot] = global->value;
+ values = &locals->values[slot];
+ if (values->host != values->curr) {
+ wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ values->curr = values->host;
}
}
locals->registered = false;
user_return_notifier_unregister(urn);
}
-void kvm_define_shared_msr(unsigned slot, u32 msr)
+static void shared_msr_update(unsigned slot, u32 msr)
{
- int cpu;
+ struct kvm_shared_msrs *smsr;
u64 value;
+ smsr = &__get_cpu_var(shared_msrs);
+ /* only read, and nobody should modify it at this time,
+ * so don't need lock */
+ if (slot >= shared_msrs_global.nr) {
+ printk(KERN_ERR "kvm: invalid MSR slot!");
+ return;
+ }
+ rdmsrl_safe(msr, &value);
+ smsr->values[slot].host = value;
+ smsr->values[slot].curr = value;
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
- shared_msrs_global.msrs[slot].msr = msr;
- rdmsrl_safe(msr, &value);
- shared_msrs_global.msrs[slot].value = value;
- for_each_online_cpu(cpu)
- per_cpu(shared_msrs, cpu).current_value[slot] = value;
+ shared_msrs_global.msrs[slot] = msr;
+ /* we need ensured the shared_msr_global have been updated */
+ smp_wmb();
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
static void kvm_shared_msr_cpu_online(void)
{
unsigned i;
- struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
for (i = 0; i < shared_msrs_global.nr; ++i)
- locals->current_value[i] = shared_msrs_global.msrs[i].value;
+ shared_msr_update(i, shared_msrs_global.msrs[i]);
}
void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
- if (((value ^ smsr->current_value[slot]) & mask) == 0)
+ if (((value ^ smsr->values[slot].curr) & mask) == 0)
return;
- smsr->current_value[slot] = value;
- wrmsrl(shared_msrs_global.msrs[slot].msr, value);
+ smsr->values[slot].curr = value;
+ wrmsrl(shared_msrs_global.msrs[slot], value);
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ if (!vcpu->arch.exception.pending) {
+ queue:
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+ return;
+ }
+
+ /* to check exception */
+ prev_nr = vcpu->arch.exception.nr;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+ || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /* generate double fault per SDM Table 5-5 */
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ goto queue;
+}
+
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = false;
- vcpu->arch.exception.nr = nr;
+ kvm_multiple_exception(vcpu, nr, false, 0);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
@@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
-
- if (vcpu->arch.exception.pending) {
- switch(vcpu->arch.exception.nr) {
- case DF_VECTOR:
- /* triple fault -> shutdown */
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- return;
- case PF_VECTOR:
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- return;
- default:
- /* replace previous exception with a new one in a hope
- that instruction re-execution will regenerate lost
- exception */
- vcpu->arch.exception.pending = false;
- break;
- }
- }
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
@@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = true;
- vcpu->arch.exception.nr = nr;
- vcpu->arch.exception.error_code = error_code;
+ kvm_multiple_exception(vcpu, nr, true, error_code);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
@@ -383,12 +428,18 @@ out:
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- if (cr0 & CR0_RESERVED_BITS) {
+ cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
- cr0, vcpu->arch.cr0);
+ cr0, kvm_read_cr0(vcpu));
kvm_inject_gp(vcpu, 0);
return;
}
+#endif
+
+ cr0 &= ~CR0_RESERVED_BITS;
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
@@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
- if ((vcpu->arch.shadow_efer & EFER_LME)) {
+ if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu)) {
@@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long old_cr4 = vcpu->arch.cr4;
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
if (cr4 & CR4_RESERVED_BITS) {
@@ -575,9 +626,11 @@ static inline u32 bit(int bitno)
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 2
+#define KVM_SAVE_MSRS_BEGIN 5
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
#ifdef CONFIG_X86_64
@@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
}
if (is_paging(vcpu)
- && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
kvm_inject_gp(vcpu, 0);
return;
@@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
efer &= ~EFER_LMA;
- efer |= vcpu->arch.shadow_efer & EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
- vcpu->arch.shadow_efer = efer;
+ vcpu->arch.efer = efer;
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
@@ -957,6 +1010,100 @@ out:
return r;
}
+static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ kvm->arch.hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!kvm->arch.hv_guest_os_id)
+ kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u64 gfn;
+ unsigned long addr;
+ u8 instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!kvm->arch.hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (copy_to_user((void __user *)addr, instructions, 4))
+ return 1;
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+ case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ addr = gfn_to_hva(vcpu->kvm, data >>
+ HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (clear_user((void __user *)addr, PAGE_SIZE))
+ return 1;
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1071,6 +1218,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = set_msr_hyperv_pw(vcpu, msr, data);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return set_msr_hyperv(vcpu, msr, data);
+ break;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
@@ -1170,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
+static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = kvm->arch.hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = kvm->arch.hv_hypercall;
+ break;
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ int r;
+ struct kvm_vcpu *v;
+ kvm_for_each_vcpu(r, v, vcpu->kvm)
+ if (v == vcpu)
+ data = r;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
@@ -1221,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
- data = vcpu->arch.shadow_efer;
+ data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
data = vcpu->kvm->arch.wall_clock;
@@ -1236,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
return get_msr_mce(vcpu, msr, pdata);
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return get_msr_hyperv(vcpu, msr, pdata);
+ break;
default:
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1261,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
- int i;
+ int i, idx;
vcpu_load(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
vcpu_put(vcpu);
@@ -1351,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XEN_HVM:
case KVM_CAP_ADJUST_CLOCK:
case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1464,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
+ kvm_x86_ops->vcpu_put(vcpu);
}
static int is_efer_nx(void)
@@ -1530,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
cpuid_fix_nx_cap(vcpu);
r = 0;
kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1552,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
return 0;
out:
@@ -1594,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
- unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
#ifdef CONFIG_X86_64
+ unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+ ? F(GBPAGES) : 0;
unsigned f_lm = F(LM);
#else
+ unsigned f_gbpages = 0;
unsigned f_lm = 0;
#endif
+ unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -1619,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
@@ -1866,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
- !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
@@ -2160,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
spin_unlock(&kvm->mmu_lock);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -2176,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
return kvm->arch.n_alloc_mmu_pages;
}
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (alias->flags & KVM_ALIAS_INVALID)
+ continue;
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
- for (i = 0; i < kvm->arch.naliases; ++i) {
- alias = &kvm->arch.aliases[i];
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
if (gfn >= alias->base_gfn
&& gfn < alias->base_gfn + alias->npages)
return alias->target_gfn + gfn - alias->base_gfn;
@@ -2200,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
{
int r, n;
struct kvm_mem_alias *p;
+ struct kvm_mem_aliases *aliases, *old_aliases;
r = -EINVAL;
/* General sanity checks */
@@ -2216,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- down_write(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out;
+
+ mutex_lock(&kvm->slots_lock);
- p = &kvm->arch.aliases[alias->slot];
+ /* invalidate any gfn reference in case of deletion/shrinking */
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+ aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kvm_mmu_zap_all(kvm);
+ kfree(old_aliases);
+
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out_unlock;
+
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+
+ p = &aliases->aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
p->npages = alias->memory_size >> PAGE_SHIFT;
p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+ p->flags &= ~(KVM_ALIAS_INVALID);
for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (kvm->arch.aliases[n - 1].npages)
+ if (aliases->aliases[n - 1].npages)
break;
- kvm->arch.naliases = n;
+ aliases->naliases = n;
- spin_unlock(&kvm->mmu_lock);
- kvm_mmu_zap_all(kvm);
-
- up_write(&kvm->slots_lock);
-
- return 0;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kfree(old_aliases);
+ r = 0;
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
out:
return r;
}
@@ -2273,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2364,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log)
{
- int r;
- int n;
+ int r, n, i;
struct kvm_memory_slot *memslot;
- int is_dirty = 0;
+ unsigned long is_dirty = 0;
+ unsigned long *dirty_bitmap = NULL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
- r = kvm_get_dirty_log(kvm, log, &is_dirty);
- if (r)
+ r = -EINVAL;
+ if (log->slot >= KVM_MEMORY_SLOTS)
+ goto out;
+
+ memslot = &kvm->memslots->memslots[log->slot];
+ r = -ENOENT;
+ if (!memslot->dirty_bitmap)
+ goto out;
+
+ n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
goto out;
+ memset(dirty_bitmap, 0, n);
+
+ for (i = 0; !is_dirty && i < n/sizeof(long); i++)
+ is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
+ struct kvm_memslots *slots, *old_slots;
+
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- memslot = &kvm->memslots[log->slot];
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
- memset(memslot->dirty_bitmap, 0, n);
+
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+
+ old_slots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ synchronize_srcu_expedited(&kvm->srcu);
+ dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
+ kfree(old_slots);
}
+
r = 0;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
+ r = -EFAULT;
+out_free:
+ vfree(dirty_bitmap);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2469,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &vpic->dev);
kfree(vpic);
goto create_irqchip_unlock;
}
@@ -2480,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_setup_default_irq_routing(kvm);
if (r) {
mutex_lock(&kvm->irq_lock);
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
- kvm->arch.vpic = NULL;
- kvm->arch.vioapic = NULL;
+ kvm_ioapic_destroy(kvm);
+ kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
}
create_irqchip_unlock:
@@ -2499,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
@@ -2508,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
break;
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
@@ -2725,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
!kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
return 0;
- return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
+ return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2734,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
!kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
return 0;
- return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
+ return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_FETCH_MASK;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+/* uses this to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -2767,14 +3097,37 @@ out:
return r;
}
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
+ access | PFERR_FETCH_MASK, error);
+}
+
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ error);
+}
+
+static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
+
static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ struct kvm_vcpu *vcpu, u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -2804,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -2813,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
== X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
mmio:
/*
@@ -2862,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, 2);
+ kvm_inject_page_fault(vcpu, addr, error_code);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -2930,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
char *kaddr;
u64 val;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
if (gpa == UNMAPPED_GVA ||
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2967,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
int emulate_clts(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ kvm_x86_ops->fpu_activate(vcpu);
return X86EMUL_CONTINUE;
}
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
-
- switch (dr) {
- case 0 ... 3:
- *dest = kvm_x86_ops->get_dr(vcpu, dr);
- return X86EMUL_CONTINUE;
- default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
- return X86EMUL_UNHANDLEABLE;
- }
+ return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- int exception;
- kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
- if (exception) {
- /* FIXME: better handling */
- return X86EMUL_UNHANDLEABLE;
- }
- return X86EMUL_CONTINUE;
+ return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
}
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3009,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
- kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
+ kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -3017,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static struct x86_emulate_ops emulate_ops = {
- .read_std = kvm_read_guest_virt,
+ .read_std = kvm_read_guest_virt_system,
+ .fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -3060,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
vcpu->arch.emulate_ctxt.vcpu = vcpu;
vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_REAL : cs_l
+ ? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
@@ -3153,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
gva_t q = vcpu->arch.pio.guest_gva;
unsigned bytes;
int ret;
+ u32 error_code;
bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
if (vcpu->arch.pio.in)
- ret = kvm_write_guest_virt(q, p, bytes, vcpu);
+ ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
else
- ret = kvm_read_guest_virt(q, p, bytes, vcpu);
+ ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
+
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, q, error_code);
+
return ret;
}
@@ -3179,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
if (io->in) {
r = pio_copy_data(vcpu);
if (r)
- return r;
+ goto out;
}
delta = 1;
@@ -3206,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, VCPU_REGS_RSI, val);
}
}
-
+out:
io->count -= io->cur_count;
io->cur_count = 0;
@@ -3219,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
int r;
if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
vcpu->arch.pio.size, pd);
else
- r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
return r;
}
@@ -3234,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
int i, r = 0;
for (i = 0; i < io->cur_count; i++) {
- if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+ if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
io->port, io->size, pd)) {
r = -EOPNOTSUPP;
break;
@@ -3248,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
unsigned long val;
+ trace_kvm_pio(!in, port, size, 1);
+
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3259,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
- trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
- size, 1);
-
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(vcpu->arch.pio_data, &val, 4);
+ if (!vcpu->arch.pio.in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(vcpu->arch.pio_data, &val, 4);
+ }
if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
complete_pio(vcpu);
@@ -3280,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
unsigned now, in_page;
int ret = 0;
+ trace_kvm_pio(!in, port, size, count);
+
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3291,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
vcpu->arch.pio.down = down;
vcpu->arch.pio.rep = rep;
- trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
- size, count);
-
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
return 1;
@@ -3325,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- kvm_inject_gp(vcpu, 0);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
return 1;
- }
if (ret == 0 && !pio_string_write(vcpu)) {
complete_pio(vcpu);
if (vcpu->arch.pio.count == 0)
@@ -3487,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
return a0 | ((gpa_t)a1 << 32);
}
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ u64 param, ingpa, outgpa, ret;
+ uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ bool fast, longmode;
+ int cs_db, cs_l;
+
+ /*
+ * hypercall generates UD from non zero cpl and real mode
+ * per HYPER-V spec
+ */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ longmode = is_long_mode(vcpu) && cs_l == 1;
+
+ if (!longmode) {
+ param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = (param >> 16) & 0x1;
+ rep_cnt = (param >> 32) & 0xfff;
+ rep_idx = (param >> 48) & 0xfff;
+
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+ switch (code) {
+ case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+ kvm_vcpu_on_spin(vcpu);
+ break;
+ default:
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ ret = res | (((u64)rep_done & 0xfff) << 32);
+ if (longmode) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ } else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
+ }
+
+ return 1;
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
int r = 1;
+ if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ return kvm_hv_hypercall(vcpu);
+
nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -3534,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
char instruction[3];
- int ret = 0;
unsigned long rip = kvm_rip_read(vcpu);
-
/*
* Blow out the MMU to ensure that no other VCPU has an active mapping
* to ensure that the updated hypercall appears atomically across all
@@ -3546,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_mmu_zap_all(vcpu->kvm);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- if (emulator_write_emulated(rip, instruction, 3, vcpu)
- != X86EMUL_CONTINUE)
- ret = -EFAULT;
- return ret;
+ return emulator_write_emulated(rip, instruction, 3, vcpu);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3583,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
{
unsigned long value;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
switch (cr) {
case 0:
- value = vcpu->arch.cr0;
+ value = kvm_read_cr0(vcpu);
break;
case 2:
value = vcpu->arch.cr2;
@@ -3595,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
value = vcpu->arch.cr3;
break;
case 4:
- value = vcpu->arch.cr4;
+ value = kvm_read_cr4(vcpu);
break;
case 8:
value = kvm_get_cr8(vcpu);
@@ -3613,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
{
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
*rflags = kvm_get_rflags(vcpu);
break;
case 2:
@@ -3623,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
kvm_set_cr3(vcpu, val);
break;
case 4:
- kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3690,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
}
return best;
}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
@@ -3773,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
static void vapic_exit(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ int idx;
if (!apic || !apic->vapic_addr)
return;
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_release_page_dirty(apic->vapic_page);
mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -3876,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
+ if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+ vcpu->fpu_active = 0;
+ kvm_x86_ops->fpu_deactivate(vcpu);
+ }
}
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
- kvm_load_guest_fpu(vcpu);
+ if (vcpu->fpu_active)
+ kvm_load_guest_fpu(vcpu);
local_irq_disable();
@@ -3909,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_lapic_sync_to_vapic(vcpu);
}
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_guest_enter();
@@ -3951,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_enable();
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
/*
* Profile KVM exit RIPs:
@@ -3973,6 +4389,7 @@ out:
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
+ struct kvm *kvm = vcpu->kvm;
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3984,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vapic_enter(vcpu);
r = 1;
@@ -3992,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
r = vcpu_enter_guest(vcpu);
else {
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
{
switch(vcpu->arch.mp_state) {
@@ -4029,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
++vcpu->stat.signal_exits;
}
if (need_resched()) {
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
post_kvm_run_save(vcpu);
vapic_exit(vcpu);
@@ -4074,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->mmio_read_completed = 1;
vcpu->mmio_needed = 0;
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
EMULTYPE_NO_DECODE);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r == EMULATE_DO_MMIO) {
/*
* Read-modify-write. Back to userspace.
@@ -4204,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->gdt.limit = dt.limit;
sregs->gdt.base = dt.base;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- sregs->cr0 = vcpu->arch.cr0;
+ sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = vcpu->arch.cr3;
- sregs->cr4 = vcpu->arch.cr4;
+ sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
- sregs->efer = vcpu->arch.shadow_efer;
+ sregs->efer = vcpu->arch.efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4298,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
{
struct descriptor_table dtable;
u16 index = selector >> 3;
+ int ret;
+ u32 err;
+ gva_t addr;
get_segment_descriptor_dtable(vcpu, selector, &dtable);
if (dtable.limit < index * 8 + 7) {
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
- return 1;
+ return X86EMUL_PROPAGATE_FAULT;
}
- return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+ addr = dtable.base + index * 8;
+ ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
+ vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, addr, err);
+
+ return ret;
}
/* allowed just for 8 bytes segments */
@@ -4319,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
if (dtable.limit < index * 8 + 7)
return 1;
- return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+}
+
+static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc)
+{
+ u32 base_addr = get_desc_base(seg_desc);
+
+ return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
}
-static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
+static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
struct desc_struct *seg_desc)
{
u32 base_addr = get_desc_base(seg_desc);
- return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
+ return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
}
static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4338,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
return kvm_seg.selector;
}
-static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
- u16 selector,
- struct kvm_segment *kvm_seg)
-{
- struct desc_struct seg_desc;
-
- if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
- return 1;
- seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
- return 0;
-}
-
static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
{
struct kvm_segment segvar = {
@@ -4367,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
.unusable = 0,
};
kvm_x86_ops->set_segment(vcpu, &segvar, seg);
- return 0;
+ return X86EMUL_CONTINUE;
}
static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
@@ -4377,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
(kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
}
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg)
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
{
struct kvm_segment kvm_seg;
+ struct desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
- if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
+ if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
return kvm_load_realmode_segment(vcpu, selector, seg);
- if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
- return 1;
- kvm_seg.type |= type_bits;
- if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
- seg != VCPU_SREG_LDTR)
- if (!kvm_seg.s)
- kvm_seg.unusable = 1;
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ if (ret)
+ return ret;
+
+ seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
+
+ if (null_selector) { /* for NULL selector skip all following checks */
+ kvm_seg.unusable = 1;
+ goto load;
+ }
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+ /* can't load system descriptor into segment selecor */
+ if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+ goto exception;
+
+ if (!kvm_seg.present) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = kvm_seg.dpl;
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(kvm_seg.type & 8))
+ goto exception;
+
+ if (kvm_seg.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (kvm_seg.s || kvm_seg.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((kvm_seg.type & 0xa) == 0x8 ||
+ (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!kvm_seg.unusable && kvm_seg.s) {
+ /* mark segment as accessed */
+ kvm_seg.type |= 1;
+ seg_desc.type |= 1;
+ save_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ }
+load:
kvm_set_segment(vcpu, &kvm_seg, seg);
- return 0;
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
}
static void save_state_to_tss32(struct kvm_vcpu *vcpu,
@@ -4420,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
}
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
+{
+ struct kvm_segment kvm_seg;
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
static int load_state_from_tss32(struct kvm_vcpu *vcpu,
struct tss_segment_32 *tss)
{
@@ -4437,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
- if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+ kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+ kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
+ if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
+ if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
return 1;
return 0;
}
@@ -4495,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
- if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
return 1;
return 0;
}
@@ -4529,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
sizeof tss_segment_16))
goto out;
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
&tss_segment_16, sizeof tss_segment_16))
goto out;
@@ -4537,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
tss_segment_16.prev_task_link = old_tss_sel;
if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
+ get_tss_base_addr_write(vcpu, nseg_desc),
&tss_segment_16.prev_task_link,
sizeof tss_segment_16.prev_task_link))
goto out;
@@ -4568,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
sizeof tss_segment_32))
goto out;
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
&tss_segment_32, sizeof tss_segment_32))
goto out;
@@ -4576,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
tss_segment_32.prev_task_link = old_tss_sel;
if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
+ get_tss_base_addr_write(vcpu, nseg_desc),
&tss_segment_32.prev_task_link,
sizeof tss_segment_32.prev_task_link))
goto out;
@@ -4599,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
- old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
+ old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
/* FIXME: Handle errors. Failure to read either TSS or their
* descriptors should generate a pagefault.
@@ -4658,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
&nseg_desc);
}
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
tr_seg.type = 11;
kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4689,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_cr8(vcpu, sregs->cr8);
- mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
+ mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
kvm_x86_ops->set_efer(vcpu, sregs->efer);
kvm_set_apic_base(vcpu, sregs->apic_base);
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-
- mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+ mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
- mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
+ mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.cr3);
@@ -4734,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Older userspace won't unhalt the vcpu on reset. */
if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
- !(vcpu->arch.cr0 & X86_CR0_PE))
+ !is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
vcpu_put(vcpu);
@@ -4832,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
{
unsigned long vaddr = tr->linear_address;
gpa_t gpa;
+ int idx;
vcpu_load(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
- up_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
@@ -4917,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+ if (vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 1;
kvm_fx_save(&vcpu->arch.host_fx_image);
kvm_fx_restore(&vcpu->arch.guest_fx_image);
+ trace_kvm_fpu(1);
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
@@ -4935,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
kvm_fx_save(&vcpu->arch.guest_fx_image);
kvm_fx_restore(&vcpu->arch.host_fx_image);
++vcpu->stat.fpu_reload;
+ set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+ trace_kvm_fpu(0);
}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
@@ -5088,11 +5635,13 @@ fail:
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
+ int idx;
+
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
}
@@ -5103,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void)
if (!kvm)
return ERR_PTR(-ENOMEM);
+ kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!kvm->arch.aliases) {
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5159,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.aliases);
kfree(kvm);
}
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- int npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+ int npages = memslot->npages;
/*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case.
@@ -5188,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (IS_ERR((void *)userspace_addr))
return PTR_ERR((void *)userspace_addr);
- /* set userspace_addr atomically for kvm_hva_to_rmapp */
- spin_lock(&kvm->mmu_lock);
memslot->userspace_addr = userspace_addr;
- spin_unlock(&kvm->mmu_lock);
- } else {
- if (!old.user_alloc && old.rmap) {
- int ret;
-
- down_write(&current->mm->mmap_sem);
- ret = do_munmap(current->mm, old.userspace_addr,
- old.npages * PAGE_SIZE);
- up_write(&current->mm->mmap_sem);
- if (ret < 0)
- printk(KERN_WARNING
- "kvm_vm_ioctl_set_memory_region: "
- "failed to munmap memory\n");
- }
}
}
+
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+
+ int npages = mem->memory_size >> PAGE_SHIFT;
+
+ if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_munmap(current->mm, old.userspace_addr,
+ old.npages * PAGE_SIZE);
+ up_write(&current->mm->mmap_sem);
+ if (ret < 0)
+ printk(KERN_WARNING
+ "kvm_vm_ioctl_set_memory_region: "
+ "failed to munmap memory\n");
+ }
+
spin_lock(&kvm->mmu_lock);
if (!kvm->arch.n_requested_mmu_pages) {
unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -5216,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
-
- return 0;
}
void kvm_arch_flush_shadow(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2..2d101639bd8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.efer & EFER_LMA;
+#else
+ return 0;
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+}
+
#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2226f2c70ea..5cb3f0f54f4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -750,6 +750,7 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+#ifndef CONFIG_NO_BOOTMEM
static unsigned long __init setup_node_bootmem(int nodeid,
unsigned long start_pfn,
unsigned long end_pfn,
@@ -766,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid,
printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
nodeid, bootmap, bootmap + bootmap_size);
free_bootmem_with_active_regions(nodeid, end_pfn);
- early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
return bootmap + bootmap_size;
}
+#endif
void __init setup_bootmem_allocator(void)
{
+#ifndef CONFIG_NO_BOOTMEM
int nodeid;
unsigned long bootmap_size, bootmap;
/*
@@ -784,11 +786,13 @@ void __init setup_bootmem_allocator(void)
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+#endif
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+#ifndef CONFIG_NO_BOOTMEM
for_each_online_node(nodeid) {
unsigned long start_pfn, end_pfn;
@@ -806,6 +810,7 @@ void __init setup_bootmem_allocator(void)
bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
bootmap);
}
+#endif
after_bootmem = 1;
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 69ddfbd9113..e9b040e1cde 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -572,6 +572,7 @@ kernel_physical_mapping_init(unsigned long start,
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8)
{
+#ifndef CONFIG_NO_BOOTMEM
unsigned long bootmap_size, bootmap;
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -579,13 +580,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
/* don't touch min_low_pfn */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
0, end_pfn);
e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+#else
+ e820_register_active_regions(0, start_pfn, end_pfn);
+#endif
}
#endif
@@ -974,7 +977,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
if (pmd_none(*pmd)) {
pte_t entry;
- p = vmemmap_alloc_block(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
if (!p)
return -ENOMEM;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index b20760ca724..809baaaf48b 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
for_each_online_node(nid) {
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ NODE_DATA(nid)->node_id = nid;
+#ifndef CONFIG_NO_BOOTMEM
NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+#endif
}
setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3307ea8bd43..8948f47fde0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
unsigned long end, unsigned long size,
unsigned long align)
{
- unsigned long mem = find_e820_area(start, end, size, align);
- void *ptr;
+ unsigned long mem;
+ /*
+ * put it on high as possible
+ * something will go with NODE_DATA
+ */
+ if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
+ if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
+ end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
+ if (mem != -1L)
+ return __va(mem);
+
+ /* extend the search scope */
+ end = max_pfn_mapped << PAGE_SHIFT;
+ if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ else
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
- ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
- if (ptr == NULL) {
- printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+ printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
size, nodeid);
- return NULL;
- }
- return ptr;
+
+ return NULL;
}
/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
- unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+ unsigned long start_pfn, last_pfn, nodedata_phys;
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- unsigned long bootmap_start, nodedata_phys;
- void *bootmap;
int nid;
+#ifndef CONFIG_NO_BOOTMEM
+ unsigned long bootmap_start, bootmap_pages, bootmap_size;
+ void *bootmap;
+#endif
if (!end)
return;
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
start = roundup(start, ZONE_ALIGN);
- printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+ printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
+ reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
+ nid = phys_to_nid(nodedata_phys);
+ if (nid != nodeid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
- NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+ NODE_DATA(nodeid)->node_id = nodeid;
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
+#ifndef CONFIG_NO_BOOTMEM
+ NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+
/*
* Find a place for the bootmem map
* nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
* of alloc_bootmem, that could clash with reserved range
*/
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
- nid = phys_to_nid(nodedata_phys);
- if (nid == nodeid)
- bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
- else
- bootmap_start = roundup(start, PAGE_SIZE);
+ bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
/*
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
@@ -239,18 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
- if (nodedata_phys < start || nodedata_phys >= end) {
- /*
- * only need to free it if it is from other node
- * bootmem
- */
- if (nid != nodeid)
- free_bootmem(nodedata_phys, pgdat_size);
- }
+ free_early(nodedata_phys, nodedata_phys + pgdat_size);
node_data[nodeid] = NULL;
return;
}
bootmap_start = __pa(bootmap);
+ reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
+ "BOOTMAP");
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
@@ -259,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
bootmap_start, bootmap_start + bootmap_size - 1,
bootmap_pages);
-
- free_bootmem_with_active_regions(nodeid, end);
-
- /*
- * convert early reserve to bootmem reserve earlier
- * otherwise early_node_mem could use early reserved mem
- * on previous node
- */
- early_res_to_bootmem(start, end);
-
- /*
- * in some case early_node_mem could use alloc_bootmem
- * to get range on other node, don't reserve that again
- */
- if (nid != nodeid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
- pgdat_size, BOOTMEM_DEFAULT);
nid = phys_to_nid(bootmap_start);
if (nid != nodeid)
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
- bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
+ free_bootmem_with_active_regions(nodeid, end);
+#endif
node_set_online(nodeid);
}
@@ -709,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
+#ifdef CONFIG_NO_BOOTMEM
+ pages += free_all_memory_core_early(MAX_NUMNODES);
+#endif
+
return pages;
}
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 39fba37f702..0b7d3e9593e 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -14,8 +14,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-y += common.o early.o
-obj-y += amd_bus.o
-obj-$(CONFIG_X86_64) += bus_numa.o
+obj-y += amd_bus.o bus_numa.o
ifeq ($(CONFIG_PCI_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 95ecbd49595..fc1e8fe07e5 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,11 +2,11 @@
#include <linux/pci.h>
#include <linux/topology.h>
#include <linux/cpu.h>
+#include <linux/range.h>
+
#include <asm/pci_x86.h>
-#ifdef CONFIG_X86_64
#include <asm/pci-direct.h>
-#endif
#include "bus_numa.h"
@@ -15,60 +15,6 @@
* also get peer root bus resource for io,mmio
*/
-#ifdef CONFIG_X86_64
-
-#define RANGE_NUM 16
-
-struct res_range {
- size_t start;
- size_t end;
-};
-
-static void __init update_range(struct res_range *range, size_t start,
- size_t end)
-{
- int i;
- int j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* find the new spare */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
struct pci_hostbridge_probe {
u32 bus;
u32 slot;
@@ -111,6 +57,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void)
fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
}
+#define RANGE_NUM 16
+
/**
* early_fill_mp_bus_to_node()
* called before pcibios_scan_root and pci_scan_bus
@@ -130,16 +78,17 @@ static int __init early_fill_mp_bus_info(void)
struct pci_root_info *info;
u32 reg;
struct resource *res;
- size_t start;
- size_t end;
- struct res_range range[RANGE_NUM];
+ u64 start;
+ u64 end;
+ struct range range[RANGE_NUM];
u64 val;
u32 address;
+ bool found;
if (!early_pci_allowed())
return -1;
- found_all_numa_early = 0;
+ found = false;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
u32 id;
u16 device;
@@ -153,12 +102,12 @@ static int __init early_fill_mp_bus_info(void)
device = (id>>16) & 0xffff;
if (pci_probes[i].vendor == vendor &&
pci_probes[i].device == device) {
- found_all_numa_early = 1;
+ found = true;
break;
}
}
- if (!found_all_numa_early)
+ if (!found)
return 0;
pci_root_num = 0;
@@ -196,7 +145,7 @@ static int __init early_fill_mp_bus_info(void)
def_link = (reg >> 8) & 0x03;
memset(range, 0, sizeof(range));
- range[0].end = 0xffff;
+ add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
/* io port resource */
for (i = 0; i < 4; i++) {
reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
@@ -220,13 +169,13 @@ static int __init early_fill_mp_bus_info(void)
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/* kernel only handle 16 bit only */
if (end > 0xffff)
end = 0xffff;
update_res(info, start, end, IORESOURCE_IO, 1);
- update_range(range, start, end);
+ subtract_range(range, RANGE_NUM, start, end + 1);
}
/* add left over io port range to def node/link, [0, 0xffff] */
/* find the position */
@@ -241,29 +190,32 @@ static int __init early_fill_mp_bus_info(void)
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, range[i].start, range[i].end - 1,
IORESOURCE_IO, 1);
}
}
memset(range, 0, sizeof(range));
/* 0xfd00000000-0xffffffffff for HT */
- range[0].end = (0xfdULL<<32) - 1;
+ end = cap_resource((0xfdULL<<32) - 1);
+ end++;
+ add_range(range, RANGE_NUM, 0, 0, end);
/* need to take out [0, TOM) for RAM*/
address = MSR_K8_TOP_MEM1;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
if (end < (1ULL<<32))
- update_range(range, 0, end - 1);
+ subtract_range(range, RANGE_NUM, 0, end);
/* get mmconfig */
get_pci_mmcfg_amd_fam10h_range();
/* need to take out mmconf range */
if (fam10h_mmconf_end) {
printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
- update_range(range, fam10h_mmconf_start, fam10h_mmconf_end);
+ subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
+ fam10h_mmconf_end + 1);
}
/* mmio resource */
@@ -293,7 +245,7 @@ static int __init early_fill_mp_bus_info(void)
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/*
* some sick allocation would have range overlap with fam10h
* mmconf range, so need to update start and end.
@@ -318,14 +270,15 @@ static int __init early_fill_mp_bus_info(void)
/* we got a hole */
endx = fam10h_mmconf_start - 1;
update_res(info, start, endx, IORESOURCE_MEM, 0);
- update_range(range, start, endx);
- printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx);
+ subtract_range(range, RANGE_NUM, start,
+ endx + 1);
+ printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
start = fam10h_mmconf_end + 1;
changed = 1;
}
if (changed) {
if (start <= end) {
- printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end);
+ printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end);
} else {
printk(KERN_CONT "%s\n", endx?"":" ==> none");
continue;
@@ -333,8 +286,9 @@ static int __init early_fill_mp_bus_info(void)
}
}
- update_res(info, start, end, IORESOURCE_MEM, 1);
- update_range(range, start, end);
+ update_res(info, cap_resource(start), cap_resource(end),
+ IORESOURCE_MEM, 1);
+ subtract_range(range, RANGE_NUM, start, end + 1);
printk(KERN_CONT "\n");
}
@@ -348,8 +302,8 @@ static int __init early_fill_mp_bus_info(void)
address = MSR_K8_TOP_MEM2;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
- update_range(range, 1ULL<<32, end - 1);
+ printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
+ subtract_range(range, RANGE_NUM, 1ULL<<32, end);
}
/*
@@ -368,7 +322,8 @@ static int __init early_fill_mp_bus_info(void)
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, cap_resource(range[i].start),
+ cap_resource(range[i].end - 1),
IORESOURCE_MEM, 1);
}
}
@@ -384,24 +339,14 @@ static int __init early_fill_mp_bus_info(void)
info->bus_min, info->bus_max, info->node, info->link);
for (j = 0; j < res_num; j++) {
res = &info->res[j];
- printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n",
- busnum, j,
- (res->flags & IORESOURCE_IO)?"io port":"mmio",
- res->start, res->end);
+ printk(KERN_DEBUG "bus: %02x index %x %pR\n",
+ busnum, j, res);
}
}
return 0;
}
-#else /* !CONFIG_X86_64 */
-
-static int __init early_fill_mp_bus_info(void) { return 0; }
-
-#endif /* !CONFIG_X86_64 */
-
-/* common 32/64 bit code */
-
#define ENABLE_CF8_EXT_CFG (1ULL << 46)
static void enable_pci_io_ecs(void *unused)
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 12d54ff3654..64a12288389 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -1,11 +1,11 @@
#include <linux/init.h>
#include <linux/pci.h>
+#include <linux/range.h>
#include "bus_numa.h"
int pci_root_num;
struct pci_root_info pci_root_info[PCI_ROOT_NR];
-int found_all_numa_early;
void x86_pci_root_bus_res_quirks(struct pci_bus *b)
{
@@ -21,10 +21,6 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
if (!pci_root_num)
return;
- /* for amd, if only one root bus, don't need to do anything */
- if (pci_root_num < 2 && found_all_numa_early)
- return;
-
for (i = 0; i < pci_root_num; i++) {
if (pci_root_info[i].bus_min == b->number)
break;
@@ -52,8 +48,8 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
}
}
-void __devinit update_res(struct pci_root_info *info, size_t start,
- size_t end, unsigned long flags, int merge)
+void __devinit update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge)
{
int i;
struct resource *res;
@@ -61,25 +57,28 @@ void __devinit update_res(struct pci_root_info *info, size_t start,
if (start > end)
return;
+ if (start == MAX_RESOURCE)
+ return;
+
if (!merge)
goto addit;
/* try to merge it with old one */
for (i = 0; i < info->res_num; i++) {
- size_t final_start, final_end;
- size_t common_start, common_end;
+ resource_size_t final_start, final_end;
+ resource_size_t common_start, common_end;
res = &info->res[i];
if (res->flags != flags)
continue;
- common_start = max((size_t)res->start, start);
- common_end = min((size_t)res->end, end);
+ common_start = max(res->start, start);
+ common_end = min(res->end, end);
if (common_start > common_end + 1)
continue;
- final_start = min((size_t)res->start, start);
- final_end = max((size_t)res->end, end);
+ final_start = min(res->start, start);
+ final_end = max(res->end, end);
res->start = final_start;
res->end = final_end;
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index 731b64ee8d8..804a4b40c31 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -1,5 +1,5 @@
-#ifdef CONFIG_X86_64
-
+#ifndef __BUS_NUMA_H
+#define __BUS_NUMA_H
/*
* sub bus (transparent) will use entres from 3 to store extra from
* root, so need to make sure we have enough slot there.
@@ -19,8 +19,7 @@ struct pci_root_info {
#define PCI_ROOT_NR 4
extern int pci_root_num;
extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
-extern int found_all_numa_early;
-extern void update_res(struct pci_root_info *info, size_t start,
- size_t end, unsigned long flags, int merge);
+extern void update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge);
#endif
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5a8fbf8d4ca..dece3eb9c90 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -255,10 +255,6 @@ void __init pcibios_resource_survey(void)
*/
fs_initcall(pcibios_assign_resources);
-void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
-{
-}
-
/*
* If we set up a device for bus mastering, we need to check the latency
* timer as certain crappy BIOSes forget to set it properly.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 36daccb6864..b607239c1ba 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -50,6 +50,7 @@
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
+#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
@@ -1094,6 +1095,12 @@ asmlinkage void __init xen_start_kernel(void)
__supported_pte_mask |= _PAGE_IOMAP;
+ /*
+ * Prevent page tables from being allocated in highmem, even
+ * if CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
/* Work out if we support NX */
x86_configure_nx();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index bf4cd6bfe95..f9eb7de74f4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1427,23 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
#endif
}
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- pgprot_t prot = PAGE_KERNEL;
-
- if (PagePinned(page))
- prot = PAGE_KERNEL_RO;
-
- if (0 && PageHighMem(page))
- printk("mapping highpte %lx type %d prot %s\n",
- page_to_pfn(page), type,
- (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
- return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
@@ -1902,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pmd_init,
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
#ifdef CONFIG_X86_64
.set_pte = xen_set_pte,
#else
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 88e15deb8b8..22a2093b586 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -90,9 +90,9 @@ ENTRY(xen_iret)
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax), %eax
movl __per_cpu_offset(,%eax,4), %eax
- mov per_cpu__xen_vcpu(%eax), %eax
+ mov xen_vcpu(%eax), %eax
#else
- movl per_cpu__xen_vcpu, %eax
+ movl xen_vcpu, %eax
#endif
/* check IF state we're restoring */