summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig11
-rw-r--r--arch/x86/boot/compressed/string.c4
-rw-r--r--arch/x86/boot/string.c9
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/include/asm/atomic.h7
-rw-r--r--arch/x86/include/asm/barrier.h4
-rw-r--r--arch/x86/include/asm/bitops.h6
-rw-r--r--arch/x86/include/asm/cmdline.h6
-rw-r--r--arch/x86/include/asm/hw_irq.h4
-rw-r--r--arch/x86/include/asm/iosf_mbi.h55
-rw-r--r--arch/x86/include/asm/microcode.h1
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/pci.h1
-rw-r--r--arch/x86/include/asm/sync_bitops.h2
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/uprobes.h16
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/include/asm/xen/interface.h3
-rw-r--r--arch/x86/kernel/aperture_64.c59
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c2
-rw-r--r--arch/x86/kernel/apm_32.c11
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c37
-rw-r--r--arch/x86/kernel/cpu/perf_event.c1
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c22
-rw-r--r--arch/x86/kernel/entry_64.S185
-rw-r--r--arch/x86/kernel/iosf_mbi.c13
-rw-r--r--arch/x86/kernel/uprobes.c551
-rw-r--r--arch/x86/kvm/vmx.c7
-rw-r--r--arch/x86/kvm/x86.c6
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/cmdline.c84
-rw-r--r--arch/x86/mm/ioremap.c26
-rw-r--r--arch/x86/mm/pgtable.c21
-rw-r--r--arch/x86/pci/acpi.c6
-rw-r--r--arch/x86/pci/amd_bus.c83
-rw-r--r--arch/x86/pci/broadcom_bus.c4
-rw-r--r--arch/x86/pci/fixup.c18
-rw-r--r--arch/x86/pci/i386.c27
-rw-r--r--arch/x86/realmode/rm/Makefile3
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/mmu.c125
-rw-r--r--arch/x86/xen/p2m.c174
-rw-r--r--arch/x86/xen/setup.c15
-rw-r--r--arch/x86/xen/suspend.c23
-rw-r--r--arch/x86/xen/xen-ops.h2
47 files changed, 1128 insertions, 525 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 25d2c6f7325..7d5feb5908d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -261,6 +261,9 @@ config ARCH_HWEIGHT_CFLAGS
config ARCH_SUPPORTS_UPROBES
def_bool y
+config FIX_EARLYCON_MEM
+ def_bool y
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -415,7 +418,6 @@ config X86_UV
config X86_GOLDFISH
bool "Goldfish (Virtual Platform)"
- depends on X86_32
depends on X86_EXTENDED_PLATFORM
---help---
Enable support for the Goldfish virtual platform used primarily
@@ -2375,12 +2377,9 @@ config X86_DMA_REMAP
depends on STA2X11
config IOSF_MBI
- bool
+ tristate
+ default m
depends on PCI
- ---help---
- To be selected by modules requiring access to the Intel OnChip System
- Fabric (IOSF) Sideband MailBox Interface (MBI). For MBI platforms
- enumerable by PCI.
source "net/Kconfig"
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index f3c57e34140..00e788be1db 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,9 +1,5 @@
-#include "misc.h"
#include "../string.c"
-/* misc.h might pull in string_32.h which has a macro for memcpy. undef that */
-#undef memcpy
-
#ifdef CONFIG_X86_32
void *memcpy(void *dest, const void *src, size_t n)
{
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5339040ef86..493f3fd9f13 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -12,14 +12,9 @@
* Very basic string functions
*/
-#include "boot.h"
+#include <linux/types.h>
+#include "ctype.h"
-/*
- * This file gets included in compressed/string.c which might pull in
- * string_32.h and which in turn maps memcmp to __builtin_memcmp(). Undo
- * that first.
- */
-#undef memcmp
int memcmp(const void *s1, const void *s2, size_t len)
{
u8 diff;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 619e7f7426c..32d2e7056c8 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -244,7 +244,6 @@ CONFIG_HID_TOPSEED=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y
CONFIG_USB=y
-CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6181c69b786..a481dd4755d 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -239,7 +239,6 @@ CONFIG_HID_TOPSEED=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y
CONFIG_USB=y
-CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_MON=y
CONFIG_USB_EHCI_HCD=y
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index b17f4f48ecd..6dd1c7dd047 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -7,6 +7,7 @@
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
+#include <asm/barrier.h>
/*
* Atomic operations that C can't guarantee us. Useful for
@@ -243,12 +244,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
: : "r" ((unsigned)(mask)), "m" (*(addr)) \
: "memory")
-/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic_dec() barrier()
-#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc() barrier()
-#define smp_mb__after_atomic_inc() barrier()
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 69bbb484502..5c7198cca5e 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -137,6 +137,10 @@ do { \
#endif
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic() barrier()
+#define smp_mb__after_atomic() barrier()
+
/*
* Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9fc1af74dc8..afcd35d331d 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -15,6 +15,7 @@
#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
+#include <asm/barrier.h>
#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
@@ -102,7 +103,7 @@ static inline void __set_bit(long nr, volatile unsigned long *addr)
*
* clear_bit() is atomic and may not be reordered. However, it does
* not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
* in order to ensure changes are visible on other processors.
*/
static __always_inline void
@@ -156,9 +157,6 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
__clear_bit(nr, addr);
}
-#define smp_mb__before_clear_bit() barrier()
-#define smp_mb__after_clear_bit() barrier()
-
/**
* __change_bit - Toggle a bit in memory
* @nr: the bit to change
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
new file mode 100644
index 00000000000..e01f7f7ccb0
--- /dev/null
+++ b/arch/x86/include/asm/cmdline.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_CMDLINE_H
+#define _ASM_X86_CMDLINE_H
+
+int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+
+#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a307b7530e5..4615906d83d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -190,8 +190,8 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
#define trace_interrupt interrupt
#endif
-#define VECTOR_UNDEFINED -1
-#define VECTOR_RETRIGGERED -2
+#define VECTOR_UNDEFINED (-1)
+#define VECTOR_RETRIGGERED (-2)
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index 8e71c794176..57995f0596a 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -50,6 +50,32 @@
#define BT_MBI_PCIE_READ 0x00
#define BT_MBI_PCIE_WRITE 0x01
+/* Quark available units */
+#define QRK_MBI_UNIT_HBA 0x00
+#define QRK_MBI_UNIT_HB 0x03
+#define QRK_MBI_UNIT_RMU 0x04
+#define QRK_MBI_UNIT_MM 0x05
+#define QRK_MBI_UNIT_MMESRAM 0x05
+#define QRK_MBI_UNIT_SOC 0x31
+
+/* Quark read/write opcodes */
+#define QRK_MBI_HBA_READ 0x10
+#define QRK_MBI_HBA_WRITE 0x11
+#define QRK_MBI_HB_READ 0x10
+#define QRK_MBI_HB_WRITE 0x11
+#define QRK_MBI_RMU_READ 0x10
+#define QRK_MBI_RMU_WRITE 0x11
+#define QRK_MBI_MM_READ 0x10
+#define QRK_MBI_MM_WRITE 0x11
+#define QRK_MBI_MMESRAM_READ 0x12
+#define QRK_MBI_MMESRAM_WRITE 0x13
+#define QRK_MBI_SOC_READ 0x06
+#define QRK_MBI_SOC_WRITE 0x07
+
+#if IS_ENABLED(CONFIG_IOSF_MBI)
+
+bool iosf_mbi_available(void);
+
/**
* iosf_mbi_read() - MailBox Interface read command
* @port: port indicating subunit being accessed
@@ -87,4 +113,33 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr);
*/
int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
+#else /* CONFIG_IOSF_MBI is not enabled */
+static inline
+bool iosf_mbi_available(void)
+{
+ return false;
+}
+
+static inline
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+#endif /* CONFIG_IOSF_MBI */
+
#endif /* IOSF_MBI_SYMS_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index b59827e7652..64dc362506b 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -25,6 +25,7 @@ struct cpu_signature {
struct device;
enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+extern bool dis_ucode_ldr;
struct microcode_ops {
enum ucode_state (*request_microcode_user) (int cpu,
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 8de6d9cf3b9..678205195ae 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,7 +1,7 @@
#ifndef _ASM_X86_PAGE_64_DEFS_H
#define _ASM_X86_PAGE_64_DEFS_H
-#define THREAD_SIZE_ORDER 1
+#define THREAD_SIZE_ORDER 2
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define CURRENT_MASK (~(THREAD_SIZE - 1))
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 96ae4f4040b..0892ea0e683 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -68,7 +68,6 @@ void pcibios_config_init(void);
void pcibios_scan_root(int bus);
void pcibios_set_master(struct pci_dev *dev);
-void pcibios_penalize_isa_irq(int irq, int active);
struct irq_routing_table *pcibios_get_irq_routing_table(void);
int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index 05af3b31d52..f28a24b51dc 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -41,7 +41,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr)
*
* sync_clear_bit() is atomic and may not be reordered. However, it does
* not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
* in order to ensure changes are visible on other processors.
*/
static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 47e5de25ba7..854053889d4 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
#define TIF_FORK 18 /* ret_from_fork */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
@@ -106,6 +107,7 @@ struct thread_info {
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_NOHZ (1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
@@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void)
* have to worry about atomic accesses.
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-#define TS_POLLING 0x0004 /* idle task polling need_resched,
- skip sending interrupt */
#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 3087ea9c5f2..93bee7b9385 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -33,15 +33,27 @@ typedef u8 uprobe_opcode_t;
#define UPROBE_SWBP_INSN 0xcc
#define UPROBE_SWBP_INSN_SIZE 1
+struct uprobe_xol_ops;
+
struct arch_uprobe {
- u16 fixups;
union {
u8 insn[MAX_UINSN_BYTES];
u8 ixol[MAX_UINSN_BYTES];
};
+
+ u16 fixups;
+ const struct uprobe_xol_ops *ops;
+
+ union {
#ifdef CONFIG_X86_64
- unsigned long rip_rela_target_address;
+ unsigned long rip_rela_target_address;
#endif
+ struct {
+ s32 offs;
+ u8 ilen;
+ u8 opc1;
+ } branch;
+ };
};
struct arch_uprobe_task {
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index e709884d0ef..ca08a27b90b 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -343,7 +343,7 @@ HYPERVISOR_memory_op(unsigned int cmd, void *arg)
}
static inline int
-HYPERVISOR_multicall(void *call_list, int nr_calls)
+HYPERVISOR_multicall(void *call_list, uint32_t nr_calls)
{
return _hypercall2(int, multicall, call_list, nr_calls);
}
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index fd9cb7695b5..3400dbaec3c 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -54,6 +54,9 @@ typedef unsigned long xen_pfn_t;
#define PRI_xen_pfn "lx"
typedef unsigned long xen_ulong_t;
#define PRI_xen_ulong "lx"
+typedef long xen_long_t;
+#define PRI_xen_long "lx"
+
/* Guest handles for primitive C types. */
__DEFINE_GUEST_HANDLE(uchar, unsigned char);
__DEFINE_GUEST_HANDLE(uint, unsigned int);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9fa8aa051f5..76164e173a2 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -10,6 +10,8 @@
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
+#define pr_fmt(fmt) "AGP: " fmt
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void)
addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
aper_size, aper_size);
if (!addr) {
- printk(KERN_ERR
- "Cannot allocate aperture memory hole (%lx,%uK)\n",
- addr, aper_size>>10);
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
memblock_reserve(addr, aper_size);
- printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
- aper_size >> 10, addr);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
(addr+aper_size) >> PAGE_SHIFT);
@@ -126,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
u64 aper;
u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
- printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
return 0;
}
@@ -153,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* On some sick chips, APSIZE is 0. It means it wants 4G
* so let double check that order, and lets trust AMD NB settings:
*/
- printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
- aper, 32 << old_order);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
- printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
- 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
*order = old_order;
}
- printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
- aper, 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
@@ -218,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
}
}
}
- printk(KERN_INFO "No AGP bridge found\n");
+ pr_info("No AGP bridge found\n");
return 0;
}
@@ -310,7 +314,8 @@ void __init early_gart_iommu_check(void)
if (e820_any_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {
/* reserve it, so we can reuse it in second kernel */
- printk(KERN_INFO "update e820 for GART\n");
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
e820_add_region(aper_base, aper_size, E820_RESERVED);
update_e820();
}
@@ -354,7 +359,7 @@ int __init gart_iommu_hole_init(void)
!early_pci_allowed())
return -ENODEV;
- printk(KERN_INFO "Checking aperture...\n");
+ pr_info("Checking aperture...\n");
if (!fallback_aper_force)
agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
@@ -395,8 +400,9 @@ int __init gart_iommu_hole_init(void)
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
node++;
if (!aperture_valid(aper_base, aper_size, 64<<20)) {
@@ -407,9 +413,9 @@ int __init gart_iommu_hole_init(void)
if (!no_iommu &&
max_pfn > MAX_DMA32_PFN &&
!printed_gart_size_msg) {
- printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
- printk(KERN_ERR "please increase GART size in your BIOS setup\n");
- printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
printed_gart_size_msg = 1;
}
} else {
@@ -446,13 +452,10 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_INFO
- "Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_INFO
- "Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_INFO
- "This costs you %d MB of RAM\n",
- 32 << fallback_aper_order);
+ pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index a698d7165c9..eab67047dec 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -57,7 +57,7 @@ void arch_trigger_all_cpu_backtrace(void)
}
clear_bit(0, &backtrace_flag);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
}
static int __kprobes
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab03430211..f3a1f04ed4c 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -844,21 +844,10 @@ static int apm_do_idle(void)
int polling;
int err = 0;
- polling = !!(current_thread_info()->status & TS_POLLING);
- if (polling) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- }
if (!need_resched()) {
idled = 1;
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
}
- if (polling)
- current_thread_info()->status |= TS_POLLING;
if (!idled)
return 0;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0..dd9d6190b08 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -97,6 +97,9 @@ MODULE_LICENSE("GPL");
static struct microcode_ops *microcode_ops;
+bool dis_ucode_ldr;
+module_param(dis_ucode_ldr, bool, 0);
+
/*
* Synchronization.
*
@@ -546,6 +549,9 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
+ if (dis_ucode_ldr)
+ return 0;
+
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f57..5f28a64e71e 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -17,9 +17,11 @@
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
+#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/microcode_amd.h>
#include <asm/processor.h>
+#include <asm/cmdline.h>
#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
@@ -72,10 +74,33 @@ static int x86_family(void)
return x86;
}
+static bool __init check_loader_disabled_bsp(void)
+{
+#ifdef CONFIG_X86_32
+ const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+ const char *opt = "dis_ucode_ldr";
+ const char *option = (const char *)__pa_nodebug(opt);
+ bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+ const char *cmdline = boot_command_line;
+ const char *option = "dis_ucode_ldr";
+ bool *res = &dis_ucode_ldr;
+#endif
+
+ if (cmdline_find_option_bool(cmdline, option))
+ *res = true;
+
+ return *res;
+}
+
void __init load_ucode_bsp(void)
{
int vendor, x86;
+ if (check_loader_disabled_bsp())
+ return;
+
if (!have_cpuid_p())
return;
@@ -96,10 +121,22 @@ void __init load_ucode_bsp(void)
}
}
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+ return __pa_nodebug(dis_ucode_ldr);
+#else
+ return dis_ucode_ldr;
+#endif
+}
+
void load_ucode_ap(void)
{
int vendor, x86;
+ if (check_loader_disabled_ap())
+ return;
+
if (!have_cpuid_p())
return;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ae407f7226c..89f3b7c1af2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -721,6 +721,7 @@ int perf_assign_events(struct perf_event **events, int n,
return sched.state.unassigned;
}
+EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ae96cfa5edd..980970cb744 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status)
return val;
}
-static u64 precise_store_data_hsw(u64 status)
+static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
{
union perf_mem_data_src dse;
+ u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
dse.val = 0;
dse.mem_op = PERF_MEM_OP_STORE;
dse.mem_lvl = PERF_MEM_LVL_NA;
+
+ /*
+ * L1 info only valid for following events:
+ *
+ * MEM_UOPS_RETIRED.STLB_MISS_STORES
+ * MEM_UOPS_RETIRED.LOCK_STORES
+ * MEM_UOPS_RETIRED.SPLIT_STORES
+ * MEM_UOPS_RETIRED.ALL_STORES
+ */
+ if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
+ return dse.mem_lvl;
+
if (status & 1)
- dse.mem_lvl = PERF_MEM_LVL_L1;
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+
/* Nothing else supported. Sorry. */
return dse.val;
}
@@ -887,7 +903,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
data.data_src.val = load_latency_data(pebs->dse);
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
data.data_src.val =
- precise_store_data_hsw(pebs->dse);
+ precise_store_data_hsw(event, pebs->dse);
else
data.data_src.val = precise_store_data(pebs->dse);
}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf..be846d2468f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -36,7 +36,7 @@
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ * - idtentry - Define exception entry points.
*/
#include <linux/linkage.h>
@@ -1203,125 +1203,100 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-.macro zeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry sym do_sym
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
- INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ /* Sanity check */
+ .if \shift_ist != -1 && \paranoid == 0
+ .error "using shift_ist requires paranoid=1"
+ .endif
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
+ .if \has_error_code
+ XCPT_FRAME
+ .else
INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF_DEBUG
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- call \do_sym
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ .endif
-.macro errorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
ASM_CLAC
PARAVIRT_ADJUST_EXCEPTION_FRAME
+
+ .ifeq \has_error_code
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+
+ .if \paranoid
+ call save_paranoid
+ .else
call error_entry
+ .endif
+
DEFAULT_FRAME 0
+
+ .if \paranoid
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
+ TRACE_IRQS_OFF
+ .endif
+ .endif
+
movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
call \do_sym
+
+ .if \shift_ist != -1
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
+ .if \paranoid
+ jmp paranoid_exit /* %ebx: no swapgs flag */
+ .else
jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
#ifdef CONFIG_TRACING
-.macro trace_errorentry sym do_sym
-errorentry trace(\sym) trace(\do_sym)
-errorentry \sym \do_sym
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
+idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
-.macro trace_errorentry sym do_sym
-errorentry \sym \do_sym
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif
- /* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- DEFAULT_FRAME 0
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
/* Reload gs selector with exception handling */
@@ -1371,7 +1346,7 @@ ENTRY(do_softirq_own_stack)
END(do_softirq_own_stack)
#ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
+idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
* A note on the "critical region" in our callback handler.
@@ -1482,21 +1457,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
*/
.pushsection .kprobes.text, "ax"
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
#ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif
-errorentry general_protection do_general_protection
-trace_errorentry page_fault do_page_fault
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
-errorentry async_page_fault do_async_page_fault
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
/*
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
index c3aae667284..d30acdc1229 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -25,6 +25,9 @@
#include <asm/iosf_mbi.h>
+#define PCI_DEVICE_ID_BAYTRAIL 0x0F00
+#define PCI_DEVICE_ID_QUARK_X1000 0x0958
+
static DEFINE_SPINLOCK(iosf_mbi_lock);
static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
@@ -177,6 +180,13 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
}
EXPORT_SYMBOL(iosf_mbi_modify);
+bool iosf_mbi_available(void)
+{
+ /* Mbi isn't hot-pluggable. No remove routine is provided */
+ return mbi_pdev;
+}
+EXPORT_SYMBOL(iosf_mbi_available);
+
static int iosf_mbi_probe(struct pci_dev *pdev,
const struct pci_device_id *unused)
{
@@ -193,7 +203,8 @@ static int iosf_mbi_probe(struct pci_dev *pdev,
}
static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
- { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
{ 0, },
};
MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 2ed845928b5..ace22916ade 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -53,7 +53,7 @@
#define OPCODE1(insn) ((insn)->opcode.bytes[0])
#define OPCODE2(insn) ((insn)->opcode.bytes[1])
#define OPCODE3(insn) ((insn)->opcode.bytes[2])
-#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -229,63 +229,6 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
return -ENOTSUPP;
}
-/*
- * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
- * annotate arch_uprobe->fixups accordingly. To start with,
- * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
- */
-static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
-{
- bool fix_ip = true, fix_call = false; /* defaults */
- int reg;
-
- insn_get_opcode(insn); /* should be a nop */
-
- switch (OPCODE1(insn)) {
- case 0x9d:
- /* popf */
- auprobe->fixups |= UPROBE_FIX_SETF;
- break;
- case 0xc3: /* ret/lret */
- case 0xcb:
- case 0xc2:
- case 0xca:
- /* ip is correct */
- fix_ip = false;
- break;
- case 0xe8: /* call relative - Fix return addr */
- fix_call = true;
- break;
- case 0x9a: /* call absolute - Fix return addr, not ip */
- fix_call = true;
- fix_ip = false;
- break;
- case 0xff:
- insn_get_modrm(insn);
- reg = MODRM_REG(insn);
- if (reg == 2 || reg == 3) {
- /* call or lcall, indirect */
- /* Fix return addr; ip is correct. */
- fix_call = true;
- fix_ip = false;
- } else if (reg == 4 || reg == 5) {
- /* jmp or ljmp, indirect */
- /* ip is correct. */
- fix_ip = false;
- }
- break;
- case 0xea: /* jmp absolute -- ip is correct */
- fix_ip = false;
- break;
- default:
- break;
- }
- if (fix_ip)
- auprobe->fixups |= UPROBE_FIX_IP;
- if (fix_call)
- auprobe->fixups |= UPROBE_FIX_CALL;
-}
-
#ifdef CONFIG_X86_64
/*
* If arch_uprobe->insn doesn't use rip-relative addressing, return
@@ -310,15 +253,11 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
* - The displacement is always 4 bytes.
*/
static void
-handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 *cursor;
u8 reg;
- if (mm->context.ia32_compat)
- return;
-
- auprobe->rip_rela_target_address = 0x0;
if (!insn_rip_relative(insn))
return;
@@ -372,7 +311,48 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins
cursor++;
memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
}
- return;
+}
+
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void
+pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+ struct arch_uprobe_task *autask)
+{
+ if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
+ autask->saved_scratch_register = regs->ax;
+ regs->ax = current->utask->vaddr;
+ regs->ax += auprobe->rip_rela_target_address;
+ } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
+ autask->saved_scratch_register = regs->cx;
+ regs->cx = current->utask->vaddr;
+ regs->cx += auprobe->rip_rela_target_address;
+ }
+}
+
+static void
+handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+{
+ if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) {
+ struct arch_uprobe_task *autask;
+
+ autask = &current->utask->autask;
+ if (auprobe->fixups & UPROBE_FIX_RIP_AX)
+ regs->ax = autask->saved_scratch_register;
+ else
+ regs->cx = autask->saved_scratch_register;
+
+ /*
+ * The original instruction includes a displacement, and so
+ * is 4 bytes longer than what we've just single-stepped.
+ * Caller may need to apply other fixups to handle stuff
+ * like "jmpq *...(%rip)" and "callq *...(%rip)".
+ */
+ if (correction)
+ *correction += 4;
+ }
}
static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
@@ -401,9 +381,19 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
return validate_insn_64bits(auprobe, insn);
}
#else /* 32-bit: */
-static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+ struct arch_uprobe_task *autask)
+{
+}
+static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs,
+ long *correction)
{
- /* No RIP-relative addressing on 32-bit */
}
static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
@@ -412,141 +402,311 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
}
#endif /* CONFIG_X86_64 */
-/**
- * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
- * @mm: the probed address space.
- * @arch_uprobe: the probepoint information.
- * @addr: virtual address at which to install the probepoint
- * Return 0 on success or a -ve number on error.
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+};
+
+static inline int sizeof_long(void)
+{
+ return is_ia32_task() ? 4 : 8;
+}
+
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ pre_xol_rip_insn(auprobe, regs, &current->utask->autask);
+ return 0;
+}
+
+/*
+ * Adjust the return address pushed by a call insn executed out of line.
*/
-int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+static int adjust_ret_addr(unsigned long sp, long correction)
{
- int ret;
- struct insn insn;
+ int rasize = sizeof_long();
+ long ra;
- auprobe->fixups = 0;
- ret = validate_insn_bits(auprobe, mm, &insn);
- if (ret != 0)
- return ret;
+ if (copy_from_user(&ra, (void __user *)sp, rasize))
+ return -EFAULT;
- handle_riprel_insn(auprobe, mm, &insn);
- prepare_fixups(auprobe, &insn);
+ ra += correction;
+ if (copy_to_user((void __user *)sp, &ra, rasize))
+ return -EFAULT;
return 0;
}
-#ifdef CONFIG_X86_64
-/*
- * If we're emulating a rip-relative instruction, save the contents
- * of the scratch register and store the target address in that register.
- */
-static void
-pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
- autask->saved_scratch_register = regs->ax;
- regs->ax = current->utask->vaddr;
- regs->ax += auprobe->rip_rela_target_address;
- } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
- autask->saved_scratch_register = regs->cx;
- regs->cx = current->utask->vaddr;
- regs->cx += auprobe->rip_rela_target_address;
+ struct uprobe_task *utask = current->utask;
+ long correction = (long)(utask->vaddr - utask->xol_vaddr);
+
+ handle_riprel_post_xol(auprobe, regs, &correction);
+ if (auprobe->fixups & UPROBE_FIX_IP)
+ regs->ip += correction;
+
+ if (auprobe->fixups & UPROBE_FIX_CALL) {
+ if (adjust_ret_addr(regs->sp, correction)) {
+ regs->sp += sizeof_long();
+ return -ERESTART;
+ }
}
+
+ return 0;
}
-#else
-static void
-pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
+
+static struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op,
+ .post_xol = default_post_xol_op,
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
{
- /* No RIP-relative addressing on 32-bit */
+ return auprobe->branch.opc1 == 0xe8;
}
-#endif
-/*
- * arch_uprobe_pre_xol - prepare to execute out of line.
- * @auprobe: the probepoint information.
- * @regs: reflects the saved user state of current task.
- */
-int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
-{
- struct arch_uprobe_task *autask;
+#define CASE_COND \
+ COND(70, 71, XF(OF)) \
+ COND(72, 73, XF(CF)) \
+ COND(74, 75, XF(ZF)) \
+ COND(78, 79, XF(SF)) \
+ COND(7a, 7b, XF(PF)) \
+ COND(76, 77, XF(CF) || XF(ZF)) \
+ COND(7c, 7d, XF(SF) != XF(OF)) \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
- autask = &current->utask->autask;
- autask->saved_trap_nr = current->thread.trap_nr;
- current->thread.trap_nr = UPROBE_TRAP_NR;
- regs->ip = current->utask->xol_vaddr;
- pre_xol_rip_insn(auprobe, regs, autask);
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
- autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
- regs->flags |= X86_EFLAGS_TF;
- if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
- set_task_blockstep(current, false);
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
- return 0;
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
}
-/*
- * This function is called by arch_uprobe_post_xol() to adjust the return
- * address pushed by a call instruction executed out of line.
- */
-static int adjust_ret_addr(unsigned long sp, long correction)
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- int rasize, ncopied;
- long ra = 0;
+ unsigned long flags = regs->flags;
- if (is_ia32_task())
- rasize = 4;
- else
- rasize = 8;
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
- ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
- if (unlikely(ncopied))
- return -EFAULT;
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
- ra += correction;
- ncopied = copy_to_user((void __user *)sp, &ra, rasize);
- if (unlikely(ncopied))
- return -EFAULT;
+#undef XF
+#undef COND
+#undef CASE_COND
- return 0;
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long new_ip = regs->ip += auprobe->branch.ilen;
+ unsigned long offs = (long)auprobe->branch.offs;
+
+ if (branch_is_call(auprobe)) {
+ unsigned long new_sp = regs->sp - sizeof_long();
+ /*
+ * If it fails we execute this (mangled, see the comment in
+ * branch_clear_offset) insn out-of-line. In the likely case
+ * this should trigger the trap, and the probed application
+ * should die or restart the same insn after it handles the
+ * signal, arch_uprobe_post_xol() won't be even called.
+ *
+ * But there is corner case, see the comment in ->post_xol().
+ */
+ if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long()))
+ return false;
+ regs->sp = new_sp;
+ } else if (!check_jmp_cond(auprobe, regs)) {
+ offs = 0;
+ }
+
+ regs->ip = new_ip + offs;
+ return true;
}
-#ifdef CONFIG_X86_64
-static bool is_riprel_insn(struct arch_uprobe *auprobe)
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
+ BUG_ON(!branch_is_call(auprobe));
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long();
+ return -ERESTART;
}
-static void
-handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
{
- if (is_riprel_insn(auprobe)) {
- struct arch_uprobe_task *autask;
+ /*
+ * Turn this insn into "call 1f; 1:", this is what we will execute
+ * out-of-line if ->emulate() fails. We only need this to generate
+ * a trap, so that the probed task receives the correct signal with
+ * the properly filled siginfo.
+ *
+ * But see the comment in ->post_xol(), in the unlikely case it can
+ * succeed. So we need to ensure that the new ->ip can not fall into
+ * the non-canonical area and trigger #GP.
+ *
+ * We could turn it into (say) "pushf", but then we would need to
+ * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
+ * of ->insn[] for set_orig_insn().
+ */
+ memset(auprobe->insn + insn_offset_immediate(insn),
+ 0, insn->immediate.nbytes);
+}
- autask = &current->utask->autask;
- if (auprobe->fixups & UPROBE_FIX_RIP_AX)
- regs->ax = autask->saved_scratch_register;
- else
- regs->cx = autask->saved_scratch_register;
+static struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op,
+ .post_xol = branch_post_xol_op,
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn);
+
+ /* has the side-effect of processing the entire instruction */
+ insn_get_length(insn);
+ if (WARN_ON_ONCE(!insn_complete(insn)))
+ return -ENOEXEC;
+
+ switch (opc1) {
+ case 0xeb: /* jmp 8 */
+ case 0xe9: /* jmp 32 */
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ break;
+
+ case 0xe8: /* call relative */
+ branch_clear_offset(auprobe, insn);
+ break;
+ case 0x0f:
+ if (insn->opcode.nbytes != 2)
+ return -ENOSYS;
/*
- * The original instruction includes a displacement, and so
- * is 4 bytes longer than what we've just single-stepped.
- * Fall through to handle stuff like "jmpq *...(%rip)" and
- * "callq *...(%rip)".
+ * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
+ * OPCODE1() of the "short" jmp which checks the same condition.
*/
- if (correction)
- *correction += 4;
+ opc1 = OPCODE2(insn) - 0x10;
+ default:
+ if (!is_cond_jmp_opcode(opc1))
+ return -ENOSYS;
}
+
+ auprobe->branch.opc1 = opc1;
+ auprobe->branch.ilen = insn->length;
+ auprobe->branch.offs = insn->immediate.value;
+
+ auprobe->ops = &branch_xol_ops;
+ return 0;
}
-#else
-static void
-handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @arch_uprobe: the probepoint information.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+{
+ struct insn insn;
+ bool fix_ip = true, fix_call = false;
+ int ret;
+
+ ret = validate_insn_bits(auprobe, mm, &insn);
+ if (ret)
+ return ret;
+
+ ret = branch_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups arch_uprobe_post_xol() will need to perform,
+ * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups
+ * is either zero or it reflects rip-related fixups.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ fix_ip = false;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_call = true;
+ fix_ip = false;
+ break;
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip = false;
+ break;
+ case 0xff:
+ insn_get_modrm(&insn);
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_call = true;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip = false;
+ }
+ /* fall through */
+ default:
+ handle_riprel_insn(auprobe, &insn);
+ }
+
+ if (fix_ip)
+ auprobe->fixups |= UPROBE_FIX_IP;
+ if (fix_call)
+ auprobe->fixups |= UPROBE_FIX_CALL;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- /* No RIP-relative addressing on 32-bit */
+ struct uprobe_task *utask = current->utask;
+
+ regs->ip = utask->xol_vaddr;
+ utask->autask.saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+
+ utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+ set_task_blockstep(current, false);
+
+ if (auprobe->ops->pre_xol)
+ return auprobe->ops->pre_xol(auprobe, regs);
+ return 0;
}
-#endif
/*
* If xol insn itself traps and generates a signal(Say,
@@ -592,22 +752,25 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)
*/
int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- struct uprobe_task *utask;
- long correction;
- int result = 0;
+ struct uprobe_task *utask = current->utask;
WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
- utask = current->utask;
- current->thread.trap_nr = utask->autask.saved_trap_nr;
- correction = (long)(utask->vaddr - utask->xol_vaddr);
- handle_riprel_post_xol(auprobe, regs, &correction);
- if (auprobe->fixups & UPROBE_FIX_IP)
- regs->ip += correction;
-
- if (auprobe->fixups & UPROBE_FIX_CALL)
- result = adjust_ret_addr(regs->sp, correction);
+ if (auprobe->ops->post_xol) {
+ int err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ arch_uprobe_abort_xol(auprobe, regs);
+ /*
+ * Restart the probed insn. ->post_xol() must ensure
+ * this is really possible if it returns -ERESTART.
+ */
+ if (err == -ERESTART)
+ return 0;
+ return err;
+ }
+ }
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
/*
* arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
* so we can get an extra SIGTRAP if we do not clear TF. We need
@@ -618,7 +781,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
else if (!(auprobe->fixups & UPROBE_FIX_SETF))
regs->flags &= ~X86_EFLAGS_TF;
- return result;
+ return 0;
}
/* callback routine for handling exceptions. */
@@ -652,8 +815,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
/*
* This function gets called when XOL instruction either gets trapped or
- * the thread has a fatal signal, so reset the instruction pointer to its
- * probed address.
+ * the thread has a fatal signal, or if arch_uprobe_post_xol() failed.
+ * Reset the instruction pointer to its probed address for the potential
+ * restart or for post mortem analysis.
*/
void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
@@ -668,25 +832,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
regs->flags &= ~X86_EFLAGS_TF;
}
-/*
- * Skip these instructions as per the currently known x86 ISA.
- * rep=0x66*; nop=0x90
- */
static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- int i;
-
- for (i = 0; i < MAX_UINSN_BYTES; i++) {
- if (auprobe->insn[i] == 0x66)
- continue;
-
- if (auprobe->insn[i] == 0x90) {
- regs->ip += i + 1;
- return true;
- }
-
- break;
- }
+ if (auprobe->ops->emulate)
+ return auprobe->ops->emulate(auprobe, regs);
return false;
}
@@ -701,23 +850,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- int rasize, ncopied;
+ int rasize = sizeof_long(), nleft;
unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
- rasize = is_ia32_task() ? 4 : 8;
- ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize);
- if (unlikely(ncopied))
+ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
return -1;
/* check whether address has been already hijacked */
if (orig_ret_vaddr == trampoline_vaddr)
return orig_ret_vaddr;
- ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
- if (likely(!ncopied))
+ nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+ if (likely(!nleft))
return orig_ret_vaddr;
- if (ncopied != rasize) {
+ if (nleft != rasize) {
pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 33e8c028842..138ceffc637 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7778,7 +7778,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
exec_control = vmcs12->pin_based_vm_exec_control;
exec_control |= vmcs_config.pin_based_exec_ctrl;
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
+ PIN_BASED_POSTED_INTR);
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
vmx->nested.preemption_timer_expired = false;
@@ -7815,7 +7816,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (!vmx->rdtscp_enabled)
exec_control &= ~SECONDARY_EXEC_RDTSCP;
/* Take the following fields only from vmcs12 */
- exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b6c0bacca9b..20316c67b82 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -106,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+static bool backwards_tsc_observed = false;
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -1486,7 +1488,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_kernel_ns,
&ka->master_cycle_now);
- ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !backwards_tsc_observed;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -6945,6 +6948,7 @@ int kvm_arch_hardware_enable(void *garbage)
*/
if (backwards_tsc) {
u64 delta_cyc = max_tsc - local_tsc;
+ backwards_tsc_observed = true;
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index eabcb6e6a90..4d4f96a2763 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -16,7 +16,7 @@ clean-files := inat-tables.c
obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
-lib-y := delay.o misc.o
+lib-y := delay.o misc.o cmdline.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 00000000000..422db000d72
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * Misc librarized functions for cmdline poking.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/setup.h>
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/**
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * @cmdline: the cmdline string
+ * @option: option string to look for
+ *
+ * Returns the position of that @option (starts counting with 1)
+ * or 0 on not found.
+ */
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+ char c;
+ int len, pos = 0, wstart = 0;
+ const char *opptr = NULL;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
+ if (!len)
+ return 0;
+
+ while (len--) {
+ c = *(char *)cmdline++;
+ pos++;
+
+ switch (state) {
+ case st_wordstart:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ wstart = pos;
+ /* fall through */
+
+ case st_wordcmp:
+ if (!*opptr)
+ if (!c || myisspace(c))
+ return wstart;
+ else
+ state = st_wordskip;
+ else if (!c)
+ return 0;
+ else if (c != *opptr++)
+ state = st_wordskip;
+ else if (!len) /* last word and is matching */
+ return wstart;
+ break;
+
+ case st_wordskip:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ state = st_wordstart;
+ break;
+ }
+ }
+
+ return 0; /* Buffer overrun */
+}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 597ac155c91..bc7527e109c 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
+ pfn = phys_addr >> PAGE_SHIFT;
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
- }
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
/*
* Mappings have to be page-aligned
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index c96314abd14..0004ac72dbd 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -399,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- int young;
-
- young = ptep_test_and_clear_young(vma, address, ptep);
- if (young)
- flush_tlb_page(vma, address);
-
- return young;
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 01edac6c5e1..5075371ab59 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -489,8 +489,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
}
node = acpi_get_node(device->handle);
- if (node == NUMA_NO_NODE)
+ if (node == NUMA_NO_NODE) {
node = x86_pci_root_bus_node(busnum);
+ if (node != 0 && node != NUMA_NO_NODE)
+ dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
+ node);
+ }
if (node != NUMA_NO_NODE && !node_online(node))
node = NUMA_NO_NODE;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e88f4c53d7f..c20d2cc7ef6 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -11,27 +11,33 @@
#include "bus_numa.h"
-/*
- * This discovers the pcibus <-> node mapping on AMD K8.
- * also get peer root bus resource for io,mmio
- */
+#define AMD_NB_F0_NODE_ID 0x60
+#define AMD_NB_F0_UNIT_ID 0x64
+#define AMD_NB_F1_CONFIG_MAP_REG 0xe0
+
+#define RANGE_NUM 16
+#define AMD_NB_F1_CONFIG_MAP_RANGES 4
-struct pci_hostbridge_probe {
+struct amd_hostbridge {
u32 bus;
u32 slot;
- u32 vendor;
u32 device;
};
-static struct pci_hostbridge_probe pci_probes[] __initdata = {
- { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 },
- { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
- { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
- { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
+/*
+ * IMPORTANT NOTE:
+ * hb_probes[] and early_root_info_init() is in maintenance mode.
+ * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh .
+ * Future processor will rely on information in ACPI.
+ */
+static struct amd_hostbridge hb_probes[] __initdata = {
+ { 0, 0x18, 0x1100 }, /* K8 */
+ { 0, 0x18, 0x1200 }, /* Family10h */
+ { 0xff, 0, 0x1200 }, /* Family10h */
+ { 0, 0x18, 0x1300 }, /* Family11h */
+ { 0, 0x18, 0x1600 }, /* Family15h */
};
-#define RANGE_NUM 16
-
static struct pci_root_info __init *find_pci_root_info(int node, int link)
{
struct pci_root_info *info;
@@ -45,12 +51,12 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
}
/**
- * early_fill_mp_bus_to_node()
+ * early_root_info_init()
* called before pcibios_scan_root and pci_scan_bus
- * fills the mp_bus_to_cpumask array based according to the LDT Bus Number
- * Registers found in the K8 northbridge
+ * fills the mp_bus_to_cpumask array based according
+ * to the LDT Bus Number Registers found in the northbridge.
*/
-static int __init early_fill_mp_bus_info(void)
+static int __init early_root_info_init(void)
{
int i;
unsigned bus;
@@ -75,19 +81,21 @@ static int __init early_fill_mp_bus_info(void)
return -1;
found = false;
- for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
+ for (i = 0; i < ARRAY_SIZE(hb_probes); i++) {
u32 id;
u16 device;
u16 vendor;
- bus = pci_probes[i].bus;
- slot = pci_probes[i].slot;
+ bus = hb_probes[i].bus;
+ slot = hb_probes[i].slot;
id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
-
vendor = id & 0xffff;
device = (id>>16) & 0xffff;
- if (pci_probes[i].vendor == vendor &&
- pci_probes[i].device == device) {
+
+ if (vendor != PCI_VENDOR_ID_AMD)
+ continue;
+
+ if (hb_probes[i].device == device) {
found = true;
break;
}
@@ -96,10 +104,16 @@ static int __init early_fill_mp_bus_info(void)
if (!found)
return 0;
- for (i = 0; i < 4; i++) {
+ /*
+ * We should learn topology and routing information from _PXM and
+ * _CRS methods in the ACPI namespace. We extract node numbers
+ * here to work around BIOSes that don't supply _PXM.
+ */
+ for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) {
int min_bus;
int max_bus;
- reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2));
+ reg = read_pci_config(bus, slot, 1,
+ AMD_NB_F1_CONFIG_MAP_REG + (i << 2));
/* Check if that register is enabled for bus range */
if ((reg & 7) != 3)
@@ -113,10 +127,21 @@ static int __init early_fill_mp_bus_info(void)
info = alloc_pci_root_info(min_bus, max_bus, node, link);
}
+ /*
+ * The following code extracts routing information for use on old
+ * systems where Linux doesn't automatically use host bridge _CRS
+ * methods (or when the user specifies "pci=nocrs").
+ *
+ * We only do this through Fam11h, because _CRS should be enough on
+ * newer systems.
+ */
+ if (boot_cpu_data.x86 > 0x11)
+ return 0;
+
/* get the default node and link for left over res */
- reg = read_pci_config(bus, slot, 0, 0x60);
+ reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID);
def_node = (reg >> 8) & 0x07;
- reg = read_pci_config(bus, slot, 0, 0x64);
+ reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID);
def_link = (reg >> 8) & 0x03;
memset(range, 0, sizeof(range));
@@ -363,7 +388,7 @@ static int __init pci_io_ecs_init(void)
int cpu;
/* assume all cpus from fam10h have IO ECS */
- if (boot_cpu_data.x86 < 0x10)
+ if (boot_cpu_data.x86 < 0x10)
return 0;
/* Try the PCI method first. */
@@ -387,7 +412,7 @@ static int __init amd_postcore_init(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
- early_fill_mp_bus_info();
+ early_root_info_init();
pci_io_ecs_init();
return 0;
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 614392ced7d..bb461cfd01a 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -60,8 +60,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
word1 = read_pci_config_16(bus, slot, func, 0xc4);
word2 = read_pci_config_16(bus, slot, func, 0xc6);
if (word1 != word2) {
- res.start = (word1 << 16) | 0x0000;
- res.end = (word2 << 16) | 0xffff;
+ res.start = ((resource_size_t) word1 << 16) | 0x0000;
+ res.end = ((resource_size_t) word2 << 16) | 0xffff;
res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
update_res(info, res.start, res.end, res.flags, 0);
}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 94ae9ae9574..b5e60268d93 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,6 +6,7 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/vgaarb.h>
+#include <asm/hpet.h>
#include <asm/pci_x86.h>
static void pci_fixup_i450nx(struct pci_dev *d)
@@ -337,9 +338,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
* type BRIDGE, or CARDBUS. Host to PCI controllers use
* PCI header type NORMAL.
*/
- if (bridge
- && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
- || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
+ if (bridge && (pci_is_bridge(bridge))) {
pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
&config);
if (!(config & PCI_BRIDGE_CTL_VGA))
@@ -526,6 +525,19 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev)
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
+#ifdef CONFIG_HPET_TIMER
+static void sb600_hpet_quirk(struct pci_dev *dev)
+{
+ struct resource *r = &dev->resource[1];
+
+ if (r->flags & IORESOURCE_MEM && r->start == hpet_address) {
+ r->flags |= IORESOURCE_PCI_FIXED;
+ dev_info(&dev->dev, "reg 0x14 contains HPET; making it immovable\n");
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x4385, sb600_hpet_quirk);
+#endif
+
/*
* Twinhead H12Y needs us to block out a region otherwise we map devices
* there and any access kills the box.
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index db6b1ab4325..a19ed92e74e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -271,11 +271,16 @@ static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass)
"BAR %d: reserving %pr (d=%d, p=%d)\n",
idx, r, disabled, pass);
if (pci_claim_resource(dev, idx) < 0) {
- /* We'll assign a new address later */
- pcibios_save_fw_addr(dev,
- idx, r->start);
- r->end -= r->start;
- r->start = 0;
+ if (r->flags & IORESOURCE_PCI_FIXED) {
+ dev_info(&dev->dev, "BAR %d %pR is immovable\n",
+ idx, r);
+ } else {
+ /* We'll assign a new address later */
+ pcibios_save_fw_addr(dev,
+ idx, r->start);
+ r->end -= r->start;
+ r->start = 0;
+ }
}
}
}
@@ -356,6 +361,12 @@ static int __init pcibios_assign_resources(void)
return 0;
}
+/**
+ * called in fs_initcall (one below subsys_initcall),
+ * give a chance for motherboard reserve resources
+ */
+fs_initcall(pcibios_assign_resources);
+
void pcibios_resource_survey_bus(struct pci_bus *bus)
{
dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n");
@@ -392,12 +403,6 @@ void __init pcibios_resource_survey(void)
ioapic_insert_resources();
}
-/**
- * called in fs_initcall (one below subsys_initcall),
- * give a chance for motherboard reserve resources
- */
-fs_initcall(pcibios_assign_resources);
-
static const struct vm_operations_struct pci_mmap_ops = {
.access = generic_access_phys,
};
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 3497f14e4de..7c0d7be176a 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -52,8 +52,9 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
OBJCOPYFLAGS_realmode.bin := -O binary
targets += realmode.bin
-$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs
+$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE
$(call if_changed,objcopy)
+ @:
quiet_cmd_relocs = RELOCS $@
cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c34bfc4bbe7..f17b29210ac 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1339,6 +1339,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
static struct notifier_block xen_panic_block = {
.notifier_call= xen_panic_event,
+ .priority = INT_MIN
};
int xen_panic_handler_init(void)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 86e02eabb64..6f6e15d2846 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2510,6 +2510,95 @@ void __init xen_hvm_init_mmu_ops(void)
}
#endif
+#ifdef CONFIG_XEN_PVH
+/*
+ * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
+ * space creating new guest on pvh dom0 and needing to map domU pages.
+ */
+static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
+ unsigned int domid)
+{
+ int rc, err = 0;
+ xen_pfn_t gpfn = lpfn;
+ xen_ulong_t idx = fgfn;
+
+ struct xen_add_to_physmap_range xatp = {
+ .domid = DOMID_SELF,
+ .foreign_domid = domid,
+ .size = 1,
+ .space = XENMAPSPACE_gmfn_foreign,
+ };
+ set_xen_guest_handle(xatp.idxs, &idx);
+ set_xen_guest_handle(xatp.gpfns, &gpfn);
+ set_xen_guest_handle(xatp.errs, &err);
+
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+ if (rc < 0)
+ return rc;
+ return err;
+}
+
+static int xlate_remove_from_p2m(unsigned long spfn, int count)
+{
+ struct xen_remove_from_physmap xrp;
+ int i, rc;
+
+ for (i = 0; i < count; i++) {
+ xrp.domid = DOMID_SELF;
+ xrp.gpfn = spfn+i;
+ rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+
+struct xlate_remap_data {
+ unsigned long fgfn; /* foreign domain's gfn */
+ pgprot_t prot;
+ domid_t domid;
+ int index;
+ struct page **pages;
+};
+
+static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+ void *data)
+{
+ int rc;
+ struct xlate_remap_data *remap = data;
+ unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
+ pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
+
+ rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
+ if (rc)
+ return rc;
+ native_set_pte(ptep, pteval);
+
+ return 0;
+}
+
+static int xlate_remap_gfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long mfn,
+ int nr, pgprot_t prot, unsigned domid,
+ struct page **pages)
+{
+ int err;
+ struct xlate_remap_data pvhdata;
+
+ BUG_ON(!pages);
+
+ pvhdata.fgfn = mfn;
+ pvhdata.prot = prot;
+ pvhdata.domid = domid;
+ pvhdata.index = 0;
+ pvhdata.pages = pages;
+ err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+ xlate_map_pte_fn, &pvhdata);
+ flush_tlb_all();
+ return err;
+}
+#endif
+
#define REMAP_BATCH_SIZE 16
struct remap_data {
@@ -2522,7 +2611,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
unsigned long addr, void *data)
{
struct remap_data *rmd = data;
- pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+ pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
rmd->mmu_update->val = pte_val_ma(pte);
@@ -2544,13 +2633,18 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
unsigned long range;
int err = 0;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -EINVAL;
-
- prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
-
BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_XEN_PVH
+ /* We need to update the local page tables and the xen HAP */
+ return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
+ domid, pages);
+#else
+ return -EINVAL;
+#endif
+ }
+
rmd.mfn = mfn;
rmd.prot = prot;
@@ -2588,6 +2682,25 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
return 0;
+#ifdef CONFIG_XEN_PVH
+ while (numpgs--) {
+ /*
+ * The mmu has already cleaned up the process mmu
+ * resources at this point (lookup_address will return
+ * NULL).
+ */
+ unsigned long pfn = page_to_pfn(pages[numpgs]);
+
+ xlate_remove_from_p2m(pfn, 1);
+ }
+ /*
+ * We don't need to flush tlbs because as part of
+ * xlate_remove_from_p2m, the hypervisor will do tlb flushes
+ * after removing the p2m entries from the EPT/NPT
+ */
+ return 0;
+#else
return -EINVAL;
+#endif
}
EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 85e5d78c987..9bb3d82ffec 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -36,7 +36,7 @@
* pfn_to_mfn(0xc0000)=0xc0000
*
* The benefit of this is, that we can assume for non-RAM regions (think
- * PCI BARs, or ACPI spaces), we can create mappings easily b/c we
+ * PCI BARs, or ACPI spaces), we can create mappings easily because we
* get the PFN value to match the MFN.
*
* For this to work efficiently we have one new page p2m_identity and
@@ -60,7 +60,7 @@
* There is also a digram of the P2M at the end that can help.
* Imagine your E820 looking as so:
*
- * 1GB 2GB
+ * 1GB 2GB 4GB
* /-------------------+---------\/----\ /----------\ /---+-----\
* | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
* \-------------------+---------/\----/ \----------/ \---+-----/
@@ -77,9 +77,8 @@
* of the PFN and the end PFN (263424 and 512256 respectively). The first step
* is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
* covers 512^2 of page estate (1GB) and in case the start or end PFN is not
- * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
- * to end pfn. We reserve_brk top leaf pages if they are missing (means they
- * point to p2m_mid_missing).
+ * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
+ * required to split any existing p2m_mid_missing middle pages.
*
* With the E820 example above, 263424 is not 1GB aligned so we allocate a
* reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
@@ -88,7 +87,7 @@
* Next stage is to determine if we need to do a more granular boundary check
* on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
* We check if the start pfn and end pfn violate that boundary check, and if
- * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
+ * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
* granularity of setting which PFNs are missing and which ones are identity.
* In our example 263424 and 512256 both fail the check so we reserve_brk two
* pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
@@ -102,9 +101,10 @@
*
* The next step is to walk from the start pfn to the end pfn setting
* the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
- * If we find that the middle leaf is pointing to p2m_missing we can swap it
- * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
- * point we do not need to worry about boundary aligment (so no need to
+ * If we find that the middle entry is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
+ * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
+ * At this point we do not need to worry about boundary aligment (so no need to
* reserve_brk a middle page, figure out which PFNs are "missing" and which
* ones are identity), as that has been done earlier. If we find that the
* middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
@@ -118,6 +118,9 @@
* considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
* contain the INVALID_P2M_ENTRY value and are considered "missing."
*
+ * Finally, the region beyond the end of of the E820 (4 GB in this example)
+ * is set to be identity (in case there are MMIO regions placed here).
+ *
* This is what the p2m ends up looking (for the E820 above) with this
* fabulous drawing:
*
@@ -129,21 +132,27 @@
* |-----| \ | [p2m_identity]+\\ | .... |
* | 2 |--\ \-------------------->| ... | \\ \----------------/
* |-----| \ \---------------/ \\
- * | 3 |\ \ \\ p2m_identity
- * |-----| \ \-------------------->/---------------\ /-----------------\
- * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
- * \-----/ / | [p2m_identity]+-->| ..., ~0 |
- * / /---------------\ | .... | \-----------------/
- * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
- * / | IDENTITY[@256]|<----/ \---------------/
- * / | ~0, ~0, .... |
- * | \---------------/
- * |
- * p2m_mid_missing p2m_missing
- * /-----------------\ /------------\
- * | [p2m_missing] +---->| ~0, ~0, ~0 |
- * | [p2m_missing] +---->| ..., ~0 |
- * \-----------------/ \------------/
+ * | 3 |-\ \ \\ p2m_identity [1]
+ * |-----| \ \-------------------->/---------------\ /-----------------\
+ * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
+ * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
+ * | | | .... | \-----------------/
+ * | | +-[x], ~0, ~0.. +\
+ * | | \---------------/ \
+ * | | \-> /---------------\
+ * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
+ * | /-----------------\ /------------\ | IDENTITY[@256]|
+ * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
+ * | | [p2m_missing] +---->| ..., ~0 | \---------------/
+ * | | ... | \------------/
+ * | \-----------------/
+ * |
+ * | p2m_mid_identity
+ * | /-----------------\
+ * \-->| [p2m_identity] +---->[1]
+ * | [p2m_identity] +---->[1]
+ * | ... |
+ * \-----------------/
*
* where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
*/
@@ -187,13 +196,15 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
/* We might hit two boundary violations at the start and end, at max each
* boundary violation will require three middle nodes. */
-RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
+RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
/* When we populate back during bootup, the amount of pages can vary. The
* max we have is seen is 395979, but that does not mean it can't be more.
@@ -242,20 +253,20 @@ static void p2m_top_mfn_p_init(unsigned long **top)
top[i] = p2m_mid_missing_mfn;
}
-static void p2m_mid_init(unsigned long **mid)
+static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = p2m_missing;
+ mid[i] = leaf;
}
-static void p2m_mid_mfn_init(unsigned long *mid)
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(p2m_missing);
+ mid[i] = virt_to_mfn(leaf);
}
static void p2m_init(unsigned long *p2m)
@@ -286,7 +297,9 @@ void __ref xen_build_mfn_list_list(void)
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_p_init(p2m_top_mfn_p);
@@ -295,7 +308,8 @@ void __ref xen_build_mfn_list_list(void)
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
}
for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
@@ -327,7 +341,7 @@ void __ref xen_build_mfn_list_list(void)
* it too late.
*/
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
}
@@ -365,16 +379,17 @@ void __init xen_build_dynamic_phys_to_machine(void)
p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m_missing);
+ p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_identity);
p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing);
+ p2m_mid_init(p2m_mid_missing, p2m_missing);
+ p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_identity, p2m_identity);
p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_init(p2m_top);
- p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_identity);
-
/*
* The domain builder gives us a pre-constructed p2m array in
* mfn_list for all the pages initially given to us, so we just
@@ -386,7 +401,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
if (p2m_top[topidx] == p2m_mid_missing) {
unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
p2m_top[topidx] = mid;
}
@@ -492,7 +507,7 @@ unsigned long get_phys_to_machine(unsigned long pfn)
unsigned topidx, mididx, idx;
if (unlikely(pfn >= MAX_P2M_PFN))
- return INVALID_P2M_ENTRY;
+ return IDENTITY_FRAME(pfn);
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
@@ -545,7 +560,7 @@ static bool alloc_p2m(unsigned long pfn)
if (!mid)
return false;
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
free_p2m_page(mid);
@@ -565,7 +580,7 @@ static bool alloc_p2m(unsigned long pfn)
if (!mid_mfn)
return false;
- p2m_mid_mfn_init(mid_mfn);
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
@@ -596,7 +611,7 @@ static bool alloc_p2m(unsigned long pfn)
return true;
}
-static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary)
+static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
{
unsigned topidx, mididx, idx;
unsigned long *p2m;
@@ -638,7 +653,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary
return true;
}
-static bool __init early_alloc_p2m(unsigned long pfn)
+static bool __init early_alloc_p2m_middle(unsigned long pfn)
{
unsigned topidx = p2m_top_index(pfn);
unsigned long *mid_mfn_p;
@@ -649,7 +664,7 @@ static bool __init early_alloc_p2m(unsigned long pfn)
if (mid == p2m_mid_missing) {
mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
p2m_top[topidx] = mid;
@@ -658,12 +673,12 @@ static bool __init early_alloc_p2m(unsigned long pfn)
/* And the save/restore P2M tables.. */
if (mid_mfn_p == p2m_mid_missing_mfn) {
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
/* Note: we don't set mid_mfn_p[midix] here,
- * look in early_alloc_p2m_middle */
+ * look in early_alloc_p2m() */
}
return true;
}
@@ -739,7 +754,7 @@ found:
/* This shouldn't happen */
if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
- early_alloc_p2m(set_pfn);
+ early_alloc_p2m_middle(set_pfn);
if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
return false;
@@ -754,13 +769,13 @@ found:
bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!early_alloc_p2m(pfn))
+ if (!early_alloc_p2m_middle(pfn))
return false;
if (early_can_reuse_p2m_middle(pfn, mfn))
return __set_phys_to_machine(pfn, mfn);
- if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
+ if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
return false;
if (!__set_phys_to_machine(pfn, mfn))
@@ -769,12 +784,30 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
+
+static void __init early_split_p2m(unsigned long pfn)
+{
+ unsigned long mididx, idx;
+
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /*
+ * Allocate new middle and leaf pages if this pfn lies in the
+ * middle of one.
+ */
+ if (mididx || idx)
+ early_alloc_p2m_middle(pfn);
+ if (idx)
+ early_alloc_p2m(pfn, false);
+}
+
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
{
unsigned long pfn;
- if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
+ if (unlikely(pfn_s >= MAX_P2M_PFN))
return 0;
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
@@ -783,19 +816,30 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
if (pfn_s > pfn_e)
return 0;
- for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
- pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
- pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
- {
- WARN_ON(!early_alloc_p2m(pfn));
- }
+ if (pfn_e > MAX_P2M_PFN)
+ pfn_e = MAX_P2M_PFN;
- early_alloc_p2m_middle(pfn_s, true);
- early_alloc_p2m_middle(pfn_e, true);
+ early_split_p2m(pfn_s);
+ early_split_p2m(pfn_e);
+
+ for (pfn = pfn_s; pfn < pfn_e;) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
- for (pfn = pfn_s; pfn < pfn_e; pfn++)
if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
break;
+ pfn++;
+
+ /*
+ * If the PFN was set to a middle or leaf identity
+ * page the remainder must also be identity, so skip
+ * ahead to the next middle or leaf entry.
+ */
+ if (p2m_top[topidx] == p2m_mid_identity)
+ pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
+ else if (p2m_top[topidx][mididx] == p2m_identity)
+ pfn = ALIGN(pfn, P2M_PER_PAGE);
+ }
if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
@@ -825,8 +869,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
/* For sparse holes were the p2m leaf has real PFN along with
* PCI holes, stick in the PFN as the MFN value.
+ *
+ * set_phys_range_identity() will have allocated new middle
+ * and leaf pages as required so an existing p2m_mid_missing
+ * or p2m_missing mean that whole range will be identity so
+ * these can be switched to p2m_mid_identity or p2m_identity.
*/
if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+ if (p2m_top[topidx] == p2m_mid_identity)
+ return true;
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
+ p2m_mid_identity) != p2m_mid_missing);
+ return true;
+ }
+
if (p2m_top[topidx][mididx] == p2m_identity)
return true;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 0982233b9b8..210426a26cc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -89,10 +89,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
- if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+ if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
continue;
- WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
- pfn, mfn);
+ WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+ pfn, mfn);
__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
@@ -469,6 +469,15 @@ char * __init xen_memory_setup(void)
}
/*
+ * Set the rest as identity mapped, in case PCI BARs are
+ * located here.
+ *
+ * PFNs above MAX_P2M_PFN are considered identity mapped as
+ * well.
+ */
+ set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+
+ /*
* In domU, the ISA region is normal, usable memory, but we
* reserve ISA memory anyway because too many things poke
* about in there.
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226..c4df9dbd63b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,8 +12,10 @@
#include "xen-ops.h"
#include "mmu.h"
-void xen_arch_pre_suspend(void)
+static void xen_pv_pre_suspend(void)
{
+ xen_mm_pin_all();
+
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
mfn_to_pfn(xen_start_info->console.domU.mfn);
@@ -26,7 +28,7 @@ void xen_arch_pre_suspend(void)
BUG();
}
-void xen_arch_hvm_post_suspend(int suspend_cancelled)
+static void xen_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
@@ -41,7 +43,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
#endif
}
-void xen_arch_post_suspend(int suspend_cancelled)
+static void xen_pv_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
@@ -60,6 +62,21 @@ void xen_arch_post_suspend(int suspend_cancelled)
xen_vcpu_restore();
}
+ xen_mm_unpin_all();
+}
+
+void xen_arch_pre_suspend(void)
+{
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
+}
+
+void xen_arch_post_suspend(int cancelled)
+{
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1cb6f4c3730..c834d4b231f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -31,6 +31,8 @@ void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
extern unsigned long xen_max_p2m_pfn;
+void xen_mm_pin_all(void);
+void xen_mm_unpin_all(void);
void xen_set_pat(u64);
char * __init xen_memory_setup(void);