summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig9
-rw-r--r--arch/x86/ia32/ia32_aout.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c39
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c73
-rw-r--r--arch/x86/kernel/pci-dma.c27
-rw-r--r--arch/x86/kernel/pci-gart_64.c3
-rw-r--r--arch/x86/kernel/pci-nommu.c14
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c2
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S174
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/gup.c295
-rw-r--r--arch/x86/mm/init_64.c37
-rw-r--r--arch/x86/mm/pgtable_32.c47
14 files changed, 535 insertions, 194 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 39ae6798595..b6fa2877b17 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,7 @@ config X86
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_IOREMAP_PROT
+ select HAVE_GET_USER_PAGES_FAST
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
select HAVE_KRETPROBES
@@ -1275,6 +1276,14 @@ config CRASH_DUMP
(CONFIG_RELOCATABLE=y).
For more details see Documentation/kdump/kdump.txt
+config KEXEC_JUMP
+ bool "kexec jump (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on KEXEC && HIBERNATION && X86_32
+ help
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b..a0e1dbe67dc 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
regs->r8 = regs->r9 = regs->r10 = regs->r11 =
regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
set_fs(USER_DS);
- if (unlikely(current->ptrace & PT_PTRACED)) {
- if (current->ptrace & PT_TRACE_EXEC)
- ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
- else
- send_sig(SIGTRAP, current, 0);
- }
return 0;
}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55a..9fe478d9840 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -22,6 +22,7 @@
#include <asm/cpufeature.h>
#include <asm/desc.h>
#include <asm/system.h>
+#include <asm/cacheflush.h>
#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -85,10 +86,12 @@ static void load_segments(void)
* reboot code buffer to allow us to avoid allocations
* later.
*
- * Currently nothing.
+ * Make control page executable.
*/
int machine_kexec_prepare(struct kimage *image)
{
+ if (nx_enabled)
+ set_pages_x(image->control_code_page, 1);
return 0;
}
@@ -98,27 +101,48 @@ int machine_kexec_prepare(struct kimage *image)
*/
void machine_kexec_cleanup(struct kimage *image)
{
+ if (nx_enabled)
+ set_pages_nx(image->control_code_page, 1);
}
/*
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
+ asmlinkage unsigned long
+ (*relocate_kernel_ptr)(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
tracer_disable();
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /* We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to disable_IO_APIC() in
+ * one form or other. kexec jump path also need
+ * one.
+ */
+ disable_IO_APIC();
+#endif
+ }
+
control_page = page_address(image->control_code_page);
- memcpy(control_page, relocate_kernel, PAGE_SIZE);
+ memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
+ relocate_kernel_ptr = control_page;
page_list[PA_CONTROL_PAGE] = __pa(control_page);
- page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
page_list[PA_PGD] = __pa(kexec_pgd);
page_list[VA_PGD] = (unsigned long)kexec_pgd;
#ifdef CONFIG_X86_PAE
@@ -131,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
page_list[PA_PTE_1] = __pa(kexec_pte1);
page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
/* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -149,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
set_idt(phys_to_virt(0),0);
/* now call it */
- relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
- image->start, cpu_has_pae);
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start, cpu_has_pae,
+ image->preserve_context);
}
void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a..c43caa3a91f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 19e7fc7c2c4..b67a4b1d4ea 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
+
#include <asm/iommu.h>
#include <asm/calgary.h>
#include <asm/tce.h>
@@ -413,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
}
}
-static int calgary_nontranslate_map_sg(struct device* dev,
- struct scatterlist *sg, int nelems, int direction)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sg, s, nelems, i) {
- struct page *p = sg_page(s);
-
- BUG_ON(!p);
- s->dma_address = virt_to_bus(sg_virt(s));
- s->dma_length = s->length;
- }
- return nelems;
-}
-
static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
{
@@ -439,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
unsigned long entry;
int i;
- if (!translation_enabled(tbl))
- return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
-
for_each_sg(sg, s, nelems, i) {
BUG_ON(!sg_page(s));
@@ -477,7 +459,6 @@ error:
static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
size_t size, int direction)
{
- dma_addr_t dma_handle = bad_dma_address;
void *vaddr = phys_to_virt(paddr);
unsigned long uaddr;
unsigned int npages;
@@ -486,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
uaddr = (unsigned long)vaddr;
npages = num_dma_pages(uaddr, size);
- if (translation_enabled(tbl))
- dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
- else
- dma_handle = virt_to_bus(vaddr);
-
- return dma_handle;
+ return iommu_alloc(dev, tbl, vaddr, npages, direction);
}
static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -500,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
struct iommu_table *tbl = find_iommu_table(dev);
unsigned int npages;
- if (!translation_enabled(tbl))
- return;
-
npages = num_dma_pages(dma_handle, size);
iommu_free(tbl, dma_handle, npages);
}
@@ -525,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
goto error;
memset(ret, 0, size);
- if (translation_enabled(tbl)) {
- /* set up tces to cover the allocated range */
- mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == bad_dma_address)
- goto free;
-
- *dma_handle = mapping;
- } else /* non translated slot */
- *dma_handle = virt_to_bus(ret);
-
+ /* set up tces to cover the allocated range */
+ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+ if (mapping == bad_dma_address)
+ goto free;
+ *dma_handle = mapping;
return ret;
-
free:
free_pages((unsigned long)ret, get_order(size));
ret = NULL;
@@ -544,7 +511,7 @@ error:
return ret;
}
-static const struct dma_mapping_ops calgary_dma_ops = {
+static struct dma_mapping_ops calgary_dma_ops = {
.alloc_coherent = calgary_alloc_coherent,
.map_single = calgary_map_single,
.unmap_single = calgary_unmap_single,
@@ -1241,6 +1208,16 @@ static int __init calgary_init(void)
goto error;
} while (1);
+ dev = NULL;
+ for_each_pci_dev(dev) {
+ struct iommu_table *tbl;
+
+ tbl = find_iommu_table(&dev->dev);
+
+ if (translation_enabled(tbl))
+ dev->dev.archdata.dma_ops = &calgary_dma_ops;
+ }
+
return ret;
error:
@@ -1262,6 +1239,7 @@ error:
calgary_disable_translation(dev);
calgary_free_bus(dev);
pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+ dev->dev.archdata.dma_ops = NULL;
} while (1);
return ret;
@@ -1503,6 +1481,10 @@ void __init detect_calgary(void)
printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
"CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
debugging ? "enabled" : "disabled");
+
+ /* swiotlb for devices that aren't behind the Calgary. */
+ if (max_pfn > MAX_DMA32_PFN)
+ swiotlb = 1;
}
return;
@@ -1519,7 +1501,7 @@ int __init calgary_iommu_init(void)
{
int ret;
- if (no_iommu || swiotlb)
+ if (no_iommu || (swiotlb && !calgary_detected))
return -ENODEV;
if (!calgary_detected)
@@ -1532,15 +1514,14 @@ int __init calgary_iommu_init(void)
if (ret) {
printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
"falling back to no_iommu\n", ret);
- if (max_pfn > MAX_DMA32_PFN)
- printk(KERN_ERR "WARNING more than 4GB of memory, "
- "32bit PCI may malfunction.\n");
return ret;
}
force_iommu = 1;
bad_dma_address = 0x0;
- dma_ops = &calgary_dma_ops;
+ /* dma_ops is set to swiotlb or nommu */
+ if (!dma_ops)
+ dma_ops = &nommu_dma_ops;
return 0;
}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cbecb05551b..37544123896 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,7 +11,7 @@
static int forbid_dac __read_mostly;
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -312,6 +312,8 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr)
int dma_supported(struct device *dev, u64 mask)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -319,8 +321,8 @@ int dma_supported(struct device *dev, u64 mask)
}
#endif
- if (dma_ops->dma_supported)
- return dma_ops->dma_supported(dev, mask);
+ if (ops->dma_supported)
+ return ops->dma_supported(dev, mask);
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
@@ -367,6 +369,7 @@ void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
void *memory = NULL;
struct page *page;
unsigned long dma_mask = 0;
@@ -435,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* Let low level make its own zone decisions */
gfp &= ~(GFP_DMA32|GFP_DMA);
- if (dma_ops->alloc_coherent)
- return dma_ops->alloc_coherent(dev, size,
+ if (ops->alloc_coherent)
+ return ops->alloc_coherent(dev, size,
dma_handle, gfp);
return NULL;
}
@@ -448,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
}
- if (dma_ops->alloc_coherent) {
+ if (ops->alloc_coherent) {
free_pages((unsigned long)memory, get_order(size));
gfp &= ~(GFP_DMA|GFP_DMA32);
- return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+ return ops->alloc_coherent(dev, size, dma_handle, gfp);
}
- if (dma_ops->map_simple) {
- *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
+ if (ops->map_simple) {
+ *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
size,
PCI_DMA_BIDIRECTIONAL);
if (*dma_handle != bad_dma_address)
@@ -477,12 +480,14 @@ EXPORT_SYMBOL(dma_alloc_coherent);
void dma_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t bus)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
int order = get_order(size);
WARN_ON(irqs_disabled()); /* for portability */
if (dma_release_coherent(dev, order, vaddr))
return;
- if (dma_ops->unmap_single)
- dma_ops->unmap_single(dev, bus, size, 0);
+ if (ops->unmap_single)
+ ops->unmap_single(dev, bus, size, 0);
free_pages((unsigned long)vaddr, order);
}
EXPORT_SYMBOL(dma_free_coherent);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index df5f142657d..744126e6495 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -692,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
extern int agp_amd64_init(void);
-static const struct dma_mapping_ops gart_dma_ops = {
- .mapping_error = NULL,
+static struct dma_mapping_ops gart_dma_ops = {
.map_single = gart_map_single,
.map_simple = gart_map_simple,
.unmap_single = gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 792b9179eff..3f91f71cdc3 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
return nents;
}
-/* Make sure we keep the same behaviour */
-static int nommu_mapping_error(dma_addr_t dma_addr)
-{
-#ifdef CONFIG_X86_32
- return 0;
-#else
- return (dma_addr == bad_dma_address);
-#endif
-}
-
-
-const struct dma_mapping_ops nommu_dma_ops = {
+struct dma_mapping_ops nommu_dma_ops = {
.map_single = nommu_map_single,
.map_sg = nommu_map_sg,
- .mapping_error = nommu_mapping_error,
.is_phys = 1,
};
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 20df839b9c2..c4ce0332759 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
}
-const struct dma_mapping_ops swiotlb_dma_ops = {
+struct dma_mapping_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc_coherent = swiotlb_alloc_coherent,
.free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470..703310a9902 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define PAE_PGD_ATTR (_PAGE_PRESENT)
+/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
+ * used to save some data for jumping back
+ */
+#define DATA(offset) (PAGE_SIZE/2+(offset))
+
+/* Minimal CPU state */
+#define ESP DATA(0x0)
+#define CR0 DATA(0x4)
+#define CR3 DATA(0x8)
+#define CR4 DATA(0xc)
+
+/* other data */
+#define CP_VA_CONTROL_PAGE DATA(0x10)
+#define CP_PA_PGD DATA(0x14)
+#define CP_PA_SWAP_PAGE DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
+
.text
.align PAGE_SIZE
.globl relocate_kernel
relocate_kernel:
- movl 8(%esp), %ebp /* list of pages */
+ /* Save the CPU context, used for jumping back */
+
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushf
+
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
+ movl %esp, ESP(%edi)
+ movl %cr0, %eax
+ movl %eax, CR0(%edi)
+ movl %cr3, %eax
+ movl %eax, CR3(%edi)
+ movl %cr4, %eax
+ movl %eax, CR4(%edi)
#ifdef CONFIG_X86_PAE
/* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
relocate_new_kernel:
/* read the arguments and say goodbye to the stack */
- movl 4(%esp), %ebx /* page_list */
- movl 8(%esp), %ebp /* list of pages */
- movl 12(%esp), %edx /* start address */
- movl 16(%esp), %ecx /* cpu_has_pae */
+ movl 20+4(%esp), %ebx /* page_list */
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl 20+12(%esp), %edx /* start address */
+ movl 20+16(%esp), %ecx /* cpu_has_pae */
+ movl 20+20(%esp), %esi /* preserve_context */
/* zero out flags, and disable interrupts */
pushl $0
popfl
+ /* save some information for jumping back */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
+ movl %edi, CP_VA_CONTROL_PAGE(%edi)
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, CP_PA_PGD(%edi)
+ movl PTR(PA_SWAP_PAGE)(%ebp), %eax
+ movl %eax, CP_PA_SWAP_PAGE(%edi)
+ movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
+
/* get physical address of control page now */
/* this is impossible after page table switch */
movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
xorl %eax, %eax
movl %eax, %cr3
+ movl CP_PA_SWAP_PAGE(%edi), %eax
+ pushl %eax
+ pushl %ebx
+ call swap_pages
+ addl $8, %esp
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB, it's handy, and not processor dependent.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %esp alone */
+
+ testl %esi, %esi
+ jnz 1f
+ xorl %edi, %edi
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %ebp, %ebp
+ ret
+1:
+ popl %edx
+ movl CP_PA_SWAP_PAGE(%edi), %esp
+ addl $PAGE_SIZE, %esp
+2:
+ call *%edx
+
+ /* get the re-entry point of the peer system */
+ movl 0(%esp), %ebp
+ call 1f
+1:
+ popl %ebx
+ subl $(1b - relocate_kernel), %ebx
+ movl CP_VA_CONTROL_PAGE(%ebx), %edi
+ lea PAGE_SIZE(%ebx), %esp
+ movl CP_PA_SWAP_PAGE(%ebx), %eax
+ movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+ pushl %eax
+ pushl %edx
+ call swap_pages
+ addl $8, %esp
+ movl CP_PA_PGD(%ebx), %eax
+ movl %eax, %cr3
+ movl %cr0, %eax
+ orl $(1<<31), %eax
+ movl %eax, %cr0
+ lea PAGE_SIZE(%edi), %esp
+ movl %edi, %eax
+ addl $(virtual_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret
+
+virtual_mapped:
+ movl CR4(%edi), %eax
+ movl %eax, %cr4
+ movl CR3(%edi), %eax
+ movl %eax, %cr3
+ movl CR0(%edi), %eax
+ movl %eax, %cr0
+ movl ESP(%edi), %esp
+ movl %ebp, %eax
+
+ popf
+ popl %ebp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
/* Do the copies */
- movl %ebx, %ecx
+swap_pages:
+ movl 8(%esp), %edx
+ movl 4(%esp), %ecx
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ movl %ecx, %ebx
jmp 1f
0: /* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
+ movl %edi, %eax
+ movl %esi, %ebp
+
+ movl %edx, %edi
movl $1024, %ecx
rep ; movsl
- jmp 0b
-3:
-
- /* To be certain of avoiding problems with self-modifying code
- * I need to execute a serializing instruction here.
- * So I flush the TLB, it's handy, and not processor dependent.
- */
- xorl %eax, %eax
- movl %eax, %cr3
+ movl %ebp, %edi
+ movl %eax, %esi
+ movl $1024, %ecx
+ rep ; movsl
- /* set all of the registers to known values */
- /* leave %esp alone */
+ movl %eax, %edi
+ movl %edx, %esi
+ movl $1024, %ecx
+ rep ; movsl
- xorl %eax, %eax
- xorl %ebx, %ebx
- xorl %ecx, %ecx
- xorl %edx, %edx
- xorl %esi, %esi
- xorl %edi, %edi
- xorl %ebp, %ebp
+ lea PAGE_SIZE(%ebp), %esi
+ jmp 0b
+3:
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
ret
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7..2977ea37791 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,7 @@
obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o
+obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o
obj-$(CONFIG_X86_32) += pgtable_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 00000000000..3085f25b435
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,295 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+ return *ptep;
+#else
+ /*
+ * With get_user_pages_fast, we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atoimcally,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a pte will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees l iff pte_high
+ * sees h. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have got rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+ * we're safe.
+ *
+ * gup_get_pte should not be used or copied outside gup.c without being
+ * very careful -- it does not atomically load the pte or anything that
+ * is likely to be useful for you.
+ */
+ pte_t pte;
+
+retry:
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ if (unlikely(pte.pte_low != ptep->pte_low))
+ goto retry;
+
+ return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ pte_t *ptep;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+
+ ptep = pte_offset_map(&pmd, addr);
+ do {
+ pte_t pte = gup_get_pte(ptep);
+ struct page *page;
+
+ if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ pte_unmap(ptep);
+ return 0;
+ }
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ get_page(page);
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(ptep - 1);
+
+ return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+ VM_BUG_ON(page != compound_head(page));
+ VM_BUG_ON(page_count(page) == 0);
+ atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ pte_t pte = *(pte_t *)&pmd;
+ struct page *head, *page;
+ int refs;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+ page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+ get_head_page_multiple(head, refs);
+
+ return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = *pmdp;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (unlikely(pmd_large(pmd))) {
+ if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+ return 0;
+ } else {
+ if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ }
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ pte_t pte = *(pte_t *)&pud;
+ struct page *head, *page;
+ int refs;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+ page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+ get_head_page_multiple(head, refs);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ pud_t pud = *pudp;
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (unlikely(pud_large(pud))) {
+ if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+ return 0;
+ } else {
+ if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ }
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end = start + (nr_pages << PAGE_SHIFT);
+ unsigned long addr = start;
+ unsigned long next;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ start, nr_pages*PAGE_SIZE)))
+ goto slow_irqon;
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables and pages from being freed on x86.
+ *
+ * So long as we atomically load page table pointers versus teardown
+ * (which we do on x86, with the above PAE exception), we can follow the
+ * address down to the the page and take a ref on it.
+ */
+ local_irq_disable();
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_enable();
+
+ VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+ return nr;
+
+ {
+ int ret;
+
+slow:
+ local_irq_enable();
+slow_irqon:
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+
+ return ret;
+ }
+}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec37121f670..129618ca0ea 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -86,43 +86,6 @@ early_param("gbpages", parse_direct_gbpages_on);
* around without checking the pgd every time.
*/
-void show_mem(void)
-{
- long i, total = 0, reserved = 0;
- long shared = 0, cached = 0;
- struct page *page;
- pg_data_t *pgdat;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- /*
- * This loop can take a while with 256 GB and
- * 4k pages so defer the NMI watchdog:
- */
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
-
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
-
- page = pfn_to_page(pgdat->node_start_pfn + i);
- total++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- }
- printk(KERN_INFO "%lu pages of RAM\n", total);
- printk(KERN_INFO "%lu reserved pages\n", reserved);
- printk(KERN_INFO "%lu pages shared\n", shared);
- printk(KERN_INFO "%lu pages swap cached\n", cached);
-}
-
int after_bootmem;
static __init void *spp_getpage(void)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c57..cab0abbd1eb 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-void show_mem(void)
-{
- int total = 0, reserved = 0;
- int shared = 0, cached = 0;
- int highmem = 0;
- struct page *page;
- pg_data_t *pgdat;
- unsigned long i;
- unsigned long flags;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- pgdat_resize_lock(pgdat, &flags);
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
- page = pgdat_page_nr(pgdat, i);
- total++;
- if (PageHighMem(page))
- highmem++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- pgdat_resize_unlock(pgdat, &flags);
- }
- printk(KERN_INFO "%d pages of RAM\n", total);
- printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
- printk(KERN_INFO "%d reserved pages\n", reserved);
- printk(KERN_INFO "%d pages shared\n", shared);
- printk(KERN_INFO "%d pages swap cached\n", cached);
-
- printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
- printk(KERN_INFO "%lu pages writeback\n",
- global_page_state(NR_WRITEBACK));
- printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
- printk(KERN_INFO "%lu pages slab\n",
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE));
- printk(KERN_INFO "%lu pages pagetables\n",
- global_page_state(NR_PAGETABLE));
-}
-
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.