From b4e0409a36f4533770a12095bde2a574a08a319e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Feb 2008 13:45:16 +0100 Subject: x86: check vmlinux limits, 64-bit these build-time and link-time checks would have prevented the vmlinux size regression. Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ec08d838985..81fcbeec389 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -976,9 +976,5 @@ void vmalloc_sync_all(void) if (address == start) start = address + PGDIR_SIZE; } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); #endif } -- cgit v1.2.3-70-g09d2 From ee7ae7a1981caaa4a5b14d8c75692a9dccd52105 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Apr 2008 17:40:45 +0200 Subject: x86: add debug info to DEBUG_PAGEALLOC Add debug information for DEBUG_PAGEALLOC to get some statistics about the pool usage and split status. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7b79f6be4e7..6cdfc0fd68b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include @@ -918,6 +920,45 @@ void kernel_map_pages(struct page *page, int numpages, int enable) cpa_fill_pool(NULL); } +#ifdef CONFIG_DEBUG_FS +static int dpa_show(struct seq_file *m, void *v) +{ + seq_puts(m, "DEBUG_PAGEALLOC\n"); + seq_printf(m, "pool_size : %lu\n", pool_size); + seq_printf(m, "pool_pages : %lu\n", pool_pages); + seq_printf(m, "pool_low : %lu\n", pool_low); + seq_printf(m, "pool_used : %lu\n", pool_used); + seq_printf(m, "pool_failed : %lu\n", pool_failed); + + return 0; +} + +static int dpa_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, dpa_show, NULL); +} + +static const struct file_operations dpa_fops = { + .open = dpa_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int __init debug_pagealloc_proc_init(void) +{ + struct dentry *de; + + de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, + &dpa_fops); + if (!de) + return -ENOMEM; + + return 0; +} +__initcall(debug_pagealloc_proc_init); +#endif + #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) -- cgit v1.2.3-70-g09d2 From 2596e0fae094be9354b29ddb17e6326a18012e8c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 17 Apr 2008 17:40:45 +0200 Subject: x86: unify arch/x86/mm/Makefile Unify arch/x86/mm/Makefile between 32 and 64 bits. All configuration variables that are protected by Kconfig constraints have been put in the common part of the Makefile; however, the NUMA files are totally different between 32 and 64 bits and are handled via an ifdef. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/Makefile | 14 ++++++++++++-- arch/x86/mm/Makefile_32 | 9 --------- arch/x86/mm/Makefile_64 | 9 --------- 3 files changed, 12 insertions(+), 20 deletions(-) delete mode 100644 arch/x86/mm/Makefile_32 delete mode 100644 arch/x86/mm/Makefile_64 (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 98329109684..8e81660604b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,15 @@ +obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o + +obj-$(CONFIG_X86_32) += pgtable_32.o + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + +obj-$(CONFIG_HIGHMEM) += highmem_32.o + ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/mm/Makefile_32 +obj-$(CONFIG_NUMA) += discontig_32.o else -include ${srctree}/arch/x86/mm/Makefile_64 +obj-$(CONFIG_NUMA) += numa_64.o +obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_ACPI_NUMA) += srat_64.o endif diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32 deleted file mode 100644 index c36ae88bb54..00000000000 --- a/arch/x86/mm/Makefile_32 +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for the linux i386-specific parts of the memory manager. -# - -obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o - -obj-$(CONFIG_NUMA) += discontig_32.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 deleted file mode 100644 index 688c8c28ac8..00000000000 --- a/arch/x86/mm/Makefile_64 +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for the linux x86_64-specific parts of the memory manager. -# - -obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_NUMA) += numa_64.o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_64.o -- cgit v1.2.3-70-g09d2 From 926e5392ba8a388ae32ca0d2714cc2c73945c609 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 17 Apr 2008 17:40:45 +0200 Subject: x86: add code to dump the (kernel) page tables for visual inspection by kernel developers This patch adds code to the kernel to have an (optional) /proc/kernel_page_tables debug file that basically dumps the kernel pagetables; this allows us kernel developers to verify that nothing fishy is going on and that the various mappings are set up correctly. This was quite useful in finding various change_page_attr() bugs, and is very likely to be useful in the future as well. Signed-off-by: Arjan van de Ven Cc: mingo@elte.hu Cc: tglx@tglx.de Cc: hpa@zytor.com Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 12 ++ arch/x86/mm/Makefile | 1 + arch/x86/mm/dump_pagetables.c | 301 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 314 insertions(+) create mode 100644 arch/x86/mm/dump_pagetables.c (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 702eb39901c..cb7002eca88 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -54,6 +54,18 @@ config DEBUG_PER_CPU_MAPS Say N if unsure. +config X86_PTDUMP + bool "Export kernel pagetable layout to userspace via debugfs" + depends on X86_64 + select DEBUG_FS + help + Say Y here if you want to show the kernel pagetable layout in a + debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. 
+ It is probably not a good idea to enable this feature in a production + kernel. + If in doubt, say "N" + config DEBUG_RODATA bool "Write protect kernel read-only data structures" default y diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8e81660604b..28632f42ca6 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -12,4 +12,5 @@ else obj-$(CONFIG_NUMA) += numa_64.o obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_64.o +obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c new file mode 100644 index 00000000000..5e7f6430c27 --- /dev/null +++ b/arch/x86/mm/dump_pagetables.c @@ -0,0 +1,301 @@ +/* + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include + +#include + +/* + * The dumper groups pagetable entries of the same type into one, and for + * that it needs to keep some state when walking, and flush this state + * when a "break" in the continuity is found. + */ +struct pg_state { + int level; + pgprot_t current_prot; + unsigned long start_address; + unsigned long current_address; + int printed_vmalloc; + int printed_modules; + int printed_vmemmap; + int printed_highmap; +}; + +/* Multipliers for offsets within the PTEs */ +#define LEVEL_4_MULT (PAGE_SIZE) +#define LEVEL_3_MULT (512UL * LEVEL_4_MULT) +#define LEVEL_2_MULT (512UL * LEVEL_3_MULT) +#define LEVEL_1_MULT (512UL * LEVEL_2_MULT) + + +/* + * Print a readable form of a pgprot_t to the seq_file + */ +static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +{ + unsigned long pr = pgprot_val(prot); + + if (pr & _PAGE_USER) + seq_printf(m, "USR "); + else + seq_printf(m, " "); + if (pr & _PAGE_RW) + seq_printf(m, "RW "); + else + seq_printf(m, "ro "); + if (pr & _PAGE_PWT) + seq_printf(m, "PWT "); + else + seq_printf(m, " "); + if (pr & _PAGE_PCD) + seq_printf(m, "PCD "); + else + seq_printf(m, " "); + + /* Bit 9 has a different meaning on level 3 vs 4 */ + if (level <= 3) { + if (pr & _PAGE_PSE) + seq_printf(m, "PSE "); + else + seq_printf(m, " "); + } else { + if (pr & _PAGE_PAT) + seq_printf(m, "pat "); + else + seq_printf(m, " "); + } + if (pr & _PAGE_GLOBAL) + seq_printf(m, "GLB "); + else + seq_printf(m, " "); + if (pr & _PAGE_NX) + seq_printf(m, "NX "); + else + seq_printf(m, "x "); +} + +/* + * Sign-extend the 48 bit address to 64 bit + */ +static unsigned long sign_extend(unsigned long u) +{ + if (u>>47) + u = u | (0xffffUL << 48); + return u; +} + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. + */ +static void note_page(struct seq_file *m, struct pg_state *st, + pgprot_t new_prot, int level) +{ + unsigned long prot, cur; + + /* + * If we have a "break" in the series, we need to flush the state that + * we have now. "break" is either changing perms or a different level. 
+ */ + prot = pgprot_val(new_prot) & ~(PTE_MASK); + cur = pgprot_val(st->current_prot) & ~(PTE_MASK); + + if ((prot != cur || level != st->level) && + st->current_address != st->start_address) { + char unit = 'K'; + unsigned long delta; + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. + */ + if (!st->printed_vmalloc && + st->start_address >= VMALLOC_START) { + seq_printf(m, "---[ VMALLOC SPACE ]---\n"); + st->printed_vmalloc = 1; + } + if (!st->printed_modules && + st->start_address >= MODULES_VADDR) { + seq_printf(m, "---[ MODULES SPACE ]---\n"); + st->printed_modules = 1; + } + if (st->printed_modules < 2 && + st->start_address >= MODULES_END) { + seq_printf(m, "---[ END MODULES SPACE ]---\n"); + st->printed_modules = 2; + } + if (!st->printed_vmemmap && + st->start_address >= VMEMMAP_START) { + seq_printf(m, "---[ VMMEMMAP SPACE ]---\n"); + st->printed_vmemmap = 1; + } + if (!st->printed_highmap && + st->start_address >= __START_KERNEL_map) { + seq_printf(m, "---[ HIGH KERNEL MAPPING ]---\n"); + st->printed_highmap = 1; + } + + /* + * Now print the actual finished series + */ + seq_printf(m, "[ %016lx - %016lx ", + st->start_address, st->current_address); + + delta = (st->current_address - st->start_address) >> 10; + if ((delta & 1023) == 0) { + delta = delta >> 10; + unit = 'M'; + } + if (pgprot_val(st->current_prot)) { + seq_printf(m, "Size %9lu%cb ", delta, unit); + printk_prot(m, st->current_prot, st->level); + seq_printf(m, "L%i]\n", st->level); + } else { + /* don't print protections on non-present memory */ + seq_printf(m, "%14lu%cb", delta, unit); + seq_printf(m, " L%i]\n", + st->level); + } + st->start_address = st->current_address; + st->current_prot = new_prot; + st->level = level; + }; +} + +static void walk_level_4(struct seq_file *m, struct pg_state *st, pmd_t addr, + unsigned long P) +{ + int i; + pte_t *start; + + start = (pte_t *) pmd_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PTE; i++) { + pgprot_t prot = pte_pgprot(*start); + + st->current_address = sign_extend(P + i * LEVEL_4_MULT); + note_page(m, st, prot, 4); + start++; + } +} + + +static void walk_level_3(struct seq_file *m, struct pg_state *st, pud_t addr, + unsigned long P) +{ + int i; + pmd_t *start; + + start = (pmd_t *) pud_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PMD; i++) { + st->current_address = sign_extend(P + i * LEVEL_3_MULT); + if (!pmd_none(*start)) { + unsigned long prot; + + prot = pmd_val(*start) & ~(PTE_MASK); + /* Deal with 2Mb pages */ + if (pmd_large(*start)) + note_page(m, st, __pgprot(prot), 3); + else + walk_level_4(m, st, *start, + P + i * LEVEL_3_MULT); + } else + note_page(m, st, __pgprot(0), 3); + start++; + } +} + + +static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr, + unsigned long P) +{ + int i; + pud_t *start; + + start = (pud_t *) pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_PUD; i++) { + if (!pud_none(*start)) { + unsigned long prot; + + prot = pud_val(*start) & ~(PTE_MASK); + /* Deal with 1Gb pages */ + if (pud_large(*start)) + note_page(m, st, __pgprot(prot), 2); + else + walk_level_3(m, st, *start, + P + i * LEVEL_2_MULT); + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +static void walk_level_1(struct seq_file *m) +{ + pgd_t *start = (pgd_t *) &init_level4_pgt; + int i; + struct pg_state st; + + memset(&st, 0, sizeof(st)); + st.level = 1; + + for (i = 0; i < PTRS_PER_PGD; i++) { + if (!pgd_none(*start)) + walk_level_2(m, 
&st, *start, i * LEVEL_1_MULT); + else + note_page(m, &st, __pgprot(0), 1); + start++; + } +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + seq_puts(m, "Kernel pagetable dump\n"); + walk_level_1(m); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int pt_dump_init(void) +{ + struct dentry *pe; + + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +__initcall(pt_dump_init); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven "); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); -- cgit v1.2.3-70-g09d2 From fe770bf0310d90b3b033c19044d45b7de5f2041c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 17 Apr 2008 17:40:45 +0200 Subject: x86: clean up the page table dumper and add 32-bit support Clean up the page table dumper (fix boundary conditions, table driven address ranges, some formatting changes since it is no longer using the kernel log but a separate virtual file), and generalize to 32 bits. [ mingo@elte.hu: x86: fix the pagetable dumper ] Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 2 +- arch/x86/mm/Makefile | 2 +- arch/x86/mm/dump_pagetables.c | 301 +++++++++++++++++++++++++----------------- 3 files changed, 179 insertions(+), 126 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index cb7002eca88..7ce8e702566 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -56,7 +56,7 @@ config DEBUG_PER_CPU_MAPS config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" - depends on X86_64 + depends on DEBUG_KERNEL select DEBUG_FS help Say Y here if you want to show the kernel pagetable layout in a diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 28632f42ca6..9ab9889863f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -3,6 +3,7 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_X86_32) += pgtable_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o @@ -12,5 +13,4 @@ else obj-$(CONFIG_NUMA) += numa_64.o obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_64.o -obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 5e7f6430c27..6d840338f80 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -12,9 +12,10 @@ * of the License. 
*/ +#include +#include #include #include -#include #include @@ -28,73 +29,107 @@ struct pg_state { pgprot_t current_prot; unsigned long start_address; unsigned long current_address; - int printed_vmalloc; - int printed_modules; - int printed_vmemmap; - int printed_highmap; + const struct addr_marker *marker; }; -/* Multipliers for offsets within the PTEs */ -#define LEVEL_4_MULT (PAGE_SIZE) -#define LEVEL_3_MULT (512UL * LEVEL_4_MULT) -#define LEVEL_2_MULT (512UL * LEVEL_3_MULT) -#define LEVEL_1_MULT (512UL * LEVEL_2_MULT) +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +/* Address space markers hints */ +static struct addr_marker address_markers[] = { + { 0, "User Space" }, +#ifdef CONFIG_X86_64 + { 0x8000000000000000UL, "Kernel Space" }, + { 0xffff810000000000UL, "Low Kernel Mapping" }, + { VMALLOC_START, "vmalloc() Area" }, + { MODULES_VADDR, "Modules" }, + { MODULES_END, "End Modules" }, + { VMEMMAP_START, "Vmemmap" }, + { __START_KERNEL_map, "High Kernel Mapping" }, +#else + { PAGE_OFFSET, "Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/*VMALLOC_END*/, "vmalloc() End" }, +# ifdef CONFIG_HIGHMEM + { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, +# endif + { 0/*FIXADDR_START*/, "Fixmap Area" }, +#endif + { -1, NULL } /* End of list */ +}; +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) /* * Print a readable form of a pgprot_t to the seq_file */ static void printk_prot(struct seq_file *m, pgprot_t prot, int level) { - unsigned long pr = pgprot_val(prot); - - if (pr & _PAGE_USER) - seq_printf(m, "USR "); - else - seq_printf(m, " "); - if (pr & _PAGE_RW) - seq_printf(m, "RW "); - else - seq_printf(m, "ro "); - if (pr & _PAGE_PWT) - seq_printf(m, "PWT "); - else - seq_printf(m, " "); - if (pr & _PAGE_PCD) - seq_printf(m, "PCD "); - else - seq_printf(m, " "); - - /* Bit 9 has a different meaning on level 3 vs 4 */ - if (level <= 3) { - if (pr & _PAGE_PSE) - seq_printf(m, "PSE "); + pgprotval_t pr = pgprot_val(prot); + static const char * const level_name[] = + { "cr3", "pgd", "pud", "pmd", "pte" }; + + if (!pgprot_val(prot)) { + /* Not present */ + seq_printf(m, " "); + } else { + if (pr & _PAGE_USER) + seq_printf(m, "USR "); else seq_printf(m, " "); - } else { - if (pr & _PAGE_PAT) - seq_printf(m, "pat "); + if (pr & _PAGE_RW) + seq_printf(m, "RW "); + else + seq_printf(m, "ro "); + if (pr & _PAGE_PWT) + seq_printf(m, "PWT "); + else + seq_printf(m, " "); + if (pr & _PAGE_PCD) + seq_printf(m, "PCD "); else seq_printf(m, " "); + + /* Bit 9 has a different meaning on level 3 vs 4 */ + if (level <= 3) { + if (pr & _PAGE_PSE) + seq_printf(m, "PSE "); + else + seq_printf(m, " "); + } else { + if (pr & _PAGE_PAT) + seq_printf(m, "pat "); + else + seq_printf(m, " "); + } + if (pr & _PAGE_GLOBAL) + seq_printf(m, "GLB "); + else + seq_printf(m, " "); + if (pr & _PAGE_NX) + seq_printf(m, "NX "); + else + seq_printf(m, "x "); } - if (pr & _PAGE_GLOBAL) - seq_printf(m, "GLB "); - else - seq_printf(m, " "); - if (pr & _PAGE_NX) - seq_printf(m, "NX "); - else - seq_printf(m, "x "); + seq_printf(m, "%s\n", level_name[level]); } /* - * Sign-extend the 48 bit address to 64 bit + * On 64 bits, sign-extend the 48 bit address to 64 bit */ -static unsigned long sign_extend(unsigned long u) +static unsigned long normalize_addr(unsigned long u) { - if (u>>47) - 
u = u | (0xffffUL << 48); +#ifdef CONFIG_X86_64 + return (signed long)(u << 16) >> 16; +#else return u; +#endif } /* @@ -103,81 +138,62 @@ static unsigned long sign_extend(unsigned long u) * print what we collected so far. */ static void note_page(struct seq_file *m, struct pg_state *st, - pgprot_t new_prot, int level) + pgprot_t new_prot, int level) { - unsigned long prot, cur; + pgprotval_t prot, cur; + static const char units[] = "KMGTPE"; /* * If we have a "break" in the series, we need to flush the state that - * we have now. "break" is either changing perms or a different level. + * we have now. "break" is either changing perms, levels or + * address space marker. */ prot = pgprot_val(new_prot) & ~(PTE_MASK); cur = pgprot_val(st->current_prot) & ~(PTE_MASK); - if ((prot != cur || level != st->level) && - st->current_address != st->start_address) { - char unit = 'K'; + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->level = level; + st->marker = address_markers; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } else if (prot != cur || level != st->level || + st->current_address >= st->marker[1].start_address) { + const char *unit = units; unsigned long delta; - /* - * We print markers for special areas of address space, - * such as the start of vmalloc space etc. - * This helps in the interpretation. - */ - if (!st->printed_vmalloc && - st->start_address >= VMALLOC_START) { - seq_printf(m, "---[ VMALLOC SPACE ]---\n"); - st->printed_vmalloc = 1; - } - if (!st->printed_modules && - st->start_address >= MODULES_VADDR) { - seq_printf(m, "---[ MODULES SPACE ]---\n"); - st->printed_modules = 1; - } - if (st->printed_modules < 2 && - st->start_address >= MODULES_END) { - seq_printf(m, "---[ END MODULES SPACE ]---\n"); - st->printed_modules = 2; - } - if (!st->printed_vmemmap && - st->start_address >= VMEMMAP_START) { - seq_printf(m, "---[ VMMEMMAP SPACE ]---\n"); - st->printed_vmemmap = 1; - } - if (!st->printed_highmap && - st->start_address >= __START_KERNEL_map) { - seq_printf(m, "---[ HIGH KERNEL MAPPING ]---\n"); - st->printed_highmap = 1; - } - /* * Now print the actual finished series */ - seq_printf(m, "[ %016lx - %016lx ", - st->start_address, st->current_address); + seq_printf(m, "0x%p-0x%p ", + (void *)st->start_address, + (void *)st->current_address); delta = (st->current_address - st->start_address) >> 10; - if ((delta & 1023) == 0) { - delta = delta >> 10; - unit = 'M'; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; } - if (pgprot_val(st->current_prot)) { - seq_printf(m, "Size %9lu%cb ", delta, unit); - printk_prot(m, st->current_prot, st->level); - seq_printf(m, "L%i]\n", st->level); - } else { - /* don't print protections on non-present memory */ - seq_printf(m, "%14lu%cb", delta, unit); - seq_printf(m, " L%i]\n", - st->level); + seq_printf(m, "%9lu%c ", delta, *unit); + printk_prot(m, st->current_prot, st->level); + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. 
+ */ + if (st->current_address >= st->marker[1].start_address) { + st->marker++; + seq_printf(m, "---[ %s ]---\n", st->marker->name); } + st->start_address = st->current_address; st->current_prot = new_prot; st->level = level; - }; + } } -static void walk_level_4(struct seq_file *m, struct pg_state *st, pmd_t addr, +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) { int i; @@ -187,14 +203,15 @@ static void walk_level_4(struct seq_file *m, struct pg_state *st, pmd_t addr, for (i = 0; i < PTRS_PER_PTE; i++) { pgprot_t prot = pte_pgprot(*start); - st->current_address = sign_extend(P + i * LEVEL_4_MULT); + st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); note_page(m, st, prot, 4); start++; } } +#if PTRS_PER_PMD > 1 -static void walk_level_3(struct seq_file *m, struct pg_state *st, pud_t addr, +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; @@ -202,25 +219,30 @@ static void walk_level_3(struct seq_file *m, struct pg_state *st, pud_t addr, start = (pmd_t *) pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { - st->current_address = sign_extend(P + i * LEVEL_3_MULT); + st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - unsigned long prot; + pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; - prot = pmd_val(*start) & ~(PTE_MASK); - /* Deal with 2Mb pages */ - if (pmd_large(*start)) + if (pmd_large(*start) || !pmd_present(*start)) note_page(m, st, __pgprot(prot), 3); else - walk_level_4(m, st, *start, - P + i * LEVEL_3_MULT); + walk_pte_level(m, st, *start, + P + i * PMD_LEVEL_MULT); } else note_page(m, st, __pgprot(0), 3); start++; } } +#else +#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define pud_large(a) pmd_large(__pmd(pud_val(a))) +#define pud_none(a) pmd_none(__pmd(pud_val(a))) +#endif -static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr, +#if PTRS_PER_PUD > 1 + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; @@ -229,16 +251,15 @@ static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr, start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { + st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - unsigned long prot; + pgprotval_t prot = pud_val(*start) & ~PTE_MASK; - prot = pud_val(*start) & ~(PTE_MASK); - /* Deal with 1Gb pages */ - if (pud_large(*start)) + if (pud_large(*start) || !pud_present(*start)) note_page(m, st, __pgprot(prot), 2); else - walk_level_3(m, st, *start, - P + i * LEVEL_2_MULT); + walk_pmd_level(m, st, *start, + P + i * PUD_LEVEL_MULT); } else note_page(m, st, __pgprot(0), 2); @@ -246,28 +267,48 @@ static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr, } } -static void walk_level_1(struct seq_file *m) +#else +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) +#define pgd_large(a) pud_large(__pud(pgd_val(a))) +#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#endif + +static void walk_pgd_level(struct seq_file *m) { +#ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_level4_pgt; +#else + pgd_t *start = swapper_pg_dir; +#endif int i; struct pg_state st; memset(&st, 0, sizeof(st)); - st.level = 1; for (i = 0; i < PTRS_PER_PGD; i++) { - if (!pgd_none(*start)) - walk_level_2(m, &st, *start, i * LEVEL_1_MULT); - else + st.current_address = normalize_addr(i * PGD_LEVEL_MULT); + if 
(!pgd_none(*start)) { + pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; + + if (pgd_large(*start) || !pgd_present(*start)) + note_page(m, &st, __pgprot(prot), 1); + else + walk_pud_level(m, &st, *start, + i * PGD_LEVEL_MULT); + } else note_page(m, &st, __pgprot(0), 1); + start++; } + + /* Flush out the last page */ + st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); + note_page(m, &st, __pgprot(0), 0); } static int ptdump_show(struct seq_file *m, void *v) { - seq_puts(m, "Kernel pagetable dump\n"); - walk_level_1(m); + walk_pgd_level(m); return 0; } @@ -287,6 +328,18 @@ int pt_dump_init(void) { struct dentry *pe; +#ifdef CONFIG_X86_32 + /* Not a compile-time constant on x86-32 */ + address_markers[2].start_address = VMALLOC_START; + address_markers[3].start_address = VMALLOC_END; +# ifdef CONFIG_HIGHMEM + address_markers[4].start_address = PKMAP_BASE; + address_markers[5].start_address = FIXADDR_START; +# else + address_markers[4].start_address = FIXADDR_START; +# endif +#endif + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, &ptdump_fops); if (!pe) -- cgit v1.2.3-70-g09d2 From 00d1c5e05736f947687be27706bda01cec104e57 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Apr 2008 17:40:45 +0200 Subject: x86: add gbpages switches These new controls toggle experimental support for a new CPU feature, the straightforward extension of largepages from the pmd level to the pud level, which allows 1GB (kernel) TLBs instead of 2MB TLBs. Turn it off by default, as this code has not been tested well enough yet. Use the CONFIG_DIRECT_GBPAGES=y .config option or gbpages on the boot line can be used to enable it. If enabled in the .config then nogbpages boot option disables it. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/x86_64/boot-options.txt | 5 +++++ arch/x86/Kconfig.debug | 12 ++++++++++++ arch/x86/mm/init_64.c | 20 ++++++++++++++++++++ include/asm-x86/pgtable_64.h | 2 ++ 4 files changed, 39 insertions(+) (limited to 'arch/x86/mm') diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 34abae4e944..b0c7b6c4abd 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -307,3 +307,8 @@ Debugging stuck (default) Miscellaneous + + nogbpages + Do not use GB pages for kernel direct mappings. + gbpages + Use GB pages for kernel direct mappings. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7ce8e702566..f4413c04e68 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -76,6 +76,18 @@ config DEBUG_RODATA data. This is recommended so that we can catch kernel bugs sooner. If in doubt, say "Y". +config DIRECT_GBPAGES + bool "Enable gbpages-mapped kernel pagetables" + depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 + help + Enable gigabyte pages support (if the CPU supports it). This can + improve the kernel's performance a tiny bit by reducing TLB + pressure. + + This is experimental code. + + If in doubt, say "N". 
+ config DEBUG_RODATA_TEST bool "Testcase for the DEBUG_RODATA feature" depends on DEBUG_RODATA diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a02a14f0f32..6e7d5a42a09 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,26 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +int direct_gbpages __meminitdata +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static int __init parse_direct_gbpages_off(char *arg) +{ + direct_gbpages = 0; + return 0; +} +early_param("nogbpages", parse_direct_gbpages_off); + +static int __init parse_direct_gbpages_on(char *arg) +{ + direct_gbpages = 1; + return 0; +} +early_param("gbpages", parse_direct_gbpages_on); + /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 01d2359e7a3..6ef09914acb 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -239,6 +239,8 @@ static inline int pud_large(pud_t pte) #define update_mmu_cache(vma,address,pte) do { } while (0) +extern int direct_gbpages; + /* Encode and de-code a swap entry */ #define __swp_type(x) (((x).val >> 1) & 0x3f) #define __swp_offset(x) ((x).val >> 8) -- cgit v1.2.3-70-g09d2 From ef9257668e3199f9566dc4a31f5292838bd99b49 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 17 Apr 2008 17:40:45 +0200 Subject: x86: do kernel direct mapping at boot using GB pages The AMD Fam10h CPUs support new Gigabyte page table entry for mapping 1GB at a time. Use this for the kernel direct mapping. Only done for 64bit because i386 does not support GB page tables. This only applies to the data portion of the direct mapping; the kernel text mapping stays with 2MB pages because the AMD Fam10h microarchitecture does not support GB ITLBs and AMD recommends against using GB mappings for code. 
Can be disabled with nogbpages on the kernel command line.

[ tglx@linutronix.de: simplify enable code ]
[ Yinghai Lu : boot fix on 256 GB RAM ]

Signed-off-by: Andi Kleen
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/mm/init_64.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6e7d5a42a09..ae66d8d4c58 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -370,7 +370,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
 	}

 	if (pud_val(*pud)) {
-		phys_pmd_update(pud, addr, end);
+		if (!pud_large(*pud))
+			phys_pmd_update(pud, addr, end);
+		continue;
+	}
+
+	if (direct_gbpages) {
+		set_pte((pte_t *)pud,
+			pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 		continue;
 	}

@@ -391,9 +398,11 @@ static void __init find_early_table_space(unsigned long end)
 	unsigned long puds, pmds, tables, start;

 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
-		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
+	if (!direct_gbpages) {
+		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+	}

 	/*
 	 * RED-PEN putting page tables only on node 0 could
@@ -413,6 +422,14 @@ static void __init find_early_table_space(unsigned long end)
 		(table_start << PAGE_SHIFT) + tables);
 }

+static void __init init_gbpages(void)
+{
+	if (direct_gbpages && cpu_has_gbpages)
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+	else
+		direct_gbpages = 0;
+}
+
 /*
  * Setup the direct mapping of the physical memory at PAGE_OFFSET.
  * This runs before bootmem is initialized and gets pages directly from
@@ -431,8 +448,10 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
 	 * memory mapped. Unfortunately this is done currently before the
 	 * nodes are discovered.
 	 */
-	if (!after_bootmem)
+	if (!after_bootmem) {
+		init_gbpages();
 		find_early_table_space(end);
+	}

 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
-- cgit v1.2.3-70-g09d2

From beafe91f1c2c49713221ca2616755e1f3d472084 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Sat, 16 Feb 2008 23:00:22 -0800
Subject: x86: get apic_id later in acpi_numa_processor_affinity_init

we don't need to get it that early.
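For illustration, a condensed before/after view of the reordering (simplified
from the srat_64.c hunk below; the surrounding checks are as in that file):

	/* before: the entry was dereferenced before any validation */
	apic_id = pa->apic_id;
	if (srat_disabled())
		return;
	/* ... length and PXM checks, each may bail out via bad_srat() ... */

	/* after: read pa->apic_id only once the SRAT entry is known good */
	if (srat_disabled())
		return;
	/* ... same checks ... */
	apic_id = pa->apic_id;
	apicid_to_node[apic_id] = node;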
Signed-off-by: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/srat_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 845001c617c..04e06c8226e 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -132,7 +132,6 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) int pxm, node; int apic_id; - apic_id = pa->apic_id; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { @@ -148,6 +147,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) bad_srat(); return; } + + apic_id = pa->apic_id; apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", -- cgit v1.2.3-70-g09d2 From 04adf11435a5187383c35017a94b55701984243b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 16 Feb 2008 23:02:03 -0800 Subject: x86: remove never used nodenumer in pda Signed-off-by: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa_64.c | 2 -- include/asm-x86/pda.h | 1 - 2 files changed, 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 16b82ad34b9..18267a02e67 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -548,8 +548,6 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - cpu_pda(cpu)->nodenumber = node; - if(cpu_to_node_map) cpu_to_node_map[cpu] = node; else if(per_cpu_offset(cpu)) diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index c0305bff0f1..d9dc209c24a 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,7 +22,6 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; - unsigned int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; -- cgit v1.2.3-70-g09d2 From c92a7a54d6579c9c01374092e7b61a6161f2ef70 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sun, 17 Feb 2008 19:09:42 +0000 Subject: x86: reduce arch/x86/mm/ioremap.o size > Don't we have a special section for page-aligned data so it doesn't > waste most of two pages? We have .bss.page_aligned and it seems appropriate to use it. 
text data bss dec hex filename - 3388 8236 4 11628 2d6c ../build-32/arch/x86/mm/ioremap.o + 3388 48 4100 7536 1d70 ../build-32/arch/x86/mm/ioremap.o Signed-off-by: Ian Campbell Cc: Matt Mackall Cc: Sam Ravnborg Cc: Huang Ying Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 794895c6dcc..868bbde7469 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -272,8 +272,8 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __attribute__((aligned(PAGE_SIZE))); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] + __section(.bss.page_aligned); static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { -- cgit v1.2.3-70-g09d2 From e3100c82abd9aa643dc15828202aceeae3504e03 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Feb 2008 20:57:40 +0100 Subject: x86: check physical address range in ioremap Roland Dreier reported in http://lkml.org/lkml/2008/2/27/194 [ 8425.915139] BUG: unable to handle kernel paging request at ffffc20001a0a000 [ 8425.919087] IP: [] clflush_cache_range+0xc/0x25 [ 8425.919087] PGD 1bf80e067 PUD 1bf80f067 PMD 1bb497067 PTE 80000047000ee17b This is on a Intel machine with 36bit physical address space. The PTE entry references 47000ee000, which is outside of it. Add a check for the physical address space and warn/printk about the stupid caller. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 868bbde7469..17f51883902 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -35,6 +35,18 @@ unsigned long __phys_addr(unsigned long x) } EXPORT_SYMBOL(__phys_addr); +static inline int phys_addr_valid(unsigned long addr) +{ + return addr < (1UL << boot_cpu_data.x86_phys_bits); +} + +#else + +static inline int phys_addr_valid(unsigned long addr) +{ + return 1; +} + #endif int page_is_ram(unsigned long pagenr) @@ -118,6 +130,13 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, if (!size || last_addr < phys_addr) return NULL; + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %lx\n", + phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + /* * Don't remap the low PCI/ISA area, it's always mapped.. */ -- cgit v1.2.3-70-g09d2 From bdd3cee2e4b7279457139058615ced6c2b41e7de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Feb 2008 14:10:49 +0100 Subject: x86: ioremap(), extend check to all RAM pages Suggested by Jan Beulich. Signed-off-by: Ingo Molnar Acked-by: Jan Beulich --- arch/x86/mm/ioremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 17f51883902..e5608d38017 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -146,8 +146,9 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, /* * Don't allow anybody to remap normal RAM that we're using.. 
*/ - for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped && - (pfn << PAGE_SHIFT) < last_addr; pfn++) { + for (pfn = phys_addr >> PAGE_SHIFT; + (pfn << PAGE_SHIFT) < last_addr; pfn++) { + if (page_is_ram(pfn) && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) return NULL; -- cgit v1.2.3-70-g09d2 From ba748d221eb74b849453a94fdf0e1d0566b407d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Mar 2008 09:37:41 +0100 Subject: x86: warn about RAM pages in ioremap() Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index e5608d38017..3d0a589d92c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -149,9 +149,11 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, for (pfn = phys_addr >> PAGE_SHIFT; (pfn << PAGE_SHIFT) < last_addr; pfn++) { - if (page_is_ram(pfn) && pfn_valid(pfn) && - !PageReserved(pfn_to_page(pfn))) + int is_ram = page_is_ram(pfn); + + if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) return NULL; + WARN_ON_ONCE(is_ram); } switch (mode) { -- cgit v1.2.3-70-g09d2 From 9fc34113f6880b215cbea4e7017fc818700384c2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Mar 2008 09:53:17 +0100 Subject: x86: debug pmd_bad() Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable_32.c | 7 +++++++ include/asm-x86/pgtable_32.h | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 2f9e9afcb9f..76e4f4d2627 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -381,3 +381,10 @@ void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) } #endif + +int pmd_bad(pmd_t pmd) +{ + WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); + + return pmd_bad_v1(pmd); +} diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index 997c36c6b4d..1e2c0d83952 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -90,7 +90,11 @@ extern unsigned long pg0[]; /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ #define pmd_none(x) (!(unsigned long)pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_bad(x) ((pmd_val(x) \ + +extern int pmd_bad(pmd_t pmd); + +#define pmd_bad_v1(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_bad_v2(x) ((pmd_val(x) \ & ~(PAGE_MASK | _PAGE_USER | _PAGE_PSE | _PAGE_NX)) \ != _KERNPG_TABLE) -- cgit v1.2.3-70-g09d2 From 4e4eee0e0139811b36a07854dcfa9746bc8b16d3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 2 Feb 2008 15:42:20 -0500 Subject: x86: enhance DEBUG_RODATA support for hotplug and kprobes Standardize DEBUG_RODATA, removing special cases for hotplug and kprobes. Signed-off-by: Mathieu Desnoyers Cc: Andi Kleen Cc: pageexec@freemail.hu Cc: akpm@linux-foundation.org CC: Andi Kleen CC: pageexec@freemail.hu CC: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_32.c | 24 ++++++++---------------- arch/x86/mm/init_64.c | 20 ++------------------ 2 files changed, 10 insertions(+), 34 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ee1091a4696..00168e65688 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -723,25 +723,17 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() <= 1) -#endif - { - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); - printk(KERN_INFO "Testing CPA: write protecting again\n"); - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); -#endif - } + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif start += size; size = (unsigned long)__end_rodata - start; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ae66d8d4c58..4cb6c7f80f0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -635,24 +635,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { - unsigned long start = (unsigned long)_stext, end; - -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - - end = (unsigned long)__end_rodata; - start = (start + PAGE_SIZE - 1) & PAGE_MASK; - end &= PAGE_MASK; - if (end <= start) - return; - + unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -675,6 +658,7 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif } + #endif #ifdef CONFIG_BLK_DEV_INITRD -- cgit v1.2.3-70-g09d2 From 9a79cf9c1aa671325fa5ba37576c2cee23823d04 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 7 Mar 2008 19:17:55 -0800 Subject: x86: sort address_markers for dump_pagetables otherwise Vmemmap and High Kernel Mapping string is not showing up. 
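The order matters because note_page() only ever advances the marker array
forward, one entry per flush:

	if (st->current_address >= st->marker[1].start_address) {
		st->marker++;
		seq_printf(m, "---[ %s ]---\n", st->marker->name);
	}

With the Modules entries (which on 64-bit sit above __START_KERNEL_map)
listed ahead of Vmemmap, the walk keeps waiting for MODULES_VADDR while it
passes VMEMMAP_START and __START_KERNEL_map, so those two banners never come
out where they belong.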
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 6d840338f80..6791b8334bc 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -44,10 +44,10 @@ static struct addr_marker address_markers[] = { { 0x8000000000000000UL, "Kernel Space" }, { 0xffff810000000000UL, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, - { MODULES_VADDR, "Modules" }, - { MODULES_END, "End Modules" }, { VMEMMAP_START, "Vmemmap" }, { __START_KERNEL_map, "High Kernel Mapping" }, + { MODULES_VADDR, "Modules" }, + { MODULES_END, "End Modules" }, #else { PAGE_OFFSET, "Kernel Mapping" }, { 0/* VMALLOC_START */, "vmalloc() Area" }, -- cgit v1.2.3-70-g09d2 From 1415d160c7f7fe8f1026735d5b6cc19aec7a367f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 10 Mar 2008 21:10:57 +0100 Subject: x86: Remove redundant display of free swap space in show_mem() Signed-off-by: Johannes Weiner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 3 --- arch/x86/mm/pgtable_32.c | 1 - 2 files changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb6c7f80f0..255e51feb15 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -89,9 +89,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", - nr_swap_pages << (PAGE_SHIFT-10)); - for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { /* diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 76e4f4d2627..3165ec0672b 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -36,7 +36,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { -- cgit v1.2.3-70-g09d2 From ce3fe6b2bfded4f5d931c5f2f9325dc2e3fd3a74 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Mon, 17 Mar 2008 22:08:17 +0300 Subject: x86: use get_bios_ebda in mpparse_64.c Signed-off-by: Alexey Starikovskiy Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse_32.c | 2 +- arch/x86/kernel/mpparse_64.c | 11 ++++------- arch/x86/kernel/setup_32.c | 2 +- arch/x86/mm/discontig_32.c | 2 +- include/asm-x86/bios_ebda.h | 15 +++++++++++++++ include/asm-x86/mach-default/bios_ebda.h | 15 --------------- 6 files changed, 22 insertions(+), 25 deletions(-) create mode 100644 include/asm-x86/bios_ebda.h delete mode 100644 include/asm-x86/mach-default/bios_ebda.h (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 838e4974e1c..a2162644cb4 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -27,11 +27,11 @@ #include #include #include +#include #include #include #include -#include /* Have we found an MP table */ int smp_found_config; diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c index 269fd46df42..fb74135f9d0 100644 --- a/arch/x86/kernel/mpparse_64.c +++ b/arch/x86/kernel/mpparse_64.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -641,13 +642,9 @@ static void __init __find_smp_config(unsigned reserve) * should be fixed. 
*/ - address = *(unsigned short *)phys_to_virt(0x40E); - address <<= 4; - if (smp_scan_config(address, 0x1000, reserve)) - return; - - /* If we have come this far, we did not find an MP table */ - printk(KERN_INFO "No mptable found.\n"); + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x1000, reserve); } void __init early_find_smp_config(void) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index eb97bcfe0f6..58f3c1fbc5c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -62,7 +62,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 8e25e06ff73..eba0bbede7a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); diff --git a/include/asm-x86/bios_ebda.h b/include/asm-x86/bios_ebda.h new file mode 100644 index 00000000000..9cbd9a668af --- /dev/null +++ b/include/asm-x86/bios_ebda.h @@ -0,0 +1,15 @@ +#ifndef _MACH_BIOS_EBDA_H +#define _MACH_BIOS_EBDA_H + +/* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E. + */ +static inline unsigned int get_bios_ebda(void) +{ + unsigned int address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + return address; /* 0 means none */ +} + +#endif /* _MACH_BIOS_EBDA_H */ diff --git a/include/asm-x86/mach-default/bios_ebda.h b/include/asm-x86/mach-default/bios_ebda.h deleted file mode 100644 index 9cbd9a668af..00000000000 --- a/include/asm-x86/mach-default/bios_ebda.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _MACH_BIOS_EBDA_H -#define _MACH_BIOS_EBDA_H - -/* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E. - */ -static inline unsigned int get_bios_ebda(void) -{ - unsigned int address = *(unsigned short *)phys_to_virt(0x40E); - address <<= 4; - return address; /* 0 means none */ -} - -#endif /* _MACH_BIOS_EBDA_H */ -- cgit v1.2.3-70-g09d2 From 272b9cad6e7a2f61b13cfcd7dde0010e02e9376e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 20 Mar 2008 23:58:33 -0700 Subject: x86: early memtest to find bad ram do simple memtest after init_memory_mapping use find_e820_area_size to find all ram range that is not reserved. and do some simple bits test to find some bad ram. if find some bad ram, use reserve_early to exclude that range. 
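To spell out the boot interface this adds (read off parse_memtest() and
early_memtest() in the diff below: memtest_pattern is the parsed value plus
one, and patterns above 3 fall through the switch as no-ops):

	memtest=0	pattern 0 only (all zero bits)
	memtest=1	patterns 0 and 1 (zeros, then -1UL)
	memtest=3	all four patterns (0, -1UL,
			0x5555555555555555UL, 0xaaaaaaaaaaaaaaaaUL)

Without the parameter, memtest_pattern stays 0 and no test runs.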
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_64.c | 70 +++++++++++++++++++++++++++++- arch/x86/mm/init_64.c | 106 +++++++++++++++++++++++++++++++++++++++++++++- include/asm-x86/e820_64.h | 3 ++ 3 files changed, 177 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 4a0953857cb..4509757844e 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -114,6 +114,40 @@ again: return changed; } +/* Check for already reserved areas */ +static inline int +bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align) +{ + int i; + unsigned long addr = *addrp, last; + unsigned long size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} /* * This function checks if any part of the range is mapped * with type. @@ -190,7 +224,7 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, ei_last = ei->addr + ei->size; if (addr < start) addr = round_up(start, align); - if (addr > ei_last) + if (addr >= ei_last) continue; while (bad_addr(&addr, size, align) && addr+size <= ei_last) ; @@ -204,6 +238,40 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, return -1UL; } +/* + * Find next free range after *start + */ +unsigned long __init find_e820_area_size(unsigned long start, unsigned long *sizep, unsigned long align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr, last; + unsigned long ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; +// printk(KERN_DEBUG "find_e820_area_size : e820 %d [%llx, %lx]\n", i, ei->addr, ei_last); + if (addr < start) + addr = round_up(start, align); +// printk(KERN_DEBUG "find_e820_area_size : 0 [%lx, %lx]\n", addr, ei_last); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr+ *sizep <= ei_last) + ; + last = addr + *sizep; +// printk(KERN_DEBUG "find_e820_area_size : 1 [%lx, %lx]\n", addr, last); + if (last > ei_last) + continue; + return addr; + } + return -1UL; + +} /* * Find the highest page frame number we have available */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 255e51feb15..52f54ee4559 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -427,6 +427,106 @@ static void __init init_gbpages(void) direct_gbpages = 0; } +static void __init memtest(unsigned long start_phys, unsigned long size, unsigned pattern) +{ + unsigned long i; + unsigned long *start; + unsigned long start_bad; + unsigned long last_bad; + unsigned long val; + unsigned long start_phys_aligned; + unsigned long count; + unsigned long incr; + + switch (pattern) { + case 0: + val = 0UL; + break; + case 1: + val = -1UL; + break; + case 2: + val = 0x5555555555555555UL; + break; + case 3: + val = 0xaaaaaaaaaaaaaaaaUL; + break; + default: + return; + } + + incr = sizeof(unsigned long); + start_phys_aligned = 
ALIGN(start_phys, incr); + count = (size - (start_phys_aligned - start_phys))/incr; + start = __va(start_phys_aligned); + start_bad = 0; + last_bad = 0; + + for (i = 0; i < count; i++) + start[i] = val; + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { + if (*start != val) { + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + } else { + if (start_bad) { + printk(KERN_INFO " %016lxx bad mem addr %016lx - %016lx reserved\n", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + start_bad = last_bad = start_phys_aligned; + } + } + } + if (start_bad) { + printk(KERN_INFO " %016lx bad mem addr %016lx - %016lx reserved\n", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + +} + +static int __initdata memtest_pattern; +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0) + 1; + return 0; +} + +early_param("memtest", parse_memtest); + +static void __init early_memtest(unsigned long start, unsigned long end) +{ + unsigned long t_start, t_size; + unsigned pattern; + + if (memtest_pattern) + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + for (pattern = 0; pattern < memtest_pattern; pattern++) { + t_start = start; + t_size = 0; + while (t_start < end) { + t_start = find_e820_area_size(t_start, &t_size, 1); + + /* done ? */ + if (t_start >= end) + break; + if (t_start + t_size > end) + t_size = end - t_start; + + printk(KERN_CONT "\n %016lx - %016lx pattern %d", + t_start, t_start + t_size, pattern); + + memtest(t_start, t_size, pattern); + + t_start += t_size; + } + } + if (memtest_pattern) + printk(KERN_CONT "\n"); +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -435,8 +535,9 @@ static void __init init_gbpages(void) void __init_refok init_memory_mapping(unsigned long start, unsigned long end) { unsigned long next; + unsigned long start_phys = start, end_phys = end; - pr_debug("init_memory_mapping\n"); + printk(KERN_INFO "init_memory_mapping\n"); /* * Find space for the kernel direct mapping tables. @@ -479,6 +580,9 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) if (!after_bootmem) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start_phys, end_phys); } #ifndef CONFIG_NUMA diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index ef653a403e0..d38820b31c1 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -16,6 +16,9 @@ #ifndef __ASSEMBLY__ extern unsigned long find_e820_area(unsigned long start, unsigned long end, unsigned long size, unsigned long align); +extern unsigned long find_e820_area_size(unsigned long start, + unsigned long *sizep, + unsigned long align); extern void add_memory_region(unsigned long start, unsigned long size, int type); extern void update_memory_range(u64 start, u64 size, unsigned old_type, -- cgit v1.2.3-70-g09d2 From 2e5d9c857d4e6c9e7b7d8c8c86a68a7842d213d6 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:14 -0700 Subject: x86: PAT infrastructure patch Sets up pat_init() infrastructure. PAT MSR has following setting. 
PAT |PCD ||PWT ||| 000 WB _PAGE_CACHE_WB 001 WC _PAGE_CACHE_WC 010 UC- _PAGE_CACHE_UC_MINUS 011 UC _PAGE_CACHE_UC We are effectively changing WT from boot time setting to WC. UC_MINUS is used to provide backward compatibility to existing /dev/mem users(X). reserve_memtype and free_memtype are new interfaces for maintaining alias-free mapping. It is currently implemented in a simple way with a linked list and not optimized. reserve and free tracks the effective memory type, as a result of PAT and MTRR setting rather than what is actually requested in PAT. pat_init piggy backs on mtrr_init as the rules for setting both pat and mtrr are same. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 15 ++ arch/x86/kernel/cpu/mtrr/generic.c | 120 +++++++++++ arch/x86/mm/Makefile | 3 +- arch/x86/mm/pageattr.c | 4 +- arch/x86/mm/pat.c | 402 +++++++++++++++++++++++++++++++++++++ include/asm-x86/cpufeature.h | 1 + include/asm-x86/msr-index.h | 2 + include/asm-x86/mtrr.h | 2 + include/asm-x86/pat.h | 16 ++ include/asm-x86/pgtable.h | 6 + 10 files changed, 568 insertions(+), 3 deletions(-) create mode 100644 arch/x86/mm/pat.c create mode 100644 include/asm-x86/pat.h (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fd27048087b..5b46756e4c7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1009,6 +1009,21 @@ config MTRR See for more information. +config X86_PAT + def_bool y + prompt "x86 PAT support" + depends on MTRR && NONPROMISC_DEVMEM + help + Use PAT attributes to setup page level cache control. + ---help--- + PATs are the modern equivalents of MTRRs and are much more + flexible than MTRRs. + + Say N here if you see bootup problems (boot crash, boot hang, + spontaneous reboots) or a non-working Xorg. + + If unsure, say Y. + config EFI def_bool n prompt "EFI runtime service support" diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3e18db4cefe..011e07e99cd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "mtrr.h" struct mtrr_state { @@ -35,6 +36,7 @@ static struct fixed_range_block fixed_range_blocks[] = { static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; +static int mtrr_state_set; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." @@ -42,6 +44,106 @@ static struct mtrr_state mtrr_state = {}; static int mtrr_show; module_param_named(show, mtrr_show, bool, 0); +/* + * Returns the effective MTRR type for the region + * Error returns: + * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR + * - 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + + if (!mtrr_state_set) + return 0xFF; + + if (!mtrr_state.enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. 
Just return the type as per start */ + if (mtrr_state.have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!mtrr_state.enabled & 2) { + return mtrr_state.def_type; + } + + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) + + (mtrr_state.var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) + + (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) { + continue; + } + + curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) { + return MTRR_TYPE_UNCACHABLE; + } + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) { + return MTRR_TYPE_UNCACHABLE; + } + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state.def_type; +} + /* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) @@ -79,12 +181,16 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) base, base + step - 1, mtrr_attrib_to_str(*types)); } +static void prepare_set(void); +static void post_set(void); + /* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { unsigned int i; struct mtrr_var_range *vrs; unsigned lo, dummy; + unsigned long flags; vrs = mtrr_state.var_ranges; @@ -131,6 +237,17 @@ void __init get_mtrr_state(void) printk(KERN_INFO "MTRR %u disabled\n", i); } } + mtrr_state_set = 1; + + /* PAT setup for BP. We need to go through sync steps here */ + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); + } /* Some BIOS's are fucked and don't set all MTRRs the same! 
*/ @@ -397,6 +514,9 @@ static void generic_set_all(void) /* Actually set the state */ mask = set_mtrr_state(); + /* also set PAT */ + pat_init(); + post_set(); local_irq_restore(flags); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 9ab9889863f..20941d2954e 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,4 +1,5 @@ -obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o +obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ + pat.o obj-$(CONFIG_X86_32) += pgtable_32.o diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6cdfc0fd68b..f7d5ca170c2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -773,14 +773,14 @@ static inline int change_page_attr_clear(unsigned long addr, int numpages, int set_memory_uc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PCD)); + __pgprot(_PAGE_CACHE_UC)); } EXPORT_SYMBOL(set_memory_uc); int set_memory_wb(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_CACHE_MASK)); } EXPORT_SYMBOL(set_memory_wb); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c new file mode 100644 index 00000000000..7cc71d86848 --- /dev/null +++ b/arch/x86/mm/pat.c @@ -0,0 +1,402 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi + * Suresh B Siddha + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int pat_wc_enabled = 1; + +static u64 __read_mostly boot_pat_state; + +static int nopat(char *str) +{ + pat_wc_enabled = 0; + printk(KERN_INFO "x86: PAT support disabled.\n"); + + return 0; +} +early_param("nopat", nopat); + +static int pat_known_cpu(void) +{ + if (!pat_wc_enabled) + return 0; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + (boot_cpu_data.x86 == 0xF || + (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model >= 15))) { + if (cpu_has_pat) { + return 1; + } + } + + pat_wc_enabled = 0; + printk(KERN_INFO "CPU and/or kernel does not support PAT.\n"); + return 0; +} + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ +}; + +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) + +void pat_init(void) +{ + u64 pat; + +#ifndef CONFIG_X86_PAT + nopat(NULL); +#endif + + /* Boot CPU enables PAT based on CPU feature */ + if (!smp_processor_id() && !pat_known_cpu()) + return; + + /* APs enable PAT iff boot CPU has enabled it before */ + if (smp_processor_id() && !pat_wc_enabled) + return; + + /* Set PWT to Write-Combining. 
All other bits stay the same */ + /* + * PTE encoding used in Linux: + * PAT + * |PCD + * ||PWT + * ||| + * 000 WB _PAGE_CACHE_WB + * 001 WC _PAGE_CACHE_WC + * 010 UC- _PAGE_CACHE_UC_MINUS + * 011 UC _PAGE_CACHE_UC + * PAT bit unused + */ + pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | + PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); + + /* Boot CPU check */ + if (!smp_processor_id()) { + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + } + + wrmsrl(MSR_IA32_CR_PAT, pat); + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); +} + +#undef PAT + +static char *cattr_name(unsigned long flags) +{ + switch (flags & _PAGE_CACHE_MASK) { + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + default: return "broken"; + } +} + +/* + * The global memtype list keeps track of memory type for specific + * physical memory areas. Conflicting memory types in different + * mappings can cause CPU cache corruption. To avoid this we keep track. + * + * The list is sorted based on starting address and can contain multiple + * entries for each address (this allows reference counting for overlapping + * areas). All the aliases have the same cache attributes of course. + * Zero attributes are represented as holes. + * + * Currently the data structure is a list because the number of mappings + * are expected to be relatively small. If this should be a problem + * it could be changed to a rbtree or similar. + * + * memtype_lock protects the whole list. + */ + +struct memtype { + u64 start; + u64 end; + unsigned long type; + struct list_head nd; +}; + +static LIST_HEAD(memtype_list); +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ + +/* + * Does intersection of PAT memory type and MTRR memory type and returns + * the resulting memory type as PAT understands it. + * (Type in pat and mtrr will not have same value) + * The intersection is based on "Effective Memory Type" tables in IA-32 + * SDM vol 3a + */ +static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, + unsigned long *ret_prot) +{ + unsigned long pat_type; + u8 mtrr_type; + + mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFF) { /* MTRR not enabled */ + *ret_prot = prot; + return 0; + } + if (mtrr_type == 0xFE) { /* MTRR match error */ + *ret_prot = _PAGE_CACHE_UC; + return -1; + } + if (mtrr_type != MTRR_TYPE_UNCACHABLE && + mtrr_type != MTRR_TYPE_WRBACK && + mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */ + *ret_prot = _PAGE_CACHE_UC; + return -1; + } + + pat_type = prot & _PAGE_CACHE_MASK; + prot &= (~_PAGE_CACHE_MASK); + + /* Currently doing intersection by hand. Optimize it later. 
*/ + if (pat_type == _PAGE_CACHE_WC) { + *ret_prot = prot | _PAGE_CACHE_WC; + } else if (pat_type == _PAGE_CACHE_UC_MINUS) { + *ret_prot = prot | _PAGE_CACHE_UC_MINUS; + } else if (pat_type == _PAGE_CACHE_UC || + mtrr_type == MTRR_TYPE_UNCACHABLE) { + *ret_prot = prot | _PAGE_CACHE_UC; + } else if (mtrr_type == MTRR_TYPE_WRCOMB) { + *ret_prot = prot | _PAGE_CACHE_WC; + } else { + *ret_prot = prot | _PAGE_CACHE_WB; + } + + return 0; +} + +int reserve_memtype(u64 start, u64 end, unsigned long req_type, + unsigned long *ret_type) +{ + struct memtype *new_entry = NULL; + struct memtype *parse; + unsigned long actual_type; + int err = 0; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + if (ret_type) + *ret_type = req_type; + + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { + if (ret_type) + *ret_type = _PAGE_CACHE_WB; + + return 0; + } + + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + if (err) { + if (ret_type) + *ret_type = actual_type; + + return -EINVAL; + } + + new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + new_entry->start = start; + new_entry->end = end; + new_entry->type = actual_type; + + if (ret_type) + *ret_type = actual_type; + + spin_lock(&memtype_lock); + + /* Search for existing mapping that overlaps the current range */ + list_for_each_entry(parse, &memtype_list, nd) { + struct memtype *saved_ptr; + + if (parse->start >= end) { + list_add(&new_entry->nd, parse->nd.prev); + new_entry = NULL; + break; + } + + if (start <= parse->start && end >= parse->start) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + /* No conflict. 
Go ahead and add this new entry */ + list_add(&new_entry->nd, saved_ptr->nd.prev); + new_entry = NULL; + break; + } + + if (start < parse->end) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + /* No conflict. Go ahead and add this new entry */ + list_add(&new_entry->nd, &saved_ptr->nd); + new_entry = NULL; + break; + } + } + + if (err) { + kfree(new_entry); + spin_unlock(&memtype_lock); + return err; + } + + if (new_entry) { + /* No conflict. Not yet added to the list. Add to the tail */ + list_add_tail(&new_entry->nd, &memtype_list); + } + + spin_unlock(&memtype_lock); + return err; +} + +int free_memtype(u64 start, u64 end) +{ + struct memtype *ml; + int err = -EINVAL; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + return 0; + } + + /* Low ISA region is always mapped WB. No need to track */ + if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { + return 0; + } + + spin_lock(&memtype_lock); + list_for_each_entry(ml, &memtype_list, nd) { + if (ml->start == start && ml->end == end) { + list_del(&ml->nd); + kfree(ml); + err = 0; + break; + } + } + spin_unlock(&memtype_lock); + + if (err) { + printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", + current->comm, current->pid, start, end); + } + return err; +} + diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 90feb6f2562..0d609c837a4 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -186,6 +186,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) +#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h index 3ed97144c07..af4e07f661b 100644 --- a/include/asm-x86/msr-index.h +++ b/include/asm-x86/msr-index.h @@ -57,6 +57,8 @@ #define MSR_MTRRfix4K_F8000 0x0000026f #define MSR_MTRRdefType 0x000002ff +#define MSR_IA32_CR_PAT 0x00000277 + #define MSR_IA32_DEBUGCTLMSR 0x000001d9 #define MSR_IA32_LASTBRANCHFROMIP 0x000001db #define MSR_IA32_LASTBRANCHTOIP 0x000001dc diff --git a/include/asm-x86/mtrr.h b/include/asm-x86/mtrr.h index 319d065800b..968794af93f 100644 --- a/include/asm-x86/mtrr.h +++ b/include/asm-x86/mtrr.h @@ -84,6 +84,8 @@ struct mtrr_gentry #ifdef __KERNEL__ +extern u8 mtrr_type_lookup(u64 addr, u64 end); + /* The following functions are for use by other drivers */ # ifdef CONFIG_MTRR extern void mtrr_save_fixed_ranges(void *); diff --git a/include/asm-x86/pat.h b/include/asm-x86/pat.h new file mode 100644 index 
00000000000..8b822b5a178 --- /dev/null +++ b/include/asm-x86/pat.h @@ -0,0 +1,16 @@ + +#ifndef _ASM_PAT_H +#define _ASM_PAT_H 1 + +#include + +extern int pat_wc_enabled; + +extern void pat_init(void); + +extern int reserve_memtype(u64 start, u64 end, + unsigned long req_type, unsigned long *ret_type); +extern int free_memtype(u64 start, u64 end); + +#endif + diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 9cf472aeb9c..ca6deb3de7c 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -57,6 +57,12 @@ #define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WC (_PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) + #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -- cgit v1.2.3-70-g09d2 From 55c626820a82b25d7fceca702e9422037ae80626 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Mar 2008 06:19:45 +0100 Subject: x86: revert ucminus change Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3d0a589d92c..df95d1d6b4d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -159,11 +159,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, switch (mode) { case IOR_MODE_UNCACHED: default: - /* - * FIXME: we will use UC MINUS for now, as video fb drivers - * depend on it. Upcoming ioremap_wc() will fix this behavior. - */ - prot = PAGE_KERNEL_UC_MINUS; + prot = PAGE_KERNEL_NOCACHE; break; case IOR_MODE_CACHED: prot = PAGE_KERNEL; -- cgit v1.2.3-70-g09d2 From 3a96ce8cac808fbed5493adc5c605bced28e2ca1 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:16 -0700 Subject: x86: PAT make ioremap_change_attr non-static Make ioremap_change_attr() non-static and use prot_val in place of ioremap_mode. This interface is used in subsequent PAT patches. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 29 ++++++++++++----------------- include/asm-x86/io.h | 3 +++ 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index df95d1d6b4d..2ac09a5822c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -20,11 +20,6 @@ #include #include -enum ioremap_mode { - IOR_MODE_UNCACHED, - IOR_MODE_CACHED, -}; - #ifdef CONFIG_X86_64 unsigned long __phys_addr(unsigned long x) @@ -90,18 +85,18 @@ int page_is_ram(unsigned long pagenr) * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. */ -static int ioremap_change_attr(unsigned long vaddr, unsigned long size, - enum ioremap_mode mode) +int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val) { unsigned long nrpages = size >> PAGE_SHIFT; int err; - switch (mode) { - case IOR_MODE_UNCACHED: + switch (prot_val) { + case _PAGE_CACHE_UC: default: err = set_memory_uc(vaddr, nrpages); break; - case IOR_MODE_CACHED: + case _PAGE_CACHE_WB: err = set_memory_wb(vaddr, nrpages); break; } @@ -119,7 +114,7 @@ static int ioremap_change_attr(unsigned long vaddr, unsigned long size, * caller shouldn't need to know that small detail. 
*/ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, - enum ioremap_mode mode) + unsigned long prot_val) { unsigned long pfn, offset, last_addr, vaddr; struct vm_struct *area; @@ -156,12 +151,12 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, WARN_ON_ONCE(is_ram); } - switch (mode) { - case IOR_MODE_UNCACHED: + switch (prot_val) { + case _PAGE_CACHE_UC: default: prot = PAGE_KERNEL_NOCACHE; break; - case IOR_MODE_CACHED: + case _PAGE_CACHE_WB: prot = PAGE_KERNEL; break; } @@ -186,7 +181,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, return NULL; } - if (ioremap_change_attr(vaddr, size, mode) < 0) { + if (ioremap_change_attr(vaddr, size, prot_val) < 0) { vunmap(area->addr); return NULL; } @@ -217,13 +212,13 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, */ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); + return __ioremap(phys_addr, size, _PAGE_CACHE_UC); } EXPORT_SYMBOL(ioremap_nocache); void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_CACHED); + return __ioremap(phys_addr, size, _PAGE_CACHE_WB); } EXPORT_SYMBOL(ioremap_cache); diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index 5a58b176dd6..6fa150fa68f 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -3,3 +3,6 @@ #else # include "io_64.h" #endif +extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val); + -- cgit v1.2.3-70-g09d2 From d7677d4034f040f4ce565713e0b83a31cc26f23e Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:17 -0700 Subject: x86: PAT use reserve free memtype in ioremap and iounmap Use reserve_memtype and free_memtype interfaces in ioremap/iounmap to avoid aliasing. If there is an existing alias for the region, inherit the memory type from the alias. If there are conflicting aliases for the entire region, then fail ioremap. 
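For reference, the calling convention this establishes looks roughly as
follows (a minimal sketch, not part of the patch; the helper name is
invented, and the error handling follows the convention as refined by a
later fix in this series):

	#include <asm/pat.h>
	#include <asm/pgtable.h>

	/* Hypothetical caller: claim a physical range as uncached. */
	static int example_claim_uc(u64 start, u64 end)
	{
		unsigned long ret_type;

		if (reserve_memtype(start, end, _PAGE_CACHE_UC, &ret_type))
			return -EBUSY;	/* conflict: nothing was reserved */

		if (ret_type == _PAGE_CACHE_WB) {
			/* an existing alias forces write-back, so the
			 * uncached request cannot be honored; back out */
			free_memtype(start, end);
			return -EINVAL;
		}
		/* ... establish the mapping using ret_type ... */
		return 0;
	}

Every successful reserve_memtype() must be paired with a free_memtype(),
which is why iounmap() below learns to release the reservation.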
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2ac09a5822c..20c01f2b2e1 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 @@ -118,6 +119,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, { unsigned long pfn, offset, last_addr, vaddr; struct vm_struct *area; + unsigned long new_prot_val; pgprot_t prot; /* Don't allow wraparound or zero size */ @@ -151,6 +153,28 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, WARN_ON_ONCE(is_ram); } + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + if (reserve_memtype(phys_addr, phys_addr + size, + prot_val, &new_prot_val)) { + /* + * Do not fallback to certain memory types with certain + * requested type: + * - request is uncached, return cannot be write-back + */ + if ((prot_val == _PAGE_CACHE_UC && + new_prot_val == _PAGE_CACHE_WB)) { + free_memtype(phys_addr, phys_addr + size); + return NULL; + } + prot_val = new_prot_val; + } + switch (prot_val) { case _PAGE_CACHE_UC: default: @@ -161,13 +185,6 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, break; } - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - /* * Ok, go for it.. */ @@ -177,11 +194,13 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { + free_memtype(phys_addr, phys_addr + size); free_vm_area(area); return NULL; } if (ioremap_change_attr(vaddr, size, prot_val) < 0) { + free_memtype(phys_addr, phys_addr + size); vunmap(area->addr); return NULL; } @@ -265,6 +284,8 @@ void iounmap(volatile void __iomem *addr) return; } + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + /* Finally remove it */ o = remove_vm_area((void *)addr); BUG_ON(p != o || o == NULL); -- cgit v1.2.3-70-g09d2 From 1219333dfdd488e85f08cf07881b8bc63cf92f21 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:18 -0700 Subject: x86: PAT use reserve free memtype in set_memory_uc Use reserve_memtype and free_memtype interfaces in set_memory_uc/set_memory_wb interfaces to avoid aliasing. Usage model of set_memory_uc and set_memory_wb is for RAM memory and users will first call set_memory_uc and call set_memory_wb after use to reset the attribute. 
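As a sketch, that usage model looks like this (function and buffer names
invented for illustration):

	#include <asm/cacheflush.h>

	/* Make a RAM buffer uncached for device access, then restore it. */
	static int example_use_uc_buffer(unsigned long vaddr, int numpages)
	{
		int err = set_memory_uc(vaddr, numpages); /* reserves memtype */
		if (err)
			return err;

		/* ... hand the buffer to the device, do uncached I/O ... */

		return set_memory_wb(vaddr, numpages);	/* frees the memtype */
	}

Note that set_memory_uc() now fails with -EINVAL if the range already has a
conflicting alias, so callers must check its return value.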
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 4 ++-- arch/x86/mm/pageattr.c | 21 +++++++++++++++++++-- include/asm-x86/cacheflush.h | 2 ++ 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 20c01f2b2e1..0cdb7f11ce4 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -95,10 +95,10 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, switch (prot_val) { case _PAGE_CACHE_UC: default: - err = set_memory_uc(vaddr, nrpages); + err = _set_memory_uc(vaddr, nrpages); break; case _PAGE_CACHE_WB: - err = set_memory_wb(vaddr, nrpages); + err = _set_memory_wb(vaddr, nrpages); break; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f7d5ca170c2..938df5e8402 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -19,6 +19,7 @@ #include #include #include +#include /* * The current flushing context - we pass it instead of 5 arguments: @@ -770,18 +771,34 @@ static inline int change_page_attr_clear(unsigned long addr, int numpages, return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); } -int set_memory_uc(unsigned long addr, int numpages) +int _set_memory_uc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, __pgprot(_PAGE_CACHE_UC)); } + +int set_memory_uc(unsigned long addr, int numpages) +{ + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_UC, NULL)) + return -EINVAL; + + return _set_memory_uc(addr, numpages); +} EXPORT_SYMBOL(set_memory_uc); -int set_memory_wb(unsigned long addr, int numpages) +int _set_memory_wb(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_CACHE_MASK)); } + +int set_memory_wb(unsigned long addr, int numpages) +{ + free_memtype(addr, addr + numpages * PAGE_SIZE); + + return _set_memory_wb(addr, numpages); +} EXPORT_SYMBOL(set_memory_wb); int set_memory_x(unsigned long addr, int numpages) diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h index 5396c212d8c..5676fba10a0 100644 --- a/include/asm-x86/cacheflush.h +++ b/include/asm-x86/cacheflush.h @@ -34,6 +34,8 @@ int set_pages_nx(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); +int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); -- cgit v1.2.3-70-g09d2 From ef354af4629e5cc76a3f64fc46d452f2b56d5a59 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:23 -0700 Subject: x86: PAT add set_memory_wc() interface Add a set_memory_wc interface(), similar to set_memory_uc interface. Callers has to call set_memory_uc, set_memory_wb and set_memory_wc, set_memory_wb as pairs. 
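A sketch of the pairing rule stated above (names invented):

	#include <asm/cacheflush.h>

	static int example_use_wc_buffer(unsigned long vaddr, int numpages)
	{
		int err = set_memory_wc(vaddr, numpages);
		if (err)
			return err;

		/* ... burst writes here benefit from write-combining ... */

		return set_memory_wb(vaddr, numpages);
	}

The fallback in the patch matters for callers: without PAT,
set_memory_wc() silently degrades to set_memory_uc(), so the single
set_memory_wb() in the teardown path is correct either way.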
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 19 +++++++++++++++++++ include/asm-x86/cacheflush.h | 2 ++ 2 files changed, 21 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 938df5e8402..270cab2e603 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -787,6 +787,25 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); +int _set_memory_wc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_CACHE_WC)); +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + if (!pat_wc_enabled) + return set_memory_uc(addr, numpages); + + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL)) + return -EINVAL; + + return _set_memory_wc(addr, numpages); +} +EXPORT_SYMBOL(set_memory_wc); + int _set_memory_wb(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h index 5676fba10a0..90437d3f761 100644 --- a/include/asm-x86/cacheflush.h +++ b/include/asm-x86/cacheflush.h @@ -35,8 +35,10 @@ int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wc(unsigned long addr, int numpages); int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); +int set_memory_wc(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); -- cgit v1.2.3-70-g09d2 From b310f381d220b2c6e3fab16e8c6e4ca13eea75b2 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:24 -0700 Subject: x86: PAT add ioremap_wc() interface Introduce ioremap_wc for wc remap. 
(generic wrapper is in a later patch) Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 30 ++++++++++++++++++++++++++++++ include/asm-x86/io.h | 3 +++ include/asm-x86/pgtable.h | 2 ++ 3 files changed, 35 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0cdb7f11ce4..51cd3956c56 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -97,6 +97,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, default: err = _set_memory_uc(vaddr, nrpages); break; + case _PAGE_CACHE_WC: + err = _set_memory_wc(vaddr, nrpages); + break; case _PAGE_CACHE_WB: err = _set_memory_wb(vaddr, nrpages); break; @@ -166,8 +169,13 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, * Do not fallback to certain memory types with certain * requested type: * - request is uncached, return cannot be write-back + * - request is uncached, return cannot be write-combine + * - request is write-combine, return cannot be write-back */ if ((prot_val == _PAGE_CACHE_UC && + (new_prot_val == _PAGE_CACHE_WB || + new_prot_val == _PAGE_CACHE_WC)) || + (prot_val == _PAGE_CACHE_WC && new_prot_val == _PAGE_CACHE_WB)) { free_memtype(phys_addr, phys_addr + size); return NULL; @@ -180,6 +188,9 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, default: prot = PAGE_KERNEL_NOCACHE; break; + case _PAGE_CACHE_WC: + prot = PAGE_KERNEL_WC; + break; case _PAGE_CACHE_WB: prot = PAGE_KERNEL; break; @@ -235,6 +246,25 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) } EXPORT_SYMBOL(ioremap_nocache); +/** + * ioremap_wc - map memory into CPU space write combined + * @offset: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. 
+ */ +void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +{ + if (pat_wc_enabled) + return __ioremap(phys_addr, size, _PAGE_CACHE_WC); + else + return ioremap_nocache(phys_addr, size); +} +EXPORT_SYMBOL(ioremap_wc); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { return __ioremap(phys_addr, size, _PAGE_CACHE_WB); diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index 6fa150fa68f..599cad3505c 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -1,3 +1,5 @@ +#define ARCH_HAS_IOREMAP_WC + #ifdef CONFIG_X86_32 # include "io_32.h" #else @@ -5,4 +7,5 @@ #endif extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); +extern void __iomem * ioremap_wc(unsigned long offset, unsigned long size); diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index ca6deb3de7c..e814cfe96af 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -90,6 +90,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) @@ -107,6 +108,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC) #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) #define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS) #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE) -- cgit v1.2.3-70-g09d2 From 6997ab4982a29925e79f72c3a59823cf944c3529 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:25 -0700 Subject: x86: add PAT related debug prints Adds debug prints at critical code. Adds enough info in dmesg to allow us to do effective first round of analysis of any issues that may result due to PAT patch series. 
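The instrumented paths are exactly the ones a write-combining user
exercises. For instance, a framebuffer-style user of the ioremap_wc()
interface added above would produce one reserve print at map time and one
free print at iounmap() time (illustrative sketch, names invented):

	#include <asm/io.h>

	static void __iomem *fb_regs;

	static int example_map_fb(unsigned long phys, unsigned long len)
	{
		fb_regs = ioremap_wc(phys, len); /* WC with PAT, else UC */
		if (!fb_regs)
			return -ENOMEM;
		return 0;
	}

	static void example_unmap_fb(void)
	{
		iounmap(fb_regs);	/* also releases the memtype */
	}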
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 +++++- arch/x86/mm/pat.c | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 51cd3956c56..6cd3418afe7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -131,7 +131,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, return NULL; if (!phys_addr_valid(phys_addr)) { - printk(KERN_WARNING "ioremap: invalid physical address %lx\n", + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", phys_addr); WARN_ON_ONCE(1); return NULL; @@ -177,6 +177,10 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, new_prot_val == _PAGE_CACHE_WC)) || (prot_val == _PAGE_CACHE_WC && new_prot_val == _PAGE_CACHE_WB)) { + printk( + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + phys_addr, phys_addr + size, + prot_val, new_prot_val); free_memtype(phys_addr, phys_addr + size); return NULL; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7cc71d86848..57a2af36d6e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -246,6 +246,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, struct memtype *saved_ptr; if (parse->start >= end) { + printk("New Entry\n"); list_add(&new_entry->nd, parse->nd.prev); new_entry = NULL; break; @@ -295,6 +296,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } + printk("Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, saved_ptr->nd.prev); new_entry = NULL; @@ -345,6 +348,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } + printk("Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, &saved_ptr->nd); new_entry = NULL; @@ -353,6 +358,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (err) { + printk( + "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(new_entry->type), + cattr_name(req_type)); kfree(new_entry); spin_unlock(&memtype_lock); return err; @@ -361,6 +370,19 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_entry) { /* No conflict. Not yet added to the list. Add to the tail */ list_add_tail(&new_entry->nd, &memtype_list); + printk("New Entry\n"); + } + + if (ret_type) { + printk( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type), cattr_name(*ret_type)); + } else { + printk( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type)); } spin_unlock(&memtype_lock); @@ -397,6 +419,8 @@ int free_memtype(u64 start, u64 end) printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", current->comm, current->pid, start, end); } + + printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); return err; } -- cgit v1.2.3-70-g09d2 From 35605a1027ac630f85a1b95684f7e86b82498cd6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 24 Mar 2008 16:02:01 -0700 Subject: x86: enable PAT for amd k8 and fam10h make known_pat_cpu to think amd k8 and fam10h is ok too. 
also make tom2 below to be WRBACK Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 17 +++++++++++++++++ arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/mm/pat.c | 6 ++++++ include/asm-x86/mtrr.h | 1 + 4 files changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 011e07e99cd..74ec2ea4ed3 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -37,6 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = { static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; static int mtrr_state_set; +static u64 tom2; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." @@ -138,6 +139,11 @@ u8 mtrr_type_lookup(u64 start, u64 end) } } + if (tom2) { + if (start >= (1ULL<<32) && (end < tom2)) + return MTRR_TYPE_WRBACK; + } + if (prev_match != 0xFF) return prev_match; @@ -206,6 +212,15 @@ void __init get_mtrr_state(void) mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; + if (amd_special_default_mtrr()) { + unsigned lo, hi; + /* TOP_MEM2 */ + rdmsr(MSR_K8_TOP_MEM2, lo, hi); + tom2 = hi; + tom2 <<= 32; + tom2 |= lo; + tom2 &= 0xffffff8000000ULL; + } if (mtrr_show) { int high_width; @@ -236,6 +251,8 @@ void __init get_mtrr_state(void) else printk(KERN_INFO "MTRR %u disabled\n", i); } + if (tom2) + printk(KERN_INFO "TOM2: %016lx aka %ldM\n", tom2, tom2>>20); } mtrr_state_set = 1; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index a6450b3ae75..6a1e278d932 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -627,7 +627,7 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup); #define Tom2Enabled (1U << 21) #define Tom2ForceMemTypeWB (1U << 22) -static __init int amd_special_default_mtrr(void) +int __init amd_special_default_mtrr(void) { u32 l, h; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 57a2af36d6e..0648a2225b0 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -47,6 +47,12 @@ static int pat_known_cpu(void) return 1; } } + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x11) { + if (cpu_has_pat) { + return 1; + } + } pat_wc_enabled = 0; printk(KERN_INFO "CPU and/or kernel does not support PAT.\n"); diff --git a/include/asm-x86/mtrr.h b/include/asm-x86/mtrr.h index ee172296e05..d3d26625aea 100644 --- a/include/asm-x86/mtrr.h +++ b/include/asm-x86/mtrr.h @@ -99,6 +99,7 @@ extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); +extern int amd_special_default_mtrr(void); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { -- cgit v1.2.3-70-g09d2 From 9307cacad0dfe3749f00303125c6f7f0523e5616 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 24 Mar 2008 23:24:34 -0700 Subject: x86: pat cpu feature bit setting for known cpus Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 25 +++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 7 +++++++ arch/x86/mm/pat.c | 15 ++------------- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0dd87b8d670..d999d7833bc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -309,6 
+309,19 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) } + clear_cpu_cap(c, X86_FEATURE_PAT); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (c->x86 >= 0xf && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_PAT); + break; + case X86_VENDOR_INTEL: + if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) + set_cpu_cap(c, X86_FEATURE_PAT); + break; + } + } /* @@ -397,6 +410,18 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); } + clear_cpu_cap(c, X86_FEATURE_PAT); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (c->x86 >= 0xf && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_PAT); + break; + case X86_VENDOR_INTEL: + if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) + set_cpu_cap(c, X86_FEATURE_PAT); + break; + } } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 20034493b5a..c6fe1e4bc7c 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -962,12 +962,19 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + + clear_cpu_cap(c, X86_FEATURE_PAT); + switch (c->x86_vendor) { case X86_VENDOR_AMD: early_init_amd(c); + if (c->x86 >= 0xf && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_PAT); break; case X86_VENDOR_INTEL: early_init_intel(c); + if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) + set_cpu_cap(c, X86_FEATURE_PAT); break; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 0648a2225b0..72c0f609740 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -40,19 +40,8 @@ static int pat_known_cpu(void) if (!pat_wc_enabled) return 0; - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - (boot_cpu_data.x86 == 0xF || - (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model >= 15))) { - if (cpu_has_pat) { - return 1; - } - } - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x11) { - if (cpu_has_pat) { - return 1; - } - } + if (cpu_has_pat) + return 1; pat_wc_enabled = 0; printk(KERN_INFO "CPU and/or kernel does not support PAT.\n"); -- cgit v1.2.3-70-g09d2 From dee7cbb210fdd266ad81af4689bcbac3649f38ff Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Mon, 24 Mar 2008 14:39:55 -0700 Subject: x86: PAT bug fix for attribute type check after reserve_memtype Bug fixes for reserve_memtype() call in __ioremap and pci_mmap_page_range(). If reserve_memtype returns non-zero, then it is an error and subsequent free is not required. Requested and returned prot value check should be done when reserve_memtype returns success. 
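In code form, the corrected calling pattern is (sketch, names invented):

	static int example_reserve(u64 start, u64 end, unsigned long req)
	{
		unsigned long got;
		int retval;

		retval = reserve_memtype(start, end, req, &got);
		if (retval)
			return retval; /* error: nothing reserved, don't free */

		if (req != got) {
			/* reservation succeeded with a different type:
			 * only now is the requested-vs-returned check
			 * meaningful; adopt 'got' or free_memtype() and
			 * bail out, depending on the combination */
		}
		return 0;
	}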
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 11 +++++++++-- arch/x86/pci/i386.c | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6cd3418afe7..3f7f05e2c43 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -124,6 +124,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, struct vm_struct *area; unsigned long new_prot_val; pgprot_t prot; + int retval; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -163,8 +164,14 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; - if (reserve_memtype(phys_addr, phys_addr + size, - prot_val, &new_prot_val)) { + retval = reserve_memtype(phys_addr, phys_addr + size, + prot_val, &new_prot_val); + if (retval) { + printk("reserve_memtype returned %d\n", retval); + return NULL; + } + + if (prot_val != new_prot_val) { /* * Do not fallback to certain memory types with certain * requested type: diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 4ebf52f6b1f..2ead7236307 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -328,6 +328,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, unsigned long len = vma->vm_end - vma->vm_start; unsigned long flags; unsigned long new_flags; + int retval; /* I/O space cannot be accessed via normal processor loads and * stores on this platform. @@ -344,7 +345,11 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, vma->vm_page_prot = __pgprot(prot); flags = pgprot_val(vma->vm_page_prot) & _PAGE_CACHE_MASK; - if (reserve_memtype(addr, addr + len, flags, &new_flags)) { + retval = reserve_memtype(addr, addr + len, flags, &new_flags); + if (retval) + return retval; + + if (flags != new_flags) { /* * Do not fallback to certain memory types with certain * requested type: -- cgit v1.2.3-70-g09d2 From b450e5e816d10893e17f57b3eb9d29c52635862a Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Tue, 25 Mar 2008 16:51:26 -0700 Subject: x86: PAT bug fix for attribute type check after reserve_memtype, debug Make the PAT related printks in ioremap pr_debug. 
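pr_debug() is compiled out unless the translation unit defines DEBUG, so
these messages cost nothing in production builds. Simplified from the
<linux/kernel.h> of this era (sketch):

	#ifdef DEBUG
	#define pr_debug(fmt, arg...) \
		printk(KERN_DEBUG fmt, ##arg)
	#else
	#define pr_debug(fmt, arg...) \
		do { } while (0)
	#endif

To get the ioremap diagnostics back, one would build the file with DEBUG
defined, e.g. via a per-object kbuild flag such as
CFLAGS_ioremap.o := -DDEBUG in arch/x86/mm/Makefile (illustrative, not
part of the patch).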
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3f7f05e2c43..7338c5d3dd3 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -167,7 +167,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, retval = reserve_memtype(phys_addr, phys_addr + size, prot_val, &new_prot_val); if (retval) { - printk("reserve_memtype returned %d\n", retval); + pr_debug("Warning: reserve_memtype returned %d\n", retval); return NULL; } @@ -184,7 +184,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, new_prot_val == _PAGE_CACHE_WC)) || (prot_val == _PAGE_CACHE_WC && new_prot_val == _PAGE_CACHE_WB)) { - printk( + pr_debug( "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", phys_addr, phys_addr + size, prot_val, new_prot_val); -- cgit v1.2.3-70-g09d2 From c64df70793a9c344874eb4af19f85e0662d2d3ee Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 21 Mar 2008 18:56:19 -0700 Subject: x86: memtest bootparam Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 5 +++++ arch/x86/Kconfig | 29 +++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 10 +++++----- arch/x86/mm/init_64.c | 24 +++++++++++++++++------- 4 files changed, 56 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43c527d72f2..f9ea0803d5d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1147,6 +1147,11 @@ and is between 256 and 4096 characters. It is defined in the file or memmap=0x10000$0x18690000 + memtest= [KNL,X86_64] Enable memtest + Format: + range: 0,4 : pattern number + default : 0 + meye.*= [HW] Set MotionEye Camera parameters See Documentation/video4linux/meye.txt. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e50e38e76d9..a0d7406e8b3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -382,6 +382,35 @@ config PARAVIRT endif +config MEMTEST_BOOTPARAM + bool "Memtest boot parameter" + depends on X86_64 + default y + help + This option adds a kernel parameter 'memtest', which allows memtest + to be disabled at boot. If this option is selected, memtest + functionality can be disabled with memtest=0 on the kernel + command line. The purpose of this option is to allow a single + kernel image to be distributed with memtest built in, but not + necessarily enabled. + + If you are unsure how to answer this question, answer Y. + +config MEMTEST_BOOTPARAM_VALUE + int "Memtest boot parameter default value (0-4)" + depends on MEMTEST_BOOTPARAM + range 0 4 + default 0 + help + This option sets the default value for the kernel parameter + 'memtest', which allows memtest to be disabled at boot. If this + option is set to 0 (zero), the memtest kernel parameter will + default to 0, disabling memtest at bootup. If this option is + set to 4, the memtest kernel parameter will default to 4, + enabling memtest at bootup, and use that as pattern number. + + If you are unsure how to answer this question, answer 0. 
+ config ACPI_SRAT def_bool y depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 9184e6437c4..d6ada083387 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -241,7 +241,9 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, /* * Find next free range after *start */ -unsigned long __init find_e820_area_size(unsigned long start, unsigned long *sizep, unsigned long align) +unsigned long __init find_e820_area_size(unsigned long start, + unsigned long *sizep, + unsigned long align) { int i; @@ -254,17 +256,15 @@ unsigned long __init find_e820_area_size(unsigned long start, unsigned long *siz continue; addr = round_up(ei->addr, align); ei_last = ei->addr + ei->size; -// printk(KERN_DEBUG "find_e820_area_size : e820 %d [%llx, %lx]\n", i, ei->addr, ei_last); if (addr < start) addr = round_up(start, align); -// printk(KERN_DEBUG "find_e820_area_size : 0 [%lx, %lx]\n", addr, ei_last); if (addr >= ei_last) continue; *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && addr+ *sizep <= ei_last) + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) ; last = addr + *sizep; -// printk(KERN_DEBUG "find_e820_area_size : 1 [%lx, %lx]\n", addr, last); if (last > ei_last) continue; return addr; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 52f54ee4559..ae225c3ae9a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -427,7 +427,10 @@ static void __init init_gbpages(void) direct_gbpages = 0; } -static void __init memtest(unsigned long start_phys, unsigned long size, unsigned pattern) +#ifdef CONFIG_MEMTEST_BOOTPARAM + +static void __init memtest(unsigned long start_phys, unsigned long size, + unsigned pattern) { unsigned long i; unsigned long *start; @@ -486,11 +489,12 @@ static void __init memtest(unsigned long start_phys, unsigned long size, unsigne } -static int __initdata memtest_pattern; +static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; + static int __init parse_memtest(char *arg) { if (arg) - memtest_pattern = simple_strtoul(arg, NULL, 0) + 1; + memtest_pattern = simple_strtoul(arg, NULL, 0); return 0; } @@ -501,8 +505,10 @@ static void __init early_memtest(unsigned long start, unsigned long end) unsigned long t_start, t_size; unsigned pattern; - if (memtest_pattern) - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); for (pattern = 0; pattern < memtest_pattern; pattern++) { t_start = start; t_size = 0; @@ -523,9 +529,13 @@ static void __init early_memtest(unsigned long start, unsigned long end) t_start += t_size; } } - if (memtest_pattern) - printk(KERN_CONT "\n"); + printk(KERN_CONT "\n"); } +#else +static void __init early_memtest(unsigned long start, unsigned long end) +{ +} +#endif /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. 
-- cgit v1.2.3-70-g09d2 From dcfe946520719943fabd3e5ed13813956e48e37c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 15 Apr 2008 23:17:42 -0700 Subject: x86: fix memtest print out Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ae225c3ae9a..210243e94d8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -473,7 +473,7 @@ static void __init memtest(unsigned long start_phys, unsigned long size, last_bad += incr; } else { if (start_bad) { - printk(KERN_INFO " %016lxx bad mem addr %016lx - %016lx reserved\n", + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", val, start_bad, last_bad + incr); reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); } @@ -482,7 +482,7 @@ static void __init memtest(unsigned long start_phys, unsigned long size, } } if (start_bad) { - printk(KERN_INFO " %016lx bad mem addr %016lx - %016lx reserved\n", + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", val, start_bad, last_bad + incr); reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); } -- cgit v1.2.3-70-g09d2 From 7d1116a92d709c22e7db910724c9fcd2001b0499 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 12 Mar 2008 03:53:27 +0100 Subject: x86: implement true end_pfn_mapped for 32bit Even on 32bit 2MB pages can map more memory than is in the true max_low_pfn if end_pfn is not highmem and not aligned to 2MB. Add a end_pfn_map similar to x86-64 that accounts for this fact. This is important for code that really needs to know about all mapping aliases. Signed-off-by: Andi Kleen Cc: andreas.herrmann3@amd.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++++ include/asm-x86/page.h | 4 +++- include/asm-x86/page_64.h | 1 - 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 00168e65688..73dd0601166 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -51,6 +51,8 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; +unsigned long end_pfn_map; + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; @@ -194,6 +196,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; + end_pfn_map = pfn; continue; } pte = one_page_table_init(pmd); @@ -208,6 +211,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pte(pte, pfn_pte(pfn, prot)); } + end_pfn_map = pfn; } } } diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index a05b2896492..b734939916c 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -36,7 +36,7 @@ #define max_pfn_mapped end_pfn_map #else #include -#define max_pfn_mapped max_low_pfn +#define max_pfn_mapped end_pfn_map #endif /* CONFIG_X86_64 */ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) @@ -50,6 +50,8 @@ extern int page_is_ram(unsigned long pagenr); +extern unsigned long end_pfn_map; + struct page; static inline void clear_user_page(void *page, unsigned long vaddr, diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index f156778f707..54d5db63485 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -59,7 +59,6 @@ void clear_page(void *page); void copy_page(void *to, void *from); extern unsigned long end_pfn; -extern unsigned 
long end_pfn_map; extern unsigned long phys_base; extern unsigned long __phys_addr(unsigned long); -- cgit v1.2.3-70-g09d2 From 67794292c8615b05f46419ba8d4fd99e7c9a5db9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Mar 2008 21:27:10 +0100 Subject: x86: replace the now useless max_pfn_mapped define Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/e820_64.c | 28 ++++++++++++++-------------- arch/x86/kernel/setup_64.c | 2 +- arch/x86/mm/init_32.c | 6 +++--- arch/x86/mm/init_64.c | 2 +- include/asm-x86/page.h | 4 +--- 6 files changed, 21 insertions(+), 23 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 956b60f3ebd..e277c370246 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -115,7 +115,7 @@ char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) if (!phys_addr || !size) return NULL; - if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE) + if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE) return __va(phys_addr); return NULL; diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index d6ada083387..a720f3d5ed9 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -36,11 +36,11 @@ struct e820map e820; unsigned long end_pfn; /* - * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. - * The direct mapping extends to end_pfn_map, so that we can directly access + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. */ -unsigned long end_pfn_map; +unsigned long max_pfn_mapped; /* * Last pfn which the user wants to use. 
@@ -281,16 +281,16 @@ unsigned long __init e820_end_of_ram(void) end_pfn = find_max_pfn_with_active_regions(); - if (end_pfn > end_pfn_map) - end_pfn_map = end_pfn; - if (end_pfn_map > MAXMEM>>PAGE_SHIFT) - end_pfn_map = MAXMEM>>PAGE_SHIFT; + if (end_pfn > max_pfn_mapped) + max_pfn_mapped = end_pfn; + if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT) + max_pfn_mapped = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > max_pfn_mapped) + end_pfn = max_pfn_mapped; - printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); + printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped); return end_pfn; } @@ -366,9 +366,9 @@ static int __init e820_find_active_region(const struct e820entry *ei, if (*ei_startpfn >= *ei_endpfn) return 0; - /* Check if end_pfn_map should be updated */ - if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) - end_pfn_map = *ei_endpfn; + /* Check if max_pfn_mapped should be updated */ + if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped) + max_pfn_mapped = *ei_endpfn; /* Skip if map is outside the node */ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || @@ -759,7 +759,7 @@ static int __init parse_memmap_opt(char *p) saved_max_pfn = e820_end_of_ram(); remove_all_active_ranges(); #endif - end_pfn_map = 0; + max_pfn_mapped = 0; e820.nr_map = 0; userdef = 1; return 0; diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index c6fe1e4bc7c..413b8fc3154 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -347,7 +347,7 @@ void __init setup_arch(char **cmdline_p) check_efer(); - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT)); if (efi_enabled) efi_init(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 73dd0601166..fc3ace2e88f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -51,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -unsigned long end_pfn_map; +unsigned long max_pfn_mapped; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; @@ -196,7 +196,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; - end_pfn_map = pfn; + max_pfn_mapped = pfn; continue; } pte = one_page_table_init(pmd); @@ -211,7 +211,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pte(pte, pfn_pte(pfn, prot)); } - end_pfn_map = pfn; + max_pfn_mapped = pfn; } } } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 210243e94d8..ef9e9cfb1fc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -791,7 +791,7 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * This can happen with kdump kernels when accessing * firmware tables: */ - if (pfn < end_pfn_map) + if (pfn < max_pfn_mapped) return; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index b734939916c..6724a4bc6b7 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -33,10 +33,8 @@ #ifdef CONFIG_X86_64 #include -#define max_pfn_mapped end_pfn_map #else #include -#define max_pfn_mapped end_pfn_map #endif /* CONFIG_X86_64 */ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) @@ -50,7 +48,7 @@ extern int page_is_ram(unsigned long pagenr); -extern unsigned long end_pfn_map; +extern unsigned long max_pfn_mapped; struct page; -- cgit v1.2.3-70-g09d2 From 
cc6150321903ca4c3bc9d53b0cdafb05d77d64d0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 12 Mar 2008 03:53:28 +0100 Subject: x86: account overlapped mappings in max_pfn_mapped When end_pfn is not aligned to 2MB (or 1GB) then the kernel might map more memory than end_pfn. Account this in max_pfn_mapped. Signed-off-by: Andi Kleen Cc: andreas.herrmann3@amd.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_64.c | 2 +- arch/x86/mm/init_64.c | 34 +++++++++++++++++++++++----------- include/asm-x86/page_64.h | 3 +++ include/asm-x86/proto.h | 2 -- 4 files changed, 27 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 413b8fc3154..3d76dbd9f2c 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -347,7 +347,7 @@ void __init setup_arch(char **cmdline_p) check_efer(); - init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT)); + max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT)); if (efi_enabled) efi_init(); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ef9e9cfb1fc..1076097dcab 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -313,7 +313,7 @@ __meminit void early_iounmap(void *addr, unsigned long size) __flush_tlb_all(); } -static void __meminit +static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { int i = pmd_index(address); @@ -335,21 +335,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } + return address; } -static void __meminit +static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { pmd_t *pmd = pmd_offset(pud, 0); + unsigned long last_map_addr; + spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); + last_map_addr = phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); + return last_map_addr; } -static void __meminit +static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { + unsigned long last_map_addr = end; int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { @@ -368,13 +373,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) if (pud_val(*pud)) { if (!pud_large(*pud)) - phys_pmd_update(pud, addr, end); + last_map_addr = phys_pmd_update(pud, addr, end); continue; } if (direct_gbpages) { set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + last_map_addr = (addr & PUD_MASK) + PUD_SIZE; continue; } @@ -382,12 +388,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, addr, end); + last_map_addr = phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); unmap_low_page(pmd); } __flush_tlb_all(); + + return last_map_addr >> PAGE_SHIFT; } static void __init find_early_table_space(unsigned long end) @@ -542,9 +550,9 @@ static void __init early_memtest(unsigned long start, unsigned long end) * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. 
*/ -void __init_refok init_memory_mapping(unsigned long start, unsigned long end) +unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long next; + unsigned long next, last_map_addr = end; unsigned long start_phys = start, end_phys = end; printk(KERN_INFO "init_memory_mapping\n"); @@ -577,7 +585,7 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) next = start + PGDIR_SIZE; if (next > end) next = end; - phys_pud_init(pud, __pa(start), __pa(next)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(pud); @@ -593,6 +601,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) if (!after_bootmem) early_memtest(start_phys, end_phys); + + return last_map_addr; } #ifndef CONFIG_NUMA @@ -632,11 +642,13 @@ int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size-1); + last_mapped_pfn = init_memory_mapping(start, start + size-1); + if (last_mapped_pfn > max_pfn_mapped) + max_pfn_mapped = last_mapped_pfn; ret = __add_pages(zone, start_pfn, nr_pages); WARN_ON(1); diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index 54d5db63485..6ea72859c49 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -80,6 +80,9 @@ typedef struct { pteval_t pte; } pte_t; #define vmemmap ((struct page *)VMEMMAP_START) +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h index 9da46af3b0e..1e17bcce450 100644 --- a/include/asm-x86/proto.h +++ b/include/asm-x86/proto.h @@ -7,8 +7,6 @@ extern void early_idt_handler(void); -extern void init_memory_mapping(unsigned long start, unsigned long end); - extern void system_call(void); extern void syscall_init(void); -- cgit v1.2.3-70-g09d2 From c9caa02c529d5e113e40cbc77254558fcdfa4215 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 12 Mar 2008 03:53:29 +0100 Subject: x86: add set_memory_4k to pageattr.c Add a new function to force split large pages into 4k pages. This is needed for some followup optimizations. I had to add a new field to cpa_data to pass down the information that try_preserve_large_page should not run. Right now no set_page_4k() because I didn't need it and all the specialized users I have in mind would be more comfortable with pure addresses. I also didn't export it because it's unlikely external code needs it. 
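For illustration, a minimal sketch of a call site (hypothetical here; the real first user is the check_bugs() change later in this series, which splits the large page over low memory):

	#include <asm/cacheflush.h>

	/*
	 * Force whatever large page covers the first page of the
	 * direct mapping to be split back into 4k PTEs.
	 */
	set_memory_4k((unsigned long)__va(0), 1);
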
Signed-off-by: Andi Kleen Cc: andreas.herrmann3@amd.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 20 ++++++++++++++++---- include/asm-x86/cacheflush.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 270cab2e603..7d9517abc9a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -31,6 +31,7 @@ struct cpa_data { int numpages; int flushtlb; unsigned long pfn; + unsigned force_split : 1; }; #ifdef CONFIG_X86_64 @@ -262,6 +263,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, int i, do_split = 1; unsigned int level; + if (cpa->force_split) + return 1; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page @@ -696,7 +700,8 @@ static inline int cache_attr(pgprot_t attr) } static int change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) + pgprot_t mask_set, pgprot_t mask_clr, + int force_split) { struct cpa_data cpa; int ret, cache, checkalias; @@ -707,7 +712,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, */ mask_set = canon_pgprot(mask_set); mask_clr = canon_pgprot(mask_clr); - if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ @@ -724,6 +729,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flushtlb = 0; + cpa.force_split = force_split; /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -762,13 +768,13 @@ out: static inline int change_page_attr_set(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); } static inline int change_page_attr_clear(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); } int _set_memory_uc(unsigned long addr, int numpages) @@ -847,6 +853,12 @@ int set_memory_np(unsigned long addr, int numpages) return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); } +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), + __pgprot(0), 1); +} + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h index 7ab5b520b7b..cb1d6f8fd00 100644 --- a/include/asm-x86/cacheflush.h +++ b/include/asm-x86/cacheflush.h @@ -45,6 +45,7 @@ int set_memory_nx(unsigned long addr, int numpages); int set_memory_ro(unsigned long addr, int numpages); int set_memory_rw(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); +int set_memory_4k(unsigned long addr, int numpages); void clflush_cache_range(void *addr, unsigned int size); -- cgit v1.2.3-70-g09d2 From f5c24a7fd0798d636af184cc7032e7e0cb149112 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 12 Mar 2008 03:53:30 +0100 Subject: x86: don't use large pages to map the first 2/4MB of memory Intel recommends to not use large pages for the first 1MB of the physical memory 
because there are fixed size MTRRs there which cause splitups in the TLBs. On AMD doing so is also a good idea. The implementation is a little different between 32bit and 64bit. On 32bit I just taught the initial page table set up about this because it was very simple to do. This also has the advantage that the risk of a prefetch ever seeing the page even if it only exists for a short time is minimized. On 64bit that is not quite possible, so use set_memory_4k() a little later (in check_bugs) instead. Signed-off-by: Andi Kleen Acked-by: andreas.herrmann3@amd.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/bugs_64.c | 12 ++++++++++++ arch/x86/mm/init_32.c | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c index 60207e999a0..9a3ed0649d4 100644 --- a/arch/x86/kernel/bugs_64.c +++ b/arch/x86/kernel/bugs_64.c @@ -9,6 +9,7 @@ #include #include #include +#include void __init check_bugs(void) { @@ -18,4 +19,15 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index fc3ace2e88f..1500dc8d63e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -181,8 +181,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) /* * Map with big pages if possible, otherwise * create normal page tables: + * + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. 
*/ - if (cpu_has_pse) { + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; -- cgit v1.2.3-70-g09d2 From 756a6c68556600aec9460346332884d891d5beb4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Mar 2008 08:31:17 +0100 Subject: x86: ioremap of 64-bit resource on 32-bit kernel fix Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7338c5d3dd3..c590fd200e2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -47,7 +47,7 @@ static inline int phys_addr_valid(unsigned long addr) int page_is_ram(unsigned long pagenr) { - unsigned long addr, end; + resource_size_t addr, end; int i; /* @@ -120,7 +120,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, unsigned long prot_val) { - unsigned long pfn, offset, last_addr, vaddr; + unsigned long pfn, offset, vaddr; + resource_size_t last_addr; struct vm_struct *area; unsigned long new_prot_val; pgprot_t prot; -- cgit v1.2.3-70-g09d2 From 6b6891f9c545ccd45d6d8ddfd33ce27c22c271a7 Mon Sep 17 00:00:00 2001 From: "gorcunov@gmail.com" Date: Fri, 28 Mar 2008 17:56:57 +0300 Subject: x86: cleanup - rename VM_MASK to X86_VM_MASK This patch renames VM_MASK to X86_VM_MASK (which in turn defined as alias to X86_EFLAGS_VM) to better distinguish from virtual memory flags. We can't just use X86_EFLAGS_VM instead because it is also used for conditional compilation Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 8 ++++---- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/mm/fault.c | 2 +- include/asm-x86/ptrace.h | 4 ++-- include/asm-x86/vm86.h | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 3284502a1bf..bb9107c56ff 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -498,7 +498,7 @@ do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, { struct task_struct *tsk = current; - if (regs->flags & VM_MASK) { + if (regs->flags & X86_VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; @@ -643,7 +643,7 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code) } put_cpu(); - if (regs->flags & VM_MASK) + if (regs->flags & X86_VM_MASK) goto gp_in_vm86; if (!user_mode(regs)) @@ -922,7 +922,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } - if (regs->flags & VM_MASK) + if (regs->flags & X86_VM_MASK) goto debug_vm86; /* Save debug status register where ptrace can see it */ @@ -1094,7 +1094,7 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) * Handle strange cache flush from user space exception * in all other cases. This is undocumented behaviour. 
*/ - if (regs->flags & VM_MASK) { + if (regs->flags & X86_VM_MASK) { handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); return; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 51040698c22..c866c00f4a8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -299,7 +299,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk VEFLAGS = info->regs.pt.flags; info->regs.pt.flags &= SAFE_MASK; info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; - info->regs.pt.flags |= VM_MASK; + info->regs.pt.flags |= X86_VM_MASK; switch (info->cpu_type) { case CPU_286: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 81fcbeec389..fd7e1798c75 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -639,7 +639,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_32 /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) local_irq_enable(); /* diff --git a/include/asm-x86/ptrace.h b/include/asm-x86/ptrace.h index e779f2b26b3..24ec061566c 100644 --- a/include/asm-x86/ptrace.h +++ b/include/asm-x86/ptrace.h @@ -170,7 +170,7 @@ static inline int user_mode(struct pt_regs *regs) static inline int user_mode_vm(struct pt_regs *regs) { #ifdef CONFIG_X86_32 - return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & VM_MASK)) >= + return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; #else return user_mode(regs); @@ -180,7 +180,7 @@ static inline int user_mode_vm(struct pt_regs *regs) static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 - return (regs->flags & VM_MASK); + return (regs->flags & X86_VM_MASK); #else return 0; /* No V86 mode support in long mode */ #endif diff --git a/include/asm-x86/vm86.h b/include/asm-x86/vm86.h index a2be241ed03..f5f3dc479c3 100644 --- a/include/asm-x86/vm86.h +++ b/include/asm-x86/vm86.h @@ -17,9 +17,9 @@ #define IOPL_MASK 0x00003000 #define NT_MASK 0x00004000 #ifdef CONFIG_VM86 -#define VM_MASK 0x00020000 +#define X86_VM_MASK X86_EFLAGS_VM #else -#define VM_MASK 0 /* ignored */ +#define X86_VM_MASK 0 /* No VM86 support */ #endif #define AC_MASK 0x00040000 #define VIF_MASK 0x00080000 /* virtual interrupt flag */ -- cgit v1.2.3-70-g09d2 From a65d1d644c2b65bfb99e766e7160d764b8b2bfa4 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 28 Mar 2008 14:12:08 -0500 Subject: x86: increase size of APICID Increase the number of bits in an apicid from 8 to 32. By default, MP_processor_info() gets the APICID from the mpc_config_processor structure. However, this structure limits the size of APICID to 8 bits. This patch allows the caller of MP_processor_info() to optionally pass a larger APICID that will be used instead of the one in the mpc_config_processor struct. 
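For illustration, this is how a wide id gets composed and consumed (condensed from the srat_64.c hunk in this patch; on UV systems the SAPIC EID supplies the low byte):

	int apic_id;	/* was u8 everywhere; now carries up to 32 bits */

	if (is_uv_system())
		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
	else
		apic_id = pa->apic_id;
	apicid_to_node[apic_id] = node;	/* MAX_LOCAL_APIC grows to 32768 */
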
Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse_32.c | 4 ++-- arch/x86/kernel/mpparse_64.c | 4 ++-- arch/x86/mm/srat_64.c | 6 +++++- include/asm-x86/apicdef.h | 9 ++++++--- include/asm-x86/mpspec.h | 4 ++-- 5 files changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 4b46a37e063..7b7e008496e 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -807,7 +807,7 @@ void __init mp_register_lapic_address(u64 address) Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } -void __cpuinit mp_register_lapic (u8 id, u8 enabled) +void __cpuinit mp_register_lapic (int id, u8 enabled) { if (MAX_APICS - id <= 0) { printk(KERN_WARNING "Processor #%d invalid (max %d)\n", @@ -862,7 +862,7 @@ static u8 uniq_ioapic_id(u8 id) return id; } -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c index 29d2c40e54a..4840a846904 100644 --- a/arch/x86/kernel/mpparse_64.c +++ b/arch/x86/kernel/mpparse_64.c @@ -633,7 +633,7 @@ void __init mp_register_lapic_address(u64 address) if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); } -void __cpuinit mp_register_lapic(u8 id, u8 enabled) +void __cpuinit mp_register_lapic(int id, u8 enabled) { if (!enabled) { ++disabled_cpus; @@ -683,7 +683,7 @@ static u8 uniq_ioapic_id(u8 id) return find_first_zero_bit(used, 256); } -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 04e06c8226e..1bae9c855ce 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -20,6 +20,7 @@ #include #include #include +#include int acpi_numa __initdata; @@ -148,7 +149,10 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) return; } - apic_id = pa->apic_id; + if (is_uv_system()) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", diff --git a/include/asm-x86/apicdef.h b/include/asm-x86/apicdef.h index 8b244683431..6b9008c7873 100644 --- a/include/asm-x86/apicdef.h +++ b/include/asm-x86/apicdef.h @@ -133,7 +133,7 @@ # define MAX_IO_APICS 64 #else # define MAX_IO_APICS 128 -# define MAX_LOCAL_APIC 256 +# define MAX_LOCAL_APIC 32768 #endif /* @@ -406,6 +406,9 @@ struct local_apic { #undef u32 -#define BAD_APICID 0xFFu - +#ifdef CONFIG_X86_32 + #define BAD_APICID 0xFFu +#else + #define BAD_APICID 0xFFFFu +#endif #endif diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index 31bac12a97d..1f6445b147f 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -47,9 +47,9 @@ extern void get_smp_config(void); void __cpuinit generic_processor_info(int apicid, int version); #ifdef CONFIG_ACPI -extern void mp_register_lapic(u8 id, u8 enabled); +extern void mp_register_lapic(int id, u8 enabled); extern void mp_register_lapic_address(u64 address); -extern void mp_register_ioapic(u8 id, u32 address, u32 gsi_base); +extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); -- 
cgit v1.2.3-70-g09d2 From b447a468fcd130aa8951672b6115c673c274e888 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 25 Mar 2008 15:06:51 -0700 Subject: x86: clean up non-smp usage of cpu maps Cleanup references to the early cpu maps for the non-SMP configuration and remove some functions called for SMP configurations only. Cc: Andi Kleen Cc: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 28 +++++++++++----------------- arch/x86/mm/numa_64.c | 4 +++- include/asm-x86/smp.h | 5 +++++ include/asm-x86/topology.h | 15 +++++++++++---- 4 files changed, 30 insertions(+), 22 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1179aa06cdb..dc7940955b7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -10,7 +10,7 @@ #include #include -#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_SMP) /* * Copy data used in early init routines from the initial arrays to the * per cpu data areas. These arrays then become expendable and the @@ -21,21 +21,12 @@ static void __init setup_per_cpu_maps(void) int cpu; for_each_possible_cpu(cpu) { -#ifdef CONFIG_SMP - if (per_cpu_offset(cpu)) { -#endif - per_cpu(x86_cpu_to_apicid, cpu) = - x86_cpu_to_apicid_init[cpu]; - per_cpu(x86_bios_cpu_apicid, cpu) = + per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_bios_cpu_apicid, cpu) = x86_bios_cpu_apicid_init[cpu]; #ifdef CONFIG_NUMA - per_cpu(x86_cpu_to_node_map, cpu) = + per_cpu(x86_cpu_to_node_map, cpu) = x86_cpu_to_node_map_init[cpu]; -#endif -#ifdef CONFIG_SMP - } else - printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n", - cpu); #endif } @@ -72,17 +63,20 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_cpu_mask(i, cpu_possible_map) { + + for_each_possible_cpu(i) { char *ptr; #ifndef CONFIG_NEED_MULTIPLE_NODES ptr = alloc_bootmem_pages(size); #else int node = early_cpu_to_node(i); - if (!node_online(node) || !NODE_DATA(node)) + if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); + printk(KERN_INFO + "cpu %d has no node or node-local memory\n", i); + } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif @@ -96,7 +90,7 @@ void __init setup_per_cpu_areas(void) memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } - /* setup percpu data maps early */ + /* Setup percpu data maps */ setup_per_cpu_maps(); } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 18267a02e67..2ea56f48f29 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -31,13 +31,15 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; +#ifdef CONFIG_SMP int x86_cpu_to_node_map_init[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; void *x86_cpu_to_node_map_early_ptr; +EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); +#endif DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); -EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 654724c58f5..d973c11688c 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -29,10 +29,15 @@ extern int smp_num_siblings; extern unsigned int num_processors; extern cpumask_t cpu_initialized; +#ifdef CONFIG_SMP extern u16 x86_cpu_to_apicid_init[]; extern u16 x86_bios_cpu_apicid_init[]; extern void *x86_cpu_to_apicid_early_ptr; extern void *x86_bios_cpu_apicid_early_ptr; +#else +#define x86_cpu_to_apicid_early_ptr NULL +#define x86_bios_cpu_apicid_early_ptr NULL +#endif DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 8d1a1f3d21b..81a29eb08ac 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -38,8 +38,13 @@ extern int cpu_to_node_map[]; #endif DECLARE_PER_CPU(int, x86_cpu_to_node_map); + +#ifdef CONFIG_SMP extern int x86_cpu_to_node_map_init[]; extern void *x86_cpu_to_node_map_early_ptr; +#else +#define x86_cpu_to_node_map_early_ptr NULL +#endif extern cpumask_t node_to_cpumask_map[]; @@ -54,6 +59,8 @@ static inline int cpu_to_node(int cpu) } #else /* CONFIG_X86_64 */ + +#ifdef CONFIG_SMP static inline int early_cpu_to_node(int cpu) { int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; @@ -65,6 +72,9 @@ static inline int early_cpu_to_node(int cpu) else return NUMA_NO_NODE; } +#else +#define early_cpu_to_node(cpu) cpu_to_node(cpu) +#endif static inline int cpu_to_node(int cpu) { @@ -76,10 +86,7 @@ static inline int cpu_to_node(int cpu) return ((int *)x86_cpu_to_node_map_early_ptr)[cpu]; } #endif - if (per_cpu_offset(cpu)) - return per_cpu(x86_cpu_to_node_map, cpu); - else - return NUMA_NO_NODE; + return per_cpu(x86_cpu_to_node_map, cpu); } #endif /* CONFIG_X86_64 */ -- cgit v1.2.3-70-g09d2 From d1a4be630fb068f251d64b62919f143c49ca8057 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Apr 2008 21:32:22 +0200 Subject: x86 PAT: fix mmap() of holes do not return a -EINVAL when mmap()-ing PCI holes. Signed-off-by: Ingo Molnar Acked-by: Venkatesh Pallipadi Acked-by: Suresh Siddha Acked-by: H. Peter Anvin Acked-by: Thomas Gleixner Acked-by: Arjan van de Ven --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7d9517abc9a..f7823a17286 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -542,7 +542,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) repeat: kpte = lookup_address(address, &level); if (!kpte) - return primary ? -EINVAL : 0; + return 0; old_pte = *kpte; if (!pte_val(old_pte)) { -- cgit v1.2.3-70-g09d2 From cf9b111c170733dde39139e8989b676ec8b81573 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 8 Mar 2008 18:15:06 +0800 Subject: x86: remove pointless comments Remove old comments that include the old arch/i386 directory. Signed-off-by: WANG Cong Acked-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/boot/a20.c | 2 -- arch/x86/boot/apm.c | 2 -- arch/x86/boot/bitops.h | 2 -- arch/x86/boot/boot.h | 2 -- arch/x86/boot/cmdline.c | 2 -- arch/x86/boot/copy.S | 2 -- arch/x86/boot/cpucheck.c | 2 -- arch/x86/boot/edd.c | 2 -- arch/x86/boot/install.sh | 2 -- arch/x86/boot/main.c | 2 -- arch/x86/boot/mca.c | 2 -- arch/x86/boot/memory.c | 2 -- arch/x86/boot/pm.c | 2 -- arch/x86/boot/pmjump.S | 2 -- arch/x86/boot/printf.c | 2 -- arch/x86/boot/string.c | 2 -- arch/x86/boot/tty.c | 2 -- arch/x86/boot/version.c | 2 -- arch/x86/boot/video-bios.c | 2 -- arch/x86/boot/video-vesa.c | 2 -- arch/x86/boot/video-vga.c | 2 -- arch/x86/boot/video.c | 2 -- arch/x86/boot/video.h | 2 -- arch/x86/boot/voyager.c | 2 -- arch/x86/kernel/acpi/cstate.c | 2 -- arch/x86/kernel/acpi/processor.c | 2 -- arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 - arch/x86/kernel/entry_32.S | 1 - arch/x86/kernel/head_32.S | 1 - arch/x86/mach-visws/visws_apic.c | 2 -- arch/x86/mach-voyager/voyager_basic.c | 2 -- arch/x86/mach-voyager/voyager_cat.c | 2 -- arch/x86/mach-voyager/voyager_smp.c | 2 -- arch/x86/mach-voyager/voyager_thread.c | 2 -- arch/x86/mm/init_32.c | 1 - arch/x86/mm/pgtable_32.c | 4 ---- arch/x86/video/fbdev.c | 1 - 37 files changed, 71 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 31348d054fc..90943f83e84 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/a20.c - * * Enable A20 gate (return -1 on failure) */ diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index c117c7fb859..7aa6033001f 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -12,8 +12,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/apm.c - * * Get APM BIOS information */ diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 8dcc8dc7db8..878e4b9940d 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/bitops.h - * * Very simple bitops for the boot code. */ diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 09578070bfb..a34b9982c7c 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/boot.h - * * Header file for the real-mode kernel code */ diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 680408a0f46..a1d35634bce 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/cmdline.c - * * Simple command-line parser for early boot. 
*/ diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index ef127e56a3c..ef50c84e8b4 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/copy.S - * * Memory copy routines */ diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 2462c88689e..7804389ee00 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/cpucheck.c - * * Check for obligatory CPU features and abort if the features are not * present. This code should be compilable as 16-, 32- or 64-bit * code, so be very careful with types and inline assembly. diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 8721dc46a0b..d84a48ece78 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/edd.c - * * Get EDD BIOS disk information */ diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh index 88d77761d01..8d60ee15dfd 100644 --- a/arch/x86/boot/install.sh +++ b/arch/x86/boot/install.sh @@ -1,7 +1,5 @@ #!/bin/sh # -# arch/i386/boot/install.sh -# # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive # for more details. diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 7828da5cfd0..77569a4a3be 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/main.c - * * Main module for the real-mode kernel code */ diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c index 68222f2d4b6..911eaae5d69 100644 --- a/arch/x86/boot/mca.c +++ b/arch/x86/boot/mca.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/mca.c - * * Get the MCA system description table */ diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index e77d89f9e8a..acad32eb429 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/memory.c - * * Memory detection code */ diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index a93cb8bded4..328956fdb59 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/pm.c - * * Prepare the machine for transition to protected mode. */ diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S index f5402d51f7c..ab049d40a88 100644 --- a/arch/x86/boot/pmjump.S +++ b/arch/x86/boot/pmjump.S @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/pmjump.S - * * The actual transition into protected mode */ diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 7e7e890699b..c1d00c0274c 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/printf.c - * * Oh, it's a waste of space, but oh-so-yummy for debugging. This * version of printf() does not include 64-bit support. "Live with * it." 
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 481a2209778..f94b7a0c2ab 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/string.c - * * Very basic string functions */ diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index f3f14bd2637..0be77b39328 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/tty.c - * * Very simple screen I/O * XXX: Probably should add very simple serial I/O? */ diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c index c61462f7d9a..2723d9b5ce4 100644 --- a/arch/x86/boot/version.c +++ b/arch/x86/boot/version.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/version.c - * * Kernel version string */ diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 39e247e9617..49f26aaaebc 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/video-bios.c - * * Standard video BIOS modes * * We have two options for this; silent and scanned. diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 5d5a3f6e8b5..401ad998ad0 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/video-vesa.c - * * VESA text modes */ diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 330d6589a2a..40ecb8d7688 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/video-vga.c - * * Common all-VGA modes */ diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index c1c47ba069e..83598b23093 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/video.c - * * Select video mode */ diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index d69347f79e8..ee63f5d1446 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/video.h - * * Header file for the real-mode video probing code */ diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c index 6499e3239b4..433909d61e5 100644 --- a/arch/x86/boot/voyager.c +++ b/arch/x86/boot/voyager.c @@ -9,8 +9,6 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/voyager.c - * * Get the Voyager config information */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8ca3557a6d5..9366fb68d8d 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -1,6 +1,4 @@ /* - * arch/i386/kernel/acpi/cstate.c - * * Copyright (C) 2005 Intel Corporation * Venkatesh Pallipadi * - Added _PDC for SMP C-states on Intel CPUs diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 324eb0cab19..de2d2e4ebad 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -1,6 +1,4 @@ 
/* - * arch/i386/kernel/acpi/processor.c - * * Copyright (C) 2005 Intel Corporation * Venkatesh Pallipadi * - Added _PDC for platforms with Intel CPUs diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 9b7e01daa1c..1f4cc48c14c 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,5 +1,4 @@ /* - * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c * * Thermal throttle event support code (such as syslog messaging and rate * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9ba49a26dff..f0f8934fc30 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1,5 +1,4 @@ /* - * linux/arch/i386/entry.S * * Copyright (C) 1991, 1992 Linus Torvalds */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 826988a6e96..90f038af3ad 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -1,5 +1,4 @@ /* - * linux/arch/i386/kernel/head.S -- the 32-bit startup code. * * Copyright (C) 1991, 1992 Linus Torvalds * diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c index 710faf71a65..cef9cb1d15a 100644 --- a/arch/x86/mach-visws/visws_apic.c +++ b/arch/x86/mach-visws/visws_apic.c @@ -1,6 +1,4 @@ /* - * linux/arch/i386/mach-visws/visws_apic.c - * * Copyright (C) 1999 Bent Hagemark, Ingo Molnar * * SGI Visual Workstation interrupt controller diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c index 6a949e4edde..46d6f806769 100644 --- a/arch/x86/mach-voyager/voyager_basic.c +++ b/arch/x86/mach-voyager/voyager_basic.c @@ -2,8 +2,6 @@ * * Author: J.E.J.Bottomley@HansenPartnership.com * - * linux/arch/i386/kernel/voyager.c - * * This file contains all the voyager specific routines for getting * initialisation of the architecture to function. For additional * features see: diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c index 17a7904f75b..ecab9fff0fd 100644 --- a/arch/x86/mach-voyager/voyager_cat.c +++ b/arch/x86/mach-voyager/voyager_cat.c @@ -4,8 +4,6 @@ * * Author: J.E.J.Bottomley@HansenPartnership.com * - * linux/arch/i386/kernel/voyager_cat.c - * * This file contains all the logic for manipulating the CAT bus * in a level 5 machine. * diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index be7235bf105..96f60c7cd12 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -4,8 +4,6 @@ * * Author: J.E.J.Bottomley@HansenPartnership.com * - * linux/arch/i386/kernel/voyager_smp.c - * * This file provides all the same external entries as smp.c but uses * the voyager hal to provide the functionality */ diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c index c69c931818e..15464a20fb3 100644 --- a/arch/x86/mach-voyager/voyager_thread.c +++ b/arch/x86/mach-voyager/voyager_thread.c @@ -4,8 +4,6 @@ * * Author: J.E.J.Bottomley@HansenPartnership.com * - * linux/arch/i386/kernel/voyager_thread.c - * * This module provides the machine status monitor thread for the * voyager architecture. 
This allows us to monitor the machine * environment (temp, voltage, fan function) and the front panel and diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1500dc8d63e..9ec62da85fd 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1,5 +1,4 @@ /* - * linux/arch/i386/mm/init.c * * Copyright (C) 1995 Linus Torvalds * diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 3165ec0672b..6fb9e7c6893 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -1,7 +1,3 @@ -/* - * linux/arch/i386/mm/pgtable.c - */ - #include #include #include diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index 48fb38d7d2c..4db42bff8c6 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -1,5 +1,4 @@ /* - * arch/i386/video/fbdev.c - i386 Framebuffer * * Copyright (C) 2007 Antonino Daplas * -- cgit v1.2.3-70-g09d2 From 4c8337ac425b220594fec45ad6d3ac76d3ce2b90 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 10 Apr 2008 15:09:50 -0700 Subject: x86: fix arch/x86/mm/ioremap.c warning Fix printk formats in x86/mm/ioremap.c: next-20080410/arch/x86/mm/ioremap.c:137: warning: format '%llx' expects type 'long long unsigned int', but argument 2 has type 'resource_size_t' next-20080410/arch/x86/mm/ioremap.c:188: warning: format '%llx' expects type 'long long unsigned int', but argument 2 has type 'resource_size_t' next-20080410/arch/x86/mm/ioremap.c:188: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'long unsigned int' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c590fd200e2..3a4baf95e24 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -134,7 +134,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, if (!phys_addr_valid(phys_addr)) { printk(KERN_WARNING "ioremap: invalid physical address %llx\n", - phys_addr); + (unsigned long long)phys_addr); WARN_ON_ONCE(1); return NULL; } @@ -187,7 +187,8 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, new_prot_val == _PAGE_CACHE_WB)) { pr_debug( "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", - phys_addr, phys_addr + size, + (unsigned long long)phys_addr, + (unsigned long long)(phys_addr + size), prot_val, new_prot_val); free_memtype(phys_addr, phys_addr + size); return NULL; -- cgit v1.2.3-70-g09d2 From fa5c4639419668cbb18ca3d20c1253559a3b43ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Apr 2008 02:29:42 +0200 Subject: x86: rename find_max_pfn() to propagate_e820_map() this function doesnt just 'find' the max_pfn - it also has other side-effects such as registering sparse memory maps. 
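For reference, an abridged form of the renamed function (roughly as e820_32.c stands at this point in the series; details hedged) makes the side-effect explicit:

	void __init propagate_e820_map(void)
	{
		int i;

		max_pfn = 0;

		for (i = 0; i < e820.nr_map; i++) {
			unsigned long start, end;

			/* RAM? */
			if (e820.map[i].type != E820_RAM)
				continue;
			start = PFN_UP(e820.map[i].addr);
			end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
			if (start >= end)
				continue;
			if (end > max_pfn)
				max_pfn = end;
			/* the side-effect behind the rename: register
			 * sparsemem sections, not just compute max_pfn */
			memory_present(0, start, end);
		}
	}
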
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/e820_32.c | 4 ++-- arch/x86/kernel/setup_32.c | 4 ++-- arch/x86/mm/discontig_32.c | 6 +++--- include/asm-x86/e820_32.h | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 0240cd77836..ed733e7cf4e 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -475,7 +475,7 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map) /* * Find the highest page frame number we have available */ -void __init find_max_pfn(void) +void __init propagate_e820_map(void) { int i; @@ -704,7 +704,7 @@ static int __init parse_memmap(char *arg) * size before original memory map is * reset. */ - find_max_pfn(); + propagate_e820_map(); saved_max_pfn = max_pfn; #endif e820.nr_map = 0; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5b0bffb7fcc..1c4799e6871 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -812,10 +812,10 @@ void __init setup_arch(char **cmdline_p) efi_init(); /* update e820 for memory not covered by WB MTRRs */ - find_max_pfn(); + propagate_e820_map(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - find_max_pfn(); + propagate_e820_map(); max_low_pfn = setup_memory(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index eba0bbede7a..18378850e25 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void) printk("NUMA - single node, flat memory mode\n"); /* Run the memory configuration and find the top of memory. */ - find_max_pfn(); + propagate_e820_map(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); @@ -134,7 +134,7 @@ int __init get_memcfg_numa_flat(void) /* * Find the highest page frame number we have available for the node */ -static void __init find_max_pfn_node(int nid) +static void __init propagate_e820_map_node(int nid) { if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; @@ -379,7 +379,7 @@ unsigned long __init setup_memory(void) printk("High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); for_each_online_node(nid) - find_max_pfn_node(nid); + propagate_e820_map_node(nid); memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); NODE_DATA(0)->bdata = &node0_bdata; diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 43b1a8bd4b3..a9f7c6ec32b 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -24,7 +24,7 @@ extern void update_e820(void); extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); extern int e820_any_mapped(u64 start, u64 end, unsigned type); -extern void find_max_pfn(void); +extern void propagate_e820_map(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void add_memory_region(unsigned long long start, unsigned long long size, int type); -- cgit v1.2.3-70-g09d2 From 6ec6e0d9f2fd7cb6ca6bc3bfab5ae7b5cdd8c36f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 25 Mar 2008 10:14:35 -0700 Subject: srat, x86: add support for nodes spanning other nodes For example, If the physical address layout on a two node system with 8 GB memory is something like: node 0: 0-2GB, 4-6GB node 1: 2-4GB, 6-8GB Current kernels fail to boot/detect this NUMA topology. ACPI SRAT tables can expose such a topology which needs to be supported. 
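A worked sketch of what the per-memblk node ids buy on the layout above, assuming the hash settles on a 2GB granularity (memnode_shift == 31):

	/* node 0: 0-2GB and 4-6GB, node 1: 2-4GB and 6-8GB */
	memnodemap[0] = 0;	/* 0-2GB -> node 0 */
	memnodemap[1] = 1;	/* 2-4GB -> node 1 */
	memnodemap[2] = 0;	/* 4-6GB -> node 0 */
	memnodemap[3] = 1;	/* 6-8GB -> node 1 */

	/* lookup stays O(1):
	 * phys_to_nid(addr) == memnodemap[addr >> memnode_shift] */

Populating the map from whole-node start/end ranges instead would make the interleaved entries collide and compute_hash_shift() fail, which is the boot failure this patch addresses.
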
Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 9 +++++++++ arch/x86/mm/k8topology_64.c | 2 +- arch/x86/mm/numa_64.c | 16 +++++++++++----- arch/x86/mm/srat_64.c | 32 +++++++++++++++++++++----------- include/asm-x86/numa_64.h | 3 ++- 5 files changed, 44 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2a59dbb2824..07cf7711356 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -903,6 +903,15 @@ config X86_64_ACPI_NUMA help Enable ACPI SRAT based node topology detection. +# Some NUMA nodes have memory ranges that span +# other nodes. Even though a pfn is valid and +# between a node's start and end pfns, it may not +# reside on that node. See memmap_init_zone() +# for details. +config NODES_SPAN_OTHER_NODES + def_bool y + depends on X86_64_ACPI_NUMA + config NUMA_EMU bool "NUMA emulation" depends on X86_64 && NUMA diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 7a2ebce87df..86808e666f9 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -164,7 +164,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (!found) return -1; - memnode_shift = compute_hash_shift(nodes, 8); + memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); return -1; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ea56f48f29..cb317018635 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -60,7 +60,7 @@ unsigned long __initdata nodemap_size; * -1 if node overlap or lost ram (shift too big) */ static int __init populate_memnodemap(const struct bootnode *nodes, - int numnodes, int shift) + int numnodes, int shift, int *nodeids) { unsigned long addr, end; int i, res = -1; @@ -76,7 +76,12 @@ static int __init populate_memnodemap(const struct bootnode *nodes, do { if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; - memnodemap[addr >> shift] = i; + + if (!nodeids) + memnodemap[addr >> shift] = i; + else + memnodemap[addr >> shift] = nodeids[i]; + addr += (1UL << shift); } while (addr < end); res = 1; @@ -139,7 +144,8 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, return i; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +int __init compute_hash_shift(struct bootnode *nodes, int numnodes, + int *nodeids) { int shift; @@ -149,7 +155,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - if (populate_memnodemap(nodes, numnodes, shift) != 1) { + if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { printk(KERN_INFO "Your memory is not aligned you need to " "rebuild your kernel with a bigger NODEMAPSIZE " "shift=%d\n", shift); @@ -462,7 +468,7 @@ done: } } out: - memnode_shift = compute_hash_shift(nodes, num_nodes); + memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0; printk(KERN_ERR "No NUMA hash function found. 
NUMA emulation " diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 1bae9c855ce..fb43d89f46f 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -32,6 +32,10 @@ static struct bootnode nodes_add[MAX_NUMNODES]; static int found_add_area __initdata; int hotadd_percent __initdata = 0; +static int num_node_memblks __initdata; +static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; +static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; + /* Too small nodes confuse the VM badly. Usually they result from BIOS bugs. */ #define NODE_MIN_SIZE (4*1024*1024) @@ -41,17 +45,17 @@ static __init int setup_node(int pxm) return acpi_map_pxm_to_node(pxm); } -static __init int conflicting_nodes(unsigned long start, unsigned long end) +static __init int conflicting_memblks(unsigned long start, unsigned long end) { int i; - for_each_node_mask(i, nodes_parsed) { - struct bootnode *nd = &nodes[i]; + for (i = 0; i < num_node_memblks; i++) { + struct bootnode *nd = &node_memblk_range[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) - return i; + return memblk_nodeid[i]; if (nd->end == end && nd->start == start) - return i; + return memblk_nodeid[i]; } return -1; } @@ -258,7 +262,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) bad_srat(); return; } - i = conflicting_nodes(start, end); + i = conflicting_memblks(start, end); if (i == node) { printk(KERN_WARNING "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", @@ -283,10 +287,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) nd->end = end; } - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - nd->start, nd->end); - e820_register_active_regions(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); + printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + start, end); + e820_register_active_regions(node, start >> PAGE_SHIFT, + end >> PAGE_SHIFT); push_node_boundaries(node, nd->start >> PAGE_SHIFT, nd->end >> PAGE_SHIFT); @@ -298,6 +302,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); } + + node_memblk_range[num_node_memblks].start = start; + node_memblk_range[num_node_memblks].end = end; + memblk_nodeid[num_node_memblks] = node; + num_node_memblks++; } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -368,7 +377,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); + memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, + memblk_nodeid); if (memnode_shift < 0) { printk(KERN_ERR "SRAT: No NUMA node hash function found. Contact maintainer\n"); diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h index 32c22ae0709..22e87c9f6a8 100644 --- a/include/asm-x86/numa_64.h +++ b/include/asm-x86/numa_64.h @@ -9,7 +9,8 @@ struct bootnode { u64 end; }; -extern int compute_hash_shift(struct bootnode *nodes, int numnodes); +extern int compute_hash_shift(struct bootnode *nodes, int numblks, + int *nodeids); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -- cgit v1.2.3-70-g09d2 From 85c246ee16fe00bf7bf9e7ff09a5d17d9a83cf71 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 8 Apr 2008 13:20:50 -0300 Subject: x86: move definition to pci-dma.c Move dma_ops structure definition to pci-dma.c, where it belongs. 
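The indirection being consolidated, sketched below (the registration mirrors the temporary no_iommu_init() in this patch; the backend-choosing function name is hypothetical):

	/* one global ops pointer, defined once in pci-dma.c */
	const struct dma_mapping_ops *dma_ops;
	EXPORT_SYMBOL(dma_ops);

	/* each backend installs itself during boot */
	static int __init pick_dma_backend(void)	/* hypothetical name */
	{
		dma_ops = &pci32_dma_ops;	/* or nommu/gart/swiotlb ops */
		return 0;
	}
	fs_initcall(pick_dma_backend);
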
Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-base_32.c | 11 ++++++++--- arch/x86/kernel/pci-dma.c | 3 +++ arch/x86/mm/init_64.c | 3 --- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/pci-base_32.c b/arch/x86/kernel/pci-base_32.c index 837bbe91043..b44ea517fcf 100644 --- a/arch/x86/kernel/pci-base_32.c +++ b/arch/x86/kernel/pci-base_32.c @@ -37,7 +37,7 @@ static int pci32_map_error(dma_addr_t dma_addr) return 0; } -static const struct dma_mapping_ops pci32_dma_ops = { +const struct dma_mapping_ops pci32_dma_ops = { .map_single = pci32_map_single, .unmap_single = NULL, .map_sg = pci32_dma_map_sg, @@ -51,5 +51,10 @@ static const struct dma_mapping_ops pci32_dma_ops = { .mapping_error = pci32_map_error, }; -const struct dma_mapping_ops *dma_ops = &pci32_dma_ops; -EXPORT_SYMBOL(dma_ops); +/* this is temporary */ +int __init no_iommu_init(void) +{ + dma_ops = &pci32_dma_ops; + return 0; +} +fs_initcall(no_iommu_init); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f1c24d8e794..1323cd80387 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,5 +1,8 @@ #include +const struct dma_mapping_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); + int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1076097dcab..1ff7906a9a4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -47,9 +47,6 @@ #include #include -const struct dma_mapping_ops *dma_ops; -EXPORT_SYMBOL(dma_ops); - static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -- cgit v1.2.3-70-g09d2 From f46bdf2db25dfaff3b611c9711705645cdb03acc Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:09 -0700 Subject: numa: move large array from stack to _initdata section * Move large array "struct bootnode nodes" from stack to _initdata section to reduce amount of stack space required. Cc: H. Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ea56f48f29..3f6c53c0e00 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -380,9 +380,10 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, * Sets up the system RAM area from start_pfn to end_pfn according to the * numa=fake command-line option. 
*/ +static struct bootnode nodes[MAX_NUMNODES] __initdata; + static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - struct bootnode nodes[MAX_NUMNODES]; u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; -- cgit v1.2.3-70-g09d2 From a4928cffe6435caf427ae673131a633c1329dbf3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Apr 2008 13:20:56 +0200 Subject: "make namespacecheck" fixes Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 4 ++-- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/setup_32.c | 4 ++-- arch/x86/kernel/smpboot.c | 12 ++++++------ arch/x86/kernel/tlb_64.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/mm/dump_pagetables.c | 2 +- arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/srat_64.c | 2 +- include/asm-x86/smp.h | 1 - include/asm-x86/tsc.h | 1 - 13 files changed, 18 insertions(+), 20 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 687208190b0..8317401170b 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -902,7 +902,7 @@ void __init init_bsp_APIC(void) apic_write_around(APIC_LVT1, value); } -void __cpuinit lapic_setup_esr(void) +static void __cpuinit lapic_setup_esr(void) { unsigned long oldvalue, value, maxlvt; if (lapic_is_integrated() && !esr_disable) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 9e8e5c050c5..bf83157337e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -429,7 +429,7 @@ void __init setup_boot_APIC_clock(void) * set the DUMMY flag again and force the broadcast mode in the * clockevents layer. 
*/ -void __cpuinit check_boot_apic_timer_broadcast(void) +static void __cpuinit check_boot_apic_timer_broadcast(void) { if (!disable_apic_timer || (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) @@ -834,7 +834,7 @@ void __cpuinit setup_local_APIC(void) preempt_enable(); } -void __cpuinit lapic_setup_esr(void) +static void __cpuinit lapic_setup_esr(void) { unsigned maxlvt = lapic_get_maxlvt(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7adad088e37..77de848bd1f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -550,7 +550,7 @@ static void hard_enable_TSC(void) write_cr4(read_cr4() & ~X86_CR4_TSD); } -void enable_TSC(void) +static void enable_TSC(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOTSC)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 891af1a1b48..131c2ee7ac5 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -562,7 +562,7 @@ static void hard_enable_TSC(void) write_cr4(read_cr4() & ~X86_CR4_TSD); } -void enable_TSC(void) +static void enable_TSC(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOTSC)) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 78828b0f604..455d3c80960 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -442,7 +442,7 @@ static void __init reserve_ebda_region(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init setup_bootmem_allocator(void); +static void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) { /* @@ -477,7 +477,7 @@ static unsigned long __init setup_memory(void) return max_low_pfn; } -void __init zone_sizes_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6a925394bc7..ade371f9663 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -184,7 +184,7 @@ static void unmap_cpu_to_node(int cpu) u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; -void map_cpu_to_logical_apicid(void) +static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); int apicid = logical_smp_processor_id(); @@ -197,7 +197,7 @@ void map_cpu_to_logical_apicid(void) map_cpu_to_node(cpu, node); } -void unmap_cpu_to_logical_apicid(int cpu) +static void unmap_cpu_to_logical_apicid(int cpu) { cpu_2_logical_apicid[cpu] = BAD_APICID; unmap_cpu_to_node(cpu); @@ -211,7 +211,7 @@ void unmap_cpu_to_logical_apicid(int cpu) * Report back to the Boot Processor. * Running on AP. 
*/ -void __cpuinit smp_callin(void) +static void __cpuinit smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -436,7 +436,7 @@ valid_k7: #endif } -void __cpuinit smp_checks(void) +static void __cpuinit smp_checks(void) { if (smp_b_stepping) printk(KERN_WARNING "WARNING: SMP operation may be unreliable" @@ -565,7 +565,7 @@ void __init smp_alloc_memory(void) } #endif -void impress_friends(void) +static void impress_friends(void) { int cpu; unsigned long bogosum = 0; @@ -1287,7 +1287,7 @@ void cpu_exit_clear(void) } # endif /* CONFIG_X86_32 */ -void remove_siblinginfo(int cpu) +static void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 1558e513757..df224a8774c 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, spin_unlock(&f->tlbstate_lock); } -int __cpuinit init_smp_flush(void) +static int __cpuinit init_smp_flush(void) { int i; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index edff4c98548..61efa2f7d56 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) return 0; } -long __vsyscall(3) venosys_1(void) +static long __vsyscall(3) venosys_1(void) { return -ENOSYS; } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 6791b8334bc..2c24bea92c6 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -324,7 +324,7 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -int pt_dump_init(void) +static int pt_dump_init(void) { struct dentry *pe; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f7823a17286..c29ebd03725 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -993,7 +993,7 @@ static const struct file_operations dpa_fops = { .release = single_release, }; -int __init debug_pagealloc_proc_init(void) +static int __init debug_pagealloc_proc_init(void) { struct dentry *de; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index fb43d89f46f..3890234e5b2 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -163,7 +163,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm, apic_id, node); } -int update_end_of_memory(unsigned long end) {return -1;} +static int update_end_of_memory(unsigned long end) {return -1;} static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static inline int save_add_info(void) {return 1;} diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 62ebdec394b..1ebaa5cd311 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -199,7 +199,6 @@ static inline int hard_smp_processor_id(void) #ifdef CONFIG_HOTPLUG_CPU extern void cpu_exit_clear(void); extern void cpu_uninit(void); -extern void remove_siblinginfo(int cpu); #endif extern void smp_alloc_memory(void); diff --git a/include/asm-x86/tsc.h b/include/asm-x86/tsc.h index 0434bd8349a..d2d8eb5b55f 100644 --- a/include/asm-x86/tsc.h +++ b/include/asm-x86/tsc.h @@ -18,7 +18,6 @@ extern unsigned int cpu_khz; extern unsigned int tsc_khz; extern void disable_TSC(void); -extern void enable_TSC(void); static inline cycles_t get_cycles(void) { -- cgit v1.2.3-70-g09d2 From ae531c26c5c2a28ca1b35a75b39b3b256850f2c8 Mon Sep 17 00:00:00 
2001 From: Arjan van de Ven Date: Thu, 24 Apr 2008 23:40:47 +0200 Subject: x86: introduce /dev/mem restrictions with a config option This patch introduces a restriction on /dev/mem: Only non-memory can be read or written unless the newly introduced config option is set. The X server needs access to /dev/mem for the PCI space, but it doesn't need access to memory; both the file permissions and SELinux permissions of /dev/mem just make X effectively super-super powerful. With the exception of the BIOS area, there's just no valid app that uses /dev/mem on actual memory. Other popular users of /dev/mem are rootkits and the like. (note: mmap access of memory via /dev/mem has already been disallowed for a long time) People who want to use /dev/mem for kernel debugging can enable the config option. The restrictions of this patch have been in the Fedora and RHEL kernels for at least 4 years without any problems. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 12 ++++++++++++ arch/x86/mm/init_32.c | 19 +++++++++++++++++++ arch/x86/mm/init_64.c | 20 ++++++++++++++++++++ drivers/char/mem.c | 28 ++++++++++++++++++++++++++++ include/asm-x86/page.h | 1 + 5 files changed, 80 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 610aaecc19f..0c1890c4127 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,6 +5,18 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" +config NONPROMISC_DEVMEM + bool "Disable promiscuous /dev/mem" + default y + help + The /dev/mem file by default only allows userspace access to PCI + space and the BIOS code and data regions. This is sufficient for + dosemu and X and all common users of /dev/mem. With this config + option, you allow userspace access to all of memory, including + kernel and userspace memory. Accidental access to this is + obviously disastrous, but specific access can be used by people + debugging the kernel. + config EARLY_PRINTK bool "Early printk" if EMBEDDED default y diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ec62da85fd..39852d53901 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -227,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4..49c274ee2fb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -664,6 +664,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. 
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 20070b7c573..dcf6e31970a 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -108,6 +108,30 @@ static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) } #endif +#ifdef CONFIG_NONPROMISC_DEVMEM +static inline int range_is_allowed(unsigned long from, unsigned long to) +{ + unsigned long cursor; + + cursor = from >> PAGE_SHIFT; + while ((cursor << PAGE_SHIFT) < to) { + if (!devmem_is_allowed(cursor)) { + printk(KERN_INFO "Program %s tried to read /dev/mem " + "between %lx->%lx.\n", + current->comm, from, to); + return 0; + } + cursor++; + } + return 1; +} +#else +static inline int range_is_allowed(unsigned long from, unsigned long to) +{ + return 1; +} +#endif + /* * This funcion reads the *physical* memory. The f_pos points directly to the * memory location. @@ -157,6 +181,8 @@ static ssize_t read_mem(struct file * file, char __user * buf, */ ptr = xlate_dev_mem_ptr(p); + if (!range_is_allowed(p, p+count)) + return -EPERM; if (copy_to_user(buf, ptr, sz)) return -EFAULT; buf += sz; @@ -214,6 +240,8 @@ static ssize_t write_mem(struct file * file, const char __user * buf, */ ptr = xlate_dev_mem_ptr(p); + if (!range_is_allowed(p, p+sz)) + return -EPERM; copied = copy_from_user(ptr, buf, sz); if (copied) { written += sz - copied; diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index 6724a4bc6b7..b381f4a5a0b 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -47,6 +47,7 @@ #ifndef __ASSEMBLY__ extern int page_is_ram(unsigned long pagenr); +extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_pfn_mapped; -- cgit v1.2.3-70-g09d2 From e045fb2a988a9a1964059b0d33dbaf18d12f925f Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:15 -0700 Subject: x86: PAT avoid aliasing in /dev/mem read/write Add xlate and unxlate around /dev/mem read/write. This sets up the mapping that can be used for /dev/mem read and write without aliasing worries. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 29 +++++++++++++++++++++++++++++ drivers/char/mem.c | 32 +++++++++++++++++++++++++++----- include/asm-x86/io.h | 8 ++++++++ include/asm-x86/io_32.h | 6 ------ include/asm-x86/io_64.h | 6 ------ 5 files changed, 64 insertions(+), 17 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3a4baf95e24..caac7d5699a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -336,6 +336,35 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. 
*/ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void *)ioremap(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} + #ifdef CONFIG_X86_32 int __initdata early_ioremap_debug; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 964ff3b1cff..83495885ada 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -134,6 +134,10 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) } #endif +void __attribute__((weak)) unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ +} + /* * This funcion reads the *physical* memory. The f_pos points directly to the * memory location. @@ -176,17 +180,25 @@ static ssize_t read_mem(struct file * file, char __user * buf, sz = min_t(unsigned long, sz, count); + if (!range_is_allowed(p >> PAGE_SHIFT, count)) + return -EPERM; + /* * On ia64 if a page has been mapped somewhere as * uncached, then it must also be accessed uncached * by the kernel or data corruption may occur */ ptr = xlate_dev_mem_ptr(p); + if (!ptr) + return -EFAULT; - if (!range_is_allowed(p >> PAGE_SHIFT, count)) - return -EPERM; - if (copy_to_user(buf, ptr, sz)) + if (copy_to_user(buf, ptr, sz)) { + unxlate_dev_mem_ptr(p, ptr); return -EFAULT; + } + + unxlate_dev_mem_ptr(p, ptr); + buf += sz; p += sz; count -= sz; @@ -235,22 +247,32 @@ static ssize_t write_mem(struct file * file, const char __user * buf, sz = min_t(unsigned long, sz, count); + if (!range_is_allowed(p >> PAGE_SHIFT, sz)) + return -EPERM; + /* * On ia64 if a page has been mapped somewhere as * uncached, then it must also be accessed uncached * by the kernel or data corruption may occur */ ptr = xlate_dev_mem_ptr(p); + if (!ptr) { + if (written) + break; + return -EFAULT; + } - if (!range_is_allowed(p >> PAGE_SHIFT, sz)) - return -EPERM; copied = copy_from_user(ptr, buf, sz); if (copied) { written += sz - copied; + unxlate_dev_mem_ptr(p, ptr); if (written) break; return -EFAULT; } + + unxlate_dev_mem_ptr(p, ptr); + buf += sz; p += sz; count -= sz; diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index 7b292d38671..d5b11f60dbd 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_IO_H +#define _ASM_X86_IO_H + #define ARCH_HAS_IOREMAP_WC #ifdef CONFIG_X86_32 @@ -5,7 +8,12 @@ #else # include "io_64.h" #endif + +extern void *xlate_dev_mem_ptr(unsigned long phys); +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); +#endif /* _ASM_X86_IO_H */ diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index 509045f5fda..6e73467a4fb 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -48,12 +48,6 @@ #include -/* - * Convert a physical pointer to a virtual kernel pointer for /dev/mem - * access - */ -#define xlate_dev_mem_ptr(p) __va(p) - /* * Convert a virtual cached pointer to an uncached pointer */ diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h index c2f5eef47b8..0930bedf9e4 100644 --- a/include/asm-x86/io_64.h +++ b/include/asm-x86/io_64.h @@ -307,12 +307,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c); extern int iommu_bio_merge; #define 
 BIO_VMERGE_BOUNDARY iommu_bio_merge -/* - * Convert a physical pointer to a virtual kernel pointer for /dev/mem - * access - */ -#define xlate_dev_mem_ptr(p) __va(p) - /* * Convert a virtual cached pointer to an uncached pointer */ -- cgit v1.2.3-70-g09d2 From f0970c13b6a5b01189aeb196ebb573cf87d95839 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:20 -0700 Subject: x86: PAT phys_mem_access_prot_allowed for dev/mem mmap Introduce phys_mem_access_prot_allowed(), which checks whether the mapping is possible without any conflicts, and returns success or failure based on that. phys_mem_access_prot() by itself does not allow for a failure case. This ability to return an error is needed for PAT, where we may have aliasing conflicts. On x86, set up __HAVE_PHYS_MEM_ACCESS_PROT and move the x86-specific code out of /dev/mem into the arch-specific area. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/char/mem.c | 41 +++++++++++------------------------------ include/asm-x86/pgtable.h | 9 +++++++++ 3 files changed, 59 insertions(+), 30 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 72c0f609740..64cc0c18233 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -419,3 +419,42 @@ int free_memtype(u64 start, u64 end) return err; } + +/* /dev/mem interface. Use the previous mapping */ +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + + if (file->f_flags & O_SYNC) { + *vma_prot = pgprot_noncached(*vma_prot); + return 1; + } + +#ifdef CONFIG_X86_32 + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. 
- */ - if (file->f_flags & O_SYNC) - return 1; - return !( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability) ) - && addr >= __pa(high_memory); -#elif defined(__x86_64__) && !defined(__arch_um__) - /* - * This is broken because it can generate memory type aliases, - * which can cause cache corruptions - * But it is only available for root and we have to be bug-to-bug - * compatible with i386. - */ - if (file->f_flags & O_SYNC) - return 1; - /* same behaviour as i386. PAT always set to cached and MTRRs control the - caching behaviour. - Hopefully a full PAT implementation will fix that soon. */ - return 0; -#elif defined(CONFIG_IA64) +#if defined(CONFIG_IA64) /* * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases. */ @@ -283,6 +254,12 @@ static ssize_t write_mem(struct file * file, const char __user * buf, return written; } +int __attribute__((weak)) phys_mem_access_prot_allowed(struct file *file, + unsigned long pfn, unsigned long size, pgprot_t *vma_prot) +{ + return 1; +} + #ifndef __HAVE_PHYS_MEM_ACCESS_PROT static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) @@ -336,6 +313,10 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) if (!range_is_allowed(vma->vm_pgoff, size)) return -EPERM; + if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size, + &vma->vm_page_prot)) + return -EINVAL; + vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, size, vma->vm_page_prot); diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index f1d9f4a03f6..1902f0aed6c 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -289,6 +289,15 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +#ifndef __ASSEMBLY__ +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); +#endif + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ -- cgit v1.2.3-70-g09d2 From e7f260a276f2c9184fe753732d834b1f6fbe9f17 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:21 -0700 Subject: x86: PAT use reserve free memtype in mmap of /dev/mem Use reserve_memtype and free_memtype wrappers for /dev/mem mmaps. The memtype is slightly complicated here, given that we have to support existing X mappings. We fall back on UC_MINUS for that. 
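Since the reserve/free bracketing is the heart of this change, a small self-contained sketch of range-based attribute tracking may help. It is purely illustrative (plain user-space C, hypothetical names, no locking), not the kernel's reserve_memtype()/free_memtype() implementation:

#include <stdio.h>
#include <stdlib.h>

struct memtype_range {
	unsigned long long start, end;	/* covers [start, end) */
	int type;			/* e.g. WB, UC_MINUS, UC */
	struct memtype_range *next;
};

static struct memtype_range *memtype_list;

/* Reserve [start, end) with a type; fail on an overlap of another type. */
static int reserve_range(unsigned long long start, unsigned long long end,
			 int type)
{
	struct memtype_range *r;

	for (r = memtype_list; r; r = r->next)
		if (r->end > start && r->start < end && r->type != type)
			return -1;	/* conflicting attribute */

	r = malloc(sizeof(*r));
	if (!r)
		return -1;
	r->start = start;
	r->end = end;
	r->type = type;
	r->next = memtype_list;
	memtype_list = r;
	return 0;
}

/* Undo a successful reserve_range() when the mapping goes away. */
static void free_range(unsigned long long start, unsigned long long end)
{
	struct memtype_range **p, *r;

	for (p = &memtype_list; (r = *p) != NULL; p = &r->next) {
		if (r->start == start && r->end == end) {
			*p = r->next;
			free(r);
			return;
		}
	}
}

int main(void)
{
	printf("%d\n", reserve_range(0x1000, 0x2000, 1));	/* 0: ok */
	printf("%d\n", reserve_range(0x1800, 0x3000, 2));	/* -1: conflict */
	free_range(0x1000, 0x2000);
	return 0;
}

The vm_operations open/close hooks added below pair each mapping's lifetime with exactly one such reserve/free.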
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++----- drivers/char/mem.c | 35 ++++++++++++++- 2 files changed, 152 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 64cc0c18233..1489aafbfa7 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include int pat_wc_enabled = 1; @@ -190,6 +192,21 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, return 0; } +/* + * req_type typically has one of the following: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * req_type will have a special case value '-1', when the requester wants to inherit + * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. + * + * If ret_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If ret_type is non-null, function will return + * available type in ret_type in case of no error. In case of any error + * it will return a negative return value. + */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long *ret_type) { @@ -200,9 +217,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Only track when pat_wc_enabled */ if (!pat_wc_enabled) { - if (ret_type) - *ret_type = req_type; - + /* This is identical to page table setting without PAT */ + if (ret_type) { + if (req_type == -1) { + *ret_type = _PAGE_CACHE_WB; + } else { + *ret_type = req_type; + } + } return 0; } @@ -214,8 +236,29 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return 0; } - req_type &= _PAGE_CACHE_MASK; - err = pat_x_mtrr_type(start, end, req_type, &actual_type); + if (req_type == -1) { + /* + * Special case where caller wants to inherit from mtrr or + * existing pat mapping, defaulting to UC_MINUS in case of + * no match. + */ + u8 mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFE) { /* MTRR match error */ + err = -1; + } + + if (mtrr_type == MTRR_TYPE_WRBACK) { + req_type = _PAGE_CACHE_WB; + actual_type = _PAGE_CACHE_WB; + } else { + req_type = _PAGE_CACHE_UC_MINUS; + actual_type = _PAGE_CACHE_UC_MINUS; + } + } else { + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + } + if (err) { if (ret_type) *ret_type = actual_type; @@ -420,7 +463,14 @@ int free_memtype(u64 start, u64 end) } -/* /dev/mem interface. Use the previous mapping */ +/* + * /dev/mem mmap interface. The memtype used for mapping varies: + * - Use UC for mappings with O_SYNC flag + * - Without O_SYNC flag, if there is any conflict in reserve_memtype, + * inherit the memtype from existing mapping. + * - Else use UC_MINUS memtype (for backward compatibility with existing + * X drivers.) 
 + */ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -430,10 +480,13 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { + u64 offset = ((u64) pfn) << PAGE_SHIFT; + unsigned long flags = _PAGE_CACHE_UC_MINUS; + unsigned long ret_flags; + int retval; if (file->f_flags & O_SYNC) { - *vma_prot = pgprot_noncached(*vma_prot); - return 1; + flags = _PAGE_CACHE_UC; } #ifdef CONFIG_X86_32 @@ -451,10 +504,65 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - *vma_prot = pgprot_noncached(*vma_prot); - return 1; + flags = _PAGE_CACHE_UC; } #endif + /* + * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * Without O_SYNC, we want to get + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from conflicting mappings otherwise + */ + if (flags != _PAGE_CACHE_UC_MINUS) { + retval = reserve_memtype(offset, offset + size, flags, NULL); + } else { + retval = reserve_memtype(offset, offset + size, -1, &ret_flags); + } + + if (retval < 0) + return 0; + + flags = ret_flags; + + if (pfn <= max_pfn_mapped && + ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { + free_memtype(offset, offset + size); + printk(KERN_DEBUG + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + offset, offset + size); + return 0; + } + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); return 1; } + +void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + reserve_memtype(addr, addr + size, want_flags, &flags); + if (flags != want_flags) { + printk(KERN_DEBUG + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + addr, addr + size, + cattr_name(flags)); + } +} + +void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + + free_memtype(addr, addr + size); +} + diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 56b2fb4fbc9..e83623ead44 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -300,6 +300,35 @@ static inline int private_mapping_ok(struct vm_area_struct *vma) } #endif +void __attribute__((weak)) +map_devmem(unsigned long pfn, unsigned long len, pgprot_t prot) +{ + /* nothing. architectures can override. */ +} + +void __attribute__((weak)) +unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t prot) +{ + /* nothing. architectures can override. 
*/ +} + +static void mmap_mem_open(struct vm_area_struct *vma) +{ + map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +static void mmap_mem_close(struct vm_area_struct *vma) +{ + unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +static struct vm_operations_struct mmap_mem_ops = { + .open = mmap_mem_open, + .close = mmap_mem_close +}; + static int mmap_mem(struct file * file, struct vm_area_struct * vma) { size_t size = vma->vm_end - vma->vm_start; @@ -321,13 +350,17 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) size, vma->vm_page_prot); + vma->vm_ops = &mmap_mem_ops; + /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, - vma->vm_page_prot)) + vma->vm_page_prot)) { + unmap_devmem(vma->vm_pgoff, size, vma->vm_page_prot); return -EAGAIN; + } return 0; } -- cgit v1.2.3-70-g09d2 From 28eb559b5b0b9b51b9165a9b8faa75b0bb91ca8d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 3 Apr 2008 10:14:33 +0200 Subject: pat: cleanups Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1489aafbfa7..ef8b64b89c7 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -284,7 +284,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, struct memtype *saved_ptr; if (parse->start >= end) { - printk("New Entry\n"); + pr_debug("New Entry\n"); list_add(&new_entry->nd, parse->nd.prev); new_entry = NULL; break; @@ -386,7 +386,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - printk("Overlap at 0x%Lx-0x%Lx\n", + printk(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, &saved_ptr->nd); @@ -396,7 +396,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (err) { - printk( + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(new_entry->type), cattr_name(req_type)); @@ -408,16 +408,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_entry) { /* No conflict. Not yet added to the list. 
Add to the tail */ list_add_tail(&new_entry->nd, &memtype_list); - printk("New Entry\n"); - } + pr_debug("New Entry\n"); + } if (ret_type) { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", start, end, cattr_name(actual_type), cattr_name(req_type), cattr_name(*ret_type)); } else { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(actual_type), cattr_name(req_type)); @@ -454,11 +454,11 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (err) { - printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", current->comm, current->pid, start, end); } - printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); return err; } @@ -529,7 +529,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (pfn <= max_pfn_mapped && ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { free_memtype(offset, offset + size); - printk(KERN_DEBUG + printk(KERN_INFO "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", current->comm, current->pid, cattr_name(flags), @@ -550,7 +550,7 @@ void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) reserve_memtype(addr, addr + size, want_flags, &flags); if (flags != want_flags) { - printk(KERN_DEBUG + printk(KERN_INFO "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", current->comm, current->pid, cattr_name(want_flags), -- cgit v1.2.3-70-g09d2 From 79bf6d66abb5a20813a19dd365dfc49104f0bb88 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:54 -0700 Subject: x86: convert pgalloc_64.h from macros to inlines Convert asm-x86/pgalloc_64.h from macros into functions (#include hell prevents __*_free_tlb from being inline, but they're probably a bit big to inline anyway). 
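A quick, hedged illustration of why the inline form is preferable (generic made-up types, not the kernel's): the macro version accepts any arguments that happen to expand, while the static inline gives every parameter a real type the compiler can check.

/* self-contained sketch; mmu_gather_like/pmd_like are hypothetical names */
struct mmu_gather_like { int pages_freed; };
struct pmd_like { unsigned long entry; };

/* old style: no type checking of tlb or x at all */
#define pmd_free_macro(tlb, x)	((tlb)->pages_freed++)

/* new style: passing anything but a struct pmd_like * fails to compile */
static inline void pmd_free_inline(struct mmu_gather_like *tlb,
				   struct pmd_like *pmd)
{
	(void)pmd;	/* body elided; the typed signature is the point */
	tlb->pages_freed++;
}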
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 16 ++++++++++++++++ include/asm-x86/pgalloc_64.h | 33 ++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 15 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4..2d89b7abbc5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,6 +662,22 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + tlb_remove_page(tlb, pte); +} + +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + tlb_remove_page(tlb, virt_to_page(pud)); +} + #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index 8d6722320dc..bcf525f3fbd 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -1,16 +1,24 @@ #ifndef _X86_64_PGALLOC_H #define _X86_64_PGALLOC_H -#include #include #include +#include -#define pmd_populate_kernel(mm, pmd, pte) \ - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) -#define pud_populate(mm, pud, pmd) \ - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))) -#define pgd_populate(mm, pgd, pud) \ - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))) +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) +{ + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))); +} + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); +} + +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); +} #define pmd_pgtable(pmd) pmd_page(pmd) @@ -121,13 +129,8 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb,pte) \ -do { \ - pgtable_page_dtor((pte)); \ - tlb_remove_page((tlb), (pte)); \ -} while (0) - -#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) -#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); +extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); #endif /* _X86_64_PGALLOC_H */ -- cgit v1.2.3-70-g09d2 From 4f76cd382213b29dd3658e3e1ea47c0c2be06f3c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:55 -0700 Subject: x86: add common mm/pgtable.c Add a common arch/x86/mm/pgtable.c file for common pagetable functions. 
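One detail worth calling out in the new file below: the 64-bit pgd_list_add()/pgd_list_del() helpers use the list-under-irqsave-spinlock pattern. A minimal generic sketch of that pattern follows; tracked/tracked_lock are illustrative names, not the kernel's pgd_list/pgd_lock:

#include <linux/list.h>
#include <linux/spinlock.h>

struct tracked_item {
	struct list_head lru;
};

static LIST_HEAD(tracked);
static DEFINE_SPINLOCK(tracked_lock);

static void tracked_add(struct tracked_item *item)
{
	unsigned long flags;

	/* irqsave form: safe from contexts where IRQs may already be off */
	spin_lock_irqsave(&tracked_lock, flags);
	list_add(&item->lru, &tracked);
	spin_unlock_irqrestore(&tracked_lock, flags);
}

static void tracked_del(struct tracked_item *item)
{
	unsigned long flags;

	spin_lock_irqsave(&tracked_lock, flags);
	list_del(&item->lru);
	spin_unlock_irqrestore(&tracked_lock, flags);
}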
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/pgtable.c | 239 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/pgtable_32.c | 187 --------------------------------- include/asm-x86/pgalloc.h | 18 ++++ include/asm-x86/pgalloc_32.h | 11 -- include/asm-x86/pgalloc_64.h | 67 ------------ 6 files changed, 258 insertions(+), 266 deletions(-) create mode 100644 arch/x86/mm/pgtable.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 20941d2954e..b7b3e4c7cfc 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o + pat.o pgtable.o obj-$(CONFIG_X86_32) += pgtable_32.o diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 00000000000..d526b46ae18 --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,239 @@ +#include +#include +#include + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) + pgtable_page_ctor(pte); + return pte; +} + +#ifdef CONFIG_X86_64 +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_add(&page->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_del(&page->lru); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + unsigned boundary; + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (!pgd) + return NULL; + pgd_list_add(pgd); + /* + * Copy kernel pointers in from init. + * Could keep a freelist or slab cache of those because the kernel + * part never changes. + */ + boundary = pgd_index(__PAGE_OFFSET); + memset(pgd, 0, boundary * sizeof(pgd_t)); + memcpy(pgd + boundary, + init_level4_pgt + boundary, + (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + pgd_list_del(pgd); + free_page((unsigned long)pgd); +} +#else +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? 
USER_PTRS_PER_PGD : PTRS_PER_PGD) + +static void pgd_ctor(void *p) +{ + pgd_t *pgd = p; + unsigned long flags; + + /* Clear usermode parts of PGD */ + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { + clone_pgd_range(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void pgd_dtor(void *pgd) +{ + unsigned long flags; /* can be called from interrupt context */ + + if (SHARED_KERNEL_PMD) + return; + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(mm, pgd); + return 0; + } + + if (i >= USER_PTRS_PER_PGD) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } + + return 1; +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. 
*/ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; +} + +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) +{ +} +#endif /* CONFIG_X86_PAE */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + /* so that alloc_pd can use it */ + mm->pgd = pgd; + if (pgd) + pgd_ctor(pgd); + + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + pgd_dtor(pgd); + free_page((unsigned long)pgd); + pgd = NULL; + } + + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + free_page((unsigned long)pgd); +} +#endif diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6fb9e7c6893..b46893e45d0 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,193 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) - pgtable_page_ctor(pte); - return pte; -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -static void pgd_ctor(void *p) -{ - pgd_t *pgd = p; - unsigned long flags; - - /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); - - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } - - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static void pgd_dtor(void *pgd) -{ - unsigned long flags; /* can be called from interrupt context */ - - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -#ifdef CONFIG_X86_PAE -/* - * Mop up any pmd pages which may still be attached to the pgd. 
- * Normally they will be freed by munmap/exit_mmap, but any pmd we - * preallocate which never got a corresponding vma will need to be - * freed manually. - */ -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ - int i; - - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { - pgd_t pgd = pgdp[i]; - - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - - pgdp[i] = native_make_pgd(0); - - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - } - } -} - -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - pud_t *pud; - unsigned long addr; - int i; - - pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - pmd_t *pmd = pmd_alloc_one(mm, addr); - - if (!pmd) { - pgd_mop_up_pmds(mm, pgd); - return 0; - } - - if (i >= USER_PTRS_PER_PGD) - memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), - sizeof(pmd_t) * PTRS_PER_PMD); - - pud_populate(mm, pud, pmd); - } - - return 1; -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ -} -#endif /* CONFIG_X86_PAE */ - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - - /* so that alloc_pd can use it */ - mm->pgd = pgd; - if (pgd) - pgd_ctor(pgd); - - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; - } - - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pgd_mop_up_pmds(mm, pgd); - pgd_dtor(pgd); - free_page((unsigned long)pgd); -} - void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index 5886eed0588..ea9d27ad7f4 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -1,5 +1,23 @@ +#ifndef _ASM_X86_PGALLOC_H +#define _ASM_X86_PGALLOC_H + +#include +#include /* for struct page */ +#include + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); + #ifdef CONFIG_X86_32 # include "pgalloc_32.h" #else # include "pgalloc_64.h" #endif + +#endif /* _ASM_X86_PGALLOC_H */ diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h index 6bea6e5b5ee..d60edb14f85 100644 --- a/include/asm-x86/pgalloc_32.h +++ b/include/asm-x86/pgalloc_32.h @@ -1,12 +1,6 @@ #ifndef _I386_PGALLOC_H #define _I386_PGALLOC_H -#include -#include /* for struct page */ -#include -#include -#include - #ifdef CONFIG_PARAVIRT #include #else @@ -36,11 +30,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p /* * Allocate and free page tables. 
*/ -extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); - -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index bcf525f3fbd..23f87501ac2 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -1,8 +1,6 @@ #ifndef _X86_64_PGALLOC_H #define _X86_64_PGALLOC_H -#include -#include #include static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) @@ -49,71 +47,6 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - unsigned long flags; - - spin_lock_irqsave(&pgd_lock, flags); - list_add(&page->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - unsigned long flags; - - spin_lock_irqsave(&pgd_lock, flags); - list_del(&page->lru); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static inline pgd_t *pgd_alloc(struct mm_struct *mm) -{ - unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - if (!pgd) - return NULL; - pgd_list_add(pgd); - /* - * Copy kernel pointers in from init. - * Could keep a freelist or slab cache of those because the kernel - * part never changes. - */ - boundary = pgd_index(__PAGE_OFFSET); - memset(pgd, 0, boundary * sizeof(pgd_t)); - memcpy(pgd + boundary, - init_level4_pgt + boundary, - (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); - return pgd; -} - -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - pgd_list_del(pgd); - free_page((unsigned long)pgd); -} - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - return (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *page; - void *p; - - p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!p) - return NULL; - page = virt_to_page(p); - pgtable_page_ctor(page); - return page; -} - /* Should really implement gc for free page table pages. This could be done with a reference count in struct page. */ -- cgit v1.2.3-70-g09d2 From 1ec1fe73dfb711f9ea5a0ef8a7e3af5b6ac8b653 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Mar 2008 20:30:40 +0100 Subject: x86: xen unify x86 add common mm pgtable c fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 18 ++++++++++++++++++ include/asm-x86/pgalloc_32.h | 17 +---------------- 2 files changed, 19 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d526b46ae18..ed16b7704a3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -200,6 +200,24 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) return 1; } + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. 
*/ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); +} #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h index d60edb14f85..aaa322cb4b6 100644 --- a/include/asm-x86/pgalloc_32.h +++ b/include/asm-x86/pgalloc_32.h @@ -62,23 +62,8 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); -static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) -{ - paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT); - - /* Note: almost everything apart from _PAGE_PRESENT is - reserved at the pmd (PDPT) level. */ - set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); - /* - * According to Intel App note "TLBs, Paging-Structure Caches, - * and Their Invalidation", April 2007, document 317080-001, - * section 8.1: in PAE mode we explicitly have to flush the - * TLB via cr3 if the top-level pgd is changed... - */ - if (mm == current->active_mm) - write_cr3(read_cr3()); -} #endif /* CONFIG_X86_PAE */ #endif /* _I386_PGALLOC_H */ -- cgit v1.2.3-70-g09d2 From 1d262d3a4932b5ae7222c8d9900696650ee95188 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:56 -0700 Subject: x86: put paravirt stubs into common asm/pgalloc.h Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 2 -- include/asm-x86/pgalloc.h | 10 ++++++++++ include/asm-x86/pgalloc_32.h | 10 ---------- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f7823a17286..938130d49b7 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); -#ifdef CONFIG_X86_32 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); -#endif ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index ea9d27ad7f4..28773594f74 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -5,6 +5,16 @@ #include /* for struct page */ #include +#ifdef CONFIG_PARAVIRT +#include +#else +#define paravirt_alloc_pt(mm, pfn) do { } while (0) +#define paravirt_alloc_pd(mm, pfn) do { } while (0) +#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) +#define paravirt_release_pt(pfn) do { } while (0) +#define paravirt_release_pd(pfn) do { } while (0) +#endif + /* * Allocate and free page tables. 
*/ diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h index aaa322cb4b6..c4e7faa8961 100644 --- a/include/asm-x86/pgalloc_32.h +++ b/include/asm-x86/pgalloc_32.h @@ -1,16 +1,6 @@ #ifndef _I386_PGALLOC_H #define _I386_PGALLOC_H -#ifdef CONFIG_PARAVIRT -#include -#else -#define paravirt_alloc_pt(mm, pfn) do { } while (0) -#define paravirt_alloc_pd(mm, pfn) do { } while (0) -#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) -#define paravirt_release_pt(pfn) do { } while (0) -#define paravirt_release_pd(pfn) do { } while (0) -#endif - static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { -- cgit v1.2.3-70-g09d2 From 397f687ab7f840dbe50353c4b60108672b653d0c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:57 -0700 Subject: x86: move pte functions into common asm/pgalloc.h Common definitions for 2-level pagetable functions. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 6 ------ arch/x86/mm/pgtable.c | 7 +++++++ arch/x86/mm/pgtable_32.c | 7 ------- include/asm-x86/pgalloc.h | 16 ++++++++++++++++ include/asm-x86/pgalloc_32.h | 18 ------------------ include/asm-x86/pgalloc_64.h | 16 ---------------- 6 files changed, 23 insertions(+), 47 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2d89b7abbc5..9c09893e54f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,12 +662,6 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - tlb_remove_page(tlb, pte); -} - void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { tlb_remove_page(tlb, virt_to_page(pmd)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ed16b7704a3..83765328e26 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -21,6 +21,13 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pt(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + #ifdef CONFIG_X86_64 static inline void pgd_list_add(pgd_t *pgd) { diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b46893e45d0..b9e3dac3239 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,13 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); -} - #ifdef CONFIG_X86_PAE void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index 28773594f74..0d15e21eefe 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -24,6 +24,22 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. 
*/ + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, struct page *pte) +{ + __free_page(pte); +} + +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + #ifdef CONFIG_X86_32 # include "pgalloc_32.h" #else diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h index c4e7faa8961..96a09f56615 100644 --- a/include/asm-x86/pgalloc_32.h +++ b/include/asm-x86/pgalloc_32.h @@ -17,24 +17,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p } #define pmd_pgtable(pmd) pmd_page(pmd) -/* - * Allocate and free page tables. - */ - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - - -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); - #ifdef CONFIG_X86_PAE /* * In the PAE case we free the pmds as part of the pgd. diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index 23f87501ac2..fc3414d3ed0 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -47,22 +47,6 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -/* Should really implement gc for free page table pages. This could be - done with a reference count in struct page. */ - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); -- cgit v1.2.3-70-g09d2 From 170fdff7057d4247e3f28cca96d0db1fbc854e3b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:58 -0700 Subject: x86: move pmd functions into common asm/pgalloc.h Common definitions for 3-level pagetable functions. 
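The pmd helpers being moved into the common header encode a pte page into a pmd entry as ((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE. A standalone sketch of that encoding (illustrative only: simplified types, an illustrative _PAGE_TABLE value, and a hypothetical mk_pmd_entry() helper, not the kernel's headers):

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT   12
	#define _PAGE_TABLE  0x067ULL	/* PRESENT|RW|USER|ACCESSED|DIRTY (illustrative) */

	typedef uint64_t pteval_t;

	/* Mirrors pmd_populate(): point a pmd entry at a pte page by pfn. */
	static pteval_t mk_pmd_entry(unsigned long pfn)
	{
		return ((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE;
	}

	int main(void)
	{
		/* A pfn above 2^20 addresses memory beyond 4GB under PAE. */
		printf("pmd entry: %#llx\n",
		       (unsigned long long)mk_pmd_entry(0x123456));
		return 0;
	}

The cast to pteval_t before the shift matters: under 32-bit PAE a pfn shifted inside a plain unsigned long would truncate for memory above 4GB, which is presumably why the common pmd_populate() in the diff below casts first.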
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 5 ----- arch/x86/mm/pgtable.c | 8 ++++++++ arch/x86/mm/pgtable_32.c | 10 ---------- include/asm-x86/pgalloc.h | 33 +++++++++++++++++++++++++++++++++ include/asm-x86/pgalloc_32.h | 32 -------------------------------- include/asm-x86/pgalloc_64.h | 24 ------------------------ 6 files changed, 41 insertions(+), 71 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9c09893e54f..4465104f551 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,11 +662,6 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - tlb_remove_page(tlb, virt_to_page(pmd)); -} - void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { tlb_remove_page(tlb, virt_to_page(pud)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 83765328e26..1c41efedf6d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -28,6 +28,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) tlb_remove_page(tlb, pte); } +#if PAGETABLE_LEVELS > 2 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} +#endif /* PAGETABLE_LEVELS > 2 */ + #ifdef CONFIG_X86_64 static inline void pgd_list_add(pgd_t *pgd) { diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b9e3dac3239..9ee007be914 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,16 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -#ifdef CONFIG_X86_PAE - -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pmd)); -} - -#endif - int pmd_bad(pmd_t pmd) { WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index 0d15e21eefe..ae23839db20 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -40,6 +40,39 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + + paravirt_alloc_pt(mm, pfn); + set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +#if PAGETABLE_LEVELS > 2 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + free_page((unsigned long)pmd); +} + +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); +#endif /* PAGETABLE_LEVELS > 2 */ + #ifdef CONFIG_X86_32 # include "pgalloc_32.h" #else diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h index 96a09f56615..b83bc010af0 100644 --- a/include/asm-x86/pgalloc_32.h +++ b/include/asm-x86/pgalloc_32.h @@ -1,39 +1,7 @@ #ifndef _I386_PGALLOC_H #define _I386_PGALLOC_H 
-static inline void pmd_populate_kernel(struct mm_struct *mm, - pmd_t *pmd, pte_t *pte) -{ - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); - set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) -{ - unsigned long pfn = page_to_pfn(pte); - - paravirt_alloc_pt(mm, pfn); - set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); -} -#define pmd_pgtable(pmd) pmd_page(pmd) - #ifdef CONFIG_X86_PAE -/* - * In the PAE case we free the pmds as part of the pgd. - */ -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - free_page((unsigned long)pmd); -} - -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); - extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); #endif /* CONFIG_X86_PAE */ diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index fc3414d3ed0..50196819425 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -3,11 +3,6 @@ #include -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) -{ - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))); -} - static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); @@ -18,24 +13,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); } -#define pmd_pgtable(pmd) pmd_page(pmd) - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) -{ - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); -} - -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - free_page((unsigned long)pmd); -} - -static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) -{ - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); -} - static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); @@ -47,7 +24,6 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); #endif /* _X86_64_PGALLOC_H */ -- cgit v1.2.3-70-g09d2 From 5a5f8f42241cf09caec5530a7639cfa8dccc3a7b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:59 -0700 Subject: x86: move pgalloc pud and pgd operations into common place Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 5 ----- arch/x86/mm/pgtable.c | 7 +++++++ include/asm-x86/pgalloc.h | 36 ++++++++++++++++++++++++++++++------ include/asm-x86/pgalloc_32.h | 9 --------- include/asm-x86/pgalloc_64.h | 29 ----------------------------- 5 files changed, 37 insertions(+), 49 deletions(-) delete mode 100644 include/asm-x86/pgalloc_32.h delete mode 100644 include/asm-x86/pgalloc_64.h (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4465104f551..1ff7906a9a4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,11 +662,6 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 
#endif -void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) -{ - tlb_remove_page(tlb, virt_to_page(pud)); -} - #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1c41efedf6d..c67966e10a9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -34,6 +34,13 @@ void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } + +#if PAGETABLE_LEVELS > 3 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ #ifdef CONFIG_X86_64 diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index ae23839db20..73e5b031847 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -71,12 +71,36 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) } extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); -#endif /* PAGETABLE_LEVELS > 2 */ -#ifdef CONFIG_X86_32 -# include "pgalloc_32.h" -#else -# include "pgalloc_64.h" -#endif +#ifdef CONFIG_X86_PAE +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); +#else /* !CONFIG_X86_PAE */ +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT); + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); +} +#endif /* CONFIG_X86_PAE */ + +#if PAGETABLE_LEVELS > 3 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + free_page((unsigned long)pud); +} + +extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ #endif /* _ASM_X86_PGALLOC_H */ diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h deleted file mode 100644 index b83bc010af0..00000000000 --- a/include/asm-x86/pgalloc_32.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _I386_PGALLOC_H -#define _I386_PGALLOC_H - -#ifdef CONFIG_X86_PAE -extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); - -#endif /* CONFIG_X86_PAE */ - -#endif /* _I386_PGALLOC_H */ diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h deleted file mode 100644 index 50196819425..00000000000 --- a/include/asm-x86/pgalloc_64.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _X86_64_PGALLOC_H -#define _X86_64_PGALLOC_H - -#include - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); -} - -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) -{ - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); -} - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - free_page((unsigned long)pud); -} - -extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); - -#endif /* _X86_64_PGALLOC_H */ -- cgit v1.2.3-70-g09d2 From 
394158559d4c912cc58c311b6346cdea0ed2b1de Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:00 -0700 Subject: x86: move all the pgd_list handling to one place Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c67966e10a9..0d2866b8f42 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -43,34 +43,31 @@ void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ -#ifdef CONFIG_X86_64 static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - unsigned long flags; - spin_lock_irqsave(&pgd_lock, flags); list_add(&page->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); } static inline void pgd_list_del(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - unsigned long flags; - spin_lock_irqsave(&pgd_lock, flags); list_del(&page->lru); - spin_unlock_irqrestore(&pgd_lock, flags); } +#ifdef CONFIG_X86_64 pgd_t *pgd_alloc(struct mm_struct *mm) { unsigned boundary; pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + unsigned long flags; if (!pgd) return NULL; + spin_lock_irqsave(&pgd_lock, flags); pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); /* * Copy kernel pointers in from init. * Could keep a freelist or slab cache of those because the kernel @@ -86,8 +83,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { + unsigned long flags; BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); free_page((unsigned long)pgd); } #else @@ -101,20 +101,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) * vmalloc faults work because attached pagetables are never freed. * -- wli */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) -- cgit v1.2.3-70-g09d2 From 6944a9c8945212a0cc1de3589736d59ec542c539 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:01 -0700 Subject: x86: rename paravirt_alloc_pt etc after the pagetable structure Rename (alloc|release)_(pt|pd) to pte/pmd to explicitly match the name of the appropriate pagetable level structure. 
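The hooks being renamed live in a table of per-level function pointers with no-op defaults that a hypervisor backend overrides at boot. A minimal standalone sketch of that shape (shortened field set and hypothetical names, not the real pv_mmu_ops layout):

	#include <stdio.h>

	struct mm_struct;

	/* Hooks named after the pagetable structure they cover (pte/pmd). */
	struct mmu_hooks {
		void (*alloc_pte)(struct mm_struct *mm, unsigned pfn);
		void (*alloc_pmd)(struct mm_struct *mm, unsigned pfn);
		void (*release_pte)(unsigned pfn);
		void (*release_pmd)(unsigned pfn);
	};

	static void nop_alloc(struct mm_struct *mm, unsigned pfn)
	{
		(void)mm; (void)pfn;	/* deliberately does nothing */
	}

	static void nop_release(unsigned pfn)
	{
		(void)pfn;
	}

	static struct mmu_hooks hooks = {
		.alloc_pte   = nop_alloc,
		.alloc_pmd   = nop_alloc,
		.release_pte = nop_release,
		.release_pmd = nop_release,
	};

	/* A backend (xen, vmi) swaps in its own handler, as in the diff below. */
	static void trace_release_pte(unsigned pfn)
	{
		printf("release pte pfn %u\n", pfn);
	}

	int main(void)
	{
		hooks.release_pte = trace_release_pte;
		hooks.release_pte(42);
		return 0;
	}

Naming the fields after the level structure (pte/pmd) rather than the 32-bit table names (pt/pd) keeps them unambiguous once the same table also serves 64-bit, where "page directory" could mean any of four levels.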
[ x86.git merge work by Mark McLoughlin ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Mark McLoughlin Signed-off-by: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 10 +++++----- arch/x86/kernel/vmi_32.c | 20 ++++++++++---------- arch/x86/mm/init_32.c | 6 +++--- arch/x86/mm/ioremap.c | 2 +- arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pgtable.c | 18 +++++++++--------- arch/x86/xen/enlighten.c | 32 ++++++++++++++++---------------- include/asm-x86/paravirt.h | 32 ++++++++++++++++---------------- include/asm-x86/pgalloc.h | 16 ++++++++-------- 9 files changed, 69 insertions(+), 69 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3733412d135..362653da003 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -366,11 +366,11 @@ struct pv_mmu_ops pv_mmu_ops = { .flush_tlb_single = native_flush_tlb_single, .flush_tlb_others = native_flush_tlb_others, - .alloc_pt = paravirt_nop, - .alloc_pd = paravirt_nop, - .alloc_pd_clone = paravirt_nop, - .release_pt = paravirt_nop, - .release_pd = paravirt_nop, + .alloc_pte = paravirt_nop, + .alloc_pmd = paravirt_nop, + .alloc_pmd_clone = paravirt_nop, + .release_pte = paravirt_nop, + .release_pmd = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 12affe1f9bc..44f7ca153b7 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) { /* * This call comes in very early, before mem_map is setup. 
@@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } -static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) { vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } -static void vmi_release_pt(u32 pfn) +static void vmi_release_pte(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } -static void vmi_release_pd(u32 pfn) +static void vmi_release_pmd(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); @@ -871,15 +871,15 @@ static inline int __init activate_vmi(void) vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); if (vmi_ops.allocate_page) { - pv_mmu_ops.alloc_pt = vmi_allocate_pt; - pv_mmu_ops.alloc_pd = vmi_allocate_pd; - pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; + pv_mmu_ops.alloc_pte = vmi_allocate_pte; + pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; + pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; } vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); if (vmi_ops.release_page) { - pv_mmu_ops.release_pt = vmi_release_pt; - pv_mmu_ops.release_pd = vmi_release_pd; + pv_mmu_ops.release_pte = vmi_release_pte; + pv_mmu_ops.release_pmd = vmi_release_pmd; } /* Set linear is needed in all cases */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ec62da85fd..df490905f37 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); @@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); } - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -365,7 +365,7 @@ void __init native_pagetable_setup_start(pgd_t *base) pte_clear(NULL, va, pte); } - paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); } void __init native_pagetable_setup_done(pgd_t *base) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3a4baf95e24..36a3f7ded62 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -407,7 +407,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); - paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); + paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 938130d49b7..57e762c141f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -483,7 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = 
pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0d2866b8f42..1d44d6dd4c9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -24,14 +24,14 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); + paravirt_release_pte(page_to_pfn(pte)); tlb_remove_page(tlb, pte); } #if PAGETABLE_LEVELS > 2 void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } @@ -122,10 +122,10 @@ static void pgd_ctor(void *p) clone_pgd_range(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ @@ -166,7 +166,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) pgdp[i] = native_make_pgd(0); - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); } } @@ -211,7 +211,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { - paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT); + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ @@ -242,7 +242,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - /* so that alloc_pd can use it */ + /* so that alloc_pmd can use it */ mm->pgd = pgd; if (pgd) pgd_ctor(pgd); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0388220cf9..36f36e6b087 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -655,15 +655,15 @@ static void xen_write_cr3(unsigned long cr3) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) { BUG_ON(mem_map); /* should only be used early */ make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } -/* Early release_pt assumes that all pts are pinned, since there's +/* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. 
*/ -static void xen_release_pt_init(u32 pfn) +static void xen_release_pte_init(u32 pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -697,12 +697,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -722,12 +722,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pt(u32 pfn) +static void xen_release_pte(u32 pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pd(u32 pfn) +static void xen_release_pmd(u32 pfn) { xen_release_ptpage(pfn, PT_PMD); } @@ -849,10 +849,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base) { /* This will work as long as patching hasn't happened yet (which it hasn't) */ - pv_mmu_ops.alloc_pt = xen_alloc_pt; - pv_mmu_ops.alloc_pd = xen_alloc_pd; - pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pd; + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; setup_shared_info(); @@ -1059,11 +1059,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt_init, - .alloc_pd = xen_alloc_pt_init, - .alloc_pd_clone = paravirt_nop, - .release_pd = xen_release_pt_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd_clone = paravirt_nop, + .release_pmd = xen_release_pte_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 3d419398499..c4480b9bda5 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -220,11 +220,11 @@ struct pv_mmu_ops { unsigned long va); /* Hooks for allocating/releasing pagetable pages */ - void (*alloc_pt)(struct mm_struct *mm, u32 pfn); - void (*alloc_pd)(struct mm_struct *mm, u32 pfn); - void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); - void (*release_pt)(u32 pfn); - void (*release_pd)(u32 pfn); + void (*alloc_pte)(struct mm_struct *mm, u32 pfn); + void (*alloc_pmd)(struct mm_struct *mm, u32 pfn); + void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); + void (*release_pte)(u32 pfn); + void (*release_pmd)(u32 pfn); /* Pagetable manipulation functions */ void (*set_pte)(pte_t *ptep, pte_t pteval); @@ -910,28 +910,28 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); } -static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn) +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned pfn) { - PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn); + PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn); } -static inline void paravirt_release_pt(unsigned pfn) +static inline void paravirt_release_pte(unsigned pfn) { - PVOP_VCALL1(pv_mmu_ops.release_pt, pfn); + PVOP_VCALL1(pv_mmu_ops.release_pte, pfn); } -static inline void paravirt_alloc_pd(struct mm_struct *mm, unsigned pfn) +static inline void paravirt_alloc_pmd(struct mm_struct *mm, 
unsigned pfn) { - PVOP_VCALL2(pv_mmu_ops.alloc_pd, mm, pfn); + PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); } -static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn, - unsigned start, unsigned count) +static inline void paravirt_alloc_pmd_clone(unsigned pfn, unsigned clonepfn, + unsigned start, unsigned count) { - PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count); + PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count); } -static inline void paravirt_release_pd(unsigned pfn) +static inline void paravirt_release_pmd(unsigned pfn) { - PVOP_VCALL1(pv_mmu_ops.release_pd, pfn); + PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); } #ifdef CONFIG_HIGHPTE diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index 73e5b031847..a25d5402987 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -8,11 +8,11 @@ #ifdef CONFIG_PARAVIRT #include #else -#define paravirt_alloc_pt(mm, pfn) do { } while (0) -#define paravirt_alloc_pd(mm, pfn) do { } while (0) -#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) -#define paravirt_release_pt(pfn) do { } while (0) -#define paravirt_release_pd(pfn) do { } while (0) +#define paravirt_alloc_pte(mm, pfn) do { } while (0) +#define paravirt_alloc_pmd(mm, pfn) do { } while (0) +#define paravirt_alloc_pmd_clone(pfn, clonepfn, start, count) do { } while (0) +#define paravirt_release_pte(pfn) do { } while (0) +#define paravirt_release_pmd(pfn) do { } while (0) #endif /* @@ -43,7 +43,7 @@ extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } @@ -52,7 +52,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, { unsigned long pfn = page_to_pfn(pte); - paravirt_alloc_pt(mm, pfn); + paravirt_alloc_pte(mm, pfn); set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); } @@ -77,7 +77,7 @@ extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); #else /* !CONFIG_X86_PAE */ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT); + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); } #endif /* CONFIG_X86_PAE */ -- cgit v1.2.3-70-g09d2 From 2761fa0920756dc471d297843646a4a9bca6656f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:02 -0700 Subject: x86: add pud_alloc for 4-level pagetables Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 2 ++ arch/x86/mm/pgtable.c | 1 + include/asm-x86/paravirt.h | 11 +++++++++++ include/asm-x86/pgalloc.h | 3 +++ 4 files changed, 17 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 362653da003..74f0c5ea2a0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -369,8 +369,10 @@ struct pv_mmu_ops pv_mmu_ops = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, .alloc_pmd_clone = paravirt_nop, + .alloc_pud = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, + .release_pud = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 
1d44d6dd4c9..5accc08683c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -38,6 +38,7 @@ void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #if PAGETABLE_LEVELS > 3 void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); } #endif /* PAGETABLE_LEVELS > 3 */ diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index c4480b9bda5..0f13b945e24 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -223,8 +223,10 @@ struct pv_mmu_ops { void (*alloc_pte)(struct mm_struct *mm, u32 pfn); void (*alloc_pmd)(struct mm_struct *mm, u32 pfn); void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); + void (*alloc_pud)(struct mm_struct *mm, u32 pfn); void (*release_pte)(u32 pfn); void (*release_pmd)(u32 pfn); + void (*release_pud)(u32 pfn); /* Pagetable manipulation functions */ void (*set_pte)(pte_t *ptep, pte_t pteval); @@ -934,6 +936,15 @@ static inline void paravirt_release_pmd(unsigned pfn) PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); } +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned pfn) +{ + PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn); +} +static inline void paravirt_release_pud(unsigned pfn) +{ + PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); +} + #ifdef CONFIG_HIGHPTE static inline void *kmap_atomic_pte(struct page *page, enum km_type type) { diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index a25d5402987..60e7f514ea0 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -11,8 +11,10 @@ #define paravirt_alloc_pte(mm, pfn) do { } while (0) #define paravirt_alloc_pmd(mm, pfn) do { } while (0) #define paravirt_alloc_pmd_clone(pfn, clonepfn, start, count) do { } while (0) +#define paravirt_alloc_pud(mm, pfn) do { } while (0) #define paravirt_release_pte(pfn) do { } while (0) #define paravirt_release_pmd(pfn) do { } while (0) +#define paravirt_release_pud(pfn) do { } while (0) #endif /* @@ -85,6 +87,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) #if PAGETABLE_LEVELS > 3 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); } -- cgit v1.2.3-70-g09d2 From ee5aa8d3ba65d76157f22b7afedd089d8acfe524 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:03 -0700 Subject: x86/pgtable.h: demacro ptep_set_access_flags Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 16 ++++++++++++++++ include/asm-x86/pgtable.h | 13 +++---------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5accc08683c..e7cda2057e1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,5 +1,6 @@ #include #include +#include #include pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -264,3 +265,18 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } #endif + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + *ptep = entry; + pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); + } + + return changed; +} diff --git a/include/asm-x86/pgtable.h 
b/include/asm-x86/pgtable.h index f1d9f4a03f6..feddddc2d97 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -389,16 +389,9 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, * bit at the same time. */ #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ -({ \ - int __changed = !pte_same(*(ptep), entry); \ - if (__changed && dirty) { \ - *ptep = entry; \ - pte_update_defer((vma)->vm_mm, (address), (ptep)); \ - flush_tlb_page(vma, address); \ - } \ - __changed; \ -}) +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG #define ptep_test_and_clear_young(vma, addr, ptep) ({ \ -- cgit v1.2.3-70-g09d2 From f9fbf1a36a6bb6a639459802bccee01185ee3220 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:04 -0700 Subject: x86/pgtable.h: demacro ptep_test_and_clear_young Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 15 +++++++++++++++ include/asm-x86/pgtable.h | 11 ++--------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e7cda2057e1..54bd77a7eee 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -280,3 +280,18 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return changed; } + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index feddddc2d97..676408c9863 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -394,15 +394,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ - int __ret = 0; \ - if (pte_young(*(ptep))) \ - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ - &(ptep)->pte); \ - if (__ret) \ - pte_update((vma)->vm_mm, addr, ptep); \ - __ret; \ -}) +extern int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young(vma, address, ptep) \ -- cgit v1.2.3-70-g09d2 From c20311e165eb94f5ef12b15e452cc6ec24bd7813 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:05 -0700 Subject: x86/pgtable.h: demacro ptep_clear_flush_young Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 12 ++++++++++++ include/asm-x86/pgtable.h | 10 ++-------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 54bd77a7eee..af0c50161d9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -295,3 +295,15 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + + return young; +} diff --git 
a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 676408c9863..4ebea41ea70 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -398,14 +398,8 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(vma, address, ptep) \ -({ \ - int __young; \ - __young = ptep_test_and_clear_young((vma), (address), (ptep)); \ - if (__young) \ - flush_tlb_page(vma, address); \ - __young; \ -}) +extern int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.3-70-g09d2 From 68db065c845bd9d0eb96946ab104b4c82d0ae9da Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:13 -0700 Subject: x86: unify KERNEL_PGD_PTRS Make KERNEL_PGD_PTRS common, as previously it was only being defined for 32-bit. There are a couple of follow-on changes from this: - KERNEL_PGD_PTRS was being defined in terms of USER_PGD_PTRS. The definition of USER_PGD_PTRS doesn't really make much sense on x86-64, since it can have two different user address-space configurations. I renamed USER_PGD_PTRS to KERNEL_PGD_BOUNDARY, which is meaningful for all of 32/32, 32/64 and 64/64 process configurations. - USER_PTRS_PER_PGD was also defined and was being used for similar purposes. Converting its users to KERNEL_PGD_BOUNDARY left it completely unused, and so I removed it. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Cc: Zach Amsden Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/reboot.c | 4 ++-- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 4 ++-- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/pgtable.c | 12 ++++++------ include/asm-x86/pgtable.h | 4 +++- include/asm-x86/pgtable_32.h | 3 --- 8 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 19c9386ac11..1791a751a77 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -15,7 +16,6 @@ # include # include # include -# include #else # include #endif @@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length) /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. 
*/ - memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6a925394bc7..2de2f7a2ed5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) #ifdef CONFIG_X86_32 /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); #endif diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 44f7ca153b7..956f38927aa 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page) * pdes need to be zeroed. */ if (type & VMI_PAGE_CLONE) - limit = USER_PTRS_PER_PGD; + limit = KERNEL_PGD_BOUNDARY; for (i = 0; i < limit; i++) BUG_ON(ptr[i]); } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 96f60c7cd12..394046effd7 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -560,8 +560,8 @@ static void __init do_boot_cpu(__u8 cpu) hijack_source.idt.Offset, stack_start.sp)); /* init lowmem identity mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); if (quad_boot) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index df490905f37..08aa1878fad 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -457,7 +457,7 @@ void zap_low_mappings(void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) { + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index af0c50161d9..e2ac320e615 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -104,7 +104,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) * -- wli */ #define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) static void pgd_ctor(void *p) { @@ -112,7 +112,7 @@ static void pgd_ctor(void *p) unsigned long flags; /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); @@ -121,12 +121,12 @@ static void pgd_ctor(void *p) references from swapper_pg_dir. 
*/ if (PAGETABLE_LEVELS == 2 || (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); } @@ -201,7 +201,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) return 0; } - if (i >= USER_PTRS_PER_PGD) + if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 4ebea41ea70..e61075e70a5 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) #define FIRST_USER_ADDRESS 0 #define _PAGE_BIT_PRESENT 0 /* is present */ @@ -330,6 +329,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) # include "pgtable_64.h" #endif +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) + #ifndef __ASSEMBLY__ enum { diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index c4a64367445..cc52da32fbe 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -48,9 +48,6 @@ void paging_init(void); #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that -- cgit v1.2.3-70-g09d2 From 85958b465c2e0de315575b1d3d7e7c2ce7126880 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:14 -0700 Subject: x86: unify pgd ctor/dtor All pagetables need fundamentally the same setup and destruction, so just use the same code for everything. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 59 ++++++++++---------------------------------- include/asm-x86/pgtable.h | 16 ++++++++++++ include/asm-x86/pgtable_32.h | 15 ----------- include/asm-x86/pgtable_64.h | 2 +- 4 files changed, 30 insertions(+), 62 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e2ac320e615..50159764f69 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -59,50 +59,6 @@ static inline void pgd_list_del(pgd_t *pgd) list_del(&page->lru); } -#ifdef CONFIG_X86_64 -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - unsigned long flags; - if (!pgd) - return NULL; - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - /* - * Copy kernel pointers in from init. - * Could keep a freelist or slab cache of those because the kernel - * part never changes. 
- */ - boundary = pgd_index(__PAGE_OFFSET); - memset(pgd, 0, boundary * sizeof(pgd_t)); - memcpy(pgd + boundary, - init_level4_pgt + boundary, - (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - unsigned long flags; - BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - free_page((unsigned long)pgd); -} -#else -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) @@ -120,7 +76,8 @@ static void pgd_ctor(void *p) ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); @@ -149,6 +106,17 @@ static void pgd_dtor(void *pgd) spin_unlock_irqrestore(&pgd_lock, flags); } +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + #ifdef CONFIG_X86_PAE /* * Mop up any pmd pages which may still be attached to the pgd. @@ -264,7 +232,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_dtor(pgd); free_page((unsigned long)pgd); } -#endif int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index e61075e70a5..b8a08bd7bd4 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -438,6 +438,22 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep); } +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anwhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. + */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + + #include #endif /* __ASSEMBLY__ */ diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index cc52da32fbe..168b6447cf1 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -105,21 +105,6 @@ extern int pmd_bad(pmd_t pmd); # include #endif -/* - * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); - * - * dst - pointer to pgd range anwhere on a pgd page - * src - "" - * count - the number of pgds to copy. 
- * - * dst and src can be on the same page, but the range must not overlap, - * and must not cross a page boundary. - */ -static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) -{ - memcpy(dst, src, count * sizeof(pgd_t)); -} - /* * Macro to mark a page protection value as "uncacheable". * On processors which do not support it, this is a no-op. diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 9fd87d0b647..a3bbf8766c1 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -24,7 +24,7 @@ extern void paging_init(void); #endif /* !__ASSEMBLY__ */ -#define SHARED_KERNEL_PMD 1 +#define SHARED_KERNEL_PMD 0 /* * PGDIR_SHIFT determines what a top-level page table entry can map -- cgit v1.2.3-70-g09d2 From 82a355f5a2fdc203e5a32626d667ec43fc76b8b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Apr 2008 18:28:21 +0200 Subject: x86: make __set_fixmap() non-init Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4..7a81dd02069 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -135,7 +135,7 @@ static __init void *spp_getpage(void) return ptr; } -static __init void +static void set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; @@ -214,8 +214,7 @@ void __init cleanup_highmap(void) } /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); -- cgit v1.2.3-70-g09d2 From 70c9f590ffc3f959cc81c1a3cecb6b8133caf35d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Apr 2008 18:05:57 +0200 Subject: x86: remove set_fixmap() warning set_fixmap()+clear_fixmap() is safe. Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7a81dd02069..b798e7b92b1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -173,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && + if (!pte_none(*pte) && pte_val(new_pte) && pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) pte_ERROR(*pte); set_pte(pte, new_pte); -- cgit v1.2.3-70-g09d2 From 1ebcc654f010d4a63f3ebf8ddd2cab5a709b1824 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Apr 2008 11:40:31 +0200 Subject: x86 PAT: tone down debugging messages Linus reported these excessive debug printouts: > Overlap at 0xe0300000-0xe0400000 > Overlap at 0xe0300000-0xe0380000 > Overlap at 0xe0300000-0xe0400000 > Overlap at 0xe0300000-0xe0400000 > Overlap at 0xe0300000-0xe0400000 > Overlap at 0xe0300000-0xe0400000 > Overlap at 0xe0300000-0xe0400000 turn that into a pr_debug(). 
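As a rough userspace model of the behaviour this change relies on (a sketch only; the kernel's real pr_debug() is more elaborate, with dynamic debug support, but this captures why the messages vanish from a default build):

#include <stdio.h>

#ifdef DEBUG
#define pr_debug(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) do { } while (0)	/* compiled out by default */
#endif

int main(void)
{
	unsigned long long start = 0xe0300000ULL, end = 0xe0400000ULL;

	/* Silent unless built with -DDEBUG, unlike the old printk(). */
	pr_debug("Overlap at 0x%llx-0x%llx\n", start, end);
	return 0;
}

Built normally this prints nothing; built with -DDEBUG it reproduces the message above.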
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ef8b64b89c7..9851265e4d6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -334,7 +334,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 			break;
 		}

-		printk("Overlap at 0x%Lx-0x%Lx\n",
+		pr_debug("Overlap at 0x%Lx-0x%Lx\n",
 			saved_ptr->start, saved_ptr->end);

 		/* No conflict. Go ahead and add this new entry */
 		list_add(&new_entry->nd, saved_ptr->nd.prev);
-- cgit v1.2.3-70-g09d2

From f7f17a67c589f031c567d9fdc809dee7c5868c8a Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev
Date: Mon, 21 Apr 2008 00:47:55 +0400
Subject: x86: remove NexGen support

It is claimed that NexGen CPUs were never shipped:

  http://lkml.org/lkml/2008/4/20/179

Also, the kernel support for these chips has been broken for a long
time, leaving the code intended to support NexGen essentially dead.

As an outcome of the discussion at the URL above, this patch removes
NexGen support altogether.

The changes in this patch survived a defconfig build for i386, a
couple of successful randconfig builds, and a runtime test that
consisted of booting a 32-bit x86 box up to the shell prompt.

Signed-off-by: Dmitri Vorobiev
Signed-off-by: Ingo Molnar
---
 arch/x86/Kconfig.cpu         |  4 +--
 arch/x86/kernel/cpu/Makefile |  1 -
 arch/x86/kernel/cpu/nexgen.c | 59 --------------------------------------------
 arch/x86/mm/init_32.c        |  6 ++---
 include/asm-x86/processor.h  |  1 -
 5 files changed, 5 insertions(+), 66 deletions(-)
 delete mode 100644 arch/x86/kernel/cpu/nexgen.c

(limited to 'arch/x86/mm')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 57072f2716f..4da3cdb9c1b 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -21,8 +21,8 @@ config M386
	  Here are the settings recommended for greatest speed:
	  - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI
-	    486DLC/DLC2, UMC 486SX-S and NexGen Nx586. Only "386" kernels
-	    will run on a 386 class machine.
+	    486DLC/DLC2, and UMC 486SX-S. Only "386" kernels will run on a 386
+	    class machine.
	  - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or
	    SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
	  - "586" for generic Pentium CPUs lacking the TSC
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee7c45235e5..a0c6f819088 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -11,7 +11,6 @@ obj-$(CONFIG_X86_32) += cyrix.o
 obj-$(CONFIG_X86_32) += centaur.o
 obj-$(CONFIG_X86_32) += transmeta.o
 obj-$(CONFIG_X86_32) += intel.o
-obj-$(CONFIG_X86_32) += nexgen.o
 obj-$(CONFIG_X86_32) += umc.o

 obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/nexgen.c b/arch/x86/kernel/cpu/nexgen.c
deleted file mode 100644
index 5d5e1c13412..00000000000
--- a/arch/x86/kernel/cpu/nexgen.c
+++ /dev/null
@@ -1,59 +0,0 @@
-#include
-#include
-#include
-#include
-
-#include "cpu.h"
-
-/*
- * Detect a NexGen CPU running without BIOS hypercode new enough
- * to have CPUID. (Thanks to Herbert Oppmann)
- */
-
-static int __cpuinit deep_magic_nexgen_probe(void)
-{
-	int ret;
-
-	__asm__ __volatile__ (
-		"	movw	$0x5555, %%ax\n"
-		"	xorw	%%dx,%%dx\n"
-		"	movw	$2, %%cx\n"
-		"	divw	%%cx\n"
-		"	movl	$0, %%eax\n"
-		"	jnz	1f\n"
-		"	movl	$1, %%eax\n"
-		"1:\n"
-		: "=a" (ret) : : "cx", "dx");
-	return ret;
-}
-
-static void __cpuinit init_nexgen(struct cpuinfo_x86 *c)
-{
-	c->x86_cache_size = 256; /* A few had 1 MB...
*/ -} - -static void __cpuinit nexgen_identify(struct cpuinfo_x86 *c) -{ - /* Detect NexGen with old hypercode */ - if (deep_magic_nexgen_probe()) - strcpy(c->x86_vendor_id, "NexGenDriven"); -} - -static struct cpu_dev nexgen_cpu_dev __cpuinitdata = { - .c_vendor = "Nexgen", - .c_ident = { "NexGenDriven" }, - .c_models = { - { .vendor = X86_VENDOR_NEXGEN, - .family = 5, - .model_names = { [1] = "Nx586" } - }, - }, - .c_init = init_nexgen, - .c_identify = nexgen_identify, -}; - -int __init nexgen_init_cpu(void) -{ - cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; - return 0; -} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index baf7c4f643c..4a476189295 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -566,9 +566,9 @@ void __init paging_init(void) /* * Test if the WP bit works in supervisor mode. It isn't supported on 386's - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This - * used to involve black magic jumps to work around some nasty CPU bugs, - * but fortunately the switch to using exceptions got rid of all that. + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. */ static void __init test_wp_bit(void) { diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index e6bf92ddeb2..117343b0c27 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -118,7 +118,6 @@ struct cpuinfo_x86 { #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 #define X86_VENDOR_UMC 3 -#define X86_VENDOR_NEXGEN 4 #define X86_VENDOR_CENTAUR 5 #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 -- cgit v1.2.3-70-g09d2 From 0124cecfc85a6664b1ad5f1d28cf0ab8df66fc42 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Sat, 26 Apr 2008 11:32:12 -0700 Subject: x86, PAT: disable /dev/mem mmap RAM with PAT disable /dev/mem mmap of RAM with PAT. It makes things safer and eliminates aliasing. A future improvement would be to avoid the range_is_allowed duplication. 
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/pat.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 9851265e4d6..e7ca7fc48d1 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -477,6 +478,33 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 	return vma_prot;
 }

+#ifdef CONFIG_NONPROMISC_DEVMEM
+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	u64 from = ((u64)pfn) << PAGE_SHIFT;
+	u64 to = from + size;
+	u64 cursor = from;
+
+	while (cursor < to) {
+		if (!devmem_is_allowed(pfn)) {
+			printk(KERN_INFO
+		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+				current->comm, from, to);
+			return 0;
+		}
+		cursor += PAGE_SIZE;
+		pfn++;
+	}
+	return 1;
+}
+#endif /* CONFIG_NONPROMISC_DEVMEM */
+
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t *vma_prot)
 {
@@ -485,6 +513,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	unsigned long ret_flags;
 	int retval;

+	if (!range_is_allowed(pfn, size))
+		return 0;
+
 	if (file->f_flags & O_SYNC) {
 		flags = _PAGE_CACHE_UC;
 	}
-- cgit v1.2.3-70-g09d2

From 8b3cd09ed23049fcb02479c6286744b36324ac9d Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 18 Mar 2008 12:50:21 -0700
Subject: x86_64: make reserve_bootmem_generic() use new reserve_bootmem()

"mm: make reserve_bootmem can crossed the nodes" provides a new
reserve_bootmem(); let reserve_bootmem_generic() use it.

reserve_bootmem_generic() is used to reserve the initial ramdisk, so
this makes sure that reserve_bootmem() keeps working even when the
bootloader or kexec loads ranges that cross node memory boundaries.
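A userspace sketch of the decision being added (phys_to_nid() here is a stand-in, and the 4GB-per-node layout is only an assumed example):

#include <stdio.h>

#define NODE_SIZE	(4ULL << 30)	/* assumption: 4GB per node */

static int phys_to_nid(unsigned long long phys)
{
	return (int)(phys / NODE_SIZE);
}

int main(void)
{
	unsigned long long phys = 3ULL << 30;	/* ramdisk at 3GB... */
	unsigned long long len = 2ULL << 30;	/* ...2GB long, ending at 5GB */
	int nid = phys_to_nid(phys);
	int next_nid = phys_to_nid(phys + len - 1);

	if (nid == next_nid)
		printf("node-local: reserve_bootmem_node() on node %d\n", nid);
	else
		printf("crosses nodes %d..%d: fall back to reserve_bootmem()\n",
		       nid, next_nid);
	return 0;
}

Here the range ends on node 1, so a node-local reservation on node 0 would be wrong and the cross-node reserve_bootmem() is used instead.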
Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/init_64.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0cca6266303..7dc4fbc2d6b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -810,7 +810,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 {
 #ifdef CONFIG_NUMA
-	int nid = phys_to_nid(phys);
+	int nid, next_nid;
 #endif
 	unsigned long pfn = phys >> PAGE_SHIFT;

@@ -829,10 +829,16 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)

 	/* Should check here against the e820 map to avoid double free */
 #ifdef CONFIG_NUMA
-	reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+	nid = phys_to_nid(phys);
+	next_nid = phys_to_nid(phys + len - 1);
+	if (nid == next_nid)
+		reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+	else
+		reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
 #else
 	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
 #endif
+
 	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
 		dma_reserve += len / PAGE_SIZE;
 		set_dma_reserve(dma_reserve);
-- cgit v1.2.3-70-g09d2

From 1a27fc0a42162964d758e9d36d2d1b49c082a67c Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 18 Mar 2008 12:52:37 -0700
Subject: x86_64: fix setup_node_bootmem to support big mem excluding with memmap

Typical case: a four-socket system with 4G of RAM per node, booted
with memmap=10g$4g to mask out the memory on node 1 and node 2.

When NUMA is enabled, early_node_mem is used to get node_data and
node_bootmap. If it cannot get memory from the same node with
find_e820_area(), it uses alloc_bootmem, which may return a buffer
from a previous node; check for that case and print some information
about it.

early_res_to_bootmem also needs to move into setup_node_bootmem and
take only the range that the node covers; otherwise alloc_bootmem
could return an address that was already reserved early.

Depends on "mm: make reserve_bootmem can crossed the nodes".
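A worked version of the arithmetic in that scenario (a sketch; the four-node, 4GB-per-node layout is the one described above, and memmap=10g$4g reserves the physical range [4GB, 14GB)):

#include <stdio.h>

#define GB(x)	((unsigned long long)(x) << 30)

int main(void)
{
	unsigned long long mask_start = GB(4);	/* memmap=10g$4g */
	unsigned long long mask_end = mask_start + GB(10);
	int node;

	for (node = 0; node < 4; node++) {
		unsigned long long ns = GB(4) * node, ne = ns + GB(4);
		unsigned long long lo = ns > mask_start ? ns : mask_start;
		unsigned long long hi = ne < mask_end ? ne : mask_end;
		unsigned long long masked = lo < hi ? hi - lo : 0;

		printf("node %d: %lluGB usable\n", node, (GB(4) - masked) >> 30);
	}
	return 0;
}

This prints 4GB for node 0, 0GB for nodes 1 and 2, and 2GB for node 3; nodes 1 and 2 have no usable memory at all, which is why early_node_mem has to fall back to an earlier node for their node_data.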
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_64.c | 13 +++++++++---- arch/x86/kernel/setup_64.c | 3 +-- arch/x86/mm/numa_64.c | 42 ++++++++++++++++++++++++++++++++++++------ include/asm-x86/e820_64.h | 2 +- 4 files changed, 47 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 79f0d52fa99..645ee5e32a2 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -106,14 +106,19 @@ void __init free_early(unsigned long start, unsigned long end) early_res[j - 1].end = 0; } -void __init early_res_to_bootmem(void) +void __init early_res_to_bootmem(unsigned long start, unsigned long end) { int i; + unsigned long final_start, final_end; for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { struct early_res *r = &early_res[i]; - printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i, - r->start, r->end - 1, r->name); - reserve_bootmem_generic(r->start, r->end - r->start); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) + continue; + printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, + final_start, final_end - 1, r->name); + reserve_bootmem_generic(final_start, final_end - final_start); } } diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index b04e2c011e1..60e64c8eee9 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -190,6 +190,7 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); + early_res_to_bootmem(0, end_pfn<node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; - /* Find a place for the bootmem map */ + /* + * Find a place for the bootmem map + * nodedata_phys could be on other nodes by alloc_bootmem, + * so need to sure bootmap_start not to be small, otherwise + * early_node_mem will get that with find_e820_area instead + * of alloc_bootmem, that could clash with reserved range + */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + nid = phys_to_nid(nodedata_phys); + if (nid == nodeid) + bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + else + bootmap_start = round_up(start, PAGE_SIZE); /* * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE @@ -245,10 +256,29 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size, - BOOTMEM_DEFAULT); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages< Date: Sat, 12 Apr 2008 01:19:24 -0700 Subject: x86_64/mm: check and print vmemmap allocation continuous On big systems with lots of memory, don't print out too much during bootup, and make it easy to find if it is continuous. 
on 256G 8 sockets system will get [ffffe20000000000-ffffe20002bfffff] PMD -> [ffff810001400000-ffff810003ffffff] on node 0 [ffffe2001c700000-ffffe2001c7fffff] potential offnode page_structs [ffffe20002c00000-ffffe2001c7fffff] PMD -> [ffff81000c000000-ffff8100255fffff] on node 0 [ffffe20038700000-ffffe200387fffff] potential offnode page_structs [ffffe2001c800000-ffffe200387fffff] PMD -> [ffff810820200000-ffff81083c1fffff] on node 1 [ffffe20040000000-ffffe2007fffffff] PUD ->ffff811027a00000 on node 2 [ffffe20038800000-ffffe2003fffffff] PMD -> [ffff811020200000-ffff8110279fffff] on node 2 [ffffe20054700000-ffffe200547fffff] potential offnode page_structs [ffffe20040000000-ffffe200547fffff] PMD -> [ffff811027c00000-ffff81103c3fffff] on node 2 [ffffe20070700000-ffffe200707fffff] potential offnode page_structs [ffffe20054800000-ffffe200707fffff] PMD -> [ffff811820200000-ffff81183c1fffff] on node 3 [ffffe20080000000-ffffe200bfffffff] PUD ->ffff81202fa00000 on node 4 [ffffe20070800000-ffffe2007fffffff] PMD -> [ffff812020200000-ffff81202f9fffff] on node 4 [ffffe2008c700000-ffffe2008c7fffff] potential offnode page_structs [ffffe20080000000-ffffe2008c7fffff] PMD -> [ffff81202fc00000-ffff81203c3fffff] on node 4 [ffffe200a8700000-ffffe200a87fffff] potential offnode page_structs [ffffe2008c800000-ffffe200a87fffff] PMD -> [ffff812820200000-ffff81283c1fffff] on node 5 [ffffe200c0000000-ffffe200ffffffff] PUD ->ffff813037a00000 on node 6 [ffffe200a8800000-ffffe200bfffffff] PMD -> [ffff813020200000-ffff8130379fffff] on node 6 [ffffe200c4700000-ffffe200c47fffff] potential offnode page_structs [ffffe200c0000000-ffffe200c47fffff] PMD -> [ffff813037c00000-ffff81303c3fffff] on node 6 [ffffe200c4800000-ffffe200e07fffff] PMD -> [ffff813820200000-ffff81383c1fffff] on node 7 instead of a very long print out... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 28 ++++++++++++++++++++++++++-- include/linux/mm.h | 1 + mm/sparse.c | 5 +++++ 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7dc4fbc2d6b..5fbb8652cf5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -932,6 +932,10 @@ const char *arch_vma_name(struct vm_area_struct *vma) /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 
*/ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + int __meminit vmemmap_populate(struct page *start_page, unsigned long size, int node) { @@ -966,12 +970,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); - printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", - addr, addr + PMD_SIZE - 1, p, node); + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; } else { vmemmap_verify((pte_t *)pmd, node, addr, next); } } return 0; } + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index b695875d63e..286d3152160 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1229,6 +1229,7 @@ void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); int vmemmap_populate_basepages(struct page *start_page, unsigned long pages, int node); int vmemmap_populate(struct page *start_page, unsigned long pages, int node); +void vmemmap_populate_print_last(void); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/sparse.c b/mm/sparse.c index 458109b99e6..7e9191381f8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -295,6 +295,9 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) return NULL; } +void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +{ +} /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -345,6 +348,8 @@ void __init sparse_init(void) usemap); } + vmemmap_populate_print_last(); + free_bootmem(__pa(usemap_map), size); } -- cgit v1.2.3-70-g09d2 From 86cf02f8eaea1b09e102e0f432fc137dc5cf4407 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Apr 2008 11:59:30 -0700 Subject: x86 PAT: tone down debugging messages some more Ingo already fixed one of these at my request (in "x86 PAT: tone down debugging messages", commit 1ebcc654f010d4a63f3ebf8ddd2cab5a709b1824), but there was another one he missed. Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e7ca7fc48d1..b17cdf64e41 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -387,8 +387,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - printk(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", - saved_ptr->start, saved_ptr->end); + pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); /* No conflict. 
Go ahead and add this new entry */
		list_add(&new_entry->nd, &saved_ptr->nd);
		new_entry = NULL;
-- cgit v1.2.3-70-g09d2

From f022bfd58253099102218db5249220a7f4787114 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 21 Mar 2008 15:42:28 +0100
Subject: x86: PAT fix

Adrian Bunk noticed the following Coverity report:

> Commit e7f260a276f2c9184fe753732d834b1f6fbe9f17
> (x86: PAT use reserve free memtype in mmap of /dev/mem)
> added the following gem to arch/x86/mm/pat.c:
>
> <-- snip -->
>
> ...
> int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
>                                  unsigned long size, pgprot_t *vma_prot)
> {
> 	u64 offset = ((u64) pfn) << PAGE_SHIFT;
> 	unsigned long flags = _PAGE_CACHE_UC_MINUS;
> 	unsigned long ret_flags;
> ...
> ...  (nothing that touches ret_flags)
> ...
> 	if (flags != _PAGE_CACHE_UC_MINUS) {
> 		retval = reserve_memtype(offset, offset + size, flags, NULL);
> 	} else {
> 		retval = reserve_memtype(offset, offset + size, -1, &ret_flags);
> 	}
>
> 	if (retval < 0)
> 		return 0;
>
> 	flags = ret_flags;
>
> 	if (pfn <= max_pfn_mapped &&
> 	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
> 		free_memtype(offset, offset + size);
> 		printk(KERN_INFO
> 		"%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
> 			current->comm, current->pid,
> 			cattr_name(flags),
> 			offset, offset + size);
> 		return 0;
> 	}
>
> 	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
> 			     flags);
> 	return 1;
> }
>
> <-- snip -->
>
> If (flags != _PAGE_CACHE_UC_MINUS) we pass garbage from the stack to
> ioremap_change_attr() and/or __pgprot().
>
> Spotted by the Coverity checker.

The fix simplifies the code, as we get rid of the 'ret_flags'
complication.

Signed-off-by: Ingo Molnar
Signed-off-by: Linus Torvalds
---
 arch/x86/mm/pat.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index b17cdf64e41..277446cd30b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -510,7 +510,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 {
 	u64 offset = ((u64) pfn) << PAGE_SHIFT;
 	unsigned long flags = _PAGE_CACHE_UC_MINUS;
-	unsigned long ret_flags;
 	int retval;

 	if (!range_is_allowed(pfn, size))
@@ -549,14 +548,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	if (flags != _PAGE_CACHE_UC_MINUS) {
 		retval = reserve_memtype(offset, offset + size, flags, NULL);
 	} else {
-		retval = reserve_memtype(offset, offset + size, -1, &ret_flags);
+		retval = reserve_memtype(offset, offset + size, -1, &flags);
 	}

 	if (retval < 0)
 		return 0;

-	flags = ret_flags;
-
 	if (pfn <= max_pfn_mapped &&
 	    ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
 		free_memtype(offset, offset + size);
-- cgit v1.2.3-70-g09d2

From 180c06efce691f2b721dd0d965079827bdd7ee03 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Mon, 28 Apr 2008 02:12:03 -0700
Subject: hotplug-memory: make online_page() common

All architectures use an effectively identical definition of
online_page(), so just make it common code. x86-64, ia64, powerpc and
sh are actually identical; x86-32 is slightly different.

x86-32's differences arise because it puts its hotplug pages in the
highmem zone. We can handle this in the generic code by inspecting
the page to see if it's in highmem, and updating the totalhigh_pages
count appropriately. This leaves init_32.c:free_new_highpage with a
single caller, so I folded it into add_one_highpage_init.
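The resulting generic logic boils down to the userspace sketch below (the struct page flag and the counters are simplified stand-ins for the kernel's; the point is that a single PageHighMem()-style test covers the x86-32 case):

#include <stdio.h>

struct page {
	int highmem;	/* stand-in for PageHighMem() */
};

static unsigned long totalram_pages, totalhigh_pages;

static void online_page(struct page *page)
{
	totalram_pages++;
	if (page->highmem)	/* covers x86-32, which hotplugs into highmem */
		totalhigh_pages++;
	/* the kernel also clears PG_reserved and frees the page here */
}

int main(void)
{
	struct page normal = { 0 }, high = { 1 };

	online_page(&normal);
	online_page(&high);
	printf("totalram=%lu totalhigh=%lu\n", totalram_pages, totalhigh_pages);
	return 0;
}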
I also removed an incorrect comment referring to the NUMA case; any NUMA details have already been dealt with by the time online_page() is called. [akpm@linux-foundation.org: fix indenting] Signed-off-by: Jeremy Fitzhardinge Acked-by: Dave Hansen Reviewed-by: KAMEZAWA Hiroyuki Tested-by: KAMEZAWA Hiroyuki Cc: Yasunori Goto Cc: Christoph Lameter Acked-by: Ingo Molnar Acked-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 9 --------- arch/powerpc/mm/mem.c | 9 --------- arch/sh/mm/init.c | 9 --------- arch/x86/mm/init_32.c | 36 +++--------------------------------- arch/x86/mm/init_64.c | 9 --------- mm/memory_hotplug.c | 19 +++++++++++++++++++ 6 files changed, 22 insertions(+), 69 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 5c1de53c8c1..fc6c6636ffd 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -682,15 +682,6 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - int arch_add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5ccb579b81e..d9e37f365b5 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -110,15 +110,6 @@ EXPORT_SYMBOL(phys_mem_access_prot); #ifdef CONFIG_MEMORY_HOTPLUG -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - #ifdef CONFIG_NUMA int memory_add_physaddr_to_nid(u64 start) { diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 53dde060736..d7df26bd1e5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -307,15 +307,6 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - int arch_add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 4a476189295..de236e419cb 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,47 +287,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __meminit free_new_highpage(struct page *page) -{ - init_page_count(page); - __free_page(page); - totalhigh_pages++; -} - void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - free_new_highpage(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; } else SetPageReserved(page); } -static int __meminit -add_one_highpage_hotplug(struct page *page, unsigned long pfn) -{ - free_new_highpage(page); - totalram_pages++; -#ifdef CONFIG_FLATMEM - max_mapnr = max(pfn, max_mapnr); -#endif - num_physpages++; - - return 0; -} - -/* - * Not currently handling the NUMA case. - * Assuming single node and all memory that - * has been added dynamically that would be - * onlined here is in HIGHMEM. 
- */ -void __meminit online_page(struct page *page) -{ - ClearPageReserved(page); - add_one_highpage_hotplug(page, page_to_pfn(page)); -} - #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fbb8652cf5..32ba13b0f81 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -620,15 +620,6 @@ void __init paging_init(void) /* * Memory hotplug specific functions */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. This means you will never get diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d5094929766..c8b3ca79de2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,6 +219,25 @@ static void grow_pgdat_span(struct pglist_data *pgdat, pgdat->node_start_pfn; } +void online_page(struct page *page) +{ + totalram_pages++; + num_physpages++; + +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages++; +#endif + +#ifdef CONFIG_FLATMEM + max_mapnr = max(page_to_pfn(page), max_mapnr); +#endif + + ClearPageReserved(page); + init_page_count(page); + __free_page(page); +} + static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { -- cgit v1.2.3-70-g09d2 From 2301696932b55e2ea2085cefc84f7b94fa2dd54b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:42 -0700 Subject: vmallocinfo: add caller information Add caller information so that /proc/vmallocinfo shows where the allocation request for a slice of vmalloc memory originated. Results in output like this: 0xffffc20000000000-0xffffc20000801000 8392704 alloc_large_system_hash+0x127/0x246 pages=2048 vmalloc vpages 0xffffc20000801000-0xffffc20000806000 20480 alloc_large_system_hash+0x127/0x246 pages=4 vmalloc 0xffffc20000806000-0xffffc20000c07000 4198400 alloc_large_system_hash+0x127/0x246 pages=1024 vmalloc vpages 0xffffc20000c07000-0xffffc20000c0a000 12288 alloc_large_system_hash+0x127/0x246 pages=2 vmalloc 0xffffc20000c0a000-0xffffc20000c0c000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c0c000-0xffffc20000c0f000 12288 acpi_os_map_memory+0x13/0x1c phys=cff64000 ioremap 0xffffc20000c10000-0xffffc20000c15000 20480 acpi_os_map_memory+0x13/0x1c phys=cff65000 ioremap 0xffffc20000c16000-0xffffc20000c18000 8192 acpi_os_map_memory+0x13/0x1c phys=cff69000 ioremap 0xffffc20000c18000-0xffffc20000c1a000 8192 acpi_os_map_memory+0x13/0x1c phys=fed1f000 ioremap 0xffffc20000c1a000-0xffffc20000c1c000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c1c000-0xffffc20000c1e000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c1e000-0xffffc20000c20000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c20000-0xffffc20000c22000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c22000-0xffffc20000c24000 8192 acpi_os_map_memory+0x13/0x1c phys=cff68000 ioremap 0xffffc20000c24000-0xffffc20000c26000 8192 acpi_os_map_memory+0x13/0x1c phys=e0081000 ioremap 0xffffc20000c26000-0xffffc20000c28000 8192 acpi_os_map_memory+0x13/0x1c phys=e0080000 ioremap 0xffffc20000c28000-0xffffc20000c2d000 20480 alloc_large_system_hash+0x127/0x246 pages=4 vmalloc 0xffffc20000c2d000-0xffffc20000c31000 16384 tcp_init+0xd5/0x31c pages=3 vmalloc 0xffffc20000c31000-0xffffc20000c34000 12288 alloc_large_system_hash+0x127/0x246 pages=2 vmalloc 
0xffffc20000c34000-0xffffc20000c36000 8192 init_vdso_vars+0xde/0x1f1 0xffffc20000c36000-0xffffc20000c38000 8192 pci_iomap+0x8a/0xb4 phys=d8e00000 ioremap 0xffffc20000c38000-0xffffc20000c3a000 8192 usb_hcd_pci_probe+0x139/0x295 [usbcore] phys=d8e00000 ioremap 0xffffc20000c3a000-0xffffc20000c3e000 16384 sys_swapon+0x509/0xa15 pages=3 vmalloc 0xffffc20000c40000-0xffffc20000c61000 135168 e1000_probe+0x1c4/0xa32 phys=d8a20000 ioremap 0xffffc20000c61000-0xffffc20000c6a000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20000c6a000-0xffffc20000c73000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20000c73000-0xffffc20000c7c000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20000c7c000-0xffffc20000c7f000 12288 e1000e_setup_tx_resources+0x29/0xbe pages=2 vmalloc 0xffffc20000c80000-0xffffc20001481000 8392704 pci_mmcfg_arch_init+0x90/0x118 phys=e0000000 ioremap 0xffffc20001481000-0xffffc20001682000 2101248 alloc_large_system_hash+0x127/0x246 pages=512 vmalloc 0xffffc20001682000-0xffffc20001e83000 8392704 alloc_large_system_hash+0x127/0x246 pages=2048 vmalloc vpages 0xffffc20001e83000-0xffffc20002204000 3674112 alloc_large_system_hash+0x127/0x246 pages=896 vmalloc vpages 0xffffc20002204000-0xffffc2000220d000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc2000220d000-0xffffc20002216000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20002216000-0xffffc2000221f000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc2000221f000-0xffffc20002228000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20002228000-0xffffc20002231000 36864 _xfs_buf_map_pages+0x8e/0xc0 vmap 0xffffc20002231000-0xffffc20002234000 12288 e1000e_setup_rx_resources+0x35/0x122 pages=2 vmalloc 0xffffc20002240000-0xffffc20002261000 135168 e1000_probe+0x1c4/0xa32 phys=d8a60000 ioremap 0xffffc20002261000-0xffffc2000270c000 4894720 sys_swapon+0x509/0xa15 pages=1194 vmalloc vpages 0xffffffffa0000000-0xffffffffa0022000 139264 module_alloc+0x4f/0x55 pages=33 vmalloc 0xffffffffa0022000-0xffffffffa0029000 28672 module_alloc+0x4f/0x55 pages=6 vmalloc 0xffffffffa002b000-0xffffffffa0034000 36864 module_alloc+0x4f/0x55 pages=8 vmalloc 0xffffffffa0034000-0xffffffffa003d000 36864 module_alloc+0x4f/0x55 pages=8 vmalloc 0xffffffffa003d000-0xffffffffa0049000 49152 module_alloc+0x4f/0x55 pages=11 vmalloc 0xffffffffa0049000-0xffffffffa0050000 28672 module_alloc+0x4f/0x55 pages=6 vmalloc [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/ioremap.c | 15 +++++++----- include/linux/vmalloc.h | 3 +++ mm/vmalloc.c | 65 +++++++++++++++++++++++++++++++++++-------------- 3 files changed, 59 insertions(+), 24 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d176b23110c..804de18abcc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -117,8 +117,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, - unsigned long prot_val) +static void __iomem *__ioremap_caller(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val, void *caller) { unsigned long pfn, offset, vaddr; resource_size_t last_addr; @@ -212,7 +212,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, /* * Ok, go for it.. 
*/ - area = get_vm_area(size, VM_IOREMAP); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; area->phys_addr = phys_addr; @@ -255,7 +255,8 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, */ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, _PAGE_CACHE_UC); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_UC, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_nocache); @@ -272,7 +273,8 @@ EXPORT_SYMBOL(ioremap_nocache); void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) { if (pat_wc_enabled) - return __ioremap(phys_addr, size, _PAGE_CACHE_WC); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + __builtin_return_address(0)); else return ioremap_nocache(phys_addr, size); } @@ -280,7 +282,8 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, _PAGE_CACHE_WB); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 7f3adfda337..364789aae9f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -31,6 +31,7 @@ struct vm_struct { struct page **pages; unsigned int nr_pages; unsigned long phys_addr; + void *caller; }; /* @@ -66,6 +67,8 @@ static inline size_t get_vm_area_size(const struct vm_struct *area) } extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); +extern struct vm_struct *get_vm_area_caller(unsigned long size, + unsigned long flags, void *caller); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); extern struct vm_struct *get_vm_area_node(unsigned long size, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index afa550f6653..e33e0ae69ad 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,7 @@ DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node); + int node, void *caller); static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -204,9 +205,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end, - int node, gfp_t gfp_mask) +static struct vm_struct * +__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, void *caller) { struct vm_struct **p, *tmp, *area; unsigned long align = 1; @@ -269,6 +270,7 @@ found: area->pages = NULL; area->nr_pages = 0; area->phys_addr = 0; + area->caller = caller; write_unlock(&vmlist_lock); return area; @@ -284,7 +286,8 @@ out: struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -299,14 +302,22 @@ EXPORT_SYMBOL_GPL(__get_vm_area); */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); + return 
__get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, __builtin_return_address(0)); +} + +struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, + void *caller) +{ + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, caller); } struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask) { return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, - gfp_mask); + gfp_mask, __builtin_return_address(0)); } /* Caller must hold vmlist_lock */ @@ -455,9 +466,11 @@ void *vmap(struct page **pages, unsigned int count, if (count > num_physpages) return NULL; - area = get_vm_area((count << PAGE_SHIFT), flags); + area = get_vm_area_caller((count << PAGE_SHIFT), flags, + __builtin_return_address(0)); if (!area) return NULL; + if (map_vm_area(area, prot, &pages)) { vunmap(area->addr); return NULL; @@ -468,7 +481,7 @@ void *vmap(struct page **pages, unsigned int count, EXPORT_SYMBOL(vmap); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, int node, void *caller) { struct page **pages; unsigned int nr_pages, array_size, i; @@ -480,7 +493,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, - PAGE_KERNEL, node); + PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, @@ -488,6 +501,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node); } area->pages = pages; + area->caller = caller; if (!area->pages) { remove_vm_area(area->addr); kfree(area); @@ -521,7 +535,8 @@ fail: void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_area_node(area, gfp_mask, prot, -1); + return __vmalloc_area_node(area, gfp_mask, prot, -1, + __builtin_return_address(0)); } /** @@ -536,7 +551,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * kernel virtual space, using a pagetable protection of @prot. 
*/ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node) + int node, void *caller) { struct vm_struct *area; @@ -544,16 +559,19 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > num_physpages) return NULL; - area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); + area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, + node, gfp_mask, caller); + if (!area) return NULL; - return __vmalloc_area_node(area, gfp_mask, prot, node); + return __vmalloc_area_node(area, gfp_mask, prot, node, caller); } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, gfp_mask, prot, -1); + return __vmalloc_node(size, gfp_mask, prot, -1, + __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -568,7 +586,8 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -608,7 +627,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -843,7 +863,8 @@ struct vm_struct *alloc_vm_area(size_t size) { struct vm_struct *area; - area = get_vm_area(size, VM_IOREMAP); + area = get_vm_area_caller(size, VM_IOREMAP, + __builtin_return_address(0)); if (area == NULL) return NULL; @@ -914,6 +935,14 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); + if (v->caller) { + char buff[2 * KSYM_NAME_LEN]; + + seq_putc(m, ' '); + sprint_symbol(buff, (unsigned long)v->caller); + seq_puts(m, buff); + } + if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); -- cgit v1.2.3-70-g09d2
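As a closing illustration of the technique, the caller-tracking idea reduces to the userspace sketch below (the structure and names are simplified stand-ins; the kernel additionally resolves the saved address to a symbol via sprint_symbol() when /proc/vmallocinfo is read):

#include <stdio.h>
#include <stdlib.h>

struct vm_struct {
	void *addr;
	size_t size;
	void *caller;	/* return address of the allocation site */
};

static struct vm_struct *alloc_area(size_t size)
{
	struct vm_struct *area = malloc(sizeof(*area));

	if (!area)
		return NULL;
	area->addr = malloc(size);
	area->size = size;
	/* Record who asked for this area, as get_vm_area_caller() does. */
	area->caller = __builtin_return_address(0);
	return area;
}

int main(void)
{
	struct vm_struct *area = alloc_area(4096);

	if (area && area->addr)
		printf("%p-%p %7zu caller=%p\n", area->addr,
		       (void *)((char *)area->addr + area->size),
		       area->size, area->caller);
	if (area)
		free(area->addr);
	free(area);
	return 0;
}

Recording __builtin_return_address(0) at allocation time is the whole trick: the extra pointer per vm_struct is what lets the /proc file attribute each vmalloc slice to its requester.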