From e65845045588806fa5c8df8a4f4253516515a5e3 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: dma_ops as const

The dma_ops structure can be const since it never changes
after boot.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/mm/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index ec31534eb10..5ca61731f55 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -46,7 +46,7 @@
 #define Dprintk(x...)
 #endif
 
-struct dma_mapping_ops* dma_ops;
+const struct dma_mapping_ops* dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static unsigned long dma_reserve __initdata;
-- 
cgit v1.2.3-70-g09d2


From dafe41ee3a9389c08c91cdfd8670295f20f89e04 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:06 +0200
Subject: [PATCH] x86-64: Kill temp boot pmds

Early in the boot process we need the ability to set
up temporary mappings, before our normal mechanisms are
initialized.  Currently this is used to map pages that
are part of the page tables we are building and pages
during the dmi scan.

The core problem is that we are using the user portion of
the page tables to implement this.  Which means that while
this mechanism is active we cannot catch NULL pointer dereferences
and we deviate from the normal ways of handling things.

In this patch I modify early_ioremap to map pages into
the kernel portion of address space, roughly where
we will later put modules, and I make the discovery of
which addresses we can use dynamic which removes all
kinds of static limits and remove the dependencies
on implementation details between different parts of the code.

Now alloc_low_page() and unmap_low_page() use
early_iomap() and early_iounmap() to allocate/map and
unmap a page.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/head.S |   3 --
 arch/x86_64/mm/init.c     | 100 +++++++++++++++++++++-------------------------
 2 files changed, 45 insertions(+), 58 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 598a4d0351f..118c6088198 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -288,9 +288,6 @@ NEXT_PAGE(level2_ident_pgt)
 	.quad	i << 21 | 0x083
 	i = i + 1
 	.endr
-	/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
-	.globl temp_boot_pmds
-temp_boot_pmds:
 	.fill	492,8,0
 	
 NEXT_PAGE(level2_kernel_pgt)
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 5ca61731f55..4ab3d40aac9 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -167,23 +167,9 @@ __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
 
 unsigned long __initdata table_start, table_end; 
 
-extern pmd_t temp_boot_pmds[]; 
-
-static  struct temp_map { 
-	pmd_t *pmd;
-	void  *address; 
-	int    allocated; 
-} temp_mappings[] __initdata = { 
-	{ &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
-	{ &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
-	{}
-}; 
-
-static __meminit void *alloc_low_page(int *index, unsigned long *phys)
+static __meminit void *alloc_low_page(unsigned long *phys)
 { 
-	struct temp_map *ti;
-	int i; 
-	unsigned long pfn = table_end++, paddr; 
+	unsigned long pfn = table_end++;
 	void *adr;
 
 	if (after_bootmem) {
@@ -194,57 +180,63 @@ static __meminit void *alloc_low_page(int *index, unsigned long *phys)
 
 	if (pfn >= end_pfn) 
 		panic("alloc_low_page: ran out of memory"); 
-	for (i = 0; temp_mappings[i].allocated; i++) {
-		if (!temp_mappings[i].pmd) 
-			panic("alloc_low_page: ran out of temp mappings"); 
-	} 
-	ti = &temp_mappings[i];
-	paddr = (pfn << PAGE_SHIFT) & PMD_MASK; 
-	set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); 
-	ti->allocated = 1; 
-	__flush_tlb(); 	       
-	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); 
+
+	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	memset(adr, 0, PAGE_SIZE);
-	*index = i; 
-	*phys  = pfn * PAGE_SIZE;  
-	return adr; 
-} 
+	*phys  = pfn * PAGE_SIZE;
+	return adr;
+}
 
-static __meminit void unmap_low_page(int i)
+static __meminit void unmap_low_page(void *adr)
 { 
-	struct temp_map *ti;
 
 	if (after_bootmem)
 		return;
 
-	ti = &temp_mappings[i];
-	set_pmd(ti->pmd, __pmd(0));
-	ti->allocated = 0; 
+	early_iounmap(adr, PAGE_SIZE);
 } 
 
 /* Must run before zap_low_mappings */
 __init void *early_ioremap(unsigned long addr, unsigned long size)
 {
-	unsigned long map = round_down(addr, LARGE_PAGE_SIZE); 
-
-	/* actually usually some more */
-	if (size >= LARGE_PAGE_SIZE) { 
-		return NULL;
+	unsigned long vaddr;
+	pmd_t *pmd, *last_pmd;
+	int i, pmds;
+
+	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+	vaddr = __START_KERNEL_map;
+	pmd = level2_kernel_pgt;
+	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
+		for (i = 0; i < pmds; i++) {
+			if (pmd_present(pmd[i]))
+				goto next;
+		}
+		vaddr += addr & ~PMD_MASK;
+		addr &= PMD_MASK;
+		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
+			set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
+		__flush_tlb();
+		return (void *)vaddr;
+	next:
+		;
 	}
-	set_pmd(temp_mappings[0].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
-	map += LARGE_PAGE_SIZE;
-	set_pmd(temp_mappings[1].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
-	__flush_tlb();
-	return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
+	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
+	return NULL;
 }
 
 /* To avoid virtual aliases later */
 __init void early_iounmap(void *addr, unsigned long size)
 {
-	if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
-		printk("early_iounmap: bad address %p\n", addr);
-	set_pmd(temp_mappings[0].pmd, __pmd(0));
-	set_pmd(temp_mappings[1].pmd, __pmd(0));
+	unsigned long vaddr;
+	pmd_t *pmd;
+	int i, pmds;
+
+	vaddr = (unsigned long)addr;
+	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+	pmd = level2_kernel_pgt + pmd_index(vaddr);
+	for (i = 0; i < pmds; i++)
+		pmd_clear(pmd + i);
 	__flush_tlb();
 }
 
@@ -289,7 +281,6 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
 
 
 	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
-		int map; 
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
@@ -307,12 +298,12 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
 			continue;
 		}
 
-		pmd = alloc_low_page(&map, &pmd_phys);
+		pmd = alloc_low_page(&pmd_phys);
 		spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
 		phys_pmd_init(pmd, addr, end);
 		spin_unlock(&init_mm.page_table_lock);
-		unmap_low_page(map);
+		unmap_low_page(pmd);
 	}
 	__flush_tlb();
 } 
@@ -364,7 +355,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 	end = (unsigned long)__va(end);
 
 	for (; start < end; start = next) {
-		int map;
 		unsigned long pud_phys; 
 		pgd_t *pgd = pgd_offset_k(start);
 		pud_t *pud;
@@ -372,7 +362,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 		if (after_bootmem)
 			pud = pud_offset(pgd, start & PGDIR_MASK);
 		else
-			pud = alloc_low_page(&map, &pud_phys);
+			pud = alloc_low_page(&pud_phys);
 
 		next = start + PGDIR_SIZE;
 		if (next > end) 
@@ -380,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 		phys_pud_init(pud, __pa(start), __pa(next));
 		if (!after_bootmem)
 			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
-		unmap_low_page(map);   
+		unmap_low_page(pud);
 	} 
 
 	if (!after_bootmem)
-- 
cgit v1.2.3-70-g09d2


From cfd243d4af7c7f8f52f5cb99d3932d9074b039ff Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86-64: Remove the identity mapping as early as possible

With the rewrite of the SMP trampoline and the early page
allocator there is nothing that needs identity mapped pages,
once we start executing C code.

So add zap_identity_mappings into head64.c and remove
zap_low_mappings() from much later in the code.  The functions
 are subtly different thus the name change.

This also kills boot_level4_pgt which was from an earlier
attempt to move the identity mappings as early as possible,
and is now no longer needed.  Essentially I have replaced
boot_level4_pgt with trampoline_level4_pgt in trampoline.S

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/head.S    | 39 ++++++++++++++-------------------------
 arch/x86_64/kernel/head64.c  | 17 +++++++++++------
 arch/x86_64/kernel/setup.c   |  2 --
 arch/x86_64/kernel/setup64.c |  1 -
 arch/x86_64/mm/init.c        | 24 ------------------------
 include/asm-x86_64/pgtable.h |  1 -
 include/asm-x86_64/proto.h   |  2 --
 7 files changed, 25 insertions(+), 61 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 926aa2197aa..c211e52f133 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -71,7 +71,7 @@ startup_32:
 	movl	%eax, %cr4
 
 	/* Setup early boot stage 4 level pagetables */
-	movl	$(boot_level4_pgt - __START_KERNEL_map), %eax
+	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
 	movl	%eax, %cr3
 
 	/* Setup EFER (Extended Feature Enable Register) */
@@ -115,7 +115,7 @@ ENTRY(secondary_startup_64)
 	movq	%rax, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq	$(boot_level4_pgt - __START_KERNEL_map), %rax
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 	movq	%rax, %cr3
 
 	/* Check if nx is implemented */
@@ -274,9 +274,19 @@ ENTRY(name)
 	i = i + 1 ;				\
 	.endr
 
+	/*
+	 * This default setting generates an ident mapping at address 0x100000
+	 * and a mapping for the kernel that precisely maps virtual address
+	 * 0xffffffff80000000 to physical address 0x000000. (always using
+	 * 2Mbyte large pages provided by PAE mode)
+	 */
 NEXT_PAGE(init_level4_pgt)
-	/* This gets initialized in x86_64_start_kernel */
-	.fill	512,8,0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.fill	257,8,0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.fill	252,8,0
+	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -307,27 +317,6 @@ NEXT_PAGE(level2_kernel_pgt)
 #undef NEXT_PAGE
 
 	.data
-
-#ifndef CONFIG_HOTPLUG_CPU
-	__INITDATA
-#endif
-	/*
-	 * This default setting generates an ident mapping at address 0x100000
-	 * and a mapping for the kernel that precisely maps virtual address
-	 * 0xffffffff80000000 to physical address 0x000000. (always using
-	 * 2Mbyte large pages provided by PAE mode)
-	 */
-	.align PAGE_SIZE
-ENTRY(boot_level4_pgt)
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	257,8,0
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	252,8,0
-	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
-
-	.data
-
 	.align 16
 	.globl cpu_gdt_descr
 cpu_gdt_descr:
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 5c529c1e3d6..6c34bdd22e2 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -18,8 +18,16 @@
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 #include <asm/sections.h>
 
+static void __init zap_identity_mappings(void)
+{
+	pgd_t *pgd = pgd_offset_k(0UL);
+	pgd_clear(pgd);
+	__flush_tlb();
+}
+
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
 static void __init clear_bss(void)
@@ -57,18 +65,15 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
+	/* Make NULL pointers segfault */
+	zap_identity_mappings();
+
 	for (i = 0; i < IDT_ENTRIES; i++)
 		set_intr_gate(i, early_idt_handler);
 	asm volatile("lidt %0" :: "m" (idt_descr));
 
 	early_printk("Kernel alive\n");
 
-	/*
-	 * switch to init_level4_pgt from boot_level4_pgt
-	 */
-	memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
-	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
-
  	for (i = 0; i < NR_CPUS; i++)
  		cpu_pda(i) = &boot_cpu_pda[i];
 
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 65e2bc551a2..0e2b8df0ea6 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -274,8 +274,6 @@ void __init setup_arch(char **cmdline_p)
 
 	dmi_scan_machine();
 
-	zap_low_mappings(0);
-
 #ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 6a70b55f719..53064a9a365 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -201,7 +201,6 @@ void __cpuinit cpu_init (void)
 	/* CPU 0 is initialised in head64.c */
 	if (cpu != 0) {
 		pda_init(cpu);
-		zap_low_mappings(cpu);
 	} else 
 		estacks = boot_exception_stacks; 
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 4ab3d40aac9..b0a60789218 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -378,21 +378,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 	__flush_tlb_all();
 }
 
-void __cpuinit zap_low_mappings(int cpu)
-{
-	if (cpu == 0) {
-		pgd_t *pgd = pgd_offset_k(0UL);
-		pgd_clear(pgd);
-	} else {
-		/*
-		 * For AP's, zap the low identity mappings by changing the cr3
-		 * to init_level4_pgt and doing local flush tlb all
-		 */
-		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
-	}
-	__flush_tlb_all();
-}
-
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
@@ -569,15 +554,6 @@ void __init mem_init(void)
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
-
-#ifdef CONFIG_SMP
-	/*
-	 * Sync boot_level4_pgt mappings with the init_level4_pgt
-	 * except for the low identity mappings which are already zapped
-	 * in init_level4_pgt. This sync-up is essential for AP's bringup
-	 */
-	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
-#endif
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 5a5d43b3ef5..703f0243f27 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -17,7 +17,6 @@ extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
-extern pgd_t boot_level4_pgt[];
 extern unsigned long __supported_pte_mask;
 
 #define swapper_pg_dir init_level4_pgt
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 78427021d94..3f8f285138d 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -11,8 +11,6 @@ struct pt_regs;
 extern void start_kernel(void);
 extern void pda_init(int); 
 
-extern void zap_low_mappings(int cpu);
-
 extern void early_idt_handler(void);
 
 extern void mcheck_init(struct cpuinfo_x86 *c);
-- 
cgit v1.2.3-70-g09d2


From 0dbf7028c0c1f266c9631139450a1502d3cd457e Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Wed, 2 May 2007 19:27:07 +0200
Subject: [PATCH] x86: __pa and __pa_symbol address space separation

Currently __pa_symbol is for use with symbols in the kernel address
map and __pa is for use with pointers into the physical memory map.
But the code is implemented so you can usually interchange the two.

__pa which is much more common can be implemented much more cheaply
if it is it doesn't have to worry about any other kernel address
spaces.  This is especially true with a relocatable kernel as
__pa_symbol needs to peform an extra variable read to resolve
the address.

There is a third macro that is added for the vsyscall data
__pa_vsymbol for finding the physical addesses of vsyscall pages.

Most of this patch is simply sorting through the references to
__pa or __pa_symbol and using the proper one.  A little of
it is continuing to use a physical address when we have it
instead of recalculating it several times.

swapper_pgd is now NULL.  leave_mm now uses init_mm.pgd
and init_mm.pgd is initialized at boot (instead of compile time)
to the physmem virtual mapping of init_level4_pgd.  The
physical address changed.

Except for the for EMPTY_ZERO page all of the remaining references
to __pa_symbol appear to be during kernel initialization.  So this
should reduce the cost of __pa in the common case, even on a relocated
kernel.

As this is technically a semantic change we need to be on the lookout
for anything I missed.  But it works for me (tm).

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/alternative.c     |  4 ++--
 arch/i386/mm/init.c                | 15 ++++++++-------
 arch/x86_64/kernel/machine_kexec.c | 14 +++++++-------
 arch/x86_64/kernel/setup.c         |  9 +++++----
 arch/x86_64/kernel/smp.c           |  2 +-
 arch/x86_64/kernel/vsyscall.c      |  9 +++++++--
 arch/x86_64/mm/init.c              | 21 +++++++++++----------
 arch/x86_64/mm/pageattr.c          | 16 ++++++++--------
 include/asm-x86_64/page.h          |  6 ++----
 include/asm-x86_64/pgtable.h       |  4 ++--
 10 files changed, 53 insertions(+), 47 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 426f59b0106..a27c8d34736 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -402,8 +402,8 @@ void __init alternative_instructions(void)
 						_text, _etext);
 		}
 		free_init_pages("SMP alternatives",
-				(unsigned long)__smp_alt_begin,
-				(unsigned long)__smp_alt_end);
+				__pa_symbol(&__smp_alt_begin),
+				__pa_symbol(&__smp_alt_end));
 	} else {
 		alternatives_smp_save(__smp_alt_instructions,
 				      __smp_alt_instructions_end);
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index ae436882af7..23be1b0aafa 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -774,10 +774,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	unsigned long addr;
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
+		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
+		ClearPageReserved(page);
+		init_page_count(page);
+		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+		__free_page(page);
 		totalram_pages++;
 	}
 	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
@@ -786,14 +787,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 void free_initmem(void)
 {
 	free_init_pages("unused kernel memory",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
+			__pa_symbol(&__init_begin),
+			__pa_symbol(&__init_end));
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", start, end);
+	free_init_pages("initrd memory", __pa(start), __pa(end));
 }
 #endif
 
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 0497e3bd5bf..a8bb33c1a8f 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -191,19 +191,19 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
 	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
-	page_list[PA_PGD] = __pa(kexec_pgd);
+	page_list[PA_PGD] = __pa_symbol(&kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
-	page_list[PA_PUD_0] = __pa(kexec_pud0);
+	page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0);
 	page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
-	page_list[PA_PMD_0] = __pa(kexec_pmd0);
+	page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0);
 	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-	page_list[PA_PTE_0] = __pa(kexec_pte0);
+	page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0);
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-	page_list[PA_PUD_1] = __pa(kexec_pud1);
+	page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1);
 	page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
-	page_list[PA_PMD_1] = __pa(kexec_pmd1);
+	page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1);
 	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-	page_list[PA_PTE_1] = __pa(kexec_pte1);
+	page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
 	page_list[PA_TABLE_PAGE] =
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 0e2b8df0ea6..b9bdfc1b54f 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -243,11 +243,12 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.end_code = (unsigned long) &_etext;
 	init_mm.end_data = (unsigned long) &_edata;
 	init_mm.brk = (unsigned long) &_end;
+	init_mm.pgd = __va(__pa_symbol(&init_level4_pgt));
 
-	code_resource.start = virt_to_phys(&_text);
-	code_resource.end = virt_to_phys(&_etext)-1;
-	data_resource.start = virt_to_phys(&_etext);
-	data_resource.end = virt_to_phys(&_edata)-1;
+	code_resource.start = __pa_symbol(&_text);
+	code_resource.end = __pa_symbol(&_etext)-1;
+	data_resource.start = __pa_symbol(&_etext);
+	data_resource.end = __pa_symbol(&_edata)-1;
 
 	early_identify_cpu(&boot_cpu_data);
 
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index bd1d123947c..22abae4e9f3 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -76,7 +76,7 @@ static inline void leave_mm(int cpu)
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
 	cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
-	load_cr3(swapper_pg_dir);
+	load_cr3(init_mm.pgd);
 }
 
 /*
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index b43c698cf7d..d14cbb3e0eb 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -45,6 +45,11 @@
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 #define __syscall_clobber "r11","rcx","memory"
+#define __pa_vsymbol(x)			\
+	({unsigned long v;  		\
+	extern char __vsyscall_0; 	\
+	  asm("" : "=r" (v) : "0" (x)); \
+	  ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
 
 struct vsyscall_gtod_data_t {
 	seqlock_t lock;
@@ -224,10 +229,10 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
 		return ret;
 	/* gcc has some trouble with __va(__pa()), so just do it this
 	   way. */
-	map1 = ioremap(__pa_symbol(&vsysc1), 2);
+	map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
 	if (!map1)
 		return -ENOMEM;
-	map2 = ioremap(__pa_symbol(&vsysc2), 2);
+	map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
 	if (!map2) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index b0a60789218..69e22d3c923 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -565,11 +565,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		ClearPageReserved(virt_to_page(addr));
-		init_page_count(virt_to_page(addr));
-		memset((void *)(addr & ~(PAGE_SIZE-1)),
-			POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
+		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
+		ClearPageReserved(page);
+		init_page_count(page);
+		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+		__free_page(page);
 		totalram_pages++;
 	}
 }
@@ -579,17 +579,18 @@ void free_initmem(void)
 	memset(__initdata_begin, POISON_FREE_INITDATA,
 		__initdata_end - __initdata_begin);
 	free_init_pages("unused kernel memory",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
+			__pa_symbol(&__init_begin),
+			__pa_symbol(&__init_end));
 }
 
 #ifdef CONFIG_DEBUG_RODATA
 
 void mark_rodata_ro(void)
 {
-	unsigned long addr = (unsigned long)__start_rodata;
+	unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata));
+	unsigned long end  = (unsigned long)__va(__pa_symbol(&__end_rodata));
 
-	for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
+	for (; addr < end; addr += PAGE_SIZE)
 		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
 
 	printk ("Write protecting the kernel read-only data: %luk\n",
@@ -608,7 +609,7 @@ void mark_rodata_ro(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", start, end);
+	free_init_pages("initrd memory", __pa(start), __pa(end));
 }
 #endif
 
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 081409aa345..76ee90a5abe 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	SetPagePrivate(base);
 	page_private(base) = 0;
 
-	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK; 
 	pbase = (pte_t *)page_address(base);
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
@@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage)
  * No more special protections in this 2/4MB area - revert to a
  * large page again. 
  */
-static void revert_page(unsigned long address, pgprot_t ref_prot)
+static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t large_pte;
-	unsigned long pfn;
 
 	pgd = pgd_offset_k(address);
 	BUG_ON(pgd_none(*pgd));
@@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, address);
 	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
-	pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
 	large_pte = pfn_pte(pfn, ref_prot);
 	large_pte = pte_mkhuge(large_pte);
 	set_pte((pte_t *)pmd, large_pte);
@@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  			 */
 			struct page *split;
 			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-			split = split_large_page(address, prot, ref_prot2);
+			split = split_large_page(pfn << PAGE_SHIFT, prot,
+							ref_prot2);
 			if (!split)
 				return -ENOMEM;
 			set_pte(kpte, mk_pte(split, ref_prot2));
@@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 
 	if (page_private(kpte_page) == 0) {
 		save_page(kpte_page);
-		revert_page(address, ref_prot);
+		revert_page(address, pfn, ref_prot);
  	}
 	return 0;
 } 
@@ -180,6 +178,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  */
 int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
 {
+	unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT;
 	int err = 0; 
 	int i; 
 
@@ -192,10 +191,11 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
 			break; 
 		/* Handle kernel mapping too which aliases part of the
 		 * lowmem */
-		if (__pa(address) < KERNEL_TEXT_SIZE) {
+		if ((pfn >= phys_base_pfn) &&
+			((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) {
 			unsigned long addr2;
 			pgprot_t prot2;
-			addr2 = __START_KERNEL_map + __pa(address);
+			addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT);
 			/* Make sure the kernel mappings stay executable */
 			prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
 			err = __change_page_attr(addr2, pfn, prot2,
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index d554b94485d..4974433bbf3 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -102,17 +102,15 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 
 /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
    Otherwise you risk miscompilation. */ 
-#define __pa(x)			(((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
+#define __pa(x)			((unsigned long)(x) - PAGE_OFFSET)
 /* __pa_symbol should be used for C visible symbols.
    This seems to be the official gcc blessed way to do such arithmetic. */ 
 #define __pa_symbol(x)		\
 	({unsigned long v;  \
 	  asm("" : "=r" (v) : "0" (x)); \
-	  __pa(v); })
+	  (v - __START_KERNEL_map); })
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
-#define __boot_va(x)		__va(x)
-#define __boot_pa(x)		__pa(x)
 #ifdef CONFIG_FLATMEM
 #define pfn_valid(pfn)		((pfn) < end_pfn)
 #endif
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 703f0243f27..c1865e38c7b 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -19,7 +19,7 @@ extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
 extern unsigned long __supported_pte_mask;
 
-#define swapper_pg_dir init_level4_pgt
+#define swapper_pg_dir ((pgd_t *)NULL)
 
 extern void paging_init(void);
 extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
@@ -29,7 +29,7 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
  * for zero-mapped memory areas etc..
  */
 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+#define ZERO_PAGE(vaddr) (pfn_to_page(__pa_symbol(&empty_zero_page) >> PAGE_SHIFT))
 
 #endif /* !__ASSEMBLY__ */
 
-- 
cgit v1.2.3-70-g09d2


From 8b8ca80e192b10eecc01fc44a2902510af86f73b Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 2 May 2007 19:27:09 +0200
Subject: [PATCH] x86-64: configurable fake numa node sizes

Extends the numa=fake x86_64 command-line option to allow for configurable
node sizes.  These nodes can be used in conjunction with cpusets for coarse
memory resource management.

The old command-line option is still supported:
  numa=fake=32	gives 32 fake NUMA nodes, ignoring the NUMA setup of the
		actual machine.

But now you may configure your system for the node sizes of your choice:
  numa=fake=2*512,1024,2*256
		gives two 512M nodes, one 1024M node, two 256M nodes, and
		the rest of system memory to a sixth node.

The existing hash function is maintained to support the various node sizes
that are possible with this implementation.

Each node of the same size receives roughly the same amount of available
pages, regardless of any reserved memory with its address range.  The total
available pages on the system is calculated and divided by the number of equal
nodes to allocate.  These nodes are then dynamically allocated and their
borders extended until such time as their number of available pages reaches
the required size.

Configurable node sizes are recommended when used in conjunction with cpusets
for memory control because it eliminates the overhead associated with scanning
the zonelists of many smaller full nodes on page_alloc().

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/x86_64/boot-options.txt |   8 +-
 arch/x86_64/mm/numa.c                 | 261 ++++++++++++++++++++--------------
 include/asm-x86_64/mmzone.h           |   2 +-
 3 files changed, 161 insertions(+), 110 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 85f51e5a749..7500aad95f3 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
 
   numa=noacpi   Don't parse the SRAT table for NUMA setup
 
-  numa=fake=X   Fake X nodes and ignore NUMA setup of the actual machine.
+  numa=fake=CMDLINE
+		If a number, fakes CMDLINE nodes and ignores NUMA setup of the
+		actual machine.  Otherwise, system memory is configured
+		depending on the sizes and coefficients listed.  For example:
+			numa=fake=2*512,1024,4*256
+		gives two 512M nodes, a 1024M node, and four 256M nodes.  The
+		remaining system RAM is allocated to an additional node.
 
   numa=hotadd=percent
 		Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 41b8fb06992..c55936bc6be 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,125 +273,172 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-int numa_fake __initdata = 0;
+#define E820_ADDR_HOLE_SIZE(start, end)					\
+	(e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) <<	\
+	PAGE_SHIFT)
+char *cmdline __initdata;
 
 /*
- * This function is used to find out if the start and end correspond to
- * different zones.
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 otherwise.
+ * addr is adjusted to be at the end of the node.
  */
-int zone_cross_over(unsigned long start, unsigned long end)
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+				   u64 size, u64 max_addr)
 {
-	if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
-			(end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
-		return 1;
-	return 0;
+	int ret = 0;
+	nodes[nid].start = *addr;
+	*addr += size;
+	if (*addr >= max_addr) {
+		*addr = max_addr;
+		ret = -1;
+	}
+	nodes[nid].end = *addr;
+	node_set_online(nid);
+	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+	       nodes[nid].start, nodes[nid].end,
+	       (nodes[nid].end - nodes[nid].start) >> 20);
+	return ret;
 }
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+				      u64 max_addr, int node_start,
+				      int num_nodes)
 {
- 	int i, big;
- 	struct bootnode nodes[MAX_NUMNODES];
- 	unsigned long sz, old_sz;
-	unsigned long hole_size;
-	unsigned long start, end;
-	unsigned long max_addr = (end_pfn << PAGE_SHIFT);
-
-	start = (start_pfn << PAGE_SHIFT);
-	hole_size = e820_hole_size(start, max_addr);
-	sz = (max_addr - start - hole_size) / numa_fake;
-
- 	/* Kludge needed for the hash function */
-
-	old_sz = sz;
-	/*
-	 * Round down to the nearest FAKE_NODE_MIN_SIZE.
-	 */
-	sz &= FAKE_NODE_MIN_HASH_MASK;
+	unsigned int big;
+	u64 size;
+	int i;
 
+	if (num_nodes <= 0)
+		return -1;
+	if (num_nodes > MAX_NUMNODES)
+		num_nodes = MAX_NUMNODES;
+	size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
+	       num_nodes;
 	/*
-	 * We ensure that each node is at least 64MB big.  Smaller than this
-	 * size can cause VM hiccups.
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the leftovers.
 	 */
-	if (sz == 0) {
-		printk(KERN_INFO "Not enough memory for %d nodes.  Reducing "
-				"the number of nodes\n", numa_fake);
-		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
-		printk(KERN_INFO "Number of fake nodes will be = %d\n",
-				numa_fake);
-		sz = FAKE_NODE_MIN_SIZE;
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
+	      FAKE_NODE_MIN_SIZE;
+
+	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		printk(KERN_ERR "Not enough memory for each node.  "
+		       "NUMA emulation disabled.\n");
+		return -1;
 	}
-	/*
-	 * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
-	 * This logic ensures the extra memory gets distributed among as many
-	 * nodes as possible (as compared to one single node getting all that
-	 * extra memory.
-	 */
-	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
-	printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
-			"%d\n",
-			(sz >> 20), (hole_size >> 20), big);
- 	memset(&nodes,0,sizeof(nodes));
-	end = start;
- 	for (i = 0; i < numa_fake; i++) {
-		/*
-		 * In case we are not able to allocate enough memory for all
-		 * the nodes, we reduce the number of fake nodes.
-		 */
-		if (end >= max_addr) {
-			numa_fake = i - 1;
-			break;
-		}
- 		start = nodes[i].start = end;
-		/*
-		 * Final node can have all the remaining memory.
-		 */
- 		if (i == numa_fake-1)
- 			sz = max_addr - start;
- 		end = nodes[i].start + sz;
-		/*
-		 * Fir "big" number of nodes get extra granule.
-		 */
+
+	for (i = node_start; i < num_nodes + node_start; i++) {
+		u64 end = *addr + size;
 		if (i < big)
 			end += FAKE_NODE_MIN_SIZE;
 		/*
-		 * Iterate over the range to ensure that this node gets at
-		 * least sz amount of RAM (excluding holes)
+		 * The final node can have the remaining system RAM.  Other
+		 * nodes receive roughly the same amount of available pages.
 		 */
-		while ((end - start - e820_hole_size(start, end)) < sz) {
-			end += FAKE_NODE_MIN_SIZE;
-			if (end >= max_addr)
-				break;
+		if (i == num_nodes + node_start - 1)
+			end = max_addr;
+		else
+			while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
+			       size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > max_addr) {
+					end = max_addr;
+					break;
+				}
+			}
+		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+			break;
+	}
+	return i - node_start + 1;
+}
+
+/*
+ * Sets up the system RAM area from start_pfn to end_pfn according to the
+ * numa=fake command-line option.
+ */
+static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct bootnode nodes[MAX_NUMNODES];
+	u64 addr = start_pfn << PAGE_SHIFT;
+	u64 max_addr = end_pfn << PAGE_SHIFT;
+	unsigned int coeff;
+	unsigned int num = 0;
+	int num_nodes = 0;
+	u64 size;
+	int i;
+
+	memset(&nodes, 0, sizeof(nodes));
+	/*
+	 * If the numa=fake command-line is just a single number N, split the
+	 * system RAM into N fake nodes.
+	 */
+	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
+		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
+						simple_strtol(cmdline, NULL, 0));
+		if (num_nodes < 0)
+			return num_nodes;
+		goto out;
+	}
+
+	/* Parse the command line. */
+	for (coeff = 1; ; cmdline++) {
+		if (*cmdline && isdigit(*cmdline)) {
+			num = num * 10 + *cmdline - '0';
+			continue;
 		}
-		/*
-		 * Look at the next node to make sure there is some real memory
-		 * to map.  Bad things happen when the only memory present
-		 * in a zone on a fake node is IO hole.
-		 */
-		while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
-			if (zone_cross_over(start, end + sz)) {
-				end = (MAX_DMA32_PFN << PAGE_SHIFT);
-				break;
+		if (*cmdline == '*')
+			coeff = num;
+		if (!*cmdline || *cmdline == ',') {
+			/*
+			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
+			 * Command-line coefficients are in megabytes.
+			 */
+			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
+			if (size) {
+				for (i = 0; i < coeff; i++, num_nodes++)
+					if (setup_node_range(num_nodes, nodes,
+						&addr, size, max_addr) < 0)
+						goto done;
+				coeff = 1;
 			}
-			if (end >= max_addr)
-				break;
-			end += FAKE_NODE_MIN_SIZE;
 		}
-		if (end > max_addr)
-			end = max_addr;
-		nodes[i].end = end;
- 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
- 		       i,
- 		       nodes[i].start, nodes[i].end,
- 		       (nodes[i].end - nodes[i].start) >> 20);
-		node_set_online(i);
- 	}
- 	memnode_shift = compute_hash_shift(nodes, numa_fake);
- 	if (memnode_shift < 0) {
- 		memnode_shift = 0;
- 		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
- 		return -1;
- 	}
- 	for_each_online_node(i) {
+		if (!*cmdline)
+			break;
+		num = 0;
+	}
+done:
+	if (!num_nodes)
+		return -1;
+	/* Fill remainder of system RAM with a final node, if appropriate. */
+	if (addr < max_addr) {
+		setup_node_range(num_nodes, nodes, &addr, max_addr - addr,
+				 max_addr);
+		num_nodes++;
+	}
+out:
+	memnode_shift = compute_hash_shift(nodes, num_nodes);
+	if (memnode_shift < 0) {
+		memnode_shift = 0;
+		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
+		       "disabled.\n");
+		return -1;
+	}
+
+	/*
+	 * We need to vacate all active ranges that may have been registered by
+	 * SRAT.
+	 */
+	remove_all_active_ranges();
+	for_each_online_node(i) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
  		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  	numa_init_array();
  	return 0;
 }
-#endif
+#undef E820_ADDR_HOLE_SIZE
+#endif /* CONFIG_NUMA_EMU */
 
 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 { 
 	int i;
 
 #ifdef CONFIG_NUMA_EMU
-	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+	if (cmdline && !numa_emulation(start_pfn, end_pfn))
  		return;
 #endif
 
@@ -486,11 +534,8 @@ static __init int numa_setup(char *opt)
 	if (!strncmp(opt,"off",3))
 		numa_off = 1;
 #ifdef CONFIG_NUMA_EMU
-	if(!strncmp(opt, "fake=", 5)) {
-		numa_fake = simple_strtoul(opt+5,NULL,0); ;
-		if (numa_fake >= MAX_NUMNODES)
-			numa_fake = MAX_NUMNODES;
-	}
+	if (!strncmp(opt, "fake=", 5))
+		cmdline = opt + 5;
 #endif
 #ifdef CONFIG_ACPI_NUMA
  	if (!strncmp(opt,"noacpi",6))
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index fb558fb1d21..19a89377b12 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	(64*1024*1024)
-#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1ul))
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1uL))
 #endif
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 14694d736bb66d0ec250d05c81c6e98a19c229c6 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 2 May 2007 19:27:09 +0200
Subject: [PATCH] x86-64: split remaining fake nodes equally

Extends the numa=fake x86_64 command-line option to split the remaining
system memory into equal-sized nodes.

For example:
numa=fake=2*512,4*	gives two 512M nodes and the remaining system
			memory is split into four approximately equal
			chunks.

This is beneficial for systems where the exact size of RAM is unknown or not
necessarily relevant, but the granularity with which nodes shall be allocated
is known.

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/x86_64/boot-options.txt |  4 +++-
 arch/x86_64/mm/numa.c                 | 22 ++++++++++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 7500aad95f3..12a9aacecaa 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -155,7 +155,9 @@ NUMA
 		depending on the sizes and coefficients listed.  For example:
 			numa=fake=2*512,1024,4*256
 		gives two 512M nodes, a 1024M node, and four 256M nodes.  The
-		remaining system RAM is allocated to an additional node.
+		remaining system RAM is allocated to an additional node.  If
+		the last character of CMDLINE is a *, the remaining system RAM
+		is instead divided up equally among its coefficient.
 
   numa=hotadd=percent
 		Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index c55936bc6be..0ae2d9d5d7e 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -418,11 +418,25 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 done:
 	if (!num_nodes)
 		return -1;
-	/* Fill remainder of system RAM with a final node, if appropriate. */
+	/* Fill remainder of system RAM, if appropriate. */
 	if (addr < max_addr) {
-		setup_node_range(num_nodes, nodes, &addr, max_addr - addr,
-				 max_addr);
-		num_nodes++;
+		switch (*(cmdline - 1)) {
+		case '*':
+			/* Split remaining nodes into coeff chunks */
+			if (coeff <= 0)
+				break;
+			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+							 num_nodes, coeff);
+			break;
+		case ',':
+			/* Do not allocate remaining system RAM */
+			break;
+		default:
+			/* Give one final node */
+			setup_node_range(num_nodes, nodes, &addr,
+					 max_addr - addr, max_addr);
+			num_nodes++;
+		}
 	}
 out:
 	memnode_shift = compute_hash_shift(nodes, num_nodes);
-- 
cgit v1.2.3-70-g09d2


From 382591d500bbcd20a44416c5e0e292708468587c Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 2 May 2007 19:27:09 +0200
Subject: [PATCH] x86-64: fixed size remaining fake nodes

Extends the numa=fake x86_64 command-line option to split the remaining system
memory into nodes of fixed size.  Any leftover memory is allocated to a final
node unless the command-line ends with a comma.

For example:
  numa=fake=2*512,*128	gives two 512M nodes and the remaining system
			memory is split into nodes of 128M each.

This is beneficial for systems where the exact size of RAM is unknown or not
necessarily relevant, but the size of the remaining nodes to be allocated is
known based on their capacity for resource management.

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/x86_64/boot-options.txt | 14 +++++++----
 arch/x86_64/mm/numa.c                 | 47 +++++++++++++++++++++++++++--------
 2 files changed, 46 insertions(+), 15 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 12a9aacecaa..6177d881983 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -153,11 +153,15 @@ NUMA
 		If a number, fakes CMDLINE nodes and ignores NUMA setup of the
 		actual machine.  Otherwise, system memory is configured
 		depending on the sizes and coefficients listed.  For example:
-			numa=fake=2*512,1024,4*256
-		gives two 512M nodes, a 1024M node, and four 256M nodes.  The
-		remaining system RAM is allocated to an additional node.  If
-		the last character of CMDLINE is a *, the remaining system RAM
-		is instead divided up equally among its coefficient.
+			numa=fake=2*512,1024,4*256,*128
+		gives two 512M nodes, a 1024M node, four 256M nodes, and the
+		rest split into 128M chunks.  If the last character of CMDLINE
+		is a *, the remaining memory is divided up equally among its
+		coefficient:
+			numa=fake=2*512,2*
+		gives two 512M nodes and the rest split into two nodes.
+		Otherwise, the remaining system RAM is allocated to an
+		additional node.
 
   numa=hotadd=percent
 		Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 0ae2d9d5d7e..5ee07bc41eb 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -361,6 +361,21 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 	return i - node_start + 1;
 }
 
+/*
+ * Splits the remaining system RAM into chunks of size.  The remaining memory is
+ * always assigned to a final node and can be asymmetric.  Returns the number of
+ * nodes split.
+ */
+static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
+				      u64 max_addr, int node_start, u64 size)
+{
+	int i = node_start;
+	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
+	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+		;
+	return i - node_start;
+}
+
 /*
  * Sets up the system RAM area from start_pfn to end_pfn according to the
  * numa=fake command-line option.
@@ -370,9 +385,10 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 	struct bootnode nodes[MAX_NUMNODES];
 	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = end_pfn << PAGE_SHIFT;
-	unsigned int coeff;
-	unsigned int num = 0;
 	int num_nodes = 0;
+	int coeff_flag;
+	int coeff = -1;
+	int num = 0;
 	u64 size;
 	int i;
 
@@ -390,29 +406,34 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 	}
 
 	/* Parse the command line. */
-	for (coeff = 1; ; cmdline++) {
+	for (coeff_flag = 0; ; cmdline++) {
 		if (*cmdline && isdigit(*cmdline)) {
 			num = num * 10 + *cmdline - '0';
 			continue;
 		}
-		if (*cmdline == '*')
-			coeff = num;
+		if (*cmdline == '*') {
+			if (num > 0)
+				coeff = num;
+			coeff_flag = 1;
+		}
 		if (!*cmdline || *cmdline == ',') {
+			if (!coeff_flag)
+				coeff = 1;
 			/*
 			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
 			 * Command-line coefficients are in megabytes.
 			 */
 			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
-			if (size) {
+			if (size)
 				for (i = 0; i < coeff; i++, num_nodes++)
 					if (setup_node_range(num_nodes, nodes,
 						&addr, size, max_addr) < 0)
 						goto done;
-				coeff = 1;
-			}
+			if (!*cmdline)
+				break;
+			coeff_flag = 0;
+			coeff = -1;
 		}
-		if (!*cmdline)
-			break;
 		num = 0;
 	}
 done:
@@ -420,6 +441,12 @@ done:
 		return -1;
 	/* Fill remainder of system RAM, if appropriate. */
 	if (addr < max_addr) {
+		if (coeff_flag && coeff < 0) {
+			/* Split remaining nodes into num-sized chunks */
+			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+							 num_nodes, num);
+			goto out;
+		}
 		switch (*(cmdline - 1)) {
 		case '*':
 			/* Split remaining nodes into coeff chunks */
-- 
cgit v1.2.3-70-g09d2


From d01ad8dd56527be72947b4b9997bb2c05783c3ed Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 2 May 2007 19:27:10 +0200
Subject: [PATCH] x86: Improve handling of kernel mappings in change_page_attr

Fix various broken corner cases in i386 and x86-64 change_page_attr.

AK: split off from tighten kernel image access rights

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/mm/pageattr.c    |  4 ++--
 arch/x86_64/mm/pageattr.c  | 16 ++++++++++++----
 include/asm-i386/pgtable.h |  2 ++
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 412ebbd8adb..ea6b6d4a0a2 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
 		return -EINVAL;
 	kpte_page = virt_to_page(kpte);
 	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
-		if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 
+		if (!pte_huge(*kpte)) {
 			set_pte_atomic(kpte, mk_pte(page, prot)); 
 		} else {
 			pgprot_t ref_prot;
@@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
 			kpte_page = split;
 		}
 		page_private(kpte_page)++;
-	} else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 
+	} else if (!pte_huge(*kpte)) {
 		set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
 		BUG_ON(page_private(kpte_page) == 0);
 		page_private(kpte_page)--;
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 76ee90a5abe..bf4aa8dd425 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -179,16 +179,24 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
 {
 	unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT;
-	int err = 0; 
+	int err = 0, kernel_map = 0;
 	int i; 
 
+	if (address >= __START_KERNEL_map
+	    && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
+		address = (unsigned long)__va(__pa(address));
+		kernel_map = 1;
+	}
+
 	down_write(&init_mm.mmap_sem);
 	for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
 		unsigned long pfn = __pa(address) >> PAGE_SHIFT;
 
-		err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
-		if (err) 
-			break; 
+		if (!kernel_map || pte_present(pfn_pte(0, prot))) {
+			err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
+			if (err)
+				break;
+		}
 		/* Handle kernel mapping too which aliases part of the
 		 * lowmem */
 		if ((pfn >= phys_base_pfn) &&
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index c3b58d473a5..143ddc42b86 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -159,6 +159,7 @@ void paging_init(void);
 
 extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
 #define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
 #define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -166,6 +167,7 @@ extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
 #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL)
 #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO)
 #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX)
 #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE)
 #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE)
 #define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC)
-- 
cgit v1.2.3-70-g09d2


From 6fb14755a676282a4e6caa05a08c92db8e45cfff Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 2 May 2007 19:27:10 +0200
Subject: [PATCH] x86: tighten kernel image page access rights

On x86-64, kernel memory freed after init can be entirely unmapped instead
of just getting 'poisoned' by overwriting with a debug pattern.

On i386 and x86-64 (under CONFIG_DEBUG_RODATA), kernel text and bug table
can also be write-protected.

Compared to the first version, this one prevents re-creating deleted
mappings in the kernel image range on x86-64, if those got removed
previously. This, together with the original changes, prevents temporarily
having inconsistent mappings when cacheability attributes are being
changed on such pages (e.g. from AGP code). While on i386 such duplicate
mappings don't exist, the same change is done there, too, both for
consistency and because checking pte_present() before using various other
pte_XXX functions is a requirement anyway. At once, i386 code gets
adjusted to use pte_huge() instead of open coding this.

AK: split out cpa() changes

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/vmlinux.lds.S   |  4 ++--
 arch/i386/mm/init.c              | 25 +++++++++++++++++++------
 arch/x86_64/kernel/head.S        |  1 -
 arch/x86_64/kernel/vmlinux.lds.S |  5 +++--
 arch/x86_64/mm/init.c            | 25 ++++++++++++++++---------
 include/linux/poison.h           |  3 ---
 6 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 6f38f818380..f4ec7223183 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -61,8 +61,6 @@ SECTIONS
   	__stop___ex_table = .;
   }
 
-  RODATA
-
   BUG_TABLE
 
   . = ALIGN(4);
@@ -72,6 +70,8 @@ SECTIONS
   	__tracedata_end = .;
   }
 
+  RODATA
+
   /* writeable */
   . = ALIGN(4096);
   .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 23be1b0aafa..bd5ef371850 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/pfn.h>
 #include <linux/poison.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
@@ -751,13 +752,25 @@ static int noinline do_test_wp_bit(void)
 
 void mark_rodata_ro(void)
 {
-	unsigned long addr = (unsigned long)__start_rodata;
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long size = PFN_ALIGN(_etext) - start;
 
-	for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
-		change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
+#ifdef CONFIG_HOTPLUG_CPU
+	/* It must still be possible to apply SMP alternatives. */
+	if (num_possible_cpus() <= 1)
+#endif
+	{
+		change_page_attr(virt_to_page(start),
+		                 size >> PAGE_SHIFT, PAGE_KERNEL_RX);
+		printk("Write protecting the kernel text: %luk\n", size >> 10);
+	}
 
-	printk("Write protecting the kernel read-only data: %uk\n",
-			(__end_rodata - __start_rodata) >> 10);
+	start += size;
+	size = (unsigned long)__end_rodata - start;
+	change_page_attr(virt_to_page(start),
+	                 size >> PAGE_SHIFT, PAGE_KERNEL_RO);
+	printk("Write protecting the kernel read-only data: %luk\n",
+	       size >> 10);
 
 	/*
 	 * change_page_attr() requires a global_flush_tlb() call after it.
@@ -781,7 +794,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 		__free_page(page);
 		totalram_pages++;
 	}
-	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
+	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
 }
 
 void free_initmem(void)
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 36aa98a6d15..fd9fdfdd143 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -280,7 +280,6 @@ early_idt_ripmsg:
 
 .balign PAGE_SIZE
 ENTRY(stext)
-ENTRY(_stext)
 
 #define NEXT_PAGE(name) \
 	.balign	PAGE_SIZE; \
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 5176ecf006e..3bdeb88d28f 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -29,6 +29,7 @@ SECTIONS
   .text :  AT(ADDR(.text) - LOAD_OFFSET) {
 	/* First the code that has to be first for bootstrapping */
 	*(.bootstrap.text)
+	_stext = .;
 	/* Then all the functions that are "hot" in profiles, to group them
            onto the same hugetlb entry */
 	#include "functionlist"
@@ -50,10 +51,10 @@ SECTIONS
   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
   __stop___ex_table = .;
 
-  RODATA
-
   BUG_TABLE
 
+  RODATA
+
   . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 69e22d3c923..e3134bc9a4f 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,6 +22,7 @@
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
+#include <linux/pfn.h>
 #include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
@@ -563,21 +564,23 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	if (begin >= end)
 		return;
 
-	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
+	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
 		ClearPageReserved(page);
 		init_page_count(page);
 		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+		if (addr >= __START_KERNEL_map)
+			change_page_attr_addr(addr, 1, __pgprot(0));
 		__free_page(page);
 		totalram_pages++;
 	}
+	if (addr > __START_KERNEL_map)
+		global_flush_tlb();
 }
 
 void free_initmem(void)
 {
-	memset(__initdata_begin, POISON_FREE_INITDATA,
-		__initdata_end - __initdata_begin);
 	free_init_pages("unused kernel memory",
 			__pa_symbol(&__init_begin),
 			__pa_symbol(&__init_end));
@@ -587,14 +590,18 @@ void free_initmem(void)
 
 void mark_rodata_ro(void)
 {
-	unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata));
-	unsigned long end  = (unsigned long)__va(__pa_symbol(&__end_rodata));
+	unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size;
 
-	for (; addr < end; addr += PAGE_SIZE)
-		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
+#ifdef CONFIG_HOTPLUG_CPU
+	/* It must still be possible to apply SMP alternatives. */
+	if (num_possible_cpus() > 1)
+		start = PFN_ALIGN(__va(__pa_symbol(&_etext)));
+#endif
+	size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start;
+	change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO);
 
-	printk ("Write protecting the kernel read-only data: %luk\n",
-			(__end_rodata - __start_rodata) >> 10);
+	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+	       size >> 10);
 
 	/*
 	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 3e628f990fd..89580b76495 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -26,9 +26,6 @@
 /********** arch/$ARCH/mm/init.c **********/
 #define POISON_FREE_INITMEM	0xcc
 
-/********** arch/x86_64/mm/init.c **********/
-#define	POISON_FREE_INITDATA	0xba
-
 /********** arch/ia64/hp/common/sba_iommu.c **********/
 /*
  * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
-- 
cgit v1.2.3-70-g09d2


From 2bff73830c3df5f575d3bc21bf19df1a10bf7091 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 2 May 2007 19:27:10 +0200
Subject: [PATCH] x86-64: use lru instead of page->index and page->private for
 pgd lists management.

x86_64 currently simulates a list using the index and private fields of the
page struct.  Seems that the code was inherited from i386.  But x86_64 does
not use the slab to allocate pgds and pmds etc.  So the lru field is not
used by the slab and therefore available.

This patch uses standard list operations on page->lru to realize pgd
tracking.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86_64/mm/fault.c       |  5 ++---
 include/asm-x86_64/pgalloc.h | 14 +++-----------
 include/asm-x86_64/pgtable.h |  2 +-
 3 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 6ada7231f3a..de99dba2c51 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -585,7 +585,7 @@ do_sigbus:
 }
 
 DEFINE_SPINLOCK(pgd_lock);
-struct page *pgd_list;
+LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
@@ -605,8 +605,7 @@ void vmalloc_sync_all(void)
 			if (pgd_none(*pgd_ref))
 				continue;
 			spin_lock(&pgd_lock);
-			for (page = pgd_list; page;
-			     page = (struct page *)page->index) {
+			list_for_each_entry(page, &pgd_list, lru) {
 				pgd_t *pgd;
 				pgd = (pgd_t *)page_address(page) + pgd_index(address);
 				if (pgd_none(*pgd))
diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h
index 31d49717196..8bb56468786 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -44,24 +44,16 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 
 	spin_lock(&pgd_lock);
-	page->index = (pgoff_t)pgd_list;
-	if (pgd_list)
-		pgd_list->private = (unsigned long)&page->index;
-	pgd_list = page;
-	page->private = (unsigned long)&pgd_list;
+	list_add(&page->lru, &pgd_list);
 	spin_unlock(&pgd_lock);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
-	struct page *next, **pprev, *page = virt_to_page(pgd);
+	struct page *page = virt_to_page(pgd);
 
 	spin_lock(&pgd_lock);
-	next = (struct page *)page->index;
-	pprev = (struct page **)page->private;
-	*pprev = next;
-	if (next)
-		next->private = (unsigned long)pprev;
+	list_del(&page->lru);
 	spin_unlock(&pgd_lock);
 }
 
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index c1865e38c7b..599993f6ba8 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -410,7 +410,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
 extern spinlock_t pgd_lock;
-extern struct page *pgd_list;
+extern struct list_head pgd_list;
 void vmalloc_sync_all(void);
 
 extern int kern_addr_valid(unsigned long addr); 
-- 
cgit v1.2.3-70-g09d2


From ae32b1297a77c23fd0badd642bb685062f7a37f8 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek <konrad@darnok.org>
Date: Wed, 2 May 2007 19:27:11 +0200
Subject: [PATCH] x86-64: Inhibit machine from asserting an NMI when doing
 Alt-SysRq-M operation.

This patch touches the NMI watchdog every MAX_ORDER_NR_PAGES
to inhibit the machine from triggering an NMI while the CPUs
are locked. This situation is happening on boxes with more
than 64CPUs and 128GB of RAM when Alt-SysRq-m is performed.

It has been succesfully tested for regression on uni, 2, 4, 8
32, and 64 CPU boxes with various memory configuration.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/mm/init.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86_64/mm')

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e3134bc9a4f..282b0a8f00a 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -27,6 +27,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/memory_hotplug.h>
+#include <linux/nmi.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -73,6 +74,11 @@ void show_mem(void)
 
 	for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+			/* this loop can take a while with 256 GB and 4k pages
+			   so update the NMI watchdog */
+			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
+				touch_nmi_watchdog();
+			}
 			page = pfn_to_page(pgdat->node_start_pfn + i);
 			total++;
 			if (PageReserved(page))
-- 
cgit v1.2.3-70-g09d2


From e3f1caeef9a70b0699518092d653c15274b025ab Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 2 May 2007 19:27:20 +0200
Subject: [PATCH] x86-64: set node_possible_map at runtime - try 2

Set the node_possible_map at runtime on x86_64.  On a non NUMA system,
num_possible_nodes() will now say '1'.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
---
 arch/x86_64/mm/k8topology.c |  7 ++-----
 arch/x86_64/mm/numa.c       | 10 ++++++++--
 arch/x86_64/mm/srat.c       |  8 +++++---
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index b5b8dba28b4..60e860e5ef4 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -49,11 +49,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	int found = 0;
 	u32 reg;
 	unsigned numnodes;
-	nodemask_t nodes_parsed;
 	unsigned dualcore = 0;
 
-	nodes_clear(nodes_parsed);
-
 	if (!early_pci_allowed())
 		return -1;
 
@@ -102,7 +99,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 			       nodeid, (base>>8)&3, (limit>>8) & 3); 
 			return -1; 
 		}	
-		if (node_isset(nodeid, nodes_parsed)) { 
+		if (node_isset(nodeid, node_possible_map)) {
 			printk(KERN_INFO "Node %d already present. Skipping\n", 
 			       nodeid);
 			continue;
@@ -155,7 +152,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 		prevbase = base;
 
-		node_set(nodeid, nodes_parsed);
+		node_set(nodeid, node_possible_map);
 	} 
 
 	if (!found)
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 5ee07bc41eb..51548947ad3 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -295,7 +295,7 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 		ret = -1;
 	}
 	nodes[nid].end = *addr;
-	node_set_online(nid);
+	node_set(nid, node_possible_map);
 	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
 	       nodes[nid].start, nodes[nid].end,
 	       (nodes[nid].end - nodes[nid].start) >> 20);
@@ -479,7 +479,7 @@ out:
 	 * SRAT.
 	 */
 	remove_all_active_ranges();
-	for_each_online_node(i) {
+	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
  		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -494,20 +494,25 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 { 
 	int i;
 
+	nodes_clear(node_possible_map);
+
 #ifdef CONFIG_NUMA_EMU
 	if (cmdline && !numa_emulation(start_pfn, end_pfn))
  		return;
+	nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
 	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 					  end_pfn << PAGE_SHIFT))
  		return;
+	nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
 	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
 		return;
+	nodes_clear(node_possible_map);
 #endif
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
@@ -521,6 +526,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	memnodemap[0] = 0;
 	nodes_clear(node_online_map);
 	node_set_online(0);
+	node_set(0, node_possible_map);
 	for (i = 0; i < NR_CPUS; i++)
 		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2efe215fc76..1e76bb0a727 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -419,19 +419,21 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
+	node_possible_map = nodes_parsed;
+
 	/* Finally register nodes */
-	for_each_node_mask(i, nodes_parsed)
+	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	/* Try again in case setup_node_bootmem missed one due
 	   to missing bootmem */
-	for_each_node_mask(i, nodes_parsed)
+	for_each_node_mask(i, node_possible_map)
 		if (!node_online(i))
 			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] == NUMA_NO_NODE)
 			continue;
-		if (!node_isset(cpu_to_node[i], nodes_parsed))
+		if (!node_isset(cpu_to_node[i], node_possible_map))
 			numa_set_node(i, NUMA_NO_NODE);
 	}
 	numa_init_array();
-- 
cgit v1.2.3-70-g09d2


From 3bea9c9793a17053e05d970e5d90d48fc9fce07d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 2 May 2007 19:27:21 +0200
Subject: [PATCH] x86-64: Don't enable NUMA for a single node in K8 NUMA
 scanning

This was supposed to see the full memory on a ASUS A8SX motherboard
with 4GB RAM where the northbridge reports less memory, but it didn't
help there. But it's a reasonable change so let's include it anyways.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/mm/k8topology.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86_64/mm')

diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 60e860e5ef4..f983c75825d 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -62,6 +62,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 	reg = read_pci_config(0, nb, 0, 0x60); 
 	numnodes = ((reg >> 4) & 0xF) + 1;
+	if (numnodes <= 1)
+		return -1;
 
 	printk(KERN_INFO "Number of nodes %d\n", numnodes);
 
-- 
cgit v1.2.3-70-g09d2


From e3ebadd95cb621e2c7436f3d3646447ac9d5c16d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Mon, 7 May 2007 08:44:24 -0700
Subject: Revert "[PATCH] x86: __pa and __pa_symbol address space separation"

This was broken.  It adds complexity, for no good reason.  Rather than
separate __pa() and __pa_symbol(), we should deprecate __pa_symbol(),
and preferably __pa() too - and just use "virt_to_phys()" instead, which
is more readable and has nicer semantics.

However, right now, just undo the separation, and make __pa_symbol() be
the exact same as __pa().  That fixes the bugs this patch introduced,
and we can do the fairly obvious cleanups later.

Do the new __phys_addr() function (which is now the actual workhorse for
the unified __pa()/__pa_symbol()) as a real external function, that way
all the potential issues with compile/link-time optimizations of
constant symbol addresses go away, and we can also, if we choose to, add
more sanity-checking of the argument.

Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/kernel/alternative.c     |  4 ++--
 arch/i386/mm/init.c                | 15 +++++++--------
 arch/x86_64/kernel/machine_kexec.c | 16 ++++++++--------
 arch/x86_64/kernel/setup.c         |  9 ++++-----
 arch/x86_64/kernel/smp.c           |  2 +-
 arch/x86_64/mm/init.c              | 31 ++++++++++++++++++-------------
 arch/x86_64/mm/ioremap.c           |  9 +++++++++
 arch/x86_64/mm/pageattr.c          | 16 ++++++++--------
 include/asm-x86_64/page.h          | 18 +++++++-----------
 include/asm-x86_64/pgtable.h       |  4 ++--
 10 files changed, 66 insertions(+), 58 deletions(-)

(limited to 'arch/x86_64/mm')

diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index e5cec6685cc..d8cda14fff8 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -390,8 +390,8 @@ void __init alternative_instructions(void)
 						_text, _etext);
 		}
 		free_init_pages("SMP alternatives",
-				__pa_symbol(&__smp_locks),
-				__pa_symbol(&__smp_locks_end));
+				(unsigned long)__smp_locks,
+				(unsigned long)__smp_locks_end);
 	} else {
 		alternatives_smp_module_add(NULL, "core kernel",
 					    __smp_locks, __smp_locks_end,
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index dbe16f63a56..1a7197e89eb 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -843,11 +843,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	unsigned long addr;
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
-		ClearPageReserved(page);
-		init_page_count(page);
-		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
-		__free_page(page);
+		ClearPageReserved(virt_to_page(addr));
+		init_page_count(virt_to_page(addr));
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
+		free_page(addr);
 		totalram_pages++;
 	}
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
@@ -856,14 +855,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 void free_initmem(void)
 {
 	free_init_pages("unused kernel memory",
-			__pa_symbol(&__init_begin),
-			__pa_symbol(&__init_end));
+			(unsigned long)(&__init_begin),
+			(unsigned long)(&__init_end));
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", __pa(start), __pa(end));
+	free_init_pages("initrd memory", start, end);
 }
 #endif
 
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index a8bb33c1a8f..c3a55470367 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -189,21 +189,21 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
 	memcpy(control_page, relocate_kernel, PAGE_SIZE);
 
-	page_list[PA_CONTROL_PAGE] = __pa(control_page);
+	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
 	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
-	page_list[PA_PGD] = __pa_symbol(&kexec_pgd);
+	page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
-	page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0);
+	page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
 	page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
-	page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0);
+	page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
 	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-	page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0);
+	page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-	page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1);
+	page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
 	page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
-	page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1);
+	page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
 	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-	page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1);
+	page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
 	page_list[PA_TABLE_PAGE] =
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index db30b5bcef6..db51577bda3 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -245,12 +245,11 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.end_code = (unsigned long) &_etext;
 	init_mm.end_data = (unsigned long) &_edata;
 	init_mm.brk = (unsigned long) &_end;
-	init_mm.pgd = __va(__pa_symbol(&init_level4_pgt));
 
-	code_resource.start = __pa_symbol(&_text);
-	code_resource.end = __pa_symbol(&_etext)-1;
-	data_resource.start = __pa_symbol(&_etext);
-	data_resource.end = __pa_symbol(&_edata)-1;
+	code_resource.start = virt_to_phys(&_text);
+	code_resource.end = virt_to_phys(&_etext)-1;
+	data_resource.start = virt_to_phys(&_etext);
+	data_resource.end = virt_to_phys(&_edata)-1;
 
 	early_identify_cpu(&boot_cpu_data);
 
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 22abae4e9f3..bd1d123947c 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -76,7 +76,7 @@ static inline void leave_mm(int cpu)
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
 	cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
-	load_cr3(init_mm.pgd);
+	load_cr3(swapper_pg_dir);
 }
 
 /*
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 282b0a8f00a..c0822683b91 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -572,13 +572,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
-		struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
-		ClearPageReserved(page);
-		init_page_count(page);
-		memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
+		ClearPageReserved(virt_to_page(addr));
+		init_page_count(virt_to_page(addr));
+		memset((void *)(addr & ~(PAGE_SIZE-1)),
+			POISON_FREE_INITMEM, PAGE_SIZE);
 		if (addr >= __START_KERNEL_map)
 			change_page_attr_addr(addr, 1, __pgprot(0));
-		__free_page(page);
+		free_page(addr);
 		totalram_pages++;
 	}
 	if (addr > __START_KERNEL_map)
@@ -588,26 +588,31 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 void free_initmem(void)
 {
 	free_init_pages("unused kernel memory",
-			__pa_symbol(&__init_begin),
-			__pa_symbol(&__init_end));
+			(unsigned long)(&__init_begin),
+			(unsigned long)(&__init_end));
 }
 
 #ifdef CONFIG_DEBUG_RODATA
 
 void mark_rodata_ro(void)
 {
-	unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size;
+	unsigned long start = (unsigned long)_stext, end;
 
 #ifdef CONFIG_HOTPLUG_CPU
 	/* It must still be possible to apply SMP alternatives. */
 	if (num_possible_cpus() > 1)
-		start = PFN_ALIGN(__va(__pa_symbol(&_etext)));
+		start = (unsigned long)_etext;
 #endif
-	size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start;
-	change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO);
+	end = (unsigned long)__end_rodata;
+	start = (start + PAGE_SIZE - 1) & PAGE_MASK;
+	end &= PAGE_MASK;
+	if (end <= start)
+		return;
+
+	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
-	       size >> 10);
+	       (end - start) >> 10);
 
 	/*
 	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
@@ -622,7 +627,7 @@ void mark_rodata_ro(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", __pa(start), __pa(end));
+	free_init_pages("initrd memory", start, end);
 }
 #endif
 
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index c6e5e8d401a..6cac90aa503 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -13,12 +13,21 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/io.h>
+
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 #include <asm/proto.h>
 
+unsigned long __phys_addr(unsigned long x)
+{
+	if (x >= __START_KERNEL_map)
+		return x - __START_KERNEL_map + phys_base;
+	return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+
 #define ISA_START_ADDRESS      0xa0000
 #define ISA_END_ADDRESS                0x100000
 
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index bf4aa8dd425..d653d0bf3df 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -51,6 +51,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	SetPagePrivate(base);
 	page_private(base) = 0;
 
+	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK; 
 	pbase = (pte_t *)page_address(base);
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
@@ -100,12 +101,13 @@ static inline void save_page(struct page *fpage)
  * No more special protections in this 2/4MB area - revert to a
  * large page again. 
  */
-static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot)
+static void revert_page(unsigned long address, pgprot_t ref_prot)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t large_pte;
+	unsigned long pfn;
 
 	pgd = pgd_offset_k(address);
 	BUG_ON(pgd_none(*pgd));
@@ -113,6 +115,7 @@ static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_p
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, address);
 	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
+	pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
 	large_pte = pfn_pte(pfn, ref_prot);
 	large_pte = pte_mkhuge(large_pte);
 	set_pte((pte_t *)pmd, large_pte);
@@ -138,8 +141,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  			 */
 			struct page *split;
 			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-			split = split_large_page(pfn << PAGE_SHIFT, prot,
-							ref_prot2);
+			split = split_large_page(address, prot, ref_prot2);
 			if (!split)
 				return -ENOMEM;
 			set_pte(kpte, mk_pte(split, ref_prot2));
@@ -158,7 +160,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 
 	if (page_private(kpte_page) == 0) {
 		save_page(kpte_page);
-		revert_page(address, pfn, ref_prot);
+		revert_page(address, ref_prot);
  	}
 	return 0;
 } 
@@ -178,7 +180,6 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  */
 int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
 {
-	unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT;
 	int err = 0, kernel_map = 0;
 	int i; 
 
@@ -199,11 +200,10 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
 		}
 		/* Handle kernel mapping too which aliases part of the
 		 * lowmem */
-		if ((pfn >= phys_base_pfn) &&
-			((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) {
+		if (__pa(address) < KERNEL_TEXT_SIZE) {
 			unsigned long addr2;
 			pgprot_t prot2;
-			addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT);
+			addr2 = __START_KERNEL_map + __pa(address);
 			/* Make sure the kernel mappings stay executable */
 			prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
 			err = __change_page_attr(addr2, pfn, prot2,
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index b17fc16ec2e..4d04e247956 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -94,26 +94,22 @@ extern unsigned long phys_base;
 
 #define KERNEL_TEXT_SIZE  (40*1024*1024)
 #define KERNEL_TEXT_START 0xffffffff80000000
+#define PAGE_OFFSET		__PAGE_OFFSET
 
 #ifndef __ASSEMBLY__
 
 #include <asm/bug.h>
 
-#endif /* __ASSEMBLY__ */
+extern unsigned long __phys_addr(unsigned long);
 
-#define PAGE_OFFSET		__PAGE_OFFSET
+#endif /* __ASSEMBLY__ */
 
-/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
-   Otherwise you risk miscompilation. */
-#define __pa(x)			((unsigned long)(x) - PAGE_OFFSET)
-/* __pa_symbol should be used for C visible symbols.
-   This seems to be the official gcc blessed way to do such arithmetic. */ 
-#define __pa_symbol(x)		\
-	({unsigned long v;  \
-	  asm("" : "=r" (v) : "0" (x)); \
-	  ((v - __START_KERNEL_map) + phys_base); })
+#define __pa(x)		__phys_addr((unsigned long)(x))
+#define __pa_symbol(x)	__phys_addr((unsigned long)(x))
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define __boot_va(x)		__va(x)
+#define __boot_pa(x)		__pa(x)
 #ifdef CONFIG_FLATMEM
 #define pfn_valid(pfn)		((pfn) < end_pfn)
 #endif
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 599993f6ba8..da3390faaea 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -19,7 +19,7 @@ extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
 extern unsigned long __supported_pte_mask;
 
-#define swapper_pg_dir ((pgd_t *)NULL)
+#define swapper_pg_dir init_level4_pgt
 
 extern void paging_init(void);
 extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
@@ -29,7 +29,7 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
  * for zero-mapped memory areas etc..
  */
 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) (pfn_to_page(__pa_symbol(&empty_zero_page) >> PAGE_SHIFT))
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
 #endif /* !__ASSEMBLY__ */
 
-- 
cgit v1.2.3-70-g09d2