From 07eee78ea8ba2d0b7b20551c35a3e7dd158d50bb Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 30 Mar 2005 13:17:04 -0800 Subject: [PATCH] AGP fix for Xen VMM When Linux is running on the Xen virtual machine monitor, physical addresses are virtualised and cannot be directly referenced by the AGP GART. This patch fixes the GART driver for Xen by adding a layer of abstraction between physical addresses and 'GART addresses'. Architecture-specific functions are also defined for allocating and freeing the GATT. Xen requires this to ensure that the table really is contiguous from the point of view of the GART. These extra interface functions are defined as 'no-ops' for all existing architectures that use the GART driver. Signed-off-by: Keir Fraser Signed-off-by: Andrew Morton Signed-off-by: Dave Jones --- include/asm-x86_64/agp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/asm-x86_64') diff --git a/include/asm-x86_64/agp.h b/include/asm-x86_64/agp.h index 0bb9019d58a..06c52ee9c06 100644 --- a/include/asm-x86_64/agp.h +++ b/include/asm-x86_64/agp.h @@ -19,4 +19,14 @@ int unmap_page_from_agp(struct page *page); worth it. Would need a page for it. */ #define flush_agp_cache() asm volatile("wbinvd":::"memory") +/* Convert a physical address to an address suitable for the GART. */ +#define phys_to_gart(x) (x) +#define gart_to_phys(x) (x) + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) \ + ((char *)__get_free_pages(GFP_KERNEL, (order))) +#define free_gatt_pages(table, order) \ + free_pages((unsigned long)(table), (order)) + #endif -- cgit v1.2.3-70-g09d2 From 84929801e14d968caeb84795bfbb88f04283fbd9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 21 Jun 2005 17:14:32 -0700 Subject: [PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes The appended patch will set up compatibility mode TASK_SIZE properly. This will fix at least three known bugs that can be encountered while running compatibility mode apps. a) A malicious 32bit app can have an elf section at 0xffffe000. During exec of this app, we will have a memory leak, as the return value of insert_vm_struct() is not checked in syscall32_setup_pages() and thus the vma allocated for the vsyscall page is not freed. And instead of the exec failing (as it has addresses > TASK_SIZE), we were previously allowing it to succeed. b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area may return addresses beyond 32bits, ultimately causing corruption because of wrap-around and resulting in a SEGFAULT, instead of returning ENOMEM. c) A 32bit app doing the mmap below will now fail.
mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE|MAP_ANON, 0, 0); Signed-off-by: Zou Nan hai Signed-off-by: Suresh Siddha Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/ia32/ia32_binfmt.c | 5 +---- arch/x86_64/kernel/process.c | 4 ++-- arch/x86_64/kernel/ptrace.c | 17 ++++++++++------- arch/x86_64/kernel/sys_x86_64.c | 14 ++++---------- arch/x86_64/mm/fault.c | 2 +- include/asm-x86_64/a.out.h | 2 +- include/asm-x86_64/processor.h | 11 ++++++----- 7 files changed, 25 insertions(+), 30 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 99b522052d1..c8131f342cf 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -46,7 +46,7 @@ struct elf_phdr; #define IA32_EMULATOR 1 -#define ELF_ET_DYN_BASE (TASK_UNMAPPED_32 + 0x1000000) +#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) #undef ELF_ARCH #define ELF_ARCH EM_386 @@ -307,9 +307,6 @@ MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); #define elf_addr_t __u32 -#undef TASK_SIZE -#define TASK_SIZE 0xffffffff - static void elf32_init(struct pt_regs *); #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 761b6d35e33..dce8bab4306 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -656,7 +656,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) switch (code) { case ARCH_SET_GS: - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); /* handle small bases via the GDT because that's faster to @@ -682,7 +682,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); /* handle small bases via the GDT because that's faster to diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 525f6a128a2..bbf64b59a21 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -257,12 +257,12 @@ static int putreg(struct task_struct *child, value &= 0xffff; return 0; case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE) + if (value >= TASK_SIZE_OF(child)) return -EIO; child->thread.fs = value; return 0; case offsetof(struct user_regs_struct,gs_base): - if (value >= TASK_SIZE) + if (value >= TASK_SIZE_OF(child)) return -EIO; child->thread.gs = value; return 0; @@ -279,7 +279,7 @@ static int putreg(struct task_struct *child, break; case offsetof(struct user_regs_struct, rip): /* Check if the new RIP address is canonical */ - if (value >= TASK_SIZE) + if (value >= TASK_SIZE_OF(child)) return -EIO; break; } @@ -419,6 +419,8 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + { + int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 
3 : 7; ret = -EIO; if ((addr & 7) || addr > sizeof(struct user) - 7) @@ -430,22 +432,22 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; /* Disallows to set a breakpoint into the vsyscall */ case offsetof(struct user, u_debugreg[0]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg0 = data; ret = 0; break; case offsetof(struct user, u_debugreg[1]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg1 = data; ret = 0; break; case offsetof(struct user, u_debugreg[2]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg2 = data; ret = 0; break; case offsetof(struct user, u_debugreg[3]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg3 = data; ret = 0; break; @@ -469,6 +471,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; } break; + } case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ case PTRACE_CONT: /* restart after signal. */ diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c index dbebd5ccba6..d9798dd433f 100644 --- a/arch/x86_64/kernel/sys_x86_64.c +++ b/arch/x86_64/kernel/sys_x86_64.c @@ -68,13 +68,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - *begin = TASK_UNMAPPED_32; - *end = IA32_PAGE_OFFSET; - } else -#endif - if (flags & MAP_32BIT) { + if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -84,10 +78,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin, of playground for now. -AK */ *begin = 0x40000000; *end = 0x80000000; - } else { - *begin = TASK_UNMAPPED_64; + } else { + *begin = TASK_UNMAPPED_BASE; *end = TASK_SIZE; - } + } } unsigned long diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 5d6b2114cc9..57d3ab15a5c 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -350,7 +350,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 1) == 0. */ - if (unlikely(address >= TASK_SIZE)) { + if (unlikely(address >= TASK_SIZE64)) { if (!(error_code & 5) && ((address >= VMALLOC_START && address < VMALLOC_END) || (address >= MODULES_VADDR && address < MODULES_END))) { diff --git a/include/asm-x86_64/a.out.h b/include/asm-x86_64/a.out.h index 5952914f412..7255cde0653 100644 --- a/include/asm-x86_64/a.out.h +++ b/include/asm-x86_64/a.out.h @@ -21,7 +21,7 @@ struct exec #ifdef __KERNEL__ #include -#define STACK_TOP (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE) +#define STACK_TOP TASK_SIZE #endif #endif /* __A_OUT_GNU_H__ */ diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index d641b19f6da..8b55f139968 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -160,16 +160,17 @@ static inline void clear_in_cr4 (unsigned long mask) /* * User space process size. 47bits minus one guard page. 
*/ -#define TASK_SIZE (0x800000000000UL - 4096) +#define TASK_SIZE64 (0x800000000000UL - 4096) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) -#define TASK_UNMAPPED_32 PAGE_ALIGN(IA32_PAGE_OFFSET/3) -#define TASK_UNMAPPED_64 PAGE_ALIGN(TASK_SIZE/3) -#define TASK_UNMAPPED_BASE \ - (test_thread_flag(TIF_IA32) ? TASK_UNMAPPED_32 : TASK_UNMAPPED_64) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) + +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3) /* * Size of io_bitmap. -- cgit v1.2.3-70-g09d2 From 39c715b71740c4a78ba4769fb54826929bac03cb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 21 Jun 2005 17:14:34 -0700 Subject: [PATCH] smp_processor_id() cleanup This patch implements a number of smp_processor_id() cleanup ideas that Arjan van de Ven and I came up with. The previous __smp_processor_id/_smp_processor_id/smp_processor_id API spaghetti was hard to follow both on the implementational and on the usage side. Some of the complexity arose from picking wrong names, some of the complexity comes from the fact that not all architectures defined __smp_processor_id. In the new code, there are two externally visible symbols: - smp_processor_id(): debug variant. - raw_smp_processor_id(): nondebug variant. Replaces all existing uses of _smp_processor_id() and __smp_processor_id(). Defined by every SMP architecture in include/asm-*/smp.h. There is one new internal symbol, dependent on DEBUG_PREEMPT: - debug_smp_processor_id(): internal debug variant, mapped to smp_processor_id(). Also, i moved debug_smp_processor_id() from lib/kernel_lock.c into a new lib/smp_processor_id.c file. All related comments got updated and/or clarified. I have build/boot tested the following 8 .config combinations on x86: {SMP,UP} x {PREEMPT,!PREEMPT} x {DEBUG_PREEMPT,!DEBUG_PREEMPT} I have also build/boot tested x64 on UP/PREEMPT/DEBUG_PREEMPT. (Other architectures are untested, but should work just fine.) 
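(Illustrative sketch, not from the original patch.) Roughly how the two remaining variants are meant to be used after this cleanup; get_cpu()/put_cpu(), smp_processor_id() and raw_smp_processor_id() come from the patch below, while the per-CPU counter and the demo function are hypothetical:

	#include <linux/kernel.h>
	#include <linux/percpu.h>
	#include <linux/smp.h>

	static DEFINE_PER_CPU(unsigned long, demo_hits);	/* hypothetical counter */

	static void demo_cpu_use(void)
	{
		int cpu;

		/* Preferred: keep preemption off while the CPU number is in use. */
		cpu = get_cpu();	/* preempt_disable() + smp_processor_id() */
		per_cpu(demo_hits, cpu)++;
		put_cpu();

		/*
		 * raw_smp_processor_id() skips the DEBUG_PREEMPT check; use it
		 * only where a stale CPU number is harmless (e.g. statistics),
		 * since this code may be preempted and migrated at any point.
		 */
		printk(KERN_DEBUG "roughly on CPU %d\n", raw_smp_processor_id());
	}

With CONFIG_DEBUG_PREEMPT enabled, calling smp_processor_id() directly in the second half would trigger the new debug_smp_processor_id() warning, which is exactly the class of bug this split is meant to catch.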
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/traps.c | 2 +- arch/i386/lib/delay.c | 2 +- arch/ppc/lib/locks.c | 4 +-- arch/ppc64/kernel/idle.c | 2 +- arch/sh/lib/delay.c | 2 +- arch/sparc64/lib/delay.c | 2 +- arch/x86_64/lib/delay.c | 2 +- drivers/acpi/processor_idle.c | 2 +- drivers/input/gameport/gameport.c | 2 +- drivers/oprofile/buffer_sync.c | 4 +-- fs/xfs/linux-2.6/xfs_linux.h | 6 ++--- include/asm-alpha/smp.h | 2 +- include/asm-arm/smp.h | 2 +- include/asm-i386/smp.h | 2 +- include/asm-ia64/smp.h | 2 +- include/asm-m32r/smp.h | 2 +- include/asm-mips/smp.h | 2 +- include/asm-parisc/smp.h | 2 +- include/asm-ppc/smp.h | 2 +- include/asm-ppc64/smp.h | 2 +- include/asm-s390/smp.h | 2 +- include/asm-sh/smp.h | 2 +- include/asm-sparc/smp.h | 2 +- include/asm-sparc64/smp.h | 2 +- include/asm-um/smp.h | 3 ++- include/asm-x86_64/smp.h | 2 +- include/linux/mmzone.h | 2 +- include/linux/smp.h | 40 ++++++++++++---------------- include/net/route.h | 2 +- include/net/snmp.h | 14 +++++----- kernel/module.c | 2 +- kernel/power/smp.c | 4 +-- kernel/sched.c | 4 +-- kernel/stop_machine.c | 4 +-- lib/Makefile | 1 + lib/kernel_lock.c | 55 --------------------------------------- lib/smp_processor_id.c | 55 +++++++++++++++++++++++++++++++++++++++ 37 files changed, 119 insertions(+), 125 deletions(-) create mode 100644 lib/smp_processor_id.c (limited to 'include/asm-x86_64') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 00c63419c06..83c579e82a8 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -306,7 +306,7 @@ void die(const char * str, struct pt_regs * regs, long err) }; static int die_counter; - if (die.lock_owner != _smp_processor_id()) { + if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); spin_lock_irq(&die.lock); die.lock_owner = smp_processor_id(); diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c index 080639f262b..eb0cdfe9280 100644 --- a/arch/i386/lib/delay.c +++ b/arch/i386/lib/delay.c @@ -34,7 +34,7 @@ inline void __const_udelay(unsigned long xloops) xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4))); + :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); __delay(++xloops); } diff --git a/arch/ppc/lib/locks.c b/arch/ppc/lib/locks.c index 694163d696d..c450dc4b766 100644 --- a/arch/ppc/lib/locks.c +++ b/arch/ppc/lib/locks.c @@ -130,7 +130,7 @@ void _raw_read_lock(rwlock_t *rw) while (!read_can_lock(rw)) { if (--stuck == 0) { printk("_read_lock(%p) CPU#%d lock %d\n", - rw, _smp_processor_id(), rw->lock); + rw, raw_smp_processor_id(), rw->lock); stuck = INIT_STUCK; } } @@ -158,7 +158,7 @@ void _raw_write_lock(rwlock_t *rw) while (!write_can_lock(rw)) { if (--stuck == 0) { printk("write_lock(%p) CPU#%d lock %d)\n", - rw, _smp_processor_id(), rw->lock); + rw, raw_smp_processor_id(), rw->lock); stuck = INIT_STUCK; } } diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c index f24ce2b8720..ff8a7db142d 100644 --- a/arch/ppc64/kernel/idle.c +++ b/arch/ppc64/kernel/idle.c @@ -292,7 +292,7 @@ static int native_idle(void) if (need_resched()) schedule(); - if (cpu_is_offline(_smp_processor_id()) && + if (cpu_is_offline(raw_smp_processor_id()) && system_state == SYSTEM_RUNNING) cpu_die(); } diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c index 50b36037d86..351714694d6 100644 --- a/arch/sh/lib/delay.c +++ 
b/arch/sh/lib/delay.c @@ -24,7 +24,7 @@ inline void __const_udelay(unsigned long xloops) __asm__("dmulu.l %0, %2\n\t" "sts mach, %0" : "=r" (xloops) - : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) + : "0" (xloops), "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy) : "macl", "mach"); __delay(xloops * HZ); } diff --git a/arch/sparc64/lib/delay.c b/arch/sparc64/lib/delay.c index f6b4c784d53..e8808727617 100644 --- a/arch/sparc64/lib/delay.c +++ b/arch/sparc64/lib/delay.c @@ -31,7 +31,7 @@ void __const_udelay(unsigned long n) { n *= 4; - n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4)); + n *= (cpu_data(raw_smp_processor_id()).udelay_val * (HZ/4)); n >>= 32; __delay(n + 1); diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 6e2d66472eb..aed61a668a1 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -34,7 +34,7 @@ void __delay(unsigned long loops) inline void __const_udelay(unsigned long xloops) { - __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); + __delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); } void __udelay(unsigned long usecs) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index ff64d333e95..c9d671cf785 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -171,7 +171,7 @@ static void acpi_processor_idle (void) int sleep_ticks = 0; u32 t1, t2 = 0; - pr = processors[_smp_processor_id()]; + pr = processors[raw_smp_processor_id()]; if (!pr) return; diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index 9b8ff396e6f..e152d0fa0cd 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -134,7 +134,7 @@ static int gameport_measure_speed(struct gameport *gameport) } gameport_close(gameport); - return (cpu_data[_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx); + return (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 
1 : tx); #else diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 55720dc6ec4..745a1418363 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -62,7 +62,7 @@ static int task_exit_notify(struct notifier_block * self, unsigned long val, voi /* To avoid latency problems, we only process the current CPU, * hoping that most samples for the task are on this CPU */ - sync_buffer(_smp_processor_id()); + sync_buffer(raw_smp_processor_id()); return 0; } @@ -86,7 +86,7 @@ static int munmap_notify(struct notifier_block * self, unsigned long val, void * /* To avoid latency problems, we only process the current CPU, * hoping that most samples for the task are on this CPU */ - sync_buffer(_smp_processor_id()); + sync_buffer(raw_smp_processor_id()); return 0; } diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 71bb41019a1..7d7c8788ea7 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -145,10 +145,10 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh) #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val -#ifndef __smp_processor_id -#define __smp_processor_id() smp_processor_id() +#ifndef raw_smp_processor_id +#define raw_smp_processor_id() smp_processor_id() #endif -#define current_cpu() __smp_processor_id() +#define current_cpu() raw_smp_processor_id() #define current_pid() (current->pid) #define current_fsuid(cred) (current->fsuid) #define current_fsgid(cred) (current->fsgid) diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h index cbc173ae45a..9950706abdf 100644 --- a/include/asm-alpha/smp.h +++ b/include/asm-alpha/smp.h @@ -43,7 +43,7 @@ extern struct cpuinfo_alpha cpu_data[NR_CPUS]; #define PROC_CHANGE_PENALTY 20 #define hard_smp_processor_id() __hard_smp_processor_id() -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_present_mask; extern cpumask_t cpu_online_map; diff --git a/include/asm-arm/smp.h b/include/asm-arm/smp.h index bd44f894690..6c6c60adbba 100644 --- a/include/asm-arm/smp.h +++ b/include/asm-arm/smp.h @@ -21,7 +21,7 @@ # error " included in non-SMP build" #endif -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_present_mask; #define cpu_possible_map cpu_present_mask diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index e03a206dfa3..55ef31f66bb 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -51,7 +51,7 @@ extern u8 x86_cpu_to_apicid[]; * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. 
*/ -#define __smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callin_map; diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h index 3ba1a061e4a..a3914352c99 100644 --- a/include/asm-ia64/smp.h +++ b/include/asm-ia64/smp.h @@ -46,7 +46,7 @@ ia64_get_lid (void) #define SMP_IRQ_REDIRECTION (1 << 0) #define SMP_IPI_REDIRECTION (1 << 1) -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) extern struct smp_boot_data { int cpu_count; diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h index 8cd4d0da4be..b9a20cdad65 100644 --- a/include/asm-m32r/smp.h +++ b/include/asm-m32r/smp.h @@ -66,7 +66,7 @@ extern volatile int cpu_2_physid[NR_CPUS]; #define physid_to_cpu(physid) physid_2_cpu[physid] #define cpu_to_physid(cpu_id) cpu_2_physid[cpu_id] -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_callout_map; #define cpu_possible_map cpu_callout_map diff --git a/include/asm-mips/smp.h b/include/asm-mips/smp.h index 8ba370ecfd4..5618f1e12f4 100644 --- a/include/asm-mips/smp.h +++ b/include/asm-mips/smp.h @@ -21,7 +21,7 @@ #include #include -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) /* Map from cpu id to sequential logical cpu number. This will only not be idempotent when cpus failed to come on-line. */ diff --git a/include/asm-parisc/smp.h b/include/asm-parisc/smp.h index fde77ac3546..9413f67a540 100644 --- a/include/asm-parisc/smp.h +++ b/include/asm-parisc/smp.h @@ -51,7 +51,7 @@ extern void smp_send_reschedule(int cpu); extern unsigned long cpu_present_mask; -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) #endif /* CONFIG_SMP */ diff --git a/include/asm-ppc/smp.h b/include/asm-ppc/smp.h index ebfb614f55f..17530c232c7 100644 --- a/include/asm-ppc/smp.h +++ b/include/asm-ppc/smp.h @@ -44,7 +44,7 @@ extern void smp_message_recv(int, struct pt_regs *); #define NO_PROC_ID 0xFF /* No processor magic marker */ #define PROC_CHANGE_PENALTY 20 -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) extern int __cpu_up(unsigned int cpu); diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h index c8646fa999c..8115ecb8fee 100644 --- a/include/asm-ppc64/smp.h +++ b/include/asm-ppc64/smp.h @@ -45,7 +45,7 @@ void generic_cpu_die(unsigned int cpu); void generic_mach_cpu_die(void); #endif -#define __smp_processor_id() (get_paca()->paca_index) +#define raw_smp_processor_id() (get_paca()->paca_index) #define hard_smp_processor_id() (get_paca()->hw_cpu_id) extern cpumask_t cpu_sibling_map[NR_CPUS]; diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h index 9473786387a..dd50e57a928 100644 --- a/include/asm-s390/smp.h +++ b/include/asm-s390/smp.h @@ -47,7 +47,7 @@ extern int smp_call_function_on(void (*func) (void *info), void *info, #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */ -#define smp_processor_id() (S390_lowcore.cpu_data.cpu_nr) +#define raw_smp_processor_id() (S390_lowcore.cpu_data.cpu_nr) extern int smp_get_cpu(cpumask_t cpu_map); extern void smp_put_cpu(int cpu); diff --git a/include/asm-sh/smp.h b/include/asm-sh/smp.h index 38b54469d7d..f19a8b3b69a 100644 --- a/include/asm-sh/smp.h +++ 
b/include/asm-sh/smp.h @@ -25,7 +25,7 @@ extern cpumask_t cpu_possible_map; #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) /* I've no idea what the real meaning of this is */ #define PROC_CHANGE_PENALTY 20 diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h index f986c0d0922..4f96d8333a1 100644 --- a/include/asm-sparc/smp.h +++ b/include/asm-sparc/smp.h @@ -148,7 +148,7 @@ extern __inline__ int hard_smp_processor_id(void) } #endif -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) #define prof_multiplier(__cpu) cpu_data(__cpu).multiplier #define prof_counter(__cpu) cpu_data(__cpu).counter diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h index 5e3e06d908f..110a2de8912 100644 --- a/include/asm-sparc64/smp.h +++ b/include/asm-sparc64/smp.h @@ -64,7 +64,7 @@ static __inline__ int hard_smp_processor_id(void) } } -#define smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (current_thread_info()->cpu) #endif /* !(__ASSEMBLY__) */ diff --git a/include/asm-um/smp.h b/include/asm-um/smp.h index 4412d5d9c26..d879eba2b52 100644 --- a/include/asm-um/smp.h +++ b/include/asm-um/smp.h @@ -8,7 +8,8 @@ #include "asm/current.h" #include "linux/cpumask.h" -#define smp_processor_id() (current_thread->cpu) +#define raw_smp_processor_id() (current_thread->cpu) + #define cpu_logical_map(n) (n) #define cpu_number_map(n) (n) #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */ diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 96844fecbde..a7425aa5a3b 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -68,7 +68,7 @@ static inline int num_booting_cpus(void) return cpus_weight(cpu_callout_map); } -#define __smp_processor_id() read_pda(cpunumber) +#define raw_smp_processor_id() read_pda(cpunumber) extern __inline int hard_smp_processor_id(void) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e530c6c092f..beacd931b60 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -381,7 +381,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, #include /* Returns the number of the current Node. */ -#define numa_node_id() (cpu_to_node(_smp_processor_id())) +#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/smp.h b/include/linux/smp.h index dcf1db3b35d..9dfa3ee769a 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -92,10 +92,7 @@ void smp_prepare_boot_cpu(void); /* * These macros fold the SMP functionality into a single CPU system */ - -#if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT) -# define smp_processor_id() 0 -#endif +#define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) @@ -106,30 +103,25 @@ static inline void smp_send_reschedule(int cpu) { } #endif /* !SMP */ /* - * DEBUG_PREEMPT support: check whether smp_processor_id() is being - * used in a preemption-safe way. + * smp_processor_id(): get the current CPU ID. * - * An architecture has to enable this debugging code explicitly. - * It can do so by renaming the smp_processor_id() macro to - * __smp_processor_id(). 
This should only be done after some minimal - * testing, because usually there are a number of false positives - * that an architecture will trigger. + * if DEBUG_PREEMPT is enabled the we check whether it is + * used in a preemption-safe way. (smp_processor_id() is safe + * if it's used in a preemption-off critical section, or in + * a thread that is bound to the current CPU.) * - * To fix a false positive (i.e. smp_processor_id() use that the - * debugging code reports but which use for some reason is legal), - * change the smp_processor_id() reference to _smp_processor_id(), - * which is the nondebug variant. NOTE: don't use this to hack around - * real bugs. + * NOTE: raw_smp_processor_id() is for internal use only + * (smp_processor_id() is the preferred variant), but in rare + * instances it might also be used to turn off false positives + * (i.e. smp_processor_id() use that the debugging code reports but + * which use for some reason is legal). Don't use this to hack around + * the warning message, as your code might not work under PREEMPT. */ -#ifdef __smp_processor_id -# if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) - extern unsigned int smp_processor_id(void); -# else -# define smp_processor_id() __smp_processor_id() -# endif -# define _smp_processor_id() __smp_processor_id() +#ifdef CONFIG_DEBUG_PREEMPT + extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() #else -# define _smp_processor_id() smp_processor_id() +# define smp_processor_id() raw_smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) diff --git a/include/net/route.h b/include/net/route.h index d34ca8fc675..c3cd069a9ac 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -107,7 +107,7 @@ struct rt_cache_stat extern struct rt_cache_stat *rt_cache_stat; #define RT_CACHE_STAT_INC(field) \ - (per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++) + (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) extern struct ip_rt_acct *ip_rt_acct; diff --git a/include/net/snmp.h b/include/net/snmp.h index a15ab256276..a36bed8ea21 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -128,18 +128,18 @@ struct linux_mib { #define SNMP_STAT_USRPTR(name) (name[1]) #define SNMP_INC_STATS_BH(mib, field) \ - (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field]++) #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \ - (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++) + (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field + (offset)]++) #define SNMP_INC_STATS_USER(mib, field) \ - (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field]++) #define SNMP_INC_STATS(mib, field) \ - (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]++) #define SNMP_DEC_STATS(mib, field) \ - (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--) + (per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]--) #define SNMP_ADD_STATS_BH(mib, field, addend) \ - (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend) + (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field] += addend) #define SNMP_ADD_STATS_USER(mib, field, addend) \ - (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend) + (per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field] += 
addend) #endif diff --git a/kernel/module.c b/kernel/module.c index 83b3d376708..a566745dde6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod) for (i = 0; i < NR_CPUS; i++) local_set(&mod->ref[i].count, 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[_smp_processor_id()].count, 1); + local_set(&mod->ref[raw_smp_processor_id()].count, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } diff --git a/kernel/power/smp.c b/kernel/power/smp.c index cba3584b80f..457c2302ed4 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -48,11 +48,11 @@ void disable_nonboot_cpus(void) { oldmask = current->cpus_allowed; set_cpus_allowed(current, cpumask_of_cpu(0)); - printk("Freezing CPUs (at %d)", _smp_processor_id()); + printk("Freezing CPUs (at %d)", raw_smp_processor_id()); current->state = TASK_INTERRUPTIBLE; schedule_timeout(HZ); printk("..."); - BUG_ON(_smp_processor_id() != 0); + BUG_ON(raw_smp_processor_id() != 0); /* FIXME: for this to work, all the CPUs must be running * "idle" thread (or we deadlock). Is that guaranteed? */ diff --git a/kernel/sched.c b/kernel/sched.c index f12a0c8a7d9..deca041fc36 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3814,7 +3814,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); atomic_inc(&rq->nr_iowait); schedule(); @@ -3825,7 +3825,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); long ret; atomic_inc(&rq->nr_iowait); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6116b25aa7c..84a9d18aa8d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - if (i == _smp_processor_id()) + if (i == raw_smp_processor_id()) continue; ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); if (ret < 0) @@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, /* If they don't care which CPU fn runs on, bind to any online one. */ if (cpu == NR_CPUS) - cpu = _smp_processor_id(); + cpu = raw_smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { diff --git a/lib/Makefile b/lib/Makefile index 9eccea9429a..5f10cb89840 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -20,6 +20,7 @@ lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o ifneq ($(CONFIG_HAVE_DEC_LOCK),y) lib-y += dec_and_lock.o diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index 99b0ae3d51d..bd2bc5d887b 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c @@ -9,61 +9,6 @@ #include #include -#if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) && \ - defined(CONFIG_DEBUG_PREEMPT) - -/* - * Debugging check. 
- */ -unsigned int smp_processor_id(void) -{ - unsigned long preempt_count = preempt_count(); - int this_cpu = __smp_processor_id(); - cpumask_t this_mask; - - if (likely(preempt_count)) - goto out; - - if (irqs_disabled()) - goto out; - - /* - * Kernel threads bound to a single CPU can safely use - * smp_processor_id(): - */ - this_mask = cpumask_of_cpu(this_cpu); - - if (cpus_equal(current->cpus_allowed, this_mask)) - goto out; - - /* - * It is valid to assume CPU-locality during early bootup: - */ - if (system_state != SYSTEM_RUNNING) - goto out; - - /* - * Avoid recursion: - */ - preempt_disable(); - - if (!printk_ratelimit()) - goto out_enable; - - printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid); - print_symbol("caller is %s\n", (long)__builtin_return_address(0)); - dump_stack(); - -out_enable: - preempt_enable_no_resched(); -out: - return this_cpu; -} - -EXPORT_SYMBOL(smp_processor_id); - -#endif /* PREEMPT && __smp_processor_id && DEBUG_PREEMPT */ - #ifdef CONFIG_PREEMPT_BKL /* * The 'big kernel semaphore' diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c new file mode 100644 index 00000000000..42c08ef828c --- /dev/null +++ b/lib/smp_processor_id.c @@ -0,0 +1,55 @@ +/* + * lib/smp_processor_id.c + * + * DEBUG_PREEMPT variant of smp_processor_id(). + */ +#include +#include + +unsigned int debug_smp_processor_id(void) +{ + unsigned long preempt_count = preempt_count(); + int this_cpu = raw_smp_processor_id(); + cpumask_t this_mask; + + if (likely(preempt_count)) + goto out; + + if (irqs_disabled()) + goto out; + + /* + * Kernel threads bound to a single CPU can safely use + * smp_processor_id(): + */ + this_mask = cpumask_of_cpu(this_cpu); + + if (cpus_equal(current->cpus_allowed, this_mask)) + goto out; + + /* + * It is valid to assume CPU-locality during early bootup: + */ + if (system_state != SYSTEM_RUNNING) + goto out; + + /* + * Avoid recursion: + */ + preempt_disable(); + + if (!printk_ratelimit()) + goto out_enable; + + printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid); + print_symbol("caller is %s\n", (long)__builtin_return_address(0)); + dump_stack(); + +out_enable: + preempt_enable_no_resched(); +out: + return this_cpu; +} + +EXPORT_SYMBOL(debug_smp_processor_id); + -- cgit v1.2.3-70-g09d2 From 63551ae0feaaa23807ebea60de1901564bbef32e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 21 Jun 2005 17:14:44 -0700 Subject: [PATCH] Hugepage consolidation A lot of the code in arch/*/mm/hugetlbpage.c is quite similar. This patch attempts to consolidate a lot of the code across the arch's, putting the combined version in mm/hugetlb.c. There are a couple of uglyish hacks in order to covert all the hugepage archs, but the result is a very large reduction in the total amount of code. It also means things like hugepage lazy allocation could be implemented in one place, instead of six. Tested, at least a little, on ppc64, i386 and x86_64. Notes: - this patch changes the meaning of set_huge_pte() to be more analagous to set_pte() - does SH4 need s special huge_ptep_get_and_clear()?? 
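(Illustrative summary, not from the original patch text.) Pieced together from the diffs below, the per-architecture hooks that the consolidated code in mm/hugetlb.c relies on look roughly like this; the exact set each architecture provides, and where the declarations end up (presumably include/linux/hugetlb.h, which the diffstat shows being extended), should be taken from the patch itself:

	/* Page-table walkers: formerly static in each arch, now shared with mm/hugetlb.c. */
	pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
	pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);

	/*
	 * Huge-PTE install/teardown, analogous to set_pte_at()/ptep_get_and_clear().
	 * Architectures that back one hugepage with several normal PTEs
	 * (sh, sparc64) loop over them inside these helpers.
	 */
	void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
			     pte_t *ptep, pte_t entry);
	pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
				      pte_t *ptep);

	/* Arch-specific odds and ends kept out of the generic code. */
	void hugetlb_prefault_arch_hook(struct mm_struct *mm);	/* e.g. sparc64 D-TLB setup */
	void hugetlb_clean_stale_pgtable(pte_t *pte);		/* e.g. i386/x86_64 */

The generic copy/unmap/prefault paths then build the huge PTE themselves and install it through set_huge_pte_at(), which is why the old per-arch set_huge_pte() helpers disappear in the hunks below.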
Acked-by: William Lee Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 170 ++---------------------------------- arch/ia64/mm/hugetlbpage.c | 158 +--------------------------------- arch/ppc64/mm/hugetlbpage.c | 180 +------------------------------------- arch/sh/mm/hugetlbpage.c | 196 ++++++------------------------------------ arch/sh64/mm/hugetlbpage.c | 18 +++- arch/sparc64/mm/hugetlbpage.c | 195 ++++++----------------------------------- include/asm-i386/page.h | 1 + include/asm-i386/pgtable.h | 2 +- include/asm-ia64/pgtable.h | 1 + include/asm-sh/page.h | 1 + include/asm-sh/pgtable.h | 1 + include/asm-sh64/page.h | 1 + include/asm-sh64/pgtable.h | 2 + include/asm-sparc64/page.h | 2 + include/asm-sparc64/pgtable.h | 1 + include/asm-x86_64/page.h | 1 + include/asm-x86_64/pgtable.h | 3 +- include/linux/hugetlb.h | 40 +++++++-- mm/hugetlb.c | 177 +++++++++++++++++++++++++++++++++++++- 19 files changed, 300 insertions(+), 850 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 171fc925e1e..5aa06001a4b 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -18,7 +18,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -30,7 +30,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -42,21 +42,6 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) -{ - pte_t entry; - - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); - if (write_access) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - } else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); - set_pte(page_table, entry); -} - /* * This function checks for proper alignment of input addr and len parameters. 
*/ @@ -69,77 +54,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - set_pte(dst_pte, entry); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; - -nomem: - return -ENOMEM; -} - -int -follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vpfn, vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - vpfn = vaddr/PAGE_SIZE; - while (vaddr < vma->vm_end && remainder) { - - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); - - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - ++vpfn; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) @@ -204,83 +118,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, } #endif -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +void hugetlb_clean_stale_pgtable(pte_t *pte) { - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t pte, *ptep; + pmd_t *pmd = (pmd_t *) pte; struct page *page; - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - ptep = huge_pte_offset(mm, address); - if (!ptep) - continue; - pte = ptep_get_and_clear(mm, address, ptep); - if (pte_none(pte)) - continue; - page = pte_page(pte); - put_page(page); - } - add_mm_counter(mm ,rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! 
ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; + page = pmd_page(*pmd); + pmd_clear(pmd); + dec_page_state(nr_page_table_pages); + page_cache_release(page); } /* x86_64 also uses this file */ diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index df08ae7634b..e0a776a3044 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -24,7 +24,7 @@ unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT; -static pte_t * +pte_t * huge_pte_alloc (struct mm_struct *mm, unsigned long addr) { unsigned long taddr = htlbpage_to_page(addr); @@ -43,7 +43,7 @@ huge_pte_alloc (struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t * +pte_t * huge_pte_offset (struct mm_struct *mm, unsigned long addr) { unsigned long taddr = htlbpage_to_page(addr); @@ -67,23 +67,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } -static void -set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, pte_t * page_table, int write_access) -{ - pte_t entry; - - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); - if (write_access) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - } else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); - set_pte(page_table, entry); - return; -} /* * This function checks for proper alignment of input addr and len parameters. */ @@ -99,68 +82,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - set_pte(dst_pte, entry); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; -nomem: - return -ENOMEM; -} - -int -follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *st, int *length, int i) -{ - pte_t *ptep, pte; - unsigned long start = *st; - unsigned long pstart; - int len = *length; - struct page *page; - - do { - pstart = start & HPAGE_MASK; - ptep = huge_pte_offset(mm, start); - pte = *ptep; - -back1: - page = pte_page(pte); - if (pages) { - page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT); - get_page(page); - pages[i] = page; - } - if (vmas) - vmas[i] = vma; - i++; - len--; - start += PAGE_SIZE; - if (((start & HPAGE_MASK) == pstart) && len && - (start < vma->vm_end)) - goto back1; - } while (len && start < vma->vm_end); - *length = len; - *st = start; - return i; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write) { struct page *page; @@ -212,81 +133,6 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, free_pgd_range(tlb, addr, end, floor, ceiling); } -void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - - BUG_ON(start & 
(HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - pte_clear(mm, address, pte); - } - add_mm_counter(mm, rss, - ((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - page_cache_release(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} - unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index d3bf86a5c1a..b4ab766f598 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -121,7 +121,7 @@ static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr return hugepte_offset(dir, addr); } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pud_t *pud; @@ -134,7 +134,7 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return hugepte_offset(pud, addr); } -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pud_t *pud; @@ -147,25 +147,6 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return hugepte_alloc(mm, pud, addr); } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, struct page *page, - pte_t *ptep, int write_access) -{ - pte_t entry; - - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); - if (write_access) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - } else { - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - } - entry = pte_mkyoung(entry); - entry = pte_mkhuge(entry); - - set_pte_at(mm, addr, ptep, entry); -} - /* * This function checks for proper alignment of input addr and len parameters. 
*/ @@ -259,80 +240,6 @@ int prepare_hugepage_range(unsigned long addr, unsigned long len) return -EINVAL; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int err = -ENOMEM; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto out; - - src_pte = huge_pte_offset(src, addr); - entry = *src_pte; - - ptepage = pte_page(entry); - get_page(ptepage); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - set_pte_at(dst, addr, dst_pte, entry); - - addr += HPAGE_SIZE; - } - - err = 0; - out: - return err; -} - -int -follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vpfn, vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - vpfn = vaddr/PAGE_SIZE; - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); - - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - ++vpfn; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -363,89 +270,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long addr; - pte_t *ptep; - struct page *page; - - WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON((start % HPAGE_SIZE) != 0); - BUG_ON((end % HPAGE_SIZE) != 0); - - for (addr = start; addr < end; addr += HPAGE_SIZE) { - pte_t pte; - - ptep = huge_pte_offset(mm, addr); - if (!ptep || pte_none(*ptep)) - continue; - - pte = *ptep; - page = pte_page(pte); - pte_clear(mm, addr, ptep); - - put_page(page); - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_pending(); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON((vma->vm_start % HPAGE_SIZE) != 0); - BUG_ON((vma->vm_end % HPAGE_SIZE) != 0); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (! pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! 
ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} - /* Because we have an exclusive hugepage region which lies within the * normal user address space, we have to take special measures to make * non-huge mmap()s evade the hugepage reserved regions. */ diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 1f897bab231..95bb1a6c606 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -24,7 +24,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -39,7 +39,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -56,28 +56,34 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, pte_t * page_table, int write_access) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { - unsigned long i; - pte_t entry; + int i; - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + set_pte_at(mm, addr, ptep, entry); + ptep++; + addr += PAGE_SIZE; + pte_val(entry) += PAGE_SIZE; + } +} - if (write_access) - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot))); - else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t entry; + int i; - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(page_table, entry); - page_table++; + entry = *ptep; - pte_val(entry) += PAGE_SIZE; + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + pte_clear(mm, addr, ptep); + addr += PAGE_SIZE; + ptep++; } + + return entry; } /* @@ -92,79 +98,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int i; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - BUG_ON(!src_pte || pte_none(*src_pte)); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(dst_pte, entry); - pte_val(entry) += PAGE_SIZE; - dst_pte++; - } - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; - -nomem: - return -ENOMEM; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; 
- - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - BUG_ON(!pte || pte_none(*pte)); - - page = pte_page(*pte); - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -181,84 +114,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - int i; - - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - BUG_ON(!pte); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - pte_clear(mm, address+(i*PAGE_SIZE), pte); - pte++; - } - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index bcad2aefa4e..dcd9c8a8baf 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -24,7 +24,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -39,7 +39,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -80,6 +80,20 @@ static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, } } +pte_t huge_ptep_get_and_clear(pte_t *ptep) +{ + pte_t entry; + + entry = *ptep; + + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + pte_clear(pte); + pte++; + } + + return entry; +} + /* * This function checks for proper alignment of input addr and len parameters. 
*/ diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index 5a1f831b2de..625cbb336a2 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -22,7 +22,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -41,7 +41,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,30 +62,34 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, - struct page *page, pte_t * page_table, int write_access) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { - unsigned long i; - pte_t entry; + int i; + + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + set_pte_at(mm, addr, ptep, entry); + ptep++; + addr += PAGE_SIZE; + pte_val(entry) += PAGE_SIZE; + } +} - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t entry; + int i; - if (write_access) - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot))); - else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); + entry = *ptep; for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte_at(mm, addr, page_table, entry); - page_table++; + pte_clear(mm, addr, ptep); addr += PAGE_SIZE; - - pte_val(entry) += PAGE_SIZE; + ptep++; } + + return entry; } /* @@ -100,79 +104,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int i; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - BUG_ON(!src_pte || pte_none(*src_pte)); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte_at(dst, addr, dst_pte, entry); - pte_val(entry) += PAGE_SIZE; - dst_pte++; - addr += PAGE_SIZE; - } - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - } - return 0; - -nomem: - return -ENOMEM; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - BUG_ON(!pte || pte_none(*pte)); - - page = pte_page(*pte); - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page *follow_huge_addr(struct 
mm_struct *mm, unsigned long address, int write) { @@ -190,34 +121,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - int i; - - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - BUG_ON(!pte); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - pte_clear(mm, address+(i*PAGE_SIZE), pte); - pte++; - } - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - static void context_reload(void *__data) { struct mm_struct *mm = __data; @@ -226,12 +129,8 @@ static void context_reload(void *__data) load_secondary_context(mm); } -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +void hugetlb_prefault_arch_hook(struct mm_struct *mm) { - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - /* On UltraSPARC-III+ and later, configure the second half of * the Data-TLB for huge pages. */ @@ -261,50 +160,4 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) } spin_unlock(&ctx_alloc_lock); } - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! 
ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; } diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index ed13969fa2d..41400d342d4 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -68,6 +68,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE #endif #define pgd_val(x) ((x).pgd) diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 8d60c2b4b00..e9efe148fdf 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -236,6 +236,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PRESENT | _PAGE_PSE; return pte; } #ifdef CONFIG_X86_PAE # include @@ -275,7 +276,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, */ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -#define mk_pte_huge(entry) ((entry).pte_low |= _PAGE_PRESENT | _PAGE_PSE) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index fcc9c3344ab..48586e08f43 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -283,6 +283,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A)) #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) +#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_P)) /* * Macro to a page protection value as "uncacheable". Note that "protection" is really a diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h index 4c6d129e7d9..180467be8e7 100644 --- a/include/asm-sh/page.h +++ b/include/asm-sh/page.h @@ -31,6 +31,7 @@ #define HPAGE_SIZE (1UL << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE-1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT-PAGE_SHIFT) +#define ARCH_HAS_SETCLEAR_HUGE_PTE #endif #ifdef __KERNEL__ diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h index cd847a47a9a..ecb909572d3 100644 --- a/include/asm-sh/pgtable.h +++ b/include/asm-sh/pgtable.h @@ -196,6 +196,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; } /* * Macro and implementation to make a page protection as uncachable. 
diff --git a/include/asm-sh64/page.h b/include/asm-sh64/page.h index e1f7f5a4121..d6167f1c0e9 100644 --- a/include/asm-sh64/page.h +++ b/include/asm-sh64/page.h @@ -41,6 +41,7 @@ #define HPAGE_SIZE (1UL << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE-1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT-PAGE_SHIFT) +#define ARCH_HAS_SETCLEAR_HUGE_PTE #endif #ifdef __KERNEL__ diff --git a/include/asm-sh64/pgtable.h b/include/asm-sh64/pgtable.h index 525e1523ef5..78ac6be2d9e 100644 --- a/include/asm-sh64/pgtable.h +++ b/include/asm-sh64/pgtable.h @@ -430,6 +430,8 @@ extern inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | extern inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_EXECUTE)); return pte; } extern inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } +extern inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; } + /* * Conversion functions: convert a page and protection to a page entry. diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h index 219ea043a14..b87dbbd64bc 100644 --- a/include/asm-sparc64/page.h +++ b/include/asm-sparc64/page.h @@ -95,6 +95,8 @@ typedef unsigned long pgprot_t; #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1UL)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define ARCH_HAS_SETCLEAR_HUGE_PTE +#define ARCH_HAS_HUGETLB_PREFAULT_HOOK #endif #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \ diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h index ae2cd5b09a7..1ae00c5087f 100644 --- a/include/asm-sparc64/pgtable.h +++ b/include/asm-sparc64/pgtable.h @@ -286,6 +286,7 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_ACCESSED | _PAGE_R)) #define pte_mkwrite(pte) (__pte(pte_val(pte) | _PAGE_WRITE)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_MODIFIED | _PAGE_W)) +#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_SZHUGE)) /* to find an entry in a page-table-directory. 
*/ #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index f43048035a0..9ce338c3a71 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -28,6 +28,7 @@ #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index db2a0efbf57..4eec176c3c3 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -253,6 +253,7 @@ extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; extern inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) extern inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } extern inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } extern inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; } @@ -263,6 +264,7 @@ extern inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ extern inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } extern inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } +extern inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; } struct vm_area_struct; @@ -290,7 +292,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, */ #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) static inline int pmd_large(pmd_t pte) { return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6af1ae4a821..f529d144281 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -4,6 +4,7 @@ #ifdef CONFIG_HUGETLB_PAGE #include +#include struct ctl_table; @@ -22,12 +23,6 @@ int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); -int pmd_huge(pmd_t pmd); struct page *alloc_huge_page(void); void free_huge_page(struct page *); @@ -35,6 +30,17 @@ extern unsigned long max_huge_pages; extern const unsigned long hugetlb_zero, hugetlb_infinity; extern int sysctl_hugetlb_shm_group; +/* arch callbacks */ + +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); +struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, + int write); +struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write); +int is_aligned_hugepage_range(unsigned long addr, unsigned long len); +int 
pmd_huge(pmd_t pmd); + #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ @@ -48,6 +54,28 @@ extern int sysctl_hugetlb_shm_group; int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif +#ifndef ARCH_HAS_SETCLEAR_HUGE_PTE +#define set_huge_pte_at(mm, addr, ptep, pte) set_pte_at(mm, addr, ptep, pte) +#define huge_ptep_get_and_clear(mm, addr, ptep) ptep_get_and_clear(mm, addr, ptep) +#else +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); +#endif + +#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK +#define hugetlb_prefault_arch_hook(mm) do { } while (0) +#else +void hugetlb_prefault_arch_hook(struct mm_struct *mm); +#endif + +#ifndef ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE +#define hugetlb_clean_stale_pgtable(pte) BUG() +#else +void hugetlb_clean_stale_pgtable(pte_t *pte); +#endif + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4eb5ae3fbe1..fbd1111ea11 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7,10 +7,14 @@ #include #include #include -#include #include #include #include +#include +#include +#include + +#include const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages; @@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +{ + pte_t entry; + + if (vma->vm_flags & VM_WRITE) { + entry = + pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } else { + entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + } + entry = pte_mkyoung(entry); + entry = pte_mkhuge(entry); + + return entry; +} + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + + while (addr < end) { + dst_pte = huge_pte_alloc(dst, addr); + if (!dst_pte) + goto nomem; + src_pte = huge_pte_offset(src, addr); + BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */ + entry = *src_pte; + ptepage = pte_page(entry); + get_page(ptepage); + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(dst, addr, dst_pte, entry); + addr += HPAGE_SIZE; + } + return 0; + +nomem: + return -ENOMEM; +} + +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t pte; + struct page *page; + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(start & ~HPAGE_MASK); + BUG_ON(end & ~HPAGE_MASK); + + for (address = start; address < end; address += HPAGE_SIZE) { + pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address)); + if (pte_none(pte)) + continue; + page = pte_page(pte); + put_page(page); + } + add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); + flush_tlb_range(vma, start, end); +} + void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long length) { @@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma, unmap_hugepage_range(vma, start, start + length); spin_unlock(&mm->page_table_lock); } + +int hugetlb_prefault(struct address_space *mapping, struct 
vm_area_struct *vma) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(vma->vm_start & ~HPAGE_MASK); + BUG_ON(vma->vm_end & ~HPAGE_MASK); + + hugetlb_prefault_arch_hook(mm); + + spin_lock(&mm->page_table_lock); + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + unsigned long idx; + pte_t *pte = huge_pte_alloc(mm, addr); + struct page *page; + + if (!pte) { + ret = -ENOMEM; + goto out; + } + if (! pte_none(*pte)) + hugetlb_clean_stale_pgtable(pte); + + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + page = find_get_page(mapping, idx); + if (!page) { + /* charge the fs quota first */ + if (hugetlb_get_quota(mapping)) { + ret = -ENOMEM; + goto out; + } + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + ret = -ENOMEM; + goto out; + } + ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); + if (! ret) { + unlock_page(page); + } else { + hugetlb_put_quota(mapping); + free_huge_page(page); + goto out; + } + } + add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); + } +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, int *length, int i) +{ + unsigned long vpfn, vaddr = *position; + int remainder = *length; + + BUG_ON(!is_vm_hugetlb_page(vma)); + + vpfn = vaddr/PAGE_SIZE; + while (vaddr < vma->vm_end && remainder) { + + if (pages) { + pte_t *pte; + struct page *page; + + /* Some archs (sparc64, sh*) have multiple + * pte_ts to each hugepage. We have to make + * sure we get the first, for the page + * indexing below to work. */ + pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + + /* hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageCompound(page)); + + get_page(page); + pages[i] = page; + } + + if (vmas) + vmas[i] = vma; + + vaddr += PAGE_SIZE; + ++vpfn; + --remainder; + ++i; + } + + *length = remainder; + *position = vaddr; + + return i; +} -- cgit v1.2.3-70-g09d2 From 408fde81c1bff15c875a3618481e93a01dcc79ea Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:37 -0700 Subject: [PATCH] remove non-DISCONTIG use of pgdat->node_mem_map This patch effectively eliminates direct use of pgdat->node_mem_map outside of the DISCONTIG code. On a flat memory system, these fields aren't currently used, neither are they on a sparsemem system. There was also a node_mem_map(nid) macro on many architectures. Its use along with the use of ->node_mem_map itself was not consistent. It has been removed in favor of two new, more explicit, arch-independent macros: pgdat_page_nr(pgdat, pagenr) nid_page_nr(nid, pagenr) I called them "pgdat" and "nid" because we overload the term "node" to mean "NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways. I believe the newer names are much clearer. These macros can be overridden in the sparsemem case with a theoretically slower operation using node_start_pfn and pfn_to_page(), instead. We could make this the only behavior if people want, but I don't want to change too much at once. One thing at a time. This patch removes more code than it adds. 
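To give a feel for the intended usage, here is a minimal sketch (not part
of the patch; the function name and the reserved counter are made up for
illustration) of a per-node walk that goes through the new accessor
instead of dereferencing pgdat->node_mem_map directly:

	/* Sketch only: count reserved pages across all nodes. */
	static unsigned long count_reserved_pages(void)
	{
		pg_data_t *pgdat;
		unsigned long i, reserved = 0;

		for_each_pgdat(pgdat)
			for (i = 0; i < pgdat->node_spanned_pages; i++)
				if (PageReserved(pgdat_page_nr(pgdat, i)))
					reserved++;

		return reserved;
	}

nid_page_nr(nid, pagenr) is simply pgdat_page_nr(NODE_DATA(nid), pagenr)
for callers that start from a node id rather than a pg_data_t pointer.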
Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386 generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64. Full list here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/ Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs. Signed-off-by: Dave Hansen Signed-off-by: Martin J. Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/numa.c | 16 +++++++--------- arch/i386/mm/pgtable.c | 2 +- arch/ia64/mm/discontig.c | 9 +++++---- arch/m32r/mm/init.c | 4 ++-- arch/mips/sgi-ip27/ip27-memory.c | 5 ++--- arch/parisc/mm/init.c | 2 +- arch/ppc64/mm/init.c | 4 ++-- include/asm-alpha/mmzone.h | 3 +-- include/asm-i386/mmzone.h | 3 +-- include/asm-m32r/mmzone.h | 3 +-- include/asm-parisc/mmzone.h | 3 +-- include/asm-ppc64/mmzone.h | 3 +-- include/asm-x86_64/mmzone.h | 5 +---- include/linux/mmzone.h | 2 ++ 14 files changed, 28 insertions(+), 36 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index ba81c4422aa..c7481d59b6d 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -327,8 +327,6 @@ void __init mem_init(void) extern char _text, _etext, _data, _edata; extern char __init_begin, __init_end; unsigned long nid, i; - struct page * lmem_map; - high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); reservedpages = 0; @@ -338,10 +336,10 @@ void __init mem_init(void) */ totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); - lmem_map = node_mem_map(nid); pfn = NODE_DATA(nid)->node_start_pfn; for (i = 0; i < node_spanned_pages(nid); i++, pfn++) - if (page_is_ram(pfn) && PageReserved(lmem_map+i)) + if (page_is_ram(pfn) && + PageReserved(nid_page_nr(nid, i))) reservedpages++; } @@ -373,18 +371,18 @@ show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { - struct page * lmem_map = node_mem_map(nid); i = node_spanned_pages(nid); while (i-- > 0) { + struct page *page = nid_page_nr(nid, i); total++; - if (PageReserved(lmem_map+i)) + if (PageReserved(page)) reserved++; - else if (PageSwapCache(lmem_map+i)) + else if (PageSwapCache(page)) cached++; - else if (!page_count(lmem_map+i)) + else if (!page_count(page)) free++; else - shared += page_count(lmem_map + i) - 1; + shared += page_count(page) - 1; } } printk("%ld pages of RAM\n",total); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index dd81479ff88..80c84cdf22e 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -36,7 +36,7 @@ void show_mem(void) printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); total++; if (PageHighMem(page)) highmem++; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c0071092939..f3fd528ead3 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -560,14 +560,15 @@ void show_mem(void) int shared = 0, cached = 0, reserved = 0; printk("Node ID: %d\n", pgdat->node_id); for(i = 0; i < pgdat->node_spanned_pages; i++) { + struct page *page = pgdat_page_nr(pgdat, i); if (!ia64_pfn_valid(pgdat->node_start_pfn+i)) continue; - if (PageReserved(pgdat->node_mem_map+i)) + if (PageReserved(page)) reserved++; - else if (PageSwapCache(pgdat->node_mem_map+i)) + else if (PageSwapCache(page)) cached++; - else if (page_count(pgdat->node_mem_map+i)) - shared += page_count(pgdat->node_mem_map+i)-1; + else if (page_count(page)) + shared += 
page_count(page)-1; } total_present += present; total_reserved += reserved; diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index bc423d838fb..d9a40b1fe8b 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -49,7 +49,7 @@ void show_mem(void) printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); total++; if (PageHighMem(page)) highmem++; @@ -152,7 +152,7 @@ int __init reservedpages_count(void) reservedpages = 0; for_each_online_node(nid) for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) - if (PageReserved(NODE_DATA(nid)->node_mem_map + i)) + if (PageReserved(nid_page_nr(nid, i))) reservedpages++; return reservedpages; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 0a44a98d7ad..a160d04f7db 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -549,9 +549,8 @@ void __init mem_init(void) */ numslots = node_getlastslot(node); for (slot = 1; slot <= numslots; slot++) { - p = NODE_DATA(node)->node_mem_map + - (slot_getbasepfn(node, slot) - - slot_getbasepfn(node, 0)); + p = nid_page_nr(node, slot_getbasepfn(node, slot) - + slot_getbasepfn(node, 0)); /* * Free valid memory in current slot. diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index cac37589e35..2886ad70db4 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -506,7 +506,7 @@ void show_mem(void) for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; - p = node_mem_map(i) + j - node_start_pfn(i); + p = nid_page_nr(i, j) - node_start_pfn(i); total++; if (PageReserved(p)) diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 6fa1e6490b5..29dbe084c21 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -98,7 +98,7 @@ void show_mem(void) printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; i++) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); total++; if (PageReserved(page)) reserved++; @@ -654,7 +654,7 @@ void __init mem_init(void) for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; i++) { - page = pgdat->node_mem_map + i; + page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index 726c150dcbe..a011ef4cf3d 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -57,7 +57,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) * Given a kernel address, find the home node of the underlying memory. 
*/ #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define local_mapnr(kvaddr) \ @@ -108,7 +107,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) #define pfn_to_page(pfn) \ ({ \ unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT); \ - (node_mem_map(kvaddr_to_nid(kaddr)) + local_mapnr(kaddr)); \ + (NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr)); \ }) #define page_to_pfn(page) \ diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 13830ae67ca..9cec191f462 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -79,7 +79,6 @@ static inline int pfn_to_nid(unsigned long pfn) */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -100,7 +99,7 @@ static inline int pfn_to_nid(unsigned long pfn) ({ \ unsigned long __pfn = pfn; \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h index ebf0228fec4..d58878ec899 100644 --- a/include/asm-m32r/mmzone.h +++ b/include/asm-m32r/mmzone.h @@ -14,7 +14,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -32,7 +31,7 @@ extern struct pglist_data *node_data[]; ({ \ unsigned long __pfn = pfn; \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h index 928bf50c469..595d3dce120 100644 --- a/include/asm-parisc/mmzone.h +++ b/include/asm-parisc/mmzone.h @@ -19,7 +19,6 @@ extern struct node_map_data node_data[]; */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -38,7 +37,7 @@ extern struct node_map_data node_data[]; ({ \ unsigned long __pfn = (pfn); \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index 0619a41a3c9..cbfc5ecfe87 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -65,7 +65,6 @@ static inline int pa_to_nid(unsigned long pa) */ #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) @@ -76,7 +75,7 @@ static inline int pa_to_nid(unsigned long pa) #define discontigmem_pfn_to_page(pfn) \ ({ \ unsigned long __tmp = pfn; \ - (node_mem_map(pfn_to_nid(__tmp)) + \ + (NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \ node_localnr(__tmp, pfn_to_nid(__tmp))); \ }) diff --git 
a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index d95b7c24083..ca4fc3fe0de 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -35,9 +35,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) #define NODE_DATA(nid) (node_data[nid]) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) - -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) @@ -50,7 +47,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) (2.4 used to). */ #define pfn_to_page(pfn) ({ \ int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); \ - ((pfn) - node_start_pfn(nid)) + node_mem_map(nid); \ + ((pfn) - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; \ }) #define page_to_pfn(page) \ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4733d35d822..b79633d3a97 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -284,6 +284,8 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; -- cgit v1.2.3-70-g09d2 From 2b97690f4cd960779fb351b7cd9974390afabb36 Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:06 -0700 Subject: [PATCH] reorganize x86-64 NUMA and DISCONTIGMEM config options In order to use the alternative sparsemem implmentation for NUMA kernels, we need to reorganize the config options. This patch effectively abstracts out the CONFIG_DISCONTIGMEM options to CONFIG_NUMA in most cases. Thus, the discontigmem implementation may be employed as always, but the sparsemem implementation may be used alternatively. 
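The resulting split, condensed into one illustrative sketch (the hunks
below are the authoritative version; this only contrasts the two idioms):
node-aware boot-time setup now tests CONFIG_NUMA, while code that truly
assumes a single flat mem_map[] tests CONFIG_FLATMEM rather than
!CONFIG_DISCONTIGMEM:

	#ifdef CONFIG_NUMA		/* node-aware boot-time setup */
		numa_initmem_init(0, end_pfn);
	#else
		contig_initmem_init();
	#endif

	#ifdef CONFIG_FLATMEM		/* only meaningful with a flat mem_map[] */
		max_mapnr = end_pfn;
		if (!mem_map)
			BUG();
	#endif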
Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/head64.c | 2 +- arch/x86_64/kernel/setup.c | 6 +++--- arch/x86_64/mm/Makefile | 2 +- arch/x86_64/mm/init.c | 9 ++++++--- arch/x86_64/mm/ioremap.c | 2 +- include/asm-x86_64/io.h | 5 ----- include/asm-x86_64/mmzone.h | 15 +++++++++------ include/asm-x86_64/page.h | 4 +++- include/asm-x86_64/topology.h | 4 +--- 9 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 0f8c78dcd38..cf6ab147a2a 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -94,7 +94,7 @@ void __init x86_64_start_kernel(char * real_mode_data) s = strstr(saved_command_line, "earlyprintk="); if (s != NULL) setup_early_printk(s); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA s = strstr(saved_command_line, "numa="); if (s != NULL) numa_setup(s+5); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 125a0bb1f5d..dd8419b37a7 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -348,7 +348,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "mem=", 4)) parse_memopt(from+4, &from); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA if (!memcmp(from, "numa=", 5)) numa_setup(from+5); #endif @@ -377,7 +377,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) *cmdline_p = command_line; } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NUMA static void __init contig_initmem_init(void) { unsigned long bootmap_size, bootmap; @@ -554,7 +554,7 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else contig_initmem_init(); diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile index 66c354ad80c..1d232a87f11 100644 --- a/arch/x86_64/mm/Makefile +++ b/arch/x86_64/mm/Makefile @@ -4,7 +4,7 @@ obj-y := init.o fault.o ioremap.o extable.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_DISCONTIGMEM) += numa.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o obj-$(CONFIG_ACPI_NUMA) += srat.o diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index dbe53b4c7e6..72e4b364ed7 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -318,7 +318,7 @@ void zap_low_mappings(void) flush_tlb_all(); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NUMA void __init paging_init(void) { { @@ -427,13 +427,16 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA totalram_pages += numa_free_all_bootmem(); tmp = 0; /* should count reserved pages here for all nodes */ #else + +#ifdef CONFIG_FLATMEM max_mapnr = end_pfn; if (!mem_map) BUG(); +#endif totalram_pages += free_all_bootmem(); @@ -515,7 +518,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA int nid = phys_to_nid(phys); reserve_bootmem_node(NODE_DATA(nid), phys, len); #else diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index 58aac23760e..6972df480d2 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -178,7 +178,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned 
long size, unsigned l if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) return (__force void __iomem *)phys_to_virt(phys_addr); -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* * Don't allow anybody to remap normal RAM that we're using.. */ diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h index 94202703fae..37fc3f149a5 100644 --- a/include/asm-x86_64/io.h +++ b/include/asm-x86_64/io.h @@ -124,12 +124,7 @@ extern inline void * phys_to_virt(unsigned long address) /* * Change "struct page" to physical address. */ -#ifdef CONFIG_DISCONTIGMEM -#include #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#else -#define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) -#endif #include diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index ca4fc3fe0de..768413751b3 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -6,7 +6,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA #define VIRTUAL_BUG_ON(x) @@ -30,17 +30,16 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) return nid; } -#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) - -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) +#ifdef CONFIG_DISCONTIGMEM + +#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) +#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) /* AK: this currently doesn't deal with invalid addresses. 
We'll see if the 2.5 kernel doesn't pass them @@ -57,4 +56,8 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) ({ u8 nid__ = pfn_to_nid(pfn); \ nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) <= node_end_pfn(nid__); })) #endif + +#define local_mapnr(kvaddr) \ + ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) +#endif #endif diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 9ce338c3a71..60130f4ca98 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -119,7 +119,9 @@ extern __inline__ int get_order(unsigned long size) __pa(v); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_DISCONTIGMEM +#define __boot_va(x) __va(x) +#define __boot_pa(x) __pa(x) +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 67f24e0ea81..da21573ec73 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA #include #include @@ -37,7 +37,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus) } #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) -#ifdef CONFIG_NUMA /* sched_domains SD_NODE_INIT for x86_64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ @@ -59,7 +58,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus) .balance_interval = 1, \ .nr_balance_failed = 0, \ } -#endif #endif -- cgit v1.2.3-70-g09d2 From bbfceef47fb9467424113a004070bf37a806a97c Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:07 -0700 Subject: [PATCH] add x86-64 specific support for sparsemem This patch adds in the necessary support for sparsemem such that x86-64 kernels may use sparsemem as an alternative to discontigmem for NUMA kernels. Note that this does no preclude one from continuing to build NUMA kernels using discontigmem, but merely allows the option to build NUMA kernels with sparsemem. 
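For scale, the constants introduced in the new include/asm-x86_64/sparsemem.h
below work out as follows (derived numbers, not stated explicitly in the
patch):

	/*
	 * SECTION_SIZE_BITS = 27  ->  each section spans 1 << 27 = 128 MB
	 * MAX_PHYSADDR_BITS = 40  ->  1 << 40 = 1 TB of physical address space
	 * max sections      = 1 << (40 - 27) = 8192
	 *
	 * so the entire 1 TB physical address space is covered by at most
	 * 8192 sections, each carrying its own chunk of struct page map.
	 */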
Interestingly, the use of sparsemem in lieu of discontigmem in NUMA kernels results in reduced text size for otherwise equivalent kernels as shown in the example builds below: text data bss dec hex filename 2371036 765884 1237108 4374028 42be0c vmlinux.discontig 2366549 776484 1302772 4445805 43d66d vmlinux.sparse Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 27 +++++++++++++++++---------- arch/x86_64/mm/numa.c | 8 ++++++++ include/asm-x86_64/bitops.h | 2 -- include/asm-x86_64/sparsemem.h | 26 ++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 12 deletions(-) create mode 100644 include/asm-x86_64/sparsemem.h (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index dd8419b37a7..000015dd5a8 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -40,6 +40,8 @@ #include #include #include +#include + #include #include #include @@ -378,16 +380,19 @@ static __init void parse_cmdline_early (char ** cmdline_p) } #ifndef CONFIG_NUMA -static void __init contig_initmem_init(void) +static void __init +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long bootmap_size, bootmap; - bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, end_pfn); - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size); + unsigned long bootmap_size, bootmap; + + memory_present(0, start_pfn, end_pfn); + bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, end_pfn); + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size); } #endif @@ -557,7 +562,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else - contig_initmem_init(); + contig_initmem_init(0, end_pfn); #endif /* Reserve direct mapping */ @@ -618,6 +623,8 @@ void __init setup_arch(char **cmdline_p) } } #endif + + sparse_init(); paging_init(); check_ioapic(); diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index fd9f25d7a6c..84cde796ecb 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -66,6 +66,13 @@ int __init compute_hash_shift(struct node *nodes, int numnodes) return -1; } +#ifdef CONFIG_SPARSEMEM +int early_pfn_to_nid(unsigned long pfn) +{ + return phys_to_nid(pfn << PAGE_SHIFT); +} +#endif + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { @@ -80,6 +87,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; + memory_present(nodeid, start_pfn, end_pfn); nodedata_phys = find_e820_area(start, end, pgdat_size); if (nodedata_phys == -1L) panic("Cannot find memory pgdat in node %d\n", nodeid); diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index 5dd7727c756..a31bb99be53 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -411,8 +411,6 @@ static __inline__ int ffs(int x) /* find last set bit */ #define fls(x) generic_fls(x) -#define ARCH_HAS_ATOMIC_UNSIGNED 1 - #endif /* __KERNEL__ */ #endif /* _X86_64_BITOPS_H */ diff --git a/include/asm-x86_64/sparsemem.h b/include/asm-x86_64/sparsemem.h new file mode 100644 index 00000000000..dabb16714a7 --- /dev/null +++ b/include/asm-x86_64/sparsemem.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_64_SPARSEMEM_H +#define 
_ASM_X86_64_SPARSEMEM_H 1 + +#ifdef CONFIG_SPARSEMEM + +/* + * generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the flags + * field of the struct page + * + * SECTION_SIZE_BITS 2^n: size of each section + * MAX_PHYSADDR_BITS 2^n: max size of physical address space + * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space + * + */ + +#define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ +#define MAX_PHYSADDR_BITS 40 +#define MAX_PHYSMEM_BITS 40 + +extern int early_pfn_to_nid(unsigned long pfn); + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_X86_64_SPARSEMEM_H */ -- cgit v1.2.3-70-g09d2 From 8a9e1b0f564615bd92ba50162623e25c2904e564 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 23 Jun 2005 00:08:13 -0700 Subject: [PATCH] Platform SMIs and their interferance with tsc based delay calibration Issue: Current tsc based delay_calibration can result in significant errors in loops_per_jiffy count when the platform events like SMIs (System Management Interrupts that are non-maskable) are present. This could lead to potential kernel panic(). This issue is becoming more visible with 2.6 kernel (as default HZ is 1000) and on platforms with higher SMI handling latencies. During the boot time, SMIs are mostly used by BIOS (for things like legacy keyboard emulation). Description: The psuedocode for current delay calibration with tsc based delay looks like (0) Estimate a value for loops_per_jiffy (1) While (loops_per_jiffy estimate is accurate enough) (2) wait for jiffy transition (jiffy1) (3) Note down current tsc (tsc1) (4) loop until tsc becomes tsc1 + loops_per_jiffy (5) check whether jiffy changed since jiffy1 or not and refine loops_per_jiffy estimate Consider the following cases Case 1: If SMIs happen between (2) and (3) above, we can end up with a loops_per_jiffy value that is too low. This results in shorted delays and kernel can panic () during boot (Mostly at IOAPIC timer initialization timer_irq_works() as we don't have enough timer interrupts in a specified interval). Case 2: If SMIs happen between (3) and (4) above, then we can end up with a loops_per_jiffy value that is too high. And with current i386 code, too high lpj value (greater than 17M) can result in a overflow in delay.c:__const_udelay() again resulting in shorter delay and panic(). Solution: The patch below makes the calibration routine aware of asynchronous events like SMIs. We increase the delay calibration time and also identify any significant errors (greater than 12.5%) in the calibration and notify it to user. Patch below changes both i386 and x86-64 architectures to use this new and improved calibrate_delay_direct() routine. 
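In short (a condensed restatement; the authoritative code is in
init/calibrate.c below): each sample brackets its timer reads with
pre_start/post_start and pre_end/post_end so the tick rate can be bounded
from both sides:

	tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS; /* over-estimate  */
	tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS; /* under-estimate */

	if ((tsc_rate_max - tsc_rate_min) >= (tsc_rate_max >> 3))
		continue;	/* bounds >= 12.5% apart: an SMI hit, retry */

Only samples whose two bounds agree to within 12.5% contribute to
loops_per_jiffy; if none of the MAX_DIRECT_CALIBRATION_RETRIES attempts
qualify, calibrate_delay_direct() returns 0 and the caller falls back to
the existing delay()-based calibration.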
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/timers/common.c | 9 ++++ arch/i386/kernel/timers/timer.c | 9 ++++ arch/i386/kernel/timers/timer_hpet.c | 1 + arch/i386/kernel/timers/timer_pm.c | 1 + arch/i386/kernel/timers/timer_tsc.c | 1 + arch/x86_64/lib/delay.c | 7 +++ include/asm-i386/timer.h | 2 + include/asm-i386/timex.h | 3 ++ include/asm-x86_64/timex.h | 3 ++ init/calibrate.c | 94 ++++++++++++++++++++++++++++++++++++ 10 files changed, 130 insertions(+) (limited to 'include/asm-x86_64') diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c index 8e201219f52..b38cc0d0c71 100644 --- a/arch/i386/kernel/timers/common.c +++ b/arch/i386/kernel/timers/common.c @@ -139,6 +139,15 @@ bad_calibration: } #endif + +unsigned long read_timer_tsc(void) +{ + unsigned long retval; + rdtscl(retval); + return retval; +} + + /* calculate cpu_khz */ void init_cpu_khz(void) { diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c index a3d6a288088..7e39ed8e33f 100644 --- a/arch/i386/kernel/timers/timer.c +++ b/arch/i386/kernel/timers/timer.c @@ -64,3 +64,12 @@ struct timer_opts* __init select_timer(void) panic("select_timer: Cannot find a suitable timer\n"); return NULL; } + +int read_current_timer(unsigned long *timer_val) +{ + if (cur_timer->read_timer) { + *timer_val = cur_timer->read_timer(); + return 0; + } + return -1; +} diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index f778f471a09..15a7d727bd6 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -186,6 +186,7 @@ static struct timer_opts timer_hpet = { .get_offset = get_offset_hpet, .monotonic_clock = monotonic_clock_hpet, .delay = delay_hpet, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_hpet_init = { diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c index d77f22030fe..4ef20e66349 100644 --- a/arch/i386/kernel/timers/timer_pm.c +++ b/arch/i386/kernel/timers/timer_pm.c @@ -246,6 +246,7 @@ static struct timer_opts timer_pmtmr = { .get_offset = get_offset_pmtmr, .monotonic_clock = monotonic_clock_pmtmr, .delay = delay_pmtmr, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_pmtmr_init = { diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 180444d8782..27f08ae9120 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -572,6 +572,7 @@ static struct timer_opts timer_tsc = { .get_offset = get_offset_tsc, .monotonic_clock = monotonic_clock_tsc, .delay = delay_tsc, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_tsc_init = { diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index aed61a668a1..33a873a3c22 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SMP #include @@ -19,6 +20,12 @@ int x86_udelay_tsc = 0; /* Delay via TSC */ +int read_current_timer(unsigned long *timer_value) +{ + rdtscll(*timer_value); + return 0; +} + void __delay(unsigned long loops) { unsigned bclock, now; diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index c3470984983..dcf1e07db08 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -22,6 +22,7 @@ struct timer_opts { unsigned long (*get_offset)(void); unsigned long long 
(*monotonic_clock)(void); void (*delay)(unsigned long); + unsigned long (*read_timer)(void); }; struct init_timer_opts { @@ -52,6 +53,7 @@ extern struct init_timer_opts timer_cyclone_init; #endif extern unsigned long calibrate_tsc(void); +extern unsigned long read_timer_tsc(void); extern void init_cpu_khz(void); extern int recalibrate_cpu_khz(void); #ifdef CONFIG_HPET_TIMER diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h index b41e484c344..e370f907bd3 100644 --- a/include/asm-i386/timex.h +++ b/include/asm-i386/timex.h @@ -49,4 +49,7 @@ static inline cycles_t get_cycles (void) extern unsigned long cpu_khz; +extern int read_current_timer(unsigned long *timer_value); +#define ARCH_HAS_READ_CURRENT_TIMER 1 + #endif diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h index 34f31a18f90..24ecf6a637c 100644 --- a/include/asm-x86_64/timex.h +++ b/include/asm-x86_64/timex.h @@ -26,6 +26,9 @@ static inline cycles_t get_cycles (void) extern unsigned int cpu_khz; +extern int read_current_timer(unsigned long *timer_value); +#define ARCH_HAS_READ_CURRENT_TIMER 1 + extern struct vxtime_data vxtime; #endif diff --git a/init/calibrate.c b/init/calibrate.c index c698e04a3db..d206c7548fe 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -8,6 +8,8 @@ #include #include +#include + static unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -17,6 +19,92 @@ static int __init lpj_setup(char *str) __setup("lpj=", lpj_setup); +#ifdef ARCH_HAS_READ_CURRENT_TIMER + +/* This routine uses the read_current_timer() routine and gets the + * loops per jiffy directly, instead of guessing it using delay(). + * Also, this code tries to handle non-maskable asynchronous events + * (like SMIs) + */ +#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100)) +#define MAX_DIRECT_CALIBRATION_RETRIES 5 + +static unsigned long __devinit calibrate_delay_direct(void) +{ + unsigned long pre_start, start, post_start; + unsigned long pre_end, end, post_end; + unsigned long start_jiffies; + unsigned long tsc_rate_min, tsc_rate_max; + unsigned long good_tsc_sum = 0; + unsigned long good_tsc_count = 0; + int i; + + if (read_current_timer(&pre_start) < 0 ) + return 0; + + /* + * A simple loop like + * while ( jiffies < start_jiffies+1) + * start = read_current_timer(); + * will not do. As we don't really know whether jiffy switch + * happened first or timer_value was read first. And some asynchronous + * event can happen between these two events introducing errors in lpj. + * + * So, we do + * 1. pre_start <- When we are sure that jiffy switch hasn't happened + * 2. check jiffy switch + * 3. start <- timer value before or after jiffy switch + * 4. post_start <- When we are sure that jiffy switch has happened + * + * Note, we don't know anything about order of 2 and 3. 
+ * Now, by looking at post_start and pre_start difference, we can + * check whether any asynchronous event happened or not + */ + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + pre_start = 0; + read_current_timer(&start); + start_jiffies = jiffies; + while (jiffies <= (start_jiffies + 1)) { + pre_start = start; + read_current_timer(&start); + } + read_current_timer(&post_start); + + pre_end = 0; + end = post_start; + while (jiffies <= + (start_jiffies + 1 + DELAY_CALIBRATION_TICKS)) { + pre_end = end; + read_current_timer(&end); + } + read_current_timer(&post_end); + + tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS; + tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS; + + /* + * If the upper limit and lower limit of the tsc_rate is + * >= 12.5% apart, redo calibration. + */ + if (pre_start != 0 && pre_end != 0 && + (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) { + good_tsc_count++; + good_tsc_sum += tsc_rate_max; + } + } + + if (good_tsc_count) + return (good_tsc_sum/good_tsc_count); + + printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " + "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n"); + return 0; +} +#else +static unsigned long __devinit calibrate_delay_direct(void) {return 0;} +#endif + /* * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. This (like the original) is a little @@ -35,6 +123,12 @@ void __devinit calibrate_delay(void) "%lu.%02lu BogoMIPS preset\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100); + } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { + printk("Calibrating delay using timer specific routine.. "); + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100, + loops_per_jiffy); } else { loops_per_jiffy = (1<<12); -- cgit v1.2.3-70-g09d2 From 8c5a09082f4e61a176382e96a831a0636b918602 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:18 -0700 Subject: [PATCH] x86/x86_64: pcibus_to_node Define pcibus_to_node to be able to figure out which NUMA node contains a given PCI device. This defines pcibus_to_node(bus) in include/linux/topology.h and adjusts the macros for i386 and x86_64 that already provided a way to determine the cpumask of a pci device. x86_64 was changed to not build an array of cpumasks anymore. Instead an array of nodes is build which can be used to generate the cpumask via node_to_cpumask. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 4 +++- arch/x86_64/pci/k8-bus.c | 16 +--------------- include/asm-generic/topology.h | 9 ++++++++- include/asm-i386/topology.h | 8 ++------ include/asm-x86_64/topology.h | 14 +++----------- 5 files changed, 17 insertions(+), 34 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61a63be6b29..ed6a5588146 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -45,7 +46,8 @@ int acpi_found_madt; int apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -cpumask_t pci_bus_to_cpumask [256] = { [0 ... 
255] = CPU_MASK_ALL }; +unsigned char pci_bus_to_node [256]; +EXPORT_SYMBOL(pci_bus_to_node); static int mp_current_pci_id = 0; /* I/O APIC entries */ diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 62349c78db5..7e7d0c2a002 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -53,25 +53,11 @@ fill_mp_bus_to_cpumask(void) for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); j++) - pci_bus_to_cpumask[j] = - node_to_cpumask(NODE_ID(nid)); + pci_bus_to_node[j] = NODE_ID(nid); } } } - /* quick sanity check */ - printed = 0; - for (i = 0; i < 256; i++) { - if (cpus_empty(pci_bus_to_cpumask[i])) { - pci_bus_to_cpumask[i] = CPU_MASK_ALL; - if (printed) - continue; - printk(KERN_ERR - "k8-bus.c: some busses have empty cpu mask\n"); - printed = 1; - } - } - return 0; } diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index ec96e8b0f19..5d9d70cd17f 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -41,8 +41,15 @@ #ifndef node_to_first_cpu #define node_to_first_cpu(node) (0) #endif +#ifndef pcibus_to_node +#define pcibus_to_node(node) (-1) +#endif + #ifndef pcibus_to_cpumask -#define pcibus_to_cpumask(bus) (cpu_online_map) +#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ + CPU_MASK_ALL : \ + node_to_cpumask(pcibus_to_node(bus)) \ + ) #endif #endif /* _ASM_GENERIC_TOPOLOGY_H */ diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index 98f9e6850cb..6d0f67507b2 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -60,12 +60,8 @@ static inline int node_to_first_cpu(int node) return first_cpu(mask); } -/* Returns the number of the node containing PCI bus number 'busnr' */ -static inline cpumask_t __pcibus_to_cpumask(int busnr) -{ - return node_to_cpumask(mp_bus_id_to_node[busnr]); -} -#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) +#define pcibus_to_node(bus) mp_bus_id_to_node[(bus)->number] +#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus)) /* sched_domains SD_NODE_INIT for NUMAQ machines */ #define SD_NODE_INIT (struct sched_domain) { \ diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index da21573ec73..8f77e9f6bc2 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -13,8 +13,8 @@ extern cpumask_t cpu_online_map; extern unsigned char cpu_to_node[]; +extern unsigned char pci_bus_to_node[]; extern cpumask_t node_to_cpumask[]; -extern cpumask_t pci_bus_to_cpumask[]; #ifdef CONFIG_ACPI_NUMA extern int __node_distance(int, int); @@ -26,16 +26,8 @@ extern int __node_distance(int, int); #define parent_node(node) (node) #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) #define node_to_cpumask(node) (node_to_cpumask[node]) - -static inline cpumask_t __pcibus_to_cpumask(int bus) -{ - cpumask_t busmask = pci_bus_to_cpumask[bus]; - cpumask_t online = cpu_online_map; - cpumask_t res; - cpus_and(res, busmask, online); - return res; -} -#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) +#define pcibus_to_node(bus) pci_bus_to_node[(bus)->number] +#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus)); /* sched_domains SD_NODE_INIT for x86_64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ -- cgit v1.2.3-70-g09d2 From 59121003721a8fad11ee72e646fd9d3076b5679c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:25 -0700 Subject: [PATCH] i386: Selectable Frequency of 
the Timer Interrupt Make the timer frequency selectable. The timer interrupt may cause bus and memory contention in large NUMA systems since the interrupt occurs on each processor HZ times per second. Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 2 ++ arch/x86_64/Kconfig | 2 ++ include/asm-i386/param.h | 4 +++- include/asm-x86_64/param.h | 6 ++++-- kernel/Kconfig.hz | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 kernel/Kconfig.hz (limited to 'include/asm-x86_64') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index bfdcedef06e..d4ae5f9ceae 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -961,6 +961,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source kernel/Kconfig.hz + endmenu diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 61ed1665234..db259757dc8 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -402,6 +402,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source kernel/Kconfig.hz + endmenu # diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h index b6440526e42..fa02e67ea86 100644 --- a/include/asm-i386/param.h +++ b/include/asm-i386/param.h @@ -1,8 +1,10 @@ +#include + #ifndef _ASMi386_PARAM_H #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif diff --git a/include/asm-x86_64/param.h b/include/asm-x86_64/param.h index b707f0568c9..40b11937180 100644 --- a/include/asm-x86_64/param.h +++ b/include/asm-x86_64/param.h @@ -1,9 +1,11 @@ +#include + #ifndef _ASMx86_64_PARAM_H #define _ASMx86_64_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ +# define USER_HZ 100 /* .. some user interfaces are in "ticks */ #define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz new file mode 100644 index 00000000000..248e1c396f8 --- /dev/null +++ b/kernel/Kconfig.hz @@ -0,0 +1,46 @@ +# +# Timer Interrupt Frequency Configuration +# + +choice + prompt "Timer frequency" + default HZ_250 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 HZ but 100 HZ may be more + beneficial for servers and NUMA systems that do not need to have + a fast response for user interaction and that may experience bus + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts + per second. + + + config HZ_100 + bool "100 HZ" + help + 100 HZ is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + + config HZ_250 + bool "250 HZ" + help + 250 HZ is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems. 
+ + config HZ_1000 + bool "1000 HZ" + help + 1000 HZ is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + +endchoice + +config HZ + int + default 100 if HZ_100 + default 250 if HZ_250 + default 1000 if HZ_1000 + -- cgit v1.2.3-70-g09d2 From 32ecd42b6f94d3ee320a22827b46bd19ccf924e5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:08:38 -0700 Subject: [PATCH] eliminate duplicate rdpmc definition Eliminate duplicate definition of rdpmc in x86-64's mtrr.h. Signed-off-by: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/msr.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/asm-x86_64') diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h index 513e52c7182..bc700232728 100644 --- a/include/asm-x86_64/msr.h +++ b/include/asm-x86_64/msr.h @@ -57,11 +57,6 @@ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) -#define rdpmc(counter,low,high) \ - __asm__ __volatile__("rdpmc" \ - : "=a" (low), "=d" (high) \ - : "c" (counter)) - #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) #define rdpmc(counter,low,high) \ -- cgit v1.2.3-70-g09d2 From fa1e1bdf78d405f9905b8290ee9211e7a7bbc99b Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:44 -0700 Subject: [PATCH] xen: x86: Rename usermode macro Rename user_mode to user_mode_vm and add a user_mode macro similar to the x86-64 one. This is useful for Xen because the linux xen kernel does not runs on the same priviledge that a vanilla linux kernel, and with this we just need to redefine user_mode(). Signed-off-by: Vincent Hanquez Cc: Ian Pratt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/apic.c | 2 +- arch/i386/kernel/ptrace.c | 2 +- arch/i386/mach-voyager/voyager_smp.c | 2 +- arch/i386/oprofile/backtrace.c | 2 +- include/asm-i386/ptrace.h | 3 ++- include/asm-x86_64/ptrace.h | 1 + 6 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index d509836b70c..8d993fa7175 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -1133,7 +1133,7 @@ inline void smp_local_timer_interrupt(struct pt_regs * regs) } #ifdef CONFIG_SMP - update_process_times(user_mode(regs)); + update_process_times(user_mode_vm(regs)); #endif } diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index e34f651fa13..0da59b42843 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -668,7 +668,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) info.si_code = TRAP_BRKPT; /* User-mode eip? */ - info.si_addr = user_mode(regs) ? (void __user *) regs->eip : NULL; + info.si_addr = user_mode_vm(regs) ? 
(void __user *) regs->eip : NULL; /* Send us the fakey SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index a6e0ddd65bd..8c8527593da 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -1288,7 +1288,7 @@ smp_local_timer_interrupt(struct pt_regs * regs) per_cpu(prof_counter, cpu); } - update_process_times(user_mode(regs)); + update_process_times(user_mode_vm(regs)); } if( ((1<ebp; #endif - if (!user_mode(regs)) { + if (!user_mode_vm(regs)) { while (depth-- && valid_kernel_stack(head, regs)) head = dump_backtrace(head); return; diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 8618914b352..eef9f93870d 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -57,7 +57,8 @@ struct pt_regs { #ifdef __KERNEL__ struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); -#define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) +#define user_mode(regs) (3 & (regs)->xcs) +#define user_mode_vm(regs) ((VM_MASK & (regs)->eflags) || user_mode(regs)) #define instruction_pointer(regs) ((regs)->eip) #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/include/asm-x86_64/ptrace.h b/include/asm-x86_64/ptrace.h index 5bbc8d3141c..ca6f15ff61d 100644 --- a/include/asm-x86_64/ptrace.h +++ b/include/asm-x86_64/ptrace.h @@ -82,6 +82,7 @@ struct pt_regs { #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #define user_mode(regs) (!!((regs)->cs & 3)) +#define user_mode_vm(regs) user_mode(regs) #define instruction_pointer(regs) ((regs)->rip) extern unsigned long profile_pc(struct pt_regs *regs); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -- cgit v1.2.3-70-g09d2 From e9129e56e9ec50c0689eb4cf7a3ca132f1e776db Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:46 -0700 Subject: [PATCH] xen: x86_64: Add macro for debugreg Add 2 macros to set and get debugreg on x86_64. This is useful for Xen because it will need only to redefine each macro to a hypervisor call. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 2 +- arch/x86_64/kernel/traps.c | 4 ++-- include/asm-x86_64/processor.h | 8 ++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 3c5f3089371..0446e8b96a2 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -439,7 +439,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. */ if (current->thread.debugreg7) - asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. 
*/ return handle_signal(signr, &info, &ka, oldset, regs); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 3dfec8fdabc..21706c1a0f9 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -669,7 +669,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) } #endif - asm("movq %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) @@ -721,7 +721,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) info.si_addr = (void __user *)regs->rip; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - asm volatile("movq %0,%%db7"::"r"(0UL)); + set_debugreg(0UL, 7); return; clear_TF_reenable: diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 8b55f139968..106f666517b 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -280,6 +280,14 @@ struct thread_struct { set_fs(USER_DS); \ } while(0) +#define get_debugreg(var, register) \ + __asm__("movq %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movq %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + struct task_struct; struct mm_struct; -- cgit v1.2.3-70-g09d2 From dcd497f99a1ef29a7c5e76142965be77e9dacabd Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 23 Jun 2005 00:09:07 -0700 Subject: [PATCH] streamline preempt_count type across archs The preempt_count member of struct thread_info is currently either defined as int, unsigned int or __s32 depending on arch. This patch makes the type of preempt_count an int on all archs. Having preempt_count be an unsigned type prevents the catching of preempt_count < 0 bugs, and using int on some archs and __s32 on others is not exactely "neat" - much nicer when it's just int all over. A previous version of this patch was already ACK'ed by Robert Love, and the only change in this version of the patch compared to the one he ACK'ed is that this one also makes sure the preempt_count member is consistently commented. 
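As an illustration of why the signedness matters (a hedged sketch, not code taken from this patch; check_preempt_count is a made-up helper name): with a signed counter an unbalanced preempt_enable() drives the count below zero and can be detected, whereas an unsigned counter simply wraps around and the test below could never fire.

	/* sketch only: a debug check that a signed preempt_count makes possible */
	static inline void check_preempt_count(struct thread_info *ti)
	{
		if (unlikely(ti->preempt_count < 0))
			printk(KERN_ERR "BUG: preempt_count went negative "
			       "(unbalanced preempt_enable?)\n");
	}

With the old mix of __s32, int and unsigned int across architectures, such a check would behave differently per arch; making the member plain int everywhere keeps it meaningful.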
Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-arm/thread_info.h | 2 +- include/asm-arm26/thread_info.h | 2 +- include/asm-cris/thread_info.h | 2 +- include/asm-frv/thread_info.h | 2 +- include/asm-h8300/thread_info.h | 2 +- include/asm-i386/thread_info.h | 2 +- include/asm-ia64/thread_info.h | 2 +- include/asm-m32r/thread_info.h | 2 +- include/asm-m68k/thread_info.h | 2 +- include/asm-m68knommu/thread_info.h | 2 +- include/asm-mips/thread_info.h | 2 +- include/asm-parisc/thread_info.h | 2 +- include/asm-ppc/thread_info.h | 3 ++- include/asm-ppc64/thread_info.h | 2 +- include/asm-s390/thread_info.h | 2 +- include/asm-sh/thread_info.h | 2 +- include/asm-sh64/thread_info.h | 2 +- include/asm-sparc/thread_info.h | 4 ++-- include/asm-sparc64/thread_info.h | 2 +- include/asm-um/thread_info.h | 2 +- include/asm-v850/thread_info.h | 3 ++- include/asm-x86_64/thread_info.h | 2 +- 22 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include/asm-x86_64') diff --git a/include/asm-arm/thread_info.h b/include/asm-arm/thread_info.h index 66c585c50cf..8252a4cd860 100644 --- a/include/asm-arm/thread_info.h +++ b/include/asm-arm/thread_info.h @@ -49,7 +49,7 @@ struct cpu_context_save { */ struct thread_info { unsigned long flags; /* low level flags */ - __s32 preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ diff --git a/include/asm-arm26/thread_info.h b/include/asm-arm26/thread_info.h index 50f41b50268..aff3e5699c6 100644 --- a/include/asm-arm26/thread_info.h +++ b/include/asm-arm26/thread_info.h @@ -44,7 +44,7 @@ struct cpu_context_save { */ struct thread_info { unsigned long flags; /* low level flags */ - __s32 preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ diff --git a/include/asm-cris/thread_info.h b/include/asm-cris/thread_info.h index 53193feb082..5ba4b7865cc 100644 --- a/include/asm-cris/thread_info.h +++ b/include/asm-cris/thread_info.h @@ -31,7 +31,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-frv/thread_info.h b/include/asm-frv/thread_info.h index b80a97f50af..c8cba7836f0 100644 --- a/include/asm-frv/thread_info.h +++ b/include/asm-frv/thread_info.h @@ -33,7 +33,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-h8300/thread_info.h b/include/asm-h8300/thread_info.h index b07c9344776..bfcc755c3bb 100644 --- a/include/asm-h8300/thread_info.h +++ b/include/asm-h8300/thread_info.h @@ -23,7 +23,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* 
low level flags */ int cpu; /* cpu we're on */ - int preempt_count; /* 0 => preemptable, <0 => BUG*/ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index 2cd57271801..95add81237e 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -31,7 +31,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h index 8d5b7e77028..7dc8951708a 100644 --- a/include/asm-ia64/thread_info.h +++ b/include/asm-ia64/thread_info.h @@ -25,7 +25,7 @@ struct thread_info { __u32 flags; /* thread_info flags (see TIF_*) */ __u32 cpu; /* current CPU */ mm_segment_t addr_limit; /* user-level address space limit */ - __s32 preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ + int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; struct { int signo; diff --git a/include/asm-m32r/thread_info.h b/include/asm-m32r/thread_info.h index 9f3a0fcf6e2..7a6be7727a9 100644 --- a/include/asm-m32r/thread_info.h +++ b/include/asm-m32r/thread_info.h @@ -28,7 +28,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thread diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h index 5f58939c59d..2aed24f6fd2 100644 --- a/include/asm-m68k/thread_info.h +++ b/include/asm-m68k/thread_info.h @@ -8,7 +8,7 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ struct restart_block restart_block; diff --git a/include/asm-m68knommu/thread_info.h b/include/asm-m68knommu/thread_info.h index c8153b7c1f5..7b9a3fa3af5 100644 --- a/include/asm-m68knommu/thread_info.h +++ b/include/asm-m68knommu/thread_info.h @@ -36,7 +36,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ - int preempt_count; /* 0 => preemptable, <0 => BUG*/ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-mips/thread_info.h b/include/asm-mips/thread_info.h index 768900305e2..42fcd6f2c20 100644 --- a/include/asm-mips/thread_info.h +++ b/include/asm-mips/thread_info.h @@ -27,7 +27,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-parisc/thread_info.h b/include/asm-parisc/thread_info.h index fe9b7f8ae4c..57bbb76cb6c 100644 
--- a/include/asm-parisc/thread_info.h +++ b/include/asm-parisc/thread_info.h @@ -12,7 +12,7 @@ struct thread_info { unsigned long flags; /* thread_info flags (see TIF_*) */ mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ + int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; }; diff --git a/include/asm-ppc/thread_info.h b/include/asm-ppc/thread_info.h index e3b5284a6f9..27903db42ef 100644 --- a/include/asm-ppc/thread_info.h +++ b/include/asm-ppc/thread_info.h @@ -20,7 +20,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long local_flags; /* non-racy flags */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-ppc64/thread_info.h b/include/asm-ppc64/thread_info.h index 48b7900e90e..0494df6fca7 100644 --- a/include/asm-ppc64/thread_info.h +++ b/include/asm-ppc64/thread_info.h @@ -24,7 +24,7 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; /* set by force_successful_syscall_return */ unsigned char syscall_noerror; diff --git a/include/asm-s390/thread_info.h b/include/asm-s390/thread_info.h index aade85c53a6..fe101d41e84 100644 --- a/include/asm-s390/thread_info.h +++ b/include/asm-s390/thread_info.h @@ -50,7 +50,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ unsigned int cpu; /* current CPU */ - unsigned int preempt_count; /* 0 => preemptable */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-sh/thread_info.h b/include/asm-sh/thread_info.h index 4bbbd9f3c37..46080cefaff 100644 --- a/include/asm-sh/thread_info.h +++ b/include/asm-sh/thread_info.h @@ -20,7 +20,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ __u32 flags; /* low level flags */ __u32 cpu; - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; __u8 supervisor_stack[0]; }; diff --git a/include/asm-sh64/thread_info.h b/include/asm-sh64/thread_info.h index 8a32d6bd0b7..10f024c6a2e 100644 --- a/include/asm-sh64/thread_info.h +++ b/include/asm-sh64/thread_info.h @@ -22,7 +22,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ /* Put the 4 32-bit fields together to make asm offsetting easier. 
*/ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ __u16 cpu; mm_segment_t addr_limit; diff --git a/include/asm-sparc/thread_info.h b/include/asm-sparc/thread_info.h index 104f03c5541..ff6ccb3d24c 100644 --- a/include/asm-sparc/thread_info.h +++ b/include/asm-sparc/thread_info.h @@ -30,9 +30,9 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ - int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ int softirq_count; int hardirq_count; diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h index 517caaba1c8..0cd65295692 100644 --- a/include/asm-sparc64/thread_info.h +++ b/include/asm-sparc64/thread_info.h @@ -46,7 +46,7 @@ struct thread_info { unsigned long fault_address; struct pt_regs *kregs; struct exec_domain *exec_domain; - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ int __pad; unsigned long *utraps; diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h index 1feaaf148ef..97267f059ef 100644 --- a/include/asm-um/thread_info.h +++ b/include/asm-um/thread_info.h @@ -17,7 +17,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user diff --git a/include/asm-v850/thread_info.h b/include/asm-v850/thread_info.h index e2ef4459375..e4cfad94a55 100644 --- a/include/asm-v850/thread_info.h +++ b/include/asm-v850/thread_info.h @@ -30,7 +30,8 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h index f4b3b249639..08eb6e4f373 100644 --- a/include/asm-x86_64/thread_info.h +++ b/include/asm-x86_64/thread_info.h @@ -29,7 +29,7 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; -- cgit v1.2.3-70-g09d2 From 73649dab0fd524cb8545a8cb83c6eaf77b107105 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:23 -0700 Subject: [PATCH] x86_64 specific function return probes The following patch adds the x86_64 architecture specific implementation for function return probes. Function return probes is a mechanism built on top of kprobes that allows a caller to register a handler to be called when a given function exits. 
For example, to instrument the return path of sys_mkdir: static int sys_mkdir_exit(struct kretprobe_instance *i, struct pt_regs *regs) { printk("sys_mkdir exited\n"); return 0; } static struct kretprobe return_probe = { .handler = sys_mkdir_exit, }; return_probe.kp.addr = (kprobe_opcode_t *) kallsyms_lookup_name("sys_mkdir"); if (register_kretprobe(&return_probe)) { printk(KERN_DEBUG "Unable to register return probe!\n"); /* do error path */ } unregister_kretprobe(&return_probe); The way this works is that: * At system initialization time, kernel/kprobes.c installs a kprobe on a function called kretprobe_trampoline() that is implemented in the arch/x86_64/kernel/kprobes.c (More on this later) * When a return probe is registered using register_kretprobe(), kernel/kprobes.c will install a kprobe on the first instruction of the targeted function with the pre handler set to arch_prepare_kretprobe() which is implemented in arch/x86_64/kernel/kprobes.c. * arch_prepare_kretprobe() will prepare a kretprobe instance that stores: - nodes for hanging this instance in an empty or free list - a pointer to the return probe - the original return address - a pointer to the stack address With all this stowed away, arch_prepare_kretprobe() then sets the return address for the targeted function to a special trampoline function called kretprobe_trampoline() implemented in arch/x86_64/kernel/kprobes.c * The kprobe completes as normal, with control passing back to the target function that executes as normal, and eventually returns to our trampoline function. * Since a kprobe was installed on kretprobe_trampoline() during system initialization, control passes back to kprobes via the architecture specific function trampoline_probe_handler() which will lookup the instance in an hlist maintained by kernel/kprobes.c, and then call the handler function. * When trampoline_probe_handler() is done, the kprobes infrastructure single steps the original instruction (in this case just a top), and then calls trampoline_post_handler(). trampoline_post_handler() then looks up the instance again, puts the instance back on the free list, and then makes a long jump back to the original return instruction. So to recap, to instrument the exit path of a function this implementation will cause four interruptions: - A breakpoint at the very beginning of the function allowing us to switch out the return address - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) - A breakpoint in the trampoline function where our instrumented function returned to - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/kprobes.c | 98 +++++++++++++++++++++++++++++++++++++++++++- arch/x86_64/kernel/process.c | 16 ++++++++ include/asm-x86_64/kprobes.h | 3 ++ 3 files changed, 116 insertions(+), 1 deletion(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index f77f8a0ff18..203672ca740 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -27,6 +27,8 @@ * adapted for x86_64 * 2005-Mar Roland McGrath * Fixed to handle %rip-relative addressing mode correctly. 
+ * 2005-May Rusty Lynch + * Added function return probes functionality */ #include @@ -240,6 +242,50 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } +struct task_struct *arch_get_kprobe_task(void *ptr) +{ + return ((struct thread_info *) (((unsigned long) ptr) & + (~(THREAD_SIZE -1))))->task; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)regs->rsp; + struct kretprobe_instance *ri; + static void *orig_ret_addr; + + /* + * Save the return address when the return probe hits + * the first time, and use it to populate the (krprobe + * instance)->ret_addr for subsequent return probes at + * the same addrress since stack address would have + * the kretprobe_trampoline by then. + */ + if (((void*) *sara) != kretprobe_trampoline) + orig_ret_addr = (void*) *sara; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->stack_addr = sara; + ri->ret_addr = orig_ret_addr; + add_rp_inst(ri); + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; + } else { + rp->nmissed++; + } +} + +void arch_kprobe_flush_task(struct task_struct *tk) +{ + struct kretprobe_instance *ri; + while ((ri = get_rp_inst_tsk(tk)) != NULL) { + *((unsigned long *)(ri->stack_addr)) = + (unsigned long) ri->ret_addr; + recycle_rp_inst(ri); + } +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled thorough out this function. @@ -316,6 +362,55 @@ no_kprobe: return ret; } +/* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + "nop\n"); + } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct task_struct *tsk; + struct kretprobe_instance *ri; + struct hlist_head *head; + struct hlist_node *node; + unsigned long *sara = (unsigned long *)regs->rsp - 1; + + tsk = arch_get_kprobe_task(sara); + head = kretprobe_inst_table_head(tsk); + + hlist_for_each_entry(ri, node, head, hlist) { + if (ri->stack_addr == sara && ri->rp) { + if (ri->rp->handler) + ri->rp->handler(ri, regs); + } + } + return 0; +} + +void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kretprobe_instance *ri; + /* RA already popped */ + unsigned long *sara = ((unsigned long *)regs->rsp) - 1; + + while ((ri = get_rp_inst(sara))) { + regs->rip = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + } + regs->eflags &= ~TF_MASK; +} + /* * Called after single-stepping. 
p->addr is the address of the * instruction whose first byte has been replaced by the "int 3" @@ -404,7 +499,8 @@ int post_kprobe_handler(struct pt_regs *regs) if (current_kprobe->post_handler) current_kprobe->post_handler(current_kprobe, regs, 0); - resume_execution(current_kprobe, regs); + if (current_kprobe->post_handler != trampoline_post_handler) + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index dce8bab4306..e59d1f9d616 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -293,6 +294,14 @@ void exit_thread(void) { struct task_struct *me = current; struct thread_struct *t = &me->thread; + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(me); + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); @@ -312,6 +321,13 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h index bfea52d516f..6d6d883fdf6 100644 --- a/include/asm-x86_64/kprobes.h +++ b/include/asm-x86_64/kprobes.h @@ -38,6 +38,9 @@ typedef u8 kprobe_opcode_t; : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define ARCH_SUPPORTS_KRETPROBES + +void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { -- cgit v1.2.3-70-g09d2 From 11c80c8367db0a9d342529ed74464670cd86a1f6 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:09:59 -0700 Subject: [PATCH] adjust per_cpu definition in non-SMP case Fix (in the architectures I'm actually building for) the UP definition of per_cpu so that the cpu specified may be any expression, not just an identifier or a suffix expression. Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/percpu.h | 2 +- include/asm-ia64/percpu.h | 2 +- include/asm-x86_64/percpu.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/asm-x86_64') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 3b709b84934..9044aeb3782 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -29,7 +29,7 @@ do { \ #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h index 1e87f19dad5..2b14dee29ce 100644 --- a/include/asm-ia64/percpu.h +++ b/include/asm-ia64/percpu.h @@ -50,7 +50,7 @@ extern void *per_cpu_init(void); #else /* ! 
SMP */ -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #define per_cpu_init() (__phys_per_cpu_start) diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 415d73f3c8e..9c71855736f 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -39,7 +39,7 @@ extern void setup_per_cpu_areas(void); #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #endif /* SMP */ -- cgit v1.2.3-70-g09d2 From 0d77e5a2c23da734f5a7925f64afa1c2ed92e0f9 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 23 Jun 2005 00:10:14 -0700 Subject: [PATCH] compat: introduce compat_time_t This patch is based on work by Carlos O'Donell and Matthew Wilcox. It introduces/updates the compat_time_t type and uses it for compat siginfo structures. I have built this on ppc64 and x86_64. Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/ia32/ia32priv.h | 2 +- arch/s390/kernel/compat_linux.h | 2 +- arch/sparc64/kernel/signal32.c | 2 +- include/asm-ia64/compat.h | 1 + include/asm-mips/compat.h | 1 + include/asm-parisc/compat.h | 2 +- include/asm-ppc64/compat.h | 1 + include/asm-ppc64/ppc32.h | 2 +- include/asm-sparc64/compat.h | 1 + include/asm-x86_64/ia32.h | 2 +- 10 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/ia64/ia32/ia32priv.h b/arch/ia64/ia32/ia32priv.h index b2de948bdae..e3e9290e3ff 100644 --- a/arch/ia64/ia32/ia32priv.h +++ b/arch/ia64/ia32/ia32priv.h @@ -241,7 +241,7 @@ typedef struct compat_siginfo { /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ char _pad[sizeof(unsigned int) - sizeof(int)]; compat_sigval_t _sigval; /* same as below */ diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index bf33dcfec7d..3898f66d0b2 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -45,7 +45,7 @@ typedef struct compat_siginfo { /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index 9a375e975cf..f28428f4170 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -102,7 +102,7 @@ typedef struct compat_siginfo{ /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ diff --git a/include/asm-ia64/compat.h b/include/asm-ia64/compat.h index cc0ff0a4bdd..0c05e5bad8a 100644 --- a/include/asm-ia64/compat.h +++ b/include/asm-ia64/compat.h @@ -27,6 +27,7 @@ typedef u16 compat_ipc_pid_t; typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-mips/compat.h b/include/asm-mips/compat.h index dce92079e7f..d78002afb1e 100644 --- a/include/asm-mips/compat.h +++ 
b/include/asm-mips/compat.h @@ -29,6 +29,7 @@ typedef s32 compat_caddr_t; typedef struct { s32 val[2]; } compat_fsid_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-parisc/compat.h b/include/asm-parisc/compat.h index ca0eac647a0..7630d1ad239 100644 --- a/include/asm-parisc/compat.h +++ b/include/asm-parisc/compat.h @@ -24,7 +24,7 @@ typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; -typedef u32 compat_timer_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-ppc64/compat.h b/include/asm-ppc64/compat.h index 09c28d28ce6..12414f5fc66 100644 --- a/include/asm-ppc64/compat.h +++ b/include/asm-ppc64/compat.h @@ -26,6 +26,7 @@ typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; typedef s32 compat_key_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h index 1d040489755..6b44a8caf39 100644 --- a/include/asm-ppc64/ppc32.h +++ b/include/asm-ppc64/ppc32.h @@ -32,7 +32,7 @@ typedef struct compat_siginfo { /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ diff --git a/include/asm-sparc64/compat.h b/include/asm-sparc64/compat.h index 22f58055b8a..b59122dd176 100644 --- a/include/asm-sparc64/compat.h +++ b/include/asm-sparc64/compat.h @@ -25,6 +25,7 @@ typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; typedef s32 compat_key_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h index c0a7717923e..6efa00fe4e7 100644 --- a/include/asm-x86_64/ia32.h +++ b/include/asm-x86_64/ia32.h @@ -94,7 +94,7 @@ typedef struct compat_siginfo{ /* POSIX.1b timers */ struct { - int _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ -- cgit v1.2.3-70-g09d2 From 76e4f660d9f4c6d1bb473f72be2988c35eaca948 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Jun 2005 14:55:00 -0700 Subject: [PATCH] x86_64: CPU hotplug support Experimental CPU hotplug patch for x86_64 ----------------------------------------- This supports logical CPU online and offline. - Test with maxcpus=1, and then kick other cpu's off to test if init code is all cleaned up. CONFIG_SCHED_SMT works as well. - idle threads are forked on demand from keventd threads for clean startup TBD: 1. Not tested on a real NUMA machine (tested with numa=fake=2) 2. Handle ACPI pieces for physical hotplug support. 
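To show how other kernel code is expected to cope once CPUs can come and go, here is a hedged sketch using the standard CPU notifier interface of this era; it is not code contained in this patch, and my_cpu_callback/my_cpu_nb are made-up names:

	static int my_cpu_callback(struct notifier_block *nb,
				   unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;

		switch (action) {
		case CPU_ONLINE:
			/* allocate/initialise per-cpu state for 'cpu' */
			break;
		case CPU_DEAD:
			/* release that state again */
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block my_cpu_nb = {
		.notifier_call = my_cpu_callback,
	};

	/* during driver/subsystem init: register_cpu_notifier(&my_cpu_nb); */

The CPUs themselves are switched on and off through the /sys/devices/system/cpu/cpu# interface mentioned in the new Kconfig help below.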
Signed-off-by: Ashok Raj Acked-by: Andi Kleen Acked-by: Zwane Mwaikambo Signed-off-by: Shaohua.li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mach-default/topology.c | 15 ++- arch/x86_64/Kconfig | 9 ++ arch/x86_64/kernel/irq.c | 29 ++++++ arch/x86_64/kernel/process.c | 29 +++++- arch/x86_64/kernel/smpboot.c | 205 +++++++++++++++++++++++++++++++++++--- arch/x86_64/kernel/traps.c | 8 +- include/asm-x86_64/irq.h | 5 + include/asm-x86_64/smp.h | 2 + 8 files changed, 277 insertions(+), 25 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/i386/mach-default/topology.c b/arch/i386/mach-default/topology.c index 5b3e8817dae..23395fff35d 100644 --- a/arch/i386/mach-default/topology.c +++ b/arch/i386/mach-default/topology.c @@ -73,12 +73,11 @@ static int __init topology_init(void) { int i; - for (i = 0; i < MAX_NUMNODES; i++) { - if (node_online(i)) - arch_register_node(i); - } - for (i = 0; i < NR_CPUS; i++) - if (cpu_possible(i)) arch_register_cpu(i); + for_each_online_node(i) + arch_register_node(i); + + for_each_cpu(i) + arch_register_cpu(i); return 0; } @@ -88,8 +87,8 @@ static int __init topology_init(void) { int i; - for (i = 0; i < NR_CPUS; i++) - if (cpu_possible(i)) arch_register_cpu(i); + for_each_cpu(i) + arch_register_cpu(i); return 0; } diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index db259757dc8..92f5a526602 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -313,6 +313,15 @@ config NR_CPUS This is purely to save memory - each supported CPU requires memory in the static kernel configuration. +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL + help + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu/cpu#. + Say N if you want to disable CPU hotplug. + + config HPET_TIMER bool default y diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 62b112e4deb..cc3fb85f514 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 15fa35a81a7..1d91271796e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -8,7 +8,8 @@ * * X86-64 port * Andi Kleen. - * + * + * CPU hotplug support - ashok.raj@intel.com * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ @@ -18,6 +19,7 @@ #include +#include #include #include #include @@ -154,6 +156,29 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); +#ifdef CONFIG_HOTPLUG_CPU +DECLARE_PER_CPU(int, cpu_state); + +#include +/* We don't actually take CPU down, just spin without interrupts. 
*/ +static inline void play_dead(void) +{ + idle_task_exit(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + while (1) + safe_halt(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a @@ -174,6 +199,8 @@ void cpu_idle (void) idle = pm_idle; if (!idle) idle = default_idle; + if (cpu_is_offline(smp_processor_id())) + play_dead(); idle(); } diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index bc98a6722cb..5a3f955b657 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -34,6 +34,7 @@ * Andi Kleen : Converted to new state machine. * Various cleanups. * Probably mostly hotplug CPU ready now. + * Ashok Raj : CPU hotplug support */ @@ -98,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map); extern unsigned char trampoline_data[]; extern unsigned char trampoline_end[]; +/* State of each CPU */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + +/* + * Store all idle threads, this can be reused instead of creating + * a new thread. Also avoids complicated thread destroy functionality + * for idle threads. + */ +struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; + +#define get_idle_for_cpu(x) (idle_thread_array[(x)]) +#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) + +/* + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * If cpu-hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. + * - Ashok Raj + */ +#ifdef CONFIG_HOTPLUG_CPU +#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) +#else +#define fixup_cpu_possible_map(x) +#endif + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -623,33 +655,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta return (send_status | accept_status); } +struct create_idle { + struct task_struct *idle; + struct completion done; + int cpu; +}; + +void do_fork_idle(void *_c_idle) +{ + struct create_idle *c_idle = _c_idle; + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + /* * Boot one CPU. */ static int __cpuinit do_boot_cpu(int cpu, int apicid) { - struct task_struct *idle; unsigned long boot_error; int timeout; unsigned long start_rip; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER(c_idle.done), + }; + DECLARE_WORK(work, do_fork_idle, &c_idle); + + c_idle.idle = get_idle_for_cpu(cpu); + + if (c_idle.idle) { + c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); + init_idle(c_idle.idle, cpu); + goto do_rest; + } + /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. + * During cold boot process, keventd thread is not spun up yet. + * When we do cpu hot-add, we create idle threads on the fly, we should + * not acquire any attributes from the calling context. 
Hence the clean + * way to create kernel_threads() is to do that from keventd(). + * We do the current_is_keventd() due to the fact that ACPI notifier + * was also queuing to keventd() and when the caller is already running + * in context of keventd(), we would end up with locking up the keventd + * thread. */ - idle = fork_idle(cpu); - if (IS_ERR(idle)) { + if (!keventd_up() || current_is_keventd()) + work.func(work.data); + else { + schedule_work(&work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(idle); + return PTR_ERR(c_idle.idle); } - cpu_pda[cpu].pcurrent = idle; + set_idle_for_cpu(cpu, c_idle.idle); + +do_rest: + + cpu_pda[cpu].pcurrent = c_idle.idle; start_rip = setup_trampoline(); - init_rsp = idle->thread.rsp; + init_rsp = c_idle.idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); + clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, start_rip, init_rsp); @@ -925,10 +1001,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) int apicid = cpu_present_to_apicid(i); if (physid_isset(apicid, phys_cpu_present_map)) { cpu_set(i, cpu_present_map); - /* possible map would be different if we supported real - CPU hotplug. */ cpu_set(i, cpu_possible_map); } + fixup_cpu_possible_map(i); } if (smp_sanity_check(max_cpus) < 0) { @@ -977,9 +1052,6 @@ void __init smp_prepare_boot_cpu(void) /* * Entry point to boot a CPU. - * - * This is all __cpuinit, not __devinit for now because we don't support - * CPU hotplug (yet). */ int __cpuinit __cpu_up(unsigned int cpu) { @@ -996,6 +1068,14 @@ int __cpuinit __cpu_up(unsigned int cpu) return -EINVAL; } + /* + * Already booted CPU? + */ + if (cpu_isset(cpu, cpu_callin_map)) { + Dprintk("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1008,7 +1088,9 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - return 0; + err = 0; + + return err; } /* @@ -1016,7 +1098,9 @@ int __cpuinit __cpu_up(unsigned int cpu) */ void __init smp_cpus_done(unsigned int max_cpus) { +#ifndef CONFIG_HOTPLUG_CPU zap_low_mappings(); +#endif smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC @@ -1028,3 +1112,94 @@ void __init smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } + +#ifdef CONFIG_HOTPLUG_CPU + +static void +remove_siblinginfo(int cpu) +{ + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; +} + +void remove_cpu_from_maps(void) +{ + int cpu = smp_processor_id(); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ +} + +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. 
+ * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + disable_APIC_timer(); + + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we cleanup + * fixup_irqs as we do for IA64. + */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + remove_cpu_from_maps(); + fixup_irqs(cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) + return; + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +#else /* ... !CONFIG_HOTPLUG_CPU */ + +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 121646fc43f..10273663000 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) asmlinkage void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); /* Only the BSP gets external NMIs from the system. */ - if (!smp_processor_id()) + if (!cpu) reason = get_nmi_reason(); + if (!cpu_online(cpu)) + return; + if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h index 3af50b3c3b0..eb3b7aa9eb9 100644 --- a/include/asm-x86_64/irq.h +++ b/include/asm-x86_64/irq.h @@ -52,4 +52,9 @@ struct irqaction; struct pt_regs; int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); +#ifdef CONFIG_HOTPLUG_CPU +#include +extern void fixup_irqs(cpumask_t map); +#endif + #endif /* _ASM_IRQ_H */ diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index a7425aa5a3b..9c6242fb99d 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -77,6 +77,8 @@ extern __inline int hard_smp_processor_id(void) } extern int safe_smp_processor_id(void); +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); #endif /* !ASSEMBLY */ -- cgit v1.2.3-70-g09d2 From 884d9e40b4089014f40c49e86ac6505842db2b53 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Jun 2005 14:55:02 -0700 Subject: [PATCH] x86_64: Dont use broadcast shortcut to make it cpu hotplug safe. Broadcast IPI's provide un-expected behaviour for cpu hotplug. CPU's in offline state also end up receiving the IPI. Once the cpus become online they receive these stale IPI's which are bad and introduce unexpected behaviour. This is easily avoided by not sending a broadcast and addressing just the CPU's in online map. Doing prelim cycle counts it appears there is no big overhead and numbers seem around 0x3000-0x3900 on an average on x86 and x86_64 systems with CPUS running 3G, both for broadcast and mask version of the API's. The shortcuts are useful only for flat mode (where the perf shows no degradation), and in cluster mode, its unicast anyway. Its simpler to just not use broadcast anymore. 
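The core of the change can be summarised with a short sketch (simplified from the flat-mode genapic code in the diff below, not a verbatim excerpt; the function name is invented): instead of the ALLBUT/ALLINC shortcuts, the sender builds an explicit destination mask from cpu_online_map, so CPUs that are offline or still booting never see the vector.

	/* simplified sketch of the new "all but self" path */
	static void send_IPI_allbutself_sketch(int vector)
	{
		int this_cpu = get_cpu();	/* keep us from migrating while we look at the map */
		cpumask_t mask = cpu_online_map;

		cpu_clear(this_cpu, mask);	/* everyone currently online except ourselves */
		if (!cpus_empty(mask))
			flat_send_IPI_mask(mask, vector);
		put_cpu();
	}

Sending to an explicit online mask is what makes the smp_call_function()/cpu-online race below worth guarding with the new lock_ipi_call_lock()/unlock_ipi_call_lock() helpers.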
Signed-off-by: Ashok Raj Acked-by: Andi Kleen Acked-by: Zwane Mwaikambo Signed-off-by: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/genapic_flat.c | 42 ++++++++++++++++++++++++--------------- arch/x86_64/kernel/smp.c | 10 ++++++++++ arch/x86_64/kernel/smpboot.c | 19 +++++++++++++++++- include/asm-x86_64/smp.h | 2 ++ 4 files changed, 56 insertions(+), 17 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index b4cbbad0422..00f3fa6df71 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -7,6 +7,8 @@ * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. + * Ashok Raj + * Removed IPI broadcast shortcut to support CPU hotplug */ #include #include @@ -45,22 +47,6 @@ static void flat_init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static void flat_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then - * we get an APIC send error if we try to broadcast. - * thus we have to avoid sending IPIs in this case. - */ - if (num_online_cpus() > 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static void flat_send_IPI_all(int vector) -{ - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; @@ -93,6 +79,30 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static void flat_send_IPI_allbutself(int vector) +{ + cpumask_t mask; + /* + * if there are no other CPUs in the system then + * we get an APIC send error if we try to broadcast. + * thus we have to avoid sending IPIs in this case. + */ + int this_cpu = get_cpu(); + + mask = cpu_online_map; + cpu_clear(this_cpu, mask); + + if (cpus_weight(mask) >= 1) + flat_send_IPI_mask(mask, vector); + + put_cpu(); +} + +static void flat_send_IPI_all(int vector) +{ + flat_send_IPI_mask(cpu_online_map, vector); +} + static int flat_apic_id_registered(void) { return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 1e379ed17b1..ccae392886a 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -283,6 +283,16 @@ struct call_data_struct { static struct call_data_struct * call_data; +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + /* * this function sends a 'generic call function' IPI to all other CPUs * in the system. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 571a55462fa..b969ee12872 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -507,10 +507,23 @@ void __cpuinit start_secondary(void) */ set_cpu_sibling_map(smp_processor_id()); + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines number of + * IPI receipients, and the time when the determination is made + * for which cpus receive the IPI in genapic_flat.c. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); + /* * Allow the master to continue. 
*/ cpu_set(smp_processor_id(), cpu_online_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unlock_ipi_call_lock(); + mb(); /* Wait for TSC sync to not schedule things before. @@ -1038,6 +1051,7 @@ void __init smp_prepare_boot_cpu(void) cpu_set(me, cpu_callout_map); cpu_set(0, cpu_sibling_map[0]); cpu_set(0, cpu_core_map[0]); + per_cpu(cpu_state, me) = CPU_ONLINE; } /* @@ -1066,6 +1080,7 @@ int __cpuinit __cpu_up(unsigned int cpu) return -ENOSYS; } + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1170,8 +1185,10 @@ void __cpu_die(unsigned int cpu) for (i = 0; i < 10; i++) { /* They ack this in play_dead by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); return; + } current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 9c6242fb99d..aeb1b73e21e 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -43,6 +43,8 @@ extern cpumask_t cpu_callout_map; extern void smp_alloc_memory(void); extern volatile unsigned long smp_invalidate_needed; extern int pic_mode; +extern void lock_ipi_call_lock(void); +extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); -- cgit v1.2.3-70-g09d2 From 8d783b3e02002bce8cf9d4e4a82922ee7e59b1e5 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 25 Jun 2005 14:55:14 -0700 Subject: [PATCH] swsusp: clean assembly parts This patch fixes register saving so that each register is only saved once, and adds missing saving of %cr8 on x86-64. Some reordering so that save/restore is more logical/safer (segment registers should be restored after gdt). Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/power/cpu.c | 17 +++++++---------- arch/x86_64/kernel/suspend.c | 18 +++++++++--------- include/asm-x86_64/suspend.h | 2 +- 3 files changed, 17 insertions(+), 20 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index d099d01461f..0e6b45b6125 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) */ asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("sldt %0" : "=m" (ctxt->ldt)); asm volatile ("str %0" : "=m" (ctxt->tr)); /* @@ -107,7 +106,6 @@ static void fix_processor_context(void) void __restore_processor_state(struct saved_context *ctxt) { - /* * control registers */ @@ -116,6 +114,13 @@ void __restore_processor_state(struct saved_context *ctxt) asm volatile ("movl %0, %%cr2" :: "r" (ctxt->cr2)); asm volatile ("movl %0, %%cr0" :: "r" (ctxt->cr0)); + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + /* * segment registers */ @@ -124,14 +129,6 @@ void __restore_processor_state(struct saved_context *ctxt) asm volatile ("movw %0, %%gs" :: "r" (ctxt->gs)); asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). 
- */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - asm volatile ("lldt %0" :: "m" (ctxt->ldt)); - /* * sysenter MSRs */ diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ebaa1e37d65..6c0f402e3a8 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) */ asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("sldt %0" : "=m" (ctxt->ldt)); asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ @@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt) asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); + asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); } void save_processor_state(void) @@ -90,11 +90,19 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + /* * segment registers */ @@ -108,14 +116,6 @@ void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - asm volatile ("lldt %0" :: "m" (ctxt->ldt)); - fix_processor_context(); do_fpu_end(); diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h index ec745807fea..bb9f40597d0 100644 --- a/include/asm-x86_64/suspend.h +++ b/include/asm-x86_64/suspend.h @@ -16,7 +16,7 @@ arch_prepare_suspend(void) struct saved_context { u16 ds, es, fs, gs, ss; unsigned long gs_base, gs_kernel_base, fs_base; - unsigned long cr0, cr2, cr3, cr4; + unsigned long cr0, cr2, cr3, cr4, cr8; u16 gdt_pad; u16 gdt_limit; unsigned long gdt_base; -- cgit v1.2.3-70-g09d2 From b2b18660066997420b716c1881a6be8b82700d97 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Jun 2005 14:55:38 -0700 Subject: [PATCH] RCU: clean up a few remaining synchronize_kernel() calls 2.6.12-rc6-mm1 has a few remaining synchronize_kernel()s, some (but not all) in comments. This patch changes these synchronize_kernel() calls (and comments) to synchronize_rcu() or synchronize_sched() as follows: - arch/x86_64/kernel/mce.c mce_read(): change to synchronize_sched() to handle races with machine-check exceptions (synchronize_rcu() would not cut it given RCU implementations intended for hardcore realtime use. - drivers/input/serio/i8042.c i8042_stop(): change to synchronize_sched() to handle races with i8042_interrupt() interrupt handler. Again, synchronize_rcu() would not cut it given RCU implementations intended for hardcore realtime use. - include/*/kdebug.h comments: change to synchronize_sched() to handle races with NMIs. 
As before, synchronize_rcu() would not cut it... - include/linux/list.h comment: change to synchronize_rcu(), since this comment is for list_del_rcu(). - security/keys/key.c unregister_key_type(): change to synchronize_rcu(), since this is interacting with RCU read side. - security/keys/process_keys.c install_session_keyring(): change to synchronize_rcu(), since this is interacting with RCU read side. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 2 +- drivers/input/serio/i8042.c | 2 +- include/asm-i386/kdebug.h | 2 +- include/asm-ppc64/kdebug.h | 2 +- include/asm-sparc64/kdebug.h | 2 +- include/asm-x86_64/kdebug.h | 2 +- include/linux/list.h | 2 +- security/keys/key.c | 2 +- security/keys/process_keys.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 7ab15c8ab95..21e70625a49 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff memset(mcelog.entry, 0, next * sizeof(struct mce)); mcelog.next = 0; - synchronize_kernel(); + synchronize_sched(); /* Collect entries that were still getting written before the synchronize. */ diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 5900de3c3f4..a9bf549c8dc 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -396,7 +396,7 @@ static void i8042_stop(struct serio *serio) struct i8042_port *port = serio->port_data; port->exists = 0; - synchronize_kernel(); + synchronize_sched(); port->serio = NULL; } diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h index de6498b0d49..b3f8d5f59d5 100644 --- a/include/asm-i386/kdebug.h +++ b/include/asm-i386/kdebug.h @@ -18,7 +18,7 @@ struct die_args { }; /* Note - you should never unregister because that can race with NMIs. - If you really want to do it first unregister - then synchronize_kernel - then free. + If you really want to do it first unregister - then synchronize_sched - then free. */ int register_die_notifier(struct notifier_block *nb); extern struct notifier_block *i386die_chain; diff --git a/include/asm-ppc64/kdebug.h b/include/asm-ppc64/kdebug.h index 488634258a7..d383d161cf8 100644 --- a/include/asm-ppc64/kdebug.h +++ b/include/asm-ppc64/kdebug.h @@ -17,7 +17,7 @@ struct die_args { /* Note - you should never unregister because that can race with NMIs. - If you really want to do it first unregister - then synchronize_kernel - + If you really want to do it first unregister - then synchronize_sched - then free. */ int register_die_notifier(struct notifier_block *nb); diff --git a/include/asm-sparc64/kdebug.h b/include/asm-sparc64/kdebug.h index f70d3dad01f..6321f5a0198 100644 --- a/include/asm-sparc64/kdebug.h +++ b/include/asm-sparc64/kdebug.h @@ -16,7 +16,7 @@ struct die_args { }; /* Note - you should never unregister because that can race with NMIs. - * If you really want to do it first unregister - then synchronize_kernel + * If you really want to do it first unregister - then synchronize_sched * - then free. 
*/ int register_die_notifier(struct notifier_block *nb); diff --git a/include/asm-x86_64/kdebug.h b/include/asm-x86_64/kdebug.h index 6277f75cbb4..b90341994d8 100644 --- a/include/asm-x86_64/kdebug.h +++ b/include/asm-x86_64/kdebug.h @@ -14,7 +14,7 @@ struct die_args { }; /* Note - you should never unregister because that can race with NMIs. - If you really want to do it first unregister - then synchronize_kernel - then free. + If you really want to do it first unregister - then synchronize_sched - then free. */ int register_die_notifier(struct notifier_block *nb); extern struct notifier_block *die_chain; diff --git a/include/linux/list.h b/include/linux/list.h index 399b51d1721..aab2db21b01 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -185,7 +185,7 @@ static inline void list_del(struct list_head *entry) * list_for_each_entry_rcu(). * * Note that the caller is not permitted to immediately free - * the newly deleted entry. Instead, either synchronize_kernel() + * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ diff --git a/security/keys/key.c b/security/keys/key.c index 3304d37bb37..fb89f984446 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -980,7 +980,7 @@ void unregister_key_type(struct key_type *ktype) spin_unlock(&key_serial_lock); /* make sure everyone revalidates their keys */ - synchronize_kernel(); + synchronize_rcu(); /* we should now be able to destroy the payloads of all the keys of * this type with impunity */ diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 34db087bbcc..9b0369c5a22 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -234,7 +234,7 @@ static int install_session_keyring(struct task_struct *tsk, ret = 0; /* we're using RCU on the pointer */ - synchronize_kernel(); + synchronize_rcu(); key_put(old); error: return ret; -- cgit v1.2.3-70-g09d2 From 7897986bad8f6cd50d6149345aca7f6480f49464 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:13 -0700 Subject: [PATCH] sched: balance timers Do CPU load averaging over a number of different intervals. Allow each interval to be chosen by sending a parameter to source_load and target_load. 0 is instantaneous, idx > 0 returns a decaying average with the most recent sample weighted at 2^(idx-1). To a maximum of 3 (could be easily increased). So generally a higher number will result in more conservative balancing. 
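To make the weighting concrete, here is a standalone sketch (not part of the patch; the sample values are invented) of the per-index decaying average that the rebalance_tick() hunk further down applies to each cpu_load[] slot:

	#include <stdio.h>

	int main(void)
	{
		unsigned long cpu_load[3] = { 0, 0, 0 };
		/* hypothetical per-tick load samples (nr_running * SCHED_LOAD_SCALE) */
		unsigned long samples[6] = { 1024, 1024, 0, 0, 0, 0 };
		int t, i;

		for (t = 0; t < 6; t++) {
			for (i = 0; i < 3; i++) {
				unsigned long scale = 1UL << i;	/* index 0 just tracks the sample */
				unsigned long old_load = cpu_load[i];
				unsigned long new_load = samples[t];

				/* round up so a rising load is never under-estimated */
				if (new_load > old_load)
					new_load += scale - 1;
				cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
			}
			printf("tick %d: cpu_load = { %lu, %lu, %lu }\n",
			       t, cpu_load[0], cpu_load[1], cpu_load[2]);
		}
		return 0;
	}

A higher index reacts more slowly to load changes, which is why the busier balancing paths use the larger indices (e.g. busy_idx = 3 in the topology headers below) while newly-idle balancing uses a more responsive one.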
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/topology.h | 4 ++ include/asm-x86_64/topology.h | 6 +- include/linux/sched.h | 4 ++ include/linux/topology.h | 8 +++ kernel/sched.c | 138 ++++++++++++++++++++++-------------------- 5 files changed, 95 insertions(+), 65 deletions(-) (limited to 'include/asm-x86_64') diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index 6d0f67507b2..0055fbfeec7 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -74,6 +74,10 @@ static inline int node_to_first_cpu(int node) .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ + .busy_idx = 3, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ + .wake_idx = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 8f77e9f6bc2..fe8d80a1575 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -39,7 +39,11 @@ extern int __node_distance(int, int); .busy_factor = 32, \ .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 1, \ + .cache_nice_tries = 2, \ + .busy_idx = 3, \ + .idle_idx = 2, \ + .newidle_idx = 1, \ + .wake_idx = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c69682b044..664981ac1fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -488,6 +488,10 @@ struct sched_domain { unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + unsigned int busy_idx; + unsigned int idle_idx; + unsigned int newidle_idx; + unsigned int wake_idx; int flags; /* See SD_* */ /* Runtime fields. */ diff --git a/include/linux/topology.h b/include/linux/topology.h index d70e8972c67..ae9c2216dfa 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -89,6 +89,10 @@ .cache_hot_time = 0, \ .cache_nice_tries = 0, \ .per_cpu_gain = 25, \ + .busy_idx = 0, \ + .idle_idx = 0, \ + .newidle_idx = 0, \ + .wake_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ @@ -115,6 +119,10 @@ .cache_hot_time = (5*1000000/2), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ + .busy_idx = 2, \ + .idle_idx = 0, \ + .newidle_idx = 1, \ + .wake_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ diff --git a/kernel/sched.c b/kernel/sched.c index f665de34ed8..b597b07e791 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -206,7 +206,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -886,23 +886,27 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. 
*/ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return min(rq->cpu_load, load_now); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return max(rq->cpu_load, load_now); + return max(rq->cpu_load[type-1], load_now); } #endif @@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd; + struct sched_domain *sd, *this_sd = NULL; int new_cpu; #endif @@ -986,72 +990,64 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; -#ifdef CONFIG_SCHEDSTATS + new_cpu = cpu; + schedstat_inc(rq, ttwu_cnt); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); - } else { - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; } } -#endif - new_cpu = cpu; - if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; - load = source_load(cpu); - this_load = target_load(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: + * Check for affine wakeup and passive balancing possibilities. */ - if (sync) - this_load -= SCHED_LOAD_SCALE; - - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; - new_cpu = this_cpu; /* Wake to this CPU if we can */ + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); - /* - * Scan domains for affine wakeup and passive balancing - * possibilities. - */ - for_each_domain(this_cpu, sd) { - unsigned int imbalance; /* - * Start passive balancing when half the imbalance_pct - * limit is reached. + * If sync wakeup then subtract the (maximum possible) effect of + * the currently running task from the load of the current CPU: */ - imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + if (sync) + this_load -= SCHED_LOAD_SCALE; + + /* Don't pull the task off an idle CPU to a busy one */ + if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) + goto out_set_cpu; - if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) { + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + if ((this_sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, this_sd)) { /* * This domain has SD_WAKE_AFFINE and p is cache cold * in this domain. 
*/ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_affine); - goto out_set_cpu; - } - } else if ((sd->flags & SD_WAKE_BALANCE) && + schedstat_inc(this_sd, ttwu_move_affine); + goto out_set_cpu; + } else if ((this_sd->flags & SD_WAKE_BALANCE) && imbalance*this_load <= 100*load) { /* * This domain has SD_WAKE_BALANCE and there is * an imbalance. */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_balance); - goto out_set_cpu; - } + schedstat_inc(this_sd, ttwu_move_balance); + goto out_set_cpu; } } @@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, cpus_and(mask, sd->span, p->cpus_allowed); for_each_cpu_mask(i, mask) { - load = target_load(i); + load = target_load(i, sd->wake_idx); if (load < min_load) { min_cpu = i; @@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, } /* add +1 to account for the new task */ - this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; + this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE; /* * Would with the addition of the new task to the @@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx; max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; do { unsigned long load; @@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i); + load = target_load(i, load_idx); else - load = source_load(i); + load = source_load(i, load_idx); avg_load += load; } @@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + int i; - /* Update our load */ - old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + /* Update our load */ + for (i = 0; i < 3; i++) { + unsigned long new_load = this_load; + int scale = 1 << i; + old_load = this_rq->cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4921,13 +4929,15 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); + rq->nr_running = 0; rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; - rq->cpu_load = 0; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; -- cgit v1.2.3-70-g09d2 From cafb20c1f9976a70d633bb1e1c8c24eab00e4e80 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:17 -0700 Subject: [PATCH] sched: no aggressive idle balancing Remove the very aggressive idle stuff that has recently gone into 2.6 - it is going against the direction we are trying to go. Hopefully we can regain performance through other methods. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/topology.h | 1 - include/asm-x86_64/topology.h | 1 - include/linux/topology.h | 1 - kernel/sched.c | 21 ++------------------- 4 files changed, 2 insertions(+), 22 deletions(-) (limited to 'include/asm-x86_64') diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index 0055fbfeec7..5eb6f61dcef 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -82,7 +82,6 @@ static inline int node_to_first_cpu(int node) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_NEWIDLE \ - | SD_WAKE_IDLE \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index fe8d80a1575..9cb7459ce72 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -48,7 +48,6 @@ extern int __node_distance(int, int); .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ - | SD_WAKE_IDLE \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/include/linux/topology.h b/include/linux/topology.h index ae9c2216dfa..b23ec64df7f 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -127,7 +127,6 @@ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/kernel/sched.c b/kernel/sched.c index 5ae3568eed0..396724a2519 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -414,22 +414,6 @@ static inline runqueue_t *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) -{ - int sib; - for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { - if (idle_cpu(sib)) - continue; - return 0; - } - - return 1; -} -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) -#endif - #ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given @@ -1652,12 +1636,11 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, /* * Aggressive migration if: - * 1) the [whole] cpu is idle, or + * 1) task is cache cold, or * 2) too many balance attempts have failed. 
*/ - if (cpu_and_siblings_are_idle(this_cpu) || \ - sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) return 1; if (task_hot(p, rq->timestamp_last_tick, sd)) -- cgit v1.2.3-70-g09d2 From 147cbb4bbe991452698f0772d8292f22825710ba Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:19 -0700 Subject: [PATCH] sched: balance on fork Reimplement the balance on exec balancing to be sched-domains aware. Use this to also do balance on fork balancing. Make x86_64 do balance on fork over the NUMA domain. The problem that the non sched domains aware blancing became apparent on dual core, multi socket opterons. What we want is for the new tasks to be sent to a different socket, but more often than not, we would first load up our sibling core, or fill two cores of a single remote socket before selecting a new one. This gives large improvements to STREAM on such systems. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/topology.h | 2 + include/linux/sched.h | 10 +-- include/linux/topology.h | 2 + kernel/sched.c | 164 ++++++++++++++++++++++++++++-------------- 4 files changed, 119 insertions(+), 59 deletions(-) (limited to 'include/asm-x86_64') diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 9cb7459ce72..802d09b9c99 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -44,9 +44,11 @@ extern int __node_distance(int, int); .idle_idx = 2, \ .newidle_idx = 1, \ .wake_idx = 1, \ + .forkexec_idx = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ + | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 664981ac1fb..613491d3a87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -460,10 +460,11 @@ enum idle_type #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 4 /* Balance on exec */ -#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ +#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ +#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -492,6 +493,7 @@ struct sched_domain { unsigned int idle_idx; unsigned int newidle_idx; unsigned int wake_idx; + unsigned int forkexec_idx; int flags; /* See SD_* */ /* Runtime fields. 
*/ diff --git a/include/linux/topology.h b/include/linux/topology.h index b23ec64df7f..665597207de 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -93,6 +93,7 @@ .idle_idx = 0, \ .newidle_idx = 0, \ .wake_idx = 0, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ @@ -123,6 +124,7 @@ .idle_idx = 0, \ .newidle_idx = 1, \ .wake_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ diff --git a/kernel/sched.c b/kernel/sched.c index 396724a2519..7ecc237e2aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -893,6 +893,79 @@ static inline unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], load_now); } +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + local_group = cpu_isset(this_cpu, group->cpumask); + /* XXX: put a cpus allowed check */ + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + group = group->next; + } while (group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_queue - find the idlest runqueue among the cpus in group. + */ +static int find_idlest_cpu(struct sched_group *group, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i, 0); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + + #endif /* @@ -1107,11 +1180,6 @@ int fastcall wake_up_state(task_t *p, unsigned int state) return try_to_wake_up(p, state, 0); } -#ifdef CONFIG_SMP -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd); -#endif - /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
@@ -1181,12 +1249,38 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) unsigned long flags; int this_cpu, cpu; runqueue_t *rq, *this_rq; +#ifdef CONFIG_SMP + struct sched_domain *tmp, *sd = NULL; +#endif rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); + BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); + cpu = task_cpu(p); - BUG_ON(p->state != TASK_RUNNING); +#ifdef CONFIG_SMP + for_each_domain(cpu, tmp) + if (tmp->flags & SD_BALANCE_FORK) + sd = tmp; + + if (sd) { + struct sched_group *group; + + cpu = task_cpu(p); + group = find_idlest_group(sd, p, cpu); + if (group) { + int new_cpu; + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu != -1 && new_cpu != cpu && + cpu_isset(new_cpu, p->cpus_allowed)) { + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + } + } + } +#endif /* * We decrease the sleep average of forking parents @@ -1480,51 +1574,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) } } -/* - * find_idlest_cpu - find the least busy runqueue. - */ -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd) -{ - unsigned long load, min_load, this_load; - int i, min_cpu; - cpumask_t mask; - - min_cpu = UINT_MAX; - min_load = ULONG_MAX; - - cpus_and(mask, sd->span, p->cpus_allowed); - - for_each_cpu_mask(i, mask) { - load = target_load(i, sd->wake_idx); - - if (load < min_load) { - min_cpu = i; - min_load = load; - - /* break out early on an idle CPU: */ - if (!min_load) - break; - } - } - - /* add +1 to account for the new task */ - this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE; - - /* - * Would with the addition of the new task to the - * current CPU there be an imbalance between this - * CPU and the idlest CPU? - * - * Use half of the balancing threshold - new-context is - * a good opportunity to balance. - */ - if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) - return min_cpu; - - return this_cpu; -} - /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -1578,8 +1627,15 @@ void sched_exec(void) sd = tmp; if (sd) { + struct sched_group *group; schedstat_inc(sd, sbe_attempts); - new_cpu = find_idlest_cpu(current, this_cpu, sd); + group = find_idlest_group(sd, current, this_cpu); + if (!group) + goto out; + new_cpu = find_idlest_cpu(group, this_cpu); + if (new_cpu == -1) + goto out; + if (new_cpu != this_cpu) { schedstat_inc(sd, sbe_pushed); put_cpu(); @@ -1792,12 +1848,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; this = group; - goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; busiest = group; } -nextgroup: group = group->next; } while (group != sd->groups); -- cgit v1.2.3-70-g09d2 From 687f1661d302bc70ce906594a6d3f615ef075a50 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:21 -0700 Subject: [PATCH] sched: sched tuning Do some basic initial tuning. 
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/domain.c | 2 +- include/asm-i386/topology.h | 2 +- include/asm-ia64/topology.h | 61 +++++++++++++++++++++++++++++++++---------- include/asm-x86_64/topology.h | 3 +-- include/linux/topology.h | 11 ++++---- 5 files changed, 55 insertions(+), 24 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c index fe532c97043..afbde79c3b3 100644 --- a/arch/ia64/kernel/domain.c +++ b/arch/ia64/kernel/domain.c @@ -14,7 +14,7 @@ #include #include -#define SD_NODES_PER_DOMAIN 6 +#define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA /** diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index 5eb6f61dcef..2461b731781 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -81,7 +81,7 @@ static inline int node_to_first_cpu(int node) .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_BALANCE_NEWIDLE \ + | SD_BALANCE_FORK \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index 21cf351fd05..4e64c2a6b36 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -42,25 +42,54 @@ void build_cpu_to_node_map(void); +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000000), \ + .per_cpu_gain = 100, \ + .cache_nice_tries = 2, \ + .busy_idx = 2, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + /* sched_domains SD_NODE_INIT for IA64 NUMA machines */ #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ .groups = NULL, \ - .min_interval = 80, \ - .max_interval = 320, \ - .busy_factor = 320, \ + .min_interval = 8, \ + .max_interval = 8*(min(num_online_cpus(), 32)), \ + .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 1, \ + .cache_nice_tries = 2, \ + .busy_idx = 3, \ + .idle_idx = 2, \ + .newidle_idx = 0, /* unused */ \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_BALANCE_NEWIDLE \ - | SD_WAKE_IDLE \ + | SD_BALANCE_FORK \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ - .balance_interval = 1, \ + .balance_interval = 64, \ .nr_balance_failed = 0, \ } @@ -69,17 +98,21 @@ void build_cpu_to_node_map(void); .span = CPU_MASK_NONE, \ .parent = NULL, \ .groups = NULL, \ - .min_interval = 80, \ - .max_interval = 320, \ - .busy_factor = 320, \ - .imbalance_pct = 125, \ + .min_interval = 64, \ + .max_interval = 64*num_online_cpus(), \ + .busy_factor = 128, \ + .imbalance_pct = 133, \ .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ + .busy_idx = 3, \ + .idle_idx = 3, \ + .newidle_idx = 0, /* unused */ \ + .wake_idx = 0, /* unused */ \ + .forkexec_idx = 0, /* unused */ \ .per_cpu_gain = 100, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC, \ + .flags = SD_LOAD_BALANCE, \ .last_balance = jiffies, \ - .balance_interval = 100*(63+num_online_cpus())/64, \ + .balance_interval = 64, \ .nr_balance_failed = 0, \ } diff --git 
a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 802d09b9c99..c1bc3fad482 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -42,12 +42,11 @@ extern int __node_distance(int, int); .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 1, \ + .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ diff --git a/include/linux/topology.h b/include/linux/topology.h index 665597207de..0320225e96d 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -91,7 +91,7 @@ .per_cpu_gain = 25, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 0, \ + .newidle_idx = 1, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -121,15 +121,14 @@ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 1, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ - .forkexec_idx = 0, \ + .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE, \ + | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ -- cgit v1.2.3-70-g09d2 From 8f43d03fe2c4962c11d8227ac9505e590bad758b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:40 -0700 Subject: [PATCH] kexec: x86: rename APIC_MODE_EXINT From: "Maciej W. Rozycki" Rename APIC_MODE_EXINT to APIC_MODE_EXTINT - I think it should be named after what the mode is called in documentation. From: "Eric W. Biederman" I have reduced this patch to just the name change in the header. And integrated the changes into the patches that add those lines. Otherwise I ran into some ugly dependencies. Signed-off-by: Maciej W. Rozycki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/apicdef.h | 2 +- include/asm-x86_64/apicdef.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/asm-x86_64') diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h index c689554ad5b..41e8d2d918e 100644 --- a/include/asm-i386/apicdef.h +++ b/include/asm-i386/apicdef.h @@ -90,7 +90,7 @@ #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 #define APIC_MODE_NMI 0x4 -#define APIC_MODE_EXINT 0x7 +#define APIC_MODE_EXTINT 0x7 #define APIC_LVT1 0x360 #define APIC_LVTERR 0x370 #define APIC_TMICT 0x380 diff --git a/include/asm-x86_64/apicdef.h b/include/asm-x86_64/apicdef.h index bfebdb69065..9388062c4f6 100644 --- a/include/asm-x86_64/apicdef.h +++ b/include/asm-x86_64/apicdef.h @@ -94,7 +94,7 @@ #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 #define APIC_MODE_NMI 0x4 -#define APIC_MODE_EXINT 0x7 +#define APIC_MODE_EXTINT 0x7 #define APIC_LVT1 0x360 #define APIC_LVTERR 0x370 #define APIC_TMICT 0x380 -- cgit v1.2.3-70-g09d2 From 208fb93162d51faa69b9774fa7809858d84fd9dc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:45 -0700 Subject: [PATCH] kexec: x86_64: restore apic virtual wire mode on shutdown When coming out of apic mode attempt to set the appropriate apic back into virtual wire mode. This improves on previous versions of this patch by never setting both the local apic and the ioapic into virtual wire mode.
This code looks at data from the mptable to see if an ioapic has an ExtInt input to make this decision. A future improvement is to figure out which apic or ioapic was in virtual wire mode at boot time and to remember it. That is potentially a more accurate method, of selecting which apic to place in virutal wire mode. Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 38 +++++++++++++++++++++++++++++++++++++- arch/x86_64/kernel/io_apic.c | 36 ++++++++++++++++++++++++++++++++++-- include/asm-x86_64/apic.h | 2 +- 3 files changed, 72 insertions(+), 4 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 2a6c893ccf6..375d369570c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void) } } -void disconnect_bsp_APIC(void) +void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* @@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } + else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afd87e64d0a..157190d986b 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type) /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int __init find_isa_irq_pin(int irq, int type) +static int find_isa_irq_pin(int irq, int type) { int i; @@ -1132,12 +1132,44 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - disconnect_bsp_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. 
+ */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + disconnect_bsp_APIC(pin != -1); } /* diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index e4b1017b8b2..16ec82e16b2 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -77,7 +77,7 @@ static inline void ack_APIC_irq(void) extern int get_maxlvt (void); extern void clear_local_APIC (void); extern void connect_bsp_APIC (void); -extern void disconnect_bsp_APIC (void); +extern void disconnect_bsp_APIC (int virt_wire_setup); extern void disable_local_APIC (void); extern int verify_local_APIC (void); extern void cache_APIC_registers (void); -- cgit v1.2.3-70-g09d2 From d0537508a9921efced238b20967e50e519ac34af Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:52 -0700 Subject: [PATCH] kexec: x86_64: add CONFIG_PHYSICAL_START For one kernel to report a crash another kernel has created we need to have 2 kernels loaded simultaneously in memory. To accomplish this the two kernels need to built to run at different physical addresses. This patch adds the CONFIG_PHYSICAL_START option to the x86_64 kernel so we can do just that. You need to know what you are doing and the ramifications are before changing this value, and most users won't care so I have made it depend on CONFIG_EMBEDDED bzImage kernels will work and run at a different address when compiled with this option but they will still load at 1MB. If you need a kernel loaded at a different address as well you need to boot a vmlinux. Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/Kconfig | 11 +++++++++++ arch/x86_64/boot/compressed/head.S | 7 ++++--- arch/x86_64/boot/compressed/misc.c | 7 ++++--- arch/x86_64/kernel/head.S | 18 +++++++++--------- include/asm-x86_64/page.h | 6 ++++-- 5 files changed, 32 insertions(+), 17 deletions(-) (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index a853d87ff7e..56c02cf6397 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -369,6 +369,17 @@ config X86_MCE_INTEL Additional support for intel specific MCE features such as the thermal monitor. +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if EMBEDDED + default "0x100000" + help + This gives the physical address where the kernel is loaded. + Primarily used in the case of kexec on panic where the + fail safe kernel needs to run at a different address than + the panic-ed kernel. + + Don't change this unless you know what you are doing. 
+ config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 27264dbd575..b0df2aeff5d 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -28,6 +28,7 @@ #include #include +#include .code32 .globl startup_32 @@ -77,7 +78,7 @@ startup_32: jnz 3f addl $8,%esp xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START /* * We come here, if we were loaded high. @@ -103,7 +104,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $0x100000,%edi + movl $__PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine @@ -128,7 +129,7 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START move_routine_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index c8b9216f9e6..c43cedb5c1d 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -11,6 +11,7 @@ #include "miscsetup.h" #include +#include /* * gzip declarations @@ -284,7 +285,7 @@ void setup_normal_output_buffer(void) #else if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)0x100000; /* Points to 1M */ + output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -307,8 +308,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(0x100000 + low_buffer_size); + if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 9bd2e7a4b81..8d765aa77a2 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -248,23 +248,23 @@ ENTRY(_stext) */ .org 0x1000 ENTRY(init_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ .org 0x2000 ENTRY(level3_ident_pgt) - .quad 0x0000000000104007 + .quad 0x0000000000004007 + __PHYSICAL_START .fill 511,8,0 .org 0x3000 ENTRY(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */ .fill 1,8,0 .org 0x4000 @@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table) .org 0xa000 ENTRY(level3_physmem_pgt) - .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ .org 0xb000 #ifdef CONFIG_ACPI_SLEEP ENTRY(wakeup_level4_pgt) 
- .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif .data diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 60130f4ca98..431318764af 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -64,12 +64,14 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -#define __START_KERNEL 0xffffffff80100000UL +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map 0xffffffff80000000UL #define __PAGE_OFFSET 0xffff810000000000UL #else -#define __START_KERNEL 0xffffffff80100000 +#define __PHYSICAL_START CONFIG_PHYSICAL_START +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map 0xffffffff80000000 #define __PAGE_OFFSET 0xffff810000000000 #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3-70-g09d2 From 5234f5eb04abbbfa306ccfbc2ccbb6e73f515b15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:58:02 -0700 Subject: [PATCH] kexec: x86_64 kexec implementation This is the x86_64 implementation of machine kexec. 32bit compatibility support has been implemented, and machine_kexec has been enhanced to not care about the changing internal kernel paget table structures. From: Alexander Nyberg build fix Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/Kconfig | 17 +++ arch/x86_64/ia32/ia32entry.S | 2 +- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/crash.c | 38 ++++++ arch/x86_64/kernel/machine_kexec.c | 245 +++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/relocate_kernel.S | 143 ++++++++++++++++++++ include/asm-x86_64/kexec.h | 28 ++++ include/asm-x86_64/unistd.h | 2 +- 8 files changed, 474 insertions(+), 2 deletions(-) create mode 100644 arch/x86_64/kernel/crash.c create mode 100644 arch/x86_64/kernel/machine_kexec.c create mode 100644 arch/x86_64/kernel/relocate_kernel.S create mode 100644 include/asm-x86_64/kexec.h (limited to 'include/asm-x86_64') diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 56c02cf6397..d09437b5c48 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -380,6 +380,23 @@ config PHYSICAL_START Don't change this unless you know what you are doing. +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is indepedent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similiarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. 
+
 config SECCOMP
	bool "Enable seccomp to safely compute untrusted bytecode"
	depends on PROC_FS
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index f3ca0db85b5..cc935427d53 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -589,7 +589,7 @@ ia32_sys_call_table:
	.quad compat_sys_mq_timedreceive	/* 280 */
	.quad compat_sys_mq_notify
	.quad compat_sys_mq_getsetattr
-	.quad quiet_ni_syscall		/* reserved for kexec */
+	.quad compat_sys_kexec_load	/* reserved for kexec */
	.quad compat_sys_waitid
	.quad quiet_ni_syscall		/* sys_altroot */
	.quad sys_add_key
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fd..48f9e2c19cd 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o mpparse.o \
		genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_PM)		+= suspend.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 00000000000..7caf8a49d0c
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,38 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(void)
+{
+	/* This function is only called after the system
+	 * has panicked or is otherwise in a critical state.
+	 * The minimum amount of code to allow a kexec'd kernel
+	 * to run successfully needs to happen here.
+	 *
+	 * In practice this means shooting down the other cpus in
+	 * an SMP system.
+	 */
+}
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 00000000000..200b5993f8d
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,245 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(
+	u64 *level2p, unsigned long addr)
+{
+	unsigned long end_addr;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL2_SIZE;
+	while(addr < end_addr) {
+		*(level2p++) = addr | L1_ATTR;
+		addr += LEVEL1_SIZE;
+	}
+}
+
+static int init_level3_page(struct kimage *image,
+	u64 *level3p, unsigned long addr, unsigned long last_addr)
+{
+	unsigned long end_addr;
+	int result;
+	result = 0;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL3_SIZE;
+	while((addr < last_addr) && (addr < end_addr)) {
+		struct page *page;
+		u64 *level2p;
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		level2p = (u64 *)page_address(page);
+		init_level2_page(level2p, addr);
+		*(level3p++) = __pa(level2p) | L2_ATTR;
+		addr += LEVEL2_SIZE;
+	}
+	/* clear the unused entries */
+	while(addr < end_addr) {
+		*(level3p++) = 0;
+		addr += LEVEL2_SIZE;
+	}
+out:
+	return result;
+}
+
+
+static int init_level4_page(struct kimage *image,
+	u64 *level4p, unsigned long addr, unsigned long last_addr)
+{
+	unsigned long end_addr;
+	int result;
+	result = 0;
+	addr &= PAGE_MASK;
+	end_addr = addr + LEVEL4_SIZE;
+	while((addr < last_addr) && (addr < end_addr)) {
+		struct page *page;
+		u64 *level3p;
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		level3p = (u64 *)page_address(page);
+		result = init_level3_page(image, level3p, addr, last_addr);
+		if (result) {
+			goto out;
+		}
+		*(level4p++) = __pa(level3p) | L3_ATTR;
+		addr += LEVEL3_SIZE;
+	}
+	/* clear the unused entries */
+	while(addr < end_addr) {
+		*(level4p++) = 0;
+		addr += LEVEL3_SIZE;
+	}
+out:
+	return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+	u64 *level4p;
+	level4p = (u64 *)__va(start_pgtable);
+	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+	unsigned char curidt[10];
+
+	/* x86-64 supports unaligned loads & stores */
+	(*(u16 *)(curidt)) = limit;
+	(*(u64 *)(curidt +2)) = (unsigned long)(newidt);
+
+	__asm__ __volatile__ (
+		"lidt %0\n"
+		: "=m" (curidt)
+		);
+};
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+	unsigned char curgdt[10];
+
+	/* x86-64 supports unaligned loads & stores */
+	(*(u16 *)(curgdt)) = limit;
+	(*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+	__asm__ __volatile__ (
+		"lgdt %0\n"
+		: "=m" (curgdt)
+		);
+};
+
+static void load_segments(void)
+{
+	__asm__ __volatile__ (
+		"\tmovl $"STR(__KERNEL_DS)",%eax\n"
+		"\tmovl %eax,%ds\n"
+		"\tmovl %eax,%es\n"
+		"\tmovl %eax,%ss\n"
+		"\tmovl %eax,%fs\n"
+		"\tmovl %eax,%gs\n"
+		);
+#undef STR
+#undef __STR
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(
+	unsigned long indirection_page, unsigned long control_code_buffer,
+	unsigned long start_address, unsigned long pgtable) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+	unsigned long start_pgtable, control_code_buffer;
+	int result;
+
+	/* Calculate the offsets */
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	control_code_buffer = start_pgtable + 4096UL;
+
+	/* Setup the identity mapped 64bit page table */
+	result = init_pgtable(image, start_pgtable);
+	if (result) {
+		return result;
+	}
+
+	/* Place the code in the reboot code buffer */
+	memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
+
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+	return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	unsigned long page_list;
+	unsigned long control_code_buffer;
+	unsigned long start_pgtable;
+	relocate_new_kernel_t rnk;
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+
+	/* Calculate the offsets */
+	page_list = image->head;
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	control_code_buffer = start_pgtable + 4096UL;
+
+	/* Set the low half of the page table to my identity mapped
+	 * page table for kexec.  Leave the high half pointing at the
+	 * kernel pages.  Don't bother to flush the global pages
+	 * as that will happen when I fully switch to my identity mapped
+	 * page table anyway.
+	 */
+	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+	__flush_tlb();
+
+
+	/* The segment registers are funny things, they are
+	 * automatically loaded from a table in memory whenever you
+	 * set them to a specific selector, but this table is never
+	 * accessed again unless you set the segment to a different selector.
+	 *
+	 * The more common model is a cache where the behind-the-scenes
+	 * work is done, but is also dropped at arbitrary times.
+	 *
+	 * I take advantage of this here by force loading the
+	 * segments, before I zap the gdt with an invalid value.
+	 */
+	load_segments();
+	/* The gdt & idt are now invalid.
+	 * If you want to load them you must set up your own idt & gdt.
+	 */
+	set_gdt(phys_to_virt(0),0);
+	set_idt(phys_to_virt(0),0);
+	/* now call it */
+	rnk = (relocate_new_kernel_t) control_code_buffer;
+	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 00000000000..d24fa9b72a2
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include
+
+	/*
+	 * Must be relocatable PIC code callable as a C function, that once
+	 * it starts cannot use the previous process's stack.
+	 */
+	.globl relocate_new_kernel
+	.code64
+relocate_new_kernel:
+	/* %rdi page_list
+	 * %rsi reboot_code_buffer
+	 * %rdx start address
+	 * %rcx page_table
+	 * %r8  arg5
+	 * %r9  arg6
+	 */
+
+	/* zero out flags, and disable interrupts */
+	pushq $0
+	popfq
+
+	/* set a new stack at the bottom of our page... */
+	lea	4096(%rsi), %rsp
+
+	/* store the parameters back on the stack */
+	pushq	%rdx /* store the start address */
+
+	/* Set cr0 to a known state:
+	 * 31 1 == Paging enabled
+	 * 18 0 == Alignment check disabled
+	 * 16 0 == Write protect disabled
+	 *  3 0 == No task switch
+	 *  2 0 == Don't do FP software emulation.
+	 *  0 1 == Protected mode enabled
+	 */
+	movq	%cr0, %rax
+	andq	$~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+	orl	$((1<<31)|(1<<0)), %eax
+	movq	%rax, %cr0
+
+	/* Set cr4 to a known state:
+	 * 10 0 == xmm exceptions disabled
+	 *  9 0 == xmm registers instructions disabled
+	 *  8 0 == performance monitoring counter disabled
+	 *  7 0 == page global disabled
+	 *  6 0 == machine check exceptions disabled
+	 *  5 1 == physical address extension enabled
+	 *  4 0 == page size extensions disabled
+	 *  3 0 == Debug extensions disabled
+	 *  2 0 == Time stamp disable (disabled)
+	 *  1 0 == Protected mode virtual interrupts disabled
+	 *  0 0 == VME disabled
+	 */
+
+	movq	$((1<<5)), %rax
+	movq	%rax, %cr4
+
+	jmp 1f
+1:
+
+	/* Switch to the identity mapped page tables,
+	 * and flush the TLB.
+	 */
+	movq	%rcx, %cr3
+
+	/* Do the copies */
+	movq	%rdi, %rcx	/* Put the page_list in %rcx */
+	xorq	%rdi, %rdi
+	xorq	%rsi, %rsi
+	jmp	1f
+
+0:	/* top, read another word for the indirection page */
+
+	movq	(%rbx), %rcx
+	addq	$8, %rbx
+1:
+	testq	$0x1, %rcx  /* is it a destination page? */
+	jz	2f
+	movq	%rcx, %rdi
+	andq	$0xfffffffffffff000, %rdi
+	jmp	0b
+2:
+	testq	$0x2, %rcx  /* is it an indirection page? */
+	jz	2f
+	movq	%rcx, %rbx
+	andq	$0xfffffffffffff000, %rbx
+	jmp	0b
+2:
+	testq	$0x4, %rcx  /* is it the done indicator? */
+	jz	2f
+	jmp	3f
+2:
+	testq	$0x8, %rcx  /* is it the source indicator? */
+	jz	0b	      /* Ignore it otherwise */
+	movq	%rcx, %rsi    /* For every source page do a copy */
+	andq	$0xfffffffffffff000, %rsi
+
+	movq	$512, %rcx
+	rep ; movsq
+	jmp	0b
+3:
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/* set all of the registers to known values */
+	/* leave %rsp alone */
+
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq	%rcx, %rcx
+	xorq	%rdx, %rdx
+	xorq	%rsi, %rsi
+	xorq	%rdi, %rdi
+	xorq	%rbp, %rbp
+	xorq	%r8, %r8
+	xorq	%r9, %r9
+	xorq	%r10, %r10
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	ret
+relocate_new_kernel_end:
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.quad relocate_new_kernel_end - relocate_new_kernel
diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h
new file mode 100644
index 00000000000..dc33646dc7d
--- /dev/null
+++ b/include/asm-x86_64/kexec.h
@@ -0,0 +1,28 @@
+#ifndef _X86_64_KEXEC_H
+#define _X86_64_KEXEC_H
+
+#include
+#include
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can use for the control pages */
+#define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+
+/* Allocate one page for the pdp and the second for the code */
+#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
+
+/* The native architecture */
+#define KEXEC_ARCH KEXEC_ARCH_X86_64
+
+#endif /* _X86_64_KEXEC_H */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 3c9af6fd433..d767adcbf0f 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -552,7 +552,7 @@ __SYSCALL(__NR_mq_notify, sys_mq_notify)
 #define __NR_mq_getsetattr 245
 __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
 #define __NR_kexec_load 246
-__SYSCALL(__NR_kexec_load, sys_ni_syscall)
+__SYSCALL(__NR_kexec_load, sys_kexec_load)
 #define __NR_waitid 247
 __SYSCALL(__NR_waitid, sys_waitid)
 #define __NR_add_key 248
-- cgit v1.2.3-70-g09d2

From 625f1c8219d95300ed32e4c67eb62a50ded095ba Mon Sep 17 00:00:00 2001
From: Vivek Goyal
Date: Sat, 25 Jun 2005 14:58:12 -0700
Subject: [PATCH] Kdump: Export crash notes section address through sysfs

o Following patch exports kexec global variable "crash_notes" to user space
  through sysfs as a kernel attribute in /sys/kernel.

Signed-off-by: Maneesh Soni
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/i386/kernel/crash.c   |  2 --
 arch/x86_64/kernel/crash.c |  3 ---
 include/asm-i386/kexec.h   |  5 +++++
 include/asm-x86_64/kexec.h |  5 +++++
 kernel/ksysfs.c            | 13 +++++++++++++
 5 files changed, 23 insertions(+), 5 deletions(-)
(limited to 'include/asm-x86_64')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 59b92d21746..3645ad7ac20 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -26,8 +26,6 @@
 #include
 #include
-#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
 note_buf_t crash_notes[NR_CPUS];
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 7caf8a49d0c..6183bcb8525 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -20,9 +20,6 @@
 #include
 #include
-#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
 note_buf_t crash_notes[NR_CPUS];
 void machine_crash_shutdown(void)
diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h
index a1599b55d62..6ed2a03e37b 100644
--- a/include/asm-i386/kexec.h
+++ b/include/asm-i386/kexec.h
@@ -25,4 +25,9 @@
 /* The native architecture */
 #define KEXEC_ARCH KEXEC_ARCH_386
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+extern note_buf_t crash_notes[];
+
 #endif /* _I386_KEXEC_H */
diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h
index dc33646dc7d..42d2ff15c59 100644
--- a/include/asm-x86_64/kexec.h
+++ b/include/asm-x86_64/kexec.h
@@ -25,4 +25,9 @@
 /* The native architecture */
 #define KEXEC_ARCH KEXEC_ARCH_X86_64
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+extern note_buf_t crash_notes[];
+
 #endif /* _X86_64_KEXEC_H */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8c..015fb69ad94 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,12 +30,25 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
 KERNEL_ATTR_RO(hotplug_seqnum);
 #endif
+#ifdef CONFIG_KEXEC
+#include
+
+static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%p\n", (void *)crash_notes);
+}
+KERNEL_ATTR_RO(crash_notes);
+#endif
+
 decl_subsys(kernel, NULL, NULL);
 EXPORT_SYMBOL_GPL(kernel_subsys);
 static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_HOTPLUG
	&hotplug_seqnum_attr.attr,
+#endif
+#ifdef CONFIG_KEXEC
+	&crash_notes_attr.attr,
 #endif
	NULL
 };
-- cgit v1.2.3-70-g09d2
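[Editorial aside, not part of the patch series: the attribute added above appears as /sys/kernel/crash_notes and contains the kernel virtual address of the crash_notes array, printed with %p. A rough, hypothetical userspace sketch of consuming it, the kind of thing a kdump-style tool would do before handing the address to a capture kernel, could look like this.]

	/* Hypothetical reader for /sys/kernel/crash_notes (needs CONFIG_KEXEC).
	 * The address points at the per-cpu ELF note buffers filled at crash time. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long addr = 0;
		FILE *f = fopen("/sys/kernel/crash_notes", "r");

		if (!f) {
			perror("/sys/kernel/crash_notes");
			return 1;
		}
		if (fscanf(f, "%llx", &addr) != 1) {
			fprintf(stderr, "unexpected format\n");
			fclose(f);
			return 1;
		}
		fclose(f);
		printf("crash_notes array at kernel address 0x%llx\n", addr);
		return 0;
	}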