From a460e745e8f9c75a0525ff94154a0629f9d3e05d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Oct 2006 00:10:03 -0700
Subject: [PATCH] genirq: clean up irq-flow-type naming

Introduce desc->name and eliminate the handle_irq_name() hack.  Add
set_irq_chip_and_handler_name() to set the flow type and name at once.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Matthew Wilcox <willy@debian.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/i8259.c   |  7 ++++---
 arch/x86_64/kernel/io_apic.c | 15 ++++++++-------
 arch/x86_64/kernel/irq.c     |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 0612a33bb89..c4ef801b765 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -178,7 +178,8 @@ void make_8259A_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
 	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+				      "XT");
 	enable_irq(irq);
 }
 
@@ -431,8 +432,8 @@ void __init init_ISA_irqs (void)
 			/*
 			 * 16 old-style INTA-cycle interrupts:
 			 */
-			set_irq_chip_and_handler(i, &i8259A_chip,
-						 handle_level_irq);
+			set_irq_chip_and_handler_name(i, &i8259A_chip,
+						      handle_level_irq, "XT");
 		} else {
 			/*
 			 * 'high' PCI IRQs filled in on demand
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 44b55f83387..49e94f7994c 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -696,11 +696,11 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 {
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 			trigger == IOAPIC_LEVEL)
-		set_irq_chip_and_handler(irq, &ioapic_chip,
-					 handle_fasteoi_irq);
+		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+					      handle_fasteoi_irq, "fasteoi");
 	else
-		set_irq_chip_and_handler(irq, &ioapic_chip,
-					 handle_edge_irq);
+		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+					      handle_edge_irq, "edge");
 }
 
 static void __init setup_IO_APIC_irqs(void)
@@ -806,7 +806,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
 	 */
-	set_irq_chip_and_handler(0, &ioapic_chip, handle_edge_irq);
+	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
@@ -1839,7 +1839,7 @@ int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
 
 	write_msi_msg(irq, &msg);
 
-	set_irq_chip_and_handler(irq, &msi_chip, handle_edge_irq);
+	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
 	return 0;
 }
@@ -1936,7 +1936,8 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 		write_ht_irq_low(irq, low);
 		write_ht_irq_high(irq, high);
 
-		set_irq_chip_and_handler(irq, &ht_irq_chip, handle_edge_irq);
+		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+					      handle_edge_irq, "edge");
 	}
 	return vector;
 }
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index dff68eb2b78..e46c55856d4 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -75,7 +75,7 @@ int show_interrupts(struct seq_file *p, void *v)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
 		seq_printf(p, " %8s", irq_desc[i].chip->name);
-		seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq));
+		seq_printf(p, "-%-8s", irq_desc[i].name);
 
 		seq_printf(p, "  %s", action->name);
 		for (action=action->next; action; action = action->next)
-- 
cgit v1.2.3-70-g09d2


From 45edfd1db02f818b3dc7e4743ee8585af6b78f78 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Sat, 21 Oct 2006 18:37:01 +0200
Subject: [PATCH] x86-64: typo in __assign_irq_vector when updating pos for
 vector and offset

typo with cpu instead of new_cpu

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/io_apic.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 49e94f7994c..b848f480851 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -651,12 +651,12 @@ next:
 		if (vector == IA32_SYSCALL_VECTOR)
 			goto next;
 		for_each_cpu_mask(new_cpu, domain)
-			if (per_cpu(vector_irq, cpu)[vector] != -1)
+			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
 				goto next;
 		/* Found one! */
 		for_each_cpu_mask(new_cpu, domain) {
-			pos[cpu].vector = vector;
-			pos[cpu].offset = offset;
+			pos[new_cpu].vector = vector;
+			pos[new_cpu].offset = offset;
 		}
 		if (old_vector >= 0) {
 			int old_cpu;
-- 
cgit v1.2.3-70-g09d2


From 73bb8919b33d42cf75a0ed89bc9ca6a7128665be Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 21 Oct 2006 18:37:01 +0200
Subject: [PATCH] x86-64: fix page align in e820 allocator

Currently some code pieces assume that address returned by find_e820_area()
are page aligned.  But looks like find_e820_area() had no such intention
and hence one might end up stomping over some of the data.  One such case
is bootmem allocator initialization code stomped over bss.

This patch modified find_e820_area() to return page aligned address.  This
might be little wasteful of memory but at the same time probably it is
easier to handle page aligned memory.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/e820.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index b3f0908668e..a75c829c2b0 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -54,13 +54,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
 
 	/* various gunk below that needed for SMP startup */
 	if (addr < 0x8000) { 
-		*addrp = 0x8000;
+		*addrp = PAGE_ALIGN(0x8000);
 		return 1; 
 	}
 
 	/* direct mapping tables of the kernel */
 	if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
-		*addrp = table_end << PAGE_SHIFT; 
+		*addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
 		return 1;
 	} 
 
@@ -68,18 +68,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
 	    addr < INITRD_START+INITRD_SIZE) { 
-		*addrp = INITRD_START + INITRD_SIZE; 
+		*addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
 		return 1;
 	} 
 #endif
 	/* kernel code */
-	if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
-		*addrp = __pa_symbol(&_end);
+	if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
+		*addrp = PAGE_ALIGN(__pa_symbol(&_end));
 		return 1;
 	}
 
 	if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
-		*addrp = ebda_addr + ebda_size;
+		*addrp = PAGE_ALIGN(ebda_addr + ebda_size);
 		return 1;
 	}
 
@@ -152,7 +152,7 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
 			continue; 
 		while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
 			;
-		last = addr + size;
+		last = PAGE_ALIGN(addr) + size;
 		if (last > ei->addr + ei->size)
 			continue;
 		if (last > end) 
-- 
cgit v1.2.3-70-g09d2


From cc7d479fe56133e79840beffe9cb4fd193af93aa Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Sat, 21 Oct 2006 18:37:02 +0200
Subject: [PATCH] x86-64: Fix ENOSYS in system call tracing

This patch:

- out of range system calls failing to return -ENOSYS under
  system call tracing

[AK: split out from another patch by Jan as separate bugfix]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/entry.S | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 38a7b2d528e..038dcf733a1 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -315,6 +315,8 @@ tracesys:
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
+	movq $-ENOSYS,%rcx
+	cmova %rcx,%rax
 	ja  1f
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
-- 
cgit v1.2.3-70-g09d2


From 581910e2eb952e541b8ca9b5f551d6c124903b61 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 21 Oct 2006 18:37:02 +0200
Subject: [PATCH] x86-64: Revert interrupt backlink changes

They break more than they fix
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/entry.S | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 038dcf733a1..e3eddde3d3b 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -537,8 +537,6 @@ END(stub_rt_sigreturn)
 1:	incl	%gs:pda_irqcount
 	cmoveq %gs:pda_irqstackptr,%rsp
 	push    %rbp			# backlink for old unwinder
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rbp,0
 	/*
 	 * We entered an interrupt context - irqs are off:
 	 */
@@ -1178,7 +1176,6 @@ ENTRY(call_softirq)
 	incl %gs:pda_irqcount
 	cmove %gs:pda_irqstackptr,%rsp
 	push  %rbp			# backlink for old unwinder
-	CFI_ADJUST_CFA_OFFSET    8
 	call __do_softirq
 	leaveq
 	CFI_DEF_CFA_REGISTER	rsp
-- 
cgit v1.2.3-70-g09d2


From 6bf2dafad18c119beb534cbb3d882fe7a6c3f529 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 21 Oct 2006 18:37:02 +0200
Subject: [PATCH] x86-64: Use irq_domain in ioapic_retrigger_irq

Thanks to YH Lu for spotting this.  It appears I missed this function when I
refactored allocate_irq_vector and introduced irq_domain, with the result that
all retriggered irqs would go to cpu 0 even if we were not prepared to receive
them there.

While reviewing YH's patch I also noticed that this function was missing
locking, and since I am now reading two values from two diffrent arrays that
looks like a race we might be able to hit in the real world.

Cc: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/io_apic.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index b848f480851..8a9a357875b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1255,12 +1255,15 @@ static int ioapic_retrigger_irq(unsigned int irq)
 {
 	cpumask_t mask;
 	unsigned vector;
+	unsigned long flags;
 
+	spin_lock_irqsave(&vector_lock, flags);
 	vector = irq_vector[irq];
 	cpus_clear(mask);
-	cpu_set(vector >> 8, mask);
+	cpu_set(first_cpu(irq_domain[irq]), mask);
 
-	send_IPI_mask(mask, vector & 0xff);
+	send_IPI_mask(mask, vector);
+	spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
 }
-- 
cgit v1.2.3-70-g09d2


From 8cf2c51927bbeefafc25193d01b91f9ed3806e96 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 21 Oct 2006 18:37:02 +0200
Subject: [PATCH] x86: Revert new unwind kernel stack termination

Jan convinced me that it was unnecessary because the assembly stubs do
this already on the stack.

Cc: jbeulich@novell.com

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/process.c | 6 +-----
 arch/x86_64/kernel/entry.S | 5 -----
 2 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 57d375900af..1e1fa3e391a 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -336,7 +336,6 @@ extern void kernel_thread_helper(void);
 int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 {
 	struct pt_regs regs;
-	int err;
 
 	memset(&regs, 0, sizeof(regs));
 
@@ -351,10 +350,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
 	/* Ok, create the new process.. */
-	err = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-	if (err == 0) /* terminate kernel stack */
-		task_pt_regs(current)->eip = 0;
-	return err;
+	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
 }
 EXPORT_SYMBOL(kernel_thread);
 
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index e3eddde3d3b..7d401b00d82 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -980,11 +980,6 @@ ENTRY(kernel_thread)
 	call do_fork
 	movq %rax,RAX(%rsp)
 	xorl %edi,%edi
-	test %rax,%rax
-	jnz  1f
-	/* terminate stack in child */
-	movq %rdi,RIP(%rsp)
-1:
 
 	/*
 	 * It isn't worth to check for reschedule here,
-- 
cgit v1.2.3-70-g09d2


From 84f404f695b16bd142c8dd9910d5a398f54fb044 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 21 Oct 2006 18:37:02 +0200
Subject: [PATCH] x86-64: Put more than one cpu in TARGET_CPUS

TARGET_CPUS is the default irq routing poicy.  It specifies which cpus the
kernel should aim an irq at.  In physflat delivery mode we can route an irq to
a single cpu.  But that doesn't mean our default policy should only be a
single cpu is allowed.

By allowing the irq routing code to select from multiple cpus this enables
systems with more irqs then we can service on a single processor to actually
work.

I just audited and tested the code and irqbalance doesn't care, and the
io_apic.c doesn't care if we have extra cpus in the mask.  Everything will use
or assume we are using the lowest numbered cpu in the mask if we can't use
them all.

So this should result in no behavior changes except on systems that need it.

Thanks for YH Lu for spotting this problem in his testing.

Cc: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/genapic_flat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 0dfc223c183..7c01db8fa9d 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -153,7 +153,7 @@ struct genapic apic_flat =  {
 
 static cpumask_t physflat_target_cpus(void)
 {
-	return cpumask_of_cpu(0);
+	return cpu_online_map;
 }
 
 static cpumask_t physflat_vector_allocation_domain(int cpu)
-- 
cgit v1.2.3-70-g09d2


From dbaab49f92ff6ae6255762a948375e4036cbdbd2 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 21 Oct 2006 18:37:03 +0200
Subject: [PATCH] x86-64: Overlapping program headers in physical addr space
 fix

o A recent change to vmlinux.ld.S file broke kexec as now resulting vmlinux
  program headers are overlapping in physical address space.

o Now all the vsyscall related sections are placed after data and after
  that mostly init data sections are placed. To avoid physical overlap
  among phdrs, there are three possible solutions.
	- Place vsyscall sections also in data phdrs instead of user
	- move vsyscal sections after init data in bss.
	- create another phdrs say data.init and move all the sections
	  after vsyscall into this new phdr.

o This patch implements the third solution.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Magnus Damm <magnus@valinux.co.jp>
Cc: Andi Kleen <ak@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/vmlinux.lds.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index b9df2ab6529..1283614c9b2 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -17,6 +17,7 @@ PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
+	data.init PT_LOAD FLAGS(7);	/* RWE */
 	note PT_NOTE FLAGS(4);	/* R__ */
 }
 SECTIONS
@@ -131,7 +132,7 @@ SECTIONS
   . = ALIGN(8192);		/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 	*(.data.init_task)
-  } :data
+  }:data.init
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-- 
cgit v1.2.3-70-g09d2


From e70ea8c09db0e25ab58f84ba7f393e5c9125a8ee Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 21 Oct 2006 18:37:03 +0200
Subject: [PATCH] x86-64: Revert timer routing behaviour back to 2.6.16 state

By default route the 8254 over the 8259 and only disable
it on ATI boards where this causes double timer interrupts.

This should unbreak some Nvidia boards where the timer doesn't
seem to tick of it isn't enabled in the 8259. At least one
VIA board also seemed to have a little trouble with the disabled
8259.

For 2.6.20 we'll try both dynamically without black listing, but I think
for .19 this is the safer approach because it has been already well tested
in earlier kernels. This also makes the x86-64 behaviour the same
as i386.

Command line options can change all this of course.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/early-quirks.c | 9 +++++----
 arch/x86_64/kernel/io_apic.c      | 2 +-
 include/asm-x86_64/proto.h        | 2 ++
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index 208e38a372c..2b1245d8625 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -61,10 +61,11 @@ static void nvidia_bugs(void)
 
 static void ati_bugs(void)
 {
-#if 1 /* for testing */
-	printk("ATI board detected\n");
-#endif
-	/* No bugs right now */
+	if (timer_over_8254 == 1) {
+		timer_over_8254 = 0;
+		printk(KERN_INFO
+	 	"ATI board detected. Disabling timer routing over 8254.\n");
+	}
 }
 
 struct chipset {
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 8a9a357875b..b000017e4b5 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -57,7 +57,7 @@ static int no_timer_check;
 
 static int disable_timer_pin_1 __initdata;
 
-int timer_over_8254 __initdata = 0;
+int timer_over_8254 __initdata = 1;
 
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index c181fef786e..e72cfcdf534 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -122,6 +122,8 @@ extern int fix_aperture;
 extern int reboot_force;
 extern int notsc_setup(char *);
 
+extern int timer_over_8254;
+
 extern int gsi_irq_sharing(int gsi);
 
 extern void smp_local_timer_interrupt(void);
-- 
cgit v1.2.3-70-g09d2


From aa026ede513b7d672fa7d9106b2f2a475455dcf2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sun, 22 Oct 2006 00:41:15 +0200
Subject: [PATCH] x86-64: Fix C3 timer test

There was a typo in the C3 latency test to decide of the TSC
should be used or not. It used the C2 latency threshold, not the
C3 one. Fix that.

This should fix the time on various dual core laptops.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 1ba5a442ac3..88722f11ca1 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -948,7 +948,7 @@ __cpuinit int unsynchronized_tsc(void)
  	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
 #ifdef CONFIG_ACPI
 		/* But TSC doesn't tick in C3 so don't use it there */
-		if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100)
+		if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 1000)
 			return 1;
 #endif
  		return 0;
-- 
cgit v1.2.3-70-g09d2


From cb01fc720c629261b9c616b2d5fcc3d93cd8bb09 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Sun, 22 Oct 2006 00:41:15 +0200
Subject: [PATCH] x86-64: increase PHB1 split transaction timeout

This patch increases the timeout for PCI split transactions on PHB1 on
the first Calgary to work around an issue with the aic94xx
adapter. Fixes kernel.org bugzilla #7180
(http://bugzilla.kernel.org/show_bug.cgi?id=7180)

Based on excellent debugging and a patch by Darrick J. Wong
<djwong@us.ibm.com>

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Darrick J. Wong <djwong@us.ibm.com>
---
 arch/x86_64/kernel/pci-calgary.c | 44 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index b3296cc2f2f..37a770859e7 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -52,7 +52,8 @@
 #define ONE_BASED_CHASSIS_NUM   1
 
 /* register offsets inside the host bridge space */
-#define PHB_CSR_OFFSET		0x0110
+#define CALGARY_CONFIG_REG	0x0108
+#define PHB_CSR_OFFSET		0x0110 /* Channel Status */
 #define PHB_PLSSR_OFFSET	0x0120
 #define PHB_CONFIG_RW_OFFSET	0x0160
 #define PHB_IOBASE_BAR_LOW	0x0170
@@ -83,6 +84,8 @@
 #define TAR_VALID		0x0000000000000008UL
 /* CSR (Channel/DMA Status Register) */
 #define CSR_AGENT_MASK		0xffe0ffff
+/* CCR (Calgary Configuration Register) */
+#define CCR_2SEC_TIMEOUT        0x000000000000000EUL
 
 #define MAX_NUM_OF_PHBS		8 /* how many PHBs in total? */
 #define MAX_NUM_CHASSIS		8 /* max number of chassis */
@@ -732,6 +735,38 @@ static void calgary_watchdog(unsigned long data)
 	}
 }
 
+static void __init calgary_increase_split_completion_timeout(void __iomem *bbar,
+	unsigned char busnum)
+{
+	u64 val64;
+	void __iomem *target;
+	unsigned long phb_shift = -1;
+	u64 mask;
+
+	switch (busno_to_phbid(busnum)) {
+	case 0: phb_shift = (63 - 19);
+		break;
+	case 1: phb_shift = (63 - 23);
+		break;
+	case 2: phb_shift = (63 - 27);
+		break;
+	case 3: phb_shift = (63 - 35);
+		break;
+	default:
+		BUG_ON(busno_to_phbid(busnum));
+	}
+
+	target = calgary_reg(bbar, CALGARY_CONFIG_REG);
+	val64 = be64_to_cpu(readq(target));
+
+	/* zero out this PHB's timer bits */
+	mask = ~(0xFUL << phb_shift);
+	val64 &= mask;
+	val64 |= (CCR_2SEC_TIMEOUT << phb_shift);
+	writeq(cpu_to_be64(val64), target);
+	readq(target); /* flush */
+}
+
 static void __init calgary_enable_translation(struct pci_dev *dev)
 {
 	u32 val32;
@@ -756,6 +791,13 @@ static void __init calgary_enable_translation(struct pci_dev *dev)
 	writel(cpu_to_be32(val32), target);
 	readl(target); /* flush */
 
+	/*
+	 * Give split completion a longer timeout on bus 1 for aic94xx
+	 * http://bugzilla.kernel.org/show_bug.cgi?id=7180
+	 */
+	if (busnum == 1)
+		calgary_increase_split_completion_timeout(bbar, busnum);
+
 	init_timer(&tbl->watchdog_timer);
 	tbl->watchdog_timer.function = &calgary_watchdog;
 	tbl->watchdog_timer.data = (unsigned long)dev;
-- 
cgit v1.2.3-70-g09d2


From d1752aa884ec0ac3027c1a3d456bf69bf765c8b8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 25 Oct 2006 01:00:22 +0200
Subject: [PATCH] x86-64: Simplify the vector allocator.

There is no reason to remember a per cpu position of which vector
to try.  Keeping a global position is simpler and more likely to
result in a global vector allocation even if I don't need or require
it.  For level triggered interrupts this means we are less likely to
acknowledge another cpus irq, and cause the level triggered irq to
harmlessly refire.

This simplification makes it easier to only access data structures
of  online cpus, by having fewer special cases to deal with.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/io_apic.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index b000017e4b5..0e89ae7e7b2 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -612,10 +612,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	static struct {
-		int vector;
-		int offset;
-	} pos[NR_CPUS] = { [ 0 ... NR_CPUS - 1] = {FIRST_DEVICE_VECTOR, 0} };
+	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	int old_vector = -1;
 	int cpu;
 
@@ -631,14 +628,13 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 
 	for_each_cpu_mask(cpu, mask) {
 		cpumask_t domain;
-		int first, new_cpu;
+		int new_cpu;
 		int vector, offset;
 
 		domain = vector_allocation_domain(cpu);
-		first = first_cpu(domain);
 
-		vector = pos[first].vector;
-		offset = pos[first].offset;
+		vector = current_vector;
+		offset = current_offset;
 next:
 		vector += 8;
 		if (vector >= FIRST_SYSTEM_VECTOR) {
@@ -646,7 +642,7 @@ next:
 			offset = (offset + 1) % 8;
 			vector = FIRST_DEVICE_VECTOR + offset;
 		}
-		if (unlikely(pos[first].vector == vector))
+		if (unlikely(current_vector == vector))
 			continue;
 		if (vector == IA32_SYSCALL_VECTOR)
 			goto next;
@@ -654,10 +650,8 @@ next:
 			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
 				goto next;
 		/* Found one! */
-		for_each_cpu_mask(new_cpu, domain) {
-			pos[new_cpu].vector = vector;
-			pos[new_cpu].offset = offset;
-		}
+		current_vector = vector;
+		current_offset = offset;
 		if (old_vector >= 0) {
 			int old_cpu;
 			for_each_cpu_mask(old_cpu, irq_domain[irq])
-- 
cgit v1.2.3-70-g09d2


From 70a0a5357db20c291d46c04011d646d5d84c868c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 25 Oct 2006 01:00:23 +0200
Subject: [PATCH] x86-64: Only look at per_cpu data for online cpus.

When I generalized __assign_irq_vector I failed to pay attention
to what happens when you access a per cpu data structure for
a cpu that is not online.   It is an undefined case making any
code that does it have undefined behavior as well.

The code still needs to be able to allocate a vector across cpus
that are not online to properly handle combinations like lowest
priority interrupt delivery and cpu_hotplug.  Not that we can do
that today but the infrastructure shouldn't prevent it.

So this patch updates the places where we touch per cpu data
to only touch online cpus, it makes cpu vector allocation
an atomic operation with respect to cpu hotplug, and it updates
the cpu start code to properly initialize vector_irq so we
don't have inconsistencies.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/io_apic.c | 42 +++++++++++++++++++++++++++++++++++++-----
 arch/x86_64/kernel/smpboot.c |  7 ++++++-
 include/asm-x86_64/hw_irq.h  |  2 ++
 3 files changed, 45 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 0e89ae7e7b2..fe429e5d6b2 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -63,7 +63,7 @@ int timer_over_8254 __initdata = 1;
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+DEFINE_SPINLOCK(vector_lock);
 
 /*
  * # of IRQ routing registers
@@ -618,6 +618,9 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 
 	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
 
+	/* Only try and allocate irqs on cpus that are present */
+	cpus_and(mask, mask, cpu_online_map);
+
 	if (irq_vector[irq] > 0)
 		old_vector = irq_vector[irq];
 	if (old_vector > 0) {
@@ -627,11 +630,12 @@ static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 	}
 
 	for_each_cpu_mask(cpu, mask) {
-		cpumask_t domain;
+		cpumask_t domain, new_mask;
 		int new_cpu;
 		int vector, offset;
 
 		domain = vector_allocation_domain(cpu);
+		cpus_and(new_mask, domain, cpu_online_map);
 
 		vector = current_vector;
 		offset = current_offset;
@@ -646,18 +650,20 @@ next:
 			continue;
 		if (vector == IA32_SYSCALL_VECTOR)
 			goto next;
-		for_each_cpu_mask(new_cpu, domain)
+		for_each_cpu_mask(new_cpu, new_mask)
 			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
 				goto next;
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
 		if (old_vector >= 0) {
+			cpumask_t old_mask;
 			int old_cpu;
-			for_each_cpu_mask(old_cpu, irq_domain[irq])
+			cpus_and(old_mask, irq_domain[irq], cpu_online_map);
+			for_each_cpu_mask(old_cpu, old_mask)
 				per_cpu(vector_irq, old_cpu)[old_vector] = -1;
 		}
-		for_each_cpu_mask(new_cpu, domain)
+		for_each_cpu_mask(new_cpu, new_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
 		irq_vector[irq] = vector;
 		irq_domain[irq] = domain;
@@ -678,6 +684,32 @@ static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 	return vector;
 }
 
+void __setup_vector_irq(int cpu)
+{
+	/* Initialize vector_irq on a new cpu */
+	/* This function must be called with vector_lock held */
+	unsigned long flags;
+	int irq, vector;
+
+
+	/* Mark the inuse vectors */
+	for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
+		if (!cpu_isset(cpu, irq_domain[irq]))
+			continue;
+		vector = irq_vector[irq];
+		per_cpu(vector_irq, cpu)[vector] = irq;
+	}
+	/* Mark the free vectors */
+	for (vector = 0; vector < NR_VECTORS; ++vector) {
+		irq = per_cpu(vector_irq, cpu)[vector];
+		if (irq < 0)
+			continue;
+		if (!cpu_isset(cpu, irq_domain[irq]))
+			per_cpu(vector_irq, cpu)[vector] = -1;
+	}
+}
+
+
 extern void (*interrupt[NR_IRQS])(void);
 
 static struct irq_chip ioapic_chip;
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 7b7a6870288..62c2e747af5 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -581,12 +581,16 @@ void __cpuinit start_secondary(void)
 	 * smp_call_function().
 	 */
 	lock_ipi_call_lock();
+	spin_lock(&vector_lock);
 
+	/* Setup the per cpu irq handling data structures */
+	__setup_vector_irq(smp_processor_id());
 	/*
 	 * Allow the master to continue.
 	 */
 	cpu_set(smp_processor_id(), cpu_online_map);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+	spin_unlock(&vector_lock);
 	unlock_ipi_call_lock();
 
 	cpu_idle();
@@ -799,7 +803,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 				cpu, node);
 	}
 
-
 	alternatives_smp_switch(1);
 
 	c_idle.idle = get_idle_for_cpu(cpu);
@@ -1246,8 +1249,10 @@ int __cpu_disable(void)
 	local_irq_disable();
 	remove_siblinginfo(cpu);
 
+	spin_lock(&vector_lock);
 	/* It's now safe to remove this processor from the online map */
 	cpu_clear(cpu, cpu_online_map);
+	spin_unlock(&vector_lock);
 	remove_cpu_from_maps();
 	fixup_irqs(cpu_online_map);
 	return 0;
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 792dd52fcd7..179cce755aa 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -76,6 +76,8 @@
 #ifndef __ASSEMBLY__
 typedef int vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
+extern void __setup_vector_irq(int cpu);
+extern spinlock_t vector_lock;
 
 /*
  * Various low-level irq details needed by irq.c, process.c,
-- 
cgit v1.2.3-70-g09d2


From 61ce1efe6e40233663d27ab8ac9ba9710eebcaad Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 27 Oct 2006 11:41:44 -0700
Subject: [PATCH] vmlinux.lds: consolidate initcall sections

Add a vmlinux.lds.h helper macro for defining the eight-level initcall table,
teach all the architectures to use it.

This is a prerequisite for a patch which performs initcall synchronisation for
multithreaded-probing.

Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
[ Added AVR32 as well ]
Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/vmlinux.lds.S     |  8 +-------
 arch/arm/kernel/vmlinux.lds.S       |  8 +-------
 arch/avr32/kernel/vmlinux.lds.c     |  8 +-------
 arch/frv/kernel/vmlinux.lds.S       |  8 +-------
 arch/h8300/kernel/vmlinux.lds.S     |  8 +-------
 arch/i386/kernel/vmlinux.lds.S      |  8 +-------
 arch/ia64/kernel/vmlinux.lds.S      |  8 +-------
 arch/m32r/kernel/vmlinux.lds.S      |  8 +-------
 arch/m68knommu/kernel/vmlinux.lds.S |  8 +-------
 arch/mips/kernel/vmlinux.lds.S      |  8 +-------
 arch/parisc/kernel/vmlinux.lds.S    |  8 +-------
 arch/powerpc/kernel/vmlinux.lds.S   |  8 +-------
 arch/ppc/kernel/vmlinux.lds.S       |  8 +-------
 arch/s390/kernel/vmlinux.lds.S      |  8 +-------
 arch/sh/kernel/vmlinux.lds.S        |  8 +-------
 arch/sh64/kernel/vmlinux.lds.S      |  8 +-------
 arch/sparc/kernel/vmlinux.lds.S     |  8 +-------
 arch/sparc64/kernel/vmlinux.lds.S   |  8 +-------
 arch/v850/kernel/vmlinux.lds.S      |  8 +-------
 arch/x86_64/kernel/vmlinux.lds.S    |  8 +-------
 arch/xtensa/kernel/vmlinux.lds.S    |  8 +-------
 include/asm-generic/vmlinux.lds.h   | 10 ++++++++++
 22 files changed, 31 insertions(+), 147 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 71470e9d93b..76bf071e376 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -48,13 +48,7 @@ SECTIONS
   . = ALIGN(8);
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
 
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 3ca574ee277..a8fa75ea07a 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -45,13 +45,7 @@ SECTIONS
 			*(.early_param.init)
 		__early_end = .;
 		__initcall_start = .;
-			*(.initcall1.init)
-			*(.initcall2.init)
-			*(.initcall3.init)
-			*(.initcall4.init)
-			*(.initcall5.init)
-			*(.initcall6.init)
-			*(.initcall7.init)
+			INITCALLS
 		__initcall_end = .;
 		__con_initcall_start = .;
 			*(.con_initcall.init)
diff --git a/arch/avr32/kernel/vmlinux.lds.c b/arch/avr32/kernel/vmlinux.lds.c
index cdd627c6b7d..5c4424e362b 100644
--- a/arch/avr32/kernel/vmlinux.lds.c
+++ b/arch/avr32/kernel/vmlinux.lds.c
@@ -38,13 +38,7 @@ SECTIONS
 		__setup_end = .;
 		. = ALIGN(4);
 		__initcall_start = .;
-			*(.initcall1.init)
-			*(.initcall2.init)
-			*(.initcall3.init)
-			*(.initcall4.init)
-			*(.initcall5.init)
-			*(.initcall6.init)
-			*(.initcall7.init)
+			INITCALLS
 		__initcall_end = .;
 		__con_initcall_start = .;
 			*(.con_initcall.init)
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index f474534ba78..9c1fb12367f 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -44,13 +44,7 @@ SECTIONS
 
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init)
-	*(.initcall2.init)
-	*(.initcall3.init)
-	*(.initcall4.init)
-	*(.initcall5.init)
-	*(.initcall6.init)
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 6406c388f88..756325dd480 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -118,13 +118,7 @@ SECTIONS
 	. = ALIGN(0x4) ;
 	___setup_end = .;
 	___initcall_start = .;
-		*(.initcall1.init)
-		*(.initcall2.init)
-		*(.initcall3.init)
-		*(.initcall4.init)
-		*(.initcall5.init)
-		*(.initcall6.init)
-		*(.initcall7.init)
+		INITCALLS
 	___initcall_end = .;
 	___con_initcall_start = .;
 		*(.con_initcall.init)
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 1e7ac1c44dd..adc1f232afe 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -126,13 +126,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index b3b2e389d6b..d6083a0936f 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -128,13 +128,7 @@ SECTIONS
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET)
 	{
 	  __initcall_start = .;
-	  *(.initcall1.init)
-	  *(.initcall2.init)
-	  *(.initcall3.init)
-	  *(.initcall4.init)
-	  *(.initcall5.init)
-	  *(.initcall6.init)
-	  *(.initcall7.init)
+	INITCALLS
 	  __initcall_end = .;
 	}
 
diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S
index 13c7bb698e3..358b9cee2c6 100644
--- a/arch/m32r/kernel/vmlinux.lds.S
+++ b/arch/m32r/kernel/vmlinux.lds.S
@@ -83,13 +83,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init)
-	*(.initcall2.init)
-	*(.initcall3.init)
-	*(.initcall4.init)
-	*(.initcall5.init)
-	*(.initcall6.init)
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index ccd2ceb05cf..58afa8be604 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -140,13 +140,7 @@ SECTIONS {
 		*(.init.setup)
 		__setup_end = .;
 		__initcall_start = .;
-		*(.initcall1.init)
-		*(.initcall2.init)
-		*(.initcall3.init)
-		*(.initcall4.init)
-		*(.initcall5.init)
-		*(.initcall6.init)
-		*(.initcall7.init)
+		INITCALLS
 		__initcall_end = .;
 		__con_initcall_start = .;
 		*(.con_initcall.init)
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 0bb9cd88945..25ed3337ce3 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -91,13 +91,7 @@ SECTIONS
 
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init)
-	*(.initcall2.init)
-	*(.initcall3.init)
-	*(.initcall4.init)
-	*(.initcall5.init)
-	*(.initcall6.init)
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
 
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index b3677fc8eef..7b943b45f7c 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -153,13 +153,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index cb0e8d46c3e..e8342d86753 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -108,13 +108,7 @@ SECTIONS
 
 	.initcall.init : {
 		__initcall_start = .;
-		*(.initcall1.init)
-		*(.initcall2.init)
-		*(.initcall3.init)
-		*(.initcall4.init)
-		*(.initcall5.init)
-		*(.initcall6.init)
-		*(.initcall7.init)
+		INITCALLS
 		__initcall_end = .;
 		}
 
diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S
index 095fd332332..16e8661e1fe 100644
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -115,13 +115,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init)
-	*(.initcall2.init)
-	*(.initcall3.init)
-	*(.initcall4.init)
-	*(.initcall5.init)
-	*(.initcall6.init)
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
 
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index af9e69a0301..fe0f2e97ba7 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -83,13 +83,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 5eb93091818..77b4026d568 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -76,13 +76,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/sh64/kernel/vmlinux.lds.S b/arch/sh64/kernel/vmlinux.lds.S
index a8fcc3a7158..95c4d753e35 100644
--- a/arch/sh64/kernel/vmlinux.lds.S
+++ b/arch/sh64/kernel/vmlinux.lds.S
@@ -108,13 +108,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : C_PHYS(.initcall.init) {
-  	*(.initcall1.init)
-  	*(.initcall2.init)
-  	*(.initcall3.init)
-  	*(.initcall4.init)
-  	*(.initcall5.init)
-  	*(.initcall6.init)
-  	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 1dd78c84888..5cc5ff7f882 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -49,13 +49,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index b097379a49a..bd9de8c2a2a 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -57,13 +57,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 63399219cd9..88d087f527c 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -140,13 +140,7 @@
 		___setup_end = . ;					      \
 		___initcall_start = . ;					      \
 			*(.initcall.init)				      \
-			*(.initcall1.init)				      \
-			*(.initcall2.init)				      \
-			*(.initcall3.init)				      \
-			*(.initcall4.init)				      \
-			*(.initcall5.init)				      \
-			*(.initcall6.init)				      \
-			*(.initcall7.init)				      \
+			INITCALLS					      \
 		. = ALIGN (4) ;						      \
 		___initcall_end = . ;					      \
 		___con_initcall_start = .;				      \
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 1283614c9b2..edb24aa714b 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -175,13 +175,7 @@ SECTIONS
   __setup_end = .;
   __initcall_start = .;
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-	*(.initcall1.init) 
-	*(.initcall2.init) 
-	*(.initcall3.init) 
-	*(.initcall4.init) 
-	*(.initcall5.init) 
-	*(.initcall6.init) 
-	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
   __con_initcall_start = .;
diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S
index ab6cdbd5eb6..cfe75f52872 100644
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -184,13 +184,7 @@ SECTIONS
 
   __initcall_start = .;
   .initcall.init : {
-  	*(.initcall1.init)
-  	*(.initcall2.init)
-  	*(.initcall3.init)
-  	*(.initcall4.init)
-  	*(.initcall5.init)
-  	*(.initcall6.init)
-  	*(.initcall7.init)
+	INITCALLS
   }
   __initcall_end = .;
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 9d0d11c180d..e3e83bcaf71 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -213,3 +213,13 @@
 
 #define NOTES								\
 		.notes : { *(.note.*) } :note
+
+#define INITCALLS							\
+  	*(.initcall1.init)						\
+  	*(.initcall2.init)						\
+  	*(.initcall3.init)						\
+  	*(.initcall4.init)						\
+  	*(.initcall5.init)						\
+  	*(.initcall6.init)						\
+  	*(.initcall7.init)
+
-- 
cgit v1.2.3-70-g09d2


From 6c0ffb9d2fd987c79c6cbb81c3f3011c63749b1a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@merom.osdl.org>
Date: Wed, 8 Nov 2006 10:23:03 -0800
Subject: x86-64: clean up io-apic accesses

This is just commit 130fe05dbc0114609cfef9815c0c5580b42decfa ported to
x86-64, for all the same reasons.  It cleans up the IO-APIC accesses in
order to then fix the ordering issues.

We move the accessor functions (that were only used by io_apic.c) out of
a header file, and use proper memory-mapped accesses rather than making
up our own "volatile" pointers.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/io_apic.c | 46 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/io_apic.h | 34 --------------------------------
 2 files changed, 46 insertions(+), 34 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index fe429e5d6b2..96e02d84571 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -88,6 +88,52 @@ static struct irq_pin_list {
 	short apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
+struct io_apic {
+	unsigned int index;
+	unsigned int unused[3];
+	unsigned int data;
+};
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(reg, &io_apic->index);
+	return readl(&io_apic->data);
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(reg, &io_apic->index);
+	writel(value, &io_apic->data);
+}
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int value)
+{
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	writel(value, &io_apic->data);
+}
+
+/*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+static inline void io_apic_sync(unsigned int apic)
+{
+	struct io_apic __iomem *io_apic = io_apic_base(apic);
+	readl(&io_apic->data);
+}
+
 #define __DO_ACTION(R, ACTION, FINAL)					\
 									\
 {									\
diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h
index 171ec2dc8c0..561ecbfd4cb 100644
--- a/include/asm-x86_64/io_apic.h
+++ b/include/asm-x86_64/io_apic.h
@@ -12,10 +12,6 @@
 
 #define APIC_MISMATCH_DEBUG
 
-#define IO_APIC_BASE(idx) \
-		((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \
-		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK)))
-
 /*
  * The structure of the IO-APIC:
  */
@@ -119,36 +115,6 @@ extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* non-0 if default (table-less) MP configuration */
 extern int mpc_default_type;
 
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
-	*IO_APIC_BASE(apic) = reg;
-	return *(IO_APIC_BASE(apic)+4);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	*IO_APIC_BASE(apic) = reg;
-	*(IO_APIC_BASE(apic)+4) = value;
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int value)
-{
-	*(IO_APIC_BASE(apic)+4) = value;
-}
-
-/*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
-static inline void io_apic_sync(unsigned int apic)
-{
-	(void) *(IO_APIC_BASE(apic)+4);
-}
-
 /* 1 if "noapic" boot option passed */
 extern int skip_ioapic_setup;
 
-- 
cgit v1.2.3-70-g09d2


From 48797ebd9e8b16fddcd4ef062f792314a6b9219a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@merom.osdl.org>
Date: Wed, 8 Nov 2006 10:27:54 -0800
Subject: x86-64: write IO APIC irq routing entries in correct order

This is the x86-64 version of f9dadfa71bc594df09044da61d1c72701121d802
that did the same thing on i386.

Since the "mask" bit is in the low word, when we write a new entry, we
need to write the high word first, before we potentially unmask it.

The exception is when we actually want to mask the interrupt, in which
case we want to write the low word first to make sure that the high word
doesn't change while the interrupt routing is still active.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/io_apic.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 96e02d84571..3b8f9c68ad3 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -172,11 +172,33 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
 	return eu.entry;
 }
 
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
 static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
 	unsigned long flags;
 	union entry_union eu;
 	eu.entry = e;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+	unsigned long flags;
+	union entry_union eu = { .entry.mask = 1 };
+
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
 	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
@@ -302,9 +324,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 	/*
 	 * Disable it in the IO-APIC irq-routing table:
 	 */
-	memset(&entry, 0, sizeof(entry));
-	entry.mask = 1;
-	ioapic_write_entry(apic, pin, entry);
+	ioapic_mask_entry(apic, pin);
 }
 
 static void clear_IO_APIC (void)
-- 
cgit v1.2.3-70-g09d2


From ec68307cc5a8dc499e48693843bb42f6b6028458 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 8 Nov 2006 17:44:57 -0800
Subject: [PATCH] htirq: refactor so we only have one function that writes to
 the chip

This refactoring actually optimizes the code a little by caching the value
that we think the device is programmed with instead of reading it back from
the hardware.  Which simplifies the code a little and should speed things up a
bit.

This patch introduces the concept of a ht_irq_msg and modifies the
architecture read/write routines to update this code.

There is a minor consistency fix here as well as x86_64 forgot to initialize
the htirq as masked.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Andi Kleen <ak@suse.de>
Acked-by: Bryan O'Sullivan <bos@pathscale.com>
Cc: <olson@pathscale.com>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c   | 26 ++++++++--------
 arch/x86_64/kernel/io_apic.c | 31 +++++++++----------
 drivers/pci/htirq.c          | 72 +++++++++++++++-----------------------------
 include/linux/htirq.h        | 11 ++++---
 4 files changed, 58 insertions(+), 82 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 507983c513c..ad84bc2802a 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -2624,18 +2624,16 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 static void target_ht_irq(unsigned int irq, unsigned int dest)
 {
-	u32 low, high;
-	low  = read_ht_irq_low(irq);
-	high = read_ht_irq_high(irq);
+	struct ht_irq_msg msg;
+	fetch_ht_irq_msg(irq, &msg);
 
-	low  &= ~(HT_IRQ_LOW_DEST_ID_MASK);
-	high &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+	msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
+	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
 
-	low  |= HT_IRQ_LOW_DEST_ID(dest);
-	high |= HT_IRQ_HIGH_DEST_ID(dest);
+	msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
+	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
 
-	write_ht_irq_low(irq, low);
-	write_ht_irq_high(irq, high);
+	write_ht_irq_msg(irq, &msg);
 }
 
 static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
@@ -2673,7 +2671,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 
 	vector = assign_irq_vector(irq);
 	if (vector >= 0) {
-		u32 low, high;
+		struct ht_irq_msg msg;
 		unsigned dest;
 		cpumask_t tmp;
 
@@ -2681,9 +2679,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 		cpu_set(vector >> 8, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
-		high = 	HT_IRQ_HIGH_DEST_ID(dest);
+		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
 
-		low =	HT_IRQ_LOW_BASE |
+		msg.address_lo =
+			HT_IRQ_LOW_BASE |
 			HT_IRQ_LOW_DEST_ID(dest) |
 			HT_IRQ_LOW_VECTOR(vector) |
 			((INT_DEST_MODE == 0) ?
@@ -2695,8 +2694,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 				HT_IRQ_LOW_MT_ARBITRATED) |
 			HT_IRQ_LOW_IRQ_MASKED;
 
-		write_ht_irq_low(irq, low);
-		write_ht_irq_high(irq, high);
+		write_ht_irq_msg(irq, &msg);
 
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 3b8f9c68ad3..41bfc49301a 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -1955,18 +1955,16 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
 {
-	u32 low, high;
-	low  = read_ht_irq_low(irq);
-	high = read_ht_irq_high(irq);
+	struct ht_irq_msg msg;
+	fetch_ht_irq_msg(irq, &msg);
 
-	low  &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
-	high &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+	msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
+	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
 
-	low  |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
-	high |= HT_IRQ_HIGH_DEST_ID(dest);
+	msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
+	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
 
-	write_ht_irq_low(irq, low);
-	write_ht_irq_high(irq, high);
+	write_ht_irq_msg(irq, &msg);
 }
 
 static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
@@ -1987,7 +1985,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 
 	dest = cpu_mask_to_apicid(tmp);
 
-	target_ht_irq(irq, dest, vector & 0xff);
+	target_ht_irq(irq, dest, vector);
 	set_native_irq_info(irq, mask);
 }
 #endif
@@ -2010,14 +2008,15 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 
 	vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
 	if (vector >= 0) {
-		u32 low, high;
+		struct ht_irq_msg msg;
 		unsigned dest;
 
 		dest = cpu_mask_to_apicid(tmp);
 
-		high = 	HT_IRQ_HIGH_DEST_ID(dest);
+		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
 
-		low =	HT_IRQ_LOW_BASE |
+		msg.address_lo =
+			HT_IRQ_LOW_BASE |
 			HT_IRQ_LOW_DEST_ID(dest) |
 			HT_IRQ_LOW_VECTOR(vector) |
 			((INT_DEST_MODE == 0) ?
@@ -2026,10 +2025,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 			HT_IRQ_LOW_RQEOI_EDGE |
 			((INT_DELIVERY_MODE != dest_LowestPrio) ?
 				HT_IRQ_LOW_MT_FIXED :
-				HT_IRQ_LOW_MT_ARBITRATED);
+				HT_IRQ_LOW_MT_ARBITRATED) |
+			HT_IRQ_LOW_IRQ_MASKED;
 
-		write_ht_irq_low(irq, low);
-		write_ht_irq_high(irq, high);
+		write_ht_irq_msg(irq, &msg);
 
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 0e27f2404a8..e346fe31f97 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -27,82 +27,55 @@ struct ht_irq_cfg {
 	struct pci_dev *dev;
 	unsigned pos;
 	unsigned idx;
+	struct ht_irq_msg msg;
 };
 
-void write_ht_irq_low(unsigned int irq, u32 data)
-{
-	struct ht_irq_cfg *cfg = get_irq_data(irq);
-	unsigned long flags;
-	spin_lock_irqsave(&ht_irq_lock, flags);
-	pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx);
-	pci_write_config_dword(cfg->dev, cfg->pos + 4, data);
-	spin_unlock_irqrestore(&ht_irq_lock, flags);
-}
-
-void write_ht_irq_high(unsigned int irq, u32 data)
-{
-	struct ht_irq_cfg *cfg = get_irq_data(irq);
-	unsigned long flags;
-	spin_lock_irqsave(&ht_irq_lock, flags);
-	pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx + 1);
-	pci_write_config_dword(cfg->dev, cfg->pos + 4, data);
-	spin_unlock_irqrestore(&ht_irq_lock, flags);
-}
 
-u32 read_ht_irq_low(unsigned int irq)
+void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 {
 	struct ht_irq_cfg *cfg = get_irq_data(irq);
 	unsigned long flags;
-	u32 data;
 	spin_lock_irqsave(&ht_irq_lock, flags);
-	pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx);
-	pci_read_config_dword(cfg->dev, cfg->pos + 4, &data);
+	if (cfg->msg.address_lo != msg->address_lo) {
+		pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx);
+		pci_write_config_dword(cfg->dev, cfg->pos + 4, msg->address_lo);
+	}
+	if (cfg->msg.address_hi != msg->address_hi) {
+		pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx + 1);
+		pci_write_config_dword(cfg->dev, cfg->pos + 4, msg->address_hi);
+	}
 	spin_unlock_irqrestore(&ht_irq_lock, flags);
-	return data;
+	cfg->msg = *msg;
 }
 
-u32 read_ht_irq_high(unsigned int irq)
+void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 {
 	struct ht_irq_cfg *cfg = get_irq_data(irq);
-	unsigned long flags;
-	u32 data;
-	spin_lock_irqsave(&ht_irq_lock, flags);
-	pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx + 1);
-	pci_read_config_dword(cfg->dev, cfg->pos + 4, &data);
-	spin_unlock_irqrestore(&ht_irq_lock, flags);
-	return data;
+	*msg = cfg->msg;
 }
 
 void mask_ht_irq(unsigned int irq)
 {
 	struct ht_irq_cfg *cfg;
-	unsigned long flags;
-	u32 data;
+	struct ht_irq_msg msg;
 
 	cfg = get_irq_data(irq);
 
-	spin_lock_irqsave(&ht_irq_lock, flags);
-	pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx);
-	pci_read_config_dword(cfg->dev, cfg->pos + 4, &data);
-	data |= 1;
-	pci_write_config_dword(cfg->dev, cfg->pos + 4, data);
-	spin_unlock_irqrestore(&ht_irq_lock, flags);
+	msg = cfg->msg;
+	msg.address_lo |= 1;
+	write_ht_irq_msg(irq, &msg);
 }
 
 void unmask_ht_irq(unsigned int irq)
 {
 	struct ht_irq_cfg *cfg;
-	unsigned long flags;
-	u32 data;
+	struct ht_irq_msg msg;
 
 	cfg = get_irq_data(irq);
 
-	spin_lock_irqsave(&ht_irq_lock, flags);
-	pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx);
-	pci_read_config_dword(cfg->dev, cfg->pos + 4, &data);
-	data &= ~1;
-	pci_write_config_dword(cfg->dev, cfg->pos + 4, data);
-	spin_unlock_irqrestore(&ht_irq_lock, flags);
+	msg = cfg->msg;
+	msg.address_lo &= ~1;
+	write_ht_irq_msg(irq, &msg);
 }
 
 /**
@@ -152,6 +125,9 @@ int ht_create_irq(struct pci_dev *dev, int idx)
 	cfg->dev = dev;
 	cfg->pos = pos;
 	cfg->idx = 0x10 + (idx * 2);
+	/* Initialize msg to a value that will never match the first write. */
+	cfg->msg.address_lo = 0xffffffff;
+	cfg->msg.address_hi = 0xffffffff;
 
 	irq = create_irq();
 	if (irq < 0) {
diff --git a/include/linux/htirq.h b/include/linux/htirq.h
index 1f15ce279a2..108f0d91e11 100644
--- a/include/linux/htirq.h
+++ b/include/linux/htirq.h
@@ -1,11 +1,14 @@
 #ifndef LINUX_HTIRQ_H
 #define LINUX_HTIRQ_H
 
+struct ht_irq_msg {
+	u32	address_lo;	/* low 32 bits of the ht irq message */
+	u32	address_hi;	/* high 32 bits of the it irq message */
+};
+
 /* Helper functions.. */
-void write_ht_irq_low(unsigned int irq, u32 data);
-void write_ht_irq_high(unsigned int irq, u32 data);
-u32  read_ht_irq_low(unsigned int irq);
-u32  read_ht_irq_high(unsigned int irq);
+void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
+void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void mask_ht_irq(unsigned int irq);
 void unmask_ht_irq(unsigned int irq);
 
-- 
cgit v1.2.3-70-g09d2


From 14f448e36192d6d2cd7dfd81cb044977b2f9dd9b Mon Sep 17 00:00:00 2001
From: Aaron Durbin <adurbin@google.com>
Date: Tue, 14 Nov 2006 16:57:45 +0100
Subject: [PATCH] x86-64: Fix partial page check to ensure unusable memory is
 not being marked usable.

Fix partial page check in e820_register_active_regions to ensure
partial pages are
not being marked as active in the memory pool.

Signed-off-by: Aaron Durbin <adurbin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/e820.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index a75c829c2b0..855b5611318 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -278,7 +278,7 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
 								>> PAGE_SHIFT;
 
 		/* Skip map entries smaller than a page */
-		if (ei_startpfn > ei_endpfn)
+		if (ei_startpfn >= ei_endpfn)
 			continue;
 
 		/* Check if end_pfn_map should be updated */
-- 
cgit v1.2.3-70-g09d2


From 15803a43288da434d34d41c4ed650c3c1728d42c Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Tue, 14 Nov 2006 16:57:46 +0100
Subject: [PATCH] x86-64: setup saved_max_pfn correctly (kdump)

x86_64: setup saved_max_pfn correctly

2.6.19-rc4 has broken CONFIG_CRASH_DUMP support on x86_64. It is impossible
to read out the kernel contents from /proc/vmcore because saved_max_pfn is set
to zero instead of the max_pfn value before the user map is setup.

This happens because saved_max_pfn is initialized at parse_early_param() time,
and at this time no active regions have been registered. save_max_pfn is setup
from e820_end_of_ram(), more exact find_max_pfn_with_active_regions() which
returns 0 because no regions exist.

This patch fixes this by registering before and removing after the call
to e820_end_of_ram().

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/e820.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 855b5611318..6fe191c5808 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -594,7 +594,9 @@ static int __init parse_memmap_opt(char *p)
 		 * size before original memory map is
 		 * reset.
 		 */
+		e820_register_active_regions(0, 0, -1UL);
 		saved_max_pfn = e820_end_of_ram();
+		remove_all_active_ranges();
 #endif
 		end_pfn_map = 0;
 		e820.nr_map = 0;
-- 
cgit v1.2.3-70-g09d2


From fa18f477d0987c011cce047a7c3cd1284f547a14 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 14 Nov 2006 16:57:46 +0100
Subject: [PATCH] x86: Add acpi_user_timer_override option for Asus boards

Timer overrides are normally disabled on Nvidia board because
they are commonly wrong, except on new ones with HPET support.
Unfortunately there are quite some Asus boards around that
don't have HPET, but need a timer override.

We don't know yet how to handle this transparently,
but at least add a command line option to force the timer override
and let them boot.

Cc: len.brown@intel.com

Signed-off-by: Andi Kleen <ak@suse.de>
---
 Documentation/kernel-parameters.txt | 4 ++++
 arch/i386/kernel/acpi/boot.c        | 8 ++++++++
 arch/i386/kernel/acpi/earlyquirk.c  | 8 +++++++-
 arch/x86_64/kernel/early-quirks.c   | 8 ++++++++
 include/asm-i386/acpi.h             | 1 +
 include/asm-x86_64/acpi.h           | 1 +
 6 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index dd00fd556a6..67473849f20 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -164,6 +164,10 @@ and is between 256 and 4096 characters. It is defined in the file
 	acpi_skip_timer_override [HW,ACPI]
 			Recognize and ignore IRQ0/pin2 Interrupt Override.
 			For broken nForce2 BIOS resulting in XT-PIC timer.
+	acpi_use_timer_override [HW,ACPI}
+			Use timer override. For some broken Nvidia NF5 boards
+			that require a timer override, but don't have
+			HPET
 
 	acpi_dbg_layer=	[HW,ACPI]
 			Format: <int>
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 22e4c466e5a..d12fb97a533 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -82,6 +82,7 @@ EXPORT_SYMBOL(acpi_strict);
 acpi_interrupt_flags acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
 int acpi_skip_timer_override __initdata;
+int acpi_use_timer_override __initdata;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -1300,6 +1301,13 @@ static int __init parse_acpi_skip_timer_override(char *arg)
 	return 0;
 }
 early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+
+static int __init parse_acpi_use_timer_override(char *arg)
+{
+	acpi_use_timer_override = 1;
+	return 0;
+}
+early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
 #endif /* CONFIG_X86_IO_APIC */
 
 static int __init setup_acpi_sci(char *s)
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index fe799b11ac0..c9841692bb7 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -27,11 +27,17 @@ static int __init check_bridge(int vendor, int device)
 #ifdef CONFIG_ACPI
 	/* According to Nvidia all timer overrides are bogus unless HPET
 	   is enabled. */
-	if (vendor == PCI_VENDOR_ID_NVIDIA) {
+	if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
 		nvidia_hpet_detected = 0;
 		acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
 		if (nvidia_hpet_detected == 0) {
 			acpi_skip_timer_override = 1;
+			  printk(KERN_INFO "Nvidia board "
+                       "detected. Ignoring ACPI "
+                       "timer override.\n");
+                printk(KERN_INFO "If you got timer trouble "
+			 	 "try acpi_use_timer_override\n");
+
 		}
 	}
 #endif
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index 2b1245d8625..68273bff58c 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -45,7 +45,13 @@ static void nvidia_bugs(void)
 	/*
 	 * All timer overrides on Nvidia are
 	 * wrong unless HPET is enabled.
+	 * Unfortunately that's not true on many Asus boards.
+	 * We don't know yet how to detect this automatically, but
+	 * at least allow a command line override.
 	 */
+	if (acpi_use_timer_override)
+		return;
+
 	nvidia_hpet_detected = 0;
 	acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
 	if (nvidia_hpet_detected == 0) {
@@ -53,6 +59,8 @@ static void nvidia_bugs(void)
 		printk(KERN_INFO "Nvidia board "
 		       "detected. Ignoring ACPI "
 		       "timer override.\n");
+		printk(KERN_INFO "If you got timer trouble "
+			"try acpi_use_timer_override\n");
 	}
 #endif
 	/* RED-PEN skip them on mptables too? */
diff --git a/include/asm-i386/acpi.h b/include/asm-i386/acpi.h
index 6016632d032..c80b3a94511 100644
--- a/include/asm-i386/acpi.h
+++ b/include/asm-i386/acpi.h
@@ -132,6 +132,7 @@ extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
 
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;
 #endif
 
 static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
diff --git a/include/asm-x86_64/acpi.h b/include/asm-x86_64/acpi.h
index ed59aa4c6ff..9d1916e59c0 100644
--- a/include/asm-x86_64/acpi.h
+++ b/include/asm-x86_64/acpi.h
@@ -163,6 +163,7 @@ extern u8 x86_acpiid_to_apicid[];
 #define ARCH_HAS_POWER_INIT 1
 
 extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;
 
 #endif /*__KERNEL__*/
 
-- 
cgit v1.2.3-70-g09d2


From 8c131af1db510793f87dc43edbc8950a35370df3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 14 Nov 2006 16:57:46 +0100
Subject: [PATCH] x86-64: Fix vgetcpu when CONFIG_HOTPLUG_CPU is disabled

The vgetcpu per CPU initialization previously relied on CPU hotplug
events for all CPUs to initialize the per CPU state. That only
worked only on kernels with CONFIG_HOTPLUG_CPU enabled.  On the
others some CPUs didn't get their state initialized properly
and vgetcpu wouldn't work.

Change the initialization sequence to instead run in a normal
initcall (which runs after the normal CPU bootup) and initialize
all running CPUs there. Later hotplug CPUs are still handled
with an hotplug notifier.

This actually simplifies the code somewhat.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/smp.c      |  3 +--
 arch/x86_64/kernel/time.c     | 11 -----------
 arch/x86_64/kernel/vsyscall.c | 45 ++++++++++++++++++++++++-------------------
 include/asm-x86_64/vsyscall.h |  2 --
 4 files changed, 26 insertions(+), 35 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 4f67697f503..9f74c883568 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -376,9 +376,8 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 	/* prevent preemption and reschedule on another processor */
 	int me = get_cpu();
 	if (cpu == me) {
-		WARN_ON(1);
 		put_cpu();
-		return -EBUSY;
+		return 0;
 	}
 	spin_lock_bh(&call_lock);
 	__smp_call_function_single(cpu, func, info, nonatomic, wait);
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 88722f11ca1..e3ef544d2cf 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -876,15 +876,6 @@ static struct irqaction irq0 = {
 	timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
 };
 
-static int __cpuinit
-time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
-	unsigned cpu = (unsigned long) hcpu;
-	if (action == CPU_ONLINE)
-		vsyscall_set_cpu(cpu);
-	return NOTIFY_DONE;
-}
-
 void __init time_init(void)
 {
 	if (nohpet)
@@ -925,8 +916,6 @@ void __init time_init(void)
 	vxtime.last_tsc = get_cycles_sync();
 	set_cyc2ns_scale(cpu_khz);
 	setup_irq(0, &irq0);
-	hotcpu_notifier(time_cpu_notifier, 0);
-	time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id());
 
 #ifndef CONFIG_SMP
 	time_init_gtod();
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index a98b460af6a..a730bacecb0 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -27,6 +27,9 @@
 #include <linux/jiffies.h>
 #include <linux/sysctl.h>
 #include <linux/getcpu.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
 
 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
@@ -243,32 +246,17 @@ static ctl_table kernel_root_table2[] = {
 
 #endif
 
-static void __cpuinit write_rdtscp_cb(void *info)
-{
-	write_rdtscp_aux((unsigned long)info);
-}
-
-void __cpuinit vsyscall_set_cpu(int cpu)
+/* Assume __initcall executes before all user space. Hopefully kmod
+   doesn't violate that. We'll find out if it does. */
+static void __cpuinit vsyscall_set_cpu(int cpu)
 {
 	unsigned long *d;
 	unsigned long node = 0;
 #ifdef CONFIG_NUMA
 	node = cpu_to_node[cpu];
 #endif
-	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
-		void *info = (void *)((node << 12) | cpu);
-		/* Can happen on preemptive kernel */
-		if (get_cpu() == cpu)
-			write_rdtscp_cb(info);
-#ifdef CONFIG_SMP
-		else {
-			/* the notifier is unfortunately not executed on the
-			   target CPU */
-			smp_call_function_single(cpu,write_rdtscp_cb,info,0,1);
-		}
-#endif
-		put_cpu();
-	}
+	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
+		write_rdtscp_aux((node << 12) | cpu);
 
 	/* Store cpu number in limit so that it can be loaded quickly
 	   in user space in vgetcpu.
@@ -280,6 +268,21 @@ void __cpuinit vsyscall_set_cpu(int cpu)
 	*d |= (node >> 4) << 48;
 }
 
+static void __cpuinit cpu_vsyscall_init(void *arg)
+{
+	/* preemption should be already off */
+	vsyscall_set_cpu(raw_smp_processor_id());
+}
+
+static int __cpuinit
+cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
+{
+	long cpu = (long)arg;
+	if (action == CPU_ONLINE)
+		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+	return NOTIFY_DONE;
+}
+
 static void __init map_vsyscall(void)
 {
 	extern char __vsyscall_0;
@@ -299,6 +302,8 @@ static int __init vsyscall_init(void)
 #ifdef CONFIG_SYSCTL
 	register_sysctl_table(kernel_root_table2, 0);
 #endif
+	on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+	hotcpu_notifier(cpu_vsyscall_notifier, 0);
 	return 0;
 }
 
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index fd452fc2c03..01d1c17e284 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -59,8 +59,6 @@ extern seqlock_t xtime_lock;
 
 extern int sysctl_vsyscall;
 
-extern void vsyscall_set_cpu(int cpu);
-
 #define ARCH_HAVE_XTIME_LOCK 1
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3-70-g09d2


From 9446868b5383eb87f76b2d4389dea4bb968a6657 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 14 Nov 2006 16:57:46 +0100
Subject: [PATCH] x86-64: Fix race in exit_idle

When another interrupt happens in exit_idle the exit idle notifier
could be called an incorrect number of times.

Add a test_and_clear_bit_pda and use it handle the bit
atomically against interrupts to avoid this.

Pointed out by Stephane Eranian

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/process.c | 3 +--
 include/asm-x86_64/pda.h     | 9 +++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 49f7fac6229..f6226055d53 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -88,9 +88,8 @@ void enter_idle(void)
 
 static void __exit_idle(void)
 {
-	if (read_pda(isidle) == 0)
+	if (test_and_clear_bit_pda(0, isidle) == 0)
 		return;
-	write_pda(isidle, 0);
 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
 
diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h
index 14996d962ba..5642634843c 100644
--- a/include/asm-x86_64/pda.h
+++ b/include/asm-x86_64/pda.h
@@ -109,6 +109,15 @@ extern struct x8664_pda _proxy_pda;
 #define sub_pda(field,val) pda_to_op("sub",field,val)
 #define or_pda(field,val) pda_to_op("or",field,val)
 
+/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+#define test_and_clear_bit_pda(bit,field) ({		\
+	int old__;						\
+	asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0"		\
+	    : "=r" (old__), "+m" (_proxy_pda.field) 		\
+	    : "dIr" (bit), "i" (pda_offset(field)) : "memory");	\
+	old__;							\
+})
+
 #endif
 
 #define PDA_STACKOFFSET (5*8)
-- 
cgit v1.2.3-70-g09d2


From 45c99533252ef2297f37c5fdd672a3e0eb566870 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 14 Nov 2006 10:52:12 -0700
Subject: [PATCH] Use delayed disable mode of ioapic edge triggered interrupts

Komuro reports that ISA interrupts do not work after a disable_irq(),
causing some PCMCIA drivers to not work, with messages like

	eth0: Asix AX88190: io 0x300, irq 3, hw_addr xx:xx:xx:xx:xx:xx
	eth0: found link beat
	eth0: autonegotiation complete: 100baseT-FD selected
	eth0: interrupt(s) dropped!
	eth0: interrupt(s) dropped!
	eth0: interrupt(s) dropped!
	...

Linus Torvalds <torvalds@osdl.org> said:

  "Now, edge-triggered interrupts are a _lot_ harder to mask, because the
   Intel APIC is an unbelievable piece of sh*t, and has the edge-detect logic
   _before_ the mask logic, so if a edge happens _while_ the device is
   masked, you'll never ever see the edge ever again (unmasking will not
   cause a new edge, so you simply lost the interrupt).

   So when you "mask" an edge-triggered IRQ, you can't really mask it at all,
   because if you did that, you'd lose it forever if the IRQ comes in while
   you masked it. Instead, we're supposed to leave it active, and set a flag,
   and IF the IRQ comes in, we just remember it, and mask it at that point
   instead, and then on unmasking, we have to replay it by sending a
   self-IPI."

This trivial patch solves the problem.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@redhat.com>
Acked-by: Komuro <komurojun-mbn@nifty.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c   | 4 +++-
 arch/x86_64/kernel/io_apic.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index ad84bc2802a..3b7a63e0ed1 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1287,9 +1287,11 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 			trigger == IOAPIC_LEVEL)
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					 handle_fasteoi_irq, "fasteoi");
-	else
+	else {
+		irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					 handle_edge_irq, "edge");
+	}
 	set_intr_gate(vector, interrupt[irq]);
 }
 
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 41bfc49301a..14654e68241 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -790,9 +790,11 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 			trigger == IOAPIC_LEVEL)
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					      handle_fasteoi_irq, "fasteoi");
-	else
+	else {
+		irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					      handle_edge_irq, "edge");
+	}
 }
 
 static void __init setup_IO_APIC_irqs(void)
-- 
cgit v1.2.3-70-g09d2


From 6b3d1a95ba714bfb1cc81362f7f3e01b7654b4f3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 16 Nov 2006 10:22:03 +0100
Subject: [PATCH] x86-64: Fix vsyscall.c compilation on UP

Broken by earlier patch by me.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/vsyscall.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index a730bacecb0..92546c1526f 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -274,6 +274,7 @@ static void __cpuinit cpu_vsyscall_init(void *arg)
 	vsyscall_set_cpu(raw_smp_processor_id());
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
@@ -282,6 +283,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
 	return NOTIFY_DONE;
 }
+#endif
 
 static void __init map_vsyscall(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 0796bdb7e9e4a48b401f4fba1ee5dc79a45528ef Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Nov 2006 05:57:49 +0100
Subject: [PATCH] x86_64: stack unwinder crash fix

the new dwarf2 unwinder crashes while trying to dump the stack:

  Leftover inexact backtrace:

  Unable to handle kernel paging request at ffffffff82800000 RIP:
   [<ffffffff8026cf26>] dump_trace+0x35b/0x3d2
  PGD 203027 PUD 205027 PMD 0
  Oops: 0000 [2] PREEMPT SMP
  CPU 0
  Modules linked in:
  Pid: 30, comm: khelper Not tainted 2.6.19-rc6-rt1 #11
  RIP: 0010:[<ffffffff8026cf26>]  [<ffffffff8026cf26>] dump_trace+0x35b/0x3d2
  RSP: 0000:ffff81003fb9d848  EFLAGS: 00010006
  RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: ffffffff805b3520 RDI: 0000000000000000
  RBP: ffffffff827ffff9 R08: ffffffff80aad000 R09: 0000000000000005
  R10: ffffffff80aae000 R11: ffffffff8037961b R12: ffff81003fb9d858
  R13: 0000000000000000 R14: ffffffff80598460 R15: ffffffff80ab1fc0
  FS:  0000000000000000(0000) GS:ffffffff806c4200(0000) knlGS:0000000000000000
  CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
  CR2: ffffffff82800000 CR3: 0000000000201000 CR4: 00000000000006e0

this crash happened because it did not sanitize the dwarf2 data it
got, and got an unaligned stack pointer - which happily walked past
the process stack (and eventually reached the end of kernel memory
and pagefaulted there) due to this naive iteration condition:

        HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);

note that i386 is alot more conservative when it comes to trusting
stack pointers:

  static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
  {
         return  p > (void *)tinfo &&
                 p < (void *)tinfo + THREAD_SIZE - 3;
  }

but the x86_64 code did not take this bit of i386 code.

The fix is to align the stack pointer.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@suse.de>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/traps.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 7819022a8db..a153d0a01b7 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -290,6 +290,12 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 		if (tsk && tsk != current)
 			stack = (unsigned long *)tsk->thread.rsp;
 	}
+	/*
+	 * Align the stack pointer on word boundary, later loops
+	 * rely on that (and corruption / debug info bugs can cause
+	 * unaligned values here):
+	 */
+	stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
 
 	/*
 	 * Print function call entries within a stack. 'cond' is the
-- 
cgit v1.2.3-70-g09d2


From dc1829a4c378d793fb3b95d56135d89a0d7ff72a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Nov 2006 14:26:18 +0100
Subject: [PATCH] i386/x86_64: ACPI cpu_idle_wait() fix

The scheduler on Andreas Friedrich's hyperthreading system stopped
working properly: the scheduler would never move tasks to another CPU!
The lask known working kernel was 2.6.8.

After a couple of attempts to corner the bug, the following smoking gun
was found:

  BIOS reported wrong ACPI idfor the processor
  CPU#1: set_cpus_allowed(), swapper:1, 3 -> 2
   [<c0103bbe>] show_trace_log_lvl+0x34/0x4a
   [<c0103ceb>] show_trace+0x2c/0x2e
   [<c01045f8>] dump_stack+0x2b/0x2d
   [<c0116a77>] set_cpus_allowed+0x52/0xec
   [<c0101d86>] cpu_idle_wait+0x2e/0x100
   [<c0259c57>] acpi_processor_power_exit+0x45/0x58
   [<c0259752>] acpi_processor_remove+0x46/0xea
   [<c025c6fb>] acpi_start_single_object+0x47/0x54
   [<c025cee5>] acpi_bus_register_driver+0xa4/0xd3
   [<c04ab2d7>] acpi_processor_init+0x57/0x77
   [<c01004d7>] init+0x146/0x2fd
   [<c0103a87>] kernel_thread_helper+0x7/0x10

a quick look at cpu_idle_wait() shows how broken that code is
on i386: it changes the init task's affinity map but never
restores it ...

and because all userspace tasks get forked by init, they all
inherited that single-CPU affinity mask. x86_64 cloned this
bug too.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Andreas Friedrich <andreas.friedrich@fujitsu-siemens.com>
Cc: Wolfgang Erig <Wolfgang.Erig@fujitsu-siemens.com>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/process.c   | 4 +++-
 arch/x86_64/kernel/process.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 1e1fa3e391a..dd53c58f64f 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -205,7 +205,7 @@ void cpu_idle(void)
 void cpu_idle_wait(void)
 {
 	unsigned int cpu, this_cpu = get_cpu();
-	cpumask_t map;
+	cpumask_t map, tmp = current->cpus_allowed;
 
 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
 	put_cpu();
@@ -227,6 +227,8 @@ void cpu_idle_wait(void)
 		}
 		cpus_and(map, map, cpu_online_map);
 	} while (!cpus_empty(map));
+
+	set_cpus_allowed(current, tmp);
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index f6226055d53..7451a4c43c1 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -144,7 +144,7 @@ static void poll_idle (void)
 void cpu_idle_wait(void)
 {
 	unsigned int cpu, this_cpu = get_cpu();
-	cpumask_t map;
+	cpumask_t map, tmp = current->cpus_allowed;
 
 	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
 	put_cpu();
@@ -167,6 +167,8 @@ void cpu_idle_wait(void)
 		}
 		cpus_and(map, map, cpu_online_map);
 	} while (!cpus_empty(map));
+
+	set_cpus_allowed(current, tmp);
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
-- 
cgit v1.2.3-70-g09d2


From 9a14f2964b459c18198ee59ff7212321def82df7 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Mon, 20 Nov 2006 11:29:09 -0500
Subject: [PATCH] x86_64: Align data segment to PAGE_SIZE boundary

o Explicitly align data segment to PAGE_SIZE boundary otherwise depending on
  config options and tool chain it might be placed on a non PAGE_SIZE aligned
  boundary and vmlinux loaders like kexec fail when they encounter a
  PT_LOAD type segment which is not aligned to PAGE_SIZE boundary.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/vmlinux.lds.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index edb24aa714b..d9534e750d4 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -60,6 +60,7 @@ SECTIONS
   }
 #endif
 
+  . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
 	*(.data)
-- 
cgit v1.2.3-70-g09d2


From 3af9815328bba76e8d11d71d6dbbd6f38beafe58 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Mon, 20 Nov 2006 11:29:09 -0500
Subject: [PATCH] x86_64: Align data segment to PAGE_SIZE boundary

o Explicitly align data segment to PAGE_SIZE boundary otherwise depending on
  config options and tool chain it might be placed on a non PAGE_SIZE aligned
  boundary and vmlinux loaders like kexec fail when they encounter a
  PT_LOAD type segment which is not aligned to PAGE_SIZE boundary.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/vmlinux.lds.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index edb24aa714b..d9534e750d4 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -60,6 +60,7 @@ SECTIONS
   }
 #endif
 
+  . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
 	*(.data)
-- 
cgit v1.2.3-70-g09d2


From 52bad64d95bd89e08c49ec5a071fa6dcbe5a1a9c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2006 14:54:01 +0000
Subject: WorkStruct: Separate delayable and non-delayable events.

Separate delayable work items from non-delayable work items be splitting them
into a separate structure (delayed_work), which incorporates a work_struct and
the timer_list removed from work_struct.

The work_struct struct is huge, and this limits it's usefulness.  On a 64-bit
architecture it's nearly 100 bytes in size.  This reduces that by half for the
non-delayable type of event.

Signed-Off-By: David Howells <dhowells@redhat.com>
---
 arch/x86_64/kernel/mce.c           |  2 +-
 drivers/ata/libata-core.c          | 11 +++-----
 drivers/ata/libata-eh.c            |  2 +-
 drivers/char/random.c              |  2 +-
 drivers/char/tty_io.c              |  2 +-
 fs/aio.c                           |  4 +--
 fs/nfs/client.c                    |  2 +-
 fs/nfs/namespace.c                 |  3 ++-
 include/linux/aio.h                |  2 +-
 include/linux/kbd_kern.h           |  2 +-
 include/linux/libata.h             |  4 +--
 include/linux/nfs_fs_sb.h          |  2 +-
 include/linux/sunrpc/rpc_pipe_fs.h |  2 +-
 include/linux/sunrpc/xprt.h        |  2 +-
 include/linux/tty.h                |  2 +-
 include/linux/workqueue.h          | 44 +++++++++++++++++++++++---------
 kernel/workqueue.c                 | 51 +++++++++++++++++++++-----------------
 mm/slab.c                          |  8 +++---
 net/core/link_watch.c              |  9 +++----
 net/sunrpc/cache.c                 |  4 +--
 net/sunrpc/rpc_pipe.c              |  3 ++-
 net/sunrpc/xprtsock.c              |  6 ++---
 22 files changed, 96 insertions(+), 73 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bbea88801d8..5306f263090 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -307,7 +307,7 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static void mcheck_timer(void *data);
-static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
+static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer, NULL);
 
 static void mcheck_check_cpu(void *info)
 {
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 915a55a6cc1..0bb4b4dced7 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -937,12 +937,9 @@ void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *), void *data,
 	if (ap->pflags & ATA_PFLAG_FLUSH_PORT_TASK)
 		return;
 
-	PREPARE_WORK(&ap->port_task, fn, data);
+	PREPARE_DELAYED_WORK(&ap->port_task, fn, data);
 
-	if (!delay)
-		rc = queue_work(ata_wq, &ap->port_task);
-	else
-		rc = queue_delayed_work(ata_wq, &ap->port_task, delay);
+	rc = queue_delayed_work(ata_wq, &ap->port_task, delay);
 
 	/* rc == 0 means that another user is using port task */
 	WARN_ON(rc == 0);
@@ -5320,8 +5317,8 @@ void ata_port_init(struct ata_port *ap, struct ata_host *host,
 	ap->msg_enable = ATA_MSG_DRV | ATA_MSG_ERR | ATA_MSG_WARN;
 #endif
 
-	INIT_WORK(&ap->port_task, NULL, NULL);
-	INIT_WORK(&ap->hotplug_task, ata_scsi_hotplug, ap);
+	INIT_DELAYED_WORK(&ap->port_task, NULL, NULL);
+	INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug, ap);
 	INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan, ap);
 	INIT_LIST_HEAD(&ap->eh_done_q);
 	init_waitqueue_head(&ap->eh_wait_q);
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 02b2b2787d9..9f6b7cc74fd 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -332,7 +332,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 	if (ap->pflags & ATA_PFLAG_LOADING)
 		ap->pflags &= ~ATA_PFLAG_LOADING;
 	else if (ap->pflags & ATA_PFLAG_SCSI_HOTPLUG)
-		queue_work(ata_aux_wq, &ap->hotplug_task);
+		queue_delayed_work(ata_aux_wq, &ap->hotplug_task, 0);
 
 	if (ap->pflags & ATA_PFLAG_RECOVERED)
 		ata_port_printk(ap, KERN_INFO, "EH complete\n");
diff --git a/drivers/char/random.c b/drivers/char/random.c
index eb6b13f4211..f2ab61f3e8a 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1424,7 +1424,7 @@ static unsigned int ip_cnt;
 
 static void rekey_seq_generator(void *private_);
 
-static DECLARE_WORK(rekey_work, rekey_seq_generator, NULL);
+static DECLARE_DELAYED_WORK(rekey_work, rekey_seq_generator, NULL);
 
 /*
  * Lock avoidance:
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index e90ea39c7c4..7297acfe520 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -3580,7 +3580,7 @@ static void initialize_tty_struct(struct tty_struct *tty)
 	tty->overrun_time = jiffies;
 	tty->buf.head = tty->buf.tail = NULL;
 	tty_buffer_init(tty);
-	INIT_WORK(&tty->buf.work, flush_to_ldisc, tty);
+	INIT_DELAYED_WORK(&tty->buf.work, flush_to_ldisc, tty);
 	init_MUTEX(&tty->buf.pty_sem);
 	mutex_init(&tty->termios_mutex);
 	init_waitqueue_head(&tty->write_wait);
diff --git a/fs/aio.c b/fs/aio.c
index 94766599db0..11a1a7100ad 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -227,7 +227,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 	INIT_LIST_HEAD(&ctx->run_list);
-	INIT_WORK(&ctx->wq, aio_kick_handler, ctx);
+	INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler, ctx);
 
 	if (aio_setup_ring(ctx) < 0)
 		goto out_freectx;
@@ -876,7 +876,7 @@ static void aio_kick_handler(void *data)
 	 * we're in a worker thread already, don't use queue_delayed_work,
 	 */
 	if (requeue)
-		queue_work(aio_wq, &ctx->wq);
+		queue_delayed_work(aio_wq, &ctx->wq, 0);
 }
 
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5fea638743e..6f0487d6f44 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,7 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
 	INIT_LIST_HEAD(&clp->cl_state_owners);
 	INIT_LIST_HEAD(&clp->cl_unused);
 	spin_lock_init(&clp->cl_lock);
-	INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
+	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
 	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
 	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index ec1114b33d8..5ed798bc1cf 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -21,7 +21,8 @@
 static void nfs_expire_automounts(void *list);
 
 LIST_HEAD(nfs_automount_list);
-static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
+static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts,
+			    &nfs_automount_list);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
 static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 0d71c0041f1..9e350fd44d7 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -194,7 +194,7 @@ struct kioctx {
 
 	struct aio_ring_info	ring_info;
 
-	struct work_struct	wq;
+	struct delayed_work	wq;
 };
 
 /* prototypes */
diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h
index efe0ee4cc80..06c58c423fe 100644
--- a/include/linux/kbd_kern.h
+++ b/include/linux/kbd_kern.h
@@ -158,7 +158,7 @@ static inline void con_schedule_flip(struct tty_struct *t)
 	if (t->buf.tail != NULL)
 		t->buf.tail->commit = t->buf.tail->used;
 	spin_unlock_irqrestore(&t->buf.lock, flags);
-	schedule_work(&t->buf.work);
+	schedule_delayed_work(&t->buf.work, 0);
 }
 
 #endif
diff --git a/include/linux/libata.h b/include/linux/libata.h
index abd2debebca..5f04006e8dd 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -568,8 +568,8 @@ struct ata_port {
 	struct ata_host		*host;
 	struct device 		*dev;
 
-	struct work_struct	port_task;
-	struct work_struct	hotplug_task;
+	struct delayed_work	port_task;
+	struct delayed_work	hotplug_task;
 	struct work_struct	scsi_rescan_task;
 
 	unsigned int		hsm_task_state;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 7ccfc7ef0a8..95796e6924f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -51,7 +51,7 @@ struct nfs_client {
 
 	unsigned long		cl_lease_time;
 	unsigned long		cl_last_renewal;
-	struct work_struct	cl_renewd;
+	struct delayed_work	cl_renewd;
 
 	struct rpc_wait_queue	cl_rpcwaitq;
 
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index a2eb9b4a9de..4a68125b6de 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -30,7 +30,7 @@ struct rpc_inode {
 #define RPC_PIPE_WAIT_FOR_OPEN	1
 	int flags;
 	struct rpc_pipe_ops *ops;
-	struct work_struct queue_timeout;
+	struct delayed_work queue_timeout;
 };
 
 static inline struct rpc_inode *
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 60394fbc4c7..3e04c1512fc 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -177,7 +177,7 @@ struct rpc_xprt {
 	unsigned long		connect_timeout,
 				bind_timeout,
 				reestablish_timeout;
-	struct work_struct	connect_worker;
+	struct delayed_work	connect_worker;
 	unsigned short		port;
 
 	/*
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 44091c0db0b..c1f71644616 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -53,7 +53,7 @@ struct tty_buffer {
 };
 
 struct tty_bufhead {
-	struct work_struct		work;
+	struct delayed_work work;
 	struct semaphore pty_sem;
 	spinlock_t lock;
 	struct tty_buffer *head;	/* Queue head */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 9bca3539a1e..9faaccae570 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -17,6 +17,10 @@ struct work_struct {
 	void (*func)(void *);
 	void *data;
 	void *wq_data;
+};
+
+struct delayed_work {
+	struct work_struct work;
 	struct timer_list timer;
 };
 
@@ -28,32 +32,48 @@ struct execute_work {
         .entry	= { &(n).entry, &(n).entry },			\
 	.func = (f),						\
 	.data = (d),						\
+	}
+
+#define __DELAYED_WORK_INITIALIZER(n, f, d) {			\
+	.work = __WORK_INITIALIZER((n).work, (f), (d)),		\
 	.timer = TIMER_INITIALIZER(NULL, 0, 0),			\
 	}
 
 #define DECLARE_WORK(n, f, d)					\
 	struct work_struct n = __WORK_INITIALIZER(n, f, d)
 
+#define DECLARE_DELAYED_WORK(n, f, d)				\
+	struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, d)
+
 /*
- * initialize a work-struct's func and data pointers:
+ * initialize a work item's function and data pointers
  */
 #define PREPARE_WORK(_work, _func, _data)			\
 	do {							\
-		(_work)->func = _func;				\
-		(_work)->data = _data;				\
+		(_work)->func = (_func);			\
+		(_work)->data = (_data);			\
 	} while (0)
 
+#define PREPARE_DELAYED_WORK(_work, _func, _data)		\
+	PREPARE_WORK(&(_work)->work, (_func), (_data))
+
 /*
- * initialize all of a work-struct:
+ * initialize all of a work item in one go
  */
 #define INIT_WORK(_work, _func, _data)				\
 	do {							\
 		INIT_LIST_HEAD(&(_work)->entry);		\
 		(_work)->pending = 0;				\
 		PREPARE_WORK((_work), (_func), (_data));	\
+	} while (0)
+
+#define INIT_DELAYED_WORK(_work, _func, _data)		\
+	do {							\
+		INIT_WORK(&(_work)->work, (_func), (_data));	\
 		init_timer(&(_work)->timer);			\
 	} while (0)
 
+
 extern struct workqueue_struct *__create_workqueue(const char *name,
 						    int singlethread);
 #define create_workqueue(name) __create_workqueue((name), 0)
@@ -62,24 +82,24 @@ extern struct workqueue_struct *__create_workqueue(const char *name,
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
 extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
-extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay));
+extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay));
 extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
-	struct work_struct *work, unsigned long delay);
+	struct delayed_work *work, unsigned long delay);
 extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
 
 extern int FASTCALL(schedule_work(struct work_struct *work));
-extern int FASTCALL(schedule_delayed_work(struct work_struct *work, unsigned long delay));
+extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay));
 
-extern int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay);
+extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
 extern int schedule_on_each_cpu(void (*func)(void *info), void *info);
 extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
 
 extern void init_workqueues(void);
-void cancel_rearming_delayed_work(struct work_struct *work);
+void cancel_rearming_delayed_work(struct delayed_work *work);
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
-				       struct work_struct *);
+				       struct delayed_work *);
 int execute_in_process_context(void (*fn)(void *), void *,
 			       struct execute_work *);
 
@@ -88,13 +108,13 @@ int execute_in_process_context(void (*fn)(void *), void *,
  * function may still be running on return from cancel_delayed_work().  Run
  * flush_scheduled_work() to wait on it.
  */
-static inline int cancel_delayed_work(struct work_struct *work)
+static inline int cancel_delayed_work(struct delayed_work *work)
 {
 	int ret;
 
 	ret = del_timer_sync(&work->timer);
 	if (ret)
-		clear_bit(0, &work->pending);
+		clear_bit(0, &work->work.pending);
 	return ret;
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c2..44fc54b7dec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -122,29 +122,33 @@ EXPORT_SYMBOL_GPL(queue_work);
 
 static void delayed_work_timer_fn(unsigned long __data)
 {
-	struct work_struct *work = (struct work_struct *)__data;
-	struct workqueue_struct *wq = work->wq_data;
+	struct delayed_work *dwork = (struct delayed_work *)__data;
+	struct workqueue_struct *wq = dwork->work.wq_data;
 	int cpu = smp_processor_id();
 
 	if (unlikely(is_single_threaded(wq)))
 		cpu = singlethread_cpu;
 
-	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
 }
 
 /**
  * queue_delayed_work - queue work on a workqueue after delay
  * @wq: workqueue to use
- * @work: work to queue
+ * @work: delayable work to queue
  * @delay: number of jiffies to wait before queueing
  *
  * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int fastcall queue_delayed_work(struct workqueue_struct *wq,
-			struct work_struct *work, unsigned long delay)
+			struct delayed_work *dwork, unsigned long delay)
 {
 	int ret = 0;
-	struct timer_list *timer = &work->timer;
+	struct timer_list *timer = &dwork->timer;
+	struct work_struct *work = &dwork->work;
+
+	if (delay == 0)
+		return queue_work(wq, work);
 
 	if (!test_and_set_bit(0, &work->pending)) {
 		BUG_ON(timer_pending(timer));
@@ -153,7 +157,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
 		/* This stores wq for the moment, for the timer_fn */
 		work->wq_data = wq;
 		timer->expires = jiffies + delay;
-		timer->data = (unsigned long)work;
+		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
 		add_timer(timer);
 		ret = 1;
@@ -172,10 +176,11 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
  * Returns 0 if @work was already on a queue, non-zero otherwise.
  */
 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
-			struct work_struct *work, unsigned long delay)
+			struct delayed_work *dwork, unsigned long delay)
 {
 	int ret = 0;
-	struct timer_list *timer = &work->timer;
+	struct timer_list *timer = &dwork->timer;
+	struct work_struct *work = &dwork->work;
 
 	if (!test_and_set_bit(0, &work->pending)) {
 		BUG_ON(timer_pending(timer));
@@ -184,7 +189,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		/* This stores wq for the moment, for the timer_fn */
 		work->wq_data = wq;
 		timer->expires = jiffies + delay;
-		timer->data = (unsigned long)work;
+		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
 		add_timer_on(timer, cpu);
 		ret = 1;
@@ -468,31 +473,31 @@ EXPORT_SYMBOL(schedule_work);
 
 /**
  * schedule_delayed_work - put work task in global workqueue after delay
- * @work: job to be done
- * @delay: number of jiffies to wait
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
  *
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue.
  */
-int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
+int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
 {
-	return queue_delayed_work(keventd_wq, work, delay);
+	return queue_delayed_work(keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work);
 
 /**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
  * @cpu: cpu to use
- * @work: job to be done
+ * @dwork: job to be done
  * @delay: number of jiffies to wait
  *
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue on the specified CPU.
  */
 int schedule_delayed_work_on(int cpu,
-			struct work_struct *work, unsigned long delay)
+			struct delayed_work *dwork, unsigned long delay)
 {
-	return queue_delayed_work_on(cpu, keventd_wq, work, delay);
+	return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);
 
@@ -539,12 +544,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
  * cancel_rearming_delayed_workqueue - reliably kill off a delayed
  *			work whose handler rearms the delayed work.
  * @wq:   the controlling workqueue structure
- * @work: the delayed work struct
+ * @dwork: the delayed work struct
  */
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
-				       struct work_struct *work)
+				       struct delayed_work *dwork)
 {
-	while (!cancel_delayed_work(work))
+	while (!cancel_delayed_work(dwork))
 		flush_workqueue(wq);
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
@@ -552,11 +557,11 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
 /**
  * cancel_rearming_delayed_work - reliably kill off a delayed keventd
  *			work whose handler rearms the delayed work.
- * @work: the delayed work struct
+ * @dwork: the delayed work struct
  */
-void cancel_rearming_delayed_work(struct work_struct *work)
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
 {
-	cancel_rearming_delayed_workqueue(keventd_wq, work);
+	cancel_rearming_delayed_workqueue(keventd_wq, dwork);
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_work);
 
diff --git a/mm/slab.c b/mm/slab.c
index 3c4a7e34edd..a65bc5e992c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -753,7 +753,7 @@ int slab_is_available(void)
 	return g_cpucache_up == FULL;
 }
 
-static DEFINE_PER_CPU(struct work_struct, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, reap_work);
 
 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 {
@@ -916,16 +916,16 @@ static void next_reap_node(void)
  */
 static void __devinit start_cpu_timer(int cpu)
 {
-	struct work_struct *reap_work = &per_cpu(reap_work, cpu);
+	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
 
 	/*
 	 * When this gets called from do_initcalls via cpucache_init(),
 	 * init_workqueues() has already run, so keventd will be setup
 	 * at that time.
 	 */
-	if (keventd_up() && reap_work->func == NULL) {
+	if (keventd_up() && reap_work->work.func == NULL) {
 		init_reap_node(cpu);
-		INIT_WORK(reap_work, cache_reap, NULL);
+		INIT_DELAYED_WORK(reap_work, cache_reap, NULL);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
 }
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 4b36114744c..f2ed09e25df 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -35,7 +35,7 @@ static unsigned long linkwatch_flags;
 static unsigned long linkwatch_nextevent;
 
 static void linkwatch_event(void *dummy);
-static DECLARE_WORK(linkwatch_work, linkwatch_event, NULL);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event, NULL);
 
 static LIST_HEAD(lweventlist);
 static DEFINE_SPINLOCK(lweventlist_lock);
@@ -171,10 +171,9 @@ void linkwatch_fire_event(struct net_device *dev)
 			unsigned long delay = linkwatch_nextevent - jiffies;
 
 			/* If we wrap around we'll delay it by at most HZ. */
-			if (!delay || delay > HZ)
-				schedule_work(&linkwatch_work);
-			else
-				schedule_delayed_work(&linkwatch_work, delay);
+			if (delay > HZ)
+				delay = 0;
+			schedule_delayed_work(&linkwatch_work, delay);
 		}
 	}
 }
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 00cb388ece0..d5725cb1491 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -285,7 +285,7 @@ static struct file_operations content_file_operations;
 static struct file_operations cache_flush_operations;
 
 static void do_cache_clean(void *data);
-static DECLARE_WORK(cache_cleaner, do_cache_clean, NULL);
+static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean, NULL);
 
 void cache_register(struct cache_detail *cd)
 {
@@ -337,7 +337,7 @@ void cache_register(struct cache_detail *cd)
 	spin_unlock(&cache_list_lock);
 
 	/* start the cleaning process */
-	schedule_work(&cache_cleaner);
+	schedule_delayed_work(&cache_cleaner, 0);
 }
 
 int cache_unregister(struct cache_detail *cd)
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 9a0b41a97f9..97be3f7fed4 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -837,7 +837,8 @@ init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		INIT_LIST_HEAD(&rpci->pipe);
 		rpci->pipelen = 0;
 		init_waitqueue_head(&rpci->waitq);
-		INIT_WORK(&rpci->queue_timeout, rpc_timeout_upcall_queue, rpci);
+		INIT_DELAYED_WORK(&rpci->queue_timeout,
+				    rpc_timeout_upcall_queue, rpci);
 		rpci->ops = NULL;
 	}
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 757fc91ef25..3c7532cd009 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1262,7 +1262,7 @@ static void xs_connect(struct rpc_task *task)
 			xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
 	} else {
 		dprintk("RPC:      xs_connect scheduled xprt %p\n", xprt);
-		schedule_work(&xprt->connect_worker);
+		schedule_delayed_work(&xprt->connect_worker, 0);
 
 		/* flush_scheduled_work can sleep... */
 		if (!RPC_IS_ASYNC(task))
@@ -1375,7 +1375,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
-	INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt);
+	INIT_DELAYED_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_UDP_CONN_TO;
 	xprt->reestablish_timeout = XS_UDP_REEST_TO;
@@ -1420,7 +1420,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
-	INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);
+	INIT_DELAYED_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_TCP_CONN_TO;
 	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-- 
cgit v1.2.3-70-g09d2


From 65f27f38446e1976cc98fd3004b110fedcddd189 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2006 14:55:48 +0000
Subject: WorkStruct: Pass the work_struct pointer instead of context data

Pass the work_struct pointer to the work function rather than context data.
The work function can use container_of() to work out the data.

For the cases where the container of the work_struct may go away the moment the
pending bit is cleared, it is made possible to defer the release of the
structure by deferring the clearing of the pending bit.

To make this work, an extra flag is introduced into the management side of the
work_struct.  This governs auto-release of the structure upon execution.

Ordinarily, the work queue executor would release the work_struct for further
scheduling or deallocation by clearing the pending bit prior to jumping to the
work function.  This means that, unless the driver makes some guarantee itself
that the work_struct won't go away, the work function may not access anything
else in the work_struct or its container lest they be deallocated..  This is a
problem if the auxiliary data is taken away (as done by the last patch).

However, if the pending bit is *not* cleared before jumping to the work
function, then the work function *may* access the work_struct and its container
with no problems.  But then the work function must itself release the
work_struct by calling work_release().

In most cases, automatic release is fine, so this is the default.  Special
initiators exist for the non-auto-release case (ending in _NAR).


Signed-Off-By: David Howells <dhowells@redhat.com>
---
 arch/x86_64/kernel/mce.c           |  6 +--
 arch/x86_64/kernel/smpboot.c       | 12 +++--
 arch/x86_64/kernel/time.c          |  4 +-
 block/as-iosched.c                 |  7 +--
 block/cfq-iosched.c                |  8 +--
 block/ll_rw_blk.c                  |  8 +--
 crypto/cryptomgr.c                 |  7 +--
 drivers/acpi/osl.c                 | 25 +++-------
 drivers/ata/libata-core.c          | 20 ++++----
 drivers/ata/libata-scsi.c          | 14 +++---
 drivers/ata/libata.h               |  4 +-
 drivers/block/floppy.c             |  6 +--
 drivers/char/random.c              |  6 +--
 drivers/char/sysrq.c               |  4 +-
 drivers/char/tty_io.c              | 31 ++++++------
 drivers/char/vt.c                  |  6 +--
 drivers/cpufreq/cpufreq.c          | 10 ++--
 drivers/input/keyboard/atkbd.c     |  6 +--
 drivers/input/serio/libps2.c       |  6 +--
 drivers/net/e1000/e1000_main.c     | 10 ++--
 drivers/pci/pcie/aer/aerdrv.c      |  2 +-
 drivers/pci/pcie/aer/aerdrv.h      |  2 +-
 drivers/pci/pcie/aer/aerdrv_core.c |  8 +--
 drivers/scsi/scsi_scan.c           |  7 +--
 drivers/scsi/scsi_sysfs.c          | 10 ++--
 fs/aio.c                           | 14 +++---
 fs/bio.c                           |  6 +--
 fs/file.c                          |  6 ++-
 fs/nfs/client.c                    |  2 +-
 fs/nfs/namespace.c                 |  9 ++--
 fs/nfs/nfs4_fs.h                   |  2 +-
 fs/nfs/nfs4renewd.c                |  5 +-
 include/linux/libata.h             |  3 +-
 include/linux/workqueue.h          | 99 +++++++++++++++++++++++++++++---------
 include/net/inet_timewait_sock.h   |  2 +-
 ipc/util.c                         |  7 ++-
 kernel/kmod.c                      | 16 +++---
 kernel/kthread.c                   | 13 +++--
 kernel/power/poweroff.c            |  4 +-
 kernel/sys.c                       |  4 +-
 kernel/workqueue.c                 | 19 +++-----
 mm/slab.c                          |  6 +--
 net/core/link_watch.c              |  6 +--
 net/ipv4/inet_timewait_sock.c      |  5 +-
 net/ipv4/tcp_minisocks.c           |  3 +-
 net/sunrpc/cache.c                 |  6 +--
 net/sunrpc/rpc_pipe.c              |  7 +--
 net/sunrpc/sched.c                 |  8 +--
 net/sunrpc/xprt.c                  |  7 +--
 net/sunrpc/xprtsock.c              | 18 ++++---
 security/keys/key.c                |  6 +--
 51 files changed, 293 insertions(+), 219 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 5306f263090..c7587fc3901 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -306,8 +306,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
  */
 
 static int check_interval = 5 * 60; /* 5 minutes */
-static void mcheck_timer(void *data);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer, NULL);
+static void mcheck_timer(struct work_struct *work);
+static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
 
 static void mcheck_check_cpu(void *info)
 {
@@ -315,7 +315,7 @@ static void mcheck_check_cpu(void *info)
 		do_machine_check(NULL, 0);
 }
 
-static void mcheck_timer(void *data)
+static void mcheck_timer(struct work_struct *work)
 {
 	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 	schedule_delayed_work(&mcheck_work, check_interval * HZ);
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 62c2e747af5..9800147c4c6 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -753,14 +753,16 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
 }
 
 struct create_idle {
+	struct work_struct work;
 	struct task_struct *idle;
 	struct completion done;
 	int cpu;
 };
 
-void do_fork_idle(void *_c_idle)
+void do_fork_idle(struct work_struct *work)
 {
-	struct create_idle *c_idle = _c_idle;
+	struct create_idle *c_idle =
+		container_of(work, struct create_idle, work);
 
 	c_idle->idle = fork_idle(c_idle->cpu);
 	complete(&c_idle->done);
@@ -775,10 +777,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 	int timeout;
 	unsigned long start_rip;
 	struct create_idle c_idle = {
+		.work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
 		.cpu = cpu,
 		.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
 	};
-	DECLARE_WORK(work, do_fork_idle, &c_idle);
 
 	/* allocate memory for gdts of secondary cpus. Hotplug is considered */
 	if (!cpu_gdt_descr[cpu].address &&
@@ -825,9 +827,9 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
 	 * thread.
 	 */
 	if (!keventd_up() || current_is_keventd())
-		work.func(work.data);
+		c_idle.work.func(&c_idle.work);
 	else {
-		schedule_work(&work);
+		schedule_work(&c_idle.work);
 		wait_for_completion(&c_idle.done);
 	}
 
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index e3ef544d2cf..9f05bc9b2da 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -563,7 +563,7 @@ static unsigned int cpufreq_delayed_issched = 0;
 static unsigned int cpufreq_init = 0;
 static struct work_struct cpufreq_delayed_get_work;
 
-static void handle_cpufreq_delayed_get(void *v)
+static void handle_cpufreq_delayed_get(struct work_struct *v)
 {
 	unsigned int cpu;
 	for_each_online_cpu(cpu) {
@@ -639,7 +639,7 @@ static struct notifier_block time_cpufreq_notifier_block = {
 
 static int __init cpufreq_tsc(void)
 {
-	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
+	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
 	if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
 				       CPUFREQ_TRANSITION_NOTIFIER))
 		cpufreq_init = 1;
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 50b95e4c142..f371c935999 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1274,9 +1274,10 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
  *
  * FIXME! dispatch queue is not a queue at all!
  */
-static void as_work_handler(void *data)
+static void as_work_handler(struct work_struct *work)
 {
-	struct request_queue *q = data;
+	struct as_data *ad = container_of(work, struct as_data, antic_work);
+	struct request_queue *q = ad->q;
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -1332,7 +1333,7 @@ static void *as_init_queue(request_queue_t *q, elevator_t *e)
 	ad->antic_timer.function = as_antic_timeout;
 	ad->antic_timer.data = (unsigned long)q;
 	init_timer(&ad->antic_timer);
-	INIT_WORK(&ad->antic_work, as_work_handler, q);
+	INIT_WORK(&ad->antic_work, as_work_handler);
 
 	INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
 	INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1d9c3c70a9a..6cec3a1dccb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1841,9 +1841,11 @@ queue_fail:
 	return 1;
 }
 
-static void cfq_kick_queue(void *data)
+static void cfq_kick_queue(struct work_struct *work)
 {
-	request_queue_t *q = data;
+	struct cfq_data *cfqd =
+		container_of(work, struct cfq_data, unplug_work);
+	request_queue_t *q = cfqd->queue;
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -1987,7 +1989,7 @@ static void *cfq_init_queue(request_queue_t *q, elevator_t *e)
 	cfqd->idle_class_timer.function = cfq_idle_class_timer;
 	cfqd->idle_class_timer.data = (unsigned long) cfqd;
 
-	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
+	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
 
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 9eaee664053..eb4cf6df737 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -34,7 +34,7 @@
  */
 #include <scsi/scsi_cmnd.h>
 
-static void blk_unplug_work(void *data);
+static void blk_unplug_work(struct work_struct *work);
 static void blk_unplug_timeout(unsigned long data);
 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
 static void init_request_from_bio(struct request *req, struct bio *bio);
@@ -227,7 +227,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 	if (q->unplug_delay == 0)
 		q->unplug_delay = 1;
 
-	INIT_WORK(&q->unplug_work, blk_unplug_work, q);
+	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	q->unplug_timer.function = blk_unplug_timeout;
 	q->unplug_timer.data = (unsigned long)q;
@@ -1631,9 +1631,9 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
 	}
 }
 
-static void blk_unplug_work(void *data)
+static void blk_unplug_work(struct work_struct *work)
 {
-	request_queue_t *q = data;
+	request_queue_t *q = container_of(work, request_queue_t, unplug_work);
 
 	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
diff --git a/crypto/cryptomgr.c b/crypto/cryptomgr.c
index 9b5b1560106..2ebffb84f1d 100644
--- a/crypto/cryptomgr.c
+++ b/crypto/cryptomgr.c
@@ -40,9 +40,10 @@ struct cryptomgr_param {
 	char template[CRYPTO_MAX_ALG_NAME];
 };
 
-static void cryptomgr_probe(void *data)
+static void cryptomgr_probe(struct work_struct *work)
 {
-	struct cryptomgr_param *param = data;
+	struct cryptomgr_param *param =
+		container_of(work, struct cryptomgr_param, work);
 	struct crypto_template *tmpl;
 	struct crypto_instance *inst;
 	int err;
@@ -112,7 +113,7 @@ static int cryptomgr_schedule_probe(struct crypto_larval *larval)
 	param->larval.type = larval->alg.cra_flags;
 	param->larval.mask = larval->mask;
 
-	INIT_WORK(&param->work, cryptomgr_probe, param);
+	INIT_WORK(&param->work, cryptomgr_probe);
 	schedule_work(&param->work);
 
 	return NOTIFY_STOP;
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 068fe4f100b..02b30ae6a68 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -50,6 +50,7 @@ ACPI_MODULE_NAME("osl")
 struct acpi_os_dpc {
 	acpi_osd_exec_callback function;
 	void *context;
+	struct work_struct work;
 };
 
 #ifdef CONFIG_ACPI_CUSTOM_DSDT
@@ -564,12 +565,9 @@ void acpi_os_derive_pci_id(acpi_handle rhandle,	/* upper bound  */
 	acpi_os_derive_pci_id_2(rhandle, chandle, id, &is_bridge, &bus_number);
 }
 
-static void acpi_os_execute_deferred(void *context)
+static void acpi_os_execute_deferred(struct work_struct *work)
 {
-	struct acpi_os_dpc *dpc = NULL;
-
-
-	dpc = (struct acpi_os_dpc *)context;
+	struct acpi_os_dpc *dpc = container_of(work, struct acpi_os_dpc, work);
 	if (!dpc) {
 		printk(KERN_ERR PREFIX "Invalid (NULL) context\n");
 		return;
@@ -602,7 +600,6 @@ acpi_status acpi_os_execute(acpi_execute_type type,
 {
 	acpi_status status = AE_OK;
 	struct acpi_os_dpc *dpc;
-	struct work_struct *task;
 
 	ACPI_FUNCTION_TRACE("os_queue_for_execution");
 
@@ -615,28 +612,22 @@ acpi_status acpi_os_execute(acpi_execute_type type,
 
 	/*
 	 * Allocate/initialize DPC structure.  Note that this memory will be
-	 * freed by the callee.  The kernel handles the tq_struct list  in a
+	 * freed by the callee.  The kernel handles the work_struct list  in a
 	 * way that allows us to also free its memory inside the callee.
 	 * Because we may want to schedule several tasks with different
 	 * parameters we can't use the approach some kernel code uses of
-	 * having a static tq_struct.
-	 * We can save time and code by allocating the DPC and tq_structs
-	 * from the same memory.
+	 * having a static work_struct.
 	 */
 
-	dpc =
-	    kmalloc(sizeof(struct acpi_os_dpc) + sizeof(struct work_struct),
-		    GFP_ATOMIC);
+	dpc = kmalloc(sizeof(struct acpi_os_dpc), GFP_ATOMIC);
 	if (!dpc)
 		return_ACPI_STATUS(AE_NO_MEMORY);
 
 	dpc->function = function;
 	dpc->context = context;
 
-	task = (void *)(dpc + 1);
-	INIT_WORK(task, acpi_os_execute_deferred, (void *)dpc);
-
-	if (!queue_work(kacpid_wq, task)) {
+	INIT_WORK(&dpc->work, acpi_os_execute_deferred);
+	if (!queue_work(kacpid_wq, &dpc->work)) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
 				  "Call to queue_work() failed.\n"));
 		kfree(dpc);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 0bb4b4dced7..b5f2da6ac80 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -914,7 +914,7 @@ static unsigned int ata_id_xfermask(const u16 *id)
  *	ata_port_queue_task - Queue port_task
  *	@ap: The ata_port to queue port_task for
  *	@fn: workqueue function to be scheduled
- *	@data: data value to pass to workqueue function
+ *	@data: data for @fn to use
  *	@delay: delay time for workqueue function
  *
  *	Schedule @fn(@data) for execution after @delay jiffies using
@@ -929,7 +929,7 @@ static unsigned int ata_id_xfermask(const u16 *id)
  *	LOCKING:
  *	Inherited from caller.
  */
-void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *), void *data,
+void ata_port_queue_task(struct ata_port *ap, work_func_t fn, void *data,
 			 unsigned long delay)
 {
 	int rc;
@@ -937,7 +937,8 @@ void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *), void *data,
 	if (ap->pflags & ATA_PFLAG_FLUSH_PORT_TASK)
 		return;
 
-	PREPARE_DELAYED_WORK(&ap->port_task, fn, data);
+	PREPARE_DELAYED_WORK(&ap->port_task, fn);
+	ap->port_task_data = data;
 
 	rc = queue_delayed_work(ata_wq, &ap->port_task, delay);
 
@@ -4292,10 +4293,11 @@ fsm_start:
 	return poll_next;
 }
 
-static void ata_pio_task(void *_data)
+static void ata_pio_task(struct work_struct *work)
 {
-	struct ata_queued_cmd *qc = _data;
-	struct ata_port *ap = qc->ap;
+	struct ata_port *ap =
+		container_of(work, struct ata_port, port_task.work);
+	struct ata_queued_cmd *qc = ap->port_task_data;
 	u8 status;
 	int poll_next;
 
@@ -5317,9 +5319,9 @@ void ata_port_init(struct ata_port *ap, struct ata_host *host,
 	ap->msg_enable = ATA_MSG_DRV | ATA_MSG_ERR | ATA_MSG_WARN;
 #endif
 
-	INIT_DELAYED_WORK(&ap->port_task, NULL, NULL);
-	INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug, ap);
-	INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan, ap);
+	INIT_DELAYED_WORK(&ap->port_task, NULL);
+	INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug);
+	INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan);
 	INIT_LIST_HEAD(&ap->eh_done_q);
 	init_waitqueue_head(&ap->eh_wait_q);
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 5c1fc467fc7..c872b324dbd 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3079,7 +3079,7 @@ static void ata_scsi_remove_dev(struct ata_device *dev)
 
 /**
  *	ata_scsi_hotplug - SCSI part of hotplug
- *	@data: Pointer to ATA port to perform SCSI hotplug on
+ *	@work: Pointer to ATA port to perform SCSI hotplug on
  *
  *	Perform SCSI part of hotplug.  It's executed from a separate
  *	workqueue after EH completes.  This is necessary because SCSI
@@ -3089,9 +3089,10 @@ static void ata_scsi_remove_dev(struct ata_device *dev)
  *	LOCKING:
  *	Kernel thread context (may sleep).
  */
-void ata_scsi_hotplug(void *data)
+void ata_scsi_hotplug(struct work_struct *work)
 {
-	struct ata_port *ap = data;
+	struct ata_port *ap =
+		container_of(work, struct ata_port, hotplug_task.work);
 	int i;
 
 	if (ap->pflags & ATA_PFLAG_UNLOADING) {
@@ -3190,7 +3191,7 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 
 /**
  *	ata_scsi_dev_rescan - initiate scsi_rescan_device()
- *	@data: Pointer to ATA port to perform scsi_rescan_device()
+ *	@work: Pointer to ATA port to perform scsi_rescan_device()
  *
  *	After ATA pass thru (SAT) commands are executed successfully,
  *	libata need to propagate the changes to SCSI layer.  This
@@ -3200,9 +3201,10 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
  *	LOCKING:
  *	Kernel thread context (may sleep).
  */
-void ata_scsi_dev_rescan(void *data)
+void ata_scsi_dev_rescan(struct work_struct *work)
 {
-	struct ata_port *ap = data;
+	struct ata_port *ap =
+		container_of(work, struct ata_port, scsi_rescan_task);
 	struct ata_device *dev;
 	unsigned int i;
 
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 0ed263be652..7e0f3aff873 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -81,7 +81,7 @@ extern struct scsi_transport_template ata_scsi_transport_template;
 
 extern void ata_scsi_scan_host(struct ata_port *ap);
 extern int ata_scsi_offline_dev(struct ata_device *dev);
-extern void ata_scsi_hotplug(void *data);
+extern void ata_scsi_hotplug(struct work_struct *work);
 extern unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf,
 			       unsigned int buflen);
 
@@ -111,7 +111,7 @@ extern void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
                         unsigned int (*actor) (struct ata_scsi_args *args,
                                            u8 *rbuf, unsigned int buflen));
 extern void ata_schedule_scsi_eh(struct Scsi_Host *shost);
-extern void ata_scsi_dev_rescan(void *data);
+extern void ata_scsi_dev_rescan(struct work_struct *work);
 extern int ata_bus_probe(struct ata_port *ap);
 
 /* libata-eh.c */
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index aa1eb4466f9..3f1b38276e9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -992,11 +992,11 @@ static void empty(void)
 {
 }
 
-static DECLARE_WORK(floppy_work, NULL, NULL);
+static DECLARE_WORK(floppy_work, NULL);
 
 static void schedule_bh(void (*handler) (void))
 {
-	PREPARE_WORK(&floppy_work, (work_func_t)handler, NULL);
+	PREPARE_WORK(&floppy_work, (work_func_t)handler);
 	schedule_work(&floppy_work);
 }
 
@@ -1008,7 +1008,7 @@ static void cancel_activity(void)
 
 	spin_lock_irqsave(&floppy_lock, flags);
 	do_floppy = NULL;
-	PREPARE_WORK(&floppy_work, (work_func_t)empty, NULL);
+	PREPARE_WORK(&floppy_work, (work_func_t)empty);
 	del_timer(&fd_timer);
 	spin_unlock_irqrestore(&floppy_lock, flags);
 }
diff --git a/drivers/char/random.c b/drivers/char/random.c
index f2ab61f3e8a..fa764688cad 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1422,9 +1422,9 @@ static struct keydata {
 
 static unsigned int ip_cnt;
 
-static void rekey_seq_generator(void *private_);
+static void rekey_seq_generator(struct work_struct *work);
 
-static DECLARE_DELAYED_WORK(rekey_work, rekey_seq_generator, NULL);
+static DECLARE_DELAYED_WORK(rekey_work, rekey_seq_generator);
 
 /*
  * Lock avoidance:
@@ -1438,7 +1438,7 @@ static DECLARE_DELAYED_WORK(rekey_work, rekey_seq_generator, NULL);
  * happen, and even if that happens only a not perfectly compliant
  * ISN is generated, nothing fatal.
  */
-static void rekey_seq_generator(void *private_)
+static void rekey_seq_generator(struct work_struct *work)
 {
 	struct keydata *keyptr = &ip_keydata[1 ^ (ip_cnt & 1)];
 
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 5f49280779f..c64f5bcff94 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -219,13 +219,13 @@ static struct sysrq_key_op sysrq_term_op = {
 	.enable_mask	= SYSRQ_ENABLE_SIGNAL,
 };
 
-static void moom_callback(void *ignored)
+static void moom_callback(struct work_struct *ignored)
 {
 	out_of_memory(&NODE_DATA(0)->node_zonelists[ZONE_NORMAL],
 			GFP_KERNEL, 0);
 }
 
-static DECLARE_WORK(moom_work, moom_callback, NULL);
+static DECLARE_WORK(moom_work, moom_callback);
 
 static void sysrq_handle_moom(int key, struct tty_struct *tty)
 {
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 7297acfe520..83e9e7d9b58 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1254,7 +1254,7 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush);
 	
 /**
  *	do_tty_hangup		-	actual handler for hangup events
- *	@data: tty device
+ *	@work: tty device
  *
  *	This can be called by the "eventd" kernel thread.  That is process
  *	synchronous but doesn't hold any locks, so we need to make sure we
@@ -1274,9 +1274,10 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush);
  *		tasklist_lock to walk task list for hangup event
  *
  */
-static void do_tty_hangup(void *data)
+static void do_tty_hangup(struct work_struct *work)
 {
-	struct tty_struct *tty = (struct tty_struct *) data;
+	struct tty_struct *tty =
+		container_of(work, struct tty_struct, hangup_work);
 	struct file * cons_filp = NULL;
 	struct file *filp, *f = NULL;
 	struct task_struct *p;
@@ -1433,7 +1434,7 @@ void tty_vhangup(struct tty_struct * tty)
 
 	printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf));
 #endif
-	do_tty_hangup((void *) tty);
+	do_tty_hangup(&tty->hangup_work);
 }
 EXPORT_SYMBOL(tty_vhangup);
 
@@ -3304,12 +3305,13 @@ int tty_ioctl(struct inode * inode, struct file * file,
  * Nasty bug: do_SAK is being called in interrupt context.  This can
  * deadlock.  We punt it up to process context.  AKPM - 16Mar2001
  */
-static void __do_SAK(void *arg)
+static void __do_SAK(struct work_struct *work)
 {
+	struct tty_struct *tty =
+		container_of(work, struct tty_struct, SAK_work);
 #ifdef TTY_SOFT_SAK
 	tty_hangup(tty);
 #else
-	struct tty_struct *tty = arg;
 	struct task_struct *g, *p;
 	int session;
 	int		i;
@@ -3388,7 +3390,7 @@ void do_SAK(struct tty_struct *tty)
 {
 	if (!tty)
 		return;
-	PREPARE_WORK(&tty->SAK_work, __do_SAK, tty);
+	PREPARE_WORK(&tty->SAK_work, __do_SAK);
 	schedule_work(&tty->SAK_work);
 }
 
@@ -3396,7 +3398,7 @@ EXPORT_SYMBOL(do_SAK);
 
 /**
  *	flush_to_ldisc
- *	@private_: tty structure passed from work queue.
+ *	@work: tty structure passed from work queue.
  *
  *	This routine is called out of the software interrupt to flush data
  *	from the buffer chain to the line discipline.
@@ -3406,9 +3408,10 @@ EXPORT_SYMBOL(do_SAK);
  *	receive_buf method is single threaded for each tty instance.
  */
  
-static void flush_to_ldisc(void *private_)
+static void flush_to_ldisc(struct work_struct *work)
 {
-	struct tty_struct *tty = (struct tty_struct *) private_;
+	struct tty_struct *tty =
+		container_of(work, struct tty_struct, buf.work.work);
 	unsigned long 	flags;
 	struct tty_ldisc *disc;
 	struct tty_buffer *tbuf, *head;
@@ -3553,7 +3556,7 @@ void tty_flip_buffer_push(struct tty_struct *tty)
 	spin_unlock_irqrestore(&tty->buf.lock, flags);
 
 	if (tty->low_latency)
-		flush_to_ldisc((void *) tty);
+		flush_to_ldisc(&tty->buf.work.work);
 	else
 		schedule_delayed_work(&tty->buf.work, 1);
 }
@@ -3580,17 +3583,17 @@ static void initialize_tty_struct(struct tty_struct *tty)
 	tty->overrun_time = jiffies;
 	tty->buf.head = tty->buf.tail = NULL;
 	tty_buffer_init(tty);
-	INIT_DELAYED_WORK(&tty->buf.work, flush_to_ldisc, tty);
+	INIT_DELAYED_WORK(&tty->buf.work, flush_to_ldisc);
 	init_MUTEX(&tty->buf.pty_sem);
 	mutex_init(&tty->termios_mutex);
 	init_waitqueue_head(&tty->write_wait);
 	init_waitqueue_head(&tty->read_wait);
-	INIT_WORK(&tty->hangup_work, do_tty_hangup, tty);
+	INIT_WORK(&tty->hangup_work, do_tty_hangup);
 	mutex_init(&tty->atomic_read_lock);
 	mutex_init(&tty->atomic_write_lock);
 	spin_lock_init(&tty->read_lock);
 	INIT_LIST_HEAD(&tty->tty_files);
-	INIT_WORK(&tty->SAK_work, NULL, NULL);
+	INIT_WORK(&tty->SAK_work, NULL);
 }
 
 /*
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 8e4413f6fba..8ee04adc37f 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -155,7 +155,7 @@ static void con_flush_chars(struct tty_struct *tty);
 static void set_vesa_blanking(char __user *p);
 static void set_cursor(struct vc_data *vc);
 static void hide_cursor(struct vc_data *vc);
-static void console_callback(void *ignored);
+static void console_callback(struct work_struct *ignored);
 static void blank_screen_t(unsigned long dummy);
 static void set_palette(struct vc_data *vc);
 
@@ -174,7 +174,7 @@ static int vesa_blank_mode; /* 0:none 1:suspendV 2:suspendH 3:powerdown */
 static int blankinterval = 10*60*HZ;
 static int vesa_off_interval;
 
-static DECLARE_WORK(console_work, console_callback, NULL);
+static DECLARE_WORK(console_work, console_callback);
 
 /*
  * fg_console is the current virtual console,
@@ -2154,7 +2154,7 @@ out:
  * with other console code and prevention of re-entrancy is
  * ensured with console_sem.
  */
-static void console_callback(void *ignored)
+static void console_callback(struct work_struct *ignored)
 {
 	acquire_console_sem();
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index dd0c2623e27..7a7c6e6dfe4 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(cpufreq_driver_lock);
 
 /* internal prototypes */
 static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
-static void handle_update(void *data);
+static void handle_update(struct work_struct *work);
 
 /**
  * Two notifier lists: the "policy" list is involved in the
@@ -665,7 +665,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 	mutex_init(&policy->lock);
 	mutex_lock(&policy->lock);
 	init_completion(&policy->kobj_unregister);
-	INIT_WORK(&policy->update, handle_update, (void *)(long)cpu);
+	INIT_WORK(&policy->update, handle_update);
 
 	/* call driver. From then on the cpufreq must be able
 	 * to accept all calls to ->verify and ->setpolicy for this CPU
@@ -895,9 +895,11 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
 }
 
 
-static void handle_update(void *data)
+static void handle_update(struct work_struct *work)
 {
-	unsigned int cpu = (unsigned int)(long)data;
+	struct cpufreq_policy *policy =
+		container_of(work, struct cpufreq_policy, update);
+	unsigned int cpu = policy->cpu;
 	dprintk("handle_update for cpu %u called\n", cpu);
 	cpufreq_update_policy(cpu);
 }
diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c
index cbb93669d1c..8451b29a3db 100644
--- a/drivers/input/keyboard/atkbd.c
+++ b/drivers/input/keyboard/atkbd.c
@@ -567,9 +567,9 @@ static int atkbd_set_leds(struct atkbd *atkbd)
  * interrupt context.
  */
 
-static void atkbd_event_work(void *data)
+static void atkbd_event_work(struct work_struct *work)
 {
-	struct atkbd *atkbd = data;
+	struct atkbd *atkbd = container_of(work, struct atkbd, event_work);
 
 	mutex_lock(&atkbd->event_mutex);
 
@@ -943,7 +943,7 @@ static int atkbd_connect(struct serio *serio, struct serio_driver *drv)
 
 	atkbd->dev = dev;
 	ps2_init(&atkbd->ps2dev, serio);
-	INIT_WORK(&atkbd->event_work, atkbd_event_work, atkbd);
+	INIT_WORK(&atkbd->event_work, atkbd_event_work);
 	mutex_init(&atkbd->event_mutex);
 
 	switch (serio->id.type) {
diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c
index e5b1b60757b..b3e84d3bb7f 100644
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -251,9 +251,9 @@ EXPORT_SYMBOL(ps2_command);
  * ps2_schedule_command(), to a PS/2 device (keyboard, mouse, etc.)
  */
 
-static void ps2_execute_scheduled_command(void *data)
+static void ps2_execute_scheduled_command(struct work_struct *work)
 {
-	struct ps2work *ps2work = data;
+	struct ps2work *ps2work = container_of(work, struct ps2work, work);
 
 	ps2_command(ps2work->ps2dev, ps2work->param, ps2work->command);
 	kfree(ps2work);
@@ -278,7 +278,7 @@ int ps2_schedule_command(struct ps2dev *ps2dev, unsigned char *param, int comman
 	ps2work->ps2dev = ps2dev;
 	ps2work->command = command;
 	memcpy(ps2work->param, param, send);
-	INIT_WORK(&ps2work->work, ps2_execute_scheduled_command, ps2work);
+	INIT_WORK(&ps2work->work, ps2_execute_scheduled_command);
 
 	if (!schedule_work(&ps2work->work)) {
 		kfree(ps2work);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 726ec5e88ab..03294400bc9 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -183,7 +183,7 @@ void e1000_set_ethtool_ops(struct net_device *netdev);
 static void e1000_enter_82542_rst(struct e1000_adapter *adapter);
 static void e1000_leave_82542_rst(struct e1000_adapter *adapter);
 static void e1000_tx_timeout(struct net_device *dev);
-static void e1000_reset_task(struct net_device *dev);
+static void e1000_reset_task(struct work_struct *work);
 static void e1000_smartspeed(struct e1000_adapter *adapter);
 static int e1000_82547_fifo_workaround(struct e1000_adapter *adapter,
                                        struct sk_buff *skb);
@@ -908,8 +908,7 @@ e1000_probe(struct pci_dev *pdev,
 	adapter->phy_info_timer.function = &e1000_update_phy_info;
 	adapter->phy_info_timer.data = (unsigned long) adapter;
 
-	INIT_WORK(&adapter->reset_task,
-		(void (*)(void *))e1000_reset_task, netdev);
+	INIT_WORK(&adapter->reset_task, e1000_reset_task);
 
 	e1000_check_options(adapter);
 
@@ -3154,9 +3153,10 @@ e1000_tx_timeout(struct net_device *netdev)
 }
 
 static void
-e1000_reset_task(struct net_device *netdev)
+e1000_reset_task(struct work_struct *work)
 {
-	struct e1000_adapter *adapter = netdev_priv(netdev);
+	struct e1000_adapter *adapter =
+		container_of(work, struct e1000_adapter, reset_task);
 
 	e1000_reinit_locked(adapter);
 }
diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 04c43ef529a..55866b6b26f 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -160,7 +160,7 @@ static struct aer_rpc* aer_alloc_rpc(struct pcie_device *dev)
 	rpc->e_lock = SPIN_LOCK_UNLOCKED;
 
 	rpc->rpd = dev;
-	INIT_WORK(&rpc->dpc_handler, aer_isr, (void *)dev);
+	INIT_WORK(&rpc->dpc_handler, aer_isr);
 	rpc->prod_idx = rpc->cons_idx = 0;
 	mutex_init(&rpc->rpc_mutex);
 	init_waitqueue_head(&rpc->wait_release);
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index daf0cad88fc..3c0a58f64dd 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -118,7 +118,7 @@ extern struct bus_type pcie_port_bus_type;
 extern void aer_enable_rootport(struct aer_rpc *rpc);
 extern void aer_delete_rootport(struct aer_rpc *rpc);
 extern int aer_init(struct pcie_device *dev);
-extern void aer_isr(void *context);
+extern void aer_isr(struct work_struct *work);
 extern void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
 extern int aer_osc_setup(struct pci_dev *dev);
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 1c7e660d653..08e13033ced 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -690,14 +690,14 @@ static void aer_isr_one_error(struct pcie_device *p_device,
 
 /**
  * aer_isr - consume errors detected by root port
- * @context: pointer to a private data of pcie device
+ * @work: definition of this work item
  *
  * Invoked, as DPC, when root port records new detected error
  **/
-void aer_isr(void *context)
+void aer_isr(struct work_struct *work)
 {
-	struct pcie_device *p_device = (struct pcie_device *) context;
-	struct aer_rpc *rpc = get_service_data(p_device);
+	struct aer_rpc *rpc = container_of(work, struct aer_rpc, dpc_handler);
+	struct pcie_device *p_device = rpc->rpd;
 	struct aer_err_source *e_src;
 
 	mutex_lock(&rpc->rpc_mutex);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 94a274645f6..d3c5e964c96 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -362,9 +362,10 @@ static struct scsi_target *scsi_alloc_target(struct device *parent,
 	goto retry;
 }
 
-static void scsi_target_reap_usercontext(void *data)
+static void scsi_target_reap_usercontext(struct work_struct *work)
 {
-	struct scsi_target *starget = data;
+	struct scsi_target *starget =
+		container_of(work, struct scsi_target, ew.work);
 	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
 	unsigned long flags;
 
@@ -400,7 +401,7 @@ void scsi_target_reap(struct scsi_target *starget)
 		starget->state = STARGET_DEL;
 		spin_unlock_irqrestore(shost->host_lock, flags);
 		execute_in_process_context(scsi_target_reap_usercontext,
-					   starget, &starget->ew);
+					   &starget->ew);
 		return;
 
 	}
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index e1a91665d1c..259c90cfa36 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -218,16 +218,16 @@ static void scsi_device_cls_release(struct class_device *class_dev)
 	put_device(&sdev->sdev_gendev);
 }
 
-static void scsi_device_dev_release_usercontext(void *data)
+static void scsi_device_dev_release_usercontext(struct work_struct *work)
 {
-	struct device *dev = data;
 	struct scsi_device *sdev;
 	struct device *parent;
 	struct scsi_target *starget;
 	unsigned long flags;
 
-	parent = dev->parent;
-	sdev = to_scsi_device(dev);
+	sdev = container_of(work, struct scsi_device, ew.work);
+
+	parent = sdev->sdev_gendev.parent;
 	starget = to_scsi_target(parent);
 
 	spin_lock_irqsave(sdev->host->host_lock, flags);
@@ -258,7 +258,7 @@ static void scsi_device_dev_release_usercontext(void *data)
 static void scsi_device_dev_release(struct device *dev)
 {
 	struct scsi_device *sdp = to_scsi_device(dev);
-	execute_in_process_context(scsi_device_dev_release_usercontext, dev,
+	execute_in_process_context(scsi_device_dev_release_usercontext,
 				   &sdp->ew);
 }
 
diff --git a/fs/aio.c b/fs/aio.c
index 11a1a7100ad..ca1c5180a17 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -53,13 +53,13 @@ static kmem_cache_t	*kioctx_cachep;
 static struct workqueue_struct *aio_wq;
 
 /* Used for rare fput completion. */
-static void aio_fput_routine(void *);
-static DECLARE_WORK(fput_work, aio_fput_routine, NULL);
+static void aio_fput_routine(struct work_struct *);
+static DECLARE_WORK(fput_work, aio_fput_routine);
 
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
-static void aio_kick_handler(void *);
+static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
 /* aio_setup
@@ -227,7 +227,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 	INIT_LIST_HEAD(&ctx->run_list);
-	INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler, ctx);
+	INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
 
 	if (aio_setup_ring(ctx) < 0)
 		goto out_freectx;
@@ -470,7 +470,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 		wake_up(&ctx->wait);
 }
 
-static void aio_fput_routine(void *data)
+static void aio_fput_routine(struct work_struct *data)
 {
 	spin_lock_irq(&fput_lock);
 	while (likely(!list_empty(&fput_head))) {
@@ -859,9 +859,9 @@ static inline void aio_run_all_iocbs(struct kioctx *ctx)
  *      space.
  * Run on aiod's context.
  */
-static void aio_kick_handler(void *data)
+static void aio_kick_handler(struct work_struct *work)
 {
-	struct kioctx *ctx = data;
+	struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
 	mm_segment_t oldfs = get_fs();
 	int requeue;
 
diff --git a/fs/bio.c b/fs/bio.c
index f95c8749499..c6c07ca5b5a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -955,16 +955,16 @@ static void bio_release_pages(struct bio *bio)
  * run one bio_put() against the BIO.
  */
 
-static void bio_dirty_fn(void *data);
+static void bio_dirty_fn(struct work_struct *work);
 
-static DECLARE_WORK(bio_dirty_work, bio_dirty_fn, NULL);
+static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
 static DEFINE_SPINLOCK(bio_dirty_lock);
 static struct bio *bio_dirty_list;
 
 /*
  * This runs in process context
  */
-static void bio_dirty_fn(void *data)
+static void bio_dirty_fn(struct work_struct *work)
 {
 	unsigned long flags;
 	struct bio *bio;
diff --git a/fs/file.c b/fs/file.c
index 8e81775c5dc..3787e82f54c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -91,8 +91,10 @@ out:
 	spin_unlock(&fddef->lock);
 }
 
-static void free_fdtable_work(struct fdtable_defer *f)
+static void free_fdtable_work(struct work_struct *work)
 {
+	struct fdtable_defer *f =
+		container_of(work, struct fdtable_defer, wq);
 	struct fdtable *fdt;
 
 	spin_lock_bh(&f->lock);
@@ -351,7 +353,7 @@ static void __devinit fdtable_defer_list_init(int cpu)
 {
 	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
 	spin_lock_init(&fddef->lock);
-	INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
+	INIT_WORK(&fddef->wq, free_fdtable_work);
 	init_timer(&fddef->timer);
 	fddef->timer.data = (unsigned long)fddef;
 	fddef->timer.function = fdtable_timer;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6f0487d6f44..23ab145daa2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,7 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
 	INIT_LIST_HEAD(&clp->cl_state_owners);
 	INIT_LIST_HEAD(&clp->cl_unused);
 	spin_lock_init(&clp->cl_lock);
-	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
+	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
 	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
 	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 5ed798bc1cf..371b804e7cc 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -18,11 +18,10 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
-static void nfs_expire_automounts(void *list);
+static void nfs_expire_automounts(struct work_struct *work);
 
 LIST_HEAD(nfs_automount_list);
-static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts,
-			    &nfs_automount_list);
+static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
 static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
@@ -165,9 +164,9 @@ struct inode_operations nfs_referral_inode_operations = {
 	.follow_link	= nfs_follow_mountpoint,
 };
 
-static void nfs_expire_automounts(void *data)
+static void nfs_expire_automounts(struct work_struct *work)
 {
-	struct list_head *list = (struct list_head *)data;
+	struct list_head *list = &nfs_automount_list;
 
 	mark_mounts_for_expiry(list);
 	if (!list_empty(list))
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 6f346677332..c26cd978c7c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -185,7 +185,7 @@ extern const u32 nfs4_fs_locations_bitmap[2];
 extern void nfs4_schedule_state_renewal(struct nfs_client *);
 extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
 extern void nfs4_kill_renewd(struct nfs_client *);
-extern void nfs4_renew_state(void *);
+extern void nfs4_renew_state(struct work_struct *);
 
 /* nfs4state.c */
 struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 7b6df1852e7..823298561c0 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -59,9 +59,10 @@
 #define NFSDBG_FACILITY	NFSDBG_PROC
 
 void
-nfs4_renew_state(void *data)
+nfs4_renew_state(struct work_struct *work)
 {
-	struct nfs_client *clp = (struct nfs_client *)data;
+	struct nfs_client *clp =
+		container_of(work, struct nfs_client, cl_renewd.work);
 	struct rpc_cred *cred;
 	long lease, timeout;
 	unsigned long last, now;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5f04006e8dd..b3f32eadbef 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -568,6 +568,7 @@ struct ata_port {
 	struct ata_host		*host;
 	struct device 		*dev;
 
+	void			*port_task_data;
 	struct delayed_work	port_task;
 	struct delayed_work	hotplug_task;
 	struct work_struct	scsi_rescan_task;
@@ -747,7 +748,7 @@ extern int ata_ratelimit(void);
 extern unsigned int ata_busy_sleep(struct ata_port *ap,
 				   unsigned long timeout_pat,
 				   unsigned long timeout);
-extern void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *),
+extern void ata_port_queue_task(struct ata_port *ap, work_func_t fn,
 				void *data, unsigned long delay);
 extern u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val,
 			     unsigned long interval_msec,
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ecc017d24cf..4a3ea83c6d1 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -11,18 +11,19 @@
 
 struct workqueue_struct;
 
-typedef void (*work_func_t)(void *data);
+struct work_struct;
+typedef void (*work_func_t)(struct work_struct *work);
 
 struct work_struct {
-	/* the first word is the work queue pointer and the pending flag
-	 * rolled into one */
+	/* the first word is the work queue pointer and the flags rolled into
+	 * one */
 	unsigned long management;
 #define WORK_STRUCT_PENDING 0		/* T if work item pending execution */
+#define WORK_STRUCT_NOAUTOREL 1		/* F if work item automatically released on exec */
 #define WORK_STRUCT_FLAG_MASK (3UL)
 #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
 	struct list_head entry;
 	work_func_t func;
-	void *data;
 };
 
 struct delayed_work {
@@ -34,48 +35,77 @@ struct execute_work {
 	struct work_struct work;
 };
 
-#define __WORK_INITIALIZER(n, f, d) {				\
+#define __WORK_INITIALIZER(n, f) {				\
+	.management = 0,					\
         .entry	= { &(n).entry, &(n).entry },			\
 	.func = (f),						\
-	.data = (d),						\
 	}
 
-#define __DELAYED_WORK_INITIALIZER(n, f, d) {			\
-	.work = __WORK_INITIALIZER((n).work, (f), (d)),		\
+#define __WORK_INITIALIZER_NAR(n, f) {				\
+	.management = (1 << WORK_STRUCT_NOAUTOREL),		\
+        .entry	= { &(n).entry, &(n).entry },			\
+	.func = (f),						\
+	}
+
+#define __DELAYED_WORK_INITIALIZER(n, f) {			\
+	.work = __WORK_INITIALIZER((n).work, (f)),		\
+	.timer = TIMER_INITIALIZER(NULL, 0, 0),			\
+	}
+
+#define __DELAYED_WORK_INITIALIZER_NAR(n, f) {			\
+	.work = __WORK_INITIALIZER_NAR((n).work, (f)),		\
 	.timer = TIMER_INITIALIZER(NULL, 0, 0),			\
 	}
 
-#define DECLARE_WORK(n, f, d)					\
-	struct work_struct n = __WORK_INITIALIZER(n, f, d)
+#define DECLARE_WORK(n, f)					\
+	struct work_struct n = __WORK_INITIALIZER(n, f)
+
+#define DECLARE_WORK_NAR(n, f)					\
+	struct work_struct n = __WORK_INITIALIZER_NAR(n, f)
 
-#define DECLARE_DELAYED_WORK(n, f, d)				\
-	struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, d)
+#define DECLARE_DELAYED_WORK(n, f)				\
+	struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
+
+#define DECLARE_DELAYED_WORK_NAR(n, f)			\
+	struct dwork_struct n = __DELAYED_WORK_INITIALIZER_NAR(n, f)
 
 /*
- * initialize a work item's function and data pointers
+ * initialize a work item's function pointer
  */
-#define PREPARE_WORK(_work, _func, _data)			\
+#define PREPARE_WORK(_work, _func)				\
 	do {							\
 		(_work)->func = (_func);			\
-		(_work)->data = (_data);			\
 	} while (0)
 
-#define PREPARE_DELAYED_WORK(_work, _func, _data)		\
-	PREPARE_WORK(&(_work)->work, (_func), (_data))
+#define PREPARE_DELAYED_WORK(_work, _func)			\
+	PREPARE_WORK(&(_work)->work, (_func))
 
 /*
  * initialize all of a work item in one go
  */
-#define INIT_WORK(_work, _func, _data)				\
+#define INIT_WORK(_work, _func)					\
 	do {							\
-		INIT_LIST_HEAD(&(_work)->entry);		\
 		(_work)->management = 0;			\
-		PREPARE_WORK((_work), (_func), (_data));	\
+		INIT_LIST_HEAD(&(_work)->entry);		\
+		PREPARE_WORK((_work), (_func));			\
+	} while (0)
+
+#define INIT_WORK_NAR(_work, _func)					\
+	do {								\
+		(_work)->management = (1 << WORK_STRUCT_NOAUTOREL);	\
+		INIT_LIST_HEAD(&(_work)->entry);			\
+		PREPARE_WORK((_work), (_func));				\
+	} while (0)
+
+#define INIT_DELAYED_WORK(_work, _func)				\
+	do {							\
+		INIT_WORK(&(_work)->work, (_func));		\
+		init_timer(&(_work)->timer);			\
 	} while (0)
 
-#define INIT_DELAYED_WORK(_work, _func, _data)		\
+#define INIT_DELAYED_WORK_NAR(_work, _func)			\
 	do {							\
-		INIT_WORK(&(_work)->work, (_func), (_data));	\
+		INIT_WORK_NAR(&(_work)->work, (_func));		\
 		init_timer(&(_work)->timer);			\
 	} while (0)
 
@@ -94,6 +124,27 @@ struct execute_work {
 #define delayed_work_pending(work) \
 	test_bit(WORK_STRUCT_PENDING, &(work)->work.management)
 
+/**
+ * work_release - Release a work item under execution
+ * @work: The work item to release
+ *
+ * This is used to release a work item that has been initialised with automatic
+ * release mode disabled (WORK_STRUCT_NOAUTOREL is set).  This gives the work
+ * function the opportunity to grab auxiliary data from the container of the
+ * work_struct before clearing the pending bit as the work_struct may be
+ * subject to deallocation the moment the pending bit is cleared.
+ *
+ * In such a case, this should be called in the work function after it has
+ * fetched any data it may require from the containter of the work_struct.
+ * After this function has been called, the work_struct may be scheduled for
+ * further execution or it may be deallocated unless other precautions are
+ * taken.
+ *
+ * This should also be used to release a delayed work item.
+ */
+#define work_release(work) \
+	clear_bit(WORK_STRUCT_PENDING, &(work)->management)
+
 
 extern struct workqueue_struct *__create_workqueue(const char *name,
 						    int singlethread);
@@ -112,7 +163,7 @@ extern int FASTCALL(schedule_work(struct work_struct *work));
 extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay));
 
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
-extern int schedule_on_each_cpu(work_func_t func, void *info);
+extern int schedule_on_each_cpu(work_func_t func);
 extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
@@ -121,7 +172,7 @@ extern void init_workqueues(void);
 void cancel_rearming_delayed_work(struct delayed_work *work);
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
 				       struct delayed_work *);
-int execute_in_process_context(work_func_t fn, void *, struct execute_work *);
+int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 5f48748fe01..f7be1ac7360 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -84,7 +84,7 @@ struct inet_timewait_death_row {
 };
 
 extern void inet_twdr_hangman(unsigned long data);
-extern void inet_twdr_twkill_work(void *data);
+extern void inet_twdr_twkill_work(struct work_struct *work);
 extern void inet_twdr_twcal_tick(unsigned long data);
 
 #if (BITS_PER_LONG == 64)
diff --git a/ipc/util.c b/ipc/util.c
index cd8bb14a431..a9b7a227b8d 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -514,6 +514,11 @@ void ipc_rcu_getref(void *ptr)
 	container_of(ptr, struct ipc_rcu_hdr, data)->refcount++;
 }
 
+static void ipc_do_vfree(struct work_struct *work)
+{
+	vfree(container_of(work, struct ipc_rcu_sched, work));
+}
+
 /**
  * ipc_schedule_free - free ipc + rcu space
  * @head: RCU callback structure for queued work
@@ -528,7 +533,7 @@ static void ipc_schedule_free(struct rcu_head *head)
 	struct ipc_rcu_sched *sched =
 			container_of(&(grace->data[0]), struct ipc_rcu_sched, data[0]);
 
-	INIT_WORK(&sched->work, vfree, sched);
+	INIT_WORK(&sched->work, ipc_do_vfree);
 	schedule_work(&sched->work);
 }
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb4e29d924e..7dc7a9dad6a 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
 #endif /* CONFIG_KMOD */
 
 struct subprocess_info {
+	struct work_struct work;
 	struct completion *complete;
 	char *path;
 	char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
 }
 
 /* This is run by khelper thread  */
-static void __call_usermodehelper(void *data)
+static void __call_usermodehelper(struct work_struct *work)
 {
-	struct subprocess_info *sub_info = data;
+	struct subprocess_info *sub_info =
+		container_of(work, struct subprocess_info, work);
 	pid_t pid;
 	int wait = sub_info->wait;
 
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 		.wait		= wait,
 		.retval		= 0,
 	};
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 	if (path[0] == '\0')
 		return 0;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 		.retval		= 0,
 	};
 	struct file *f;
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -318,7 +322,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 	}
 	sub_info.stdin = f;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e..1db8c72d0d3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
 	/* Result passed back to kthread_create() from keventd. */
 	struct task_struct *result;
 	struct completion done;
+
+	struct work_struct work;
 };
 
 struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
 }
 
 /* We are keventd: create a thread. */
-static void keventd_create_kthread(void *_create)
+static void keventd_create_kthread(struct work_struct *work)
 {
-	struct kthread_create_info *create = _create;
+	struct kthread_create_info *create =
+		container_of(work, struct kthread_create_info, work);
 	int pid;
 
 	/* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   ...)
 {
 	struct kthread_create_info create;
-	DECLARE_WORK(work, keventd_create_kthread, &create);
 
 	create.threadfn = threadfn;
 	create.data = data;
 	init_completion(&create.started);
 	init_completion(&create.done);
+	INIT_WORK(&create.work, keventd_create_kthread);
 
 	/*
 	 * The workqueue needs to start up first:
 	 */
 	if (!helper_wq)
-		work.func(work.data);
+		create.work.func(&create.work);
 	else {
-		queue_work(helper_wq, &work);
+		queue_work(helper_wq, &create.work);
 		wait_for_completion(&create.done);
 	}
 	if (!IS_ERR(create.result)) {
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac316..678ec736076 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
  * callback we use.
  */
 
-static void do_poweroff(void *dummy)
+static void do_poweroff(struct work_struct *dummy)
 {
 	kernel_power_off();
 }
 
-static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
+static DECLARE_WORK(poweroff_work, do_poweroff);
 
 static void handle_poweroff(int key, struct tty_struct *tty)
 {
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801..c87b461de38 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	return 0;
 }
 
-static void deferred_cad(void *dummy)
+static void deferred_cad(struct work_struct *dummy)
 {
 	kernel_restart(NULL);
 }
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
  */
 void ctrl_alt_del(void)
 {
-	static DECLARE_WORK(cad_work, deferred_cad, NULL);
+	static DECLARE_WORK(cad_work, deferred_cad);
 
 	if (C_A_D)
 		schedule_work(&cad_work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 96747975651..8d1e7cb8a51 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -241,14 +241,14 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		struct work_struct *work = list_entry(cwq->worklist.next,
 						struct work_struct, entry);
 		work_func_t f = work->func;
-		void *data = work->data;
 
 		list_del_init(cwq->worklist.next);
 		spin_unlock_irqrestore(&cwq->lock, flags);
 
 		BUG_ON(get_wq_data(work) != cwq);
-		clear_bit(WORK_STRUCT_PENDING, &work->management);
-		f(data);
+		if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
+			work_release(work);
+		f(work);
 
 		spin_lock_irqsave(&cwq->lock, flags);
 		cwq->remove_sequence++;
@@ -527,7 +527,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 /**
  * schedule_on_each_cpu - call a function on each online CPU from keventd
  * @func: the function to call
- * @info: a pointer to pass to func()
  *
  * Returns zero on success.
  * Returns -ve errno on failure.
@@ -536,7 +535,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
  *
  * schedule_on_each_cpu() is very slow.
  */
-int schedule_on_each_cpu(work_func_t func, void *info)
+int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
 	struct work_struct *works;
@@ -547,7 +546,7 @@ int schedule_on_each_cpu(work_func_t func, void *info)
 
 	mutex_lock(&workqueue_mutex);
 	for_each_online_cpu(cpu) {
-		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
+		INIT_WORK(per_cpu_ptr(works, cpu), func);
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
 				per_cpu_ptr(works, cpu));
 	}
@@ -591,7 +590,6 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
 /**
  * execute_in_process_context - reliably execute the routine with user context
  * @fn:		the function to execute
- * @data:	data to pass to the function
  * @ew:		guaranteed storage for the execute work structure (must
  *		be available when the work executes)
  *
@@ -601,15 +599,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
  * Returns:	0 - function was executed
  *		1 - function was scheduled for execution
  */
-int execute_in_process_context(work_func_t fn, void *data,
-			       struct execute_work *ew)
+int execute_in_process_context(work_func_t fn, struct execute_work *ew)
 {
 	if (!in_interrupt()) {
-		fn(data);
+		fn(&ew->work);
 		return 0;
 	}
 
-	INIT_WORK(&ew->work, fn, data);
+	INIT_WORK(&ew->work, fn);
 	schedule_work(&ew->work);
 
 	return 1;
diff --git a/mm/slab.c b/mm/slab.c
index a65bc5e992c..5de81473df3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -313,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache,
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 			int node);
 static int enable_cpucache(struct kmem_cache *cachep);
-static void cache_reap(void *unused);
+static void cache_reap(struct work_struct *unused);
 
 /*
  * This function must be completely optimized away if a constant is passed to
@@ -925,7 +925,7 @@ static void __devinit start_cpu_timer(int cpu)
 	 */
 	if (keventd_up() && reap_work->work.func == NULL) {
 		init_reap_node(cpu);
-		INIT_DELAYED_WORK(reap_work, cache_reap, NULL);
+		INIT_DELAYED_WORK(reap_work, cache_reap);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
 }
@@ -3815,7 +3815,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
  * If we cannot acquire the cache chain mutex then just give up - we'll try
  * again on the next iteration.
  */
-static void cache_reap(void *unused)
+static void cache_reap(struct work_struct *unused)
 {
 	struct kmem_cache *searchp;
 	struct kmem_list3 *l3;
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index f2ed09e25df..549a2ce951b 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -34,8 +34,8 @@ enum lw_bits {
 static unsigned long linkwatch_flags;
 static unsigned long linkwatch_nextevent;
 
-static void linkwatch_event(void *dummy);
-static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event, NULL);
+static void linkwatch_event(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
 
 static LIST_HEAD(lweventlist);
 static DEFINE_SPINLOCK(lweventlist_lock);
@@ -127,7 +127,7 @@ void linkwatch_run_queue(void)
 }       
 
 
-static void linkwatch_event(void *dummy)
+static void linkwatch_event(struct work_struct *dummy)
 {
 	/* Limit the number of linkwatch events to one
 	 * per second so that a runaway driver does not
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index cdd805344c6..8c74f9168b7 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -197,9 +197,10 @@ EXPORT_SYMBOL_GPL(inet_twdr_hangman);
 
 extern void twkill_slots_invalid(void);
 
-void inet_twdr_twkill_work(void *data)
+void inet_twdr_twkill_work(struct work_struct *work)
 {
-	struct inet_timewait_death_row *twdr = data;
+	struct inet_timewait_death_row *twdr =
+		container_of(work, struct inet_timewait_death_row, twkill_work);
 	int i;
 
 	if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0163d982690..af7b2c986b1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -45,8 +45,7 @@ struct inet_timewait_death_row tcp_death_row = {
 	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
 					    (unsigned long)&tcp_death_row),
 	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
-					     inet_twdr_twkill_work,
-					     &tcp_death_row),
+					     inet_twdr_twkill_work),
 /* Short-time timewait calendar */
 
 	.twcal_hand	= -1,
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index d5725cb1491..d96fd466a9a 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -284,8 +284,8 @@ static struct file_operations cache_file_operations;
 static struct file_operations content_file_operations;
 static struct file_operations cache_flush_operations;
 
-static void do_cache_clean(void *data);
-static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean, NULL);
+static void do_cache_clean(struct work_struct *work);
+static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
 
 void cache_register(struct cache_detail *cd)
 {
@@ -461,7 +461,7 @@ static int cache_clean(void)
 /*
  * We want to regularly clean the cache, so we need to schedule some work ...
  */
-static void do_cache_clean(void *data)
+static void do_cache_clean(struct work_struct *work)
 {
 	int delay = 5;
 	if (cache_clean() == -1)
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 97be3f7fed4..49dba5febbb 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -54,10 +54,11 @@ static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
 }
 
 static void
-rpc_timeout_upcall_queue(void *data)
+rpc_timeout_upcall_queue(struct work_struct *work)
 {
 	LIST_HEAD(free_list);
-	struct rpc_inode *rpci = (struct rpc_inode *)data;
+	struct rpc_inode *rpci =
+		container_of(work, struct rpc_inode, queue_timeout.work);
 	struct inode *inode = &rpci->vfs_inode;
 	void (*destroy_msg)(struct rpc_pipe_msg *);
 
@@ -838,7 +839,7 @@ init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		rpci->pipelen = 0;
 		init_waitqueue_head(&rpci->waitq);
 		INIT_DELAYED_WORK(&rpci->queue_timeout,
-				    rpc_timeout_upcall_queue, rpci);
+				    rpc_timeout_upcall_queue);
 		rpci->ops = NULL;
 	}
 }
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index a1ab4eed41f..eff44bcdc95 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -41,7 +41,7 @@ static mempool_t	*rpc_buffer_mempool __read_mostly;
 
 static void			__rpc_default_timer(struct rpc_task *task);
 static void			rpciod_killall(void);
-static void			rpc_async_schedule(void *);
+static void			rpc_async_schedule(struct work_struct *);
 
 /*
  * RPC tasks sit here while waiting for conditions to improve.
@@ -305,7 +305,7 @@ static void rpc_make_runnable(struct rpc_task *task)
 	if (RPC_IS_ASYNC(task)) {
 		int status;
 
-		INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task);
+		INIT_WORK(&task->u.tk_work, rpc_async_schedule);
 		status = queue_work(task->tk_workqueue, &task->u.tk_work);
 		if (status < 0) {
 			printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
@@ -695,9 +695,9 @@ rpc_execute(struct rpc_task *task)
 	return __rpc_execute(task);
 }
 
-static void rpc_async_schedule(void *arg)
+static void rpc_async_schedule(struct work_struct *work)
 {
-	__rpc_execute((struct rpc_task *)arg);
+	__rpc_execute(container_of(work, struct rpc_task, u.tk_work));
 }
 
 /**
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 80857470dc1..4f9a5d9791f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -479,9 +479,10 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
 	return status;
 }
 
-static void xprt_autoclose(void *args)
+static void xprt_autoclose(struct work_struct *work)
 {
-	struct rpc_xprt *xprt = (struct rpc_xprt *)args;
+	struct rpc_xprt *xprt =
+		container_of(work, struct rpc_xprt, task_cleanup);
 
 	xprt_disconnect(xprt);
 	xprt->ops->close(xprt);
@@ -932,7 +933,7 @@ struct rpc_xprt *xprt_create_transport(int proto, struct sockaddr *ap, size_t si
 
 	INIT_LIST_HEAD(&xprt->free);
 	INIT_LIST_HEAD(&xprt->recv);
-	INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt);
+	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
 	init_timer(&xprt->timer);
 	xprt->timer.function = xprt_init_autodisconnect;
 	xprt->timer.data = (unsigned long) xprt;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3c7532cd009..cfe3c15be94 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1060,13 +1060,14 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
 
 /**
  * xs_udp_connect_worker - set up a UDP socket
- * @args: RPC transport to connect
+ * @work: RPC transport to connect
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_udp_connect_worker(void *args)
+static void xs_udp_connect_worker(struct work_struct *work)
 {
-	struct rpc_xprt *xprt = (struct rpc_xprt *) args;
+	struct rpc_xprt *xprt =
+		container_of(work, struct rpc_xprt, connect_worker.work);
 	struct socket *sock = xprt->sock;
 	int err, status = -EIO;
 
@@ -1144,13 +1145,14 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
 
 /**
  * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
- * @args: RPC transport to connect
+ * @work: RPC transport to connect
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_connect_worker(void *args)
+static void xs_tcp_connect_worker(struct work_struct *work)
 {
-	struct rpc_xprt *xprt = (struct rpc_xprt *)args;
+	struct rpc_xprt *xprt =
+		container_of(work, struct rpc_xprt, connect_worker.work);
 	struct socket *sock = xprt->sock;
 	int err, status = -EIO;
 
@@ -1375,7 +1377,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
-	INIT_DELAYED_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt);
+	INIT_DELAYED_WORK(&xprt->connect_worker, xs_udp_connect_worker);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_UDP_CONN_TO;
 	xprt->reestablish_timeout = XS_UDP_REEST_TO;
@@ -1420,7 +1422,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
-	INIT_DELAYED_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);
+	INIT_DELAYED_WORK(&xprt->connect_worker, xs_tcp_connect_worker);
 	xprt->bind_timeout = XS_BIND_TO;
 	xprt->connect_timeout = XS_TCP_CONN_TO;
 	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
diff --git a/security/keys/key.c b/security/keys/key.c
index 80de8c3e9cc..70eacbe5abd 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -30,8 +30,8 @@ DEFINE_SPINLOCK(key_user_lock);
 static LIST_HEAD(key_types_list);
 static DECLARE_RWSEM(key_types_sem);
 
-static void key_cleanup(void *data);
-static DECLARE_WORK(key_cleanup_task, key_cleanup, NULL);
+static void key_cleanup(struct work_struct *work);
+static DECLARE_WORK(key_cleanup_task, key_cleanup);
 
 /* we serialise key instantiation and link */
 DECLARE_RWSEM(key_construction_sem);
@@ -552,7 +552,7 @@ EXPORT_SYMBOL(key_negate_and_link);
  * do cleaning up in process context so that we don't have to disable
  * interrupts all over the place
  */
-static void key_cleanup(void *data)
+static void key_cleanup(struct work_struct *work)
 {
 	struct rb_node *_n;
 	struct key *key;
-- 
cgit v1.2.3-70-g09d2


From 24d7bb3396c51ceb2285e0e7b0c1bd1865652c43 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 28 Nov 2006 09:14:05 +0100
Subject: [PATCH] x86_64: fix 'earlyprintk=...,keep' regression

Commit 2c8c0e6b8d7700a990da8d24eff767f9ca223b96 ("[PATCH] Convert x86-64
to early param") broke the earlyprintk=...,keep feature.

This restores that functionality.  Tested on x86_64.  Must-have for
v2.6.19, no risk.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/early_printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index e22ecd54870..47b6d90349d 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -224,7 +224,7 @@ static int __init setup_early_printk(char *buf)
 		return 0;
 	early_console_initialized = 1;
 
-	if (!strcmp(buf,"keep"))
+	if (strstr(buf, "keep"))
 		keep_early = 1;
 
 	if (!strncmp(buf, "serial", 6)) {
-- 
cgit v1.2.3-70-g09d2


From f7a23328a738b45124400d85eaf78a76939da726 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 28 Nov 2006 20:12:59 +0100
Subject: [PATCH] x86-64: Fix warning in io_apic.c

---
 arch/x86_64/kernel/io_apic.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 14654e68241..c80081a6ba4 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -754,10 +754,8 @@ void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	/* This function must be called with vector_lock held */
-	unsigned long flags;
 	int irq, vector;
 
-
 	/* Mark the inuse vectors */
 	for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
 		if (!cpu_isset(cpu, irq_domain[irq]))
-- 
cgit v1.2.3-70-g09d2


From c547c77ee4d0408907847f64c403df1bf2f9c7a0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 28 Nov 2006 20:12:59 +0100
Subject: [PATCH] x86-64: Use stricter in process stack check for unwinder

Previously it would check for alignment only, which could break
if the stack pointer was unaligned. Now explicitely check if the
stack pointer is in the stack page of the current process.

Ported from i386.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/traps.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index a153d0a01b7..0d65b22f229 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -242,12 +242,19 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+{
+	void *t = (void *)tinfo;
+        return p > t && p < t + THREAD_SIZE - 3;
+}
+
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
 		struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = smp_processor_id();
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
+	struct thread_info *tinfo;
 
 	if (!tsk)
 		tsk = current;
@@ -370,7 +377,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 	/*
 	 * This handles the process stack:
 	 */
-	HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
+	tinfo = current_thread_info();
+	HANDLE_STACK (valid_stack_ptr(tinfo, stack));
 #undef HANDLE_STACK
 }
 EXPORT_SYMBOL(dump_trace);
-- 
cgit v1.2.3-70-g09d2


From b615ebdac97c648a2ae7d23c5a0bbb3972adf928 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] x86: shorten lines in unwinder to be <= 80 characters

Andrew complained about > 80 character lines in the new unwinder.
Fix that.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/traps.c   | 16 ++++++++++------
 arch/x86_64/kernel/traps.c | 21 +++++++++++++--------
 2 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe9c5e8e7e6..fe81d89c50d 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -173,6 +173,8 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
 	return n;
 }
 
+#define MSG(msg) ops->warning(data, msg)
+
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	        unsigned long *stack,
 		struct stacktrace_ops *ops, void *data)
@@ -191,29 +193,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 			if (unwind_init_frame_info(&info, task, regs) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		} else if (task == current)
-			unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+			unw_ret = unwind_init_running(&info, dump_trace_unwind,
+						      &oad);
 		else {
 			if (unwind_init_blocked(&info, task) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		}
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
-				ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+				ops->warning_symbol(data,
+					     "DWARF2 unwinder stuck at %s\n",
 					     UNW_PC(&info));
 				if (UNW_SP(&info) >= PAGE_OFFSET) {
-					ops->warning(data, "Leftover inexact backtrace:\n");
+					MSG("Leftover inexact backtrace:\n");
 					stack = (void *)UNW_SP(&info);
 					if (!stack)
 						return;
 					ebp = UNW_FP(&info);
 				} else
-					ops->warning(data, "Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:\n");
 			} else if (call_trace >= 1)
 				return;
 			else
-				ops->warning(data, "Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:\n");
 		} else
-			ops->warning(data, "Inexact backtrace:\n");
+			MSG("Inexact backtrace:\n");
 	}
 	if (!stack) {
 		unsigned long dummy;
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 0d65b22f229..d3f43c958ca 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -235,6 +235,8 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
 	return n;
 }
 
+#define MSG(txt) ops->warning(data, txt)
+
 /*
  * x86-64 can have upto three kernel stacks: 
  * process stack
@@ -248,11 +250,12 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
         return p > t && p < t + THREAD_SIZE - 3;
 }
 
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+		unsigned long *stack,
 		struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = smp_processor_id();
-	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+	unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
 
@@ -268,28 +271,30 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
 			if (unwind_init_frame_info(&info, tsk, regs) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		} else if (tsk == current)
-			unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+			unw_ret = unwind_init_running(&info, dump_trace_unwind,
+						      &oad);
 		else {
 			if (unwind_init_blocked(&info, tsk) == 0)
 				unw_ret = dump_trace_unwind(&info, &oad);
 		}
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
-				ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+				ops->warning_symbol(data,
+					     "DWARF2 unwinder stuck at %s\n",
 					     UNW_PC(&info));
 				if ((long)UNW_SP(&info) < 0) {
-					ops->warning(data, "Leftover inexact backtrace:\n");
+					MSG("Leftover inexact backtrace:");
 					stack = (unsigned long *)UNW_SP(&info);
 					if (!stack)
 						return;
 				} else
-					ops->warning(data, "Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:\n");
 			} else if (call_trace >= 1)
 				return;
 			else
-				ops->warning(data, "Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:\n");
 		} else
-			ops->warning(data, "Inexact backtrace:\n");
+			MSG("Inexact backtrace:\n");
 	}
 	if (!stack) {
 		unsigned long dummy;
-- 
cgit v1.2.3-70-g09d2


From dd315df1767cf56bd4fb8d730fdff4a3d7e15d84 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:00 +0100
Subject: [PATCH] x86: Compress stack unwinder output

The unwinder has some extra newlines, which eat up loads of screen
space when it spews. (See https://bugzilla.redhat.com/bugzilla/attachment.cgi?id=137900
for a nasty example).

warning_symbol-> and warning-> already printk a newline, so don't add one
in the strings passed to them.

[AK: redone for new code]

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/traps.c   | 10 +++++-----
 arch/x86_64/kernel/traps.c |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe81d89c50d..396041a46e3 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -202,22 +202,22 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
 				ops->warning_symbol(data,
-					     "DWARF2 unwinder stuck at %s\n",
+					     "DWARF2 unwinder stuck at %s",
 					     UNW_PC(&info));
 				if (UNW_SP(&info) >= PAGE_OFFSET) {
-					MSG("Leftover inexact backtrace:\n");
+					MSG("Leftover inexact backtrace:");
 					stack = (void *)UNW_SP(&info);
 					if (!stack)
 						return;
 					ebp = UNW_FP(&info);
 				} else
-					MSG("Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:");
 			} else if (call_trace >= 1)
 				return;
 			else
-				MSG("Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:");
 		} else
-			MSG("Inexact backtrace:\n");
+			MSG("Inexact backtrace:");
 	}
 	if (!stack) {
 		unsigned long dummy;
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index d3f43c958ca..eedd4e759c3 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -280,7 +280,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
 				ops->warning_symbol(data,
-					     "DWARF2 unwinder stuck at %s\n",
+					     "DWARF2 unwinder stuck at %s",
 					     UNW_PC(&info));
 				if ((long)UNW_SP(&info) < 0) {
 					MSG("Leftover inexact backtrace:");
@@ -288,13 +288,13 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 					if (!stack)
 						return;
 				} else
-					MSG("Full inexact backtrace again:\n");
+					MSG("Full inexact backtrace again:");
 			} else if (call_trace >= 1)
 				return;
 			else
-				MSG("Full inexact backtrace again:\n");
+				MSG("Full inexact backtrace again:");
 		} else
-			MSG("Inexact backtrace:\n");
+			MSG("Inexact backtrace:");
 	}
 	if (!stack) {
 		unsigned long dummy;
-- 
cgit v1.2.3-70-g09d2


From 36b2a8d5aff4cb3ee83d5e40447a8f073bcfe2fb Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86-64: add X86_FEATURE_PEBS and detection

Here is a patch (used by perfmon2) to detect the presence of the
Precise Event Based Sampling (PEBS) feature for Intel 64-bit processors.
The patch also adds the cpu_has_pebs macro.

changelog:
	- adds X86_FEATURE_PEBS
	- adds cpu_has_pebs to test for X86_FEATURE_PEBS

Signed-off-by: stephane eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup.c      | 7 +++++++
 include/asm-x86_64/cpufeature.h | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index fc944b5e8f4..619af2e2fa2 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -835,6 +835,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
 	}
 
+	if (cpu_has_ds) {
+		unsigned int l1, l2;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<12)))
+			set_bit(X86_FEATURE_PEBS, c->x86_capability);
+	}
+
 	n = c->extended_cpuid_level;
 	if (n >= 0x80000008) {
 		unsigned eax = cpuid_eax(0x80000008);
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index 65eb39e8f3c..d280384807e 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -68,6 +68,7 @@
 #define X86_FEATURE_FXSAVE_LEAK (3*32+7)  /* FIP/FOP/FDP leaks through FXSAVE */
 #define X86_FEATURE_UP		(3*32+8) /* SMP kernel running on UP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+9) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS	(3*32+10) /* Precise-Event Based Sampling */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -113,5 +114,6 @@
 #define cpu_has_centaur_mcr    0
 #define cpu_has_clflush	       boot_cpu_has(X86_FEATURE_CLFLSH)
 #define cpu_has_ds 	       boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs 	       boot_cpu_has(X86_FEATURE_PEBS)
 
 #endif /* __ASM_X8664_CPUFEATURE_H */
-- 
cgit v1.2.3-70-g09d2


From e2764a1e306c986053a52b33748c33463cf888de Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86-64: use BUILD_BUG_ON in FPU code

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/i387.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index 3aa1e9bb781..1d58c13bc6b 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -82,11 +82,8 @@ int save_i387(struct _fpstate __user *buf)
 	struct task_struct *tsk = current;
 	int err = 0;
 
-	{ 
-		extern void bad_user_i387_struct(void); 
-		if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
-			bad_user_i387_struct();
-	} 
+	BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
+			sizeof(tsk->thread.i387.fxsave));
 
 	if ((unsigned long)buf % 16) 
 		printk("save_i387: bad fpstate %p\n",buf); 
-- 
cgit v1.2.3-70-g09d2


From bb81a09e55eaf7e5f798468ab971469b6f66a259 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86: all cpu backtrace

When a spinlock lockup occurs, arrange for the NMI code to emit an all-cpu
backtrace, so we get to see which CPU is holding the lock, and where.

Cc: Andi Kleen <ak@muc.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/nmi.c   | 26 ++++++++++++++++++++++++++
 arch/x86_64/kernel/nmi.c | 29 ++++++++++++++++++++++++++++-
 include/asm-i386/nmi.h   |  8 ++++++++
 include/asm-x86_64/nmi.h |  3 +++
 include/linux/nmi.h      |  5 +++++
 lib/spinlock_debug.c     |  4 ++++
 6 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index eaafe233a5d..171194ccb7b 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
 #include <linux/percpu.h>
 #include <linux/dmi.h>
 #include <linux/kprobes.h>
+#include <linux/cpumask.h>
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
 
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
  */
@@ -907,6 +910,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		touched = 1;
 	}
 
+	if (cpu_isset(cpu, backtrace_mask)) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk("NMI backtrace for cpu %d\n", cpu);
+		dump_stack();
+		spin_unlock(&lock);
+		cpu_clear(cpu, backtrace_mask);
+	}
+
 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
 
 	/* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1046,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 
 #endif
 
+void __trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	backtrace_mask = cpu_online_map;
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpus_empty(backtrace_mask))
+			break;
+		mdelay(1);
+	}
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 7af9cb3e2d9..27e95e7922c 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -12,14 +12,15 @@
  *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
  */
 
+#include <linux/nmi.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
-#include <linux/nmi.h>
 #include <linux/sysctl.h>
 #include <linux/kprobes.h>
+#include <linux/cpumask.h>
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
@@ -41,6 +42,8 @@ int panic_on_unrecovered_nmi;
 static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
 static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
 
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
  */
@@ -782,6 +785,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 {
 	int sum;
 	int touched = 0;
+	int cpu = smp_processor_id();
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 	u64 dummy;
 	int rc=0;
@@ -799,6 +803,16 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		touched = 1;
 	}
 
+	if (cpu_isset(cpu, backtrace_mask)) {
+		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
+
+		spin_lock(&lock);
+		printk("NMI backtrace for cpu %d\n", cpu);
+		dump_stack();
+		spin_unlock(&lock);
+		cpu_clear(cpu, backtrace_mask);
+	}
+
 #ifdef CONFIG_X86_MCE
 	/* Could check oops_in_progress here too, but it's safer
 	   not too */
@@ -931,6 +945,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 
 #endif
 
+void __trigger_all_cpu_backtrace(void)
+{
+	int i;
+
+	backtrace_mask = cpu_online_map;
+	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
+	for (i = 0; i < 10 * 1000; i++) {
+		if (cpus_empty(backtrace_mask))
+			break;
+		mdelay(1);
+	}
+}
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
index 269d315719c..b04333ea6f3 100644
--- a/include/asm-i386/nmi.h
+++ b/include/asm-i386/nmi.h
@@ -5,6 +5,9 @@
 #define ASM_NMI_H
 
 #include <linux/pm.h>
+#include <asm/irq.h>
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
 
 /**
  * do_nmi_callback
@@ -42,4 +45,9 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
 
+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
+#endif
+
 #endif /* ASM_NMI_H */
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index f367d4014b4..72375e7d32a 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -77,4 +77,7 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
 
 extern int unknown_nmi_panic;
 
+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
 #endif /* ASM_NMI_H */
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e16904e28c3..acb4ed13024 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -15,9 +15,14 @@
  * disables interrupts for a long time. This call is stateless.
  */
 #ifdef ARCH_HAS_NMI_WATCHDOG
+#include <asm/nmi.h>
 extern void touch_nmi_watchdog(void);
 #else
 # define touch_nmi_watchdog() touch_softlockup_watchdog()
 #endif
 
+#ifndef trigger_all_cpu_backtrace
+#define trigger_all_cpu_backtrace() do { } while (0)
+#endif
+
 #endif
diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c
index b6c4f898197..479fd462eaa 100644
--- a/lib/spinlock_debug.c
+++ b/lib/spinlock_debug.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/nmi.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
 #include <linux/delay.h>
@@ -117,6 +118,9 @@ static void __spin_lock_debug(spinlock_t *lock)
 				raw_smp_processor_id(), current->comm,
 				current->pid, lock);
 			dump_stack();
+#ifdef CONFIG_SMP
+			trigger_all_cpu_backtrace();
+#endif
 		}
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From 399287229c775a8962a852a761d65dc9475dec7c Mon Sep 17 00:00:00 2001
From: Aaron Durbin <adurbin@google.com>
Date: Thu, 7 Dec 2006 02:14:01 +0100
Subject: [PATCH] x86-64: Insert Local and IO APIC(s) into resource map

Insert the Local APIC and IO APIC(s) into the resource tree.  It allows the
APIC resources to be visible within /proc/iomem.  The patch also takes into
account IO APIC(s) mapped in the PCI space by deferring the insertion until
after PCI has allocated its necessary resources.

Signed-off-by: Aaron Durbin <adurbin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@muc.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/apic.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 4d9d5ed942b..5c468971e64 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -25,6 +25,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/module.h>
+#include <linux/ioport.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -45,6 +46,12 @@ int apic_calibrate_pmtmr __initdata;
 
 int disable_apic_timer __initdata;
 
+static struct resource *ioapic_resources;
+static struct resource lapic_resource = {
+	.name = "Local APIC",
+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
 /*
  * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
  * IPIs in place of local APIC timers
@@ -585,6 +592,64 @@ static int __init detect_init_APIC (void)
 	return 0;
 }
 
+#ifdef CONFIG_X86_IO_APIC
+static struct resource * __init ioapic_setup_resources(void)
+{
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+	unsigned long n;
+	struct resource *res;
+	char *mem;
+	int i;
+
+	if (nr_ioapics <= 0)
+		return NULL;
+
+	n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+	n *= nr_ioapics;
+
+	mem = alloc_bootmem(n);
+	res = (void *)mem;
+
+	if (mem != NULL) {
+		memset(mem, 0, n);
+		mem += sizeof(struct resource) * nr_ioapics;
+
+		for (i = 0; i < nr_ioapics; i++) {
+			res[i].name = mem;
+			res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+			sprintf(mem,  "IOAPIC %u", i);
+			mem += IOAPIC_RESOURCE_NAME_SIZE;
+		}
+	}
+
+	ioapic_resources = res;
+
+	return res;
+}
+
+static int __init ioapic_insert_resources(void)
+{
+	int i;
+	struct resource *r = ioapic_resources;
+
+	if (!r) {
+		printk("IO APIC resources could be not be allocated.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_ioapics; i++) {
+		insert_resource(&iomem_resource, r);
+		r++;
+	}
+
+	return 0;
+}
+
+/* Insert the IO APIC resources after PCI initialization has occured to handle
+ * IO APICS that are mapped in on a BAR in PCI space. */
+late_initcall(ioapic_insert_resources);
+#endif
+
 void __init init_apic_mappings(void)
 {
 	unsigned long apic_phys;
@@ -604,6 +669,11 @@ void __init init_apic_mappings(void)
 	apic_mapped = 1;
 	apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
 
+	/* Put local APIC into the resource map. */
+	lapic_resource.start = apic_phys;
+	lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+	insert_resource(&iomem_resource, &lapic_resource);
+
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
@@ -613,7 +683,9 @@ void __init init_apic_mappings(void)
 	{
 		unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 		int i;
+		struct resource *ioapic_res;
 
+		ioapic_res = ioapic_setup_resources();
 		for (i = 0; i < nr_ioapics; i++) {
 			if (smp_found_config) {
 				ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -625,6 +697,12 @@ void __init init_apic_mappings(void)
 			apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
 					__fix_to_virt(idx), ioapic_phys);
 			idx++;
+
+			if (ioapic_res != NULL) {
+				ioapic_res->start = ioapic_phys;
+				ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+				ioapic_res++;
+			}
 		}
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From da68933e0a999fb13636653c710cca701b457ad2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] x86-64: dump_trace() atomicity fix

Fix

BUG: using smp_processor_id() in preemptible [00000001] code:

in backtracer on preemptible debug kernels.

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/traps.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index eedd4e759c3..e37b4d77d5a 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -254,7 +254,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		unsigned long *stack,
 		struct stacktrace_ops *ops, void *data)
 {
-	const unsigned cpu = smp_processor_id();
+	const unsigned cpu = get_cpu();
 	unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
@@ -286,11 +286,11 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 					MSG("Leftover inexact backtrace:");
 					stack = (unsigned long *)UNW_SP(&info);
 					if (!stack)
-						return;
+						goto out;
 				} else
 					MSG("Full inexact backtrace again:");
 			} else if (call_trace >= 1)
-				return;
+				goto out;
 			else
 				MSG("Full inexact backtrace again:");
 		} else
@@ -385,6 +385,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 	tinfo = current_thread_info();
 	HANDLE_STACK (valid_stack_ptr(tinfo, stack));
 #undef HANDLE_STACK
+out:
+	put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
 
-- 
cgit v1.2.3-70-g09d2


From bcddc0155f351ab3f06c6ede6d91fd399ef9e18f Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:02 +0100
Subject: [PATCH] x86-64: miscellaneous entry.S adjustments

This patch:
- makes ret_from_sys_call no longer global (all external users were
  previously switched to use int_ret_from_sys_call)
- adjusts placement of a CFI_{REMEMBER,RESTORE}_STATE pair to better
  fit logic flow
- eliminates an unnecessary pair of CFI_{REMEMBER,RESTORE}_STATE
- glues together function- and unwinder-wise the previously separate
  system_call and int_ret_from_sys_call function fragments

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/entry.S | 36 ++++++------------------------------
 1 file changed, 6 insertions(+), 30 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7d401b00d82..601d332c4b7 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -230,7 +230,6 @@ ENTRY(system_call)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
-	CFI_REMEMBER_STATE
 	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
@@ -241,7 +240,6 @@ ENTRY(system_call)
  * Syscall return path ending with SYSRET (fast path)
  * Has incomplete stack frame and undefined top of stack. 
  */		
-	.globl ret_from_sys_call
 ret_from_sys_call:
 	movl $_TIF_ALLWORK_MASK,%edi
 	/* edi:	flagmask */
@@ -251,8 +249,8 @@ sysret_check:
 	TRACE_IRQS_OFF
 	movl threadinfo_flags(%rcx),%edx
 	andl %edi,%edx
-	CFI_REMEMBER_STATE
 	jnz  sysret_careful 
+	CFI_REMEMBER_STATE
 	/*
 	 * sysretq will re-enable interrupts:
 	 */
@@ -265,10 +263,10 @@ sysret_check:
 	swapgs
 	sysretq
 
+	CFI_RESTORE_STATE
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */	
 sysret_careful:
-	CFI_RESTORE_STATE
 	bt $TIF_NEED_RESCHED,%edx
 	jnc sysret_signal
 	TRACE_IRQS_ON
@@ -306,7 +304,6 @@ badsys:
 
 	/* Do syscall tracing */
 tracesys:			 
-	CFI_RESTORE_STATE
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp)
 	FIXUP_TOP_OF_STACK %rdi
@@ -322,32 +319,13 @@ tracesys:
 	call *sys_call_table(,%rax,8)
 1:	movq %rax,RAX-ARGOFFSET(%rsp)
 	/* Use IRET because user could have changed frame */
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
-END(system_call)
 		
 /* 
  * Syscall return path ending with IRET.
  * Has correct top of stack, but partial stack frame.
- */ 	
-ENTRY(int_ret_from_sys_call)
-	CFI_STARTPROC	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
-	/*CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
-	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
-	/*CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
-	/*CFI_REL_OFFSET	cs,CS-ARGOFFSET*/
-	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
-	CFI_REL_OFFSET	rdx,RDX-ARGOFFSET
-	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
-	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
-	CFI_REL_OFFSET	rdi,RDI-ARGOFFSET
-	CFI_REL_OFFSET	rsi,RSI-ARGOFFSET
-	CFI_REL_OFFSET	r8,R8-ARGOFFSET
-	CFI_REL_OFFSET	r9,R9-ARGOFFSET
-	CFI_REL_OFFSET	r10,R10-ARGOFFSET
-	CFI_REL_OFFSET	r11,R11-ARGOFFSET
+ */
+	.globl int_ret_from_sys_call
+int_ret_from_sys_call:
 	cli
 	TRACE_IRQS_OFF
 	testl $3,CS-ARGOFFSET(%rsp)
@@ -394,8 +372,6 @@ int_very_careful:
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET -8
 	andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
-	cli
-	TRACE_IRQS_OFF
 	jmp int_restore_rest
 	
 int_signal:
@@ -411,7 +387,7 @@ int_restore_rest:
 	TRACE_IRQS_OFF
 	jmp int_with_check
 	CFI_ENDPROC
-END(int_ret_from_sys_call)
+END(system_call)
 		
 /* 
  * Certain special system calls that need to save a complete full stack frame.
-- 
cgit v1.2.3-70-g09d2


From 72690a21188586022a9e65cb6f1cc8845167555a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] x86: Don't use nested idle loops

Currently the idle loop has two nested loops -- one high level
in cpu_idle and in some low level idle functions another one.

Looping in the low level idle functions breaks the idle notifiers
because interrupts waking up sleep states need to execute
exit_idle() which is only in cpu_idle().

So don't do that, only loop in cpu_idle(). This only removes
code.

In some cases e.g. poll_idle the idle loop is a little longer
now because cpu_idle checks more things. I hope that isn't a problem
ACPI idle doesn't change behaviour because it never looped anyways.

Cc: len.brown@intel.com
Cc: eranian@hpl.hp.com
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/process.c   | 30 +++++++++---------------------
 arch/x86_64/kernel/process.c | 30 +++++++++---------------------
 2 files changed, 18 insertions(+), 42 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 8749b10d380..8f42659ef9d 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -100,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt);
  */
 void default_idle(void)
 {
-	local_irq_enable();
-
 	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
 		current_thread_info()->status &= ~TS_POLLING;
 		smp_mb__after_clear_bit();
-		while (!need_resched()) {
-			local_irq_disable();
-			if (!need_resched())
-				safe_halt();
-			else
-				local_irq_enable();
-		}
+		local_irq_disable();
+		if (!need_resched())
+			safe_halt();	/* enables interrupts racelessly */
+		else
+			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
 	} else {
-		while (!need_resched())
-			cpu_relax();
+		/* loop is done by the caller */
+		cpu_relax();
 	}
 }
 #ifdef CONFIG_APM_MODULE
@@ -129,14 +125,7 @@ EXPORT_SYMBOL(default_idle);
  */
 static void poll_idle (void)
 {
-	local_irq_enable();
-
-	asm volatile(
-		"2:"
-		"testl %0, %1;"
-		"rep; nop;"
-		"je 2b;"
-		: : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
+	cpu_relax();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -257,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 static void mwait_idle(void)
 {
 	local_irq_enable();
-	while (!need_resched())
-		mwait_idle_with_hints(0, 0);
+	mwait_idle_with_hints(0, 0);
 }
 
 void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7451a4c43c1..0b7b4caa4f7 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -108,17 +108,15 @@ void exit_idle(void)
  */
 static void default_idle(void)
 {
-	local_irq_enable();
-
 	current_thread_info()->status &= ~TS_POLLING;
 	smp_mb__after_clear_bit();
-	while (!need_resched()) {
-		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
-		else
-			local_irq_enable();
-	}
+	local_irq_disable();
+	if (!need_resched()) {
+		/* Enables interrupts one instruction before HLT.
+		   x86 special cases this so there is no race. */
+		safe_halt();
+	} else
+		local_irq_enable();
 	current_thread_info()->status |= TS_POLLING;
 }
 
@@ -129,16 +127,7 @@ static void default_idle(void)
  */
 static void poll_idle (void)
 {
-	local_irq_enable();
-
-	asm volatile(
-		"2:"
-		"testl %0,%1;"
-		"rep; nop;"
-		"je 2b;"
-		: :
-		"i" (_TIF_NEED_RESCHED),
-		"m" (current_thread_info()->flags));
+	cpu_relax();
 }
 
 void cpu_idle_wait(void)
@@ -257,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 static void mwait_idle(void)
 {
 	local_irq_enable();
-	while (!need_resched())
-		mwait_idle_with_hints(0,0);
+	mwait_idle_with_hints(0,0);
 }
 
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-- 
cgit v1.2.3-70-g09d2


From 7cd8b6861eb586aabe4c725cc0c259ce2e653695 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] x86: remove last two pci_find offenders in the core code

Resending as I believe the discussion about them established they were
correct.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/i386/pci/irq.c              | 4 +++-
 arch/x86_64/kernel/pci-calgary.c | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index e65551cd821..f2cb942f828 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -764,7 +764,7 @@ static void __init pirq_find_router(struct irq_router *r)
 	DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
 	    rt->rtr_vendor, rt->rtr_device);
 
-	pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
+	pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
 	if (!pirq_router_dev) {
 		DBG(KERN_DEBUG "PCI: Interrupt router not found at "
 			"%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
@@ -784,6 +784,8 @@ static void __init pirq_find_router(struct irq_router *r)
 		pirq_router_dev->vendor,
 		pirq_router_dev->device,
 		pci_name(pirq_router_dev));
+
+	/* The device remains referenced for the kernel lifetime */
 }
 
 static struct irq_info *pirq_get_info(struct pci_dev *dev)
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 37a770859e7..d2ea87a9526 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -921,7 +921,7 @@ static int __init calgary_init(void)
 
 error:
 	do {
-		dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
+		dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
 					      PCI_DEVICE_ID_IBM_CALGARY,
 					      dev);
 		if (!dev)
-- 
cgit v1.2.3-70-g09d2


From 9c5f8be4625e73f17e28fea89399ed871a30e064 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:03 +0100
Subject: [PATCH] x86: Mention PCI instead of RAM in NMI parity error message

On modern systems RAM errors don't cause NMIs, but it's usually
caused by PCI SERR. Mention PCI instead of RAM in the printk.

Reported by r_hayashi@ctc-g.co.jp (Ryutaro Hayashi)

Cc:  r_hayashi@ctc-g.co.jp

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/traps.c   | 3 +--
 arch/x86_64/kernel/traps.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 4a6fa2837df..237f4884a1e 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -708,8 +708,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
 		"CPU %d.\n", reason, smp_processor_id());
-	printk(KERN_EMERG "You probably have a hardware problem with your RAM "
-			"chips\n");
+	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
 	if (panic_on_unrecovered_nmi)
                 panic("NMI: Not continuing");
 
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index e37b4d77d5a..70bfaab9822 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -793,8 +793,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
 		reason);
-	printk(KERN_EMERG "You probably have a hardware problem with your "
-		"RAM chips\n");
+	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
 
 	if (panic_on_unrecovered_nmi)
 		panic("NMI: Not continuing");
-- 
cgit v1.2.3-70-g09d2


From 5df0287ecc4f53e68bbd188fa8258b555e6b734f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:05 +0100
Subject: [PATCH] x86-64: Extend clear_irq_vector

Clear the irq releated entries in irq_vector, irq_domain and vector_irq
instead of clearing irq_vector only. So when new irq is created, it
could reuse that vector. (actually is the second loop scanning from
FIRST_DEVICE_VECTOR+8). This could avoid the vectors are used up
with enough module inserting and removing

Cc: Eric W. Biedierman <ebiederm@xmission.com>
Cc: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-By: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/io_apic.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c80081a6ba4..f71461b1f03 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -750,6 +750,22 @@ static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
 	return vector;
 }
 
+static void __clear_irq_vector(int irq)
+{
+	cpumask_t mask;
+	int cpu, vector;
+
+	BUG_ON(!irq_vector[irq]);
+
+	vector = irq_vector[irq];
+	cpus_and(mask, irq_domain[irq], cpu_online_map);
+	for_each_cpu_mask(cpu, mask)
+		per_cpu(vector_irq, cpu)[vector] = -1;
+
+	irq_vector[irq] = 0;
+	irq_domain[irq] = CPU_MASK_NONE;
+}
+
 void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
@@ -1837,7 +1853,7 @@ void destroy_irq(unsigned int irq)
 	dynamic_irq_cleanup(irq);
 
 	spin_lock_irqsave(&vector_lock, flags);
-	irq_vector[irq] = 0;
+	__clear_irq_vector(irq);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
-- 
cgit v1.2.3-70-g09d2


From ab2bf0c1c689905b628dca94d0acd9c50e152468 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] x86-64: Use probe_kernel_address in arch/x86_64/*

Instead of open coded __get_user

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/traps.c |  2 +-
 arch/x86_64/mm/fault.c     | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 70bfaab9822..264db33476a 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -30,9 +30,9 @@
 #include <linux/kprobes.h>
 #include <linux/kexec.h>
 #include <linux/unwind.h>
+#include <linux/uaccess.h>
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
 #include <asm/debugreg.h>
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 3751b4788e2..a65fc6f1dca 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -23,9 +23,9 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/smp.h>
 #include <asm/tlbflush.h>
@@ -96,7 +96,7 @@ void bust_spinlocks(int yes)
 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 				unsigned long error_code)
 { 
-	unsigned char __user *instr;
+	unsigned char *instr;
 	int scan_more = 1;
 	int prefetch = 0; 
 	unsigned char *max_instr;
@@ -116,7 +116,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 		unsigned char instr_hi;
 		unsigned char instr_lo;
 
-		if (__get_user(opcode, (char __user *)instr))
+		if (probe_kernel_address(instr, opcode))
 			break; 
 
 		instr_hi = opcode & 0xf0; 
@@ -154,7 +154,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 		case 0x00:
 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
 			scan_more = 0;
-			if (__get_user(opcode, (char __user *)instr))
+			if (probe_kernel_address(instr, opcode))
 				break;
 			prefetch = (instr_lo == 0xF) &&
 				(opcode == 0x0D || opcode == 0x18);
@@ -170,7 +170,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 static int bad_address(void *p) 
 { 
 	unsigned long dummy;
-	return __get_user(dummy, (unsigned long __user *)p);
+	return probe_kernel_address((unsigned long *)p, dummy);
 } 
 
 void dump_pagetable(unsigned long address)
-- 
cgit v1.2.3-70-g09d2


From b026872601976f666bae77b609dc490d1834bf77 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] x86-64: Try multiple timer variants in check_timer

Instead of adding all kinds of more quirks try various timer
routing variants in check_timer.

In particular this tries to handle quirks from:
- Nvidia NF2-4 reference BIOS: wrong timer override
- Asus: Wrong timer override but no HPET table
- ATI: require timer disabled in 8259
- Some boards: require timer enabled in 8259

We just try many of the the known variants in the hopefully right order
in check_timer.

Trying pin 0/2 on Nvidia suggested by Tim Hockin.

TBD Experimental. Needs a lot of testing

Signed-off-by: Andi Kleen <ak@suse.de>
---
 Documentation/x86_64/boot-options.txt |   4 --
 arch/x86_64/kernel/early-quirks.c     |   5 --
 arch/x86_64/kernel/io_apic.c          | 124 ++++++++++++++--------------------
 3 files changed, 51 insertions(+), 82 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index f3c57f43ba6..ab9b1c046c0 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -52,10 +52,6 @@ APICs
 		 apicmaintimer. Useful when your PIT timer is totally
 		 broken.
 
-   disable_8254_timer / enable_8254_timer
-		 Enable interrupt 0 timer routing over the 8254 in addition to over
-	         the IO-APIC. The kernel tries to set a sensible default.
-
 Early Console
 
    syntax: earlyprintk=vga
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index 68273bff58c..fb0c6da41b7 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -69,11 +69,6 @@ static void nvidia_bugs(void)
 
 static void ati_bugs(void)
 {
-	if (timer_over_8254 == 1) {
-		timer_over_8254 = 0;
-		printk(KERN_INFO
-	 	"ATI board detected. Disabling timer routing over 8254.\n");
-	}
 }
 
 struct chipset {
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index f71461b1f03..88fcc4ebbf6 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -55,10 +55,6 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
 
 static int no_timer_check;
 
-static int disable_timer_pin_1 __initdata;
-
-int timer_over_8254 __initdata = 1;
-
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
@@ -348,29 +344,6 @@ static int __init disable_ioapic_setup(char *str)
 }
 early_param("noapic", disable_ioapic_setup);
 
-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
-static int __init disable_timer_pin_setup(char *arg)
-{
-	disable_timer_pin_1 = 1;
-	return 1;
-}
-__setup("disable_timer_pin_1", disable_timer_pin_setup);
-
-static int __init setup_disable_8254_timer(char *s)
-{
-	timer_over_8254 = -1;
-	return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-	timer_over_8254 = 2;
-	return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
-
 /*
  * Find the IRQ entry number of a certain pin.
  */
@@ -1579,10 +1552,33 @@ static inline void unlock_ExtINT_logic(void)
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
- *
- * FIXME: really need to revamp this for modern platforms only.
  */
-static inline void check_timer(void)
+
+static int try_apic_pin(int apic, int pin, char *msg)
+{
+	apic_printk(APIC_VERBOSE, KERN_INFO
+		    "..TIMER: trying IO-APIC=%d PIN=%d %s",
+		    apic, pin, msg);
+
+	/*
+	 * Ok, does IRQ0 through the IOAPIC work?
+	 */
+	if (!no_timer_check && timer_irq_works()) {
+		nmi_watchdog_default();
+		if (nmi_watchdog == NMI_IO_APIC) {
+			disable_8259A_irq(0);
+			setup_nmi();
+			enable_8259A_irq(0);
+		}
+		return 1;
+	}
+	clear_IO_APIC_pin(apic, pin);
+	apic_printk(APIC_QUIET, KERN_ERR " .. failed\n");
+	return 0;
+}
+
+/* The function from hell */
+static void check_timer(void)
 {
 	int apic1, pin1, apic2, pin2;
 	int vector;
@@ -1603,61 +1599,43 @@ static inline void check_timer(void)
 	 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	if (timer_over_8254 > 0)
-		enable_8259A_irq(0);
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		vector, apic1, pin1, apic2, pin2);
+	/* Do this first, otherwise we get double interrupts on ATI boards */
+	if ((pin1 != -1) && try_apic_pin(apic1, pin1,"with 8259 IRQ0 disabled"))
+		return;
 
-	if (pin1 != -1) {
-		/*
-		 * Ok, does IRQ0 through the IOAPIC work?
-		 */
-		unmask_IO_APIC_irq(0);
-		if (!no_timer_check && timer_irq_works()) {
-			nmi_watchdog_default();
-			if (nmi_watchdog == NMI_IO_APIC) {
-				disable_8259A_irq(0);
-				setup_nmi();
-				enable_8259A_irq(0);
-			}
-			if (disable_timer_pin_1 > 0)
-				clear_IO_APIC_pin(0, pin1);
-			return;
-		}
-		clear_IO_APIC_pin(apic1, pin1);
-		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
-				"connected to IO-APIC\n");
-	}
+	/* Now try again with IRQ0 8259A enabled.
+	   Assumes timer is on IO-APIC 0 ?!? */
+	enable_8259A_irq(0);
+	unmask_IO_APIC_irq(0);
+	if (try_apic_pin(apic1, pin1, "with 8259 IRQ0 enabled"))
+		return;
+	disable_8259A_irq(0);
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
-				"through the 8259A ... ");
+	/* Always try pin0 and pin2 on APIC 0 to handle buggy timer overrides
+	   on Nvidia boards */
+	if (!(apic1 == 0 && pin1 == 0) &&
+	    try_apic_pin(0, 0, "fallback with 8259 IRQ0 disabled"))
+		return;
+	if (!(apic1 == 0 && pin1 == 2) &&
+	    try_apic_pin(0, 2, "fallback with 8259 IRQ0 disabled"))
+		return;
+
+	/* Then try pure 8259A routing on the 8259 as reported by BIOS*/
+	enable_8259A_irq(0);
 	if (pin2 != -1) {
-		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-			apic2, pin2);
-		/*
-		 * legacy devices should be connected to IO APIC #0
-		 */
 		setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
-		if (timer_irq_works()) {
-			apic_printk(APIC_VERBOSE," works.\n");
-			nmi_watchdog_default();
-			if (nmi_watchdog == NMI_IO_APIC) {
-				setup_nmi();
-			}
+		if (try_apic_pin(apic2,pin2,"8259A broadcast ExtINT from BIOS"))
 			return;
-		}
-		/*
-		 * Cleanup, just in case ...
-		 */
-		clear_IO_APIC_pin(apic2, pin2);
 	}
-	apic_printk(APIC_VERBOSE," failed.\n");
+
+	/* Tried all possibilities to go through the IO-APIC. Now come the
+	   really cheesy fallbacks. */
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-- 
cgit v1.2.3-70-g09d2


From 58db85482743f5e3495d168c641c60ce1d3dfb06 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] calgary: phb_shift can be int

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/pci-calgary.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index d2ea87a9526..f53b581dfd0 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -740,7 +740,7 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar,
 {
 	u64 val64;
 	void __iomem *target;
-	unsigned long phb_shift = -1;
+	unsigned int phb_shift = ~0; /* silence gcc */
 	u64 mask;
 
 	switch (busno_to_phbid(busnum)) {
-- 
cgit v1.2.3-70-g09d2


From b34e90b8f0f30151349134f87b5dc6ef75a5218c Mon Sep 17 00:00:00 2001
From: Laurent Vivier <Laurent.Vivier@bull.net>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] Calgary: use BIOS supplied BBARs and topology information

Find the BBAR register address of each Calgary using the "Extended
BIOS Data Area" rather than calculating it ourselves. Also get the bus
topology (what PHB each bus is on) from Calgary rather than
calculating it ourselves.

This patch fixes http://bugzilla.kernel.org/show_bug.cgi?id=7407.

Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/pci-calgary.c | 161 ++++++++++++++++++++++++++++++---------
 include/asm-x86_64/rio.h         |  76 ++++++++++++++++++
 2 files changed, 201 insertions(+), 36 deletions(-)
 create mode 100644 include/asm-x86_64/rio.h

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index f53b581dfd0..afc0a53505f 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -41,6 +41,7 @@
 #include <asm/pci-direct.h>
 #include <asm/system.h>
 #include <asm/dma.h>
+#include <asm/rio.h>
 
 #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
 #define PCI_VENDOR_DEVICE_ID_CALGARY \
@@ -115,14 +116,35 @@ static const unsigned long phb_offsets[] = {
 	0xB000 /* PHB3 */
 };
 
+/* PHB debug registers */
+
+static const unsigned long phb_debug_offsets[] = {
+	0x4000	/* PHB 0 DEBUG */,
+	0x5000	/* PHB 1 DEBUG */,
+	0x6000	/* PHB 2 DEBUG */,
+	0x7000	/* PHB 3 DEBUG */
+};
+
+/*
+ * STUFF register for each debug PHB,
+ * byte 1 = start bus number, byte 2 = end bus number
+ */
+
+#define PHB_DEBUG_STUFF_OFFSET	0x0020
+
 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
 static int translate_empty_slots __read_mostly = 0;
 static int calgary_detected __read_mostly = 0;
 
+static struct rio_table_hdr	*rio_table_hdr __initdata;
+static struct scal_detail	*scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail	*rio_devs[MAX_NUMNODES*4] __initdata;
+
 struct calgary_bus_info {
 	void *tce_space;
 	unsigned char translation_disabled;
 	signed char phbid;
+	void __iomem *bbar;
 };
 
 static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
@@ -475,6 +497,11 @@ static struct dma_mapping_ops calgary_dma_ops = {
 	.unmap_sg = calgary_unmap_sg,
 };
 
+static inline void __iomem * busno_to_bbar(unsigned char num)
+{
+	return bus_info[num].bbar;
+}
+
 static inline int busno_to_phbid(unsigned char num)
 {
 	return bus_info[num].phbid;
@@ -828,31 +855,9 @@ static void __init calgary_disable_translation(struct pci_dev *dev)
 	del_timer_sync(&tbl->watchdog_timer);
 }
 
-static inline unsigned int __init locate_register_space(struct pci_dev *dev)
+static inline void __iomem * __init locate_register_space(struct pci_dev *dev)
 {
-	int rionodeid;
-	u32 address;
-
-	/*
-	 * Each Calgary has four busses. The first four busses (first Calgary)
-	 * have RIO node ID 2, then the next four (second Calgary) have RIO
-	 * node ID 3, the next four (third Calgary) have node ID 2 again, etc.
-	 * We use a gross hack - relying on the dev->bus->number ordering,
-	 * modulo 14 - to decide which Calgary a given bus is on. Busses 0, 1,
-	 * 2 and 4 are on the first Calgary (id 2), 6, 8, a and c are on the
-	 * second (id 3), and then it repeats modulo 14.
- 	 */
-	rionodeid = (dev->bus->number % 14 > 4) ? 3 : 2;
-	/*
-	 * register space address calculation as follows:
-	 * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase)
-	 * ChassisBase is always zero for x366/x260/x460
-	 * RioNodeId is 2 for first Calgary, 3 for second Calgary
-	 */
-	address = START_ADDRESS	-
-		(0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 14)) +
-		(0x100000) * (rionodeid - CHASSIS_BASE);
-	return address;
+	return busno_to_bbar(dev->bus->number);
 }
 
 static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
@@ -864,15 +869,12 @@ static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
 
 static int __init calgary_init_one(struct pci_dev *dev)
 {
-	u32 address;
 	void __iomem *bbar;
 	int ret;
 
 	BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
 
-	address = locate_register_space(dev);
-	/* map entire 1MB of Calgary config space */
-	bbar = ioremap_nocache(address, 1024 * 1024);
+	bbar = locate_register_space(dev);
 	if (!bbar) {
 		ret = -ENODATA;
 		goto done;
@@ -898,6 +900,35 @@ static int __init calgary_init(void)
 {
 	int ret = -ENODEV;
 	struct pci_dev *dev = NULL;
+	int rio, phb, bus;
+	void __iomem *bbar;
+	void __iomem *target;
+	u8 start_bus, end_bus;
+	u32 val;
+
+	for (rio = 0; rio < rio_table_hdr->num_rio_dev; rio++) {
+
+		if ( (rio_devs[rio]->type != COMPAT_CALGARY) &&
+		     (rio_devs[rio]->type != ALT_CALGARY) )
+			continue;
+
+		/* map entire 1MB of Calgary config space */
+		bbar = ioremap_nocache(rio_devs[rio]->BBAR, 1024 * 1024);
+
+		for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
+
+			target = calgary_reg(bbar, phb_debug_offsets[phb] |
+						   PHB_DEBUG_STUFF_OFFSET);
+			val = be32_to_cpu(readl(target));
+			start_bus = (u8)((val & 0x00FF0000) >> 16);
+			end_bus =   (u8)((val & 0x0000FF00) >> 8);
+			for (bus = start_bus; bus <= end_bus; bus++) {
+				bus_info[bus].bbar = bbar;
+				bus_info[bus].phbid = phb;
+			}
+		}
+	}
+
 
 	do {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM,
@@ -962,13 +993,55 @@ static inline int __init determine_tce_table_size(u64 ram)
 	return ret;
 }
 
+static int __init build_detail_arrays(void)
+{
+	unsigned long ptr;
+	int i, scal_detail_size, rio_detail_size;
+
+	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
+		printk(KERN_WARNING
+			"Calgary: MAX_NUMNODES too low!  Defined as %d, "
+			"but system has %d nodes.\n",
+			MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+		return -ENODEV;
+	}
+
+	switch (rio_table_hdr->version){
+	default:
+		printk(KERN_WARNING
+		       "Calgary: Invalid Rio Grande Table Version: %d\n",
+		       rio_table_hdr->version);
+		return -ENODEV;
+	case 2:
+		scal_detail_size = 11;
+		rio_detail_size = 13;
+		break;
+	case 3:
+		scal_detail_size = 12;
+		rio_detail_size = 15;
+		break;
+	}
+
+	ptr = ((unsigned long)rio_table_hdr) + 3;
+	for (i = 0; i < rio_table_hdr->num_scal_dev;
+		    i++, ptr += scal_detail_size)
+		scal_devs[i] = (struct scal_detail *)ptr;
+
+	for (i = 0; i < rio_table_hdr->num_rio_dev;
+		    i++, ptr += rio_detail_size)
+		rio_devs[i] = (struct rio_detail *)ptr;
+
+	return 0;
+}
+
 void __init detect_calgary(void)
 {
 	u32 val;
 	int bus;
 	void *tbl;
 	int calgary_found = 0;
-	int phb = -1;
+	unsigned long ptr;
+	int offset;
 
 	/*
 	 * if the user specified iommu=off or iommu=soft or we found
@@ -980,6 +1053,29 @@ void __init detect_calgary(void)
 	if (!early_pci_allowed())
 		return;
 
+	ptr = (unsigned long)phys_to_virt(get_bios_ebda());
+
+	rio_table_hdr = NULL;
+	offset = 0x180;
+	while (offset) {
+		/* The block id is stored in the 2nd word */
+		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+			/* set the pointer past the offset & block id */
+			rio_table_hdr = (struct rio_table_hdr *)(ptr+offset+4);
+			break;
+		}
+		/* The next offset is stored in the 1st word. 0 means no more */
+		offset = *((unsigned short *)(ptr + offset));
+	}
+	if (!rio_table_hdr){
+		printk(KERN_ERR "Calgary: Unable to locate "
+				"Rio Grande Table in EBDA - bailing!\n");
+		return;
+	}
+
+	if (build_detail_arrays())
+		return;
+
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
 	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
@@ -990,12 +1086,6 @@ void __init detect_calgary(void)
 		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
 			continue;
 
-		/*
-		 * There are 4 PHBs per Calgary chip.  Set phb to which phb (0-3)
-		 * it is connected to releative to the clagary chip.
-		 */
-		phb = (phb + 1) % PHBS_PER_CALGARY;
-
 		if (info->translation_disabled)
 			continue;
 
@@ -1010,7 +1100,6 @@ void __init detect_calgary(void)
 				if (!tbl)
 					goto cleanup;
 				info->tce_space = tbl;
-				info->phbid = phb;
 				calgary_found = 1;
 				break;
 			}
diff --git a/include/asm-x86_64/rio.h b/include/asm-x86_64/rio.h
new file mode 100644
index 00000000000..1f315c39d76
--- /dev/null
+++ b/include/asm-x86_64/rio.h
@@ -0,0 +1,76 @@
+/*
+ * Derived from include/asm-i386/mach-summit/mach_mpparse.h
+ *          and include/asm-i386/mach-default/bios_ebda.h
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ */
+
+#ifndef __ASM_RIO_H
+#define __ASM_RIO_H
+
+#define RIO_TABLE_VERSION	3
+
+struct rio_table_hdr {
+	u8 version;      /* Version number of this data structure  */
+	u8 num_scal_dev; /* # of Scalability devices               */
+	u8 num_rio_dev;  /* # of RIO I/O devices                   */
+} __attribute__((packed));
+
+struct scal_detail {
+	u8 node_id;      /* Scalability Node ID                    */
+	u32  CBAR;       /* Address of 1MB register space          */
+	u8 port0node;    /* Node ID port connected to: 0xFF=None   */
+	u8 port0port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 port1node;    /* Node ID port connected to: 0xFF = None */
+	u8 port1port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 port2node;    /* Node ID port connected to: 0xFF = None */
+	u8 port2port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 chassis_num;  /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+	u8 node_id;      /* RIO Node ID                            */
+	u32  BBAR;       /* Address of 1MB register space          */
+	u8 type;         /* Type of device                         */
+	u8 owner_id;     /* Node ID of Hurricane that owns this    */
+	                 /* node                                   */
+	u8 port0node;    /* Node ID port connected to: 0xFF=None   */
+	u8 port0port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 port1node;    /* Node ID port connected to: 0xFF=None   */
+	u8 port1port;    /* Port num port connected to: 0,1,2, or  */
+	                 /* 0xFF=None                              */
+	u8 first_slot;   /* Lowest slot number below this Calgary  */
+	u8 status;       /* Bit 0 = 1 : the XAPIC is used          */
+	                 /*       = 0 : the XAPIC is not used, ie: */
+	                 /*            ints fwded to another XAPIC */
+	                 /*           Bits1:7 Reserved             */
+	u8 WP_index;     /* instance index - lower ones have       */
+	                 /*     lower slot numbers/PCI bus numbers */
+	u8 chassis_num;  /* 1 based Chassis number                 */
+} __attribute__((packed));
+
+enum {
+	HURR_SCALABILTY	= 0,  /* Hurricane Scalability info */
+	HURR_RIOIB	= 2,  /* Hurricane RIOIB info       */
+	COMPAT_CALGARY	= 4,  /* Compatibility Calgary      */
+	ALT_CALGARY	= 5,  /* Second Planar Calgary      */
+};
+
+/*
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E.
+ */
+
+static inline unsigned long get_bios_ebda(void)
+{
+	unsigned long address= *(unsigned short *)phys_to_virt(0x40Eul);
+	address <<= 4;
+	return address;
+}
+
+#endif /* __ASM_RIO_H */
-- 
cgit v1.2.3-70-g09d2


From eae93755540bae18aff46b8a0e621b5d65bd5380 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:06 +0100
Subject: [PATCH] Calgary: check BBAR ioremap success when ioremapping

This patch cleans up the previous "Use BIOS supplied BBAR information"
patch. Mostly stylistic clenaups, but also check for ioremap failure
when we ioremap the BBAR rather than when trying to use it.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: Laurent Vivier <Laurent.Vivier@bull.net>
---
 arch/x86_64/kernel/pci-calgary.c | 85 +++++++++++++++++++++++-----------------
 include/asm-x86_64/rio.h         |  8 ++--
 2 files changed, 52 insertions(+), 41 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index afc0a53505f..8a1e4f35bc3 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -138,7 +138,7 @@ static int calgary_detected __read_mostly = 0;
 
 static struct rio_table_hdr	*rio_table_hdr __initdata;
 static struct scal_detail	*scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail	*rio_devs[MAX_NUMNODES*4] __initdata;
+static struct rio_detail	*rio_devs[MAX_NUMNODES * 4] __initdata;
 
 struct calgary_bus_info {
 	void *tce_space;
@@ -855,11 +855,6 @@ static void __init calgary_disable_translation(struct pci_dev *dev)
 	del_timer_sync(&tbl->watchdog_timer);
 }
 
-static inline void __iomem * __init locate_register_space(struct pci_dev *dev)
-{
-	return busno_to_bbar(dev->bus->number);
-}
-
 static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
 {
 	pci_dev_get(dev);
@@ -874,15 +869,10 @@ static int __init calgary_init_one(struct pci_dev *dev)
 
 	BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
 
-	bbar = locate_register_space(dev);
-	if (!bbar) {
-		ret = -ENODATA;
-		goto done;
-	}
-
+	bbar = busno_to_bbar(dev->bus->number);
 	ret = calgary_setup_tar(dev, bbar);
 	if (ret)
-		goto iounmap;
+		goto done;
 
 	pci_dev_get(dev);
 	dev->bus->self = dev;
@@ -890,38 +880,39 @@ static int __init calgary_init_one(struct pci_dev *dev)
 
 	return 0;
 
-iounmap:
-	iounmap(bbar);
 done:
 	return ret;
 }
 
-static int __init calgary_init(void)
+static int __init calgary_locate_bbars(void)
 {
-	int ret = -ENODEV;
-	struct pci_dev *dev = NULL;
-	int rio, phb, bus;
+	int ret;
+	int rioidx, phb, bus;
 	void __iomem *bbar;
 	void __iomem *target;
+	unsigned long offset;
 	u8 start_bus, end_bus;
 	u32 val;
 
-	for (rio = 0; rio < rio_table_hdr->num_rio_dev; rio++) {
+	ret = -ENODATA;
+	for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
+		struct rio_detail *rio = rio_devs[rioidx];
 
-		if ( (rio_devs[rio]->type != COMPAT_CALGARY) &&
-		     (rio_devs[rio]->type != ALT_CALGARY) )
+		if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
 			continue;
 
 		/* map entire 1MB of Calgary config space */
-		bbar = ioremap_nocache(rio_devs[rio]->BBAR, 1024 * 1024);
+		bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
+		if (!bbar)
+			goto error;
 
 		for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
+			offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
+			target = calgary_reg(bbar, offset);
 
-			target = calgary_reg(bbar, phb_debug_offsets[phb] |
-						   PHB_DEBUG_STUFF_OFFSET);
 			val = be32_to_cpu(readl(target));
 			start_bus = (u8)((val & 0x00FF0000) >> 16);
-			end_bus =   (u8)((val & 0x0000FF00) >> 8);
+			end_bus = (u8)((val & 0x0000FF00) >> 8);
 			for (bus = start_bus; bus <= end_bus; bus++) {
 				bus_info[bus].bbar = bbar;
 				bus_info[bus].phbid = phb;
@@ -929,6 +920,25 @@ static int __init calgary_init(void)
 		}
 	}
 
+	return 0;
+
+error:
+	/* scan bus_info and iounmap any bbars we previously ioremap'd */
+	for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
+		if (bus_info[bus].bbar)
+			iounmap(bus_info[bus].bbar);
+
+	return ret;
+}
+
+static int __init calgary_init(void)
+{
+	int ret;
+	struct pci_dev *dev = NULL;
+
+	ret = calgary_locate_bbars();
+	if (ret)
+		return ret;
 
 	do {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM,
@@ -1000,18 +1010,13 @@ static int __init build_detail_arrays(void)
 
 	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
 		printk(KERN_WARNING
-			"Calgary: MAX_NUMNODES too low!  Defined as %d, "
+			"Calgary: MAX_NUMNODES too low! Defined as %d, "
 			"but system has %d nodes.\n",
 			MAX_NUMNODES, rio_table_hdr->num_scal_dev);
 		return -ENODEV;
 	}
 
 	switch (rio_table_hdr->version){
-	default:
-		printk(KERN_WARNING
-		       "Calgary: Invalid Rio Grande Table Version: %d\n",
-		       rio_table_hdr->version);
-		return -ENODEV;
 	case 2:
 		scal_detail_size = 11;
 		rio_detail_size = 13;
@@ -1020,6 +1025,11 @@ static int __init build_detail_arrays(void)
 		scal_detail_size = 12;
 		rio_detail_size = 15;
 		break;
+	default:
+		printk(KERN_WARNING
+		       "Calgary: Invalid Rio Grande Table Version: %d\n",
+		       rio_table_hdr->version);
+		return -EPROTO;
 	}
 
 	ptr = ((unsigned long)rio_table_hdr) + 3;
@@ -1042,6 +1052,7 @@ void __init detect_calgary(void)
 	int calgary_found = 0;
 	unsigned long ptr;
 	int offset;
+	int ret;
 
 	/*
 	 * if the user specified iommu=off or iommu=soft or we found
@@ -1061,27 +1072,29 @@ void __init detect_calgary(void)
 		/* The block id is stored in the 2nd word */
 		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
 			/* set the pointer past the offset & block id */
-			rio_table_hdr = (struct rio_table_hdr *)(ptr+offset+4);
+			rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
 			break;
 		}
 		/* The next offset is stored in the 1st word. 0 means no more */
 		offset = *((unsigned short *)(ptr + offset));
 	}
-	if (!rio_table_hdr){
+	if (!rio_table_hdr) {
 		printk(KERN_ERR "Calgary: Unable to locate "
 				"Rio Grande Table in EBDA - bailing!\n");
 		return;
 	}
 
-	if (build_detail_arrays())
+	ret = build_detail_arrays();
+	if (ret) {
+		printk(KERN_ERR "Calgary: build_detail_arrays ret %d\n", ret);
 		return;
+	}
 
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
 	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
 		int dev;
 		struct calgary_bus_info *info = &bus_info[bus];
-		info->phbid = -1;
 
 		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
 			continue;
diff --git a/include/asm-x86_64/rio.h b/include/asm-x86_64/rio.h
index 1f315c39d76..c7350f6d201 100644
--- a/include/asm-x86_64/rio.h
+++ b/include/asm-x86_64/rio.h
@@ -3,7 +3,6 @@
  *          and include/asm-i386/mach-default/bios_ebda.h
  *
  * Author: Laurent Vivier <Laurent.Vivier@bull.net>
- *
  */
 
 #ifndef __ASM_RIO_H
@@ -19,7 +18,7 @@ struct rio_table_hdr {
 
 struct scal_detail {
 	u8 node_id;      /* Scalability Node ID                    */
-	u32  CBAR;       /* Address of 1MB register space          */
+	u32 CBAR;        /* Address of 1MB register space          */
 	u8 port0node;    /* Node ID port connected to: 0xFF=None   */
 	u8 port0port;    /* Port num port connected to: 0,1,2, or  */
 	                 /* 0xFF=None                              */
@@ -34,7 +33,7 @@ struct scal_detail {
 
 struct rio_detail {
 	u8 node_id;      /* RIO Node ID                            */
-	u32  BBAR;       /* Address of 1MB register space          */
+	u32 BBAR;        /* Address of 1MB register space          */
 	u8 type;         /* Type of device                         */
 	u8 owner_id;     /* Node ID of Hurricane that owns this    */
 	                 /* node                                   */
@@ -65,10 +64,9 @@ enum {
  * there is a real-mode segmented pointer pointing to the
  * 4K EBDA area at 0x40E.
  */
-
 static inline unsigned long get_bios_ebda(void)
 {
-	unsigned long address= *(unsigned short *)phys_to_virt(0x40Eul);
+	unsigned long address = *(unsigned short *)phys_to_virt(0x40EUL);
 	address <<= 4;
 	return address;
 }
-- 
cgit v1.2.3-70-g09d2


From bff6547bb6a4e82c399d74e7fba78b12d2f162ed Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] Calgary: allow compiling Calgary in but not using it by
 default

This patch makes it possible to compile Calgary in but not use it by
default. In this mode, use 'iommu=calgary' to activate it.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 Documentation/x86_64/boot-options.txt |  3 ++-
 arch/x86_64/Kconfig                   | 11 +++++++++++
 arch/x86_64/kernel/pci-calgary.c      |  9 +++++++++
 arch/x86_64/kernel/pci-dma.c          |  5 +++++
 include/asm-x86_64/calgary.h          |  2 ++
 5 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index ab9b1c046c0..dbdcaf68e3e 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -179,7 +179,7 @@ PCI
 IOMMU
 
  iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
-         [,forcesac][,fullflush][,nomerge][,noaperture]
+         [,forcesac][,fullflush][,nomerge][,noaperture][,calgary]
    size  set size of iommu (in bytes)
    noagp don't initialize the AGP driver and use full aperture.
    off   don't use the IOMMU
@@ -200,6 +200,7 @@ IOMMU
 	    buffering.
    nodac    Forbid DMA >4GB
    panic    Always panic when IOMMU overflows
+   calgary  Use the Calgary IOMMU if it is available
 
   swiotlb=pages[,force]
 
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 010d2265f1c..5cb509dbffe 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -455,6 +455,17 @@ config CALGARY_IOMMU
 	  Normally the kernel will make the right choice by itself.
 	  If unsure, say Y.
 
+config CALGARY_IOMMU_ENABLED_BY_DEFAULT
+	bool "Should Calgary be enabled by default?"
+	default y
+	depends on CALGARY_IOMMU
+	help
+	  Should Calgary be enabled by default? if you choose 'y', Calgary
+	  will be used (if it exists). If you choose 'n', Calgary will not be
+	  used even if it exists. If you choose 'n' and would like to use
+	  Calgary anyway, pass 'iommu=calgary' on the kernel command line.
+	  If unsure, say Y.
+
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	bool
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 8a1e4f35bc3..0ddf29dae7e 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -43,6 +43,12 @@
 #include <asm/dma.h>
 #include <asm/rio.h>
 
+#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+int use_calgary __read_mostly = 1;
+#else
+int use_calgary __read_mostly = 0;
+#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
+
 #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
 #define PCI_VENDOR_DEVICE_ID_CALGARY \
 	(PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16)
@@ -1061,6 +1067,9 @@ void __init detect_calgary(void)
 	if (swiotlb || no_iommu || iommu_detected)
 		return;
 
+	if (!use_calgary)
+		return;
+
 	if (!early_pci_allowed())
 		return;
 
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index f8d857453f8..683b7a5c1ab 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -296,6 +296,11 @@ __init int iommu_setup(char *p)
 		gart_parse_options(p);
 #endif
 
+#ifdef CONFIG_CALGARY_IOMMU
+		if (!strncmp(p, "calgary", 7))
+			use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
 		p += strcspn(p, ",");
 		if (*p == ',')
 			++p;
diff --git a/include/asm-x86_64/calgary.h b/include/asm-x86_64/calgary.h
index 6b93f5a3a5c..7ee90064571 100644
--- a/include/asm-x86_64/calgary.h
+++ b/include/asm-x86_64/calgary.h
@@ -51,6 +51,8 @@ struct iommu_table {
 #define TCE_TABLE_SIZE_4M		6
 #define TCE_TABLE_SIZE_8M		7
 
+extern int use_calgary;
+
 #ifdef CONFIG_CALGARY_IOMMU
 extern int calgary_iommu_init(void);
 extern void detect_calgary(void);
-- 
cgit v1.2.3-70-g09d2


From 516d2836434d279fef164bb1b964e83425d312d2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:07 +0100
Subject: [PATCH] x86-64: Fix race in IO-APIC routing entry setup.

Interrupt could happen between setting the IO-APIC entry
and setting its interrupt data.

Pointed out by Linus.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/io_apic.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 88fcc4ebbf6..eaf0b708e67 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -174,14 +174,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
  * the interrupt, and we need to make sure the entry is fully populated
  * before that happens.
  */
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 {
-	unsigned long flags;
 	union entry_union eu;
 	eu.entry = e;
-	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__ioapic_write_entry(apic, pin, e);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 103efcd9aac1de4da6a1477f2f3b9fcf35273a18 Mon Sep 17 00:00:00 2001
From: Ernie Petrides <petrides@redhat.com>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] x86-64: fix perms/range of vsyscall vma in /proc/*/maps

The final line of /proc/<pid>/maps on x86_64 for native 64-bit
tasks shows an incorrect ending address and incorrect permissions.  There
is only a single page mapped in this vsyscall region, and it is accessible
for both read and execute.

The patch below fixes this.  (Since 32-bit-compat tasks have a real vma
with correct perms/range, no change is necessary for that scenario.)

Before the patch, a "cat /proc/self/maps | tail -1" shows this:

        ffffffffff600000-ffffffffffe00000 ---p 00000000 [...]

After the patch, this is the output:

        ffffffffff600000-ffffffffff601000 r-xp 00000000 [...]

Signed-off-by: Ernie Petrides <petrides@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/vsyscall.c | 1 +
 arch/x86_64/mm/init.c         | 7 ++++---
 include/asm-x86_64/vsyscall.h | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 92546c1526f..c3de9a09cd9 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -290,6 +290,7 @@ static void __init map_vsyscall(void)
 	extern char __vsyscall_0;
 	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
 
+	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
 }
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 4c0c00ef3ca..2968b90ef8a 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -730,14 +730,15 @@ static __init int x8664_sysctl_init(void)
 __initcall(x8664_sysctl_init);
 #endif
 
-/* A pseudo VMAs to allow ptrace access for the vsyscall page.   This only
+/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
    covers the 64bit vsyscall page now. 32bit has a real VMA now and does
    not need special handling anymore. */
 
 static struct vm_area_struct gate_vma = {
 	.vm_start = VSYSCALL_START,
-	.vm_end = VSYSCALL_END,
-	.vm_page_prot = PAGE_READONLY
+	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
+	.vm_page_prot = PAGE_READONLY_EXEC,
+	.vm_flags = VM_READ | VM_EXEC
 };
 
 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index 01d1c17e284..05cb8dd200d 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -10,6 +10,7 @@ enum vsyscall_num {
 #define VSYSCALL_START (-10UL << 20)
 #define VSYSCALL_SIZE 1024
 #define VSYSCALL_END (-2UL << 20)
+#define VSYSCALL_MAPPED_PAGES 1
 #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
 
 #ifdef __KERNEL__
-- 
cgit v1.2.3-70-g09d2


From 9a457324229db34d3bcb0b67360130c287289401 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:09 +0100
Subject: [PATCH] x86-64: Rate limit no irq handler messages

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index e46c55856d4..0c06af6c13b 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -120,7 +120,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 
 	if (likely(irq < NR_IRQS))
 		generic_handle_irq(irq);
-	else
+	else if (printk_ratelimit())
 		printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
 			__func__, smp_processor_id(), vector);
 
-- 
cgit v1.2.3-70-g09d2


From 9899f826fc90beba4f78083f6230e06cbe1050c9 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] x86-64: add genapic_force

Add genapic_force. Used by the next Intel quirks patch.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/genapic.c | 9 ++++++++-
 include/asm-x86_64/genapic.h | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 8e78a75d186..b007433f96b 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -33,7 +33,7 @@ extern struct genapic apic_flat;
 extern struct genapic apic_physflat;
 
 struct genapic *genapic = &apic_flat;
-
+struct genapic *genapic_force;
 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
@@ -46,6 +46,13 @@ void __init clustered_apic_check(void)
 	u8 cluster_cnt[NUM_APIC_CLUSTERS];
 	int max_apic = 0;
 
+	/* genapic selection can be forced because of certain quirks.
+	 */
+	if (genapic_force) {
+		genapic = genapic_force;
+		goto print;
+	}
+
 #if defined(CONFIG_ACPI)
 	/*
 	 * Some x86_64 machines use physical APIC mode regardless of how many
diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h
index a0e9a4b9348..b80f4bb5f27 100644
--- a/include/asm-x86_64/genapic.h
+++ b/include/asm-x86_64/genapic.h
@@ -30,6 +30,6 @@ struct genapic {
 };
 
 
-extern struct genapic *genapic;
+extern struct genapic *genapic, *genapic_force, apic_flat;
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From b0d0a4ba45760b10ecee9035ed45b442c1a6cc84 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Thu, 7 Dec 2006 02:14:10 +0100
Subject: [PATCH] x86: fix the irqbalance quirk for E7320/E7520/E7525

Move the irqbalance quirks for E7320/E7520/E7525(Errata 23 in
http://download.intel.com/design/chipsets/specupdt/30304203.pdf) to early
quirks.

And add a PCI quirk for these platforms to check(which happens very late
during the boot) if the APIC routing is indeed set to default flat mode.

This fixes the breakage(in x86_64) of this quirk due to cpu hotplug which
selects physical mode instead of the logical flat(as needed for this errata
workaround).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/i386/kernel/acpi/earlyquirk.c | 21 +++++++++++++++++
 arch/i386/kernel/quirks.c          | 46 +++++++++++++++++++++++++++++---------
 arch/i386/kernel/smpboot.c         |  7 ++++++
 arch/x86_64/kernel/early-quirks.c  | 13 +++++++++++
 arch/x86_64/kernel/smpboot.c       |  8 +++++++
 include/asm-i386/genapic.h         |  2 +-
 include/asm-i386/irq.h             |  2 ++
 include/asm-x86_64/proto.h         |  1 +
 8 files changed, 88 insertions(+), 12 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index c9841692bb7..4b60af7f91d 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
 #include <asm/pci-direct.h>
 #include <asm/acpi.h>
 #include <asm/apic.h>
+#include <asm/irq.h>
 
 #ifdef CONFIG_ACPI
 
@@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
 	return 0;
 }
 
+static void check_intel(void)
+{
+	u16 vendor, device;
+
+	vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
+
+	if (vendor != PCI_VENDOR_ID_INTEL)
+		return;
+
+	device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
+#ifdef CONFIG_SMP
+	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
+		quirk_intel_irqbalance();
+#endif
+}
+
 void __init check_acpi_pci(void)
 {
 	int num, slot, func;
@@ -60,6 +79,8 @@ void __init check_acpi_pci(void)
 	if (!early_pci_allowed())
 		return;
 
+	check_intel();
+
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab1789bb..a01320a7b63 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,23 @@
  */
 #include <linux/pci.h>
 #include <linux/irq.h>
+#include <asm/pci-direct.h>
+#include <asm/genapic.h>
+#include <asm/cpu.h>
 
 #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
+{
+#ifdef CONFIG_X86_64
+	if (genapic !=  &apic_flat)
+		panic("APIC mode must be flat on this system\n");
+#elif defined(CONFIG_X86_GENERICARCH)
+	if (genapic != &apic_default)
+		panic("APIC mode must be default(flat) on this system. Use apic=default\n");
+#endif
+}
 
-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+void __init quirk_intel_irqbalance(void)
 {
 	u8 config, rev;
 	u32 word;
@@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	 * based platforms.
 	 * Disable SW irqbalance/affinity on those platforms.
 	 */
-	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+	rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
 	if (rev > 0x9)
 		return;
 
 	printk(KERN_INFO "Intel E7520/7320/7525 detected.");
 
-	/* enable access to config space*/
-	pci_read_config_byte(dev, 0xf4, &config);
-	pci_write_config_byte(dev, 0xf4, config|0x2);
+	/* enable access to config space */
+	config = read_pci_config_byte(0, 0, 0, 0xf4);
+	write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
 
 	/* read xTPR register */
-	raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
+	word = read_pci_config_16(0, 0, 0x40, 0x4c);
 
 	if (!(word & (1 << 13))) {
 		printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -37,14 +50,25 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 		noirqdebug_setup("");
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+		printk(KERN_INFO "Disabling cpu hotplug control\n");
+		enable_cpu_hotplug = 0;
+#endif
+#ifdef CONFIG_X86_64
+		/* force the genapic selection to flat mode so that
+		 * interrupts can be redirected to more than one CPU.
+		 */
+		genapic_force = &apic_flat;
 #endif
 	}
 
-	/* put back the original value for config space*/
+	/* put back the original value for config space */
 	if (!(config & 0x2))
-		pci_write_config_byte(dev, 0xf4, config);
+		write_pci_config_byte(0, 0, 0, 0xf4, config);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7320_MCH,	quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7525_MCH,	quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7520_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  verify_quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  verify_quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  verify_quirk_intel_irqbalance);
+
 #endif
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index cd7de9c9654..346f27f4c79 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -58,6 +58,7 @@
 #include <asm/arch_hooks.h>
 #include <asm/nmi.h>
 #include <asm/pda.h>
+#include <asm/genapic.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
@@ -1482,6 +1483,12 @@ int __devinit __cpu_up(unsigned int cpu)
 	cpu_set(cpu, smp_commenced_mask);
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
+
+#ifdef CONFIG_X86_GENERICARCH
+	if (num_online_cpus() > 8 && genapic == &apic_default)
+		panic("Default flat APIC routing can't be used with > 8 cpus\n");
+#endif
+
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index fb0c6da41b7..829698f6d04 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -71,6 +71,18 @@ static void ati_bugs(void)
 {
 }
 
+static void intel_bugs(void)
+{
+	u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
+
+#ifdef CONFIG_SMP
+	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
+		quirk_intel_irqbalance();
+#endif
+}
+
 struct chipset {
 	u16 vendor;
 	void (*f)(void);
@@ -80,6 +92,7 @@ static struct chipset early_qrk[] = {
 	{ PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
 	{ PCI_VENDOR_ID_VIA, via_bugs },
 	{ PCI_VENDOR_ID_ATI, ati_bugs },
+	{ PCI_VENDOR_ID_INTEL, intel_bugs},
 	{}
 };
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 62c2e747af5..4c161c208d5 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -60,6 +60,7 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/numa.h>
+#include <asm/genapic.h>
 
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
@@ -1167,6 +1168,13 @@ int __cpuinit __cpu_up(unsigned int cpu)
 
 	while (!cpu_isset(cpu, cpu_online_map))
 		cpu_relax();
+
+	if (num_online_cpus() > 8 && genapic == &apic_flat) {
+		printk(KERN_WARNING
+		       "flat APIC routing can't be used with > 8 cpus\n");
+		BUG();
+	}
+
 	err = 0;
 
 	return err;
diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h
index 8ffbb0f0745..fd2be593b06 100644
--- a/include/asm-i386/genapic.h
+++ b/include/asm-i386/genapic.h
@@ -122,6 +122,6 @@ struct genapic {
 	APICFUNC(phys_pkg_id) \
 	}
 
-extern struct genapic *genapic;
+extern struct genapic *genapic, apic_default;
 
 #endif
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 9e15ce0006e..11761cdaae1 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -37,6 +37,8 @@ static __inline__ int irq_canonicalize(int irq)
 extern int irqbalance_disable(char *str);
 #endif
 
+extern void quirk_intel_irqbalance(void);
+
 #ifdef CONFIG_HOTPLUG_CPU
 extern void fixup_irqs(cpumask_t map);
 #endif
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 21fa5afa232..6d324b83897 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -87,6 +87,7 @@ extern void syscall32_cpu_init(void);
 extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);
 
 extern void early_quirks(void);
+extern void quirk_intel_irqbalance(void);
 extern void check_efer(void);
 
 extern int unhandled_signal(struct task_struct *tsk, int sig);
-- 
cgit v1.2.3-70-g09d2


From ee58fad51a2a767cb2567706ace967705233d881 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86-64: x86-64 add Intel BTS cpufeature bit and detection
 (take 2)

Here is a small patch for x86-64 which adds a cpufeature flag and
detection code for Intel's Branch Trace Store (BTS) feature. This
feature can be found on Intel P4 and Core 2 processors among others.
It can also be used by perfmon.

changelog:
	- add CPU_FEATURE_BTS
	- add Branch Trace Store detection

signed-off-by: stephane eranian <eranian@hpl.hp.com>

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup.c      | 2 ++
 include/asm-x86_64/cpufeature.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 619af2e2fa2..a570c81c831 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -838,6 +838,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has_ds) {
 		unsigned int l1, l2;
 		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<11)))
+			set_bit(X86_FEATURE_BTS, c->x86_capability);
 		if (!(l1 & (1<<12)))
 			set_bit(X86_FEATURE_PEBS, c->x86_capability);
 	}
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index d280384807e..0b3c686139f 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -69,6 +69,7 @@
 #define X86_FEATURE_UP		(3*32+8) /* SMP kernel running on UP */
 #define X86_FEATURE_ARCH_PERFMON (3*32+9) /* Intel Architectural PerfMon */
 #define X86_FEATURE_PEBS	(3*32+10) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS		(3*32+11) /* Branch Trace Store */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -115,5 +116,6 @@
 #define cpu_has_clflush	       boot_cpu_has(X86_FEATURE_CLFLSH)
 #define cpu_has_ds 	       boot_cpu_has(X86_FEATURE_DS)
 #define cpu_has_pebs 	       boot_cpu_has(X86_FEATURE_PEBS)
+#define cpu_has_bts 	       boot_cpu_has(X86_FEATURE_BTS)
 
 #endif /* __ASM_X8664_CPUFEATURE_H */
-- 
cgit v1.2.3-70-g09d2


From f990fff427d68af3e4e1d16fe799c106abc0bf53 Mon Sep 17 00:00:00 2001
From: Karsten Wiese <fzu@wemgehoertderstaat.de>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86: Regard MSRs in lapic_suspend()/lapic_resume()

Read/Write APIC_LVTPC and APIC_LVTTHMR only,
if get_maxlvt() returns certain values.
This is done like everywhere else in i386/kernel/apic.c,
so I guess its correct.
Suspends/Resumes to disk fine and eleminates an smp_error_interrupt()
here on a K8.

AK: ported to x86-64 too

Signed-off-by: Karsten Wiese <fzu@wemgehoertderstaat.de>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/apic.c   | 22 ++++++++++++++++++----
 arch/x86_64/kernel/apic.c | 22 ++++++++++++++++++----
 2 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d927c..776d9be26af 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
 static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	apic_pm_state.apic_id = apic_read(APIC_ID);
 	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
 	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
 	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
 	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
 	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-	apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+	if (maxlvt >= 4)
+		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
 	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
 	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
 	
 	local_irq_save(flags);
 	disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
 {
 	unsigned int l, h;
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	local_irq_save(flags);
 
 	/*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-	apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-	apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+	if (maxlvt >= 4)
+		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
 	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
 	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
 	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 5c468971e64..f0b00d8731c 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -459,23 +459,30 @@ static struct {
 static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	apic_pm_state.apic_id = apic_read(APIC_ID);
 	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
 	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
 	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
 	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
 	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-	apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+	if (maxlvt >= 4)
+		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
 	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
 	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 5)
+		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
 	local_irq_save(flags);
 	disable_local_APIC();
 	local_irq_restore(flags);
@@ -486,10 +493,13 @@ static int lapic_resume(struct sys_device *dev)
 {
 	unsigned int l, h;
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	local_irq_save(flags);
 	rdmsr(MSR_IA32_APICBASE, l, h);
 	l &= ~MSR_IA32_APICBASE_BASE;
@@ -503,8 +513,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-	apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-	apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 5)
+		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+	if (maxlvt >= 4)
+		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
 	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
 	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
 	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
-- 
cgit v1.2.3-70-g09d2


From 86bd58bf4c383fde4e99b83ce917b091a072040d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86-64: Remove unused GET_APIC_VERSION call from
 clear_local_APIC

Remove unused GET_APIC_VERSION call from clear_local_APIC() and
__setup_APIC_LVTT().

Reported by D Binderman <dcb314@hotmail.com>.

Cc: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/apic.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f0b00d8731c..124b2d27b4a 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -140,7 +140,6 @@ void clear_local_APIC(void)
 		apic_write(APIC_LVTERR, APIC_LVT_MASKED);
 	if (maxlvt >= 4)
 		apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-	v = GET_APIC_VERSION(apic_read(APIC_LVR));
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 }
@@ -736,10 +735,9 @@ void __init init_apic_mappings(void)
 
 static void __setup_APIC_LVTT(unsigned int clocks)
 {
-	unsigned int lvtt_value, tmp_value, ver;
+	unsigned int lvtt_value, tmp_value;
 	int cpu = smp_processor_id();
 
-	ver = GET_APIC_VERSION(apic_read(APIC_LVR));
 	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
 
 	if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
-- 
cgit v1.2.3-70-g09d2


From 0741f4d207a644482d7a040f05cd264c98cf7ee8 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 7 Dec 2006 02:14:11 +0100
Subject: [PATCH] x86: add sysctl for kstack_depth_to_print

Add sysctl for kstack_depth_to_print. This lets users change
the amount of raw stack data printed in dump_stack() without
having to reboot.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 Documentation/sysctl/kernel.txt | 8 ++++++++
 arch/i386/kernel/traps.c        | 2 +-
 arch/x86_64/kernel/traps.c      | 2 +-
 include/asm-x86_64/stacktrace.h | 2 ++
 kernel/sysctl.c                 | 9 +++++++++
 5 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 0bc7f1e3c9e..5922e84d913 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -27,6 +27,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/kmod.txt
 - msgmax
@@ -170,6 +171,13 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kstack_depth_to_print: (X86 only)
+
+Controls the number of words to print when dumping the raw
+kernel stack.
+
+==============================================================
+
 osrelease, ostype & version:
 
 # cat osrelease
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7b2f9f02208..1d48a75fa33 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -91,7 +91,7 @@ asmlinkage void alignment_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
-static int kstack_depth_to_print = 24;
+int kstack_depth_to_print = 24;
 #ifdef CONFIG_STACK_UNWIND
 static int call_trace = 1;
 #else
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 264db33476a..75ceccee178 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -108,7 +108,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	preempt_enable_no_resched();
 }
 
-static int kstack_depth_to_print = 12;
+int kstack_depth_to_print = 12;
 #ifdef CONFIG_STACK_UNWIND
 static int call_trace = 1;
 #else
diff --git a/include/asm-x86_64/stacktrace.h b/include/asm-x86_64/stacktrace.h
index 5eb9799bef7..6f0b5459430 100644
--- a/include/asm-x86_64/stacktrace.h
+++ b/include/asm-x86_64/stacktrace.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_STACKTRACE_H
 #define _ASM_STACKTRACE_H 1
 
+extern int kstack_depth_to_print;
+
 /* Generic stack tracer with callbacks */
 
 struct stacktrace_ops {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09e569f4792..6fc5e17086f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
 
 #ifdef CONFIG_X86
 #include <asm/nmi.h>
+#include <asm/stacktrace.h>
 #endif
 
 #if defined(CONFIG_SYSCTL)
@@ -707,6 +708,14 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "kstack_depth_to_print",
+		.data		= &kstack_depth_to_print,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 #if defined(CONFIG_MMU)
 	{
-- 
cgit v1.2.3-70-g09d2


From 3df0af0eb064a16bbdbe81b46bc72a4089f88d54 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86_64: clear_bss before set_intr_gate with early_idt_handler

idt_table is in the .bss section, so clear_bss need to called at first

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/head64.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 9561eb3c5b5..cc230b93cd1 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -57,10 +57,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
 {
 	int i;
 
-	for (i = 0; i < 256; i++)
+	/* clear bss before set_intr_gate with early_idt_handler */
+	clear_bss();
+
+	for (i = 0; i < IDT_ENTRIES; i++)
 		set_intr_gate(i, early_idt_handler);
 	asm volatile("lidt %0" :: "m" (idt_descr));
-	clear_bss();
 
 	early_printk("Kernel alive\n");
 
-- 
cgit v1.2.3-70-g09d2


From 8fb6e5f5db860113e71ce7b854382ed40559395b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@amd.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86_64: interrupt array size should be aligned to NR_VECTORS

interrupt array is referred for idt vectors instead of NR_IRQS, so change size
to NR_VECTORS - FIRST_EXTERNAL_VECTOR. Also change to static.

Signed-off-by: Yinghai Lu <yinghai@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/i8259.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index c4ef801b765..d73c79e821f 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -76,7 +76,8 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
 	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
 	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
 
-void (*interrupt[NR_IRQS])(void) = {
+/* for the irq vectors */
+static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
 					  IRQLIST_16(0x2), IRQLIST_16(0x3),
 	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
 	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-- 
cgit v1.2.3-70-g09d2


From d4c45718b3c0d3045eab803cd15fabbed1ea5bcd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: Fix kobject_init() WARN_ON on resume from disk

Make mce_remove_device() clean up the kobject in per_cpu(device_mce, cpu)
after it has been unregistered.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/mce.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bbea88801d8..a7440cb9c14 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -652,6 +652,7 @@ static void mce_remove_device(unsigned int cpu)
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
+	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-- 
cgit v1.2.3-70-g09d2


From 73ad8355d7db6a3cdcf313d4a4586a8f81b19c2f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: remove unused acpi_found_madt in mparse.

remove unused acpi_found_madt in mparse.c

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/mpparse.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index b147ab19fbd..08072568847 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -35,8 +35,6 @@
 int smp_found_config;
 unsigned int __initdata maxcpus = NR_CPUS;
 
-int acpi_found_madt;
-
 /*
  * Various Linux-internal data structures created from the
  * MP-table.
-- 
cgit v1.2.3-70-g09d2


From 616779656989cb8c59177e35cb13e87028b1edc8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: Synchronize RDTSC on single core AMD

There is no guarantee that two RDTSCs in a row are monotonic,
so don't assume it on single core AMD systems.
This will make gettimeofday slower again
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a570c81c831..05eaca41802 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -732,11 +732,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	/* Fix cpuid4 emulation for more */
 	num_cache_leaves = 3;
 
-	/* When there is only one core no need to synchronize RDTSC */
-	if (num_possible_cpus() == 1)
-	        set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-	else
-	        clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	/* RDTSC can be speculated around */
+	clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
 }
 
 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-- 
cgit v1.2.3-70-g09d2


From e496a0da7f8110054d0ad4039245b0f49f9540f5 Mon Sep 17 00:00:00 2001
From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] Calgary: remove unused variables

Spotted by d binderman <dcb314@hotmail.com>.

Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/pci-calgary.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 0ddf29dae7e..3215675ab12 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -653,14 +653,9 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
 static void __init calgary_reserve_regions(struct pci_dev *dev)
 {
 	unsigned int npages;
-	void __iomem *bbar;
-	unsigned char busnum;
 	u64 start;
 	struct iommu_table *tbl = dev->sysdata;
 
-	bbar = tbl->bbar;
-	busnum = dev->bus->number;
-
 	/* reserve bad_dma_address in case it's a legal address */
 	iommu_range_reserve(tbl, bad_dma_address, 1);
 
-- 
cgit v1.2.3-70-g09d2


From f3d73707a1e84f0687a05144b70b660441e999c7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] x86-64: Mark rdtsc as sync only for netburst, not for core2

On the Core2 cpus, the rdtsc instruction is not serializing (as defined
in the architecture reference since rdtsc exists) and due to the deep
speculation of these cores, it's possible that you can observe time go
backwards between cores due to this speculation. Since the kernel
already deals with this with the SYNC_RDTSC flag, the solution is
simple, only assume that the instruction is serializing on family 15...

The price one pays for this is a slightly slower gettimeofday (by a
dozen or two cycles), but that increase is quite small to pay for a
really-going-forward tsc counter.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 05eaca41802..6595a4ebe7f 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -860,7 +860,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
 	if (c->x86 == 6)
 		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-	set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	if (c->x86 == 15)
+		set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	else
+		clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
  	c->x86_max_cores = intel_num_cpu_cores(c);
 
 	srat_detect_node();
-- 
cgit v1.2.3-70-g09d2


From 446f713ba1afd68568139ae4691335ba273fa7f4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:12 +0100
Subject: [PATCH] unwinder: always use unlocked module list access in unwinder
 fallback

We're already well protected against module unloads because module
unload uses stop_machine(). The only exception is NMIs, but other
users already risk lockless accesses here.

This avoids some hackery in lockdep and also a potential deadlock

This matches what i386 does.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/traps.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 75ceccee178..9864d195c40 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -317,9 +317,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 #define HANDLE_STACK(cond) \
 	do while (cond) { \
 		unsigned long addr = *stack++; \
-		if (oops_in_progress ? 		\
-			__kernel_text_address(addr) : \
-			kernel_text_address(addr)) { \
+		/* Use unlocked access here because except for NMIs	\
+		   we should be already protected against module unloads */ \
+		if (__kernel_text_address(addr)) { \
 			/* \
 			 * If the address is either in the text segment of the \
 			 * kernel, or in the region which contains vmalloc'ed \
-- 
cgit v1.2.3-70-g09d2


From 359ad0d4015a9ab39243f2ebc4eb07915bd618b2 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] unwinder: more sanity checks in Dwarf2 unwinder

Tighten the requirements on both input to and output from the Dwarf2
unwinder.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/traps.c    |  7 +++++++
 arch/x86_64/kernel/traps.c  |  7 +++++++
 include/asm-i386/unwind.h   | 12 ++++--------
 include/asm-x86_64/unwind.h |  8 ++------
 kernel/unwind.c             | 16 +++++++++++++++-
 5 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 86d8476be4f..c447807e2a4 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -161,12 +161,19 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
 {
 	struct ops_and_data *oad = (struct ops_and_data *)data;
 	int n = 0;
+	unsigned long sp = UNW_SP(info);
 
+	if (arch_unw_user_mode(info))
+		return -1;
 	while (unwind(info) == 0 && UNW_PC(info)) {
 		n++;
 		oad->ops->address(oad->data, UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
+		if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+		    && sp > UNW_SP(info))
+			break;
+		sp = UNW_SP(info);
 	}
 	return n;
 }
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 9864d195c40..4fdd162f0be 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -225,12 +225,19 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
 {
 	struct ops_and_data *oad = (struct ops_and_data *)context;
 	int n = 0;
+	unsigned long sp = UNW_SP(info);
 
+	if (arch_unw_user_mode(info))
+		return -1;
 	while (unwind(info) == 0 && UNW_PC(info)) {
 		n++;
 		oad->ops->address(oad->data, UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
+		if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+		    && sp > UNW_SP(info))
+			break;
+		sp = UNW_SP(info);
 	}
 	return n;
 }
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index 601fc67bd77..aa2c931e30d 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -79,17 +79,13 @@ extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *,
                                                                           void *arg),
                                                void *arg);
 
-static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
+static inline int arch_unw_user_mode(/*const*/ struct unwind_frame_info *info)
 {
-#if 0 /* This can only work when selector register and EFLAGS saves/restores
-         are properly annotated (and tracked in UNW_REGISTER_INFO). */
-	return user_mode_vm(&info->regs);
-#else
-	return info->regs.eip < PAGE_OFFSET
+	return user_mode_vm(&info->regs)
+	       || info->regs.eip < PAGE_OFFSET
 	       || (info->regs.eip >= __fix_to_virt(FIX_VDSO)
-	            && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
+	           && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
 	       || info->regs.esp < PAGE_OFFSET;
-#endif
 }
 
 #else
diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h
index 2e7ff10fd77..2f6349e4871 100644
--- a/include/asm-x86_64/unwind.h
+++ b/include/asm-x86_64/unwind.h
@@ -87,14 +87,10 @@ extern int arch_unwind_init_running(struct unwind_frame_info *,
 
 static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
 {
-#if 0 /* This can only work when selector register saves/restores
-         are properly annotated (and tracked in UNW_REGISTER_INFO). */
-	return user_mode(&info->regs);
-#else
-	return (long)info->regs.rip >= 0
+	return user_mode(&info->regs)
+	       || (long)info->regs.rip >= 0
 	       || (info->regs.rip >= VSYSCALL_START && info->regs.rip < VSYSCALL_END)
 	       || (long)info->regs.rsp >= 0;
-#endif
 }
 
 #else
diff --git a/kernel/unwind.c b/kernel/unwind.c
index af48168a3af..7e721f10410 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -95,6 +95,7 @@ static const struct {
 
 typedef unsigned long uleb128_t;
 typedef   signed long sleb128_t;
+#define sleb128abs __builtin_labs
 
 static struct unwind_table {
 	struct {
@@ -787,7 +788,7 @@ int unwind(struct unwind_frame_info *frame)
 #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
 	const u32 *fde = NULL, *cie = NULL;
 	const u8 *ptr = NULL, *end = NULL;
-	unsigned long pc = UNW_PC(frame) - frame->call_frame;
+	unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
 	unsigned long startLoc = 0, endLoc = 0, cfa;
 	unsigned i;
 	signed ptrType = -1;
@@ -936,6 +937,9 @@ int unwind(struct unwind_frame_info *frame)
 		state.dataAlign = get_sleb128(&ptr, end);
 		if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
 			cie = NULL;
+		else if (UNW_PC(frame) % state.codeAlign
+		         || UNW_SP(frame) % sleb128abs(state.dataAlign))
+			return -EPERM;
 		else {
 			retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
 			/* skip augmentation */
@@ -968,6 +972,8 @@ int unwind(struct unwind_frame_info *frame)
 #ifdef CONFIG_FRAME_POINTER
 		unsigned long top, bottom;
 
+		if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
+			return -EPERM;
 		top = STACK_TOP(frame->task);
 		bottom = STACK_BOTTOM(frame->task);
 # if FRAME_RETADDR_OFFSET < 0
@@ -1018,6 +1024,7 @@ int unwind(struct unwind_frame_info *frame)
 	   || state.regs[retAddrReg].where == Nowhere
 	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
 	   || reg_info[state.cfa.reg].width != sizeof(unsigned long)
+	   || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
 	   || state.cfa.offs % sizeof(unsigned long))
 		return -EIO;
 	/* update frame */
@@ -1038,6 +1045,8 @@ int unwind(struct unwind_frame_info *frame)
 #else
 # define CASES CASE(8); CASE(16); CASE(32); CASE(64)
 #endif
+	pc = UNW_PC(frame);
+	sp = UNW_SP(frame);
 	for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
 		if (REG_INVALID(i)) {
 			if (state.regs[i].where == Nowhere)
@@ -1118,6 +1127,11 @@ int unwind(struct unwind_frame_info *frame)
 		}
 	}
 
+	if (UNW_PC(frame) % state.codeAlign
+	    || UNW_SP(frame) % sleb128abs(state.dataAlign)
+	    || (pc == UNW_PC(frame) && sp == UNW_SP(frame)))
+		return -EIO;
+
 	return 0;
 #undef CASES
 #undef FRAME_REG
-- 
cgit v1.2.3-70-g09d2


From a0429d0d7a6116dedcb71d9128da904bf135f189 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Remove unwind stack pointer alignment forcing again

This was added as a workaround for the fallback unwinder not supporting
unaligned stack pointers properly. But now it was fixed to do that,
so it's not needed anymore

Cc: mingo@elte.hu
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/traps.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 4fdd162f0be..a1641ffdffc 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -309,12 +309,6 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		if (tsk && tsk != current)
 			stack = (unsigned long *)tsk->thread.rsp;
 	}
-	/*
-	 * Align the stack pointer on word boundary, later loops
-	 * rely on that (and corruption / debug info bugs can cause
-	 * unaligned values here):
-	 */
-	stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
 
 	/*
 	 * Print function call entries within a stack. 'cond' is the
-- 
cgit v1.2.3-70-g09d2


From d331e739f5ad2aaa9d8553891ba6ca823bdbce37 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Fix interrupt race in idle callback (3rd try)

Idle callbacks has some races when enter_idle() sets isidle and subsequent
interrupts that can happen on that CPU, before CPU goes to idle. Due to this,
an IDLE_END can get called before IDLE_START. To avoid these races, disable
interrupts before enter_idle and make sure that all idle routines do not
enable interrupts before entering idle.

Note that poll_idle() still has a this race as it has to enable interrupts
before going to idle. But, all other idle routines have the race fixed.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/process.c   | 19 +++++++++++++++++--
 include/asm-x86_64/processor.h |  8 ++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 0b7b4caa4f7..a418ee4c8c6 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -127,6 +127,7 @@ static void default_idle(void)
  */
 static void poll_idle (void)
 {
+	local_irq_enable();
 	cpu_relax();
 }
 
@@ -208,6 +209,12 @@ void cpu_idle (void)
 				idle = default_idle;
 			if (cpu_is_offline(smp_processor_id()))
 				play_dead();
+			/*
+			 * Idle routines should keep interrupts disabled
+			 * from here on, until they go to idle.
+			 * Otherwise, idle callbacks can misfire.
+			 */
+			local_irq_disable();
 			enter_idle();
 			idle();
 			/* In many cases the interrupt that ended idle
@@ -245,8 +252,16 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
-	local_irq_enable();
-	mwait_idle_with_hints(0,0);
+	if (!need_resched()) {
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__sti_mwait(0, 0);
+		else
+			local_irq_enable();
+	} else {
+		local_irq_enable();
+	}
 }
 
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index cef17e0f828..76552d72804 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -475,6 +475,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 		: :"a" (eax), "c" (ecx));
 }
 
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax,%ecx;" */
+	asm volatile(
+		"sti; .byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
 
 #define stack_current() \
-- 
cgit v1.2.3-70-g09d2


From 3807fd46e94ab9f09e5ee3bff5e6515a94e9b3c7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:13 +0100
Subject: [PATCH] x86-64: Clarify error message in GART code

- Remove "Disabling IOMMU" message because it confuses people
- Clarify that the GART IOMMU is refered to in other message

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/pci-gart.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 16261a8a330..fc1960f1f24 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -601,10 +601,9 @@ void __init gart_iommu_init(void)
 	    (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
 	    !iommu_aperture ||
 	    (no_agp && init_k8_gatt(&info) < 0)) {
-		printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
 		if (end_pfn > MAX_DMA32_PFN) {
 			printk(KERN_ERR "WARNING more than 4GB of memory "
-					"but IOMMU not available.\n"
+					"but GART IOMMU not available.\n"
 			       KERN_ERR "WARNING 32bit PCI may malfunction.\n");
 		}
 		return;
-- 
cgit v1.2.3-70-g09d2


From ad892f5e0d01f3c3b475a688d1ddc211cf3ea56d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@amd.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] x86-64: check vector in setup_ioapic_dest to verify if need
 setup_IO_APIC_irq

setup_IO_APIC_irqs could fail to get vector for some device when you have too
many devices, because at that time only boot cpu is online.  So check vector
for irq in setup_ioapic_dest and call setup_IO_APIC_irq to make sure IO-APIC
irq-routing table is initialized.

Also seperate setup_IO_APIC_irq from setup_IO_APIC_irqs.

Signed-off-by: Yinghai Lu <yinghai.lu@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 arch/x86_64/kernel/io_apic.c | 104 +++++++++++++++++++++++++------------------
 1 file changed, 61 insertions(+), 43 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index eaf0b708e67..2a1dcd5f69c 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -789,27 +789,65 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 					      handle_edge_irq, "edge");
 	}
 }
-
-static void __init setup_IO_APIC_irqs(void)
+static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
 {
 	struct IO_APIC_route_entry entry;
-	int apic, pin, idx, irq, first_notcon = 1, vector;
+	int vector;
 	unsigned long flags;
 
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-	for (apic = 0; apic < nr_ioapics; apic++) {
-	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+	/*
+	 * add it to the IO-APIC irq-routing table:
+	 */
+	memset(&entry,0,sizeof(entry));
 
-		/*
-		 * add it to the IO-APIC irq-routing table:
-		 */
-		memset(&entry,0,sizeof(entry));
+	entry.delivery_mode = INT_DELIVERY_MODE;
+	entry.dest_mode = INT_DEST_MODE;
+	entry.mask = 0;				/* enable IRQ */
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+	entry.trigger = irq_trigger(idx);
+	entry.polarity = irq_polarity(idx);
 
-		entry.delivery_mode = INT_DELIVERY_MODE;
-		entry.dest_mode = INT_DEST_MODE;
-		entry.mask = 0;				/* enable IRQ */
+	if (irq_trigger(idx)) {
+		entry.trigger = 1;
+		entry.mask = 1;
 		entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	}
+
+	if (!apic && !IO_APIC_IRQ(irq))
+		return;
+
+	if (IO_APIC_IRQ(irq)) {
+		cpumask_t mask;
+		vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
+		if (vector < 0)
+			return;
+
+		entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+		entry.vector = vector;
+
+		ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+		if (!apic && (irq < 16))
+			disable_8259A_irq(irq);
+	}
+
+	ioapic_write_entry(apic, pin, entry);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	set_native_irq_info(irq, TARGET_CPUS);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+	int apic, pin, idx, irq, first_notcon = 1;
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
@@ -821,39 +859,11 @@ static void __init setup_IO_APIC_irqs(void)
 			continue;
 		}
 
-		entry.trigger = irq_trigger(idx);
-		entry.polarity = irq_polarity(idx);
-
-		if (irq_trigger(idx)) {
-			entry.trigger = 1;
-			entry.mask = 1;
-			entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
-		}
-
 		irq = pin_2_irq(idx, apic, pin);
 		add_pin_to_irq(irq, apic, pin);
 
-		if (!apic && !IO_APIC_IRQ(irq))
-			continue;
+		setup_IO_APIC_irq(apic, pin, idx, irq);
 
-		if (IO_APIC_IRQ(irq)) {
-			cpumask_t mask;
-			vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
-			if (vector < 0)
-				continue;
-
-			entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
-			entry.vector = vector;
-
-			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-			if (!apic && (irq < 16))
-				disable_8259A_irq(irq);
-		}
-		ioapic_write_entry(apic, pin, entry);
-
-		spin_lock_irqsave(&ioapic_lock, flags);
-		set_native_irq_info(irq, TARGET_CPUS);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
 
@@ -2139,7 +2149,15 @@ void __init setup_ioapic_dest(void)
 			if (irq_entry == -1)
 				continue;
 			irq = pin_2_irq(irq_entry, ioapic, pin);
-			set_ioapic_affinity_irq(irq, TARGET_CPUS);
+
+			/* setup_IO_APIC_irqs could fail to get vector for some device
+			 * when you have too many devices, because at that time only boot
+			 * cpu is online.
+			 */
+			if(!irq_vector[irq])
+				setup_IO_APIC_irq(ioapic, pin, irq_entry, irq);
+			else
+				set_ioapic_affinity_irq(irq, TARGET_CPUS);
 		}
 
 	}
-- 
cgit v1.2.3-70-g09d2


From b65780e123ba9b762276482bbfb52836e4d41fd9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] unwinder: move .eh_frame to RODATA

The .eh_frame section contents is never written to, so it can as well
benefit from CONFIG_DEBUG_RODATA.

Diff-ed against firstfloor tree.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/vmlinux.lds.S    |  9 ---------
 arch/x86_64/kernel/vmlinux.lds.S  |  9 ---------
 include/asm-generic/vmlinux.lds.h | 17 ++++++++++++-----
 kernel/unwind.c                   |  2 +-
 4 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 25581e87c60..56e6ad5cb04 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -102,15 +102,6 @@ SECTIONS
 	_edata = .;		/* End of data section */
   }
 
-#ifdef CONFIG_STACK_UNWIND
-  . = ALIGN(4);
-  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
-	__start_unwind = .;
-  	*(.eh_frame)
-	__end_unwind = .;
-  }
-#endif
-
   . = ALIGN(THREAD_SIZE);	/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 	*(.data.init_task)
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index d9534e750d4..6a1f8f491e5 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -51,15 +51,6 @@ SECTIONS
 
   RODATA
 
-#ifdef CONFIG_STACK_UNWIND
-  . = ALIGN(8);
-  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
-	__start_unwind = .;
-  	*(.eh_frame)
-	__end_unwind = .;
-  }
-#endif
-
   . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 9f4747780da..4d4c62d1105 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -119,8 +119,7 @@
 		*(__ksymtab_strings)					\
 	}								\
 									\
-	/* Unwind data binary search table */				\
-	EH_FRAME_HDR							\
+	EH_FRAME							\
 									\
 	/* Built-in module parameters. */				\
 	__param : AT(ADDR(__param) - LOAD_OFFSET) {			\
@@ -162,15 +161,23 @@
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
 #ifdef CONFIG_STACK_UNWIND
-		/* Unwind data binary search table */
-#define EH_FRAME_HDR							\
+#define EH_FRAME							\
+		/* Unwind data binary search table */			\
+		. = ALIGN(8);						\
         	.eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - LOAD_OFFSET) {	\
 			VMLINUX_SYMBOL(__start_unwind_hdr) = .;		\
 			*(.eh_frame_hdr)				\
 			VMLINUX_SYMBOL(__end_unwind_hdr) = .;		\
+		}							\
+		/* Unwind data */					\
+		. = ALIGN(8);						\
+		.eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {		\
+			VMLINUX_SYMBOL(__start_unwind) = .;		\
+		  	*(.eh_frame)					\
+			VMLINUX_SYMBOL(__end_unwind) = .;		\
 		}
 #else
-#define EH_FRAME_HDR
+#define EH_FRAME
 #endif
 
 		/* DWARF debug sections.
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 08645aa7c2d..09c26132924 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -19,7 +19,7 @@
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 
-extern char __start_unwind[], __end_unwind[];
+extern const char __start_unwind[], __end_unwind[];
 extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
 
 #define MAX_STACK_DEPTH 8
-- 
cgit v1.2.3-70-g09d2


From 64a26a731235b59c9d73bbe82c1f896d57400d37 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 7 Dec 2006 02:14:19 +0100
Subject: [PATCH] x86-64: Export smp_call_function_single

smp_call_function() is exported, makes sense to export this one too.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/smp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9f74c883568..1bf0e094f43 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -385,6 +385,7 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 	put_cpu();
 	return 0;
 }
+EXPORT_SYMBOL(smp_call_function_single);
 
 /*
  * this function sends a 'generic call function' IPI to all other CPUs
-- 
cgit v1.2.3-70-g09d2


From f5738ceed46782aea7663d62cb6398eb05fc4ce0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 6 Dec 2006 20:37:29 -0800
Subject: [PATCH] remove kernel syscalls

The last thing we agreed on was to remove the macros entirely for 2.6.19,
on all architectures. Unfortunately, I think nobody actually _did_ that,
so they are still there.

[akpm@osdl.org: x86_64 fix]
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Schafer <gschafer@zip.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/vsyscall.c  |   1 +
 include/asm-alpha/unistd.h     | 182 ----------------------------
 include/asm-arm/unistd.h       | 150 -----------------------
 include/asm-arm26/unistd.h     | 133 ---------------------
 include/asm-frv/unistd.h       | 119 -------------------
 include/asm-h8300/unistd.h     | 166 --------------------------
 include/asm-i386/unistd.h      |  98 ---------------
 include/asm-m32r/unistd.h      | 111 -----------------
 include/asm-m68k/unistd.h      |  97 ---------------
 include/asm-m68knommu/unistd.h | 150 -----------------------
 include/asm-mips/unistd.h      | 262 -----------------------------------------
 include/asm-powerpc/unistd.h   | 109 -----------------
 include/asm-s390/unistd.h      | 154 ------------------------
 include/asm-sh/unistd.h        | 137 ---------------------
 include/asm-sh64/unistd.h      | 142 ----------------------
 include/asm-sparc/unistd.h     | 130 --------------------
 include/asm-sparc64/unistd.h   | 118 -------------------
 include/asm-v850/unistd.h      | 160 -------------------------
 include/asm-x86_64/unistd.h    |  99 ----------------
 include/asm-xtensa/unistd.h    | 184 -----------------------------
 20 files changed, 1 insertion(+), 2701 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 92546c1526f..630036c06c7 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -42,6 +42,7 @@
 #include <asm/topology.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __syscall_clobber "r11","rcx","memory"
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
diff --git a/include/asm-alpha/unistd.h b/include/asm-alpha/unistd.h
index 2cabbd465c0..84313d14e78 100644
--- a/include/asm-alpha/unistd.h
+++ b/include/asm-alpha/unistd.h
@@ -387,188 +387,6 @@
 
 #define NR_SYSCALLS			447
 
-#if defined(__GNUC__)
-
-#define _syscall_return(type)						\
-	return (_sc_err ? errno = _sc_ret, _sc_ret = -1L : 0), (type) _sc_ret
-
-#define _syscall_clobbers						\
-	"$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8",			\
-	"$22", "$23", "$24", "$25", "$27", "$28" 			\
-
-#define _syscall0(type, name)						\
-type name(void)								\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		__asm__("callsys # %0 %1 %2"				\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0)					\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall1(type,name,type1,arg1)					\
-type name(type1 arg1)							\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		__asm__("callsys # %0 %1 %2 %3"				\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16)			\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1,type2 arg2)					\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		__asm__("callsys # %0 %1 %2 %3 %4"			\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17)		\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1,type2 arg2,type3 arg3)				\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5"			\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18)					\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4)		 \
-{									 \
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		_sc_19 = (long) (arg4);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5 %6"		\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18), "1"(_sc_19)			\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5)							 \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5)	\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-		register long _sc_20 __asm__("$20");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		_sc_19 = (long) (arg4);					\
-		_sc_20 = (long) (arg5);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5 %6 %7"		\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18), "1"(_sc_19), "r"(_sc_20)		\
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5,type6,arg6)					 \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5, type6 arg6)\
-{									\
-	long _sc_ret, _sc_err;						\
-	{								\
-		register long _sc_0 __asm__("$0");			\
-		register long _sc_16 __asm__("$16");			\
-		register long _sc_17 __asm__("$17");			\
-		register long _sc_18 __asm__("$18");			\
-		register long _sc_19 __asm__("$19");			\
-		register long _sc_20 __asm__("$20");			\
-		register long _sc_21 __asm__("$21");			\
-									\
-		_sc_0 = __NR_##name;					\
-		_sc_16 = (long) (arg1);					\
-		_sc_17 = (long) (arg2);					\
-		_sc_18 = (long) (arg3);					\
-		_sc_19 = (long) (arg4);					\
-		_sc_20 = (long) (arg5);					\
-		_sc_21 = (long) (arg6);					\
-		__asm__("callsys # %0 %1 %2 %3 %4 %5 %6 %7 %8"		\
-			: "=r"(_sc_0), "=r"(_sc_19)			\
-			: "0"(_sc_0), "r"(_sc_16), "r"(_sc_17),		\
-			  "r"(_sc_18), "1"(_sc_19), "r"(_sc_20), "r"(_sc_21) \
-			: _syscall_clobbers);				\
-		_sc_ret = _sc_0, _sc_err = _sc_19;			\
-	}								\
-	_syscall_return(type);						\
-}
-
-#endif /* __GNUC__ */
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-arm/unistd.h b/include/asm-arm/unistd.h
index 14a87eec5a2..d44c629d842 100644
--- a/include/asm-arm/unistd.h
+++ b/include/asm-arm/unistd.h
@@ -377,156 +377,6 @@
 #endif
 
 #ifdef __KERNEL__
-#include <linux/err.h>
-#include <linux/linkage.h>
-
-#define __sys2(x) #x
-#define __sys1(x) __sys2(x)
-
-#ifndef __syscall
-#if defined(__thumb__) || defined(__ARM_EABI__)
-#define __SYS_REG(name) register long __sysreg __asm__("r7") = __NR_##name;
-#define __SYS_REG_LIST(regs...) "r" (__sysreg) , ##regs
-#define __syscall(name) "swi\t0"
-#else
-#define __SYS_REG(name)
-#define __SYS_REG_LIST(regs...) regs
-#define __syscall(name) "swi\t" __sys1(__NR_##name) ""
-#endif
-#endif
-
-#define __syscall_return(type, res)					\
-do {									\
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) {	\
-		errno = -(res);						\
-		res = -1;						\
-	}								\
-	return (type) (res);						\
-} while (0)
-
-#define _syscall0(type,name)						\
-type name(void) {							\
-  __SYS_REG(name)							\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST()						\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall1(type,name,type1,arg1) 				\
-type name(type1 arg1) { 						\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0) )					\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1,type2 arg2) {					\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1) )			\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1,type2 arg2,type3 arg3) {				\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2) )		\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {		\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2), "r" (__r3) ) \
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-  
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) {	\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2),		\
-			  "r" (__r3), "r" (__r4) )			\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) {	\
-  __SYS_REG(name)							\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __r5 __asm__("r5") = (long)arg6;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2),		\
-			  "r" (__r3), "r" (__r4), "r" (__r5) )		\
-	: "memory" );							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-arm26/unistd.h b/include/asm-arm26/unistd.h
index 25a5eead85b..4c3b919177e 100644
--- a/include/asm-arm26/unistd.h
+++ b/include/asm-arm26/unistd.h
@@ -311,139 +311,6 @@
 #define __ARM_NR_usr26			(__ARM_NR_BASE+3)
 
 #ifdef __KERNEL__
-#include <linux/err.h>
-#include <linux/linkage.h>
-
-#define __sys2(x) #x
-#define __sys1(x) __sys2(x)
-
-#ifndef __syscall
-#define __syscall(name) "swi\t" __sys1(__NR_##name) ""
-#endif
-
-#define __syscall_return(type, res)					\
-do {									\
-	if ((unsigned long)(res) >= (unsigned long)-MAX_ERRNO) {	\
-		errno = -(res);						\
-		res = -1;						\
-	}								\
-	return (type) (res);						\
-} while (0)
-
-#define _syscall0(type,name)						\
-type name(void) {							\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	:								\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall1(type,name,type1,arg1) 				\
-type name(type1 arg1) { 						\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0)							\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1,type2 arg2) {					\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1) 					\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1,type2 arg2,type3 arg3) {				\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2)				\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {		\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2),"r" (__r3)			\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-  
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) {	\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2),"r" (__r3),"r" (__r4)	\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6)	\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) {	\
-  register long __r0 __asm__("r0") = (long)arg1;			\
-  register long __r1 __asm__("r1") = (long)arg2;			\
-  register long __r2 __asm__("r2") = (long)arg3;			\
-  register long __r3 __asm__("r3") = (long)arg4;			\
-  register long __r4 __asm__("r4") = (long)arg5;			\
-  register long __r5 __asm__("r5") = (long)arg6;			\
-  register long __res_r0 __asm__("r0");					\
-  long __res;								\
-  __asm__ __volatile__ (						\
-  __syscall(name)							\
-	: "=r" (__res_r0)						\
-	: "r" (__r0),"r" (__r1),"r" (__r2),"r" (__r3), "r" (__r4),"r" (__r5)		\
-	: "lr");							\
-  __res = __res_r0;							\
-  __syscall_return(type,__res);						\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-frv/unistd.h b/include/asm-frv/unistd.h
index 725e854928c..584c0417ae4 100644
--- a/include/asm-frv/unistd.h
+++ b/include/asm-frv/unistd.h
@@ -320,125 +320,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 310
-#include <linux/err.h>
-
-/*
- * process the return value of a syscall, consigning it to one of two possible fates
- * - user-visible error numbers are in the range -1 - -4095: see <asm-frv/errno.h>
- */
-#undef __syscall_return
-#define __syscall_return(type, res)					\
-do {									\
-        unsigned long __sr2 = (res);					\
-	if (__builtin_expect(__sr2 >= (unsigned long)(-MAX_ERRNO), 0)) { \
-		errno = (-__sr2);					\
-		__sr2 = ~0UL;						\
-	}								\
-	return (type) __sr2;						\
-} while (0)
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-
-#undef _syscall0
-#define _syscall0(type,name)						\
-type name(void)								\
-{									\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);	\
-	register unsigned long __sc0 __asm__ ("gr8");			\
-	__asm__ __volatile__ ("tira gr0,#0"				\
-			      : "=r" (__sc0)				\
-			      : "r" (__scnum));				\
-	__syscall_return(type, __sc0);					\
-}
-
-#undef _syscall1
-#define _syscall1(type,name,type1,arg1)						\
-type name(type1 arg1)								\
-{										\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);		\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;	\
-	__asm__ __volatile__ ("tira gr0,#0"					\
-			      : "+r" (__sc0)					\
-			      : "r" (__scnum));					\
-	__syscall_return(type, __sc0);						\
-}
-
-#undef _syscall2
-#define _syscall2(type,name,type1,arg1,type2,arg2)				\
-type name(type1 arg1,type2 arg2)						\
-{										\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);		\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;	\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;	\
-	__asm__ __volatile__ ("tira gr0,#0"					\
-			      : "+r" (__sc0)					\
-			      : "r" (__scnum), "r" (__sc1));			\
-	__syscall_return(type, __sc0);						\
-}
-
-#undef _syscall3
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)			\
-type name(type1 arg1,type2 arg2,type3 arg3)					\
-{										\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);		\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;	\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;	\
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;	\
-	__asm__ __volatile__ ("tira gr0,#0"					\
-			      : "+r" (__sc0)					\
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2));	\
-	__syscall_return(type, __sc0);						\
-}
-
-#undef _syscall4
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)		\
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4)				\
-{											\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);			\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;		\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;		\
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;		\
-	register unsigned long __sc3 __asm__ ("gr11") = (unsigned long) arg4;		\
-	__asm__ __volatile__ ("tira gr0,#0"						\
-			      : "+r" (__sc0)						\
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2), "r" (__sc3));	\
-	__syscall_return(type, __sc0);							\
-}
-
-#undef _syscall5
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5)	\
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5)			\
-{											\
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);			\
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;		\
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;		\
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;		\
-	register unsigned long __sc3 __asm__ ("gr11") = (unsigned long) arg4;		\
-	register unsigned long __sc4 __asm__ ("gr12") = (unsigned long) arg5;		\
-	__asm__ __volatile__ ("tira gr0,#0"						\
-			      : "+r" (__sc0)						\
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2),		\
-			      "r" (__sc3), "r" (__sc4));				\
-	__syscall_return(type, __sc0);							\
-}
-
-#undef _syscall6
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5, type6, arg6) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6)		 \
-{												 \
-	register unsigned long __scnum __asm__ ("gr7") = (__NR_##name);				 \
-	register unsigned long __sc0 __asm__ ("gr8") = (unsigned long) arg1;			 \
-	register unsigned long __sc1 __asm__ ("gr9") = (unsigned long) arg2;			 \
-	register unsigned long __sc2 __asm__ ("gr10") = (unsigned long) arg3;			 \
-	register unsigned long __sc3 __asm__ ("gr11") = (unsigned long) arg4;			 \
-	register unsigned long __sc4 __asm__ ("gr12") = (unsigned long) arg5;			 \
-	register unsigned long __sc5 __asm__ ("gr13") = (unsigned long) arg6;			 \
-	__asm__ __volatile__ ("tira gr0,#0"							 \
-			      : "+r" (__sc0)							 \
-			      : "r" (__scnum), "r" (__sc1), "r" (__sc2),			 \
-			      "r" (__sc3), "r" (__sc4), "r" (__sc5));				 \
-	__syscall_return(type, __sc0);								 \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 /* #define __ARCH_WANT_OLD_READDIR */
diff --git a/include/asm-h8300/unistd.h b/include/asm-h8300/unistd.h
index 747788d629a..7ddd414f8d1 100644
--- a/include/asm-h8300/unistd.h
+++ b/include/asm-h8300/unistd.h
@@ -295,172 +295,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 289
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
-   <asm-m68k/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* avoid using res which is declared to be in register d0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type, name)				\
-type name(void)						\
-{							\
-  register long __res __asm__("er0");			\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name)		\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall1(type, name, atype, a)			\
-type name(atype a)					\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  _a = (long)a;						\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a)			\
-			: "cc", "memory");	 	\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall2(type, name, atype, a, btype, b)	\
-type name(atype a, btype b)				\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  _a = (long)a;						\
-  _b = (long)b;						\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b)			\
-			: "cc", "memory");	 	\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall3(type, name, atype, a, btype, b, ctype, c)	\
-type name(atype a, btype b, ctype c)			\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  _a = (long)a;						\
-  _b = (long)b;						\
-  _c = (long)c;						\
-  __asm__ __volatile__ ("mov.l %1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall4(type, name, atype, a, btype, b,	\
-                  ctype, c, dtype, d)			\
-type name(atype a, btype b, ctype c, dtype d)		\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  register long _d __asm__("er4");			\
-  _a = (long)a;						\
-  _b = (long)b;						\
-  _c = (long)c;						\
-  _d = (long)d;						\
-  __asm__ __volatile__ ("mov.l	%1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c),			\
-			  "g" (_d)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall5(type, name, atype, a, btype, b,	\
-                  ctype, c, dtype, d, etype, e)		\
-type name(atype a, btype b, ctype c, dtype d, etype e)	\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  register long _d __asm__("er4");			\
-  register long _e __asm__("er5");			\
-  _a = (long)a;                                       	\
-  _b = (long)b;                                       	\
-  _c = (long)c;                                       	\
-  _d = (long)d;                                       	\
-  _e = (long)e;                                       	\
-  __asm__ __volatile__ ("mov.l	%1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c),			\
-			  "g" (_d),			\
-			  "g" (_e)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
-
-#define _syscall6(type, name, atype, a, btype, b,	\
-                  ctype, c, dtype, d, etype, e, ftype, f)	\
-type name(atype a, btype b, ctype c, dtype d, etype e, ftype f)	\
-{							\
-  register long __res __asm__("er0");			\
-  register long _a __asm__("er1");			\
-  register long _b __asm__("er2");			\
-  register long _c __asm__("er3");			\
-  register long _d __asm__("er4");			\
-  register long _e __asm__("er5");			\
-  register long _f __asm__("er6");			\
-  _a = (long)a;                                       	\
-  _b = (long)b;                                       	\
-  _c = (long)c;                                       	\
-  _d = (long)d;                                       	\
-  _e = (long)e;                                       	\
-  _f = (long)f;                                       	\
-  __asm__ __volatile__ ("mov.l	%1,er0\n\t"		\
-                        "trapa	#0\n\t"			\
-			: "=r" (__res)			\
-			: "g" (__NR_##name),		\
-			  "g" (_a),			\
-			  "g" (_b),			\
-			  "g" (_c),			\
-			  "g" (_d),			\
-			  "g" (_e)			\
-			  "g" (_f)			\
-			: "cc", "memory");		\
-  __syscall_return(type, __res);			\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index beeeaf6b054..833fa1704ff 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -329,104 +329,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 320
-#include <linux/err.h>
-
-/*
- * user-visible error numbers are in the range -1 - -MAX_ERRNO: see
- * <asm-i386/errno.h>
- */
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-		errno = -(res); \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-__asm__ volatile ("int $0x80" \
-	: "=a" (__res) \
-	: "0" (__NR_##name)); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)) : "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)), \
-		  "d" ((long)(arg3)) : "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"S" ((long)(arg4)) : "memory"); \
-__syscall_return(type,__res); \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-__asm__ volatile ("push %%ebx ; movl %2,%%ebx ; movl %1,%%eax ; " \
-                  "int $0x80 ; pop %%ebx" \
-	: "=a" (__res) \
-	: "i" (__NR_##name),"ri" ((long)(arg1)),"c" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5)) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5,type6,arg6) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \
-{ \
-long __res; \
-  struct { long __a1; long __a6; } __s = { (long)arg1, (long)arg6 }; \
-__asm__ volatile ("push %%ebp ; push %%ebx ; movl 4(%2),%%ebp ; " \
-                  "movl 0(%2),%%ebx ; movl %1,%%eax ; int $0x80 ; " \
-                  "pop %%ebx ;  pop %%ebp" \
-	: "=a" (__res) \
-	: "i" (__NR_##name),"0" ((long)(&__s)),"c" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5)) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-m32r/unistd.h b/include/asm-m32r/unistd.h
index 95aa34298d8..5b66bd3c6ed 100644
--- a/include/asm-m32r/unistd.h
+++ b/include/asm-m32r/unistd.h
@@ -296,117 +296,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls 285
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
- * <asm-m32r/errno.h>
- */
-
-#include <asm/syscall.h>	/* SYSCALL_* */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* Avoid using "res" which is declared to be in register r0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __res __asm__("r0"); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg3 __asm__ ("r2") = (long)(arg3); \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2), \
-		"r" (__arg3) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name(type1 arg1,type2 arg2,type3 arg3,type4 arg4) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg4 __asm__ ("r3") = (long)(arg4); \
-register long __arg3 __asm__ ("r2") = (long)(arg3); \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2), \
-		"r" (__arg3), "r" (__arg4) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	type5,arg5) \
-type name(type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-register long __scno __asm__ ("r7") = __NR_##name; \
-register long __arg5 __asm__ ("r4") = (long)(arg5); \
-register long __arg4 __asm__ ("r3") = (long)(arg4); \
-register long __arg3 __asm__ ("r2") = (long)(arg3); \
-register long __arg2 __asm__ ("r1") = (long)(arg2); \
-register long __res __asm__ ("r0") = (long)(arg1); \
-__asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR "|| nop"\
-	: "=r" (__res) \
-	: "r" (__scno), "0" (__res), "r" (__arg2), \
-		"r" (__arg3), "r" (__arg4), "r" (__arg5) \
-	: "memory"); \
-__syscall_return(type,__res); \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-m68k/unistd.h b/include/asm-m68k/unistd.h
index ad4348058c6..fdbb60e6a0d 100644
--- a/include/asm-m68k/unistd.h
+++ b/include/asm-m68k/unistd.h
@@ -317,103 +317,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls		311
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
-   <asm-m68k/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* avoid using res which is declared to be in register d0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-__asm__ __volatile__ ("trap  #0" \
-                      : "+d" (__res) ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,atype,a) \
-type name(atype a) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-		      : "d" (__a)  ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,atype,a,btype,b) \
-type name(atype a,btype b) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-                      : "d" (__a), "d" (__b) \
-		     ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,atype,a,btype,b,ctype,c) \
-type name(atype a,btype b,ctype c) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-register long __c __asm__ ("%d3") = (long)(c); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-                      : "d" (__a), "d" (__b), \
-			"d" (__c) \
-		     ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,atype,a,btype,b,ctype,c,dtype,d) \
-type name (atype a, btype b, ctype c, dtype d) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-register long __c __asm__ ("%d3") = (long)(c); \
-register long __d __asm__ ("%d4") = (long)(d); \
-__asm__ __volatile__ ("trap  #0" \
-                      : "+d" (__res) \
-                      : "d" (__a), "d" (__b), \
-			"d" (__c), "d" (__d)  \
-		     ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e) \
-type name (atype a,btype b,ctype c,dtype d,etype e) \
-{ \
-register long __res __asm__ ("%d0") = __NR_##name; \
-register long __a __asm__ ("%d1") = (long)(a); \
-register long __b __asm__ ("%d2") = (long)(b); \
-register long __c __asm__ ("%d3") = (long)(c); \
-register long __d __asm__ ("%d4") = (long)(d); \
-register long __e __asm__ ("%d5") = (long)(e); \
-__asm__ __volatile__ ("trap  #0" \
-		      : "+d" (__res) \
-		      : "d" (__a), "d" (__b), \
-			"d" (__c), "d" (__d), "d" (__e)  \
-                     ); \
-__syscall_return(type,__res); \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-m68knommu/unistd.h b/include/asm-m68knommu/unistd.h
index ebaf0319711..82e03195f32 100644
--- a/include/asm-m68knommu/unistd.h
+++ b/include/asm-m68knommu/unistd.h
@@ -318,156 +318,6 @@
 #ifdef __KERNEL__
 
 #define NR_syscalls		311
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO: see
-   <asm-m68k/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* avoid using res which is declared to be in register d0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#define _syscall0(type, name)							\
-type name(void)									\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name)					\
-			: "cc", "%d0");						\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall1(type, name, atype, a)						\
-type name(atype a)								\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%2, %%d1\n\t"					\
-  			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "g" ((long)a)						\
-			: "cc", "%d0", "%d1");					\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall2(type, name, atype, a, btype, b)				\
-type name(atype a, btype b)							\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "g" ((long)b)						\
-			: "cc", "%d0", "%d1", "%d2");				\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall3(type, name, atype, a, btype, b, ctype, c)			\
-type name(atype a, btype b, ctype c)						\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%4, %%d3\n\t"					\
-			"movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "a" ((long)b),					\
-			  "g" ((long)c)						\
-			: "cc", "%d0", "%d1", "%d2", "%d3");			\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall4(type, name, atype, a, btype, b, ctype, c, dtype, d)		\
-type name(atype a, btype b, ctype c, dtype d)					\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%5, %%d4\n\t"					\
-			"movel	%4, %%d3\n\t"					\
-			"movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "a" ((long)b),					\
-			  "a" ((long)c),					\
-			  "g" ((long)d)						\
-			: "cc", "%d0", "%d1", "%d2", "%d3",			\
-			  "%d4");						\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
-
-#define _syscall5(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e)	\
-type name(atype a, btype b, ctype c, dtype d, etype e)				\
-{										\
-  long __res;									\
-  __asm__ __volatile__ ("movel	%6, %%d5\n\t"					\
-			"movel	%5, %%d4\n\t"					\
-			"movel	%4, %%d3\n\t"					\
-			"movel	%3, %%d2\n\t"					\
-  			"movel	%2, %%d1\n\t"					\
-			"movel	%1, %%d0\n\t"					\
-  			"trap	#0\n\t"						\
-  			"movel	%%d0, %0"					\
-			: "=g" (__res)						\
-			: "i" (__NR_##name),					\
-			  "a" ((long)a),					\
-			  "a" ((long)b),					\
-			  "a" ((long)c),					\
-			  "a" ((long)d),					\
-			  "g" ((long)e)						\
-			: "cc", "%d0", "%d1", "%d2", "%d3",			\
-			  "%d4", "%d5");					\
-  if ((unsigned long)(__res) >= (unsigned long)(-125)) {				\
-    errno = -__res;								\
-    __res = -1;									\
-  }										\
-  return (type)__res;								\
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-mips/unistd.h b/include/asm-mips/unistd.h
index ec56aa52f66..696cff39a1d 100644
--- a/include/asm-mips/unistd.h
+++ b/include/asm-mips/unistd.h
@@ -933,268 +933,6 @@
 
 #ifndef __ASSEMBLY__
 
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %2\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-/*
- * DANGER: This macro isn't usable for the pipe(2) call
- * which has a unusual return convention.
- */
-#define _syscall1(type,name,atype,a) \
-type name(atype a) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %3\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "r" (__a0), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall2(type,name,atype,a,btype,b) \
-type name(atype a, btype b) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %4\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "r" (__a0), "r" (__a1), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall3(type,name,atype,a,btype,b,ctype,c) \
-type name(atype a, btype b, ctype c) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7"); \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "=r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall4(type,name,atype,a,btype,b,ctype,c,dtype,d) \
-type name(atype a, btype b, ctype c, dtype d) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#if (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-/*
- * Using those means your brain needs more than an oil change ;-)
- */
-
-#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e) \
-type name(atype a, btype b, ctype c, dtype d, etype e) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"lw\t$2, %6\n\t" \
-	"subu\t$29, 32\n\t" \
-	"sw\t$2, 16($29)\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	"addiu\t$29, 32\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name), \
-	  "m" ((unsigned long)e) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall6(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e,ftype,f) \
-type name(atype a, btype b, ctype c, dtype d, etype e, ftype f) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"lw\t$2, %6\n\t" \
-	"lw\t$8, %7\n\t" \
-	"subu\t$29, 32\n\t" \
-	"sw\t$2, 16($29)\n\t" \
-	"sw\t$8, 20($29)\n\t" \
-	"li\t$2, %5\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	"addiu\t$29, 32\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_##name), \
-	  "m" ((unsigned long)e), "m" ((unsigned long)f) \
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#endif /* (_MIPS_SIM == _MIPS_SIM_ABI32) */
-
-#if (_MIPS_SIM == _MIPS_SIM_NABI32) || (_MIPS_SIM == _MIPS_SIM_ABI64)
-
-#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e) \
-type name (atype a,btype b,ctype c,dtype d,etype e) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	register unsigned long __a4 asm("$8") = (unsigned long) e; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %6\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "r" (__a4), "i" (__NR_##name) \
-	: "$2", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#define _syscall6(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e,ftype,f) \
-type name (atype a,btype b,ctype c,dtype d,etype e,ftype f) \
-{ \
-	register unsigned long __a0 asm("$4") = (unsigned long) a; \
-	register unsigned long __a1 asm("$5") = (unsigned long) b; \
-	register unsigned long __a2 asm("$6") = (unsigned long) c; \
-	register unsigned long __a3 asm("$7") = (unsigned long) d; \
-	register unsigned long __a4 asm("$8") = (unsigned long) e; \
-	register unsigned long __a5 asm("$9") = (unsigned long) f; \
-	unsigned long __v0; \
-	\
-	__asm__ volatile ( \
-	".set\tnoreorder\n\t" \
-	"li\t$2, %7\t\t\t# " #name "\n\t" \
-	"syscall\n\t" \
-	"move\t%0, $2\n\t" \
-	".set\treorder" \
-	: "=&r" (__v0), "+r" (__a3) \
-	: "r" (__a0), "r" (__a1), "r" (__a2), "r" (__a4), "r" (__a5), \
-	  "i" (__NR_##name) \
-	: "$2", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", \
-	  "memory"); \
-	\
-	if (__a3 == 0) \
-		return (type) __v0; \
-	errno = __v0; \
-	return (type) -1; \
-}
-
-#endif /* (_MIPS_SIM == _MIPS_SIM_NABI32) || (_MIPS_SIM == _MIPS_SIM_ABI64) */
-
-
 #define __ARCH_OMIT_COMPAT_SYS_GETDENTS64
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 04b6c17cc59..0ae954e3d25 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -334,115 +334,6 @@
 
 #ifndef __ASSEMBLY__
 
-/* On powerpc a system call basically clobbers the same registers like a
- * function call, with the exception of LR (which is needed for the
- * "sc; bnslr" sequence) and CR (where only CR0.SO is clobbered to signal
- * an error return status).
- */
-
-#define __syscall_nr(nr, type, name, args...)				\
-	unsigned long __sc_ret, __sc_err;				\
-	{								\
-		register unsigned long __sc_0  __asm__ ("r0");		\
-		register unsigned long __sc_3  __asm__ ("r3");		\
-		register unsigned long __sc_4  __asm__ ("r4");		\
-		register unsigned long __sc_5  __asm__ ("r5");		\
-		register unsigned long __sc_6  __asm__ ("r6");		\
-		register unsigned long __sc_7  __asm__ ("r7");		\
-		register unsigned long __sc_8  __asm__ ("r8");		\
-									\
-		__sc_loadargs_##nr(name, args);				\
-		__asm__ __volatile__					\
-			("sc           \n\t"				\
-			 "mfcr %0      "				\
-			: "=&r" (__sc_0),				\
-			  "=&r" (__sc_3),  "=&r" (__sc_4),		\
-			  "=&r" (__sc_5),  "=&r" (__sc_6),		\
-			  "=&r" (__sc_7),  "=&r" (__sc_8)		\
-			: __sc_asm_input_##nr				\
-			: "cr0", "ctr", "memory",			\
-			  "r9", "r10","r11", "r12");			\
-		__sc_ret = __sc_3;					\
-		__sc_err = __sc_0;					\
-	}								\
-	if (__sc_err & 0x10000000)					\
-	{								\
-		errno = __sc_ret;					\
-		__sc_ret = -1;						\
-	}								\
-	return (type) __sc_ret
-
-#define __sc_loadargs_0(name, dummy...)					\
-	__sc_0 = __NR_##name
-#define __sc_loadargs_1(name, arg1)					\
-	__sc_loadargs_0(name);						\
-	__sc_3 = (unsigned long) (arg1)
-#define __sc_loadargs_2(name, arg1, arg2)				\
-	__sc_loadargs_1(name, arg1);					\
-	__sc_4 = (unsigned long) (arg2)
-#define __sc_loadargs_3(name, arg1, arg2, arg3)				\
-	__sc_loadargs_2(name, arg1, arg2);				\
-	__sc_5 = (unsigned long) (arg3)
-#define __sc_loadargs_4(name, arg1, arg2, arg3, arg4)			\
-	__sc_loadargs_3(name, arg1, arg2, arg3);			\
-	__sc_6 = (unsigned long) (arg4)
-#define __sc_loadargs_5(name, arg1, arg2, arg3, arg4, arg5)		\
-	__sc_loadargs_4(name, arg1, arg2, arg3, arg4);			\
-	__sc_7 = (unsigned long) (arg5)
-#define __sc_loadargs_6(name, arg1, arg2, arg3, arg4, arg5, arg6)	\
-	__sc_loadargs_5(name, arg1, arg2, arg3, arg4, arg5);		\
-	__sc_8 = (unsigned long) (arg6)
-
-#define __sc_asm_input_0 "0" (__sc_0)
-#define __sc_asm_input_1 __sc_asm_input_0, "1" (__sc_3)
-#define __sc_asm_input_2 __sc_asm_input_1, "2" (__sc_4)
-#define __sc_asm_input_3 __sc_asm_input_2, "3" (__sc_5)
-#define __sc_asm_input_4 __sc_asm_input_3, "4" (__sc_6)
-#define __sc_asm_input_5 __sc_asm_input_4, "5" (__sc_7)
-#define __sc_asm_input_6 __sc_asm_input_5, "6" (__sc_8)
-
-#define _syscall0(type,name)						\
-type name(void)								\
-{									\
-	__syscall_nr(0, type, name);					\
-}
-
-#define _syscall1(type,name,type1,arg1)					\
-type name(type1 arg1)							\
-{									\
-	__syscall_nr(1, type, name, arg1);				\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)			\
-type name(type1 arg1, type2 arg2)					\
-{									\
-	__syscall_nr(2, type, name, arg1, arg2);			\
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)		\
-type name(type1 arg1, type2 arg2, type3 arg3)				\
-{									\
-	__syscall_nr(3, type, name, arg1, arg2, arg3);			\
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4)		\
-{									\
-	__syscall_nr(4, type, name, arg1, arg2, arg3, arg4);		\
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5)	\
-{									\
-	__syscall_nr(5, type, name, arg1, arg2, arg3, arg4, arg5);	\
-}
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6) \
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) \
-{									\
-	__syscall_nr(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6); \
-}
-
-
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/linkage.h>
diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h
index 71d3c21b84f..fb6fef97d73 100644
--- a/include/asm-s390/unistd.h
+++ b/include/asm-s390/unistd.h
@@ -345,160 +345,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/err.h>
-
-#define __syscall_return(type, res)			     \
-do {							     \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-		errno = -(res);				     \
-		res = -1;				     \
-	}						     \
-	return (type) (res);				     \
-} while (0)
-
-#define _svc_clobber "1", "cc", "memory"
-
-#define _syscall0(type,name)					\
-type name(void) {						\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name)				\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall1(type,name,type1,arg1)				\
-type name(type1 arg1) {						\
-	register type1 __arg1 asm("2") = arg1;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2)		\
-type name(type1 arg1, type2 arg2) {				\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2)					\
-		: _svc_clobber );				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)	\
-type name(type1 arg1, type2 arg2, type3 arg3) {			\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register type3 __arg3 asm("4") = arg3;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2),					\
-		  "d" (__arg3)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,	\
-		  type4,name4)					\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {	\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register type3 __arg3 asm("4") = arg3;			\
-	register type4 __arg4 asm("5") = arg4;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2),					\
-		  "d" (__arg3),					\
-		  "d" (__arg4)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,	\
-		  type4,name4,type5,name5)			\
-type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4,	\
-	  type5 arg5) {						\
-	register type1 __arg1 asm("2") = arg1;			\
-	register type2 __arg2 asm("3") = arg2;			\
-	register type3 __arg3 asm("4") = arg3;			\
-	register type4 __arg4 asm("5") = arg4;			\
-	register type5 __arg5 asm("6") = arg5;			\
-	register long __svcres asm("2");			\
-	long __res;						\
-	asm volatile(						\
-		"	.if	%1 < 256\n"			\
-		"	svc	%b1\n"				\
-		"	.else\n"				\
-		"	la	%%r1,%1\n"			\
-		"	svc	0\n"				\
-		"	.endif"					\
-		: "=d" (__svcres)				\
-		: "i" (__NR_##name),				\
-		  "0" (__arg1),					\
-		  "d" (__arg2),					\
-		  "d" (__arg3),					\
-		  "d" (__arg4),					\
-		  "d" (__arg5)					\
-		: _svc_clobber);				\
-	__res = __svcres;					\
-	__syscall_return(type,__res);				\
-}
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_SYS_ALARM
diff --git a/include/asm-sh/unistd.h b/include/asm-sh/unistd.h
index 0cae1d24876..f982073dc6c 100644
--- a/include/asm-sh/unistd.h
+++ b/include/asm-sh/unistd.h
@@ -332,143 +332,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO:
- * see <asm-sh/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-	/* Avoid using "res" which is declared to be in register r0; \
-	   errno might expand to a function call and clobber it.  */ \
-		int __err = -(res); \
-		errno = __err; \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
-#if defined(__sh2__) || defined(__SH2E__) || defined(__SH2A__)
-#define SYSCALL_ARG0	"trapa #0x20"
-#define SYSCALL_ARG1	"trapa #0x21"
-#define SYSCALL_ARG2	"trapa #0x22"
-#define SYSCALL_ARG3	"trapa #0x23"
-#define SYSCALL_ARG4	"trapa #0x24"
-#define SYSCALL_ARG5	"trapa #0x25"
-#define SYSCALL_ARG6	"trapa #0x26"
-#else
-#define SYSCALL_ARG0	"trapa #0x10"
-#define SYSCALL_ARG1	"trapa #0x11"
-#define SYSCALL_ARG2	"trapa #0x12"
-#define SYSCALL_ARG3	"trapa #0x13"
-#define SYSCALL_ARG4	"trapa #0x14"
-#define SYSCALL_ARG5	"trapa #0x15"
-#define SYSCALL_ARG6	"trapa #0x16"
-#endif
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-__asm__ __volatile__ (SYSCALL_ARG0 \
-	: "=z" (__sc0) \
-	: "0" (__sc0) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-__asm__ __volatile__ (SYSCALL_ARG1 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4) \
-	: "memory"); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-__asm__ __volatile__ (SYSCALL_ARG2 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5) \
-	: "memory"); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-__asm__ __volatile__ (SYSCALL_ARG3 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6) \
-	: "memory"); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-register long __sc0 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-register long __sc7 __asm__ ("r7") = (long) arg4; \
-__asm__ __volatile__ (SYSCALL_ARG4 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6),  \
-	  "r" (__sc7) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
-{ \
-register long __sc3 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-register long __sc7 __asm__ ("r7") = (long) arg4; \
-register long __sc0 __asm__ ("r0") = (long) arg5; \
-__asm__ __volatile__ (SYSCALL_ARG5 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6), "r" (__sc7),  \
-	  "r" (__sc3) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) \
-{ \
-register long __sc3 __asm__ ("r3") = __NR_##name; \
-register long __sc4 __asm__ ("r4") = (long) arg1; \
-register long __sc5 __asm__ ("r5") = (long) arg2; \
-register long __sc6 __asm__ ("r6") = (long) arg3; \
-register long __sc7 __asm__ ("r7") = (long) arg4; \
-register long __sc0 __asm__ ("r0") = (long) arg5; \
-register long __sc1 __asm__ ("r1") = (long) arg6; \
-__asm__ __volatile__ (SYSCALL_ARG6 \
-	: "=z" (__sc0) \
-	: "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6), "r" (__sc7),  \
-	  "r" (__sc3), "r" (__sc1) \
-	: "memory" ); \
-__syscall_return(type,__sc0); \
-}
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
diff --git a/include/asm-sh64/unistd.h b/include/asm-sh64/unistd.h
index ee7828b27ad..1f38a7aacaa 100644
--- a/include/asm-sh64/unistd.h
+++ b/include/asm-sh64/unistd.h
@@ -347,148 +347,6 @@
 #ifdef __KERNEL__ 
 
 #define NR_syscalls 321
-#include <linux/err.h>
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO:
- * see <asm-sh64/errno.h> */
-
-#define __syscall_return(type, res) \
-do { \
-	/* Note: when returning from kernel the return value is in r9	    \
-	**       This prevents conflicts between return value and arg1      \
-	**       when dispatching signal handler, in other words makes	    \
-	**       life easier in the system call epilogue (see entry.S)      \
-	*/								    \
-        register unsigned long __sr2 __asm__ ("r2") = res;		    \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) {	    \
-		errno = -(res);						    \
-		__sr2 = -1; 						    \
-	} \
-	return (type) (__sr2); 						    \
-} while (0)
-
-/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x10 << 16) | __NR_##name); \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "()"			    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0) ); 						    \
-__syscall_return(type,__sc0); 						    \
-}
-
-	/*
-	 * The apparent spurious "dummy" assembler comment is *needed*,
-	 * as without it, the compiler treats the arg<n> variables
-	 * as no longer live just before the asm. The compiler can
-	 * then optimize the storage into any registers it wishes.
-	 * The additional dummy statement forces the compiler to put
-	 * the arguments into the correct registers before the TRAPA.
-	 */
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x11 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2)"		    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2));					    \
-__asm__ __volatile__ ("!dummy	%0 %1"				   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2));					    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x12 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3)"		    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3) );			    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2"			   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3) );			    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x13 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4)"		    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4) );		    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3"			   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4) );	   	    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x14 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-register unsigned long __sc5 __asm__ ("r5") = (unsigned long) arg4;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4,%5)"	    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5) );\
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3 %4"			   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5) );\
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x15 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-register unsigned long __sc5 __asm__ ("r5") = (unsigned long) arg4;	    \
-register unsigned long __sc6 __asm__ ("r6") = (unsigned long) arg5;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4,%5,%6)"	    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6));							    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3 %4 %5"		   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6));							    \
-__syscall_return(type,__sc0); 						    \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5, type6, arg6) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) \
-{ \
-register unsigned long __sc0 __asm__ ("r9") = ((0x16 << 16) | __NR_##name); \
-register unsigned long __sc2 __asm__ ("r2") = (unsigned long) arg1;	    \
-register unsigned long __sc3 __asm__ ("r3") = (unsigned long) arg2;	    \
-register unsigned long __sc4 __asm__ ("r4") = (unsigned long) arg3;	    \
-register unsigned long __sc5 __asm__ ("r5") = (unsigned long) arg4;	    \
-register unsigned long __sc6 __asm__ ("r6") = (unsigned long) arg5;	    \
-register unsigned long __sc7 __asm__ ("r7") = (unsigned long) arg6;	    \
-__asm__ __volatile__ ("trapa	%1 !\t\t\t" #name "(%2,%3,%4,%5,%6,%7)"	    \
-	: "=r" (__sc0) 							    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6), "r" (__sc7));					    \
-__asm__ __volatile__ ("!dummy	%0 %1 %2 %3 %4 %5 %6"		   	    \
-	:								    \
-	: "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4), "r" (__sc5),  \
-	  "r" (__sc6), "r" (__sc7));					    \
-__syscall_return(type,__sc0); 						    \
-}
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-sparc/unistd.h b/include/asm-sparc/unistd.h
index f7827fa4cd5..d5b2f8053b3 100644
--- a/include/asm-sparc/unistd.h
+++ b/include/asm-sparc/unistd.h
@@ -329,136 +329,6 @@
  *          find a free slot in the 0-302 range.
  */
 
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res)\
-		      : "r" (__g1) \
-		      : "o0", "cc"); \
-if (__res < -255 || __res >= 0) \
-    return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-register long __o4 __asm__ ("o4") = (long)(arg5); \
-__asm__ __volatile__ ("t 0x10\n\t" \
-		      "bcc 1f\n\t" \
-		      "mov %%o0, %0\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "1:\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__o4), "r" (__g1) \
-		      : "cc"); \
-if (__res < -255 || __res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h
index 63669dad0d7..47047536f26 100644
--- a/include/asm-sparc64/unistd.h
+++ b/include/asm-sparc64/unistd.h
@@ -332,124 +332,6 @@
  *          find a free slot in the 0-302 range.
  */
 
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res)\
-		      : "r" (__g1) \
-		      : "o0", "cc"); \
-if (__res >= 0) \
-    return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__g1) \
-		      : "cc"); \
-if (__res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__g1) \
-		      : "cc"); \
-if (__res >= 0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1) \
-		      : "cc"); \
-if (__res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__g1) \
-		      : "cc"); \
-if (__res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-register long __g1 __asm__ ("g1") = __NR_##name; \
-register long __o0 __asm__ ("o0") = (long)(arg1); \
-register long __o1 __asm__ ("o1") = (long)(arg2); \
-register long __o2 __asm__ ("o2") = (long)(arg3); \
-register long __o3 __asm__ ("o3") = (long)(arg4); \
-register long __o4 __asm__ ("o4") = (long)(arg5); \
-__asm__ __volatile__ ("t 0x6d\n\t" \
-		      "sub %%g0, %%o0, %0\n\t" \
-		      "movcc %%xcc, %%o0, %0\n\t" \
-		      : "=r" (__res), "=&r" (__o0) \
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__o3), "r" (__o4), "r" (__g1) \
-		      : "cc"); \
-if (__res>=0) \
-	return (type) __res; \
-errno = -__res; \
-return -1; \
-}
-
 /* sysconf options, for SunOS compatibility */
 #define   _SC_ARG_MAX             1
 #define   _SC_CHILD_MAX           2
diff --git a/include/asm-v850/unistd.h b/include/asm-v850/unistd.h
index 737401e7d3a..2241ed45ecf 100644
--- a/include/asm-v850/unistd.h
+++ b/include/asm-v850/unistd.h
@@ -204,168 +204,8 @@
 #define __NR_gettid		201
 #define __NR_tkill		202
 
-
-/* Syscall protocol:
-   Syscall number in r12, args in r6-r9, r13-r14
-   Return value in r10
-   Trap 0 for `short' syscalls, where all the args can fit in function
-   call argument registers, and trap 1 when there are additional args in
-   r13-r14.  */
-
-#define SYSCALL_NUM	"r12"
-#define SYSCALL_ARG0	"r6"
-#define SYSCALL_ARG1	"r7"
-#define SYSCALL_ARG2	"r8"
-#define SYSCALL_ARG3	"r9"
-#define SYSCALL_ARG4	"r13"
-#define SYSCALL_ARG5	"r14"
-#define SYSCALL_RET	"r10"
-
-#define SYSCALL_SHORT_TRAP	"0"
-#define SYSCALL_LONG_TRAP	"1"
-
-/* Registers clobbered by any syscall.  This _doesn't_ include the syscall
-   number (r12) or the `extended arg' registers (r13, r14), even though
-   they are actually clobbered too (this is because gcc's `asm' statement
-   doesn't allow a clobber to be used as an input or output).  */
-#define SYSCALL_CLOBBERS	"r1", "r5", "r11", "r15", "r16", \
-				"r17", "r18", "r19"
-
-/* Registers clobbered by a `short' syscall.  This includes all clobbers
-   except the syscall number (r12).  */
-#define SYSCALL_SHORT_CLOBBERS	SYSCALL_CLOBBERS, "r13", "r14"
-
 #ifdef __KERNEL__
 
-#include <asm/clinkage.h>
-#include <linux/err.h>
-
-#define __syscall_return(type, res)					      \
-  do {									      \
-	  /* user-visible error numbers are in the range -1 - -MAX_ERRNO:      \
-	     see <asm-v850/errno.h> */					      \
-	  if (__builtin_expect ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO), 0)) { \
-		  errno = -(res);					      \
-		  res = -1;						      \
-	  }								      \
-	  return (type) (res);						      \
-  } while (0)
-
-
-#define _syscall0(type, name)						      \
-type name (void)							      \
-{									      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)	 	      \
-			: "1" (__syscall)				      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall1(type, name, atype, a)					      \
-type name (atype a)							      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall), "r" (__a)			      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall2(type, name, atype, a, btype, b)			      \
-type name (atype a, btype b)						      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall), "r" (__a), "r" (__b)		      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall3(type, name, atype, a, btype, b, ctype, c)		      \
-type name (atype a, btype b, ctype c)					      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall), "r" (__a), "r" (__b), "r" (__c)    \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall4(type, name, atype, a, btype, b, ctype, c, dtype, d)	      \
-type name (atype a, btype b, ctype c, dtype d)				      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register dtype __d __asm__ (SYSCALL_ARG3) = d;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_SHORT_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall)		      \
-			: "1" (__syscall),				      \
-			"r" (__a), "r" (__b), "r" (__c), "r" (__d)	      \
-			: SYSCALL_SHORT_CLOBBERS);			      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define _syscall5(type, name, atype, a, btype, b, ctype, c, dtype, d, etype,e)\
-type name (atype a, btype b, ctype c, dtype d, etype e)			      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register dtype __d __asm__ (SYSCALL_ARG3) = d;			      \
-  register etype __e __asm__ (SYSCALL_ARG4) = e;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
-			: "=r" (__ret), "=r" (__syscall), "=r" (__e)	      \
-			: "1" (__syscall),				      \
-			"r" (__a), "r" (__b), "r" (__c), "r" (__d), "2" (__e) \
-			: SYSCALL_CLOBBERS);				      \
-  __syscall_return (type, __ret);					      \
-}
-
-#define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f)			      \
-  __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
-			: "=r" (ret), "=r" (syscall),			      \
- 			"=r" (e), "=r" (f)				      \
-			: "1" (syscall),				      \
-			"r" (a), "r" (b), "r" (c), "r" (d),		      \
-			"2" (e), "3" (f)				      \
-			: SYSCALL_CLOBBERS);
-
-#define _syscall6(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e, ftype, f) \
-type name (atype a, btype b, ctype c, dtype d, etype e, ftype f)	      \
-{									      \
-  register atype __a __asm__ (SYSCALL_ARG0) = a;			      \
-  register btype __b __asm__ (SYSCALL_ARG1) = b;			      \
-  register ctype __c __asm__ (SYSCALL_ARG2) = c;			      \
-  register dtype __d __asm__ (SYSCALL_ARG3) = d;			      \
-  register etype __e __asm__ (SYSCALL_ARG4) = e;			      \
-  register etype __f __asm__ (SYSCALL_ARG5) = f;			      \
-  register unsigned long __syscall __asm__ (SYSCALL_NUM) = __NR_##name;	      \
-  register unsigned long __ret __asm__ (SYSCALL_RET);			      \
-  __SYSCALL6_TRAP(__syscall, __ret, __a, __b, __c, __d, __e, __f);	      \
-  __syscall_return (type, __ret);					      \
-}
-		
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 777288eb7e7..c5f596e71fa 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -622,25 +622,7 @@ __SYSCALL(__NR_move_pages, sys_move_pages)
 
 #define __NR_syscall_max __NR_move_pages
 
-#ifdef __KERNEL__
-#include <linux/err.h>
-#endif
-
 #ifndef __NO_STUBS
-
-/* user-visible error numbers are in the range -1 - -MAX_ERRNO */
-
-#define __syscall_clobber "r11","rcx","memory" 
-
-#define __syscall_return(type, res) \
-do { \
-	if ((unsigned long)(res) >= (unsigned long)(-MAX_ERRNO)) { \
-		errno = -(res); \
-		res = -1; \
-	} \
-	return (type) (res); \
-} while (0)
-
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
 #define __ARCH_WANT_SYS_ALARM
@@ -664,87 +646,6 @@ do { \
 #define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_TIME
 
-#define __syscall "syscall"
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name) : __syscall_clobber ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type1,arg1) \
-type name(type1 arg1) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)) : __syscall_clobber ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type1,arg1,type2,arg2) \
-type name(type1 arg1,type2 arg2) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)) : __syscall_clobber ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
-type name(type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-__asm__ volatile (__syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-		  "d" ((long)(arg3)) : __syscall_clobber); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
-{ \
-long __res; \
-__asm__ volatile ("movq %5,%%r10 ;" __syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"g" ((long)(arg4)) : __syscall_clobber,"r10" ); \
-__syscall_return(type,__res); \
-} 
-
-#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-	  "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)) : \
-	__syscall_clobber,"r8","r10" ); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
-	  type5,arg5,type6,arg6) \
-type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \
-{ \
-long __res; \
-__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9 ; " __syscall \
-	: "=a" (__res) \
-	: "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
-	  "d" ((long)(arg3)), "g" ((long)(arg4)), "g" ((long)(arg5)), \
-	  "g" ((long)(arg6)) : \
-	__syscall_clobber,"r8","r10","r9" ); \
-__syscall_return(type,__res); \
-}
-
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
diff --git a/include/asm-xtensa/unistd.h b/include/asm-xtensa/unistd.h
index 411f810a55c..2e1a1b997e7 100644
--- a/include/asm-xtensa/unistd.h
+++ b/include/asm-xtensa/unistd.h
@@ -218,190 +218,6 @@
 
 #define SYSXTENSA_COUNT		   5	/* count of syscall0 functions*/
 
-#ifdef __KERNEL__
-#include <linux/linkage.h>
-
-#define __syscall_return(type, res) return ((type)(res))
-
-/* Tensilica's xt-xcc compiler is much more agressive at code
- * optimization than gcc.  Multiple __asm__ statements are
- * insufficient for xt-xcc because subsequent optimization passes
- * (beyond the front-end that knows of __asm__ statements and other
- * such GNU Extensions to C) can modify the register selection for
- * containment of C variables.
- *
- * xt-xcc cannot modify the contents of a single __asm__ statement, so
- * we create single-asm versions of the syscall macros that are
- * suitable and optimal for both xt-xcc and gcc.
- *
- * Linux takes system-call arguments in registers.  The following
- * design is optimized for user-land apps (e.g., glibc) which
- * typically have a function wrapper around the "syscall" assembly
- * instruction.  It satisfies the Xtensa ABI while minizing argument
- * shifting.
- *
- * The Xtensa ABI and software conventions require the system-call
- * number in a2.  If an argument exists in a2, we move it to the next
- * available register.  Note that for improved efficiency, we do NOT
- * shift all parameters down one register to maintain the original
- * order.
- *
- * At best case (zero arguments), we just write the syscall number to
- * a2.  At worst case (1 to 6 arguments), we move the argument in a2
- * to the next available register, then write the syscall number to
- * a2.
- *
- * For clarity, the following truth table enumerates all possibilities.
- *
- * arguments	syscall number	arg0, arg1, arg2, arg3, arg4, arg5
- * ---------	--------------	----------------------------------
- *	0	      a2
- *	1	      a2	a3
- *	2	      a2	a4,   a3
- *	3	      a2	a5,   a3,   a4
- *	4	      a2	a6,   a3,   a4,   a5
- *	5	      a2	a7,   a3,   a4,   a5,   a6
- *	6	      a2	a8,   a3,   a4,   a5,   a6,   a7
- */
-
-#define _syscall0(type,name) \
-type name(void) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name) \
-	: "a2" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall1(type,name,type0,arg0) \
-type name(type0 arg0) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a3, %2 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0) \
-	: "a2", "a3" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall2(type,name,type0,arg0,type1,arg1) \
-type name(type0 arg0,type1 arg1) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a4, %2 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1) \
-	: "a2", "a3", "a4" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall3(type,name,type0,arg0,type1,arg1,type2,arg2) \
-type name(type0 arg0,type1 arg1,type2 arg2) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a5, %2 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2) \
-	: "a2", "a3", "a4", "a5" \
-	); \
-__syscall_return(type,__res); \
-}
-
-#define _syscall4(type,name,type0,arg0,type1,arg1,type2,arg2,type3,arg3) \
-type name(type0 arg0,type1 arg1,type2 arg2,type3 arg3) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a6, %2 \n" \
-	"  mov   a5, %5 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2), "a" (arg3) \
-	: "a2", "a3", "a4", "a5", "a6" \
-	); \
-__syscall_return(type,__res); \
-}
-
-/* Note that we save and restore the a7 frame pointer.
- * Including a7 in the clobber list doesn't do what you'd expect.
- */
-#define _syscall5(type,name,type0,arg0,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
-type name(type0 arg0,type1 arg1,type2 arg2,type3 arg3,type4 arg4) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a9, a7 \n" \
-	"  mov   a7, %2 \n" \
-	"  mov   a6, %6 \n" \
-	"  mov   a5, %5 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   a7, a9 \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2), \
-                             "a" (arg3), "a" (arg4) \
-	: "a2", "a3", "a4", "a5", "a6", "a9" \
-	); \
-__syscall_return(type,__res); \
-}
-
-/* Note that we save and restore the a7 frame pointer.
- * Including a7 in the clobber list doesn't do what you'd expect.
- */
-#define _syscall6(type,name,type0,arg0,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
-type name(type0 arg0,type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
-{ \
-long __res; \
-__asm__ __volatile__ ( \
-	"  mov   a9, a7 \n" \
-	"  mov   a8, %2 \n" \
-	"  mov   a7, %7 \n" \
-	"  mov   a6, %6 \n" \
-	"  mov   a5, %5 \n" \
-	"  mov   a4, %4 \n" \
-	"  mov   a3, %3 \n" \
-	"  movi  a2, %1 \n" \
-	"  syscall      \n" \
-	"  mov   a7, a9 \n" \
-	"  mov   %0, a2 \n" \
-	: "=a" (__res) \
-	: "i" (__NR_##name), "a" (arg0), "a" (arg1), "a" (arg2), \
-                             "a" (arg3), "a" (arg4), "a" (arg5)  \
-	: "a2", "a3", "a4", "a5", "a6", "a8", "a9" \
-	); \
-__syscall_return(type,__res); \
-}
-
 /*
  * "Conditional" syscalls
  *
-- 
cgit v1.2.3-70-g09d2


From 19e5d9c0d2194b4b47189cbec2921cbf72b0bd1c Mon Sep 17 00:00:00 2001
From: Henry Nestler <henry.ne@arcor.de>
Date: Wed, 6 Dec 2006 20:37:45 -0800
Subject: [PATCH] initrd: remove unused false condition for initrd_start

After LOADER_TYPE && INITRD_START are true, the short if-condition
for INITRD_START can never be false.

Remove unused code from the else condition.

Signed-off-by: Henry Nestler <henry.ne@arcor.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/setup.c    | 2 +-
 arch/i386/kernel/setup.c   | 3 +--
 arch/m32r/kernel/setup.c   | 4 +---
 arch/m32r/mm/discontig.c   | 4 +---
 arch/sh/kernel/setup.c     | 3 +--
 arch/sh64/kernel/setup.c   | 4 +---
 arch/x86_64/kernel/setup.c | 3 +--
 7 files changed, 7 insertions(+), 16 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c
index a8c61dac1ce..1a5eb6c301c 100644
--- a/arch/frv/kernel/setup.c
+++ b/arch/frv/kernel/setup.c
@@ -947,7 +947,7 @@ static void __init setup_linux_memory(void)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (low_top_pfn << PAGE_SHIFT)) {
 			reserve_bootmem(INITRD_START, INITRD_SIZE);
-			initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start + INITRD_SIZE;
 		}
 		else {
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 141041dde74..97bb869307b 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1162,8 +1162,7 @@ void __init setup_bootmem_allocator(void)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
 			reserve_bootmem(INITRD_START, INITRD_SIZE);
-			initrd_start =
-				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start+INITRD_SIZE;
 		}
 		else {
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index 0e7778be33c..936205f7aba 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -196,9 +196,7 @@ static unsigned long __init setup_memory(void)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
 			reserve_bootmem(INITRD_START, INITRD_SIZE);
-			initrd_start = INITRD_START ?
-				INITRD_START + PAGE_OFFSET : 0;
-
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start + INITRD_SIZE;
 			printk("initrd:start[%08lx],size[%08lx]\n",
 				initrd_start, INITRD_SIZE);
diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c
index abb34ccd598..c7efdb0aefc 100644
--- a/arch/m32r/mm/discontig.c
+++ b/arch/m32r/mm/discontig.c
@@ -105,9 +105,7 @@ unsigned long __init setup_memory(void)
 		if (INITRD_START + INITRD_SIZE <= PFN_PHYS(max_low_pfn)) {
 			reserve_bootmem_node(NODE_DATA(0), INITRD_START,
 				INITRD_SIZE);
-			initrd_start = INITRD_START ?
-				INITRD_START + PAGE_OFFSET : 0;
-
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start + INITRD_SIZE;
 			printk("initrd:start[%08lx],size[%08lx]\n",
 				initrd_start, INITRD_SIZE);
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 696ca75752d..f8dd6b7bfab 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -332,8 +332,7 @@ void __init setup_arch(char **cmdline_p)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
 			reserve_bootmem_node(NODE_DATA(0), INITRD_START+__MEMORY_START, INITRD_SIZE);
-			initrd_start =
-				INITRD_START ? INITRD_START + PAGE_OFFSET + __MEMORY_START : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET + __MEMORY_START;
 			initrd_end = initrd_start + INITRD_SIZE;
 		} else {
 			printk("initrd extends beyond end of memory "
diff --git a/arch/sh64/kernel/setup.c b/arch/sh64/kernel/setup.c
index ffb310e33ce..b9e7d54d7b8 100644
--- a/arch/sh64/kernel/setup.c
+++ b/arch/sh64/kernel/setup.c
@@ -243,9 +243,7 @@ void __init setup_arch(char **cmdline_p)
 		if (INITRD_START + INITRD_SIZE <= (PFN_PHYS(last_pfn))) {
 		        reserve_bootmem_node(NODE_DATA(0), INITRD_START + __MEMORY_START, INITRD_SIZE);
 
-			initrd_start =
-			  (long) INITRD_START ? INITRD_START + PAGE_OFFSET +  __MEMORY_START : 0;
-
+			initrd_start = (long) INITRD_START + PAGE_OFFSET + __MEMORY_START;
 			initrd_end = initrd_start + INITRD_SIZE;
 		} else {
 			printk("initrd extends beyond end of memory "
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index fc944b5e8f4..f12f266f3e9 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -471,8 +471,7 @@ void __init setup_arch(char **cmdline_p)
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
 			reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
-			initrd_start =
-				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+			initrd_start = INITRD_START + PAGE_OFFSET;
 			initrd_end = initrd_start+INITRD_SIZE;
 		}
 		else {
-- 
cgit v1.2.3-70-g09d2


From b4c6c34a530b4d1c626f4ac0a884e0a9b849378c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 6 Dec 2006 20:38:11 -0800
Subject: [PATCH] kprobes: enable booster on the preemptible kernel

When we are unregistering a kprobe-booster, we can't release its
instruction buffer immediately on the preemptive kernel, because some
processes might be preempted on the buffer.  The freeze_processes() and
thaw_processes() functions can clean most of processes up from the buffer.
There are still some non-frozen threads who have the PF_NOFREEZE flag.  If
those threads are sleeping (not preempted) at the known place outside the
buffer, we can ensure safety of freeing.

However, the processing of this check routine takes a long time.  So, this
patch introduces the garbage collection mechanism of insn_slot.  It also
introduces the "dirty" flag to free_insn_slot because of efficiency.

The "clean" instruction slots (dirty flag is cleared) are released
immediately.  But the "dirty" slots which are used by boosted kprobes, are
marked as garbages.  collect_garbage_slots() will be invoked to release
"dirty" slots if there are more than INSNS_PER_PAGE garbage slots or if
there are no unused slots.

Cc: "Keshavamurthy, Anil S" <anil.s.keshavamurthy@intel.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: "bibo,mao" <bibo.mao@intel.com>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Yumiko Sugita <yumiko.sugita.yf@hitachi.com>
Cc: Satoshi Oshima <soshima@redhat.com>
Cc: Hideo Aoki <haoki@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    |   4 +-
 arch/ia64/kernel/kprobes.c    |   2 +-
 arch/powerpc/kernel/kprobes.c |   2 +-
 arch/s390/kernel/kprobes.c    |   2 +-
 arch/x86_64/kernel/kprobes.c  |   2 +-
 include/linux/kprobes.h       |   2 +-
 kernel/kprobes.c              | 117 ++++++++++++++++++++++++++++++++++--------
 7 files changed, 103 insertions(+), 28 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index fc79e1e859c..af1d5334499 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
 	mutex_unlock(&kprobe_mutex);
 }
 
@@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		return 1;
 
 ss_probe:
-#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
 	if (p->ainsn.boostable == 1 && !p->post_handler){
 		/* Boost up -- we can execute copied instructions directly */
 		reset_current_kprobe();
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 51217d63285..4d592ee9300 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -481,7 +481,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 /*
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 7b8d12b9026..4657563f881 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -85,7 +85,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 67914fe7f31..576368c4f60 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -200,7 +200,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index ac241567e68..209c8c0bec7 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -224,7 +224,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 	mutex_lock(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
+	free_insn_slot(p->ainsn.insn, 0);
 	mutex_unlock(&kprobe_mutex);
 }
 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index ac4c0559f75..769be39b968 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -165,7 +165,7 @@ extern void arch_disarm_kprobe(struct kprobe *p);
 extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern kprobe_opcode_t *get_insn_slot(void);
-extern void free_insn_slot(kprobe_opcode_t *slot);
+extern void free_insn_slot(kprobe_opcode_t *slot, int dirty);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 
 /* Get the kprobe at this addr (if any) - called with preemption disabled */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e..17ec4afb099 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
+#include <linux/freezer.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
 	kprobe_opcode_t *insns;		/* Page of instruction slots */
 	char slot_used[INSNS_PER_PAGE];
 	int nused;
+	int ngarbage;
 };
 
 static struct hlist_head kprobe_insn_pages;
+static int kprobe_garbage_slots;
+static int collect_garbage_slots(void);
+
+static int __kprobes check_safety(void)
+{
+	int ret = 0;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
+	ret = freeze_processes();
+	if (ret == 0) {
+		struct task_struct *p, *q;
+		do_each_thread(p, q) {
+			if (p != current && p->state == TASK_RUNNING &&
+			    p->pid != 0) {
+				printk("Check failed: %s is running\n",p->comm);
+				ret = -1;
+				goto loop_end;
+			}
+		} while_each_thread(p, q);
+	}
+loop_end:
+	thaw_processes();
+#else
+	synchronize_sched();
+#endif
+	return ret;
+}
 
 /**
  * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
 
+      retry:
 	hlist_for_each(pos, &kprobe_insn_pages) {
 		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
 		if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 		}
 	}
 
-	/* All out of space.  Need to allocate a new page. Use slot 0.*/
+	/* If there are any garbage slots, collect it and try again. */
+	if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+		goto retry;
+	}
+	/* All out of space.  Need to allocate a new page. Use slot 0. */
 	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
 	if (!kip) {
 		return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	memset(kip->slot_used, 0, INSNS_PER_PAGE);
 	kip->slot_used[0] = 1;
 	kip->nused = 1;
+	kip->ngarbage = 0;
 	return kip->insns;
 }
 
-void __kprobes free_insn_slot(kprobe_opcode_t *slot)
+/* Return 1 if all garbages are collected, otherwise 0. */
+static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+	kip->slot_used[idx] = 0;
+	kip->nused--;
+	if (kip->nused == 0) {
+		/*
+		 * Page is no longer in use.  Free it unless
+		 * it's the last one.  We keep the last one
+		 * so as not to have to set it up again the
+		 * next time somebody inserts a probe.
+		 */
+		hlist_del(&kip->hlist);
+		if (hlist_empty(&kprobe_insn_pages)) {
+			INIT_HLIST_NODE(&kip->hlist);
+			hlist_add_head(&kip->hlist,
+				       &kprobe_insn_pages);
+		} else {
+			module_free(NULL, kip->insns);
+			kfree(kip);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int __kprobes collect_garbage_slots(void)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos, *next;
+
+	/* Ensure no-one is preepmted on the garbages */
+	if (check_safety() != 0)
+		return -EAGAIN;
+
+	hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+		int i;
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->ngarbage == 0)
+			continue;
+		kip->ngarbage = 0;	/* we will collect all garbages */
+		for (i = 0; i < INSNS_PER_PAGE; i++) {
+			if (kip->slot_used[i] == -1 &&
+			    collect_one_slot(kip, i))
+				break;
+		}
+	}
+	kprobe_garbage_slots = 0;
+	return 0;
+}
+
+void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 		if (kip->insns <= slot &&
 		    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
 			int i = (slot - kip->insns) / MAX_INSN_SIZE;
-			kip->slot_used[i] = 0;
-			kip->nused--;
-			if (kip->nused == 0) {
-				/*
-				 * Page is no longer in use.  Free it unless
-				 * it's the last one.  We keep the last one
-				 * so as not to have to set it up again the
-				 * next time somebody inserts a probe.
-				 */
-				hlist_del(&kip->hlist);
-				if (hlist_empty(&kprobe_insn_pages)) {
-					INIT_HLIST_NODE(&kip->hlist);
-					hlist_add_head(&kip->hlist,
-						&kprobe_insn_pages);
-				} else {
-					module_free(NULL, kip->insns);
-					kfree(kip);
-				}
+			if (dirty) {
+				kip->slot_used[i] = -1;
+				kip->ngarbage++;
+			} else {
+				collect_one_slot(kip, i);
 			}
-			return;
+			break;
 		}
 	}
+	if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+		collect_garbage_slots();
+	}
 }
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From a38a44c1a93078fc5fadc4ac2df8dea4697069e2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 6 Dec 2006 20:38:16 -0800
Subject: [PATCH] smp_call_function_single() check that local interrupts are
 enabled

smp_call_function_single() can deadlock if the caller disabled local
interrupts (the target CPU could be spinning on call_lock).  Check for that.

Why on earth do these functions use spin_lock_bh()??

Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Cc: Andi Kleen <ak@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smp.c   | 4 ++++
 arch/x86_64/kernel/smp.c | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c6573aa..9827cf927ec 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -699,6 +699,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		put_cpu();
 		return -EBUSY;
 	}
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
 	spin_lock_bh(&call_lock);
 	__smp_call_function_single(cpu, func, info, nonatomic, wait);
 	spin_unlock_bh(&call_lock);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9f74c883568..32f4d7e2a06 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -379,6 +379,10 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
 		put_cpu();
 		return 0;
 	}
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
 	spin_lock_bh(&call_lock);
 	__smp_call_function_single(cpu, func, info, nonatomic, wait);
 	spin_unlock_bh(&call_lock);
-- 
cgit v1.2.3-70-g09d2


From 02316067852187b8bec781bec07410e91af79627 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 Dec 2006 20:38:17 -0800
Subject: [PATCH] hotplug CPU: clean up hotcpu_notifier() use

There was lots of #ifdef noise in the kernel due to hotcpu_notifier(fn,
prio) not correctly marking 'fn' as used in the !HOTPLUG_CPU case, and thus
generating compiler warnings of unused symbols, hence forcing people to add
#ifdefs.

the compiler can skip truly unused functions just fine:

    text    data     bss     dec     hex filename
 1624412  728710 3674856 6027978  5bfaca vmlinux.before
 1624412  728710 3674856 6027978  5bfaca vmlinux.after

[akpm@osdl.org: topology.c fix]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/mcheck/therm_throt.c | 2 --
 arch/i386/kernel/cpuid.c                  | 2 --
 arch/i386/kernel/microcode.c              | 2 --
 arch/i386/kernel/msr.c                    | 2 --
 arch/ia64/kernel/palinfo.c                | 2 --
 arch/ia64/kernel/salinfo.c                | 2 --
 arch/s390/appldata/appldata_base.c        | 2 --
 arch/x86_64/kernel/mce.c                  | 2 --
 arch/x86_64/kernel/mce_amd.c              | 4 ++--
 arch/x86_64/kernel/vsyscall.c             | 2 --
 block/ll_rw_blk.c                         | 4 ----
 drivers/base/topology.c                   | 2 --
 drivers/cpufreq/cpufreq.c                 | 2 --
 fs/buffer.c                               | 2 --
 include/linux/cpu.h                       | 6 +++---
 kernel/cpuset.c                           | 4 ----
 kernel/profile.c                          | 3 +--
 kernel/sched.c                            | 3 ---
 kernel/workqueue.c                        | 2 --
 lib/radix-tree.c                          | 2 --
 mm/page_alloc.c                           | 4 ----
 mm/swap.c                                 | 2 ++
 mm/vmscan.c                               | 2 --
 net/core/dev.c                            | 2 --
 net/core/flow.c                           | 2 --
 25 files changed, 8 insertions(+), 56 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index bad8b442070..065005c3f16 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -116,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
 	return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
 {
 	return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
@@ -153,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier =
 {
 	.notifier_call = thermal_throttle_cpu_callback,
 };
-#endif /* CONFIG_HOTPLUG_CPU */
 
 static __init int thermal_throttle_init_device(void)
 {
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index ab0c327e79d..23b2cc748d4 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -167,7 +167,6 @@ static int cpuid_device_create(int i)
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -187,7 +186,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
 {
 	.notifier_call = cpuid_class_cpu_callback,
 };
-#endif /* !CONFIG_HOTPLUG_CPU */
 
 static int __init cpuid_init(void)
 {
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 23f5984d065..972346604f9 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = {
 	.resume = mc_sysdev_resume,
 };
 
-#ifdef CONFIG_HOTPLUG_CPU
 static __cpuinit int
 mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
@@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 static struct notifier_block mc_cpu_notifier = {
 	.notifier_call = mc_cpu_callback,
 };
-#endif
 
 static int __init microcode_init (void)
 {
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index a773f776c9e..7763c67ca28 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -250,7 +250,6 @@ static int msr_device_create(int i)
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int msr_class_cpu_callback(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
@@ -271,7 +270,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
 {
 	.notifier_call = msr_class_cpu_callback,
 };
-#endif
 
 static int __init msr_init(void)
 {
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index 0b546e2b36a..c4c10a0b99d 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -952,7 +952,6 @@ remove_palinfo_proc_entries(unsigned int hcpu)
 	}
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int palinfo_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
@@ -974,7 +973,6 @@ static struct notifier_block palinfo_cpu_notifier =
 	.notifier_call = palinfo_cpu_callback,
 	.priority = 0,
 };
-#endif
 
 static int __init
 palinfo_init(void)
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index e63b8ca5344..fd607ca51a8 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -575,7 +575,6 @@ static struct file_operations salinfo_data_fops = {
 	.write   = salinfo_log_write,
 };
 
-#ifdef	CONFIG_HOTPLUG_CPU
 static int __devinit
 salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
@@ -620,7 +619,6 @@ static struct notifier_block salinfo_cpu_notifier =
 	.notifier_call = salinfo_cpu_callback,
 	.priority = 0,
 };
-#endif	/* CONFIG_HOTPLUG_CPU */
 
 static int __init
 salinfo_init(void)
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 67d5cf9cba8..b8c23729026 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -561,7 +561,6 @@ appldata_offline_cpu(int cpu)
 	spin_unlock(&appldata_timer_lock);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit
 appldata_cpu_notify(struct notifier_block *self,
 		    unsigned long action, void *hcpu)
@@ -582,7 +581,6 @@ appldata_cpu_notify(struct notifier_block *self,
 static struct notifier_block appldata_nb = {
 	.notifier_call = appldata_cpu_notify,
 };
-#endif
 
 /*
  * appldata_init()
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index c7587fc3901..bc863c464a1 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -641,7 +641,6 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static void mce_remove_device(unsigned int cpu)
 {
 	int i;
@@ -674,7 +673,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 static struct notifier_block mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
-#endif
 
 static __init int mce_init_device(void)
 {
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 883fe747f64..fa09debad4b 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -551,7 +551,6 @@ out:
 	return err;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * let's be hotplug friendly.
  * in case of multiple core processors, the first core always takes ownership
@@ -594,12 +593,14 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 
 	sprintf(name, "threshold_bank%i", bank);
 
+#ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
 		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 		return;
 	}
+#endif
 
 	/* remove all sibling symlinks before unregistering */
 	for_each_cpu_mask(i, b->cpus) {
@@ -656,7 +657,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
 static struct notifier_block threshold_cpu_notifier = {
 	.notifier_call = threshold_cpu_callback,
 };
-#endif /* CONFIG_HOTPLUG_CPU */
 
 static __init int threshold_init_device(void)
 {
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 630036c06c7..3785e495473 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -275,7 +275,6 @@ static void __cpuinit cpu_vsyscall_init(void *arg)
 	vsyscall_set_cpu(raw_smp_processor_id());
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
@@ -284,7 +283,6 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
 	return NOTIFY_DONE;
 }
-#endif
 
 static void __init map_vsyscall(void)
 {
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index a4ff3271d4a..31512cd9f3a 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3459,8 +3459,6 @@ static void blk_done_softirq(struct softirq_action *h)
 	}
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
 static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
 			  void *hcpu)
 {
@@ -3486,8 +3484,6 @@ static struct notifier_block __devinitdata blk_cpu_notifier = {
 	.notifier_call	= blk_cpu_notify,
 };
 
-#endif /* CONFIG_HOTPLUG_CPU */
-
 /**
  * blk_complete_request - end I/O on a request
  * @req:      the request being processed
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 3d12b85b096..067a9e8bc37 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -108,7 +108,6 @@ static int __cpuinit topology_add_dev(unsigned int cpu)
 	return rc;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static void __cpuinit topology_remove_dev(unsigned int cpu)
 {
 	struct sys_device *sys_dev = get_cpu_sysdev(cpu);
@@ -136,7 +135,6 @@ static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
 	}
 	return rc ? NOTIFY_BAD : NOTIFY_OK;
 }
-#endif
 
 static int __cpuinit topology_sysfs_init(void)
 {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 7a7c6e6dfe4..47ab42db122 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1537,7 +1537,6 @@ int cpufreq_update_policy(unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int cpufreq_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
@@ -1577,7 +1576,6 @@ static struct notifier_block __cpuinitdata cpufreq_cpu_notifier =
 {
     .notifier_call = cpufreq_cpu_callback,
 };
-#endif /* CONFIG_HOTPLUG_CPU */
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
diff --git a/fs/buffer.c b/fs/buffer.c
index a8ca0ac2148..517860f2d75 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2972,7 +2972,6 @@ init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
 	}
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static void buffer_exit_cpu(int cpu)
 {
 	int i;
@@ -2994,7 +2993,6 @@ static int buffer_cpu_notify(struct notifier_block *self,
 		buffer_exit_cpu((unsigned long)hcpu);
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init buffer_init(void)
 {
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f02d71bf689..71dc6ba4f73 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -89,9 +89,9 @@ int cpu_down(unsigned int cpu);
 #define lock_cpu_hotplug()	do { } while (0)
 #define unlock_cpu_hotplug()	do { } while (0)
 #define lock_cpu_hotplug_interruptible() 0
-#define hotcpu_notifier(fn, pri)	do { } while (0)
-#define register_hotcpu_notifier(nb)	do { } while (0)
-#define unregister_hotcpu_notifier(nb)	do { } while (0)
+#define hotcpu_notifier(fn, pri)	do { (void)(fn); } while (0)
+#define register_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
+#define unregister_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
 
 /* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
 static inline int cpu_is_offline(int cpu) { return 0; }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index bd1e89c4c96..9b62b4c03ad 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2044,7 +2044,6 @@ out:
 	return err;
 }
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2108,9 +2107,7 @@ static void common_cpu_mem_hotplug_unplug(void)
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
 }
-#endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -2127,7 +2124,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
-#endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
diff --git a/kernel/profile.c b/kernel/profile.c
index 04fd84e8cdb..0961d93e1d9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -319,7 +319,6 @@ out:
 	put_cpu();
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int __devinit profile_cpu_callback(struct notifier_block *info,
 					unsigned long action, void *__cpu)
 {
@@ -372,10 +371,10 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 #else /* !CONFIG_SMP */
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
+#define profile_cpu_callback		NULL
 
 void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 75a005ed4ed..c83f531c288 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6740,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
 	    sched_smt_power_savings_store);
 #endif
 
-
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
  * and groups cannot be updated in place without racing with the balancing
@@ -6774,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif
 
 void __init sched_init_smp(void)
 {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5484d6e045c..c5257316f4b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -655,7 +655,6 @@ int current_is_keventd(void)
 
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 /* Take the work from this (downed) CPU. */
 static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 {
@@ -738,7 +737,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif
 
 void init_workqueues(void)
 {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e2cefabb5aa..d69ddbe4386 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -996,7 +996,6 @@ static __init void radix_tree_init_maxindex(void)
 		height_to_maxindex[i] = __maxindex(i);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int radix_tree_callback(struct notifier_block *nfb,
                             unsigned long action,
                             void *hcpu)
@@ -1016,7 +1015,6 @@ static int radix_tree_callback(struct notifier_block *nfb,
        }
        return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init radix_tree_init(void)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2273952300d..27ec7a1b802 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -701,7 +701,6 @@ void drain_node_pages(int nodeid)
 }
 #endif
 
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
 	unsigned long flags;
@@ -723,7 +722,6 @@ static void __drain_pages(unsigned int cpu)
 		}
 	}
 }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM
 
@@ -2907,7 +2905,6 @@ void __init free_area_init(unsigned long *zones_size)
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int page_alloc_cpu_notify(struct notifier_block *self,
 				 unsigned long action, void *hcpu)
 {
@@ -2922,7 +2919,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init page_alloc_init(void)
 {
diff --git a/mm/swap.c b/mm/swap.c
index 017e72ca9bb..2ed7be39795 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -514,5 +514,7 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#ifdef CONFIG_HOTPLUG_CPU
 	hotcpu_notifier(cpu_swap_callback, 0);
+#endif
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6616e81fac..093f5fe6dd7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1513,7 +1513,6 @@ out:
 }
 #endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
@@ -1534,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 /*
  * This kswapd start function will be called by init and node-hot-add.
diff --git a/net/core/dev.c b/net/core/dev.c
index 59d058a3b50..e660cb57e42 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3340,7 +3340,6 @@ void unregister_netdev(struct net_device *dev)
 
 EXPORT_SYMBOL(unregister_netdev);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int dev_cpu_callback(struct notifier_block *nfb,
 			    unsigned long action,
 			    void *ocpu)
@@ -3384,7 +3383,6 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_NET_DMA
 /**
diff --git a/net/core/flow.c b/net/core/flow.c
index 104c25d00a1..d137f971f97 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -340,7 +340,6 @@ static void __devinit flow_cache_cpu_prepare(int cpu)
 	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
@@ -349,7 +348,6 @@ static int flow_cache_cpu(struct notifier_block *nfb,
 		__flow_cache_shrink((unsigned long)hcpu, 0);
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 static int __init flow_cache_init(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 85916f8166b59eeac63d2b4f7f1df8de849334b4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus@valinux.co.jp>
Date: Wed, 6 Dec 2006 20:40:41 -0800
Subject: [PATCH] Kexec / Kdump: Unify elf note code

The elf note saving code is currently duplicated over several
architectures.  This cleanup patch simply adds code to a common file and
then replaces the arch-specific code with calls to the newly added code.

The only drawback with this approach is that s390 doesn't fully support
kexec-on-panic which for that arch leads to introduction of unused code.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c    | 66 ++-----------------------------------------
 arch/powerpc/kernel/crash.c | 59 ++------------------------------------
 arch/x86_64/kernel/crash.c  | 69 ++-------------------------------------------
 include/linux/kexec.h       |  1 +
 kernel/kexec.c              | 56 ++++++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 188 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 144b4328896..a5e0e990ea9 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,68 +31,6 @@
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
 
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-							       size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) +3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= NR_CPUS))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that, so there is no need to invent something new.
-	 */
-	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
-	if (!buf)
-		return;
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
-				sizeof(prstatus));
-	final_note(buf);
-}
-
-static void crash_save_self(struct pt_regs *regs)
-{
-	int cpu;
-
-	cpu = safe_smp_processor_id();
-	crash_save_this_cpu(regs, cpu);
-}
-
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 static atomic_t waiting_for_crash_ipi;
 
@@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 		crash_fixup_ss_esp(&fixed_regs, regs);
 		regs = &fixed_regs;
 	}
-	crash_save_this_cpu(regs, cpu);
+	crash_save_cpu(regs, cpu);
 	disable_local_APIC();
 	atomic_dec(&waiting_for_crash_ipi);
 	/* Assume hlt works */
@@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
 #if defined(CONFIG_X86_IO_APIC)
 	disable_IO_APIC();
 #endif
-	crash_save_self(regs);
+	crash_save_cpu(regs, safe_smp_processor_id());
 }
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 89b03c8da9d..d3f2080d2ee 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -46,61 +46,6 @@ int crashing_cpu = -1;
 static cpumask_t cpus_in_crash = CPU_MASK_NONE;
 cpumask_t cpus_in_sr = CPU_MASK_NONE;
 
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-							       size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) +3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= NR_CPUS))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that that no need to invent something new.
-	 */
-	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
-	if (!buf) 
-		return;
-
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
-			sizeof(prstatus));
-	final_note(buf);
-}
-
 #ifdef CONFIG_SMP
 static atomic_t enter_on_soft_reset = ATOMIC_INIT(0);
 
@@ -113,7 +58,7 @@ void crash_ipi_callback(struct pt_regs *regs)
 
 	hard_irq_disable();
 	if (!cpu_isset(cpu, cpus_in_crash))
-		crash_save_this_cpu(regs, cpu);
+		crash_save_cpu(regs, cpu);
 	cpu_set(cpu, cpus_in_crash);
 
 	/*
@@ -306,7 +251,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	 * such that another IPI will not be sent.
 	 */
 	crashing_cpu = smp_processor_id();
-	crash_save_this_cpu(regs, crashing_cpu);
+	crash_save_cpu(regs, crashing_cpu);
 	crash_kexec_prepare_cpus(crashing_cpu);
 	cpu_set(crashing_cpu, cpus_in_crash);
 	if (ppc_md.kexec_cpu_down)
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 3525f884af8..95a7a2c1313 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -28,71 +28,6 @@
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
 
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type,
-						void *data, size_t data_len)
-{
-	struct elf_note note;
-
-	note.n_namesz = strlen(name) + 1;
-	note.n_descsz = data_len;
-	note.n_type   = type;
-	memcpy(buf, &note, sizeof(note));
-	buf += (sizeof(note) +3)/4;
-	memcpy(buf, name, note.n_namesz);
-	buf += (note.n_namesz + 3)/4;
-	memcpy(buf, data, note.n_descsz);
-	buf += (note.n_descsz + 3)/4;
-
-	return buf;
-}
-
-static void final_note(u32 *buf)
-{
-	struct elf_note note;
-
-	note.n_namesz = 0;
-	note.n_descsz = 0;
-	note.n_type   = 0;
-	memcpy(buf, &note, sizeof(note));
-}
-
-static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= NR_CPUS))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that, no need to invent something new.
-	 */
-
-	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
-
-	if (!buf)
-		return;
-
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
-					sizeof(prstatus));
-	final_note(buf);
-}
-
-static void crash_save_self(struct pt_regs *regs)
-{
-	int cpu;
-
-	cpu = smp_processor_id();
-	crash_save_this_cpu(regs, cpu);
-}
-
 #ifdef CONFIG_SMP
 static atomic_t waiting_for_crash_ipi;
 
@@ -117,7 +52,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 		return NOTIFY_STOP;
 	local_irq_disable();
 
-	crash_save_this_cpu(regs, cpu);
+	crash_save_cpu(regs, cpu);
 	disable_local_APIC();
 	atomic_dec(&waiting_for_crash_ipi);
 	/* Assume hlt works */
@@ -196,5 +131,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
 
 	disable_IO_APIC();
 
-	crash_save_self(regs);
+	crash_save_cpu(regs, smp_processor_id());
 }
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index a4ede62b339..e3abcec6c51 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -105,6 +105,7 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
+void crash_save_cpu(struct pt_regs *regs, int cpu);
 extern struct kimage *kexec_image;
 extern struct kimage *kexec_crash_image;
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d43692cf232..afbbbe981be 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
 #include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1066,6 +1068,60 @@ void crash_kexec(struct pt_regs *regs)
 	}
 }
 
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
 static int __init crash_notes_memory_init(void)
 {
 	/* Allocate memory for saving cpu registers. */
-- 
cgit v1.2.3-70-g09d2


From c31a0bf3e1bc581676618db7492f18798fd0a73f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 8 Dec 2006 02:36:22 -0800
Subject: [PATCH] Generic BUG for x86-64

This makes x86-64 use the generic BUG machinery.

The main advantage in using the generic BUG machinery for x86-64 is that
the inlined overhead of BUG is just the ud2a instruction; the file+line
information are no longer inlined into the instruction stream.  This
reduces cache pollution.

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Hugh Dickens <hugh@veritas.com>
Cc: Michael Ellerman <michael@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig              |  5 +++++
 arch/x86_64/kernel/module.c      |  5 ++++-
 arch/x86_64/kernel/traps.c       | 36 +++++++++++---------------------
 arch/x86_64/kernel/vmlinux.lds.S |  2 ++
 include/asm-x86_64/bug.h         | 44 ++++++++++++++++++++--------------------
 5 files changed, 45 insertions(+), 47 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index bfbb9bcae12..3254a616c69 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -96,6 +96,11 @@ config AUDIT_ARCH
 	bool
 	default y
 
+config GENERIC_BUG
+	bool
+	default y
+	depends on BUG
+
 source "init/Kconfig"
 
 
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
index 9d0958ff547..a888e67f587 100644
--- a/arch/x86_64/kernel/module.c
+++ b/arch/x86_64/kernel/module.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/bug.h>
 
 #include <asm/system.h>
 #include <asm/page.h>
@@ -173,10 +174,12 @@ int module_finalize(const Elf_Ehdr *hdr,
 					    lseg, lseg + locks->sh_size,
 					    tseg, tseg + text->sh_size);
 	}
-	return 0;
+
+	return module_bug_finalize(hdr, sechdrs, me);
 }
 
 void module_arch_cleanup(struct module *mod)
 {
 	alternatives_smp_module_del(mod);
+	module_bug_cleanup(mod);
 }
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index a1641ffdffc..b54ccc07f37 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -31,6 +31,7 @@
 #include <linux/kexec.h>
 #include <linux/unwind.h>
 #include <linux/uaccess.h>
+#include <linux/bug.h>
 
 #include <asm/system.h>
 #include <asm/io.h>
@@ -524,30 +525,15 @@ bad:
 	printk("\n");
 }	
 
-void handle_BUG(struct pt_regs *regs)
-{ 
-	struct bug_frame f;
-	long len;
-	const char *prefix = "";
+int is_valid_bugaddr(unsigned long rip)
+{
+	unsigned short ud2;
 
-	if (user_mode(regs))
-		return; 
-	if (__copy_from_user(&f, (const void __user *) regs->rip,
-			     sizeof(struct bug_frame)))
-		return; 
-	if (f.filename >= 0 ||
-	    f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 
-		return;
-	len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
-	if (len < 0 || len >= PATH_MAX)
-		f.filename = (int)(long)"unmapped filename";
-	else if (len > 50) {
-		f.filename += len - 50;
-		prefix = "...";
-	}
-	printk("----------- [cut here ] --------- [please bite here ] ---------\n");
-	printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
-} 
+	if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
+		return 0;
+
+	return ud2 == 0x0b0f;
+}
 
 #ifdef CONFIG_BUG
 void out_of_line_bug(void)
@@ -627,7 +613,9 @@ void die(const char * str, struct pt_regs * regs, long err)
 {
 	unsigned long flags = oops_begin();
 
-	handle_BUG(regs);
+	if (!user_mode(regs))
+		report_bug(regs->rip);
+
 	__die(str, regs, err);
 	oops_end(flags);
 	do_exit(SIGSEGV); 
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 6a1f8f491e5..6c417788c54 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -51,6 +51,8 @@ SECTIONS
 
   RODATA
 
+  BUG_TABLE
+
   . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/include/asm-x86_64/bug.h b/include/asm-x86_64/bug.h
index 80ac1fe966a..68260641491 100644
--- a/include/asm-x86_64/bug.h
+++ b/include/asm-x86_64/bug.h
@@ -1,30 +1,30 @@
 #ifndef __ASM_X8664_BUG_H
 #define __ASM_X8664_BUG_H 1
 
-#include <linux/stringify.h>
-
-/*
- * Tell the user there is some problem.  The exception handler decodes 
- * this frame.
- */
-struct bug_frame {
-	unsigned char ud2[2];
-	unsigned char push;
-	signed int filename;
-	unsigned char ret;
-	unsigned short line;
-} __attribute__((packed));
-
 #ifdef CONFIG_BUG
 #define HAVE_ARCH_BUG
-/* We turn the bug frame into valid instructions to not confuse
-   the disassembler. Thanks to Jan Beulich & Suresh Siddha
-   for nice instruction selection.
-   The magic numbers generate mov $64bitimm,%eax ; ret $offset. */
-#define BUG() 								\
-	asm volatile(							\
-	"ud2 ; pushq $%c1 ; ret $%c0" :: 				\
-		     "i"(__LINE__), "i" (__FILE__))
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+#define BUG()								\
+	do {								\
+		asm volatile("1:\tud2\n"				\
+			     ".pushsection __bug_table,\"a\"\n"		\
+			     "2:\t.quad 1b, %c0\n"			\
+			     "\t.word %c1, 0\n"				\
+			     "\t.org 2b+%c2\n"				\
+			     ".popsection"				\
+			     : : "i" (__FILE__), "i" (__LINE__),	\
+			        "i" (sizeof(struct bug_entry)));	\
+		for(;;) ;						\
+	} while(0)
+#else
+#define BUG()								\
+	do {								\
+		asm volatile("ud2");					\
+		for(;;) ;						\
+	} while(0)
+#endif
+
 void out_of_line_bug(void);
 #else
 static inline void out_of_line_bug(void) { }
-- 
cgit v1.2.3-70-g09d2


From 92715e282be7c7488f892703c8d39b08976a833b Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Sat, 9 Dec 2006 21:33:35 +0100
Subject: [PATCH] x86: Fix boot hang due to nmi watchdog init code

2.6.19  stopped booting (or booted based on build/config) on our x86_64
systems due to a bug introduced in 2.6.19.  check_nmi_watchdog schedules an
IPI on all cpus to  busy wait on a flag, but fails to set the busywait
flag if NMI functionality is disabled.  This causes the secondary cpus
to spin in an endless loop, causing the kernel bootup to hang.
Depending upon the build, the  busywait flag got overwritten (stack variable)
and caused  the kernel to bootup on certain builds.  Following patch fixes
the bug by setting the busywait flag before returning from check_nmi_watchdog.
I guess using a stack variable is not good here as the calling function could
potentially return while the busy wait loop is still spinning on the flag.

AK: I redid the patch significantly to be cleaner

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/nmi.c   | 8 ++++----
 arch/x86_64/kernel/nmi.c | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index f5bc7e1be80..a5e34d65596 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -195,6 +195,8 @@ static __cpuinit inline int nmi_known_cpu(void)
 	return 0;
 }
 
+static int endflag __initdata = 0;
+
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -202,7 +204,6 @@ static __cpuinit inline int nmi_known_cpu(void)
  */
 static __init void nmi_cpu_busy(void *data)
 {
-	volatile int *endflag = data;
 	local_irq_enable_in_hardirq();
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
@@ -210,14 +211,13 @@ static __init void nmi_cpu_busy(void *data)
 	   pause instruction. On a real HT machine this is fine because
 	   all other CPUs are busy with "useless" delay loops and don't
 	   care if they get somewhat less cycles. */
-	while (*endflag == 0)
-		barrier();
+	while (endflag == 0)
+		mb();
 }
 #endif
 
 static int __init check_nmi_watchdog(void)
 {
-	volatile int endflag = 0;
 	unsigned int *prev_nmi_count;
 	int cpu;
 
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 27e95e7922c..186aebbae32 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -193,6 +193,8 @@ void nmi_watchdog_default(void)
 		nmi_watchdog = NMI_IO_APIC;
 }
 
+static int endflag __initdata = 0;
+
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -200,7 +202,6 @@ void nmi_watchdog_default(void)
  */
 static __init void nmi_cpu_busy(void *data)
 {
-	volatile int *endflag = data;
 	local_irq_enable_in_hardirq();
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
@@ -208,14 +209,13 @@ static __init void nmi_cpu_busy(void *data)
 	   pause instruction. On a real HT machine this is fine because
 	   all other CPUs are busy with "useless" delay loops and don't
 	   care if they get somewhat less cycles. */
-	while (*endflag == 0)
-		barrier();
+	while (endflag == 0)
+		mb();
 }
 #endif
 
 int __init check_nmi_watchdog (void)
 {
-	volatile int endflag = 0;
 	int *counts;
 	int cpu;
 
@@ -256,6 +256,7 @@ int __init check_nmi_watchdog (void)
 	if (!atomic_read(&nmi_active)) {
 		kfree(counts);
 		atomic_set(&nmi_active, -1);
+		endflag = 1;
 		return -1;
 	}
 	endflag = 1;
-- 
cgit v1.2.3-70-g09d2


From 1bac3b383a93a4a920ffc57441eb133c78567fbd Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 9 Dec 2006 21:33:36 +0100
Subject: [PATCH] x86: Work around gcc 4.2 over aggressive optimizer

The new PDA code uses a dummy _proxy_pda variable to describe
memory references to the PDA. It is never referenced
in inline assembly, but exists as input/output arguments.
gcc 4.2 in some cases can CSE references to this which causes
unresolved symbols.  Define it to zero to avoid this.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/vmlinux.lds.S   | 1 +
 arch/x86_64/kernel/vmlinux.lds.S | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 16d3c7133ad..a53c8b1854b 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -26,6 +26,7 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
 OUTPUT_ARCH(i386)
 ENTRY(phys_startup_32)
 jiffies = jiffies_64;
+_proxy_pda = 0;
 
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 6c417788c54..514be5dd230 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -13,6 +13,7 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
+_proxy_pda = 0;
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */
-- 
cgit v1.2.3-70-g09d2


From 1f29bcd739972f71f2fd5d5d265daf3e1208fa5e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 10 Dec 2006 02:19:10 -0800
Subject: [PATCH] sysctl: remove unused "context" param

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/pm.c           |  6 +++---
 arch/mips/lasat/sysctl.c       | 17 +++++++--------
 arch/x86_64/kernel/vsyscall.c  |  3 +--
 drivers/char/random.c          |  2 +-
 include/linux/sysctl.h         |  5 ++---
 include/net/ip.h               |  3 +--
 include/net/sctp/sctp.h        |  2 +-
 kernel/sysctl.c                | 47 ++++++++++++++++++++----------------------
 net/decnet/dn_dev.c            |  6 ++----
 net/decnet/sysctl_net_decnet.c |  6 ++----
 net/ipv4/devinet.c             |  3 +--
 net/ipv4/route.c               |  3 +--
 net/ipv4/sysctl_net_ipv4.c     | 16 ++++++--------
 net/ipv6/addrconf.c            |  3 +--
 net/ipv6/ndisc.c               |  9 +++-----
 15 files changed, 55 insertions(+), 76 deletions(-)

(limited to 'arch/x86_64/kernel')

diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c
index c1d9fc8f1a8..ee677ced7b6 100644
--- a/arch/frv/kernel/pm.c
+++ b/arch/frv/kernel/pm.c
@@ -223,7 +223,7 @@ static int cmode_procctl(ctl_table *ctl, int write, struct file *filp,
 
 static int cmode_sysctl(ctl_table *table, int __user *name, int nlen,
 			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen, void **context)
+			void __user *newval, size_t newlen)
 {
 	if (oldval && oldlenp) {
 		size_t oldlen;
@@ -326,7 +326,7 @@ static int p0_procctl(ctl_table *ctl, int write, struct file *filp,
 
 static int p0_sysctl(ctl_table *table, int __user *name, int nlen,
 		     void __user *oldval, size_t __user *oldlenp,
-		     void __user *newval, size_t newlen, void **context)
+		     void __user *newval, size_t newlen)
 {
 	if (oldval && oldlenp) {
 		size_t oldlen;
@@ -370,7 +370,7 @@ static int cm_procctl(ctl_table *ctl, int write, struct file *filp,
 
 static int cm_sysctl(ctl_table *table, int __user *name, int nlen,
 		     void __user *oldval, size_t __user *oldlenp,
-		     void __user *newval, size_t newlen, void **context)
+		     void __user *newval, size_t newlen)
 {
 	if (oldval && oldlenp) {
 		size_t oldlen;
diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c
index da35d455549..12878359f2c 100644
--- a/arch/mips/lasat/sysctl.c
+++ b/arch/mips/lasat/sysctl.c
@@ -40,12 +40,12 @@ static DEFINE_MUTEX(lasat_info_mutex);
 /* Strategy function to write EEPROM after changing string entry */
 int sysctl_lasatstring(ctl_table *table, int *name, int nlen,
 		void *oldval, size_t *oldlenp,
-		void *newval, size_t newlen, void **context)
+		void *newval, size_t newlen)
 {
 	int r;
 	mutex_lock(&lasat_info_mutex);
 	r = sysctl_string(table, name,
-			  nlen, oldval, oldlenp, newval, newlen, context);
+			  nlen, oldval, oldlenp, newval, newlen);
 	if (r < 0) {
 		mutex_unlock(&lasat_info_mutex);
 		return r;
@@ -119,11 +119,11 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
 /* Sysctl for setting the IP addresses */
 int sysctl_lasat_intvec(ctl_table *table, int *name, int nlen,
 		    void *oldval, size_t *oldlenp,
-		    void *newval, size_t newlen, void **context)
+		    void *newval, size_t newlen)
 {
 	int r;
 	mutex_lock(&lasat_info_mutex);
-	r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen, context);
+	r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen);
 	if (r < 0) {
 		mutex_unlock(&lasat_info_mutex);
 		return r;
@@ -139,14 +139,14 @@ int sysctl_lasat_intvec(ctl_table *table, int *name, int nlen,
 /* Same for RTC */
 int sysctl_lasat_rtc(ctl_table *table, int *name, int nlen,
 		    void *oldval, size_t *oldlenp,
-		    void *newval, size_t newlen, void **context)
+		    void *newval, size_t newlen)
 {
 	int r;
 	mutex_lock(&lasat_info_mutex);
 	rtctmp = ds1603_read();
 	if (rtctmp < 0)
 		rtctmp = 0;
-	r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen, context);
+	r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen);
 	if (r < 0) {
 		mutex_unlock(&lasat_info_mutex);
 		return r;
@@ -251,13 +251,12 @@ int proc_lasat_ip(ctl_table *table, int write, struct file *filp,
 
 static int sysctl_lasat_eeprom_value(ctl_table *table, int *name, int nlen,
 				     void *oldval, size_t *oldlenp,
-				     void *newval, size_t newlen,
-				     void **context)
+				     void *newval, size_t newlen)
 {
 	int r;
 
 	mutex_lock(&lasat_info_mutex);
-	r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen, context);
+	r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen);
 	if (r < 0) {
 		mutex_unlock(&lasat_info_mutex);
 		return r;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 4a673f5397a..2433d6fc68b 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -225,8 +225,7 @@ out:
 
 static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
 				void __user *oldval, size_t __user *oldlenp,
-				void __user *newval, size_t newlen,
-				void **context)
+				void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 092a01cc02d..13d0b1350a6 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1203,7 +1203,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
 
 static int uuid_strategy(ctl_table *table, int __user *name, int nlen,
 			 void __user *oldval, size_t __user *oldlenp,
-			 void __user *newval, size_t newlen, void **context)
+			 void __user *newval, size_t newlen)
 {
 	unsigned char tmp_uuid[16], *uuid;
 	unsigned int len;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 94316a98e0d..6d8846e7be6 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -918,8 +918,7 @@ typedef struct ctl_table ctl_table;
 
 typedef int ctl_handler (ctl_table *table, int __user *name, int nlen,
 			 void __user *oldval, size_t __user *oldlenp,
-			 void __user *newval, size_t newlen, 
-			 void **context);
+			 void __user *newval, size_t newlen);
 
 typedef int proc_handler (ctl_table *ctl, int write, struct file * filp,
 			  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -950,7 +949,7 @@ extern int do_sysctl (int __user *name, int nlen,
 extern int do_sysctl_strategy (ctl_table *table, 
 			       int __user *name, int nlen,
 			       void __user *oldval, size_t __user *oldlenp,
-			       void __user *newval, size_t newlen, void ** context);
+			       void __user *newval, size_t newlen);
 
 extern ctl_handler sysctl_string;
 extern ctl_handler sysctl_intvec;
diff --git a/include/net/ip.h b/include/net/ip.h
index 83cb9ac5554..053f02b5cb8 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -376,8 +376,7 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
 			 size_t *lenp, loff_t *ppos);
 int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
 				  void __user *oldval, size_t __user *oldlenp,
-				  void __user *newval, size_t newlen, 
-				  void **context);
+				  void __user *newval, size_t newlen);
 #ifdef CONFIG_PROC_FS
 extern int ip_misc_proc_init(void);
 #endif
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 215461f18db..c818f87122a 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -368,7 +368,7 @@ static inline void sctp_sysctl_register(void) { return; }
 static inline void sctp_sysctl_unregister(void) { return; }
 static inline int sctp_sysctl_jiffies_ms(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context) {
+		void __user *newval, size_t newlen) {
 	return -ENOSYS;
 }
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c86445a62af..130c5ec9ee0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -133,7 +133,7 @@ extern int max_lock_depth;
 
 #ifdef CONFIG_SYSCTL_SYSCALL
 static int parse_table(int __user *, int, void __user *, size_t __user *,
-		void __user *, size_t, ctl_table *, void **);
+		void __user *, size_t, ctl_table *);
 #endif
 
 static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
@@ -141,12 +141,12 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 
 static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context);
+		  void __user *newval, size_t newlen);
 
 #ifdef CONFIG_SYSVIPC
 static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context);
+		  void __user *newval, size_t newlen);
 #endif
 
 #ifdef CONFIG_PROC_SYSCTL
@@ -1243,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	do {
 		struct ctl_table_header *head =
 			list_entry(tmp, struct ctl_table_header, ctl_entry);
-		void *context = NULL;
 
 		if (!use_table(head))
 			continue;
@@ -1251,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 		spin_unlock(&sysctl_lock);
 
 		error = parse_table(name, nlen, oldval, oldlenp, 
-					newval, newlen, head->ctl_table,
-					&context);
-		kfree(context);
+					newval, newlen, head->ctl_table);
 
 		spin_lock(&sysctl_lock);
 		unuse_table(head);
@@ -1309,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op)
 static int parse_table(int __user *name, int nlen,
 		       void __user *oldval, size_t __user *oldlenp,
 		       void __user *newval, size_t newlen,
-		       ctl_table *table, void **context)
+		       ctl_table *table)
 {
 	int n;
 repeat:
@@ -1329,7 +1326,7 @@ repeat:
 					error = table->strategy(
 						table, name, nlen,
 						oldval, oldlenp,
-						newval, newlen, context);
+						newval, newlen);
 					if (error)
 						return error;
 				}
@@ -1340,7 +1337,7 @@ repeat:
 			}
 			error = do_sysctl_strategy(table, name, nlen,
 						   oldval, oldlenp,
-						   newval, newlen, context);
+						   newval, newlen);
 			return error;
 		}
 	}
@@ -1351,7 +1348,7 @@ repeat:
 int do_sysctl_strategy (ctl_table *table, 
 			int __user *name, int nlen,
 			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen, void **context)
+			void __user *newval, size_t newlen)
 {
 	int op = 0, rc;
 	size_t len;
@@ -1365,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table,
 
 	if (table->strategy) {
 		rc = table->strategy(table, name, nlen, oldval, oldlenp,
-				     newval, newlen, context);
+				     newval, newlen);
 		if (rc < 0)
 			return rc;
 		if (rc > 0)
@@ -2473,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
 /* The generic string strategy routine: */
 int sysctl_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	if (!table->data || !table->maxlen) 
 		return -ENOTDIR;
@@ -2519,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
  */
 int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 
 	if (newval && newlen) {
@@ -2555,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
 /* Strategy function to convert jiffies to seconds */ 
 int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	if (oldval) {
 		size_t olen;
@@ -2583,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
 /* Strategy function to convert jiffies to seconds */ 
 int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	if (oldval) {
 		size_t olen;
@@ -2612,7 +2609,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 /* The generic string strategy routine: */
 static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	struct ctl_table uts_table;
 	int r, write;
@@ -2620,7 +2617,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 	memcpy(&uts_table, table, sizeof(uts_table));
 	uts_table.data = get_uts(table, write);
 	r = sysctl_string(&uts_table, name, nlen,
-		oldval, oldlenp, newval, newlen, context);
+		oldval, oldlenp, newval, newlen);
 	put_uts(table, write, uts_table.data);
 	return r;
 }
@@ -2629,7 +2626,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 /* The generic sysctl ipc data routine. */
 static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	size_t len;
 	void *data;
@@ -2704,41 +2701,41 @@ out:
 
 int sysctl_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 
 static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
 		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen, void **context)
+		  void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
 static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
 		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context)
+		void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 0b9d4c95515..fc6f3c023a5 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -167,8 +167,7 @@ static int dn_forwarding_proc(ctl_table *, int, struct file *,
 			void __user *, size_t *, loff_t *);
 static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen,
 			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen,
-			void **context);
+			void __user *newval, size_t newlen);
 
 static struct dn_dev_sysctl_table {
 	struct ctl_table_header *sysctl_header;
@@ -347,8 +346,7 @@ static int dn_forwarding_proc(ctl_table *table, int write,
 
 static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen,
 			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen,
-			void **context)
+			void __user *newval, size_t newlen)
 {
 #ifdef CONFIG_DECNET_ROUTER
 	struct net_device *dev = table->extra1;
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index e246f054f36..a4065eb1341 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str)
 
 static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen,
 				void __user *oldval, size_t __user *oldlenp,
-				void __user *newval, size_t newlen,
-				void **context)
+				void __user *newval, size_t newlen)
 {
 	size_t len;
 	__le16 addr;
@@ -220,8 +219,7 @@ static int dn_node_address_handler(ctl_table *table, int write,
 
 static int dn_def_dev_strategy(ctl_table *table, int __user *name, int nlen,
 				void __user *oldval, size_t __user *oldlenp,
-				void __user *newval, size_t newlen,
-				void **context)
+				void __user *newval, size_t newlen)
 {
 	size_t len;
 	struct net_device *dev;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 2fd899160f8..84bed40273a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1303,8 +1303,7 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
 
 int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
 				  void __user *oldval, size_t __user *oldlenp,
-				  void __user *newval, size_t newlen, 
-				  void **context)
+				  void __user *newval, size_t newlen)
 {
 	int *valp = table->data;
 	int new;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11c167118e8..1aaff0a2e09 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2872,8 +2872,7 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
 						void __user *oldval,
 						size_t __user *oldlenp,
 						void __user *newval,
-						size_t newlen,
-						void **context)
+						size_t newlen)
 {
 	int delay;
 	if (newlen != sizeof(int))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index dfcf47f10f8..fabf69a9108 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -51,8 +51,7 @@ int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
 static int ipv4_sysctl_forward_strategy(ctl_table *table,
 			 int __user *name, int nlen,
 			 void __user *oldval, size_t __user *oldlenp,
-			 void __user *newval, size_t newlen, 
-			 void **context)
+			 void __user *newval, size_t newlen)
 {
 	int *valp = table->data;
 	int new;
@@ -111,8 +110,7 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
 					 int nlen, void __user *oldval,
 					 size_t __user *oldlenp,
-					 void __user *newval, size_t newlen,
-					 void **context)
+					 void __user *newval, size_t newlen)
 {
 	char val[TCP_CA_NAME_MAX];
 	ctl_table tbl = {
@@ -122,8 +120,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
 	int ret;
 
 	tcp_get_default_congestion_control(val);
-	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
-			    context);
+	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen);
 	if (ret == 0 && newval && newlen)
 		ret = tcp_set_default_congestion_control(val);
 	return ret;
@@ -169,8 +166,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
 static int strategy_allowed_congestion_control(ctl_table *table, int __user *name,
 					       int nlen, void __user *oldval,
 					       size_t __user *oldlenp,
-					       void __user *newval, size_t newlen,
-					       void **context)
+					       void __user *newval,
+					       size_t newlen)
 {
 	ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
 	int ret;
@@ -180,8 +177,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam
 		return -ENOMEM;
 
 	tcp_get_available_congestion_control(tbl.data, tbl.maxlen);
-	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
-			    context);
+	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen);
 	if (ret == 0 && newval && newlen)
 		ret = tcp_set_allowed_congestion_control(tbl.data);
 	kfree(tbl.data);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a5e8d207a51..9b0a9064315 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3656,8 +3656,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table,
 					    int __user *name, int nlen,
 					    void __user *oldval,
 					    size_t __user *oldlenp,
-					    void __user *newval, size_t newlen,
-					    void **context)
+					    void __user *newval, size_t newlen)
 {
 	int *valp = table->data;
 	int new;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 56ea9283730..6a9f616de37 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1667,8 +1667,7 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
 static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
 					int nlen, void __user *oldval,
 					size_t __user *oldlenp,
-					void __user *newval, size_t newlen,
-					void **context)
+					void __user *newval, size_t newlen)
 {
 	struct net_device *dev = ctl->extra1;
 	struct inet6_dev *idev;
@@ -1681,14 +1680,12 @@ static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name,
 	switch (ctl->ctl_name) {
 	case NET_NEIGH_REACHABLE_TIME:
 		ret = sysctl_jiffies(ctl, name, nlen,
-				     oldval, oldlenp, newval, newlen,
-				     context);
+				     oldval, oldlenp, newval, newlen);
 		break;
 	case NET_NEIGH_RETRANS_TIME_MS:
 	case NET_NEIGH_REACHABLE_TIME_MS:
 		 ret = sysctl_ms_jiffies(ctl, name, nlen,
-					 oldval, oldlenp, newval, newlen,
-					 context);
+					 oldval, oldlenp, newval, newlen);
 		 break;
 	default:
 		ret = 0;
-- 
cgit v1.2.3-70-g09d2