From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 30 Mar 2009 19:07:44 +0900
Subject: percpu: use dynamic percpu allocator as the default percpu allocator

This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use
dynamic percpu allocator.  The first chunk is allocated using
embedding helper and 8k is reserved for modules.  This ensures that
the new allocator behaves almost identically to the original allocator
as long as static percpu variables are concerned, so it shouldn't
introduce much breakage.

s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing
range limit the addressing model imposes.  Unfortunately, this breaks
if the address is specified using a variable, so for now, the two
archs aren't converted.

The following architectures are affected by this change.

* sh
* arm
* cris
* mips
* sparc(32)
* blackfin
* avr32
* parisc (broken, under investigation)
* m32r
* powerpc(32)

As this change makes the dynamic allocator the default one,
CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert -
CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted
archs.  These archs implement their own setup_per_cpu_areas() and the
conversion is not trivial.

* powerpc(64)
* sparc(64)
* ia64
* alpha
* s390

Boot and batch alloc/free tests on x86_32 with debug code (x86_32
doesn't use default first chunk initialization).  Compile tested on
sparc(32), powerpc(32), arm and alpha.

Kyle McMartin reported that this change breaks parisc.  The problem is
still under investigation and he is okay with pushing this patch
forward and fixing parisc later.

[ Impact: use dynamic allocator for most archs w/o custom percpu setup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Bryan Wu <cooloney@kernel.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/sparc')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eab..7a8698b913f 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -92,6 +92,9 @@ config AUDIT_ARCH
 	bool
 	default y
 
+config HAVE_LEGACY_PER_CPU_AREA
+	def_bool y if SPARC64
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y if SPARC64
 
-- 
cgit v1.2.3-70-g09d2


From 405d967dc70002991f8fc35c20e0d3cbc7614f63 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Jun 2009 15:13:38 +0900
Subject: linker script: throw away .discard section

x86 throws away .discard section but no other archs do.  Also,
.discard is not thrown away while linking modules.  Make every arch
and module linking throw it away.  This will be used to define dummy
variables for percpu declarations and definitions.

This patch is based on Ivan Kokshaysky's alpha percpu patch.

[ Impact: always throw away everything in .discard ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Bryan Wu <cooloney@kernel.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
---
 Makefile                             | 2 +-
 arch/alpha/kernel/vmlinux.lds.S      | 1 +
 arch/arm/kernel/vmlinux.lds.S        | 1 +
 arch/avr32/kernel/vmlinux.lds.S      | 1 +
 arch/blackfin/kernel/vmlinux.lds.S   | 1 +
 arch/cris/kernel/vmlinux.lds.S       | 1 +
 arch/frv/kernel/vmlinux.lds.S        | 2 ++
 arch/h8300/kernel/vmlinux.lds.S      | 1 +
 arch/ia64/kernel/vmlinux.lds.S       | 1 +
 arch/m32r/kernel/vmlinux.lds.S       | 1 +
 arch/m68k/kernel/vmlinux-std.lds     | 1 +
 arch/m68k/kernel/vmlinux-sun3.lds    | 1 +
 arch/m68knommu/kernel/vmlinux.lds.S  | 1 +
 arch/microblaze/kernel/vmlinux.lds.S | 2 ++
 arch/mips/kernel/vmlinux.lds.S       | 1 +
 arch/mn10300/kernel/vmlinux.lds.S    | 1 +
 arch/parisc/kernel/vmlinux.lds.S     | 1 +
 arch/powerpc/kernel/vmlinux.lds.S    | 1 +
 arch/s390/kernel/vmlinux.lds.S       | 1 +
 arch/sh/kernel/vmlinux.lds.S         | 1 +
 arch/sparc/kernel/vmlinux.lds.S      | 1 +
 arch/um/kernel/dyn.lds.S             | 2 ++
 arch/um/kernel/uml.lds.S             | 2 ++
 arch/xtensa/kernel/vmlinux.lds.S     | 1 +
 include/asm-generic/vmlinux.lds.h    | 8 ++++++++
 scripts/module-common.lds            | 8 ++++++++
 26 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 scripts/module-common.lds

(limited to 'arch/sparc')

diff --git a/Makefile b/Makefile
index 46e1c9d03d5..12245be0512 100644
--- a/Makefile
+++ b/Makefile
@@ -327,7 +327,7 @@ CHECKFLAGS     := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
 MODFLAGS	= -DMODULE
 CFLAGS_MODULE   = $(MODFLAGS)
 AFLAGS_MODULE   = $(MODFLAGS)
-LDFLAGS_MODULE  =
+LDFLAGS_MODULE  = -T $(srctree)/scripts/module-common.lds
 CFLAGS_KERNEL	=
 AFLAGS_KERNEL	=
 CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index b9d6568e5f7..75fe1d6877e 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -139,6 +139,7 @@ SECTIONS
 		EXIT_TEXT
 		EXIT_DATA
 		*(.exitcall.exit)
+		*(.discard)
 	}
 
 	.mdebug 0 : {
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 6c077979254..e256c57b898 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -82,6 +82,7 @@ SECTIONS
 		EXIT_TEXT
 		EXIT_DATA
 		*(.exitcall.exit)
+		*(.discard)
 		*(.ARM.exidx.exit.text)
 		*(.ARM.extab.exit.text)
 #ifndef CONFIG_MMU
diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S
index 7910d41eb88..b8324608ec0 100644
--- a/arch/avr32/kernel/vmlinux.lds.S
+++ b/arch/avr32/kernel/vmlinux.lds.S
@@ -131,6 +131,7 @@ SECTIONS
 	/DISCARD/       	: {
 		EXIT_DATA
 		*(.exitcall.exit)
+		*(.discard)
 	}
 
 	DWARF_DEBUG
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index 6ac307ca0d8..6e8eabd8f0a 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -280,5 +280,6 @@ SECTIONS
 	/DISCARD/ :
 	{
 		*(.exitcall.exit)
+		*(.discard)
 	}
 }
diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S
index 0d2adfc794d..a3175ebb38c 100644
--- a/arch/cris/kernel/vmlinux.lds.S
+++ b/arch/cris/kernel/vmlinux.lds.S
@@ -145,6 +145,7 @@ SECTIONS
 		EXIT_TEXT
 		EXIT_DATA
 		*(.exitcall.exit)
+		*(.discard)
         }
 
 	dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024;
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index 22d9787406e..64b5a5e4d35 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -177,6 +177,8 @@ SECTIONS
   .debug_ranges		0 : { *(.debug_ranges) }
 
   .comment 0 : { *(.comment) }
+
+  /DISCARD/ : { *(.discard) }
 }
 
 __kernel_image_size_no_bss = __bss_start - __kernel_image_start;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 43a87b9085b..03d6c0df33d 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -154,6 +154,7 @@ SECTIONS
 	}
 	/DISCARD/ : {
 		*(.exitcall.exit)
+		*(.discard)
 	}
         .romfs :	
 	{
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 4a95e86b9ac..13d95897587 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -29,6 +29,7 @@ SECTIONS
 	EXIT_TEXT
 	EXIT_DATA
 	*(.exitcall.exit)
+	*(.discard)
 	*(.IA_64.unwind.exit.text)
 	*(.IA_64.unwind_info.exit.text)
 	}
diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S
index 4179adf6c62..480a49944cf 100644
--- a/arch/m32r/kernel/vmlinux.lds.S
+++ b/arch/m32r/kernel/vmlinux.lds.S
@@ -125,6 +125,7 @@ SECTIONS
 	EXIT_TEXT
 	EXIT_DATA
 	*(.exitcall.exit)
+	*(.discard)
 	}
 
   /* Stabs debugging sections.  */
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index 01d212bb05a..905a797ada9 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -87,6 +87,7 @@ SECTIONS
 	EXIT_TEXT
 	EXIT_DATA
 	*(.exitcall.exit)
+	*(.discard)
 	}
 
   /* Stabs debugging sections.  */
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index c192f773db9..47d04be322a 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -82,6 +82,7 @@ __init_begin = .;
 	EXIT_TEXT
 	EXIT_DATA
 	*(.exitcall.exit)
+	*(.discard)
 	}
 
   .crap : {
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index b7fe505e358..68111a61a77 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -188,6 +188,7 @@ SECTIONS {
 		EXIT_TEXT
 		EXIT_DATA
 		*(.exitcall.exit)
+		*(.discard)
 	}
 
 	.bss : {
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index d34d38dcd12..a207543c592 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -162,4 +162,6 @@ SECTIONS {
 	}
 	. = ALIGN(4096);
 	_end = .;
+
+	/DISCARD/ : { *(.discard) }
 }
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 58738c8d754..45901609b74 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -179,6 +179,7 @@ SECTIONS
 	/* Sections to be discarded */
 	/DISCARD/ : {
 		*(.exitcall.exit)
+		*(.discard)
 
 		/* ABI crap starts here */
 		*(.MIPS.options)
diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S
index 24de6b90f40..5d9f2f96ad9 100644
--- a/arch/mn10300/kernel/vmlinux.lds.S
+++ b/arch/mn10300/kernel/vmlinux.lds.S
@@ -146,6 +146,7 @@ SECTIONS
   /* Sections to be discarded */
   /DISCARD/ : {
 	*(.exitcall.exit)
+	*(.discard)
 	}
 
   STABS_DEBUG
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index fd2cc4fd2b6..ccf58341845 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -240,6 +240,7 @@ SECTIONS
 	/* Sections to be discarded */
 	/DISCARD/ : {
 		*(.exitcall.exit)
+		*(.discard)
 #ifdef CONFIG_64BIT
 		/* temporary hack until binutils is fixed to not emit these
 	 	 * for static binaries
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 8ef8a14abc9..7fca9355fd3 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -40,6 +40,7 @@ SECTIONS
 	/* Sections to be discarded. */
 	/DISCARD/ : {
 	*(.exitcall.exit)
+	*(.discard)
 	EXIT_DATA
 	}
 
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index a53db23ee09..98867dfea46 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -161,6 +161,7 @@ SECTIONS
 	/DISCARD/ : {
 		EXIT_DATA
 		*(.exitcall.exit)
+		*(.discard)
 	}
 
 	/* Debugging sections.	*/
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index f53c76acaed..766976d27b2 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -171,6 +171,7 @@ SECTIONS
 	 */
 	/DISCARD/ : {
 		*(.exitcall.exit)
+		*(.discard)
 	}
 
 	STABS_DEBUG
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index fcbbd000ec0..d63cf914667 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -175,6 +175,7 @@ SECTIONS
 		EXIT_TEXT
 		EXIT_DATA
 		*(.exitcall.exit)
+		*(.discard)
 	}
 
 	STABS_DEBUG
diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S
index 9975e1ab44f..2916d6eadff 100644
--- a/arch/um/kernel/dyn.lds.S
+++ b/arch/um/kernel/dyn.lds.S
@@ -156,4 +156,6 @@ SECTIONS
   STABS_DEBUG
 
   DWARF_DEBUG
+
+  /DISCARD/	: { *(.discard) }
 }
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 11b835248b8..1f8a622cabe 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -100,4 +100,6 @@ SECTIONS
   STABS_DEBUG
 
   DWARF_DEBUG
+
+  /DISCARD/	: { *(.discard) }
 }
diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S
index 41c159cd872..b1e24638acd 100644
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -287,6 +287,7 @@ SECTIONS
 	EXIT_TEXT
 	EXIT_DATA
         *(.exitcall.exit)
+	*(.discard)
   }
 
   .xt.lit : { *(.xt.lit) }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 55413e568f0..a19120c4e10 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -628,6 +628,14 @@
 #define INITRAMFS
 #endif
 
+#define DISCARDS							\
+	/DISCARD/ : {							\
+	EXIT_TEXT							\
+	EXIT_DATA							\
+	*(.exitcall.exit)						\
+	*(.discard)							\
+	}
+
 /**
  * PERCPU_VADDR - define output section for percpu area
  * @vaddr: explicit base address (optional)
diff --git a/scripts/module-common.lds b/scripts/module-common.lds
new file mode 100644
index 00000000000..47a1f9ae0ed
--- /dev/null
+++ b/scripts/module-common.lds
@@ -0,0 +1,8 @@
+/*
+ * Common module linker script, always used when linking a module.
+ * Archs are free to supply their own linker scripts.  ld will
+ * combine them automatically.
+ */
+SECTIONS {
+	/DISCARD/ : { *(.discard) }
+}
-- 
cgit v1.2.3-70-g09d2


From bf4bb2b1f285ec56e7f3cbf0190761b42131871c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Jun 2009 16:57:03 +0900
Subject: sparc64: fix build breakage introduced by percpu-convert-most
 patchset

Commit e74e396204bfcb67570ba4517b08f5918e69afea incorrectly added
HAVE_LEGACY_PER_CPU_AREA to sparc64 although it already has been
converted to dynamic percpu allocator.  Drop both
HAVE_{LEGACY|DYNAMIC}_PER_CPU_AREA.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Miller <davem@davemloft.net>
---
 arch/sparc/Kconfig | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 7a8698b913f..4f6ed0f113f 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -92,15 +92,9 @@ config AUDIT_ARCH
 	bool
 	default y
 
-config HAVE_LEGACY_PER_CPU_AREA
-	def_bool y if SPARC64
-
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y if SPARC64
 
-config HAVE_DYNAMIC_PER_CPU_AREA
-	def_bool y if SPARC64
-
 config GENERIC_HARDIRQS_NO__DO_IRQ
 	bool
 	def_bool y if SPARC64
-- 
cgit v1.2.3-70-g09d2


From 38a6be525460f52ac6f2de1c3f73c5615a8853cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:10:59 +0900
Subject: percpu: simplify pcpu_setup_first_chunk()

Now that all first chunk allocator helpers allocate and map the first
chunk themselves, there's no need to have optional default alloc/map
in pcpu_setup_first_chunk().  Drop @populate_pte_fn and only leave
@dyn_size optional and make all other params mandatory.

This makes it much easier to follow what pcpu_setup_first_chunk() is
doing and what actual differences tweaking each parameter results in.

[ Impact: drop unused code path ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/kernel/smp_64.c |   2 +-
 include/linux/percpu.h     |   5 +--
 mm/percpu.c                | 104 +++++++++++++--------------------------------
 3 files changed, 33 insertions(+), 78 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index fa44eaf8d89..ccad7b20ae7 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1528,7 +1528,7 @@ void __init setup_per_cpu_areas(void)
 
 	pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
 						PERCPU_MODULE_RESERVE, dyn_size,
-						PCPU_CHUNK_SIZE, vm.addr, NULL);
+						PCPU_CHUNK_SIZE, vm.addr);
 
 	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9f6bfd7d4b9..ec64357e176 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -66,9 +66,8 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, ssize_t unit_size,
-				void *base_addr,
-				pcpu_fc_populate_pte_fn_t populate_pte_fn);
+				ssize_t dyn_size, size_t unit_size,
+				void *base_addr);
 
 extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
diff --git a/mm/percpu.c b/mm/percpu.c
index 17db527ee2e..21d938a1066 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -983,24 +983,22 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
- * @reserved_size: the size of reserved percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes, 0 for none
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
- * @base_addr: mapped address, NULL for auto
- * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
+ * @base_addr: mapped address
  *
  * Initialize the first percpu chunk which contains the kernel static
  * perpcu area.  This function is to be called from arch percpu area
- * setup path.  The first two parameters are mandatory.  The rest are
- * optional.
+ * setup path.
  *
  * @get_page_fn() should return pointer to percpu page given cpu
  * number and page number.  It should at least return enough pages to
  * cover the static area.  The returned pages for static area should
- * have been initialized with valid data.  If @unit_size is specified,
- * it can also return pages after the static area.  NULL return
- * indicates end of pages for the cpu.  Note that @get_page_fn() must
- * return the same number of pages for all cpus.
+ * have been initialized with valid data.  It can also return pages
+ * after the static area.  NULL return indicates end of pages for the
+ * cpu.  Note that @get_page_fn() must return the same number of pages
+ * for all cpus.
  *
  * @reserved_size, if non-zero, specifies the amount of bytes to
  * reserve after the static area in the first chunk.  This reserves
@@ -1015,17 +1013,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * non-negative value makes percpu leave alone the area beyond
  * @static_size + @reserved_size + @dyn_size.
  *
- * @unit_size, if non-negative, specifies unit size and must be
- * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @reserved_size + if non-negative, @dyn_size.
- *
- * Non-null @base_addr means that the caller already allocated virtual
- * region for the first chunk and mapped it.  percpu must not mess
- * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
- * @populate_pte_fn doesn't make any sense.
+ * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
+ * equal to or larger than @static_size + @reserved_size + if
+ * non-negative, @dyn_size.
  *
- * @populate_pte_fn is used to populate the pagetable.  NULL means the
- * caller already populated the pagetable.
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
  *
  * If the first chunk ends up with both reserved and dynamic areas, it
  * is served by two chunks - one to serve the core static and reserved
@@ -1040,9 +1033,8 @@ EXPORT_SYMBOL_GPL(free_percpu);
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     size_t static_size, size_t reserved_size,
-				     ssize_t dyn_size, ssize_t unit_size,
-				     void *base_addr,
-				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
+				     ssize_t dyn_size, size_t unit_size,
+				     void *base_addr)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
@@ -1050,27 +1042,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 			  (dyn_size >= 0 ? dyn_size : 0);
 	struct pcpu_chunk *schunk, *dchunk = NULL;
 	unsigned int cpu;
-	int nr_pages;
-	int err, i;
+	int i, nr_pages;
 
 	/* santiy checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
 		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
-	if (unit_size >= 0) {
-		BUG_ON(unit_size < size_sum);
-		BUG_ON(unit_size & ~PAGE_MASK);
-		BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
-	} else
-		BUG_ON(base_addr);
-	BUG_ON(base_addr && populate_pte_fn);
-
-	if (unit_size >= 0)
-		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
-	else
-		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
-					PFN_UP(size_sum));
+	BUG_ON(!base_addr);
+	BUG_ON(unit_size < size_sum);
+	BUG_ON(unit_size & ~PAGE_MASK);
+	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 
+	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
@@ -1079,6 +1062,10 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	if (dyn_size < 0)
 		dyn_size = pcpu_unit_size - static_size - reserved_size;
 
+	first_vm.flags = VM_ALLOC;
+	first_vm.size = pcpu_chunk_size;
+	first_vm.addr = base_addr;
+
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
@@ -1101,6 +1088,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->page = schunk->page_ar;
+	schunk->immutable = true;
 
 	if (reserved_size) {
 		schunk->free_size = reserved_size;
@@ -1124,31 +1112,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		dchunk->map = dmap;
 		dchunk->map_alloc = ARRAY_SIZE(dmap);
 		dchunk->page = schunk->page_ar;	/* share page map with schunk */
+		dchunk->immutable = true;
 
 		dchunk->contig_hint = dchunk->free_size = dyn_size;
 		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
 		dchunk->map[dchunk->map_used++] = dchunk->free_size;
 	}
 
-	/* allocate vm address */
-	first_vm.flags = VM_ALLOC;
-	first_vm.size = pcpu_chunk_size;
-
-	if (!base_addr)
-		vm_area_register_early(&first_vm, PAGE_SIZE);
-	else {
-		/*
-		 * Pages already mapped.  No need to remap into
-		 * vmalloc area.  In this case the first chunks can't
-		 * be mapped or unmapped by percpu and are marked
-		 * immutable.
-		 */
-		first_vm.addr = base_addr;
-		schunk->immutable = true;
-		if (dchunk)
-			dchunk->immutable = true;
-	}
-
 	/* assign pages */
 	nr_pages = -1;
 	for_each_possible_cpu(cpu) {
@@ -1168,19 +1138,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 			BUG_ON(nr_pages != i);
 	}
 
-	/* map them */
-	if (populate_pte_fn) {
-		for_each_possible_cpu(cpu)
-			for (i = 0; i < nr_pages; i++)
-				populate_pte_fn(pcpu_chunk_addr(schunk,
-								cpu, i));
-
-		err = pcpu_map(schunk, 0, nr_pages);
-		if (err)
-			panic("failed to setup static percpu area, err=%d\n",
-			      err);
-	}
-
 	/* link the first chunk in */
 	pcpu_first_chunk = dchunk ?: schunk;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
@@ -1282,7 +1239,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 
 	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
 				      reserved_size, dyn_size,
-				      pcpue_unit_size, pcpue_ptr, NULL);
+				      pcpue_unit_size, pcpue_ptr);
 }
 
 /*
@@ -1387,8 +1344,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 
 	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
 				     reserved_size, -1,
-				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr,
-				     NULL);
+				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
 	goto out_free_ar;
 
 enomem:
@@ -1521,7 +1477,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 
 	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
 				     reserved_size, dyn_size, pcpul_unit_size,
-				     pcpul_vm.addr, NULL);
+				     pcpul_vm.addr);
 
 	/* sort pcpul_map array for pcpu_lpage_remapped() */
 	for (i = 0; i < num_possible_cpus() - 1; i++)
-- 
cgit v1.2.3-70-g09d2


From ce3141a277ff6cc37e51008b8888dc2cb7456ef1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:11:00 +0900
Subject: percpu: drop pcpu_chunk->page[]

percpu core doesn't need to tack all the allocated pages.  It needs to
know whether certain pages are populated and a way to reverse map
address to page when freeing.  This patch drops pcpu_chunk->page[] and
use populated bitmap and vmalloc_to_page() lookup instead.  Using
vmalloc_to_page() exclusively is also possible but complicates first
chunk handling, inflates cache footprint and prevents non-standard
memory allocation for percpu memory.

pcpu_chunk->page[] was used to track each page's allocation and
allowed asymmetric population which happens during failure path;
however, with single bitmap for all units, this is no longer possible.
Bite the bullet and rewrite (de)populate functions so that things are
done in clearly separated steps such that asymmetric population
doesn't happen.  This makes the (de)population process much more
modular and will also ease implementing non-standard memory usage in
the future (e.g. large pages).

This makes @get_page_fn parameter to pcpu_setup_first_chunk()
unnecessary.  The parameter is dropped and all first chunk helpers are
updated accordingly.  Please note that despite the volume most changes
to first chunk helpers are symbol renames for variables which don't
need to be referenced outside of the helper anymore.

This change reduces memory usage and cache footprint of pcpu_chunk.
Now only #unit_pages bits are necessary per chunk.

[ Impact: reduced memory usage and cache footprint for bookkeeping ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c |  42 ++--
 include/linux/percpu.h     |   3 +-
 mm/percpu.c                | 604 ++++++++++++++++++++++++++++-----------------
 3 files changed, 400 insertions(+), 249 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index ccad7b20ae7..f2f22ee97a7 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1415,19 +1415,6 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 #endif
 }
 
-static size_t pcpur_size __initdata;
-static void **pcpur_ptrs __initdata;
-
-static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
-{
-	size_t off = (size_t)pageno << PAGE_SHIFT;
-
-	if (off >= pcpur_size)
-		return NULL;
-
-	return virt_to_page(pcpur_ptrs[cpu] + off);
-}
-
 #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
 
 static void __init pcpu_map_range(unsigned long start, unsigned long end,
@@ -1491,25 +1478,26 @@ void __init setup_per_cpu_areas(void)
 	size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
 	static struct vm_struct vm;
 	unsigned long delta, cpu;
-	size_t pcpu_unit_size;
+	size_t size_sum, pcpu_unit_size;
 	size_t ptrs_size;
+	void **ptrs;
 
-	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
-			       PERCPU_DYNAMIC_RESERVE);
-	dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
+	size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			     PERCPU_DYNAMIC_RESERVE);
+	dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
 
 
-	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
-	pcpur_ptrs = alloc_bootmem(ptrs_size);
+	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
+	ptrs = alloc_bootmem(ptrs_size);
 
 	for_each_possible_cpu(cpu) {
-		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
-						     PCPU_CHUNK_SIZE);
+		ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
+					       PCPU_CHUNK_SIZE);
 
-		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
-			     PCPU_CHUNK_SIZE - pcpur_size);
+		free_bootmem(__pa(ptrs[cpu] + size_sum),
+			     PCPU_CHUNK_SIZE - size_sum);
 
-		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+		memcpy(ptrs[cpu], __per_cpu_load, static_size);
 	}
 
 	/* allocate address and map */
@@ -1523,14 +1511,14 @@ void __init setup_per_cpu_areas(void)
 
 		start += cpu * PCPU_CHUNK_SIZE;
 		end = start + PCPU_CHUNK_SIZE;
-		pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
+		pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
 	}
 
-	pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+	pcpu_unit_size = pcpu_setup_first_chunk(static_size,
 						PERCPU_MODULE_RESERVE, dyn_size,
 						PCPU_CHUNK_SIZE, vm.addr);
 
-	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+	free_bootmem(__pa(ptrs), ptrs_size);
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index ec64357e176..63c8b7a23e6 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -58,13 +58,12 @@
 
 extern void *pcpu_base_addr;
 
-typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
-extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+extern size_t __init pcpu_setup_first_chunk(
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size, size_t unit_size,
 				void *base_addr);
diff --git a/mm/percpu.c b/mm/percpu.c
index 639fce4d2ca..21756814d99 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,8 +94,7 @@ struct pcpu_chunk {
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
 	bool			immutable;	/* no [de]population allowed */
-	struct page		**page;		/* points to page array */
-	struct page		*page_ar[];	/* #cpus * UNIT_PAGES */
+	unsigned long		populated[];	/* populated bitmap */
 };
 
 static int pcpu_unit_pages __read_mostly;
@@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit;
  * Synchronization rules.
  *
  * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
- * protects allocation/reclaim paths, chunks and chunk->page arrays.
- * The latter is a spinlock and protects the index data structures -
- * chunk slots, chunks and area maps in chunks.
+ * protects allocation/reclaim paths, chunks, populated bitmap and
+ * vmalloc mapping.  The latter is a spinlock and protects the index
+ * data structures - chunk slots, chunks and area maps in chunks.
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
@@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 		(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
 }
 
-static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
-				      unsigned int cpu, int page_idx)
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+				    unsigned int cpu, int page_idx)
 {
-	return &chunk->page[pcpu_page_idx(cpu, page_idx)];
-}
+	/* must not be used on pre-mapped chunk */
+	WARN_ON(chunk->immutable);
 
-static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
-				     int page_idx)
-{
-	return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+	return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
 }
 
 /* set the pointer to a chunk in a page struct */
@@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
 	return (struct pcpu_chunk *)page->index;
 }
 
+static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+	*rs = find_next_zero_bit(chunk->populated, end, *rs);
+	*re = find_next_bit(chunk->populated, end, *rs + 1);
+}
+
+static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+	*rs = find_next_bit(chunk->populated, end, *rs);
+	*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+}
+
+/*
+ * (Un)populated page region iterators.  Iterate over (un)populated
+ * page regions betwen @start and @end in @chunk.  @rs and @re should
+ * be integer variables and will be set to start and end page index of
+ * the current region.
+ */
+#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
+	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
+	     (rs) < (re);						    \
+	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
+	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
+	     (rs) < (re);						    \
+	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+
 /**
  * pcpu_mem_alloc - allocate memory
  * @size: bytes to allocate
@@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 }
 
 /**
- * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
+ * @chunk: chunk of interest
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
+ *
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx().  The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated.  Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
+ */
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+					       unsigned long **bitmapp,
+					       bool may_alloc)
+{
+	static struct page **pages;
+	static unsigned long *bitmap;
+	size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
+			    sizeof(pages[0]);
+	size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+			     sizeof(unsigned long);
+
+	if (!pages || !bitmap) {
+		if (may_alloc && !pages)
+			pages = pcpu_mem_alloc(pages_size);
+		if (may_alloc && !bitmap)
+			bitmap = pcpu_mem_alloc(bitmap_size);
+		if (!pages || !bitmap)
+			return NULL;
+	}
+
+	memset(pages, 0, pages_size);
+	bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
+
+	*bitmapp = bitmap;
+	return pages;
+}
+
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start and @page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, unsigned long *populated,
+			    int page_start, int page_end)
+{
+	unsigned int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+			if (page)
+				__free_page(page);
+		}
+	}
+}
+
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk.  Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, unsigned long *populated,
+			    int page_start, int page_end)
+{
+	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	unsigned int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+			if (!*pagep) {
+				pcpu_free_pages(chunk, pages, populated,
+						page_start, page_end);
+				return -ENOMEM;
+			}
+		}
+	}
+	return 0;
+}
+
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped.  Flush cache.  As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu.  This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+				 int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+
+	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+			   pcpu_chunk_addr(chunk, last, page_end));
+}
+
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+	unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
+
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
  * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
  * @page_start: page index of the first page to unmap
  * @page_end: page index of the last page to unmap + 1
- * @flush_tlb: whether to flush tlb or not
  *
  * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * If @flush is true, vcache is flushed before unmapping and tlb
- * after.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished.  The caller should call
+ * proper pre/post flush functions.
  */
-static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
-		       bool flush_tlb)
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+			     struct page **pages, unsigned long *populated,
+			     int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
 	unsigned int cpu;
+	int i;
 
-	/* unmap must not be done on immutable chunk */
-	WARN_ON(chunk->immutable);
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page *page;
 
-	/*
-	 * Each flushing trial can be very expensive, issue flush on
-	 * the whole region at once rather than doing it for each cpu.
-	 * This could be an overkill but is more scalable.
-	 */
-	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
-			   pcpu_chunk_addr(chunk, last, page_end));
+			page = pcpu_chunk_page(chunk, cpu, i);
+			WARN_ON(!page);
+			pages[pcpu_page_idx(cpu, i)] = page;
+		}
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+				   page_end - page_start);
+	}
 
-	for_each_possible_cpu(cpu)
-		unmap_kernel_range_noflush(
-				pcpu_chunk_addr(chunk, cpu, page_start),
-				(page_end - page_start) << PAGE_SHIFT);
-
-	/* ditto as flush_cache_vunmap() */
-	if (flush_tlb)
-		flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
-				       pcpu_chunk_addr(chunk, last, page_end));
+	for (i = page_start; i < page_end; i++)
+		__clear_bit(i, populated);
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
+ * TLB for the regions.  This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+
+	flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+			       pcpu_chunk_addr(chunk, last, page_end));
 }
 
 static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
 }
 
 /**
- * pcpu_map - map pages into a pcpu_chunk
+ * pcpu_map_pages - map pages into a pcpu_chunk
  * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
  * @page_start: page index of the first page to map
  * @page_end: page index of the last page to map + 1
  *
- * For each cpu, map pages [@page_start,@page_end) into @chunk.
- * vcache is flushed afterwards.
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
  */
-static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+			  struct page **pages, unsigned long *populated,
+			  int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-	unsigned int cpu;
-	int err;
-
-	/* map must not be done on immutable chunk */
-	WARN_ON(chunk->immutable);
+	unsigned int cpu, tcpu;
+	int i, err;
 
 	for_each_possible_cpu(cpu) {
 		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
-				       pcpu_chunk_pagep(chunk, cpu, page_start),
+				       &pages[pcpu_page_idx(cpu, page_start)],
 				       page_end - page_start);
 		if (err < 0)
-			return err;
+			goto err;
 	}
 
+	/* mapping successful, link chunk and mark populated */
+	for (i = page_start; i < page_end; i++) {
+		for_each_possible_cpu(cpu)
+			pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+					    chunk);
+		__set_bit(i, populated);
+	}
+
+	return 0;
+
+err:
+	for_each_possible_cpu(tcpu) {
+		if (tcpu == cpu)
+			break;
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+				   page_end - page_start);
+	}
+	return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+				int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+
 	/* flush at once, please read comments in pcpu_unmap() */
 	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 			 pcpu_chunk_addr(chunk, last, page_end));
-	return 0;
 }
 
 /**
@@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  * CONTEXT:
  * pcpu_alloc_mutex.
  */
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
-				  bool flush)
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
-	int unmap_start = -1;
-	int uninitialized_var(unmap_end);
-	unsigned int cpu;
-	int i;
+	struct page **pages;
+	unsigned long *populated;
+	int rs, re;
+
+	/* quick path, check whether it's empty already */
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+		if (rs == page_start && re == page_end)
+			return;
+		break;
+	}
 
-	for (i = page_start; i < page_end; i++) {
-		for_each_possible_cpu(cpu) {
-			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+	/* immutable chunks can't be depopulated */
+	WARN_ON(chunk->immutable);
 
-			if (!*pagep)
-				continue;
+	/*
+	 * If control reaches here, there must have been at least one
+	 * successful population attempt so the temp pages array must
+	 * be available now.
+	 */
+	pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+	BUG_ON(!pages);
 
-			__free_page(*pagep);
+	/* unmap and free */
+	pcpu_pre_unmap_flush(chunk, page_start, page_end);
 
-			/*
-			 * If it's partial depopulation, it might get
-			 * populated or depopulated again.  Mark the
-			 * page gone.
-			 */
-			*pagep = NULL;
+	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+		pcpu_unmap_pages(chunk, pages, populated, rs, re);
 
-			unmap_start = unmap_start < 0 ? i : unmap_start;
-			unmap_end = i + 1;
-		}
-	}
+	/* no need to flush tlb, vmalloc will handle it lazily */
+
+	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+		pcpu_free_pages(chunk, pages, populated, rs, re);
 
-	if (unmap_start >= 0)
-		pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+	/* commit new bitmap */
+	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
 }
 
 /**
@@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
-	const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
-	int map_start = -1;
-	int uninitialized_var(map_end);
+	int free_end = page_start, unmap_end = page_start;
+	struct page **pages;
+	unsigned long *populated;
 	unsigned int cpu;
-	int i;
+	int rs, re, rc;
 
-	for (i = page_start; i < page_end; i++) {
-		if (pcpu_chunk_page_occupied(chunk, i)) {
-			if (map_start >= 0) {
-				if (pcpu_map(chunk, map_start, map_end))
-					goto err;
-				map_start = -1;
-			}
-			continue;
-		}
+	/* quick path, check whether all pages are already there */
+	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
+		if (rs == page_start && re == page_end)
+			goto clear;
+		break;
+	}
 
-		map_start = map_start < 0 ? i : map_start;
-		map_end = i + 1;
+	/* need to allocate and map pages, this chunk can't be immutable */
+	WARN_ON(chunk->immutable);
 
-		for_each_possible_cpu(cpu) {
-			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+	pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+	if (!pages)
+		return -ENOMEM;
 
-			*pagep = alloc_pages_node(cpu_to_node(cpu),
-						  alloc_mask, 0);
-			if (!*pagep)
-				goto err;
-			pcpu_set_page_chunk(*pagep, chunk);
-		}
+	/* alloc and map */
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+		rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+		if (rc)
+			goto err_free;
+		free_end = re;
 	}
 
-	if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
-		goto err;
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+		rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+		if (rc)
+			goto err_unmap;
+		unmap_end = re;
+	}
+	pcpu_post_map_flush(chunk, page_start, page_end);
 
+	/* commit new bitmap */
+	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
 	for_each_possible_cpu(cpu)
 		memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 		       size);
-
 	return 0;
-err:
-	/* likely under heavy memory pressure, give memory back */
-	pcpu_depopulate_chunk(chunk, off, size, true);
-	return -ENOMEM;
+
+err_unmap:
+	pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+		pcpu_unmap_pages(chunk, pages, populated, rs, re);
+	pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+	pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+		pcpu_free_pages(chunk, pages, populated, rs, re);
+	return rc;
 }
 
 static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
-	chunk->page = chunk->page_ar;
 
 	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 	if (!chunk->vm) {
@@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work)
 	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
-		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
 		free_pcpu_chunk(chunk);
 	}
 }
@@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
 
 /**
  * pcpu_setup_first_chunk - initialize the first percpu chunk
- * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes, 0 for none
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
@@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * perpcu area.  This function is to be called from arch percpu area
  * setup path.
  *
- * @get_page_fn() should return pointer to percpu page given cpu
- * number and page number.  It should at least return enough pages to
- * cover the static area.  The returned pages for static area should
- * have been initialized with valid data.  It can also return pages
- * after the static area.  NULL return indicates end of pages for the
- * cpu.  Note that @get_page_fn() must return the same number of pages
- * for all cpus.
- *
  * @reserved_size, if non-zero, specifies the amount of bytes to
  * reserve after the static area in the first chunk.  This reserves
  * the first chunk such that it's available only through reserved
@@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
-size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-				     size_t static_size, size_t reserved_size,
+size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 				     ssize_t dyn_size, size_t unit_size,
 				     void *base_addr)
 {
@@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	size_t size_sum = static_size + reserved_size +
 			  (dyn_size >= 0 ? dyn_size : 0);
 	struct pcpu_chunk *schunk, *dchunk = NULL;
-	unsigned int cpu;
-	int i, nr_pages;
+	int i;
 
 	/* santiy checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
@@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
-	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
-		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 
 	if (dyn_size < 0)
 		dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
-	schunk->page = schunk->page_ar;
 	schunk->immutable = true;
+	bitmap_fill(schunk->populated, pcpu_unit_pages);
 
 	if (reserved_size) {
 		schunk->free_size = reserved_size;
@@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
 	/* init dynamic chunk if necessary */
 	if (dyn_size) {
-		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+		dchunk = alloc_bootmem(pcpu_chunk_struct_size);
 		INIT_LIST_HEAD(&dchunk->list);
 		dchunk->vm = &first_vm;
 		dchunk->map = dmap;
 		dchunk->map_alloc = ARRAY_SIZE(dmap);
-		dchunk->page = schunk->page_ar;	/* share page map with schunk */
 		dchunk->immutable = true;
+		bitmap_fill(dchunk->populated, pcpu_unit_pages);
 
 		dchunk->contig_hint = dchunk->free_size = dyn_size;
 		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
 		dchunk->map[dchunk->map_used++] = dchunk->free_size;
 	}
 
-	/* assign pages */
-	nr_pages = -1;
-	for_each_possible_cpu(cpu) {
-		for (i = 0; i < pcpu_unit_pages; i++) {
-			struct page *page = get_page_fn(cpu, i);
-
-			if (!page)
-				break;
-			*pcpu_chunk_pagep(schunk, cpu, i) = page;
-		}
-
-		BUG_ON(i < PFN_UP(static_size));
-
-		if (nr_pages < 0)
-			nr_pages = i;
-		else
-			BUG_ON(nr_pages != i);
-	}
-
 	/* link the first chunk in */
 	pcpu_first_chunk = dchunk ?: schunk;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
@@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
 	return size_sum;
 }
 
-/*
- * Embedding first chunk setup helper.
- */
-static void *pcpue_ptr __initdata;
-static size_t pcpue_size __initdata;
-static size_t pcpue_unit_size __initdata;
-
-static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
-{
-	size_t off = (size_t)pageno << PAGE_SHIFT;
-
-	if (off >= pcpue_size)
-		return NULL;
-
-	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
-}
-
 /**
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @static_size: the size of static percpu area in bytes
@@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 				      ssize_t dyn_size)
 {
-	size_t chunk_size;
+	size_t size_sum, unit_size, chunk_size;
+	void *base;
 	unsigned int cpu;
 
 	/* determine parameters and allocate */
-	pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 
-	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-	chunk_size = pcpue_unit_size * num_possible_cpus();
+	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+	chunk_size = unit_size * num_possible_cpus();
 
-	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
-					    __pa(MAX_DMA_ADDRESS));
-	if (!pcpue_ptr) {
+	base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+				       __pa(MAX_DMA_ADDRESS));
+	if (!base) {
 		pr_warning("PERCPU: failed to allocate %zu bytes for "
 			   "embedding\n", chunk_size);
 		return -ENOMEM;
@@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 
 	/* return the leftover and copy */
 	for_each_possible_cpu(cpu) {
-		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+		void *ptr = base + cpu * unit_size;
 
-		free_bootmem(__pa(ptr + pcpue_size),
-			     pcpue_unit_size - pcpue_size);
+		free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
 		memcpy(ptr, __per_cpu_load, static_size);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
-		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+		size_sum >> PAGE_SHIFT, base, static_size);
 
-	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
-				      reserved_size, dyn_size,
-				      pcpue_unit_size, pcpue_ptr);
-}
-
-/*
- * 4k page first chunk setup helper.
- */
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_unit_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
-{
-	if (pageno < pcpu4k_unit_pages)
-		return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
-	return NULL;
+	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+				      unit_size, base);
 }
 
 /**
@@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 				   pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
+	int unit_pages;
 	size_t pages_size;
+	struct page **pages;
 	unsigned int cpu;
 	int i, j;
 	ssize_t ret;
 
-	pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
-					 PCPU_MIN_UNIT_SIZE));
+	unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
+				  PCPU_MIN_UNIT_SIZE));
 
 	/* unaligned allocations can't be freed, round up to page size */
-	pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() *
-			       sizeof(pcpu4k_pages[0]));
-	pcpu4k_pages = alloc_bootmem(pages_size);
+	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+			       sizeof(pages[0]));
+	pages = alloc_bootmem(pages_size);
 
 	/* allocate pages */
 	j = 0;
 	for_each_possible_cpu(cpu)
-		for (i = 0; i < pcpu4k_unit_pages; i++) {
+		for (i = 0; i < unit_pages; i++) {
 			void *ptr;
 
 			ptr = alloc_fn(cpu, PAGE_SIZE);
@@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 					   "4k page for cpu%u\n", cpu);
 				goto enomem;
 			}
-			pcpu4k_pages[j++] = virt_to_page(ptr);
+			pages[j++] = virt_to_page(ptr);
 		}
 
 	/* allocate vm area, map the pages and copy static data */
 	vm.flags = VM_ALLOC;
-	vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT;
+	vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
 	vm_area_register_early(&vm, PAGE_SIZE);
 
 	for_each_possible_cpu(cpu) {
 		unsigned long unit_addr = (unsigned long)vm.addr +
-			(cpu * pcpu4k_unit_pages << PAGE_SHIFT);
+			(cpu * unit_pages << PAGE_SHIFT);
 
-		for (i = 0; i < pcpu4k_unit_pages; i++)
+		for (i = 0; i < unit_pages; i++)
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 
 		/* pte already populated, the following shouldn't fail */
-		ret = __pcpu_map_pages(unit_addr,
-				       &pcpu4k_pages[cpu * pcpu4k_unit_pages],
-				       pcpu4k_unit_pages);
+		ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+				       unit_pages);
 		if (ret < 0)
 			panic("failed to map percpu area, err=%zd\n", ret);
 
@@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 
 	/* we're ready, commit */
 	pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
-		pcpu4k_unit_pages, static_size);
+		unit_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
-				     reserved_size, -1,
-				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
+	ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
+				     unit_pages << PAGE_SHIFT, vm.addr);
 	goto out_free_ar;
 
 enomem:
 	while (--j >= 0)
-		free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
+		free_fn(page_address(pages[j]), PAGE_SIZE);
 	ret = -ENOMEM;
 out_free_ar:
-	free_bootmem(__pa(pcpu4k_pages), pages_size);
+	free_bootmem(__pa(pages), pages_size);
 	return ret;
 }
 
@@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size;
 static struct pcpul_ent *pcpul_map;
 static struct vm_struct pcpul_vm;
 
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
-{
-	size_t off = (size_t)pageno << PAGE_SHIFT;
-
-	if (off >= pcpul_size)
-		return NULL;
-
-	return virt_to_page(pcpul_map[cpu].ptr + off);
-}
-
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
  * @static_size: the size of static percpu area in bytes
@@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", pcpul_vm.addr, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
-				     reserved_size, dyn_size, pcpul_unit_size,
-				     pcpul_vm.addr);
+	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+				     pcpul_unit_size, pcpul_vm.addr);
 
 	/* sort pcpul_map array for pcpu_lpage_remapped() */
 	for (i = 0; i < num_possible_cpus() - 1; i++)
-- 
cgit v1.2.3-70-g09d2


From 2f39e637ea240efb74cf807d31c93a71a0b89174 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 4 Jul 2009 08:11:00 +0900
Subject: percpu: allow non-linear / sparse cpu -> unit mapping

Currently cpu and unit are always identity mapped.  To allow more
efficient large page support on NUMA and lazy allocation for possible
but offline cpus, cpu -> unit mapping needs to be non-linear and/or
sparse.  This can be easily implemented by adding a cpu -> unit
mapping array and using it whenever looking up the matching unit for a
cpu.

The only unusal conversion is in pcpu_chunk_addr_search().  The passed
in address is unit0 based and unit0 might not be in use so it needs to
be converted to address of an in-use unit.  This is easily done by
adding the unit offset for the current processor.

[ Impact: allows non-linear/sparse cpu -> unit mapping, no visible change yet ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c |   2 +-
 include/linux/percpu.h     |   3 +-
 mm/percpu.c                | 129 +++++++++++++++++++++++++++++++++------------
 3 files changed, 97 insertions(+), 37 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f2f22ee97a7..6970333b48b 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1516,7 +1516,7 @@ void __init setup_per_cpu_areas(void)
 
 	pcpu_unit_size = pcpu_setup_first_chunk(static_size,
 						PERCPU_MODULE_RESERVE, dyn_size,
-						PCPU_CHUNK_SIZE, vm.addr);
+						PCPU_CHUNK_SIZE, vm.addr, NULL);
 
 	free_bootmem(__pa(ptrs), ptrs_size);
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 63c8b7a23e6..1e0e8878dc2 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -57,6 +57,7 @@
 #endif
 
 extern void *pcpu_base_addr;
+extern const int *pcpu_unit_map;
 
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
@@ -66,7 +67,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 extern size_t __init pcpu_setup_first_chunk(
 				size_t static_size, size_t reserved_size,
 				ssize_t dyn_size, size_t unit_size,
-				void *base_addr);
+				void *base_addr, const int *unit_map);
 
 extern ssize_t __init pcpu_embed_first_chunk(
 				size_t static_size, size_t reserved_size,
diff --git a/mm/percpu.c b/mm/percpu.c
index 21756814d99..2196fae24f0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
  *
  * This is percpu allocator which can handle both static and dynamic
  * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running).  Unit grows as
- * necessary and all units grow or shrink in unison.  When a chunk is
- * filled up, another chunk is allocated.  ie. in vmalloc area
+ * chunk is consisted of boot-time determined number of units and the
+ * first chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated.  ie. in
+ * vmalloc area
  *
  *  c0                           c1                         c2
  *  -------------------          -------------------        ------------
@@ -22,11 +23,13 @@
  *
  * Allocation is done in offset-size areas of single unit space.  Ie,
  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
- * percpu base registers pcpu_unit_size apart.
+ * c1:u1, c1:u2 and c1:u3.  On UMA, units corresponds directly to
+ * cpus.  On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
  *
- * There are usually many small percpu allocations many of them as
- * small as 4 bytes.  The allocator organizes chunks into lists
+ * There are usually many small percpu allocations many of them being
+ * as small as 4 bytes.  The allocator organizes chunks into lists
  * according to free size and tries to allocate from the fullest one.
  * Each chunk keeps the maximum contiguous area size hint which is
  * guaranteed to be eqaul to or larger than the maximum contiguous
@@ -99,14 +102,22 @@ struct pcpu_chunk {
 
 static int pcpu_unit_pages __read_mostly;
 static int pcpu_unit_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
 static int pcpu_chunk_size __read_mostly;
 static int pcpu_nr_slots __read_mostly;
 static size_t pcpu_chunk_struct_size __read_mostly;
 
+/* cpus with the lowest and highest unit numbers */
+static unsigned int pcpu_first_unit_cpu __read_mostly;
+static unsigned int pcpu_last_unit_cpu __read_mostly;
+
 /* the address of the first chunk which starts with the kernel static area */
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
+/* cpu -> unit map */
+const int *pcpu_unit_map __read_mostly;
+
 /*
  * The first chunk which always exists.  Note that unlike other
  * chunks, this one can be allocated and mapped in several different
@@ -177,7 +188,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 
 static int pcpu_page_idx(unsigned int cpu, int page_idx)
 {
-	return cpu * pcpu_unit_pages + page_idx;
+	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
 }
 
 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
@@ -321,6 +332,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 		return pcpu_first_chunk;
 	}
 
+	/*
+	 * The address is relative to unit0 which might be unused and
+	 * thus unmapped.  Offset the address to the unit space of the
+	 * current processor before looking it up in the vmalloc
+	 * space.  Note that any possible cpu id can be used here, so
+	 * there's no need to worry about preemption or cpu hotplug.
+	 */
+	addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
 	return pcpu_get_page_chunk(vmalloc_to_page(addr));
 }
 
@@ -593,8 +612,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
 {
 	static struct page **pages;
 	static unsigned long *bitmap;
-	size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
-			    sizeof(pages[0]);
+	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
 	size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
 			     sizeof(unsigned long);
 
@@ -692,10 +710,9 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
 static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
 				 int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-
-	flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
-			   pcpu_chunk_addr(chunk, last, page_end));
+	flush_cache_vunmap(
+		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 }
 
 static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -756,10 +773,9 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
 static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
 				      int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-
-	flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
-			       pcpu_chunk_addr(chunk, last, page_end));
+	flush_tlb_kernel_range(
+		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 }
 
 static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -835,11 +851,9 @@ err:
 static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
 				int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
-
-	/* flush at once, please read comments in pcpu_unmap() */
-	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
-			 pcpu_chunk_addr(chunk, last, page_end));
+	flush_cache_vmap(
+		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 }
 
 /**
@@ -953,8 +967,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
 clear:
 	for_each_possible_cpu(cpu)
-		memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
-		       size);
+		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
 	return 0;
 
 err_unmap:
@@ -1088,6 +1101,7 @@ area_found:
 
 	mutex_unlock(&pcpu_alloc_mutex);
 
+	/* return address relative to unit0 */
 	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
 
 fail_unlock:
@@ -1222,6 +1236,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
  * @base_addr: mapped address
+ * @unit_map: cpu -> unit map, NULL for sequential mapping
  *
  * Initialize the first percpu chunk which contains the kernel static
  * perpcu area.  This function is to be called from arch percpu area
@@ -1260,16 +1275,17 @@ EXPORT_SYMBOL_GPL(free_percpu);
  */
 size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 				     ssize_t dyn_size, size_t unit_size,
-				     void *base_addr)
+				     void *base_addr, const int *unit_map)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
 	size_t size_sum = static_size + reserved_size +
 			  (dyn_size >= 0 ? dyn_size : 0);
 	struct pcpu_chunk *schunk, *dchunk = NULL;
+	unsigned int cpu, tcpu;
 	int i;
 
-	/* santiy checks */
+	/* sanity checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
 		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
@@ -1278,9 +1294,52 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	BUG_ON(unit_size & ~PAGE_MASK);
 	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 
+	/* determine number of units and verify and initialize pcpu_unit_map */
+	if (unit_map) {
+		int first_unit = INT_MAX, last_unit = INT_MIN;
+
+		for_each_possible_cpu(cpu) {
+			int unit = unit_map[cpu];
+
+			BUG_ON(unit < 0);
+			for_each_possible_cpu(tcpu) {
+				if (tcpu == cpu)
+					break;
+				/* the mapping should be one-to-one */
+				BUG_ON(unit_map[tcpu] == unit);
+			}
+
+			if (unit < first_unit) {
+				pcpu_first_unit_cpu = cpu;
+				first_unit = unit;
+			}
+			if (unit > last_unit) {
+				pcpu_last_unit_cpu = cpu;
+				last_unit = unit;
+			}
+		}
+		pcpu_nr_units = last_unit + 1;
+		pcpu_unit_map = unit_map;
+	} else {
+		int *identity_map;
+
+		/* #units == #cpus, identity mapped */
+		identity_map = alloc_bootmem(num_possible_cpus() *
+					     sizeof(identity_map[0]));
+
+		for_each_possible_cpu(cpu)
+			identity_map[cpu] = cpu;
+
+		pcpu_first_unit_cpu = 0;
+		pcpu_last_unit_cpu = pcpu_nr_units - 1;
+		pcpu_nr_units = num_possible_cpus();
+		pcpu_unit_map = identity_map;
+	}
+
+	/* determine basic parameters */
 	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
-	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+	pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 
@@ -1349,7 +1408,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
 	/* we're done */
-	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
+	pcpu_base_addr = schunk->vm->addr;
 	return pcpu_unit_size;
 }
 
@@ -1427,7 +1486,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 		size_sum >> PAGE_SHIFT, base, static_size);
 
 	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				      unit_size, base);
+				      unit_size, base, NULL);
 }
 
 /**
@@ -1519,7 +1578,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 		unit_pages, static_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
-				     unit_pages << PAGE_SHIFT, vm.addr);
+				     unit_pages << PAGE_SHIFT, vm.addr, NULL);
 	goto out_free_ar;
 
 enomem:
@@ -1641,7 +1700,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 		"%zu bytes\n", pcpul_vm.addr, static_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     pcpul_unit_size, pcpul_vm.addr);
+				     pcpul_unit_size, pcpul_vm.addr, NULL);
 
 	/* sort pcpul_map array for pcpu_lpage_remapped() */
 	for (i = 0; i < num_possible_cpus() - 1; i++)
-- 
cgit v1.2.3-70-g09d2


From 023bf6f1b8bf58dc4da7f0dc1cf4787b0d5297c1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2009 11:27:40 +0900
Subject: linker script: unify usage of discard definition

Discarded sections in different archs share some commonality but have
considerable differences.  This led to linker script for each arch
implementing its own /DISCARD/ definition, which makes maintaining
tedious and adding new entries error-prone.

This patch makes all linker scripts to move discard definitions to the
end of the linker script and use the common DISCARDS macro.  As ld
uses the first matching section definition, archs can include default
discarded sections by including them earlier in the linker script.

ia64 is notable because it first throws away some ia64 specific
subsections and then include the rest of the sections into the final
image, so those sections must be discarded before the inclusion.

defconfig compile tested for x86, x86-64, powerpc, powerpc64, ia64,
alpha, sparc, sparc64 and s390.  Michal Simek tested microblaze.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Tested-by: Michal Simek <monstr@monstr.eu>
Cc: linux-arch@vger.kernel.org
Cc: Michal Simek <monstr@monstr.eu>
Cc: microblaze-uclinux@itee.uq.edu.au
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Tony Luck <tony.luck@intel.com>
---
 arch/alpha/kernel/vmlinux.lds.S      | 10 ++--------
 arch/avr32/kernel/vmlinux.lds.S      | 10 +++-------
 arch/blackfin/kernel/vmlinux.lds.S   |  6 +-----
 arch/cris/kernel/vmlinux.lds.S       | 10 ++--------
 arch/frv/kernel/vmlinux.lds.S        |  2 +-
 arch/h8300/kernel/vmlinux.lds.S      |  6 ++----
 arch/ia64/kernel/vmlinux.lds.S       | 17 ++++++++---------
 arch/m32r/kernel/vmlinux.lds.S       | 11 +++--------
 arch/m68k/kernel/vmlinux-std.lds     | 11 +++--------
 arch/m68k/kernel/vmlinux-sun3.lds    | 10 ++--------
 arch/m68knommu/kernel/vmlinux.lds.S  |  8 +-------
 arch/microblaze/kernel/vmlinux.lds.S |  2 +-
 arch/mips/kernel/vmlinux.lds.S       | 22 ++++++++++------------
 arch/mn10300/kernel/vmlinux.lds.S    |  9 +++------
 arch/parisc/kernel/vmlinux.lds.S     |  9 ++++-----
 arch/powerpc/kernel/vmlinux.lds.S    | 10 +++-------
 arch/s390/kernel/vmlinux.lds.S       | 10 +++-------
 arch/sh/kernel/vmlinux.lds.S         | 11 ++++-------
 arch/sparc/kernel/vmlinux.lds.S      |  9 ++-------
 arch/um/include/asm/common.lds.S     |  5 -----
 arch/um/kernel/dyn.lds.S             |  2 +-
 arch/um/kernel/uml.lds.S             |  2 +-
 arch/x86/kernel/vmlinux.lds.S        | 11 ++++-------
 arch/xtensa/kernel/vmlinux.lds.S     | 14 ++++----------
 include/asm-generic/vmlinux.lds.h    | 18 ++++++++++++------
 25 files changed, 80 insertions(+), 155 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 75fe1d6877e..6dc03c35caa 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -134,14 +134,6 @@ SECTIONS
 	__bss_stop = .;
 	_end = .;
 
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		EXIT_TEXT
-		EXIT_DATA
-		*(.exitcall.exit)
-		*(.discard)
-	}
-
 	.mdebug 0 : {
 		*(.mdebug)
 	}
@@ -151,4 +143,6 @@ SECTIONS
 
 	STABS_DEBUG
 	DWARF_DEBUG
+
+	DISCARDS
 }
diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S
index b8324608ec0..c4b56654349 100644
--- a/arch/avr32/kernel/vmlinux.lds.S
+++ b/arch/avr32/kernel/vmlinux.lds.S
@@ -124,15 +124,11 @@ SECTIONS
 		_end = .;
 	}
 
+	DWARF_DEBUG
+
 	/* When something in the kernel is NOT compiled as a module, the module
 	 * cleanup code and data are put into these segments. Both can then be
 	 * thrown away, as cleanup code is never called unless it's a module.
 	 */
-	/DISCARD/       	: {
-		EXIT_DATA
-		*(.exitcall.exit)
-		*(.discard)
-	}
-
-	DWARF_DEBUG
+	DISCARDS
 }
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index 6e8eabd8f0a..d7ffe299b97 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -277,9 +277,5 @@ SECTIONS
 
 	DWARF_DEBUG
 
-	/DISCARD/ :
-	{
-		*(.exitcall.exit)
-		*(.discard)
-	}
+	DISCARDS
 }
diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S
index a3175ebb38c..6c81836b922 100644
--- a/arch/cris/kernel/vmlinux.lds.S
+++ b/arch/cris/kernel/vmlinux.lds.S
@@ -140,13 +140,7 @@ SECTIONS
 	_end = .;
 	__end = .;
 
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		EXIT_TEXT
-		EXIT_DATA
-		*(.exitcall.exit)
-		*(.discard)
-        }
-
 	dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024;
+
+	DISCARDS
 }
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index 64b5a5e4d35..7dbf41f68b5 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -178,7 +178,7 @@ SECTIONS
 
   .comment 0 : { *(.comment) }
 
-  /DISCARD/ : { *(.discard) }
+  DISCARDS
 }
 
 __kernel_image_size_no_bss = __bss_start - __kernel_image_start;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 03d6c0df33d..662b02ecb86 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -152,10 +152,6 @@ SECTIONS
 	__end = . ;
 	__ramstart = .;
 	}
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.discard)
-	}
         .romfs :	
 	{
 		*(.romfs*)
@@ -166,4 +162,6 @@ SECTIONS
 	COMMAND_START = . - 0x200 ;
 	__ramend = . ;
 	}
+
+	DISCARDS
 }
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 13d95897587..eb4214d1c5a 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -24,15 +24,14 @@ PHDRS {
 }
 SECTIONS
 {
-  /* Sections to be discarded */
+  /* unwind exit sections must be discarded before the rest of the
+     sections get included. */
   /DISCARD/ : {
-	EXIT_TEXT
-	EXIT_DATA
-	*(.exitcall.exit)
-	*(.discard)
 	*(.IA_64.unwind.exit.text)
 	*(.IA_64.unwind_info.exit.text)
-	}
+	*(.comment)
+	*(.note)
+  }
 
   v = PAGE_OFFSET;	/* this symbol is here to make debugging easier... */
   phys_start = _start - LOAD_OFFSET;
@@ -317,7 +316,7 @@ SECTIONS
   .debug_funcnames 0 : { *(.debug_funcnames) }
   .debug_typenames 0 : { *(.debug_typenames) }
   .debug_varnames  0 : { *(.debug_varnames) }
-  /* These must appear regardless of  .  */
-  /DISCARD/ : { *(.comment) }
-  /DISCARD/ : { *(.note) }
+
+  /* Default discards */
+  DISCARDS
 }
diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S
index 480a49944cf..de5e21cca6a 100644
--- a/arch/m32r/kernel/vmlinux.lds.S
+++ b/arch/m32r/kernel/vmlinux.lds.S
@@ -120,14 +120,6 @@ SECTIONS
 
   _end = . ;
 
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	EXIT_TEXT
-	EXIT_DATA
-	*(.exitcall.exit)
-	*(.discard)
-	}
-
   /* Stabs debugging sections.  */
   .stab 0 : { *(.stab) }
   .stabstr 0 : { *(.stabstr) }
@@ -136,4 +128,7 @@ SECTIONS
   .stab.index 0 : { *(.stab.index) }
   .stab.indexstr 0 : { *(.stab.indexstr) }
   .comment 0 : { *(.comment) }
+
+  /* Sections to be discarded */
+  DISCARDS
 }
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index 905a797ada9..47eac19e8f6 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -82,14 +82,6 @@ SECTIONS
 
   _end = . ;
 
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	EXIT_TEXT
-	EXIT_DATA
-	*(.exitcall.exit)
-	*(.discard)
-	}
-
   /* Stabs debugging sections.  */
   .stab 0 : { *(.stab) }
   .stabstr 0 : { *(.stabstr) }
@@ -98,4 +90,7 @@ SECTIONS
   .stab.index 0 : { *(.stab.index) }
   .stab.indexstr 0 : { *(.stab.indexstr) }
   .comment 0 : { *(.comment) }
+
+  /* Sections to be discarded */
+  DISCARDS
 }
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index 47d04be322a..03efaf04d7d 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -77,14 +77,6 @@ __init_begin = .;
 
   _end = . ;
 
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	EXIT_TEXT
-	EXIT_DATA
-	*(.exitcall.exit)
-	*(.discard)
-	}
-
   .crap : {
 	/* Stabs debugging sections.  */
 	*(.stab)
@@ -97,4 +89,6 @@ __init_begin = .;
 	*(.note)
   }
 
+  /* Sections to be discarded */
+  DISCARDS
 }
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index 68111a61a77..2736a5e309c 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -184,13 +184,6 @@ SECTIONS {
 		__init_end = .;
 	} > INIT
 
-	/DISCARD/ : {
-		EXIT_TEXT
-		EXIT_DATA
-		*(.exitcall.exit)
-		*(.discard)
-	}
-
 	.bss : {
 		. = ALIGN(4);
 		_sbss = . ;
@@ -201,5 +194,6 @@ SECTIONS {
 	 	_end = . ;
 	} > BSS
 
+	DISCARDS
 }
 
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index 81bebdcb18f..ec5fa91a48d 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -163,5 +163,5 @@ SECTIONS {
 	. = ALIGN(4096);
 	_end = .;
 
-	/DISCARD/ : { *(.discard) }
+	DISCARDS
 }
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 45901609b74..1474c18fb77 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -176,18 +176,6 @@ SECTIONS
 
 	_end = . ;
 
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.discard)
-
-		/* ABI crap starts here */
-		*(.MIPS.options)
-		*(.options)
-		*(.pdr)
-		*(.reginfo)
-	}
-
 	/* These mark the ABI of the kernel for debuggers.  */
 	.mdebug.abi32 : {
 		KEEP(*(.mdebug.abi32))
@@ -213,4 +201,14 @@ SECTIONS
 		*(.gptab.bss)
 		*(.gptab.sbss)
 	}
+
+	/* Sections to be discarded */
+	DISCARDS
+	/DISCARD/ : {
+		/* ABI crap starts here */
+		*(.MIPS.options)
+		*(.options)
+		*(.pdr)
+		*(.reginfo)
+	}
 }
diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S
index 5609d4962a5..8fcd0f1e21d 100644
--- a/arch/mn10300/kernel/vmlinux.lds.S
+++ b/arch/mn10300/kernel/vmlinux.lds.S
@@ -115,13 +115,10 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   pg0 = .;
 
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	EXIT_CALL
-	*(.discard)
-	}
-
   STABS_DEBUG
 
   DWARF_DEBUG
+
+  /* Sections to be discarded */
+  DISCARDS
 }
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index ccf58341845..aea1784edbd 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -237,10 +237,12 @@ SECTIONS
 	/* freed after init ends here */
 	_end = . ;
 
+	STABS_DEBUG
+	.note 0 : { *(.note) }
+
 	/* Sections to be discarded */
+	DISCARDS
 	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.discard)
 #ifdef CONFIG_64BIT
 		/* temporary hack until binutils is fixed to not emit these
 	 	 * for static binaries
@@ -253,7 +255,4 @@ SECTIONS
 		*(.gnu.hash)
 #endif
 	}
-
-	STABS_DEBUG
-	.note 0 : { *(.note) }	
 }
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 7fca9355fd3..244e3658983 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -37,13 +37,6 @@ jiffies = jiffies_64 + 4;
 #endif
 SECTIONS
 {
-	/* Sections to be discarded. */
-	/DISCARD/ : {
-	*(.exitcall.exit)
-	*(.discard)
-	EXIT_DATA
-	}
-
 	. = KERNELBASE;
 
 /*
@@ -299,4 +292,7 @@ SECTIONS
 	. = ALIGN(PAGE_SIZE);
 	_end = . ;
 	PROVIDE32 (end = .);
+
+	/* Sections to be discarded. */
+	DISCARDS
 }
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 98867dfea46..82415c75b99 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -157,14 +157,10 @@ SECTIONS
 
 	_end = . ;
 
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		EXIT_DATA
-		*(.exitcall.exit)
-		*(.discard)
-	}
-
 	/* Debugging sections.	*/
 	STABS_DEBUG
 	DWARF_DEBUG
+
+	/* Sections to be discarded */
+	DISCARDS
 }
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 766976d27b2..0ce254bca92 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -163,17 +163,14 @@ SECTIONS
 		_end = . ;
 	}
 
+	STABS_DEBUG
+	DWARF_DEBUG
+
 	/*
 	 * When something in the kernel is NOT compiled as a module, the
 	 * module cleanup code and data are put into these segments. Both
 	 * can then be thrown away, as cleanup code is never called unless
 	 * it's a module.
 	 */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.discard)
-	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
+	DISCARDS
 }
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index d63cf914667..866390feb68 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -171,13 +171,8 @@ SECTIONS
 	}
 	_end = . ;
 
-	/DISCARD/ : {
-		EXIT_TEXT
-		EXIT_DATA
-		*(.exitcall.exit)
-		*(.discard)
-	}
-
 	STABS_DEBUG
 	DWARF_DEBUG
+
+	DISCARDS
 }
diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S
index cb0248616d4..37ecc5577a9 100644
--- a/arch/um/include/asm/common.lds.S
+++ b/arch/um/include/asm/common.lds.S
@@ -123,8 +123,3 @@
 	__initramfs_end = .;
   }
 
-  /* Sections to be discarded */
-  /DISCARD/ : {
- 	*(.exitcall.exit)
-  }
-
diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S
index 2916d6eadff..715a188c047 100644
--- a/arch/um/kernel/dyn.lds.S
+++ b/arch/um/kernel/dyn.lds.S
@@ -157,5 +157,5 @@ SECTIONS
 
   DWARF_DEBUG
 
-  /DISCARD/	: { *(.discard) }
+  DISCARDS
 }
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 1f8a622cabe..2ebd39765db 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -101,5 +101,5 @@ SECTIONS
 
   DWARF_DEBUG
 
-  /DISCARD/	: { *(.discard) }
+  DISCARDS
 }
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 367e8788204..b600c843710 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -387,15 +387,12 @@ SECTIONS
 		_end = .;
 	}
 
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.eh_frame)
-		*(.discard)
-	}
-
         STABS_DEBUG
         DWARF_DEBUG
+
+	/* Sections to be discarded */
+	DISCARDS
+	/DISCARD/ : { *(.eh_frame) }
 }
 
 
diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S
index b1e24638acd..921b6ff3b64 100644
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -280,16 +280,6 @@ SECTIONS
     *(.ResetVector.text)
   }
 
-  /* Sections to be discarded */
-  /DISCARD/ :
-  {
-	*(.exit.literal)
-	EXIT_TEXT
-	EXIT_DATA
-        *(.exitcall.exit)
-	*(.discard)
-  }
-
   .xt.lit : { *(.xt.lit) }
   .xt.prop : { *(.xt.prop) }
 
@@ -322,4 +312,8 @@ SECTIONS
     *(.xt.lit)
     *(.gnu.linkonce.p*)
   }
+
+  /* Sections to be discarded */
+  DISCARDS
+  /DISCARD/ : { *(.exit.literal) }
 }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c5c18ac878a..ab8ea9b7741 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -35,13 +35,10 @@
  *	__bss_stop = .;
  *	_end = .;
  *
- *	/DISCARD/ : {
- *		EXIT_TEXT
- *		EXIT_DATA
- *		EXIT_CALL
- *	}
  *	STABS_DEBUG
  *	DWARF_DEBUG
+ *
+ *	DISCARDS		// must be the last
  * }
  *
  * [__init_begin, __init_end] is the init section that may be freed after init
@@ -629,11 +626,20 @@
 #define INIT_RAM_FS
 #endif
 
+/*
+ * Default discarded sections.
+ *
+ * Some archs want to discard exit text/data at runtime rather than
+ * link time due to cross-section references such as alt instructions,
+ * bug table, eh_frame, etc.  DISCARDS must be the last of output
+ * section definitions so that such archs put those in earlier section
+ * definitions.
+ */
 #define DISCARDS							\
 	/DISCARD/ : {							\
 	EXIT_TEXT							\
 	EXIT_DATA							\
-	*(.exitcall.exit)						\
+	EXIT_CALL							\
 	*(.discard)							\
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 1dacc76d0014a034b8aca14237c127d7c19d7726 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 1 Jul 2009 11:26:02 +0000
Subject: net/compat/wext: send different messages to compat tasks

Wireless extensions have the unfortunate problem that events
are multicast netlink messages, and are not independent of
pointer size. Thus, currently 32-bit tasks on 64-bit platforms
cannot properly receive events and fail with all kinds of
strange problems, for instance wpa_supplicant never notices
disassociations, due to the way the 64-bit event looks (to a
32-bit process), the fact that the address is all zeroes is
lost, it thinks instead it is 00:00:00:00:01:00.

The same problem existed with the ioctls, until David Miller
fixed those some time ago in an heroic effort.

A different problem caused by this is that we cannot send the
ASSOCREQIE/ASSOCRESPIE events because sending them causes a
32-bit wpa_supplicant on a 64-bit system to overwrite its
internal information, which is worse than it not getting the
information at all -- so we currently resort to sending a
custom string event that it then parses. This, however, has a
severe size limitation we are frequently hitting with modern
access points; this limitation would can be lifted after this
patch by sending the correct binary, not custom, event.

A similar problem apparently happens for some other netlink
users on x86_64 with 32-bit tasks due to the alignment for
64-bit quantities.

In order to fix these problems, I have implemented a way to
send compat messages to tasks. When sending an event, we send
the non-compat event data together with a compat event data in
skb_shinfo(main_skb)->frag_list. Then, when the event is read
from the socket, the netlink code makes sure to pass out only
the skb that is compatible with the task. This approach was
suggested by David Miller, my original approach required
always sending two skbs but that had various small problems.

To determine whether compat is needed or not, I have used the
MSG_CMSG_COMPAT flag, and adjusted the call path for recv and
recvfrom to include it, even if those calls do not have a cmsg
parameter.

I have not solved one small part of the problem, and I don't
think it is necessary to: if a 32-bit application uses read()
rather than any form of recvmsg() it will still get the wrong
(64-bit) event. However, neither do applications actually do
this, nor would it be a regression.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/mips/kernel/scall64-n32.S |  2 +-
 arch/mips/kernel/scall64-o32.S |  4 +--
 arch/sparc/kernel/sys32.S      |  2 +-
 include/linux/wireless.h       |  8 +++++
 net/Kconfig                    | 20 +++++++++++
 net/compat.c                   | 17 +++++++--
 net/netlink/af_netlink.c       | 36 ++++++++++++++++++-
 net/wireless/wext.c            | 78 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 160 insertions(+), 7 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 15874f9812c..7c4a94f4370 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -164,7 +164,7 @@ EXPORT(sysn32_call_table)
 	PTR	sys_connect
 	PTR	sys_accept
 	PTR	sys_sendto
-	PTR	sys_recvfrom
+	PTR	compat_sys_recvfrom
 	PTR	compat_sys_sendmsg		/* 6045 */
 	PTR	compat_sys_recvmsg
 	PTR	sys_shutdown
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index 781e0f1e953..821fc978673 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -378,8 +378,8 @@ sys_call_table:
 	PTR	sys_getsockname
 	PTR	sys_getsockopt
 	PTR	sys_listen
-	PTR	sys_recv			/* 4175 */
-	PTR	sys_recvfrom
+	PTR	compat_sys_recv			/* 4175 */
+	PTR	compat_sys_recvfrom
 	PTR	compat_sys_recvmsg
 	PTR	sys_send
 	PTR	compat_sys_sendmsg
diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S
index f061c4dda9e..3762f6c7894 100644
--- a/arch/sparc/kernel/sys32.S
+++ b/arch/sparc/kernel/sys32.S
@@ -121,7 +121,7 @@ SIGN2(sys32_syslog, sys_syslog, %o0, %o2)
 SIGN1(sys32_umask, sys_umask, %o0)
 SIGN3(sys32_tgkill, sys_tgkill, %o0, %o1, %o2)
 SIGN1(sys32_sendto, sys_sendto, %o0)
-SIGN1(sys32_recvfrom, sys_recvfrom, %o0)
+SIGN1(sys32_recvfrom, compat_sys_recvfrom, %o0)
 SIGN3(sys32_socket, sys_socket, %o0, %o1, %o2)
 SIGN2(sys32_connect, sys_connect, %o0, %o2)
 SIGN2(sys32_bind, sys_bind, %o0, %o2)
diff --git a/include/linux/wireless.h b/include/linux/wireless.h
index cb24204851f..5b4c6c772a9 100644
--- a/include/linux/wireless.h
+++ b/include/linux/wireless.h
@@ -1132,6 +1132,14 @@ struct __compat_iw_event {
 };
 #define IW_EV_COMPAT_LCP_LEN offsetof(struct __compat_iw_event, pointer)
 #define IW_EV_COMPAT_POINT_OFF offsetof(struct compat_iw_point, length)
+
+/* Size of the various events for compat */
+#define IW_EV_COMPAT_CHAR_LEN	(IW_EV_COMPAT_LCP_LEN + IFNAMSIZ)
+#define IW_EV_COMPAT_UINT_LEN	(IW_EV_COMPAT_LCP_LEN + sizeof(__u32))
+#define IW_EV_COMPAT_FREQ_LEN	(IW_EV_COMPAT_LCP_LEN + sizeof(struct iw_freq))
+#define IW_EV_COMPAT_PARAM_LEN	(IW_EV_COMPAT_LCP_LEN + sizeof(struct iw_param))
+#define IW_EV_COMPAT_ADDR_LEN	(IW_EV_COMPAT_LCP_LEN + sizeof(struct sockaddr))
+#define IW_EV_COMPAT_QUAL_LEN	(IW_EV_COMPAT_LCP_LEN + sizeof(struct iw_quality))
 #define IW_EV_COMPAT_POINT_LEN	\
 	(IW_EV_COMPAT_LCP_LEN + sizeof(struct compat_iw_point) - \
 	 IW_EV_COMPAT_POINT_OFF)
diff --git a/net/Kconfig b/net/Kconfig
index 7051b971067..041c35edb76 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -23,6 +23,26 @@ menuconfig NET
 
 if NET
 
+config WANT_COMPAT_NETLINK_MESSAGES
+	bool
+	help
+	  This option can be selected by other options that need compat
+	  netlink messages.
+
+config COMPAT_NETLINK_MESSAGES
+	def_bool y
+	depends on COMPAT
+	depends on WIRELESS_EXT || WANT_COMPAT_NETLINK_MESSAGES
+	help
+	  This option makes it possible to send different netlink messages
+	  to tasks depending on whether the task is a compat task or not. To
+	  achieve this, you need to set skb_shinfo(skb)->frag_list to the
+	  compat skb before sending the skb, the netlink code will sort out
+	  which message to actually pass to the task.
+
+	  Newly written code should NEVER need this option but do
+	  compat-independent messages instead!
+
 menu "Networking options"
 
 source "net/packet/Kconfig"
diff --git a/net/compat.c b/net/compat.c
index 8d739053afe..12728b17a22 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -743,6 +743,18 @@ asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, uns
 	return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
+asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned flags)
+{
+	return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT);
+}
+
+asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len,
+				    unsigned flags, struct sockaddr __user *addr,
+				    int __user *addrlen)
+{
+	return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen);
+}
+
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 {
 	int ret;
@@ -788,10 +800,11 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 		ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
 		break;
 	case SYS_RECV:
-		ret = sys_recv(a0, compat_ptr(a1), a[2], a[3]);
+		ret = compat_sys_recv(a0, compat_ptr(a1), a[2], a[3]);
 		break;
 	case SYS_RECVFROM:
-		ret = sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), compat_ptr(a[5]));
+		ret = compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3],
+					  compat_ptr(a[4]), compat_ptr(a[5]));
 		break;
 	case SYS_SHUTDOWN:
 		ret = sys_shutdown(a0,a1);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d46da6cb92e..da3163d15ef 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1361,7 +1361,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	struct netlink_sock *nlk = nlk_sk(sk);
 	int noblock = flags&MSG_DONTWAIT;
 	size_t copied;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *frag __maybe_unused = NULL;
 	int err;
 
 	if (flags&MSG_OOB)
@@ -1373,6 +1373,35 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
+#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
+	if (unlikely(skb_shinfo(skb)->frag_list)) {
+		bool need_compat = !!(flags & MSG_CMSG_COMPAT);
+
+		/*
+		 * If this skb has a frag_list, then here that means that
+		 * we will have to use the frag_list skb for compat tasks
+		 * and the regular skb for non-compat tasks.
+		 *
+		 * The skb might (and likely will) be cloned, so we can't
+		 * just reset frag_list and go on with things -- we need to
+		 * keep that. For the compat case that's easy -- simply get
+		 * a reference to the compat skb and free the regular one
+		 * including the frag. For the non-compat case, we need to
+		 * avoid sending the frag to the user -- so assign NULL but
+		 * restore it below before freeing the skb.
+		 */
+		if (need_compat) {
+			struct sk_buff *compskb = skb_shinfo(skb)->frag_list;
+			skb_get(compskb);
+			kfree_skb(skb);
+			skb = compskb;
+		} else {
+			frag = skb_shinfo(skb)->frag_list;
+			skb_shinfo(skb)->frag_list = NULL;
+		}
+	}
+#endif
+
 	msg->msg_namelen = 0;
 
 	copied = skb->len;
@@ -1403,6 +1432,11 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	siocb->scm->creds = *NETLINK_CREDS(skb);
 	if (flags & MSG_TRUNC)
 		copied = skb->len;
+
+#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
+	skb_shinfo(skb)->frag_list = frag;
+#endif
+
 	skb_free_datagram(sk, skb);
 
 	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
diff --git a/net/wireless/wext.c b/net/wireless/wext.c
index ee35e6422f1..3fe3c2c0ce1 100644
--- a/net/wireless/wext.c
+++ b/net/wireless/wext.c
@@ -417,6 +417,21 @@ static const int event_type_size[] = {
 	IW_EV_QUAL_LEN,			/* IW_HEADER_TYPE_QUAL */
 };
 
+#ifdef CONFIG_COMPAT
+static const int compat_event_type_size[] = {
+	IW_EV_COMPAT_LCP_LEN,		/* IW_HEADER_TYPE_NULL */
+	0,
+	IW_EV_COMPAT_CHAR_LEN,		/* IW_HEADER_TYPE_CHAR */
+	0,
+	IW_EV_COMPAT_UINT_LEN,		/* IW_HEADER_TYPE_UINT */
+	IW_EV_COMPAT_FREQ_LEN,		/* IW_HEADER_TYPE_FREQ */
+	IW_EV_COMPAT_ADDR_LEN,		/* IW_HEADER_TYPE_ADDR */
+	0,
+	IW_EV_COMPAT_POINT_LEN,		/* Without variable payload */
+	IW_EV_COMPAT_PARAM_LEN,		/* IW_HEADER_TYPE_PARAM */
+	IW_EV_COMPAT_QUAL_LEN,		/* IW_HEADER_TYPE_QUAL */
+};
+#endif
 
 /************************ COMMON SUBROUTINES ************************/
 /*
@@ -1348,6 +1363,22 @@ void wireless_send_event(struct net_device *	dev,
 	struct sk_buff *skb;
 	struct nlmsghdr *nlh;
 	struct nlattr *nla;
+#ifdef CONFIG_COMPAT
+	struct __compat_iw_event *compat_event;
+	struct compat_iw_point compat_wrqu;
+	struct sk_buff *compskb;
+#endif
+
+	/*
+	 * Nothing in the kernel sends scan events with data, be safe.
+	 * This is necessary because we cannot fix up scan event data
+	 * for compat, due to being contained in 'extra', but normally
+	 * applications are required to retrieve the scan data anyway
+	 * and no data is included in the event, this codifies that
+	 * practice.
+	 */
+	if (WARN_ON(cmd == SIOCGIWSCAN && extra))
+		extra = NULL;
 
 	/* Get the description of the Event */
 	if (cmd <= SIOCIWLAST) {
@@ -1446,7 +1477,54 @@ void wireless_send_event(struct net_device *	dev,
 		memcpy(((char *) event) + hdr_len, extra, extra_len);
 
 	nlmsg_end(skb, nlh);
+#ifdef CONFIG_COMPAT
+	hdr_len = compat_event_type_size[descr->header_type];
+	event_len = hdr_len + extra_len;
 
+	compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!compskb) {
+		kfree_skb(skb);
+		return;
+	}
+
+	/* Send via the RtNetlink event channel */
+	nlh = rtnetlink_ifinfo_prep(dev, compskb);
+	if (WARN_ON(!nlh)) {
+		kfree_skb(skb);
+		kfree_skb(compskb);
+		return;
+	}
+
+	/* Add the wireless events in the netlink packet */
+	nla = nla_reserve(compskb, IFLA_WIRELESS, event_len);
+	if (!nla) {
+		kfree_skb(skb);
+		kfree_skb(compskb);
+		return;
+	}
+	compat_event = nla_data(nla);
+
+	compat_event->len = event_len;
+	compat_event->cmd = cmd;
+	if (descr->header_type == IW_HEADER_TYPE_POINT) {
+		compat_wrqu.length = wrqu->data.length;
+		compat_wrqu.flags = wrqu->data.flags;
+		memcpy(&compat_event->pointer,
+			((char *) &compat_wrqu) + IW_EV_COMPAT_POINT_OFF,
+			hdr_len - IW_EV_COMPAT_LCP_LEN);
+		if (extra_len)
+			memcpy(((char *) compat_event) + hdr_len,
+				extra, extra_len);
+	} else {
+		/* extra_len must be zero, so no if (extra) needed */
+		memcpy(&compat_event->pointer, wrqu,
+			hdr_len - IW_EV_COMPAT_LCP_LEN);
+	}
+
+	nlmsg_end(compskb, nlh);
+
+	skb_shinfo(skb)->frag_list = compskb;
+#endif
 	skb_queue_tail(&dev_net(dev)->wext_nlevents, skb);
 	schedule_work(&wireless_nlevent_work);
 }
-- 
cgit v1.2.3-70-g09d2


From 47cab6a722d44c71c4f8224017ef548522243cf4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 3 Aug 2009 09:31:54 +0200
Subject: debug lockups: Improve lockup detection, fix generic arch fallback

As Andrew noted, my previous patch ("debug lockups: Improve lockup
detection") broke/removed SysRq-L support from architecture that do
not provide a __trigger_all_cpu_backtrace implementation.

Restore a fallback path and clean up the SysRq-L machinery a bit:

 - Rename the arch method to arch_trigger_all_cpu_backtrace()

 - Simplify the define

 - Document the method a bit - in the hope of more architectures
   adding support for it.

[ The patch touches Sparc code for the rename. ]

Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "David S. Miller" <davem@davemloft.net>
LKML-Reference: <20090802140809.7ec4bb6b.akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/include/asm/irq_64.h |  4 ++--
 arch/sparc/kernel/process_64.c  |  4 ++--
 arch/x86/include/asm/nmi.h      |  4 ++--
 arch/x86/kernel/apic/nmi.c      |  2 +-
 drivers/char/sysrq.c            | 15 ++++++++++++++-
 include/linux/nmi.h             | 19 +++++++++++++++++--
 6 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 1934f2cbf51..a0b443cb3c1 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -89,8 +89,8 @@ static inline unsigned long get_softint(void)
 	return retval;
 }
 
-void __trigger_all_cpu_backtrace(void);
-#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 
 extern void *hardirq_stack[NR_CPUS];
 extern void *softirq_stack[NR_CPUS];
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 4041f94e772..18d67854a1b 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -251,7 +251,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp)
 	}
 }
 
-void __trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(void)
 {
 	struct thread_info *tp = current_thread_info();
 	struct pt_regs *regs = get_irq_regs();
@@ -304,7 +304,7 @@ void __trigger_all_cpu_backtrace(void)
 
 static void sysrq_handle_globreg(int key, struct tty_struct *tty)
 {
-	__trigger_all_cpu_backtrace();
+	arch_trigger_all_cpu_backtrace();
 }
 
 static struct sysrq_key_op sparc_globalreg_op = {
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c86e5ed4af5..e63cf7d441e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
 
-void __trigger_all_cpu_backtrace(void);
-#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 
 static inline void localise_nmi_watchdog(void)
 {
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1bb1ac20e9e..db7220220d0 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -554,7 +554,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
 	return 0;
 }
 
-void __trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(void)
 {
 	int i;
 
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 165f307f30e..50eecfe1d72 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -223,7 +223,20 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus);
 
 static void sysrq_handle_showallcpus(int key, struct tty_struct *tty)
 {
-	trigger_all_cpu_backtrace();
+	/*
+	 * Fall back to the workqueue based printing if the
+	 * backtrace printing did not succeed or the
+	 * architecture has no support for it:
+	 */
+	if (!trigger_all_cpu_backtrace()) {
+		struct pt_regs *regs = get_irq_regs();
+
+		if (regs) {
+			printk(KERN_INFO "CPU%d:\n", smp_processor_id());
+			show_regs(regs);
+		}
+		schedule_work(&sysrq_showallcpus);
+	}
 }
 
 static struct sysrq_key_op sysrq_showallcpus_op = {
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 29af2d5df09..b752e807add 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -28,8 +28,23 @@ static inline void acpi_nmi_disable(void) { }
 static inline void acpi_nmi_enable(void) { }
 #endif
 
-#ifndef trigger_all_cpu_backtrace
-#define trigger_all_cpu_backtrace() do { } while (0)
+/*
+ * Create trigger_all_cpu_backtrace() out of the arch-provided
+ * base function. Return whether such support was available,
+ * to allow calling code to fall back to some other mechanism:
+ */
+#ifdef arch_trigger_all_cpu_backtrace
+static inline bool trigger_all_cpu_backtrace(void)
+{
+	arch_trigger_all_cpu_backtrace();
+
+	return true;
+}
+#else
+static inline bool trigger_all_cpu_backtrace(void)
+{
+	return false;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 6a12235c7d2d75c7d94b9afcaaecd422ff845ce0 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 29 Jul 2009 10:25:58 +0100
Subject: agp: kill phys_to_gart() and gart_to_phys()

There seems to be no reason for these -- they're a 1:1 mapping on all
platforms.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 arch/alpha/include/asm/agp.h    | 4 ----
 arch/ia64/include/asm/agp.h     | 4 ----
 arch/parisc/include/asm/agp.h   | 4 ----
 arch/powerpc/include/asm/agp.h  | 4 ----
 arch/sparc/include/asm/agp.h    | 4 ----
 arch/x86/include/asm/agp.h      | 4 ----
 drivers/char/agp/agp.h          | 3 ---
 drivers/char/agp/ali-agp.c      | 4 ++--
 drivers/char/agp/amd-k7-agp.c   | 8 ++++----
 drivers/char/agp/amd64-agp.c    | 6 +++---
 drivers/char/agp/ati-agp.c      | 6 +++---
 drivers/char/agp/backend.c      | 2 +-
 drivers/char/agp/efficeon-agp.c | 4 ++--
 drivers/char/agp/generic.c      | 6 +++---
 drivers/char/agp/hp-agp.c       | 4 ++--
 drivers/char/agp/i460-agp.c     | 4 ++--
 drivers/char/agp/intel-agp.c    | 7 +++----
 drivers/char/agp/nvidia-agp.c   | 2 +-
 drivers/char/agp/sgi-agp.c      | 2 +-
 drivers/char/agp/sworks-agp.c   | 8 ++++----
 drivers/char/agp/uninorth-agp.c | 2 +-
 21 files changed, 32 insertions(+), 60 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h
index 26c17913529..a94d48b8677 100644
--- a/arch/alpha/include/asm/agp.h
+++ b/arch/alpha/include/asm/agp.h
@@ -9,10 +9,6 @@
 #define unmap_page_from_agp(page) 
 #define flush_agp_cache() mb()
 
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
 #define alloc_gatt_pages(order)		\
 	((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/ia64/include/asm/agp.h b/arch/ia64/include/asm/agp.h
index c11fdd8ab4d..01d09c401c5 100644
--- a/arch/ia64/include/asm/agp.h
+++ b/arch/ia64/include/asm/agp.h
@@ -17,10 +17,6 @@
 #define unmap_page_from_agp(page)	/* nothing */
 #define flush_agp_cache()		mb()
 
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
 #define alloc_gatt_pages(order)		\
 	((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/parisc/include/asm/agp.h b/arch/parisc/include/asm/agp.h
index 9651660da63..d226ffa8fc1 100644
--- a/arch/parisc/include/asm/agp.h
+++ b/arch/parisc/include/asm/agp.h
@@ -11,10 +11,6 @@
 #define unmap_page_from_agp(page)	/* nothing */
 #define flush_agp_cache()		mb()
 
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
 #define alloc_gatt_pages(order)		\
 	((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/powerpc/include/asm/agp.h b/arch/powerpc/include/asm/agp.h
index 86455c4c31e..416e12c2d50 100644
--- a/arch/powerpc/include/asm/agp.h
+++ b/arch/powerpc/include/asm/agp.h
@@ -8,10 +8,6 @@
 #define unmap_page_from_agp(page)
 #define flush_agp_cache() mb()
 
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
 #define alloc_gatt_pages(order)		\
 	((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/sparc/include/asm/agp.h b/arch/sparc/include/asm/agp.h
index c2456870b05..70f52c1661b 100644
--- a/arch/sparc/include/asm/agp.h
+++ b/arch/sparc/include/asm/agp.h
@@ -7,10 +7,6 @@
 #define unmap_page_from_agp(page)
 #define flush_agp_cache() mb()
 
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
 #define alloc_gatt_pages(order)		\
 	((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 9825cd64c9b..eec2a70d437 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -22,10 +22,6 @@
  */
 #define flush_agp_cache() wbinvd()
 
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
 #define alloc_gatt_pages(order)		\
 	((char *)__get_free_pages(GFP_KERNEL, (order)))
diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h
index 4c6e5079d87..d6f36c004d9 100644
--- a/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@ -318,9 +318,6 @@ void agp3_generic_cleanup(void);
 #define AGP_GENERIC_SIZES_ENTRIES 11
 extern const struct aper_size_info_16 agp3_generic_sizes[];
 
-#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x)))
-#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x)))
-
 extern int agp_off;
 extern int agp_try_unsupported_boot;
 
diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c
index 201ef3ffd48..d2ce68f27e4 100644
--- a/drivers/char/agp/ali-agp.c
+++ b/drivers/char/agp/ali-agp.c
@@ -152,7 +152,7 @@ static struct page *m1541_alloc_page(struct agp_bridge_data *bridge)
 	pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp);
 	pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL,
 			(((temp & ALI_CACHE_FLUSH_ADDR_MASK) |
-			  phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN ));
+			  page_to_phys(page)) | ALI_CACHE_FLUSH_EN ));
 	return page;
 }
 
@@ -180,7 +180,7 @@ static void m1541_destroy_page(struct page *page, int flags)
 		pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp);
 		pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL,
 				       (((temp & ALI_CACHE_FLUSH_ADDR_MASK) |
-					 phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN));
+					 page_to_phys(page)) | ALI_CACHE_FLUSH_EN));
 	}
 	agp_generic_destroy_page(page, flags);
 }
diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c
index 542a87895ae..73dbf40c874 100644
--- a/drivers/char/agp/amd-k7-agp.c
+++ b/drivers/char/agp/amd-k7-agp.c
@@ -44,7 +44,7 @@ static int amd_create_page_map(struct amd_page_map *page_map)
 #ifndef CONFIG_X86
 	SetPageReserved(virt_to_page(page_map->real));
 	global_cache_flush();
-	page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real),
+	page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real),
 					    PAGE_SIZE);
 	if (page_map->remapped == NULL) {
 		ClearPageReserved(virt_to_page(page_map->real));
@@ -160,7 +160,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge)
 
 	agp_bridge->gatt_table_real = (u32 *)page_dir.real;
 	agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped;
-	agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
+	agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
 
 	/* Get the address for the gart region.
 	 * This is a bus address even on the alpha, b/c its
@@ -173,7 +173,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge)
 
 	/* Calculate the agp offset */
 	for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) {
-		writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1,
+		writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1,
 			page_dir.remapped+GET_PAGE_DIR_OFF(addr));
 		readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr));	/* PCI Posting. */
 	}
@@ -325,7 +325,7 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
 		addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr;
 		cur_gatt = GET_GATT(addr);
 		writel(agp_generic_mask_memory(agp_bridge,
-					       phys_to_gart(page_to_phys(mem->pages[i])),
+					       page_to_phys(mem->pages[i]),
 					       mem->type),
 		       cur_gatt+GET_GATT_OFF(addr));
 		readl(cur_gatt+GET_GATT_OFF(addr));	/* PCI Posting. */
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index e85a5b3e952..2fb2e6cc322 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -79,7 +79,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
 		tmp = agp_bridge->driver->mask_memory(agp_bridge,
-						      phys_to_gart(page_to_phys(mem->pages[i])),
+						      page_to_phys(mem->pages[i]),
 						      mask_type);
 
 		BUG_ON(tmp & 0xffffff0000000ffcULL);
@@ -178,7 +178,7 @@ static const struct aper_size_info_32 amd_8151_sizes[7] =
 
 static int amd_8151_configure(void)
 {
-	unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real);
+	unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
 	int i;
 
 	/* Configure AGP regs in each x86-64 host bridge. */
@@ -558,7 +558,7 @@ static void __devexit agp_amd64_remove(struct pci_dev *pdev)
 {
 	struct agp_bridge_data *bridge = pci_get_drvdata(pdev);
 
-	release_mem_region(virt_to_gart(bridge->gatt_table_real),
+	release_mem_region(virt_to_phys(bridge->gatt_table_real),
 			   amd64_aperture_sizes[bridge->aperture_size_idx].size);
 	agp_remove_bridge(bridge);
 	agp_put_bridge(bridge);
diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c
index 59ebd60c1b6..3b2ecbe86eb 100644
--- a/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@ -302,7 +302,7 @@ static int ati_insert_memory(struct agp_memory * mem,
 		addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr;
 		cur_gatt = GET_GATT(addr);
 		writel(agp_bridge->driver->mask_memory(agp_bridge,	
-						       phys_to_gart(page_to_phys(mem->pages[i])),
+						       page_to_phys(mem->pages[i]),
 						       mem->type),
 		       cur_gatt+GET_GATT_OFF(addr));
 	}
@@ -360,7 +360,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge)
 
 	agp_bridge->gatt_table_real = (u32 *)page_dir.real;
 	agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped;
-	agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
+	agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
 
 	/* Write out the size register */
 	current_size = A_SIZE_LVL2(agp_bridge->current_size);
@@ -390,7 +390,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge)
 
 	/* Calculate the agp offset */
 	for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) {
-		writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1,
+		writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1,
 			page_dir.remapped+GET_PAGE_DIR_OFF(addr));
 		readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr));	/* PCI Posting. */
 	}
diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
index 343f102090a..ad87753f6de 100644
--- a/drivers/char/agp/backend.c
+++ b/drivers/char/agp/backend.c
@@ -159,7 +159,7 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge)
 				goto err_out_nounmap;
 			}
 		} else {
-			bridge->scratch_page_dma = phys_to_gart(page_to_phys(page));
+			bridge->scratch_page_dma = page_to_phys(page);
 		}
 
 		bridge->scratch_page = bridge->driver->mask_memory(bridge,
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c
index 35d50f2861b..793f39ea961 100644
--- a/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@ -67,7 +67,7 @@ static const struct gatt_mask efficeon_generic_masks[] =
 /* This function does the same thing as mask_memory() for this chipset... */
 static inline unsigned long efficeon_mask_memory(struct page *page)
 {
-	unsigned long addr = phys_to_gart(page_to_phys(page));
+	unsigned long addr = page_to_phys(page);
 	return addr | 0x00000001;
 }
 
@@ -226,7 +226,7 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge)
 
 		efficeon_private.l1_table[index] = page;
 
-		value = virt_to_gart((unsigned long *)page) | pati | present | index;
+		value = virt_to_phys((unsigned long *)page) | pati | present | index;
 
 		pci_write_config_dword(agp_bridge->dev,
 			EFFICEON_ATTPAGE, value);
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index 28f0208c66a..c50543966eb 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -988,7 +988,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
 	set_memory_uc((unsigned long)table, 1 << page_order);
 	bridge->gatt_table = (void *)table;
 #else
-	bridge->gatt_table = ioremap_nocache(virt_to_gart(table),
+	bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
 					(PAGE_SIZE * (1 << page_order)));
 	bridge->driver->cache_flush();
 #endif
@@ -1001,7 +1001,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
 
 		return -ENOMEM;
 	}
-	bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real);
+	bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real);
 
 	/* AK: bogus, should encode addresses > 4GB */
 	for (i = 0; i < num_entries; i++) {
@@ -1142,7 +1142,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type)
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
 		writel(bridge->driver->mask_memory(bridge,
-						   phys_to_gart(page_to_phys(mem->pages[i])),
+						   page_to_phys(mem->pages[i]),
 						   mask_type),
 		       bridge->gatt_table+j);
 	}
diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c
index 64dbf4b1cf2..501e293e5ad 100644
--- a/drivers/char/agp/hp-agp.c
+++ b/drivers/char/agp/hp-agp.c
@@ -107,7 +107,7 @@ static int __init hp_zx1_ioc_shared(void)
 	hp->gart_size = HP_ZX1_GART_SIZE;
 	hp->gatt_entries = hp->gart_size / hp->io_page_size;
 
-	hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE));
+	hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE));
 	hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)];
 
 	if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) {
@@ -246,7 +246,7 @@ hp_zx1_configure (void)
 	agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS);
 
 	if (hp->io_pdir_owner) {
-		writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE);
+		writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE);
 		readl(hp->ioc_regs+HP_ZX1_PDIR_BASE);
 		writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG);
 		readl(hp->ioc_regs+HP_ZX1_TCNFG);
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
index 54191f86053..e763d3312ce 100644
--- a/drivers/char/agp/i460-agp.c
+++ b/drivers/char/agp/i460-agp.c
@@ -325,7 +325,7 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem,
 
 	io_page_size = 1UL << I460_IO_PAGE_SHIFT;
 	for (i = 0, j = io_pg_start; i < mem->page_count; i++) {
-		paddr = phys_to_gart(page_to_phys(mem->pages[i]));
+		paddr = page_to_phys(mem->pages[i]);
 		for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size)
 			WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type));
 	}
@@ -382,7 +382,7 @@ static int i460_alloc_large_page (struct lp_desc *lp)
 		return -ENOMEM;
 	}
 
-	lp->paddr = phys_to_gart(page_to_phys(lp->page));
+	lp->paddr = page_to_phys(lp->page);
 	lp->refcount = 0;
 	atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp);
 	return 0;
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index d8c80d8be5e..aa8889e8afc 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -288,7 +288,7 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem,
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
 		writel(agp_bridge->driver->mask_memory(agp_bridge,
-			       phys_to_gart(page_to_phys(mem->pages[i])), mask_type),
+				page_to_phys(mem->pages[i]), mask_type),
 		       intel_private.gtt+j);
 	}
 
@@ -470,8 +470,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start,
 			global_cache_flush();
 		for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
 			writel(agp_bridge->driver->mask_memory(agp_bridge,
-							       phys_to_gart(page_to_phys(mem->pages[i])),
-							       mask_type),
+					page_to_phys(mem->pages[i]), mask_type),
 			       intel_private.registers+I810_PTE_BASE+(j*4));
 		}
 		readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
@@ -977,7 +976,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start,
 
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
 		writel(agp_bridge->driver->mask_memory(agp_bridge,
-			       phys_to_gart(page_to_phys(mem->pages[i])), mask_type),
+				page_to_phys(mem->pages[i]), mask_type),
 		       intel_private.registers+I810_PTE_BASE+(j*4));
 	}
 	readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c
index cedacee30ec..7e36d2b4f9d 100644
--- a/drivers/char/agp/nvidia-agp.c
+++ b/drivers/char/agp/nvidia-agp.c
@@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type
 	}
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
 		writel(agp_bridge->driver->mask_memory(agp_bridge,
-			       phys_to_gart(page_to_phys(mem->pages[i])), mask_type),
+			       page_to_phys(mem->pages[i]), mask_type),
 			agp_bridge->gatt_table+nvidia_private.pg_offset+j);
 	}
 
diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c
index 0d47fa84740..0d426ae39c8 100644
--- a/drivers/char/agp/sgi-agp.c
+++ b/drivers/char/agp/sgi-agp.c
@@ -190,7 +190,7 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start,
 	for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
 		table[j] =
 		    bridge->driver->mask_memory(bridge,
-						phys_to_gart(page_to_phys(mem->pages[i])),
+						page_to_phys(mem->pages[i]),
 						mem->type);
 	}
 
diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c
index 07259952fc3..13acaaf64ed 100644
--- a/drivers/char/agp/sworks-agp.c
+++ b/drivers/char/agp/sworks-agp.c
@@ -155,7 +155,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge)
 	/* Create a fake scratch directory */
 	for (i = 0; i < 1024; i++) {
 		writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i);
-		writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i);
+		writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i);
 	}
 
 	retval = serverworks_create_gatt_pages(value->num_entries / 1024);
@@ -167,7 +167,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge)
 
 	agp_bridge->gatt_table_real = (u32 *)page_dir.real;
 	agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped;
-	agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
+	agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
 
 	/* Get the address for the gart region.
 	 * This is a bus address even on the alpha, b/c its
@@ -179,7 +179,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge)
 
 	/* Calculate the agp offset */
 	for (i = 0; i < value->num_entries / 1024; i++)
-		writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i);
+		writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i);
 
 	return 0;
 }
@@ -350,7 +350,7 @@ static int serverworks_insert_memory(struct agp_memory *mem,
 		addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr;
 		cur_gatt = SVRWRKS_GET_GATT(addr);
 		writel(agp_bridge->driver->mask_memory(agp_bridge, 
-				phys_to_gart(page_to_phys(mem->pages[i])), mem->type),
+				page_to_phys(mem->pages[i]), mem->type),
 		       cur_gatt+GET_GATT_OFF(addr));
 	}
 	serverworks_tlbflush(mem);
diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c
index f192c3b9ad4..2e993112ab8 100644
--- a/drivers/char/agp/uninorth-agp.c
+++ b/drivers/char/agp/uninorth-agp.c
@@ -431,7 +431,7 @@ static int uninorth_create_gatt_table(struct agp_bridge_data *bridge)
 
 	bridge->gatt_table_real = (u32 *) table;
 	bridge->gatt_table = (u32 *)table;
-	bridge->gatt_bus_addr = virt_to_gart(table);
+	bridge->gatt_bus_addr = virt_to_phys(table);
 
 	for (i = 0; i < num_entries; i++)
 		bridge->gatt_table[i] = 0;
-- 
cgit v1.2.3-70-g09d2


From 49c794e94649020248e37b78db16cd25bad38b4f Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Tue, 4 Aug 2009 07:28:28 +0000
Subject: net: implement a SO_PROTOCOL getsockoption

Similar to SO_TYPE returning the socket type, SO_PROTOCOL allows to
retrieve the protocol used with a given socket.

I am not quite sure why we have that-many copies of socket.h, and why
the values are not the same on all arches either, but for where hex
numbers dominate, I use 0x1029 for SO_PROTOCOL as that seems to be
the next free unused number across a bunch of operating systems, or
so Google results make me want to believe. SO_PROTOCOL for others
just uses the next free Linux number, 38.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/socket.h      | 1 +
 arch/arm/include/asm/socket.h        | 2 ++
 arch/avr32/include/asm/socket.h      | 2 ++
 arch/cris/include/asm/socket.h       | 2 ++
 arch/frv/include/asm/socket.h        | 2 ++
 arch/h8300/include/asm/socket.h      | 2 ++
 arch/ia64/include/asm/socket.h       | 2 ++
 arch/m32r/include/asm/socket.h       | 2 ++
 arch/m68k/include/asm/socket.h       | 2 ++
 arch/microblaze/include/asm/socket.h | 2 ++
 arch/mips/include/asm/socket.h       | 1 +
 arch/mn10300/include/asm/socket.h    | 2 ++
 arch/parisc/include/asm/socket.h     | 1 +
 arch/powerpc/include/asm/socket.h    | 2 ++
 arch/s390/include/asm/socket.h       | 2 ++
 arch/sparc/include/asm/socket.h      | 2 ++
 arch/x86/include/asm/socket.h        | 2 ++
 arch/xtensa/include/asm/socket.h     | 2 ++
 include/asm-generic/socket.h         | 2 ++
 net/core/sock.c                      | 5 +++++
 20 files changed, 40 insertions(+)

(limited to 'arch/sparc')

diff --git a/arch/alpha/include/asm/socket.h b/arch/alpha/include/asm/socket.h
index 3641ec1452f..2f8b4d37774 100644
--- a/arch/alpha/include/asm/socket.h
+++ b/arch/alpha/include/asm/socket.h
@@ -32,6 +32,7 @@
 #define	SO_RCVTIMEO	0x1012
 #define	SO_SNDTIMEO	0x1013
 #define SO_ACCEPTCONN	0x1014
+#define SO_PROTOCOL	0x1028
 
 /* linux-specific, might as well be the same as on i386 */
 #define SO_NO_CHECK	11
diff --git a/arch/arm/include/asm/socket.h b/arch/arm/include/asm/socket.h
index 537de4e0ef5..7f47454ffbf 100644
--- a/arch/arm/include/asm/socket.h
+++ b/arch/arm/include/asm/socket.h
@@ -57,4 +57,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/avr32/include/asm/socket.h b/arch/avr32/include/asm/socket.h
index 04c86061970..6af2866a4f0 100644
--- a/arch/avr32/include/asm/socket.h
+++ b/arch/avr32/include/asm/socket.h
@@ -57,4 +57,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* __ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/asm/socket.h b/arch/cris/include/asm/socket.h
index d5cf7400540..f3859fb0990 100644
--- a/arch/cris/include/asm/socket.h
+++ b/arch/cris/include/asm/socket.h
@@ -59,6 +59,8 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_SOCKET_H */
 
 
diff --git a/arch/frv/include/asm/socket.h b/arch/frv/include/asm/socket.h
index 57c3d4054e8..8dab3486ffa 100644
--- a/arch/frv/include/asm/socket.h
+++ b/arch/frv/include/asm/socket.h
@@ -57,5 +57,7 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/h8300/include/asm/socket.h b/arch/h8300/include/asm/socket.h
index 602518a70a1..ba770d09cd6 100644
--- a/arch/h8300/include/asm/socket.h
+++ b/arch/h8300/include/asm/socket.h
@@ -57,4 +57,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/asm/socket.h b/arch/ia64/include/asm/socket.h
index 745421225ec..091cd9d47d0 100644
--- a/arch/ia64/include/asm/socket.h
+++ b/arch/ia64/include/asm/socket.h
@@ -66,4 +66,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/asm/socket.h b/arch/m32r/include/asm/socket.h
index be7ed589af5..d36f5928fb7 100644
--- a/arch/m32r/include/asm/socket.h
+++ b/arch/m32r/include/asm/socket.h
@@ -57,4 +57,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/m68k/include/asm/socket.h b/arch/m68k/include/asm/socket.h
index ca87f938b03..060cb7ed024 100644
--- a/arch/m68k/include/asm/socket.h
+++ b/arch/m68k/include/asm/socket.h
@@ -57,4 +57,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/microblaze/include/asm/socket.h b/arch/microblaze/include/asm/socket.h
index 82593686031..96bf8bfa935 100644
--- a/arch/microblaze/include/asm/socket.h
+++ b/arch/microblaze/include/asm/socket.h
@@ -66,4 +66,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_MICROBLAZE_SOCKET_H */
diff --git a/arch/mips/include/asm/socket.h b/arch/mips/include/asm/socket.h
index 2abca178016..289ce5f5f2a 100644
--- a/arch/mips/include/asm/socket.h
+++ b/arch/mips/include/asm/socket.h
@@ -42,6 +42,7 @@ To add: #define SO_REUSEPORT 0x0200	/* Allow local address and port reuse.  */
 #define SO_SNDTIMEO	0x1005	/* send timeout */
 #define SO_RCVTIMEO 	0x1006	/* receive timeout */
 #define SO_ACCEPTCONN	0x1009
+#define SO_PROTOCOL	0x1028	/* protocol type */
 
 /* linux-specific, might as well be the same as on i386 */
 #define SO_NO_CHECK	11
diff --git a/arch/mn10300/include/asm/socket.h b/arch/mn10300/include/asm/socket.h
index fb5daf438ec..19d7cf709b7 100644
--- a/arch/mn10300/include/asm/socket.h
+++ b/arch/mn10300/include/asm/socket.h
@@ -57,4 +57,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/asm/socket.h b/arch/parisc/include/asm/socket.h
index 885472bf7b7..a658b09df62 100644
--- a/arch/parisc/include/asm/socket.h
+++ b/arch/parisc/include/asm/socket.h
@@ -24,6 +24,7 @@
 #define SO_RCVTIMEO	0x1006
 #define SO_ERROR	0x1007
 #define SO_TYPE		0x1008
+#define SO_PROTOCOL	0x1028
 #define SO_PEERNAME	0x2000
 
 #define SO_NO_CHECK	0x400b
diff --git a/arch/powerpc/include/asm/socket.h b/arch/powerpc/include/asm/socket.h
index 1e5cfad0e3f..609049d7117 100644
--- a/arch/powerpc/include/asm/socket.h
+++ b/arch/powerpc/include/asm/socket.h
@@ -64,4 +64,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/asm/socket.h b/arch/s390/include/asm/socket.h
index 02330c50241..65baa9a83ab 100644
--- a/arch/s390/include/asm/socket.h
+++ b/arch/s390/include/asm/socket.h
@@ -65,4 +65,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/asm/socket.h b/arch/sparc/include/asm/socket.h
index 982a12f959f..9cbbfafd053 100644
--- a/arch/sparc/include/asm/socket.h
+++ b/arch/sparc/include/asm/socket.h
@@ -29,6 +29,8 @@
 #define SO_RCVBUFFORCE	0x100b
 #define SO_ERROR	0x1007
 #define SO_TYPE		0x1008
+#define SO_PROTOCOL	0x1028
+
 
 /* Linux specific, keep the same. */
 #define SO_NO_CHECK	0x000b
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
index ca8bf2cd0ba..1077d2535a3 100644
--- a/arch/x86/include/asm/socket.h
+++ b/arch/x86/include/asm/socket.h
@@ -57,4 +57,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/xtensa/include/asm/socket.h b/arch/xtensa/include/asm/socket.h
index dd1a7a4a1ce..e47f172142f 100644
--- a/arch/xtensa/include/asm/socket.h
+++ b/arch/xtensa/include/asm/socket.h
@@ -68,4 +68,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/asm-generic/socket.h b/include/asm-generic/socket.h
index 5d79e409241..7e24d186616 100644
--- a/include/asm-generic/socket.h
+++ b/include/asm-generic/socket.h
@@ -60,4 +60,6 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_PROTOCOL		38
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index a324a80c163..ebce661234a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -482,6 +482,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		sk->sk_reuse = valbool;
 		break;
 	case SO_TYPE:
+	case SO_PROTOCOL:
 	case SO_ERROR:
 		ret = -ENOPROTOOPT;
 		break;
@@ -764,6 +765,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_type;
 		break;
 
+	case SO_PROTOCOL:
+		v.val = sk->sk_protocol;
+		break;
+
 	case SO_ERROR:
 		v.val = -sock_error(sk);
 		if (v.val == 0)
-- 
cgit v1.2.3-70-g09d2


From 0d6038ee76f2e06b79d0465807f67e86bf4025de Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Tue, 4 Aug 2009 07:28:29 +0000
Subject: net: implement a SO_DOMAIN getsockoption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This sockopt goes in line with SO_TYPE and SO_PROTOCOL. It makes it
possible for userspace programs to pass around file descriptors — I
am referring to arguments-to-functions, but it may even work for the
fd passing over UNIX sockets — without needing to also pass the
auxiliary information (PF_INET6/IPPROTO_TCP).

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/socket.h      | 1 +
 arch/arm/include/asm/socket.h        | 1 +
 arch/avr32/include/asm/socket.h      | 1 +
 arch/cris/include/asm/socket.h       | 1 +
 arch/frv/include/asm/socket.h        | 1 +
 arch/h8300/include/asm/socket.h      | 1 +
 arch/ia64/include/asm/socket.h       | 1 +
 arch/m32r/include/asm/socket.h       | 1 +
 arch/m68k/include/asm/socket.h       | 1 +
 arch/microblaze/include/asm/socket.h | 1 +
 arch/mips/include/asm/socket.h       | 1 +
 arch/mn10300/include/asm/socket.h    | 1 +
 arch/parisc/include/asm/socket.h     | 1 +
 arch/powerpc/include/asm/socket.h    | 1 +
 arch/s390/include/asm/socket.h       | 1 +
 arch/sparc/include/asm/socket.h      | 1 +
 arch/x86/include/asm/socket.h        | 1 +
 arch/xtensa/include/asm/socket.h     | 1 +
 include/asm-generic/socket.h         | 1 +
 net/core/sock.c                      | 5 +++++
 20 files changed, 24 insertions(+)

(limited to 'arch/sparc')

diff --git a/arch/alpha/include/asm/socket.h b/arch/alpha/include/asm/socket.h
index 2f8b4d37774..26773e3246e 100644
--- a/arch/alpha/include/asm/socket.h
+++ b/arch/alpha/include/asm/socket.h
@@ -33,6 +33,7 @@
 #define	SO_SNDTIMEO	0x1013
 #define SO_ACCEPTCONN	0x1014
 #define SO_PROTOCOL	0x1028
+#define SO_DOMAIN	0x1029
 
 /* linux-specific, might as well be the same as on i386 */
 #define SO_NO_CHECK	11
diff --git a/arch/arm/include/asm/socket.h b/arch/arm/include/asm/socket.h
index 7f47454ffbf..92ac61d294f 100644
--- a/arch/arm/include/asm/socket.h
+++ b/arch/arm/include/asm/socket.h
@@ -58,5 +58,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/avr32/include/asm/socket.h b/arch/avr32/include/asm/socket.h
index 6af2866a4f0..fe863f9794d 100644
--- a/arch/avr32/include/asm/socket.h
+++ b/arch/avr32/include/asm/socket.h
@@ -58,5 +58,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* __ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/asm/socket.h b/arch/cris/include/asm/socket.h
index f3859fb0990..45ec49bdb7b 100644
--- a/arch/cris/include/asm/socket.h
+++ b/arch/cris/include/asm/socket.h
@@ -60,6 +60,7 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/frv/include/asm/socket.h b/arch/frv/include/asm/socket.h
index 8dab3486ffa..2dea726095c 100644
--- a/arch/frv/include/asm/socket.h
+++ b/arch/frv/include/asm/socket.h
@@ -58,6 +58,7 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/h8300/include/asm/socket.h b/arch/h8300/include/asm/socket.h
index ba770d09cd6..1547f01c8e2 100644
--- a/arch/h8300/include/asm/socket.h
+++ b/arch/h8300/include/asm/socket.h
@@ -58,5 +58,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/asm/socket.h b/arch/ia64/include/asm/socket.h
index 091cd9d47d0..0b0d5ff062e 100644
--- a/arch/ia64/include/asm/socket.h
+++ b/arch/ia64/include/asm/socket.h
@@ -67,5 +67,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/asm/socket.h b/arch/m32r/include/asm/socket.h
index d36f5928fb7..3390a864f22 100644
--- a/arch/m32r/include/asm/socket.h
+++ b/arch/m32r/include/asm/socket.h
@@ -58,5 +58,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/m68k/include/asm/socket.h b/arch/m68k/include/asm/socket.h
index 060cb7ed024..eee01cce921 100644
--- a/arch/m68k/include/asm/socket.h
+++ b/arch/m68k/include/asm/socket.h
@@ -58,5 +58,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/microblaze/include/asm/socket.h b/arch/microblaze/include/asm/socket.h
index 96bf8bfa935..7361ae7cfcd 100644
--- a/arch/microblaze/include/asm/socket.h
+++ b/arch/microblaze/include/asm/socket.h
@@ -67,5 +67,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_MICROBLAZE_SOCKET_H */
diff --git a/arch/mips/include/asm/socket.h b/arch/mips/include/asm/socket.h
index 289ce5f5f2a..ae05accd9fe 100644
--- a/arch/mips/include/asm/socket.h
+++ b/arch/mips/include/asm/socket.h
@@ -43,6 +43,7 @@ To add: #define SO_REUSEPORT 0x0200	/* Allow local address and port reuse.  */
 #define SO_RCVTIMEO 	0x1006	/* receive timeout */
 #define SO_ACCEPTCONN	0x1009
 #define SO_PROTOCOL	0x1028	/* protocol type */
+#define SO_DOMAIN	0x1029	/* domain/socket family */
 
 /* linux-specific, might as well be the same as on i386 */
 #define SO_NO_CHECK	11
diff --git a/arch/mn10300/include/asm/socket.h b/arch/mn10300/include/asm/socket.h
index 19d7cf709b7..4df75af29d7 100644
--- a/arch/mn10300/include/asm/socket.h
+++ b/arch/mn10300/include/asm/socket.h
@@ -58,5 +58,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/asm/socket.h b/arch/parisc/include/asm/socket.h
index a658b09df62..960b1e5d8e1 100644
--- a/arch/parisc/include/asm/socket.h
+++ b/arch/parisc/include/asm/socket.h
@@ -25,6 +25,7 @@
 #define SO_ERROR	0x1007
 #define SO_TYPE		0x1008
 #define SO_PROTOCOL	0x1028
+#define SO_DOMAIN	0x1029
 #define SO_PEERNAME	0x2000
 
 #define SO_NO_CHECK	0x400b
diff --git a/arch/powerpc/include/asm/socket.h b/arch/powerpc/include/asm/socket.h
index 609049d7117..3ab8b3e6feb 100644
--- a/arch/powerpc/include/asm/socket.h
+++ b/arch/powerpc/include/asm/socket.h
@@ -65,5 +65,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/asm/socket.h b/arch/s390/include/asm/socket.h
index 65baa9a83ab..e42df89a0b8 100644
--- a/arch/s390/include/asm/socket.h
+++ b/arch/s390/include/asm/socket.h
@@ -66,5 +66,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/asm/socket.h b/arch/sparc/include/asm/socket.h
index 9cbbfafd053..3a5ae3d1208 100644
--- a/arch/sparc/include/asm/socket.h
+++ b/arch/sparc/include/asm/socket.h
@@ -30,6 +30,7 @@
 #define SO_ERROR	0x1007
 #define SO_TYPE		0x1008
 #define SO_PROTOCOL	0x1028
+#define SO_DOMAIN	0x1029
 
 
 /* Linux specific, keep the same. */
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
index 1077d2535a3..b2a8c74f2d0 100644
--- a/arch/x86/include/asm/socket.h
+++ b/arch/x86/include/asm/socket.h
@@ -58,5 +58,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/xtensa/include/asm/socket.h b/arch/xtensa/include/asm/socket.h
index e47f172142f..beb3a6bdb61 100644
--- a/arch/xtensa/include/asm/socket.h
+++ b/arch/xtensa/include/asm/socket.h
@@ -69,5 +69,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/asm-generic/socket.h b/include/asm-generic/socket.h
index 7e24d186616..538991cef6f 100644
--- a/include/asm-generic/socket.h
+++ b/include/asm-generic/socket.h
@@ -61,5 +61,6 @@
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
 #define SO_PROTOCOL		38
+#define SO_DOMAIN		39
 
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index ebce661234a..3ac34ea6ec0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -483,6 +483,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		break;
 	case SO_TYPE:
 	case SO_PROTOCOL:
+	case SO_DOMAIN:
 	case SO_ERROR:
 		ret = -ENOPROTOOPT;
 		break;
@@ -769,6 +770,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_protocol;
 		break;
 
+	case SO_DOMAIN:
+		v.val = sk->sk_family;
+		break;
+
 	case SO_ERROR:
 		v.val = -sock_error(sk);
 		if (v.val == 0)
-- 
cgit v1.2.3-70-g09d2


From bc0a14f154069cc4e42ea903c2c2b9984a94e4b7 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 10 Aug 2009 11:53:12 +0900
Subject: sparc: Use dma_map_ops struct

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Robert Reif <reif@earthlink.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1249872797-1314-4-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/include/asm/dma-mapping.h | 43 ++++++------------------------------
 arch/sparc/kernel/dma.c              | 16 +++++++++-----
 arch/sparc/kernel/iommu.c            | 16 +++++++++-----
 arch/sparc/kernel/pci_sun4v.c        | 14 +++++++-----
 4 files changed, 36 insertions(+), 53 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 204e4bf6443..893f3ecc975 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -13,36 +13,7 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 #define dma_is_consistent(d, h)	(1)
 
-struct dma_ops {
-	void *(*alloc_coherent)(struct device *dev, size_t size,
-				dma_addr_t *dma_handle, gfp_t flag);
-	void (*free_coherent)(struct device *dev, size_t size,
-			      void *cpu_addr, dma_addr_t dma_handle);
-	dma_addr_t (*map_page)(struct device *dev, struct page *page,
-			       unsigned long offset, size_t size,
-			       enum dma_data_direction direction);
-	void (*unmap_page)(struct device *dev, dma_addr_t dma_addr,
-			   size_t size,
-			   enum dma_data_direction direction);
-	int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
-		      enum dma_data_direction direction);
-	void (*unmap_sg)(struct device *dev, struct scatterlist *sg,
-			 int nhwentries,
-			 enum dma_data_direction direction);
-	void (*sync_single_for_cpu)(struct device *dev,
-				    dma_addr_t dma_handle, size_t size,
-				    enum dma_data_direction direction);
-	void (*sync_single_for_device)(struct device *dev,
-				       dma_addr_t dma_handle, size_t size,
-				       enum dma_data_direction direction);
-	void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
-				int nelems,
-				enum dma_data_direction direction);
-	void (*sync_sg_for_device)(struct device *dev,
-				   struct scatterlist *sg, int nents,
-				   enum dma_data_direction dir);
-};
-extern const struct dma_ops *dma_ops;
+extern const struct dma_map_ops *dma_ops;
 
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 				       dma_addr_t *dma_handle, gfp_t flag)
@@ -62,40 +33,40 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
 {
 	return dma_ops->map_page(dev, virt_to_page(cpu_addr),
 				 (unsigned long)cpu_addr & ~PAGE_MASK, size,
-				 direction);
+				 direction, NULL);
 }
 
 static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
 				    size_t size,
 				    enum dma_data_direction direction)
 {
-	dma_ops->unmap_page(dev, dma_addr, size, direction);
+	dma_ops->unmap_page(dev, dma_addr, size, direction, NULL);
 }
 
 static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 				      unsigned long offset, size_t size,
 				      enum dma_data_direction direction)
 {
-	return dma_ops->map_page(dev, page, offset, size, direction);
+	return dma_ops->map_page(dev, page, offset, size, direction, NULL);
 }
 
 static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 				  size_t size,
 				  enum dma_data_direction direction)
 {
-	dma_ops->unmap_page(dev, dma_address, size, direction);
+	dma_ops->unmap_page(dev, dma_address, size, direction, NULL);
 }
 
 static inline int dma_map_sg(struct device *dev, struct scatterlist *sg,
 			     int nents, enum dma_data_direction direction)
 {
-	return dma_ops->map_sg(dev, sg, nents, direction);
+	return dma_ops->map_sg(dev, sg, nents, direction, NULL);
 }
 
 static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 				int nents, enum dma_data_direction direction)
 {
-	dma_ops->unmap_sg(dev, sg, nents, direction);
+	dma_ops->unmap_sg(dev, sg, nents, direction, NULL);
 }
 
 static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
index 524c32f97c5..473a3fc7ab5 100644
--- a/arch/sparc/kernel/dma.c
+++ b/arch/sparc/kernel/dma.c
@@ -60,7 +60,8 @@ static void dma32_free_coherent(struct device *dev, size_t size,
 
 static dma_addr_t dma32_map_page(struct device *dev, struct page *page,
 				 unsigned long offset, size_t size,
-				 enum dma_data_direction direction)
+				 enum dma_data_direction direction,
+				 struct dma_attrs *attrs)
 {
 #ifdef CONFIG_PCI
 	if (dev->bus == &pci_bus_type)
@@ -72,7 +73,8 @@ static dma_addr_t dma32_map_page(struct device *dev, struct page *page,
 }
 
 static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
-			     size_t size, enum dma_data_direction direction)
+			     size_t size, enum dma_data_direction direction,
+			     struct dma_attrs *attrs)
 {
 #ifdef CONFIG_PCI
 	if (dev->bus == &pci_bus_type) {
@@ -85,7 +87,8 @@ static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
 }
 
 static int dma32_map_sg(struct device *dev, struct scatterlist *sg,
-			int nents, enum dma_data_direction direction)
+			int nents, enum dma_data_direction direction,
+			struct dma_attrs *attrs)
 {
 #ifdef CONFIG_PCI
 	if (dev->bus == &pci_bus_type)
@@ -95,7 +98,8 @@ static int dma32_map_sg(struct device *dev, struct scatterlist *sg,
 }
 
 void dma32_unmap_sg(struct device *dev, struct scatterlist *sg,
-		    int nents, enum dma_data_direction direction)
+		    int nents, enum dma_data_direction direction,
+		    struct dma_attrs *attrs)
 {
 #ifdef CONFIG_PCI
 	if (dev->bus == &pci_bus_type) {
@@ -161,7 +165,7 @@ static void dma32_sync_sg_for_device(struct device *dev,
 	BUG();
 }
 
-static const struct dma_ops dma32_dma_ops = {
+static const struct dma_map_ops dma32_dma_ops = {
 	.alloc_coherent		= dma32_alloc_coherent,
 	.free_coherent		= dma32_free_coherent,
 	.map_page		= dma32_map_page,
@@ -174,5 +178,5 @@ static const struct dma_ops dma32_dma_ops = {
 	.sync_sg_for_device	= dma32_sync_sg_for_device,
 };
 
-const struct dma_ops *dma_ops = &dma32_dma_ops;
+const struct dma_map_ops *dma_ops = &dma32_dma_ops;
 EXPORT_SYMBOL(dma_ops);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 0aeaefe696b..a9f0ad95518 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -353,7 +353,8 @@ static void dma_4u_free_coherent(struct device *dev, size_t size,
 
 static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page,
 				  unsigned long offset, size_t sz,
-				  enum dma_data_direction direction)
+				  enum dma_data_direction direction,
+				  struct dma_attrs *attrs)
 {
 	struct iommu *iommu;
 	struct strbuf *strbuf;
@@ -474,7 +475,8 @@ do_flush_sync:
 }
 
 static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
-			      size_t sz, enum dma_data_direction direction)
+			      size_t sz, enum dma_data_direction direction,
+			      struct dma_attrs *attrs)
 {
 	struct iommu *iommu;
 	struct strbuf *strbuf;
@@ -520,7 +522,8 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
 }
 
 static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist,
-			 int nelems, enum dma_data_direction direction)
+			 int nelems, enum dma_data_direction direction,
+			 struct dma_attrs *attrs)
 {
 	struct scatterlist *s, *outs, *segstart;
 	unsigned long flags, handle, prot, ctx;
@@ -691,7 +694,8 @@ static unsigned long fetch_sg_ctx(struct iommu *iommu, struct scatterlist *sg)
 }
 
 static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist,
-			    int nelems, enum dma_data_direction direction)
+			    int nelems, enum dma_data_direction direction,
+			    struct dma_attrs *attrs)
 {
 	unsigned long flags, ctx;
 	struct scatterlist *sg;
@@ -822,7 +826,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev,
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-static const struct dma_ops sun4u_dma_ops = {
+static const struct dma_map_ops sun4u_dma_ops = {
 	.alloc_coherent		= dma_4u_alloc_coherent,
 	.free_coherent		= dma_4u_free_coherent,
 	.map_page		= dma_4u_map_page,
@@ -833,7 +837,7 @@ static const struct dma_ops sun4u_dma_ops = {
 	.sync_sg_for_cpu	= dma_4u_sync_sg_for_cpu,
 };
 
-const struct dma_ops *dma_ops = &sun4u_dma_ops;
+const struct dma_map_ops *dma_ops = &sun4u_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 int dma_supported(struct device *dev, u64 device_mask)
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 2485eaa2310..c4f7dce577d 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -232,7 +232,8 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu,
 
 static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
 				  unsigned long offset, size_t sz,
-				  enum dma_data_direction direction)
+				  enum dma_data_direction direction,
+				  struct dma_attrs *attrs)
 {
 	struct iommu *iommu;
 	unsigned long flags, npages, oaddr;
@@ -296,7 +297,8 @@ iommu_map_fail:
 }
 
 static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
-			      size_t sz, enum dma_data_direction direction)
+			      size_t sz, enum dma_data_direction direction,
+			      struct dma_attrs *attrs)
 {
 	struct pci_pbm_info *pbm;
 	struct iommu *iommu;
@@ -336,7 +338,8 @@ static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
 }
 
 static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
-			 int nelems, enum dma_data_direction direction)
+			 int nelems, enum dma_data_direction direction,
+			 struct dma_attrs *attrs)
 {
 	struct scatterlist *s, *outs, *segstart;
 	unsigned long flags, handle, prot;
@@ -478,7 +481,8 @@ iommu_map_failed:
 }
 
 static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
-			    int nelems, enum dma_data_direction direction)
+			    int nelems, enum dma_data_direction direction,
+			    struct dma_attrs *attrs)
 {
 	struct pci_pbm_info *pbm;
 	struct scatterlist *sg;
@@ -535,7 +539,7 @@ static void dma_4v_sync_sg_for_cpu(struct device *dev,
 	/* Nothing to do... */
 }
 
-static const struct dma_ops sun4v_dma_ops = {
+static const struct dma_map_ops sun4v_dma_ops = {
 	.alloc_coherent			= dma_4v_alloc_coherent,
 	.free_coherent			= dma_4v_free_coherent,
 	.map_page			= dma_4v_map_page,
-- 
cgit v1.2.3-70-g09d2


From 02f7a18935eef0e56d9180fc3c35da6aad1ff3bb Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 10 Aug 2009 11:53:13 +0900
Subject: sparc: Use asm-generic/dma-mapping-common.h

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Robert Reif <reif@earthlink.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1249872797-1314-5-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/Kconfig                   |   1 +
 arch/sparc/include/asm/dma-mapping.h | 107 +++++------------------------------
 arch/sparc/kernel/dma.c              |   4 +-
 arch/sparc/kernel/iommu.c            |   4 +-
 arch/sparc/kernel/pci_sun4v.c        |   2 +-
 5 files changed, 19 insertions(+), 99 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eab..5f2df99645c 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,6 +25,7 @@ config SPARC
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select RTC_CLASS
 	select RTC_DRV_M48T59
+	select HAVE_DMA_ATTRS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 893f3ecc975..34c92264208 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -3,6 +3,7 @@
 
 #include <linux/scatterlist.h>
 #include <linux/mm.h>
+#include <linux/dma-debug.h>
 
 #define DMA_ERROR_CODE	(~(dma_addr_t)0x0)
 
@@ -13,113 +14,31 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 #define dma_is_consistent(d, h)	(1)
 
-extern const struct dma_map_ops *dma_ops;
+extern struct dma_map_ops *dma_ops;
 
-static inline void *dma_alloc_coherent(struct device *dev, size_t size,
-				       dma_addr_t *dma_handle, gfp_t flag)
-{
-	return dma_ops->alloc_coherent(dev, size, dma_handle, flag);
-}
-
-static inline void dma_free_coherent(struct device *dev, size_t size,
-				     void *cpu_addr, dma_addr_t dma_handle)
-{
-	dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
-}
-
-static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
-					size_t size,
-					enum dma_data_direction direction)
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-	return dma_ops->map_page(dev, virt_to_page(cpu_addr),
-				 (unsigned long)cpu_addr & ~PAGE_MASK, size,
-				 direction, NULL);
+	return dma_ops;
 }
 
-static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
-				    size_t size,
-				    enum dma_data_direction direction)
-{
-	dma_ops->unmap_page(dev, dma_addr, size, direction, NULL);
-}
-
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
-				      unsigned long offset, size_t size,
-				      enum dma_data_direction direction)
-{
-	return dma_ops->map_page(dev, page, offset, size, direction, NULL);
-}
+#include <asm-generic/dma-mapping-common.h>
 
-static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address,
-				  size_t size,
-				  enum dma_data_direction direction)
-{
-	dma_ops->unmap_page(dev, dma_address, size, direction, NULL);
-}
-
-static inline int dma_map_sg(struct device *dev, struct scatterlist *sg,
-			     int nents, enum dma_data_direction direction)
-{
-	return dma_ops->map_sg(dev, sg, nents, direction, NULL);
-}
-
-static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-				int nents, enum dma_data_direction direction)
-{
-	dma_ops->unmap_sg(dev, sg, nents, direction, NULL);
-}
-
-static inline void dma_sync_single_for_cpu(struct device *dev,
-					   dma_addr_t dma_handle, size_t size,
-					   enum dma_data_direction direction)
-{
-	dma_ops->sync_single_for_cpu(dev, dma_handle, size, direction);
-}
-
-static inline void dma_sync_single_for_device(struct device *dev,
-					      dma_addr_t dma_handle,
-					      size_t size,
-					      enum dma_data_direction direction)
-{
-	if (dma_ops->sync_single_for_device)
-		dma_ops->sync_single_for_device(dev, dma_handle, size,
-						direction);
-}
-
-static inline void dma_sync_sg_for_cpu(struct device *dev,
-				       struct scatterlist *sg, int nelems,
-				       enum dma_data_direction direction)
+static inline void *dma_alloc_coherent(struct device *dev, size_t size,
+				       dma_addr_t *dma_handle, gfp_t flag)
 {
-	dma_ops->sync_sg_for_cpu(dev, sg, nelems, direction);
-}
+	struct dma_map_ops *ops = get_dma_ops(dev);
 
-static inline void dma_sync_sg_for_device(struct device *dev,
-					  struct scatterlist *sg, int nelems,
-					  enum dma_data_direction direction)
-{
-	if (dma_ops->sync_sg_for_device)
-		dma_ops->sync_sg_for_device(dev, sg, nelems, direction);
+	return ops->alloc_coherent(dev, size, dma_handle, flag);
 }
 
-static inline void dma_sync_single_range_for_cpu(struct device *dev,
-						 dma_addr_t dma_handle,
-						 unsigned long offset,
-						 size_t size,
-						 enum dma_data_direction dir)
+static inline void dma_free_coherent(struct device *dev, size_t size,
+				     void *cpu_addr, dma_addr_t dma_handle)
 {
-	dma_sync_single_for_cpu(dev, dma_handle+offset, size, dir);
-}
+	struct dma_map_ops *ops = get_dma_ops(dev);
 
-static inline void dma_sync_single_range_for_device(struct device *dev,
-						    dma_addr_t dma_handle,
-						    unsigned long offset,
-						    size_t size,
-						    enum dma_data_direction dir)
-{
-	dma_sync_single_for_device(dev, dma_handle+offset, size, dir);
+	ops->free_coherent(dev, size, cpu_addr, dma_handle);
 }
 
-
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return (dma_addr == DMA_ERROR_CODE);
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
index 473a3fc7ab5..15820a91817 100644
--- a/arch/sparc/kernel/dma.c
+++ b/arch/sparc/kernel/dma.c
@@ -165,7 +165,7 @@ static void dma32_sync_sg_for_device(struct device *dev,
 	BUG();
 }
 
-static const struct dma_map_ops dma32_dma_ops = {
+static struct dma_map_ops dma32_dma_ops = {
 	.alloc_coherent		= dma32_alloc_coherent,
 	.free_coherent		= dma32_free_coherent,
 	.map_page		= dma32_map_page,
@@ -178,5 +178,5 @@ static const struct dma_map_ops dma32_dma_ops = {
 	.sync_sg_for_device	= dma32_sync_sg_for_device,
 };
 
-const struct dma_map_ops *dma_ops = &dma32_dma_ops;
+struct dma_map_ops *dma_ops = &dma32_dma_ops;
 EXPORT_SYMBOL(dma_ops);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index a9f0ad95518..74b289cab55 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -826,7 +826,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev,
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-static const struct dma_map_ops sun4u_dma_ops = {
+static struct dma_map_ops sun4u_dma_ops = {
 	.alloc_coherent		= dma_4u_alloc_coherent,
 	.free_coherent		= dma_4u_free_coherent,
 	.map_page		= dma_4u_map_page,
@@ -837,7 +837,7 @@ static const struct dma_map_ops sun4u_dma_ops = {
 	.sync_sg_for_cpu	= dma_4u_sync_sg_for_cpu,
 };
 
-const struct dma_map_ops *dma_ops = &sun4u_dma_ops;
+struct dma_map_ops *dma_ops = &sun4u_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 int dma_supported(struct device *dev, u64 device_mask)
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index c4f7dce577d..ee800f92792 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -539,7 +539,7 @@ static void dma_4v_sync_sg_for_cpu(struct device *dev,
 	/* Nothing to do... */
 }
 
-static const struct dma_map_ops sun4v_dma_ops = {
+static struct dma_map_ops sun4v_dma_ops = {
 	.alloc_coherent			= dma_4v_alloc_coherent,
 	.free_coherent			= dma_4v_free_coherent,
 	.map_page			= dma_4v_map_page,
-- 
cgit v1.2.3-70-g09d2


From 595cc8560783ea31ed1208dc1f97282a2a5606b7 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 10 Aug 2009 11:53:14 +0900
Subject: sparc: Remove no-op dma_4v_sync_single_for_cpu and
 dma_4v_sync_sg_for_cpu

Now sparc uses include/asm-generic/dma-mapping-common.h.
pci_sun4v.c doesn't need to have no-op
dma_4v_sync_single_for_cpu and dma_4v_sync_sg_for_cpu
(dma-mapping-common.h does nothing if sync_{single|sg}_for_cpu
hook is not defined). So we can remove them safely.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Robert Reif <reif@earthlink.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1249872797-1314-6-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/kernel/pci_sun4v.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index ee800f92792..23c33ff9c31 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -525,20 +525,6 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-static void dma_4v_sync_single_for_cpu(struct device *dev,
-				       dma_addr_t bus_addr, size_t sz,
-				       enum dma_data_direction direction)
-{
-	/* Nothing to do... */
-}
-
-static void dma_4v_sync_sg_for_cpu(struct device *dev,
-				   struct scatterlist *sglist, int nelems,
-				   enum dma_data_direction direction)
-{
-	/* Nothing to do... */
-}
-
 static struct dma_map_ops sun4v_dma_ops = {
 	.alloc_coherent			= dma_4v_alloc_coherent,
 	.free_coherent			= dma_4v_free_coherent,
@@ -546,8 +532,6 @@ static struct dma_map_ops sun4v_dma_ops = {
 	.unmap_page			= dma_4v_unmap_page,
 	.map_sg				= dma_4v_map_sg,
 	.unmap_sg			= dma_4v_unmap_sg,
-	.sync_single_for_cpu		= dma_4v_sync_single_for_cpu,
-	.sync_sg_for_cpu		= dma_4v_sync_sg_for_cpu,
 };
 
 static void __devinit pci_sun4v_scan_bus(struct pci_pbm_info *pbm,
-- 
cgit v1.2.3-70-g09d2


From c2c07dbd8742a26ab3f1ee8b82237a060a0d9f61 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 10 Aug 2009 11:53:15 +0900
Subject: sparc: Replace sbus_map_single and sbus_unmap_single with
 sbus_map_page and sbus_unmap_page

This is a preparation for using asm-generic/pci-dma-compat.h;
SPARC32 has two dma_map_ops structures for pci and sbus
(removing arch/sparc/kernel/dma.c, PCI and SBUS DMA accessor).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Robert Reif <reif@earthlink.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1249872797-1314-7-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/kernel/dma.c    | 5 ++---
 arch/sparc/kernel/dma.h    | 6 +++---
 arch/sparc/kernel/ioport.c | 7 +++++--
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
index 15820a91817..a5d50dac735 100644
--- a/arch/sparc/kernel/dma.c
+++ b/arch/sparc/kernel/dma.c
@@ -68,8 +68,7 @@ static dma_addr_t dma32_map_page(struct device *dev, struct page *page,
 		return pci_map_page(to_pci_dev(dev), page, offset,
 				    size, (int)direction);
 #endif
-	return sbus_map_single(dev, page_address(page) + offset,
-			       size, (int)direction);
+	return sbus_map_page(dev, page, offset, size, (int)direction);
 }
 
 static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
@@ -83,7 +82,7 @@ static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
 		return;
 	}
 #endif
-	sbus_unmap_single(dev, dma_address, size, (int)direction);
+	sbus_unmap_page(dev, dma_address, size, (int)direction);
 }
 
 static int dma32_map_sg(struct device *dev, struct scatterlist *sg,
diff --git a/arch/sparc/kernel/dma.h b/arch/sparc/kernel/dma.h
index f8d8951adb5..680351ee0d4 100644
--- a/arch/sparc/kernel/dma.h
+++ b/arch/sparc/kernel/dma.h
@@ -1,8 +1,8 @@
 void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp);
 void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba);
-dma_addr_t sbus_map_single(struct device *dev, void *va,
-			   size_t len, int direction);
-void sbus_unmap_single(struct device *dev, dma_addr_t ba,
+dma_addr_t sbus_map_page(struct device *dev, struct page *page,
+			 unsigned long offset, size_t len, int direction);
+void sbus_unmap_page(struct device *dev, dma_addr_t ba,
 		       size_t n, int direction);
 int sbus_map_sg(struct device *dev, struct scatterlist *sg,
 		int n, int direction);
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 87ea0d03d97..39ff1e0c518 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -337,8 +337,11 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
  * CPU view of this memory may be inconsistent with
  * a device view and explicit flushing is necessary.
  */
-dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int direction)
+dma_addr_t sbus_map_page(struct device *dev, struct page *page,
+			 unsigned long offset, size_t len, int direction)
 {
+	void *va = page_address(page) + offset;
+
 	/* XXX why are some lengths signed, others unsigned? */
 	if (len <= 0) {
 		return 0;
@@ -350,7 +353,7 @@ dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int directi
 	return mmu_get_scsi_one(dev, va, len);
 }
 
-void sbus_unmap_single(struct device *dev, dma_addr_t ba, size_t n, int direction)
+void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n, int direction)
 {
 	mmu_release_scsi_one(dev, ba, n);
 }
-- 
cgit v1.2.3-70-g09d2


From ee664a9252d24ef10317d1bba8fc8f4c6495b36c Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 10 Aug 2009 11:53:16 +0900
Subject: sparc: Use asm-generic/pci-dma-compat

This converts SPARC to use asm-generic/pci-dma-compat instead
of the homegrown mechnism.

SPARC32 has two dma_map_ops structures for pci and sbus
(removing arch/sparc/kernel/dma.c, PCI and SBUS DMA accessor).
The global 'dma_ops' is set to sbus_dma_ops and get_dma_ops()
returns pci32_dma_ops for pci devices so we can use the
appropriate dma mapping operations.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Robert Reif <reif@earthlink.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1249872797-1314-8-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/include/asm/dma-mapping.h |   7 +-
 arch/sparc/include/asm/pci.h         |   3 +
 arch/sparc/include/asm/pci_32.h      | 105 -----------------------
 arch/sparc/include/asm/pci_64.h      |  88 -------------------
 arch/sparc/kernel/dma.c              | 155 ++-------------------------------
 arch/sparc/kernel/dma.h              |  14 ---
 arch/sparc/kernel/iommu.c            |   4 +-
 arch/sparc/kernel/ioport.c           | 162 ++++++++++++++++-------------------
 arch/sparc/kernel/pci.c              |   2 +-
 9 files changed, 96 insertions(+), 444 deletions(-)
 delete mode 100644 arch/sparc/kernel/dma.h

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 34c92264208..2677818dc78 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -14,10 +14,15 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 #define dma_is_consistent(d, h)	(1)
 
-extern struct dma_map_ops *dma_ops;
+extern struct dma_map_ops *dma_ops, pci32_dma_ops;
+extern struct bus_type pci_bus_type;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
+#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
+	if (dev->bus == &pci_bus_type)
+		return &pci32_dma_ops;
+#endif
 	return dma_ops;
 }
 
diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h
index 6e14fd17933..d9c031f9910 100644
--- a/arch/sparc/include/asm/pci.h
+++ b/arch/sparc/include/asm/pci.h
@@ -5,4 +5,7 @@
 #else
 #include <asm/pci_32.h>
 #endif
+
+#include <asm-generic/pci-dma-compat.h>
+
 #endif
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index b41c4c19815..ac0e8369fd9 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -31,42 +31,8 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
  */
 #define PCI_DMA_BUS_IS_PHYS	(0)
 
-#include <asm/scatterlist.h>
-
 struct pci_dev;
 
-/* Allocate and map kernel buffer using consistent mode DMA for a device.
- * hwdev should be valid struct pci_dev pointer for PCI devices.
- */
-extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle);
-
-/* Free and unmap a consistent DMA buffer.
- * cpu_addr is what was returned from pci_alloc_consistent,
- * size must be the same as what as passed into pci_alloc_consistent,
- * and likewise dma_addr must be the same as what *dma_addrp was set to.
- *
- * References to the memory and mappings assosciated with cpu_addr/dma_addr
- * past this call are illegal.
- */
-extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle);
-
-/* Map a single buffer of the indicated size for DMA in streaming mode.
- * The 32-bit bus address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory
- * until either pci_unmap_single or pci_dma_sync_single_for_cpu is performed.
- */
-extern dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction);
-
-/* Unmap a single streaming mode DMA translation.  The dma_addr and size
- * must match what was provided for in a previous pci_map_single call.  All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction);
-
 /* pci_unmap_{single,page} is not a nop, thus... */
 #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	\
 	dma_addr_t ADDR_NAME;
@@ -81,69 +47,6 @@ extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)		\
 	(((PTR)->LEN_NAME) = (VAL))
 
-/*
- * Same as above, only with pages instead of mapped addresses.
- */
-extern dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
-			unsigned long offset, size_t size, int direction);
-extern void pci_unmap_page(struct pci_dev *hwdev,
-			dma_addr_t dma_address, size_t size, int direction);
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA.  This is the scather-gather version of the
- * above pci_map_single interface.  Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length.  They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- *       DMA address/length pairs than there are SG table elements.
- *       (for example via virtual mapping capabilities)
- *       The routine returns the number of addr/length pairs actually
- *       used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-extern int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction);
-
-/* Unmap a set of streaming mode DMA translations.
- * Again, cpu read rules concerning calls here are the same as for
- * pci_unmap_single() above.
- */
-extern void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nhwents, int direction);
-
-/* Make physical memory consistent for a single
- * streaming mode DMA translation after a transfer.
- *
- * If you perform a pci_map_single() but wish to interrogate the
- * buffer using the cpu, yet do not wish to teardown the PCI dma
- * mapping, you must call this function before doing so.  At the
- * next point you give the PCI dma address back to the card, you
- * must first perform a pci_dma_sync_for_device, and then the device
- * again owns the buffer.
- */
-extern void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
-extern void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
-
-/* Make physical memory consistent for a set of streaming
- * mode DMA translations after a transfer.
- *
- * The same as pci_dma_sync_single_* but for a scatter-gather list,
- * same rules and usage.
- */
-extern void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
-extern void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
-
-/* Return whether the given PCI device DMA address mask can
- * be supported properly.  For example, if your device can
- * only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
-{
-	return 1;
-}
-
 #ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
@@ -154,14 +57,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 }
 #endif
 
-#define PCI_DMA_ERROR_CODE      (~(dma_addr_t)0x0)
-
-static inline int pci_dma_mapping_error(struct pci_dev *pdev,
-					dma_addr_t dma_addr)
-{
-        return (dma_addr == PCI_DMA_ERROR_CODE);
-}
-
 struct device_node;
 extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev);
 
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index 7a1e3566e59..5cc9f6aa549 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -35,37 +35,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
  */
 #define PCI_DMA_BUS_IS_PHYS	(0)
 
-static inline void *pci_alloc_consistent(struct pci_dev *pdev, size_t size,
-					 dma_addr_t *dma_handle)
-{
-	return dma_alloc_coherent(&pdev->dev, size, dma_handle, GFP_ATOMIC);
-}
-
-static inline void pci_free_consistent(struct pci_dev *pdev, size_t size,
-				       void *vaddr, dma_addr_t dma_handle)
-{
-	return dma_free_coherent(&pdev->dev, size, vaddr, dma_handle);
-}
-
-static inline dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr,
-					size_t size, int direction)
-{
-	return dma_map_single(&pdev->dev, ptr, size,
-			      (enum dma_data_direction) direction);
-}
-
-static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
-				    size_t size, int direction)
-{
-	dma_unmap_single(&pdev->dev, dma_addr, size,
-			 (enum dma_data_direction) direction);
-}
-
-#define pci_map_page(dev, page, off, size, dir) \
-	pci_map_single(dev, (page_address(page) + (off)), size, dir)
-#define pci_unmap_page(dev,addr,sz,dir) \
-	pci_unmap_single(dev,addr,sz,dir)
-
 /* pci_unmap_{single,page} is not a nop, thus... */
 #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	\
 	dma_addr_t ADDR_NAME;
@@ -80,57 +49,6 @@ static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)		\
 	(((PTR)->LEN_NAME) = (VAL))
 
-static inline int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg,
-			     int nents, int direction)
-{
-	return dma_map_sg(&pdev->dev, sg, nents,
-			  (enum dma_data_direction) direction);
-}
-
-static inline void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg,
-				int nents, int direction)
-{
-	dma_unmap_sg(&pdev->dev, sg, nents,
-		     (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_single_for_cpu(struct pci_dev *pdev,
-					       dma_addr_t dma_handle,
-					       size_t size, int direction)
-{
-	dma_sync_single_for_cpu(&pdev->dev, dma_handle, size,
-				(enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_single_for_device(struct pci_dev *pdev,
-						  dma_addr_t dma_handle,
-						  size_t size, int direction)
-{
-	/* No flushing needed to sync cpu writes to the device.  */
-}
-
-static inline void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev,
-					   struct scatterlist *sg,
-					   int nents, int direction)
-{
-	dma_sync_sg_for_cpu(&pdev->dev, sg, nents,
-			    (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_sg_for_device(struct pci_dev *pdev,
-					      struct scatterlist *sg,
-					      int nelems, int direction)
-{
-	/* No flushing needed to sync cpu writes to the device.  */
-}
-
-/* Return whether the given PCI device DMA address mask can
- * be supported properly.  For example, if your device can
- * only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
-
 /* PCI IOMMU mapping bypass support. */
 
 /* PCI 64-bit addressing works for all slots on all controller
@@ -140,12 +58,6 @@ extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
 #define PCI64_REQUIRED_MASK	(~(dma64_addr_t)0)
 #define PCI64_ADDR_BASE		0xfffc000000000000UL
 
-static inline int pci_dma_mapping_error(struct pci_dev *pdev,
-					dma_addr_t dma_addr)
-{
-	return dma_mapping_error(&pdev->dev, dma_addr);
-}
-
 #ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
index a5d50dac735..b2fa3127f60 100644
--- a/arch/sparc/kernel/dma.c
+++ b/arch/sparc/kernel/dma.c
@@ -13,13 +13,17 @@
 #include <linux/pci.h>
 #endif
 
-#include "dma.h"
-
+/*
+ * Return whether the given PCI device DMA address mask can be
+ * supported properly.  For example, if your device can only drive the
+ * low 24-bits during PCI bus mastering, then you would pass
+ * 0x00ffffff as the mask to this function.
+ */
 int dma_supported(struct device *dev, u64 mask)
 {
 #ifdef CONFIG_PCI
 	if (dev->bus == &pci_bus_type)
-		return pci_dma_supported(to_pci_dev(dev), mask);
+		return 1;
 #endif
 	return 0;
 }
@@ -34,148 +38,3 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 	return -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(dma_set_mask);
-
-static void *dma32_alloc_coherent(struct device *dev, size_t size,
-				  dma_addr_t *dma_handle, gfp_t flag)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type)
-		return pci_alloc_consistent(to_pci_dev(dev), size, dma_handle);
-#endif
-	return sbus_alloc_consistent(dev, size, dma_handle);
-}
-
-static void dma32_free_coherent(struct device *dev, size_t size,
-				void *cpu_addr, dma_addr_t dma_handle)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type) {
-		pci_free_consistent(to_pci_dev(dev), size,
-				    cpu_addr, dma_handle);
-		return;
-	}
-#endif
-	sbus_free_consistent(dev, size, cpu_addr, dma_handle);
-}
-
-static dma_addr_t dma32_map_page(struct device *dev, struct page *page,
-				 unsigned long offset, size_t size,
-				 enum dma_data_direction direction,
-				 struct dma_attrs *attrs)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type)
-		return pci_map_page(to_pci_dev(dev), page, offset,
-				    size, (int)direction);
-#endif
-	return sbus_map_page(dev, page, offset, size, (int)direction);
-}
-
-static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
-			     size_t size, enum dma_data_direction direction,
-			     struct dma_attrs *attrs)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type) {
-		pci_unmap_page(to_pci_dev(dev), dma_address,
-			       size, (int)direction);
-		return;
-	}
-#endif
-	sbus_unmap_page(dev, dma_address, size, (int)direction);
-}
-
-static int dma32_map_sg(struct device *dev, struct scatterlist *sg,
-			int nents, enum dma_data_direction direction,
-			struct dma_attrs *attrs)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type)
-		return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction);
-#endif
-	return sbus_map_sg(dev, sg, nents, direction);
-}
-
-void dma32_unmap_sg(struct device *dev, struct scatterlist *sg,
-		    int nents, enum dma_data_direction direction,
-		    struct dma_attrs *attrs)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type) {
-		pci_unmap_sg(to_pci_dev(dev), sg, nents, (int)direction);
-		return;
-	}
-#endif
-	sbus_unmap_sg(dev, sg, nents, (int)direction);
-}
-
-static void dma32_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
-				      size_t size,
-				      enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type) {
-		pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle,
-					    size, (int)direction);
-		return;
-	}
-#endif
-	sbus_dma_sync_single_for_cpu(dev, dma_handle, size, (int) direction);
-}
-
-static void dma32_sync_single_for_device(struct device *dev,
-					 dma_addr_t dma_handle, size_t size,
-					 enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type) {
-		pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle,
-					       size, (int)direction);
-		return;
-	}
-#endif
-	sbus_dma_sync_single_for_device(dev, dma_handle, size, (int) direction);
-}
-
-static void dma32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
-				  int nelems, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type) {
-		pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg,
-					nelems, (int)direction);
-		return;
-	}
-#endif
-	BUG();
-}
-
-static void dma32_sync_sg_for_device(struct device *dev,
-				     struct scatterlist *sg, int nelems,
-				     enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type) {
-		pci_dma_sync_sg_for_device(to_pci_dev(dev), sg,
-					   nelems, (int)direction);
-		return;
-	}
-#endif
-	BUG();
-}
-
-static struct dma_map_ops dma32_dma_ops = {
-	.alloc_coherent		= dma32_alloc_coherent,
-	.free_coherent		= dma32_free_coherent,
-	.map_page		= dma32_map_page,
-	.unmap_page		= dma32_unmap_page,
-	.map_sg			= dma32_map_sg,
-	.unmap_sg		= dma32_unmap_sg,
-	.sync_single_for_cpu	= dma32_sync_single_for_cpu,
-	.sync_single_for_device	= dma32_sync_single_for_device,
-	.sync_sg_for_cpu	= dma32_sync_sg_for_cpu,
-	.sync_sg_for_device	= dma32_sync_sg_for_device,
-};
-
-struct dma_map_ops *dma_ops = &dma32_dma_ops;
-EXPORT_SYMBOL(dma_ops);
diff --git a/arch/sparc/kernel/dma.h b/arch/sparc/kernel/dma.h
deleted file mode 100644
index 680351ee0d4..00000000000
--- a/arch/sparc/kernel/dma.h
+++ /dev/null
@@ -1,14 +0,0 @@
-void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp);
-void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba);
-dma_addr_t sbus_map_page(struct device *dev, struct page *page,
-			 unsigned long offset, size_t len, int direction);
-void sbus_unmap_page(struct device *dev, dma_addr_t ba,
-		       size_t n, int direction);
-int sbus_map_sg(struct device *dev, struct scatterlist *sg,
-		int n, int direction);
-void sbus_unmap_sg(struct device *dev, struct scatterlist *sg,
-		   int n, int direction);
-void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
-				  size_t size, int direction);
-void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba,
-				     size_t size, int direction);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 74b289cab55..7690cc219ec 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -840,6 +840,8 @@ static struct dma_map_ops sun4u_dma_ops = {
 struct dma_map_ops *dma_ops = &sun4u_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
+extern int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask);
+
 int dma_supported(struct device *dev, u64 device_mask)
 {
 	struct iommu *iommu = dev->archdata.iommu;
@@ -853,7 +855,7 @@ int dma_supported(struct device *dev, u64 device_mask)
 
 #ifdef CONFIG_PCI
 	if (dev->bus == &pci_bus_type)
-		return pci_dma_supported(to_pci_dev(dev), device_mask);
+		return pci64_dma_supported(to_pci_dev(dev), device_mask);
 #endif
 
 	return 0;
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 39ff1e0c518..1eb60438965 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -48,8 +48,6 @@
 #include <asm/iommu.h>
 #include <asm/io-unit.h>
 
-#include "dma.h"
-
 #define mmu_inval_dma_area(p, l)	/* Anton pulled it out for 2.4.0-xx */
 
 static struct resource *_sparc_find_resource(struct resource *r,
@@ -246,7 +244,8 @@ EXPORT_SYMBOL(sbus_set_sbus64);
  * Typically devices use them for control blocks.
  * CPU may access them without any explicit flushing.
  */
-void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp)
+static void *sbus_alloc_coherent(struct device *dev, size_t len,
+				 dma_addr_t *dma_addrp, gfp_t gfp)
 {
 	struct of_device *op = to_of_device(dev);
 	unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
@@ -299,7 +298,8 @@ err_nopages:
 	return NULL;
 }
 
-void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
+static void sbus_free_coherent(struct device *dev, size_t n, void *p,
+			       dma_addr_t ba)
 {
 	struct resource *res;
 	struct page *pgv;
@@ -317,7 +317,7 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
 
 	n = (n + PAGE_SIZE-1) & PAGE_MASK;
 	if ((res->end-res->start)+1 != n) {
-		printk("sbus_free_consistent: region 0x%lx asked 0x%lx\n",
+		printk("sbus_free_consistent: region 0x%lx asked 0x%zx\n",
 		    (long)((res->end-res->start)+1), n);
 		return;
 	}
@@ -337,8 +337,10 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
  * CPU view of this memory may be inconsistent with
  * a device view and explicit flushing is necessary.
  */
-dma_addr_t sbus_map_page(struct device *dev, struct page *page,
-			 unsigned long offset, size_t len, int direction)
+static dma_addr_t sbus_map_page(struct device *dev, struct page *page,
+				unsigned long offset, size_t len,
+				enum dma_data_direction dir,
+				struct dma_attrs *attrs)
 {
 	void *va = page_address(page) + offset;
 
@@ -353,12 +355,14 @@ dma_addr_t sbus_map_page(struct device *dev, struct page *page,
 	return mmu_get_scsi_one(dev, va, len);
 }
 
-void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n, int direction)
+static void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n,
+			    enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	mmu_release_scsi_one(dev, ba, n);
 }
 
-int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction)
+static int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n,
+		       enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	mmu_get_scsi_sgl(dev, sg, n);
 
@@ -369,19 +373,38 @@ int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction
 	return n;
 }
 
-void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n, int direction)
+static void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n,
+			  enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	mmu_release_scsi_sgl(dev, sg, n);
 }
 
-void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba, size_t size, int direction)
+static void sbus_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+				 int n,	enum dma_data_direction dir)
 {
+	BUG();
 }
 
-void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba, size_t size, int direction)
+static void sbus_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+				    int n, enum dma_data_direction dir)
 {
+	BUG();
 }
 
+struct dma_map_ops sbus_dma_ops = {
+	.alloc_coherent		= sbus_alloc_coherent,
+	.free_coherent		= sbus_free_coherent,
+	.map_page		= sbus_map_page,
+	.unmap_page		= sbus_unmap_page,
+	.map_sg			= sbus_map_sg,
+	.unmap_sg		= sbus_unmap_sg,
+	.sync_sg_for_cpu	= sbus_sync_sg_for_cpu,
+	.sync_sg_for_device	= sbus_sync_sg_for_device,
+};
+
+struct dma_map_ops *dma_ops = &sbus_dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
 static int __init sparc_register_ioport(void)
 {
 	register_proc_sparc_ioport();
@@ -398,7 +421,8 @@ arch_initcall(sparc_register_ioport);
 /* Allocate and map kernel buffer using consistent mode DMA for a device.
  * hwdev should be valid struct pci_dev pointer for PCI devices.
  */
-void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba)
+static void *pci32_alloc_coherent(struct device *dev, size_t len,
+				  dma_addr_t *pba, gfp_t gfp)
 {
 	unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
 	unsigned long va;
@@ -442,7 +466,6 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba)
 	*pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */
 	return (void *) res->start;
 }
-EXPORT_SYMBOL(pci_alloc_consistent);
 
 /* Free and unmap a consistent DMA buffer.
  * cpu_addr is what was returned from pci_alloc_consistent,
@@ -452,7 +475,8 @@ EXPORT_SYMBOL(pci_alloc_consistent);
  * References to the memory and mappings associated with cpu_addr/dma_addr
  * past this call are illegal.
  */
-void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
+static void pci32_free_coherent(struct device *dev, size_t n, void *p,
+				dma_addr_t ba)
 {
 	struct resource *res;
 	unsigned long pgp;
@@ -484,60 +508,18 @@ void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
 
 	free_pages(pgp, get_order(n));
 }
-EXPORT_SYMBOL(pci_free_consistent);
-
-/* Map a single buffer of the indicated size for DMA in streaming mode.
- * The 32-bit bus address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory
- * until either pci_unmap_single or pci_dma_sync_single_* is performed.
- */
-dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
-    int direction)
-{
-	BUG_ON(direction == PCI_DMA_NONE);
-	/* IIep is write-through, not flushing. */
-	return virt_to_phys(ptr);
-}
-EXPORT_SYMBOL(pci_map_single);
-
-/* Unmap a single streaming mode DMA translation.  The dma_addr and size
- * must match what was provided for in a previous pci_map_single call.  All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size,
-    int direction)
-{
-	BUG_ON(direction == PCI_DMA_NONE);
-	if (direction != PCI_DMA_TODEVICE) {
-		mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
-		    (size + PAGE_SIZE-1) & PAGE_MASK);
-	}
-}
-EXPORT_SYMBOL(pci_unmap_single);
 
 /*
  * Same as pci_map_single, but with pages.
  */
-dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
-			unsigned long offset, size_t size, int direction)
+static dma_addr_t pci32_map_page(struct device *dev, struct page *page,
+				 unsigned long offset, size_t size,
+				 enum dma_data_direction dir,
+				 struct dma_attrs *attrs)
 {
-	BUG_ON(direction == PCI_DMA_NONE);
 	/* IIep is write-through, not flushing. */
 	return page_to_phys(page) + offset;
 }
-EXPORT_SYMBOL(pci_map_page);
-
-void pci_unmap_page(struct pci_dev *hwdev,
-			dma_addr_t dma_address, size_t size, int direction)
-{
-	BUG_ON(direction == PCI_DMA_NONE);
-	/* mmu_inval_dma_area XXX */
-}
-EXPORT_SYMBOL(pci_unmap_page);
 
 /* Map a set of buffers described by scatterlist in streaming
  * mode for DMA.  This is the scather-gather version of the
@@ -554,13 +536,13 @@ EXPORT_SYMBOL(pci_unmap_page);
  * Device ownership issues as mentioned above for pci_map_single are
  * the same here.
  */
-int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
-    int direction)
+static int pci32_map_sg(struct device *device, struct scatterlist *sgl,
+			int nents, enum dma_data_direction dir,
+			struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
 	int n;
 
-	BUG_ON(direction == PCI_DMA_NONE);
 	/* IIep is write-through, not flushing. */
 	for_each_sg(sgl, sg, nents, n) {
 		BUG_ON(page_address(sg_page(sg)) == NULL);
@@ -569,20 +551,19 @@ int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
 	}
 	return nents;
 }
-EXPORT_SYMBOL(pci_map_sg);
 
 /* Unmap a set of streaming mode DMA translations.
  * Again, cpu read rules concerning calls here are the same as for
  * pci_unmap_single() above.
  */
-void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
-    int direction)
+static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl,
+			   int nents, enum dma_data_direction dir,
+			   struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
 	int n;
 
-	BUG_ON(direction == PCI_DMA_NONE);
-	if (direction != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE) {
 		for_each_sg(sgl, sg, nents, n) {
 			BUG_ON(page_address(sg_page(sg)) == NULL);
 			mmu_inval_dma_area(
@@ -591,7 +572,6 @@ void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
 		}
 	}
 }
-EXPORT_SYMBOL(pci_unmap_sg);
 
 /* Make physical memory consistent for a single
  * streaming mode DMA translation before or after a transfer.
@@ -603,25 +583,23 @@ EXPORT_SYMBOL(pci_unmap_sg);
  * must first perform a pci_dma_sync_for_device, and then the
  * device again owns the buffer.
  */
-void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
+static void pci32_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
+				      size_t size, enum dma_data_direction dir)
 {
-	BUG_ON(direction == PCI_DMA_NONE);
-	if (direction != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE) {
 		mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
 		    (size + PAGE_SIZE-1) & PAGE_MASK);
 	}
 }
-EXPORT_SYMBOL(pci_dma_sync_single_for_cpu);
 
-void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
+static void pci32_sync_single_for_device(struct device *dev, dma_addr_t ba,
+					 size_t size, enum dma_data_direction dir)
 {
-	BUG_ON(direction == PCI_DMA_NONE);
-	if (direction != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE) {
 		mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
 		    (size + PAGE_SIZE-1) & PAGE_MASK);
 	}
 }
-EXPORT_SYMBOL(pci_dma_sync_single_for_device);
 
 /* Make physical memory consistent for a set of streaming
  * mode DMA translations after a transfer.
@@ -629,13 +607,13 @@ EXPORT_SYMBOL(pci_dma_sync_single_for_device);
  * The same as pci_dma_sync_single_* but for a scatter-gather list,
  * same rules and usage.
  */
-void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction)
+static void pci32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
+				  int nents, enum dma_data_direction dir)
 {
 	struct scatterlist *sg;
 	int n;
 
-	BUG_ON(direction == PCI_DMA_NONE);
-	if (direction != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE) {
 		for_each_sg(sgl, sg, nents, n) {
 			BUG_ON(page_address(sg_page(sg)) == NULL);
 			mmu_inval_dma_area(
@@ -644,15 +622,14 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int
 		}
 	}
 }
-EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu);
 
-void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction)
+static void pci32_sync_sg_for_device(struct device *device, struct scatterlist *sgl,
+				     int nents, enum dma_data_direction dir)
 {
 	struct scatterlist *sg;
 	int n;
 
-	BUG_ON(direction == PCI_DMA_NONE);
-	if (direction != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE) {
 		for_each_sg(sgl, sg, nents, n) {
 			BUG_ON(page_address(sg_page(sg)) == NULL);
 			mmu_inval_dma_area(
@@ -661,7 +638,20 @@ void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl,
 		}
 	}
 }
-EXPORT_SYMBOL(pci_dma_sync_sg_for_device);
+
+struct dma_map_ops pci32_dma_ops = {
+	.alloc_coherent		= pci32_alloc_coherent,
+	.free_coherent		= pci32_free_coherent,
+	.map_page		= pci32_map_page,
+	.map_sg			= pci32_map_sg,
+	.unmap_sg		= pci32_unmap_sg,
+	.sync_single_for_cpu	= pci32_sync_single_for_cpu,
+	.sync_single_for_device	= pci32_sync_single_for_device,
+	.sync_sg_for_cpu	= pci32_sync_sg_for_cpu,
+	.sync_sg_for_device	= pci32_sync_sg_for_device,
+};
+EXPORT_SYMBOL(pci32_dma_ops);
+
 #endif /* CONFIG_PCI */
 
 #ifdef CONFIG_PROC_FS
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 57859ad2354..c6864866280 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -1039,7 +1039,7 @@ static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit)
 	pci_dev_put(ali_isa_bridge);
 }
 
-int pci_dma_supported(struct pci_dev *pdev, u64 device_mask)
+int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask)
 {
 	u64 dma_addr_mask;
 
-- 
cgit v1.2.3-70-g09d2


From 451d7400a34cb679369e337d67f0238ed410f484 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 10 Aug 2009 11:53:17 +0900
Subject: sparc: Add CONFIG_DMA_API_DEBUG support

All we need to do for CONFIG_DMA_API_DEBUG support is call
dma_debug_init() in DMA code common for SPARC32 and SPARC64.

Now SPARC32 uses two dma_map_ops structures for pci and sbus so
there is not much dma stuff for SPARC32 in kernel/dma.c.
kernel/ioport.c also includes dma stuff for SPARC32. So let's
put all the dma stuff for SPARC32 in kernel/ioport.c and make
kernel/dma.c common for SPARC32 and SPARC64.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Robert Reif <reif@earthlink.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: tony.luck@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1249872797-1314-9-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/Kconfig                   |  1 +
 arch/sparc/include/asm/dma-mapping.h |  6 +++++-
 arch/sparc/kernel/Makefile           |  2 +-
 arch/sparc/kernel/dma.c              | 37 +++++-------------------------------
 arch/sparc/kernel/ioport.c           | 27 ++++++++++++++++++++++++++
 5 files changed, 39 insertions(+), 34 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 5f2df99645c..233cff53a62 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,6 +26,7 @@ config SPARC
 	select RTC_CLASS
 	select RTC_DRV_M48T59
 	select HAVE_DMA_ATTRS
+	select HAVE_DMA_API_DEBUG
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 2677818dc78..5a8c308e2b5 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -32,8 +32,11 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 				       dma_addr_t *dma_handle, gfp_t flag)
 {
 	struct dma_map_ops *ops = get_dma_ops(dev);
+	void *cpu_addr;
 
-	return ops->alloc_coherent(dev, size, dma_handle, flag);
+	cpu_addr = ops->alloc_coherent(dev, size, dma_handle, flag);
+	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+	return cpu_addr;
 }
 
 static inline void dma_free_coherent(struct device *dev, size_t size,
@@ -41,6 +44,7 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
 {
 	struct dma_map_ops *ops = get_dma_ops(dev);
 
+	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
 	ops->free_coherent(dev, size, cpu_addr, dma_handle);
 }
 
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 475ce4696ac..29b88a58066 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -61,7 +61,7 @@ obj-$(CONFIG_SPARC64_SMP) += cpumap.o
 obj-$(CONFIG_SPARC32)     += devres.o
 devres-y                  := ../../../kernel/irq/devres.o
 
-obj-$(CONFIG_SPARC32)     += dma.o
+obj-y                     += dma.o
 
 obj-$(CONFIG_SPARC32_PCI) += pcic.o
 
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
index b2fa3127f60..e1ba8ee21b9 100644
--- a/arch/sparc/kernel/dma.c
+++ b/arch/sparc/kernel/dma.c
@@ -1,40 +1,13 @@
-/* dma.c: PCI and SBUS DMA accessors for 32-bit sparc.
- *
- * Copyright (C) 2008 David S. Miller <davem@davemloft.net>
- */
-
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/dma-mapping.h>
-#include <linux/scatterlist.h>
-#include <linux/mm.h>
+#include <linux/dma-debug.h>
 
-#ifdef CONFIG_PCI
-#include <linux/pci.h>
-#endif
+#define PREALLOC_DMA_DEBUG_ENTRIES       (1 << 15)
 
-/*
- * Return whether the given PCI device DMA address mask can be
- * supported properly.  For example, if your device can only drive the
- * low 24-bits during PCI bus mastering, then you would pass
- * 0x00ffffff as the mask to this function.
- */
-int dma_supported(struct device *dev, u64 mask)
+static int __init dma_init(void)
 {
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type)
-		return 1;
-#endif
+	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
 	return 0;
 }
-EXPORT_SYMBOL(dma_supported);
-
-int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-#ifdef CONFIG_PCI
-	if (dev->bus == &pci_bus_type)
-		return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
-#endif
-	return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(dma_set_mask);
+fs_initcall(dma_init);
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 1eb60438965..edbea232c61 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -654,6 +654,33 @@ EXPORT_SYMBOL(pci32_dma_ops);
 
 #endif /* CONFIG_PCI */
 
+/*
+ * Return whether the given PCI device DMA address mask can be
+ * supported properly.  For example, if your device can only drive the
+ * low 24-bits during PCI bus mastering, then you would pass
+ * 0x00ffffff as the mask to this function.
+ */
+int dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PCI
+	if (dev->bus == &pci_bus_type)
+		return 1;
+#endif
+	return 0;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+#ifdef CONFIG_PCI
+	if (dev->bus == &pci_bus_type)
+		return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
+#endif
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+
 #ifdef CONFIG_PROC_FS
 
 static int
-- 
cgit v1.2.3-70-g09d2


From fd1e8a1fe2b54df6c185b4fa65f181f50b9c4d4e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:51 +0900
Subject: percpu: introduce pcpu_alloc_info and pcpu_group_info

Till now, non-linear cpu->unit map was expressed using an integer
array which maps each cpu to a unit and used only by lpage allocator.
Although how many units have been placed in a single contiguos area
(group) is known while building unit_map, the information is lost when
the result is recorded into the unit_map array.  For lpage allocator,
as all allocations are done by lpages and whether two adjacent lpages
are in the same group or not is irrelevant, this didn't cause any
problem.  Non-linear cpu->unit mapping will be used for sparse
embedding and this grouping information is necessary for that.

This patch introduces pcpu_alloc_info which contains all the
information necessary for initializing percpu allocator.
pcpu_alloc_info contains array of pcpu_group_info which describes how
units are grouped and mapped to cpus.  pcpu_group_info also has
base_offset field to specify its offset from the chunk's base address.
pcpu_build_alloc_info() initializes this field as if all groups are
allocated back-to-back as is currently done but this will be used to
sparsely place groups.

pcpu_alloc_info is a rather complex data structure which contains a
flexible array which in turn points to nested cpu_map arrays.

* pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to
  help dealing with pcpu_alloc_info.

* pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info,
  generalized and renamed to pcpu_build_alloc_info().
  @cpu_distance_fn may be NULL indicating that all cpus are of
  LOCAL_DISTANCE.

* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info,
  generalized and renamed to pcpu_dump_alloc_info().  It now also
  prints which group each alloc unit belongs to.

* pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the
  separate parameters.  All first chunk allocators are updated to use
  pcpu_build_alloc_info() to build alloc_info and call
  pcpu_setup_first_chunk() with it.  This has the side effect of
  packing units for sparse possible cpus.  ie. if cpus 0, 2 and 4 are
  possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4.

* x86 setup_pcpu_lpage() is updated to deal with alloc_info.

* sparc64 setup_per_cpu_areas() is updated to build alloc_info.

Although the changes made by this patch are pretty pervasive, it
doesn't cause any behavior difference other than packing of sparse
cpus.  It mostly changes how information is passed among
initialization functions and makes room for more flexibility.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c     |  24 +-
 arch/x86/kernel/setup_percpu.c |  38 ++-
 include/linux/percpu.h         |  42 +++-
 mm/percpu.c                    | 529 +++++++++++++++++++++++++----------------
 4 files changed, 389 insertions(+), 244 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 9856d866b77..a42a4a744d1 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1475,17 +1475,29 @@ static void __init pcpu_map_range(unsigned long start, unsigned long end,
 
 void __init setup_per_cpu_areas(void)
 {
-	size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
 	static struct vm_struct vm;
+	struct pcpu_alloc_info *ai;
 	unsigned long delta, cpu;
 	size_t size_sum, pcpu_unit_size;
 	size_t ptrs_size;
 	void **ptrs;
 
-	size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+	ai = pcpu_alloc_alloc_info(1, nr_cpu_ids);
+
+	ai->static_size = __per_cpu_end - __per_cpu_start;
+	ai->reserved_size = PERCPU_MODULE_RESERVE;
+
+	size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size +
 			     PERCPU_DYNAMIC_RESERVE);
-	dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
 
+	ai->dyn_size = size_sum - ai->static_size - ai->reserved_size;
+	ai->unit_size = PCPU_CHUNK_SIZE;
+	ai->atom_size = PCPU_CHUNK_SIZE;
+	ai->alloc_size = PCPU_CHUNK_SIZE;
+	ai->groups[0].nr_units = nr_cpu_ids;
+
+	for_each_possible_cpu(cpu)
+		ai->groups[0].cpu_map[cpu] = cpu;
 
 	ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
 	ptrs = alloc_bootmem(ptrs_size);
@@ -1497,7 +1509,7 @@ void __init setup_per_cpu_areas(void)
 		free_bootmem(__pa(ptrs[cpu] + size_sum),
 			     PCPU_CHUNK_SIZE - size_sum);
 
-		memcpy(ptrs[cpu], __per_cpu_load, static_size);
+		memcpy(ptrs[cpu], __per_cpu_load, ai->static_size);
 	}
 
 	/* allocate address and map */
@@ -1514,9 +1526,7 @@ void __init setup_per_cpu_areas(void)
 		pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
 	}
 
-	pcpu_unit_size = pcpu_setup_first_chunk(static_size,
-						PERCPU_MODULE_RESERVE, dyn_size,
-						PCPU_CHUNK_SIZE, vm.addr, NULL);
+	pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr);
 
 	free_bootmem(__pa(ptrs), ptrs_size);
 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 660cde13314..db5f9c49fec 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -161,9 +161,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
-	size_t unit_map_size, unit_size;
-	int *unit_map;
-	int nr_units;
+	struct pcpu_alloc_info *ai;
 	ssize_t ret;
 
 	/* on non-NUMA, embedding is better */
@@ -177,26 +175,22 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 	}
 
 	/* allocate and build unit_map */
-	unit_map_size = nr_cpu_ids * sizeof(int);
-	unit_map = alloc_bootmem_nopanic(unit_map_size);
-	if (!unit_map) {
-		pr_warning("PERCPU: failed to allocate unit_map\n");
-		return -ENOMEM;
+	ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
+				   PMD_SIZE, pcpu_lpage_cpu_distance);
+	if (IS_ERR(ai)) {
+		pr_warning("PERCPU: failed to build unit_map (%ld)\n",
+			   PTR_ERR(ai));
+		return PTR_ERR(ai);
 	}
 
-	ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE,
-					&dyn_size, &unit_size, PMD_SIZE,
-					unit_map, pcpu_lpage_cpu_distance);
-	if (ret < 0) {
-		pr_warning("PERCPU: failed to build unit_map\n");
-		goto out_free;
-	}
-	nr_units = ret;
-
 	/* do the parameters look okay? */
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
-		size_t tot_size = nr_units * unit_size;
+		size_t tot_size = 0;
+		int group;
+
+		for (group = 0; group < ai->nr_groups; group++)
+			tot_size += ai->unit_size * ai->groups[group].nr_units;
 
 		/* don't consume more than 20% of vmalloc area */
 		if (tot_size > vm_size / 5) {
@@ -207,12 +201,10 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 		}
 	}
 
-	ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				     unit_size, PMD_SIZE, unit_map, nr_units,
-				     pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
+	ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free,
+				     pcpul_map);
 out_free:
-	if (ret < 0)
-		free_bootmem(__pa(unit_map), unit_map_size);
+	pcpu_free_alloc_info(ai);
 	return ret;
 }
 #else
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 570fb18de2b..77b86be8ce4 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -59,6 +59,25 @@
 extern void *pcpu_base_addr;
 extern const int *pcpu_unit_map;
 
+struct pcpu_group_info {
+	int			nr_units;	/* aligned # of units */
+	unsigned long		base_offset;	/* base address offset */
+	unsigned int		*cpu_map;	/* unit->cpu map, empty
+						 * entries contain NR_CPUS */
+};
+
+struct pcpu_alloc_info {
+	size_t			static_size;
+	size_t			reserved_size;
+	size_t			dyn_size;
+	size_t			unit_size;
+	size_t			atom_size;
+	size_t			alloc_size;
+	size_t			__ai_size;	/* internal, don't use */
+	int			nr_groups;	/* 0 if grouping unnecessary */
+	struct pcpu_group_info	groups[];
+};
+
 enum pcpu_fc {
 	PCPU_FC_AUTO,
 	PCPU_FC_EMBED,
@@ -78,18 +97,17 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
 typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
-#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-extern int __init pcpu_lpage_build_unit_map(
-				size_t reserved_size, ssize_t *dyn_sizep,
-				size_t *unit_sizep, size_t lpage_size,
-				int *unit_map,
+extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+							     int nr_units);
+extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai);
+
+extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+				size_t reserved_size, ssize_t dyn_size,
+				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
-#endif
 
-extern size_t __init pcpu_setup_first_chunk(
-				size_t static_size, size_t reserved_size,
-				size_t dyn_size, size_t unit_size,
-				void *base_addr, const int *unit_map);
+extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+					    void *base_addr);
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
 extern ssize_t __init pcpu_embed_first_chunk(
@@ -106,9 +124,7 @@ extern ssize_t __init pcpu_page_first_chunk(
 
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 extern ssize_t __init pcpu_lpage_first_chunk(
-				size_t reserved_size, size_t dyn_size,
-				size_t unit_size, size_t lpage_size,
-				const int *unit_map, int nr_units,
+				const struct pcpu_alloc_info *ai,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn);
diff --git a/mm/percpu.c b/mm/percpu.c
index 2b9c4b2a2fc..99f7fa68272 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -58,6 +58,7 @@
 
 #include <linux/bitmap.h>
 #include <linux/bootmem.h>
+#include <linux/err.h>
 #include <linux/list.h>
 #include <linux/log2.h>
 #include <linux/mm.h>
@@ -1245,53 +1246,108 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size,
 	return size_sum;
 }
 
-#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
 /**
- * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * pcpu_alloc_alloc_info - allocate percpu allocation info
+ * @nr_groups: the number of groups
+ * @nr_units: the number of units
+ *
+ * Allocate ai which is large enough for @nr_groups groups containing
+ * @nr_units units.  The returned ai's groups[0].cpu_map points to the
+ * cpu_map array which is long enough for @nr_units and filled with
+ * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
+ * pointer of other groups.
+ *
+ * RETURNS:
+ * Pointer to the allocated pcpu_alloc_info on success, NULL on
+ * failure.
+ */
+struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+						      int nr_units)
+{
+	struct pcpu_alloc_info *ai;
+	size_t base_size, ai_size;
+	void *ptr;
+	int unit;
+
+	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+			  __alignof__(ai->groups[0].cpu_map[0]));
+	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
+
+	ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+	if (!ptr)
+		return NULL;
+	ai = ptr;
+	ptr += base_size;
+
+	ai->groups[0].cpu_map = ptr;
+
+	for (unit = 0; unit < nr_units; unit++)
+		ai->groups[0].cpu_map[unit] = NR_CPUS;
+
+	ai->nr_groups = nr_groups;
+	ai->__ai_size = PFN_ALIGN(ai_size);
+
+	return ai;
+}
+
+/**
+ * pcpu_free_alloc_info - free percpu allocation info
+ * @ai: pcpu_alloc_info to free
+ *
+ * Free @ai which was allocated by pcpu_alloc_alloc_info().
+ */
+void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
+{
+	free_bootmem(__pa(ai), ai->__ai_size);
+}
+
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
- * @unit_sizep: out parameter for unit size
- * @unit_map: unit_map to be filled
- * @cpu_distance_fn: callback to determine distance between cpus
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
  *
- * This function builds cpu -> unit map and determine other parameters
- * considering needed percpu size, large page size and distances
- * between CPUs in NUMA.
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
  *
- * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
- * may share units in the same large page.  The returned configuration
- * is guaranteed to have CPUs on different nodes on different large
- * pages and >=75% usage of allocated virtual address space.
+ * Groups are always mutliples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group.  The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
  *
  * RETURNS:
- * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
- * returns the number of units to be allocated.  -errno on failure.
+ * On success, pointer to the new allocation_info is returned.  On
+ * failure, ERR_PTR value is returned.
  */
-int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
-				     size_t *unit_sizep, size_t lpage_size,
-				     int *unit_map,
-				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+				size_t reserved_size, ssize_t dyn_size,
+				size_t atom_size,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 {
 	static int group_map[NR_CPUS] __initdata;
 	static int group_cnt[NR_CPUS] __initdata;
 	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	int group_cnt_max = 0;
+	int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
 	size_t size_sum, min_unit_size, alloc_size;
 	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
-	int last_allocs;
+	int last_allocs, group, unit;
 	unsigned int cpu, tcpu;
-	int group, unit;
+	struct pcpu_alloc_info *ai;
+	unsigned int *cpu_map;
 
 	/*
 	 * Determine min_unit_size, alloc_size and max_upa such that
-	 * alloc_size is multiple of lpage_size and is the smallest
+	 * alloc_size is multiple of atom_size and is the smallest
 	 * which can accomodate 4k aligned segments which are equal to
 	 * or larger than min_unit_size.
 	 */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 
-	alloc_size = roundup(min_unit_size, lpage_size);
+	alloc_size = roundup(min_unit_size, atom_size);
 	upa = alloc_size / min_unit_size;
 	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 		upa--;
@@ -1304,10 +1360,11 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
 		for_each_possible_cpu(tcpu) {
 			if (cpu == tcpu)
 				break;
-			if (group_map[tcpu] == group &&
+			if (group_map[tcpu] == group && cpu_distance_fn &&
 			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
 			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
 				group++;
+				nr_groups = max(nr_groups, group + 1);
 				goto next_group;
 			}
 		}
@@ -1328,7 +1385,7 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
 		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 			continue;
 
-		for (group = 0; group_cnt[group]; group++) {
+		for (group = 0; group < nr_groups; group++) {
 			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
 			allocs += this_allocs;
 			wasted += this_allocs * upa - group_cnt[group];
@@ -1348,75 +1405,122 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
 		last_allocs = allocs;
 		best_upa = upa;
 	}
-	*unit_sizep = alloc_size / best_upa;
+	upa = best_upa;
+
+	/* allocate and fill alloc_info */
+	for (group = 0; group < nr_groups; group++)
+		nr_units += roundup(group_cnt[group], upa);
+
+	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+	if (!ai)
+		return ERR_PTR(-ENOMEM);
+	cpu_map = ai->groups[0].cpu_map;
+
+	for (group = 0; group < nr_groups; group++) {
+		ai->groups[group].cpu_map = cpu_map;
+		cpu_map += roundup(group_cnt[group], upa);
+	}
+
+	ai->static_size = static_size;
+	ai->reserved_size = reserved_size;
+	ai->dyn_size = dyn_size;
+	ai->unit_size = alloc_size / upa;
+	ai->atom_size = atom_size;
+	ai->alloc_size = alloc_size;
+
+	for (group = 0, unit = 0; group_cnt[group]; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+
+		/*
+		 * Initialize base_offset as if all groups are located
+		 * back-to-back.  The caller should update this to
+		 * reflect actual allocation.
+		 */
+		gi->base_offset = unit * ai->unit_size;
 
-	/* assign units to cpus accordingly */
-	unit = 0;
-	for (group = 0; group_cnt[group]; group++) {
 		for_each_possible_cpu(cpu)
 			if (group_map[cpu] == group)
-				unit_map[cpu] = unit++;
-		unit = roundup(unit, best_upa);
+				gi->cpu_map[gi->nr_units++] = cpu;
+		gi->nr_units = roundup(gi->nr_units, upa);
+		unit += gi->nr_units;
 	}
+	BUG_ON(unit != nr_units);
 
-	return unit;	/* unit contains aligned number of units */
+	return ai;
 }
 
-static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
-				     unsigned int *cpup);
-
-static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
-					size_t reserved_size, size_t dyn_size,
-					size_t unit_size, size_t lpage_size,
-					const int *unit_map, int nr_units)
+/**
+ * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
+ * @lvl: loglevel
+ * @ai: allocation info to dump
+ *
+ * Print out information about @ai using loglevel @lvl.
+ */
+static void pcpu_dump_alloc_info(const char *lvl,
+				 const struct pcpu_alloc_info *ai)
 {
-	int width = 1, v = nr_units;
+	int group_width = 1, cpu_width = 1, width;
 	char empty_str[] = "--------";
-	int upl, lpl;	/* units per lpage, lpage per line */
-	unsigned int cpu;
-	int lpage, unit;
+	int alloc = 0, alloc_end = 0;
+	int group, v;
+	int upa, apl;	/* units per alloc, allocs per line */
+
+	v = ai->nr_groups;
+	while (v /= 10)
+		group_width++;
 
+	v = num_possible_cpus();
 	while (v /= 10)
-		width++;
-	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+		cpu_width++;
+	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
 
-	upl = max_t(int, lpage_size / unit_size, 1);
-	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+	upa = ai->alloc_size / ai->unit_size;
+	width = upa * (cpu_width + 1) + group_width + 3;
+	apl = rounddown_pow_of_two(max(60 / width, 1));
 
-	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
-	       static_size, reserved_size, dyn_size, unit_size, lpage_size);
+	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
+	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
+	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
 
-	for (lpage = 0, unit = 0; unit < nr_units; unit++) {
-		if (!(unit % upl)) {
-			if (!(lpage++ % lpl)) {
+	for (group = 0; group < ai->nr_groups; group++) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+		int unit = 0, unit_end = 0;
+
+		BUG_ON(gi->nr_units % upa);
+		for (alloc_end += gi->nr_units / upa;
+		     alloc < alloc_end; alloc++) {
+			if (!(alloc % apl)) {
 				printk("\n");
-				printk("%spcpu-lpage: ", lvl);
-			} else
-				printk("| ");
+				printk("%spcpu-alloc: ", lvl);
+			}
+			printk("[%0*d] ", group_width, group);
+
+			for (unit_end += upa; unit < unit_end; unit++)
+				if (gi->cpu_map[unit] != NR_CPUS)
+					printk("%0*d ", cpu_width,
+					       gi->cpu_map[unit]);
+				else
+					printk("%s ", empty_str);
 		}
-		if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
-			printk("%0*d ", width, cpu);
-		else
-			printk("%s ", empty_str);
 	}
 	printk("\n");
 }
-#endif
 
 /**
  * pcpu_setup_first_chunk - initialize the first percpu chunk
- * @static_size: the size of static percpu area in bytes
- * @reserved_size: the size of reserved percpu area in bytes, 0 for none
- * @dyn_size: free size for dynamic allocation in bytes
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
+ * @ai: pcpu_alloc_info describing how to percpu area is shaped
  * @base_addr: mapped address
- * @unit_map: cpu -> unit map, NULL for sequential mapping
  *
  * Initialize the first percpu chunk which contains the kernel static
  * perpcu area.  This function is to be called from arch percpu area
  * setup path.
  *
- * @reserved_size, if non-zero, specifies the amount of bytes to
+ * @ai contains all information necessary to initialize the first
+ * chunk and prime the dynamic percpu allocator.
+ *
+ * @ai->static_size is the size of static percpu area.
+ *
+ * @ai->reserved_size, if non-zero, specifies the amount of bytes to
  * reserve after the static area in the first chunk.  This reserves
  * the first chunk such that it's available only through reserved
  * percpu allocation.  This is primarily used to serve module percpu
@@ -1424,13 +1528,26 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
  * limited offset range for symbol relocations to guarantee module
  * percpu symbols fall inside the relocatable range.
  *
- * @dyn_size determines the number of bytes available for dynamic
- * allocation in the first chunk.  The area between @static_size +
- * @reserved_size + @dyn_size and @unit_size is unused.
+ * @ai->dyn_size determines the number of bytes available for dynamic
+ * allocation in the first chunk.  The area between @ai->static_size +
+ * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
  *
- * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
- * equal to or larger than @static_size + @reserved_size + if
- * non-negative, @dyn_size.
+ * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
+ * and equal to or larger than @ai->static_size + @ai->reserved_size +
+ * @ai->dyn_size.
+ *
+ * @ai->atom_size is the allocation atom size and used as alignment
+ * for vm areas.
+ *
+ * @ai->alloc_size is the allocation size and always multiple of
+ * @ai->atom_size.  This is larger than @ai->atom_size if
+ * @ai->unit_size is larger than @ai->atom_size.
+ *
+ * @ai->nr_groups and @ai->groups describe virtual memory layout of
+ * percpu areas.  Units which should be colocated are put into the
+ * same group.  Dynamic VM areas will be allocated according to these
+ * groupings.  If @ai->nr_groups is zero, a single group containing
+ * all units is assumed.
  *
  * The caller should have mapped the first chunk at @base_addr and
  * copied static data to each unit.
@@ -1446,70 +1563,63 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
-size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
-				     size_t dyn_size, size_t unit_size,
-				     void *base_addr, const int *unit_map)
+size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+				     void *base_addr)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
-	size_t size_sum = static_size + reserved_size + dyn_size;
+	size_t dyn_size = ai->dyn_size;
+	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
-	unsigned int cpu, tcpu;
-	int i;
+	unsigned int cpu;
+	int *unit_map;
+	int group, unit, i;
 
 	/* sanity checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
 		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
-	BUG_ON(!static_size);
+	BUG_ON(ai->nr_groups <= 0);
+	BUG_ON(!ai->static_size);
 	BUG_ON(!base_addr);
-	BUG_ON(unit_size < size_sum);
-	BUG_ON(unit_size & ~PAGE_MASK);
-	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
+	BUG_ON(ai->unit_size < size_sum);
+	BUG_ON(ai->unit_size & ~PAGE_MASK);
+	BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+
+	pcpu_dump_alloc_info(KERN_DEBUG, ai);
 
 	/* determine number of units and verify and initialize pcpu_unit_map */
-	if (unit_map) {
-		int first_unit = INT_MAX, last_unit = INT_MIN;
-
-		for_each_possible_cpu(cpu) {
-			int unit = unit_map[cpu];
-
-			BUG_ON(unit < 0);
-			for_each_possible_cpu(tcpu) {
-				if (tcpu == cpu)
-					break;
-				/* the mapping should be one-to-one */
-				BUG_ON(unit_map[tcpu] == unit);
-			}
+	unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
 
-			if (unit < first_unit) {
-				pcpu_first_unit_cpu = cpu;
-				first_unit = unit;
-			}
-			if (unit > last_unit) {
-				pcpu_last_unit_cpu = cpu;
-				last_unit = unit;
-			}
-		}
-		pcpu_nr_units = last_unit + 1;
-		pcpu_unit_map = unit_map;
-	} else {
-		int *identity_map;
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+		unit_map[cpu] = NR_CPUS;
+	pcpu_first_unit_cpu = NR_CPUS;
 
-		/* #units == #cpus, identity mapped */
-		identity_map = alloc_bootmem(nr_cpu_ids *
-					     sizeof(identity_map[0]));
+	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
 
-		for_each_possible_cpu(cpu)
-			identity_map[cpu] = cpu;
+		for (i = 0; i < gi->nr_units; i++) {
+			cpu = gi->cpu_map[i];
+			if (cpu == NR_CPUS)
+				continue;
 
-		pcpu_first_unit_cpu = 0;
-		pcpu_last_unit_cpu = pcpu_nr_units - 1;
-		pcpu_nr_units = nr_cpu_ids;
-		pcpu_unit_map = identity_map;
+			BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
+			BUG_ON(unit_map[cpu] != NR_CPUS);
+
+			unit_map[cpu] = unit + i;
+			if (pcpu_first_unit_cpu == NR_CPUS)
+				pcpu_first_unit_cpu = cpu;
+		}
 	}
+	pcpu_last_unit_cpu = cpu;
+	pcpu_nr_units = unit;
+
+	for_each_possible_cpu(cpu)
+		BUG_ON(unit_map[cpu] == NR_CPUS);
+
+	pcpu_unit_map = unit_map;
 
 	/* determine basic parameters */
-	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
+	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
@@ -1543,17 +1653,17 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 	schunk->immutable = true;
 	bitmap_fill(schunk->populated, pcpu_unit_pages);
 
-	if (reserved_size) {
-		schunk->free_size = reserved_size;
+	if (ai->reserved_size) {
+		schunk->free_size = ai->reserved_size;
 		pcpu_reserved_chunk = schunk;
-		pcpu_reserved_chunk_limit = static_size + reserved_size;
+		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
 	} else {
 		schunk->free_size = dyn_size;
 		dyn_size = 0;			/* dynamic area covered */
 	}
 	schunk->contig_hint = schunk->free_size;
 
-	schunk->map[schunk->map_used++] = -static_size;
+	schunk->map[schunk->map_used++] = -ai->static_size;
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 
@@ -1643,44 +1753,47 @@ early_param("percpu_alloc", percpu_alloc_setup);
  */
 ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 {
-	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	size_t size_sum, unit_size, chunk_size;
+	struct pcpu_alloc_info *ai;
+	size_t size_sum, chunk_size;
 	void *base;
-	unsigned int cpu;
+	int unit;
+	ssize_t ret;
 
-	/* determine parameters and allocate */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+	ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+	BUG_ON(ai->nr_groups != 1);
+	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
 
-	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-	chunk_size = unit_size * nr_cpu_ids;
+	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+	chunk_size = ai->unit_size * num_possible_cpus();
 
 	base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 				       __pa(MAX_DMA_ADDRESS));
 	if (!base) {
 		pr_warning("PERCPU: failed to allocate %zu bytes for "
 			   "embedding\n", chunk_size);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_free_ai;
 	}
 
 	/* return the leftover and copy */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
-		void *ptr = base + cpu * unit_size;
-
-		if (cpu_possible(cpu)) {
-			free_bootmem(__pa(ptr + size_sum),
-				     unit_size - size_sum);
-			memcpy(ptr, __per_cpu_load, static_size);
-		} else
-			free_bootmem(__pa(ptr), unit_size);
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
+		void *ptr = base + unit * ai->unit_size;
+
+		free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum);
+		memcpy(ptr, __per_cpu_load, ai->static_size);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
-		PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size,
-		unit_size);
+		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+		ai->dyn_size, ai->unit_size);
 
-	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				      unit_size, base, NULL);
+	ret = pcpu_setup_first_chunk(ai, base);
+out_free_ai:
+	pcpu_free_alloc_info(ai);
+	return ret;
 }
 #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
 	  !CONFIG_HAVE_SETUP_PER_CPU_AREA */
@@ -1709,31 +1822,34 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
-	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	ssize_t dyn_size = -1;
-	size_t size_sum, unit_size;
+	struct pcpu_alloc_info *ai;
 	char psize_str[16];
 	int unit_pages;
 	size_t pages_size;
 	struct page **pages;
-	unsigned int cpu;
-	int i, j;
+	int unit, i, j;
 	ssize_t ret;
 
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
-	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-	unit_pages = unit_size >> PAGE_SHIFT;
+	ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+	BUG_ON(ai->nr_groups != 1);
+	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+
+	unit_pages = ai->unit_size >> PAGE_SHIFT;
 
 	/* unaligned allocations can't be freed, round up to page size */
-	pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
+	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+			       sizeof(pages[0]));
 	pages = alloc_bootmem(pages_size);
 
 	/* allocate pages */
 	j = 0;
-	for_each_possible_cpu(cpu)
+	for (unit = 0; unit < num_possible_cpus(); unit++)
 		for (i = 0; i < unit_pages; i++) {
+			unsigned int cpu = ai->groups[0].cpu_map[unit];
 			void *ptr;
 
 			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
@@ -1747,18 +1863,18 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 
 	/* allocate vm area, map the pages and copy static data */
 	vm.flags = VM_ALLOC;
-	vm.size = nr_cpu_ids * unit_size;
+	vm.size = num_possible_cpus() * ai->unit_size;
 	vm_area_register_early(&vm, PAGE_SIZE);
 
-	for_each_possible_cpu(cpu) {
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
 		unsigned long unit_addr =
-			(unsigned long)vm.addr + cpu * unit_size;
+			(unsigned long)vm.addr + unit * ai->unit_size;
 
 		for (i = 0; i < unit_pages; i++)
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 
 		/* pte already populated, the following shouldn't fail */
-		ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+		ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
 				       unit_pages);
 		if (ret < 0)
 			panic("failed to map percpu area, err=%zd\n", ret);
@@ -1772,16 +1888,15 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 		 */
 
 		/* copy static data */
-		memcpy((void *)unit_addr, __per_cpu_load, static_size);
+		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
 	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
-		unit_pages, psize_str, vm.addr, static_size, reserved_size,
-		dyn_size);
+		unit_pages, psize_str, vm.addr, ai->static_size,
+		ai->reserved_size, ai->dyn_size);
 
-	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     unit_size, vm.addr, NULL);
+	ret = pcpu_setup_first_chunk(ai, vm.addr);
 	goto out_free_ar;
 
 enomem:
@@ -1790,6 +1905,7 @@ enomem:
 	ret = -ENOMEM;
 out_free_ar:
 	free_bootmem(__pa(pages), pages_size);
+	pcpu_free_alloc_info(ai);
 	return ret;
 }
 #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
@@ -1805,38 +1921,50 @@ static size_t pcpul_lpage_size;
 static int pcpul_nr_lpages;
 static struct pcpul_ent *pcpul_map;
 
-static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai,
 				     unsigned int *cpup)
 {
-	unsigned int cpu;
+	int group, cunit;
 
-	for_each_possible_cpu(cpu)
-		if (unit_map[cpu] == unit) {
+	for (group = 0, cunit = 0; group < ai->nr_groups; group++) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+
+		if (unit < cunit + gi->nr_units) {
 			if (cpup)
-				*cpup = cpu;
+				*cpup = gi->cpu_map[unit - cunit];
 			return true;
 		}
+		cunit += gi->nr_units;
+	}
 
 	return false;
 }
 
+static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai)
+{
+	int group, unit, i;
+
+	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+
+		for (i = 0; i < gi->nr_units; i++)
+			if (gi->cpu_map[i] == cpu)
+				return unit + i;
+	}
+	BUG();
+}
+
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
- * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes
- * @unit_size: unit size in bytes
- * @lpage_size: the size of a large page
- * @unit_map: cpu -> unit mapping
- * @nr_units: the number of units
+ * @ai: pcpu_alloc_info
  * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
  * @free_fn: function to free percpu memory, @size <= lpage_size
  * @map_fn: function to map percpu lpage, always called with lpage_size
  *
  * This allocator uses large page to build and map the first chunk.
- * Unlike other helpers, the caller should always specify @dyn_size
- * and @unit_size.  These parameters along with @unit_map and
- * @nr_units can be determined using pcpu_lpage_build_unit_map().
- * This two stage initialization is to allow arch code to evaluate the
+ * Unlike other helpers, the caller should provide fully initialized
+ * @ai.  This can be done using pcpu_build_alloc_info().  This two
+ * stage initialization is to allow arch code to evaluate the
  * parameters before committing to it.
  *
  * Large pages are allocated as directed by @unit_map and other
@@ -1852,27 +1980,26 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
-ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
-				      size_t unit_size, size_t lpage_size,
-				      const int *unit_map, int nr_units,
+ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 				      pcpu_fc_alloc_fn_t alloc_fn,
 				      pcpu_fc_free_fn_t free_fn,
 				      pcpu_fc_map_fn_t map_fn)
 {
 	static struct vm_struct vm;
-	const size_t static_size = __per_cpu_end - __per_cpu_start;
-	size_t chunk_size = unit_size * nr_units;
-	size_t map_size;
+	const size_t lpage_size = ai->atom_size;
+	size_t chunk_size, map_size;
 	unsigned int cpu;
 	ssize_t ret;
-	int i, j, unit;
+	int i, j, unit, nr_units;
 
-	pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
-			     unit_size, lpage_size, unit_map, nr_units);
+	nr_units = 0;
+	for (i = 0; i < ai->nr_groups; i++)
+		nr_units += ai->groups[i].nr_units;
 
+	chunk_size = ai->unit_size * nr_units;
 	BUG_ON(chunk_size % lpage_size);
 
-	pcpul_size = static_size + reserved_size + dyn_size;
+	pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size;
 	pcpul_lpage_size = lpage_size;
 	pcpul_nr_lpages = chunk_size / lpage_size;
 
@@ -1883,13 +2010,13 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 	/* allocate all pages */
 	for (i = 0; i < pcpul_nr_lpages; i++) {
 		size_t offset = i * lpage_size;
-		int first_unit = offset / unit_size;
-		int last_unit = (offset + lpage_size - 1) / unit_size;
+		int first_unit = offset / ai->unit_size;
+		int last_unit = (offset + lpage_size - 1) / ai->unit_size;
 		void *ptr;
 
 		/* find out which cpu is mapped to this unit */
 		for (unit = first_unit; unit <= last_unit; unit++)
-			if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+			if (pcpul_unit_to_cpu(unit, ai, &cpu))
 				goto found;
 		continue;
 	found:
@@ -1905,12 +2032,12 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 
 	/* return unused holes */
 	for (unit = 0; unit < nr_units; unit++) {
-		size_t start = unit * unit_size;
-		size_t end = start + unit_size;
+		size_t start = unit * ai->unit_size;
+		size_t end = start + ai->unit_size;
 		size_t off, next;
 
 		/* don't free used part of occupied unit */
-		if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+		if (pcpul_unit_to_cpu(unit, ai, NULL))
 			start += pcpul_size;
 
 		/* unit can span more than one page, punch the holes */
@@ -1925,7 +2052,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 	/* allocate address, map and copy */
 	vm.flags = VM_ALLOC;
 	vm.size = chunk_size;
-	vm_area_register_early(&vm, unit_size);
+	vm_area_register_early(&vm, ai->unit_size);
 
 	for (i = 0; i < pcpul_nr_lpages; i++) {
 		if (!pcpul_map[i].ptr)
@@ -1935,15 +2062,15 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
 	}
 
 	for_each_possible_cpu(cpu)
-		memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
-		       static_size);
+		memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size,
+		       __per_cpu_load, ai->static_size);
 
 	/* we're ready, commit */
 	pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n",
-		vm.addr, static_size, reserved_size, dyn_size, unit_size);
+		vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size,
+		ai->unit_size);
 
-	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     unit_size, vm.addr, unit_map);
+	ret = pcpu_setup_first_chunk(ai, vm.addr);
 
 	/*
 	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
-- 
cgit v1.2.3-70-g09d2


From fb435d5233f8b6f9b93c11d6304d8e98fed03234 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:51 +0900
Subject: percpu: add pcpu_unit_offsets[]

Currently units are mapped sequentially into address space.  This
patch adds pcpu_unit_offsets[] which allows units to be mapped to
arbitrary offsets from the chunk base address.  This is necessary to
allow sparse embedding which might would need to allocate address
ranges and memory areas which aren't aligned to unit size but
allocation atom size (page or large page size).  This also simplifies
things a bit by removing the need to calculate offset from unit
number.

With this change, there's no need for the arch code to know
pcpu_unit_size.  Update pcpu_setup_first_chunk() and first chunk
allocators to return regular 0 or -errno return code instead of unit
size or -errno.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_64.c     | 12 +++---
 arch/x86/kernel/setup_percpu.c | 51 ++++++++++-------------
 include/linux/percpu.h         | 16 ++++---
 mm/percpu.c                    | 95 +++++++++++++++++++++---------------------
 4 files changed, 84 insertions(+), 90 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index a42a4a744d1..b03fd362c62 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1478,9 +1478,10 @@ void __init setup_per_cpu_areas(void)
 	static struct vm_struct vm;
 	struct pcpu_alloc_info *ai;
 	unsigned long delta, cpu;
-	size_t size_sum, pcpu_unit_size;
+	size_t size_sum;
 	size_t ptrs_size;
 	void **ptrs;
+	int rc;
 
 	ai = pcpu_alloc_alloc_info(1, nr_cpu_ids);
 
@@ -1526,14 +1527,15 @@ void __init setup_per_cpu_areas(void)
 		pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
 	}
 
-	pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr);
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
+	if (rc)
+		panic("failed to setup percpu first chunk (%d)", rc);
 
 	free_bootmem(__pa(ptrs), ptrs_size);
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
-	for_each_possible_cpu(cpu) {
-		__per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
-	}
+	for_each_possible_cpu(cpu)
+		__per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
 
 	/* Setup %g5 for the boot cpu.  */
 	__local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index db5f9c49fec..9becc5d4b51 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -157,12 +157,12 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
 		return REMOTE_DISTANCE;
 }
 
-static ssize_t __init setup_pcpu_lpage(bool chosen)
+static int __init setup_pcpu_lpage(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
 	struct pcpu_alloc_info *ai;
-	ssize_t ret;
+	int rc;
 
 	/* on non-NUMA, embedding is better */
 	if (!chosen && !pcpu_need_numa())
@@ -196,19 +196,18 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
 		if (tot_size > vm_size / 5) {
 			pr_info("PERCPU: too large chunk size %zuMB for "
 				"large page remap\n", tot_size >> 20);
-			ret = -EINVAL;
+			rc = -EINVAL;
 			goto out_free;
 		}
 	}
 
-	ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free,
-				     pcpul_map);
+	rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 out_free:
 	pcpu_free_alloc_info(ai);
-	return ret;
+	return rc;
 }
 #else
-static ssize_t __init setup_pcpu_lpage(bool chosen)
+static int __init setup_pcpu_lpage(bool chosen)
 {
 	return -EINVAL;
 }
@@ -222,7 +221,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen)
  * mapping so that it can use PMD mapping without additional TLB
  * pressure.
  */
-static ssize_t __init setup_pcpu_embed(bool chosen)
+static int __init setup_pcpu_embed(bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 
@@ -250,7 +249,7 @@ static void __init pcpup_populate_pte(unsigned long addr)
 	populate_extra_pte(addr);
 }
 
-static ssize_t __init setup_pcpu_page(void)
+static int __init setup_pcpu_page(void)
 {
 	return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
 				     pcpu_fc_alloc, pcpu_fc_free,
@@ -274,8 +273,7 @@ void __init setup_per_cpu_areas(void)
 {
 	unsigned int cpu;
 	unsigned long delta;
-	size_t pcpu_unit_size;
-	ssize_t ret;
+	int rc;
 
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
@@ -285,36 +283,33 @@ void __init setup_per_cpu_areas(void)
 	 * of large page mappings.  Please read comments on top of
 	 * each allocator for details.
 	 */
-	ret = -EINVAL;
+	rc = -EINVAL;
 	if (pcpu_chosen_fc != PCPU_FC_AUTO) {
 		if (pcpu_chosen_fc != PCPU_FC_PAGE) {
 			if (pcpu_chosen_fc == PCPU_FC_LPAGE)
-				ret = setup_pcpu_lpage(true);
+				rc = setup_pcpu_lpage(true);
 			else
-				ret = setup_pcpu_embed(true);
+				rc = setup_pcpu_embed(true);
 
-			if (ret < 0)
-				pr_warning("PERCPU: %s allocator failed (%zd), "
+			if (rc < 0)
+				pr_warning("PERCPU: %s allocator failed (%d), "
 					   "falling back to page size\n",
-					   pcpu_fc_names[pcpu_chosen_fc], ret);
+					   pcpu_fc_names[pcpu_chosen_fc], rc);
 		}
 	} else {
-		ret = setup_pcpu_lpage(false);
-		if (ret < 0)
-			ret = setup_pcpu_embed(false);
+		rc = setup_pcpu_lpage(false);
+		if (rc < 0)
+			rc = setup_pcpu_embed(false);
 	}
-	if (ret < 0)
-		ret = setup_pcpu_page();
-	if (ret < 0)
-		panic("cannot initialize percpu area (err=%zd)", ret);
-
-	pcpu_unit_size = ret;
+	if (rc < 0)
+		rc = setup_pcpu_page();
+	if (rc < 0)
+		panic("cannot initialize percpu area (err=%d)", rc);
 
 	/* alrighty, percpu areas up and running */
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
-		per_cpu_offset(cpu) =
-			delta + pcpu_unit_map[cpu] * pcpu_unit_size;
+		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 77b86be8ce4..a7ec840f596 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -57,7 +57,7 @@
 #endif
 
 extern void *pcpu_base_addr;
-extern const int *pcpu_unit_map;
+extern const unsigned long *pcpu_unit_offsets;
 
 struct pcpu_group_info {
 	int			nr_units;	/* aligned # of units */
@@ -106,25 +106,23 @@ extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
 
-extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
-					    void *base_addr);
+extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+					 void *base_addr);
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
-extern ssize_t __init pcpu_embed_first_chunk(
-				size_t reserved_size, ssize_t dyn_size);
+extern int __init pcpu_embed_first_chunk(size_t reserved_size,
+					 ssize_t dyn_size);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
-extern ssize_t __init pcpu_page_first_chunk(
-				size_t reserved_size,
+extern int __init pcpu_page_first_chunk(size_t reserved_size,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 #endif
 
 #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
-extern ssize_t __init pcpu_lpage_first_chunk(
-				const struct pcpu_alloc_info *ai,
+extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn);
diff --git a/mm/percpu.c b/mm/percpu.c
index 99f7fa68272..653b02c4020 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -117,8 +117,8 @@ static unsigned int pcpu_last_unit_cpu __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-/* cpu -> unit map */
-const int *pcpu_unit_map __read_mostly;
+static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */
 
 /*
  * The first chunk which always exists.  Note that unlike other
@@ -196,8 +196,8 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx)
 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 				     unsigned int cpu, int page_idx)
 {
-	return (unsigned long)chunk->vm->addr +
-		(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
+	return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] +
+		(page_idx << PAGE_SHIFT);
 }
 
 static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
@@ -341,7 +341,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 	 * space.  Note that any possible cpu id can be used here, so
 	 * there's no need to worry about preemption or cpu hotplug.
 	 */
-	addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
+	addr += pcpu_unit_offsets[smp_processor_id()];
 	return pcpu_get_page_chunk(vmalloc_to_page(addr));
 }
 
@@ -1560,17 +1560,17 @@ static void pcpu_dump_alloc_info(const char *lvl,
  * and available for dynamic allocation like any other chunks.
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access.
+ * 0 on success, -errno on failure.
  */
-size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
-				     void *base_addr)
+int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+				  void *base_addr)
 {
 	static struct vm_struct first_vm;
 	static int smap[2], dmap[2];
 	size_t dyn_size = ai->dyn_size;
 	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
+	unsigned long *unit_off;
 	unsigned int cpu;
 	int *unit_map;
 	int group, unit, i;
@@ -1587,8 +1587,9 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 	pcpu_dump_alloc_info(KERN_DEBUG, ai);
 
-	/* determine number of units and verify and initialize pcpu_unit_map */
+	/* determine number of units and initialize unit_map and base */
 	unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
+	unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
 
 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		unit_map[cpu] = NR_CPUS;
@@ -1606,6 +1607,8 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 			BUG_ON(unit_map[cpu] != NR_CPUS);
 
 			unit_map[cpu] = unit + i;
+			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
+
 			if (pcpu_first_unit_cpu == NR_CPUS)
 				pcpu_first_unit_cpu = cpu;
 		}
@@ -1617,6 +1620,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 		BUG_ON(unit_map[cpu] == NR_CPUS);
 
 	pcpu_unit_map = unit_map;
+	pcpu_unit_offsets = unit_off;
 
 	/* determine basic parameters */
 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
@@ -1688,7 +1692,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 	/* we're done */
 	pcpu_base_addr = schunk->vm->addr;
-	return pcpu_unit_size;
+	return 0;
 }
 
 const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
@@ -1748,16 +1752,15 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * size, the leftover is returned to the bootmem allocator.
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access on success, -errno on failure.
+ * 0 on success, -errno on failure.
  */
-ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
+int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 {
 	struct pcpu_alloc_info *ai;
 	size_t size_sum, chunk_size;
 	void *base;
 	int unit;
-	ssize_t ret;
+	int rc;
 
 	ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL);
 	if (IS_ERR(ai))
@@ -1773,7 +1776,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 	if (!base) {
 		pr_warning("PERCPU: failed to allocate %zu bytes for "
 			   "embedding\n", chunk_size);
-		ret = -ENOMEM;
+		rc = -ENOMEM;
 		goto out_free_ai;
 	}
 
@@ -1790,10 +1793,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
 		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
 		ai->dyn_size, ai->unit_size);
 
-	ret = pcpu_setup_first_chunk(ai, base);
+	rc = pcpu_setup_first_chunk(ai, base);
 out_free_ai:
 	pcpu_free_alloc_info(ai);
-	return ret;
+	return rc;
 }
 #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
 	  !CONFIG_HAVE_SETUP_PER_CPU_AREA */
@@ -1813,13 +1816,12 @@ out_free_ai:
  * page-by-page into vmalloc area.
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access on success, -errno on failure.
+ * 0 on success, -errno on failure.
  */
-ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
-				     pcpu_fc_alloc_fn_t alloc_fn,
-				     pcpu_fc_free_fn_t free_fn,
-				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
+int __init pcpu_page_first_chunk(size_t reserved_size,
+				 pcpu_fc_alloc_fn_t alloc_fn,
+				 pcpu_fc_free_fn_t free_fn,
+				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
 	struct pcpu_alloc_info *ai;
@@ -1827,8 +1829,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 	int unit_pages;
 	size_t pages_size;
 	struct page **pages;
-	int unit, i, j;
-	ssize_t ret;
+	int unit, i, j, rc;
 
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 
@@ -1874,10 +1875,10 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 
 		/* pte already populated, the following shouldn't fail */
-		ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
-				       unit_pages);
-		if (ret < 0)
-			panic("failed to map percpu area, err=%zd\n", ret);
+		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
+				      unit_pages);
+		if (rc < 0)
+			panic("failed to map percpu area, err=%d\n", rc);
 
 		/*
 		 * FIXME: Archs with virtual cache should flush local
@@ -1896,17 +1897,17 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
 		unit_pages, psize_str, vm.addr, ai->static_size,
 		ai->reserved_size, ai->dyn_size);
 
-	ret = pcpu_setup_first_chunk(ai, vm.addr);
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
 	goto out_free_ar;
 
 enomem:
 	while (--j >= 0)
 		free_fn(page_address(pages[j]), PAGE_SIZE);
-	ret = -ENOMEM;
+	rc = -ENOMEM;
 out_free_ar:
 	free_bootmem(__pa(pages), pages_size);
 	pcpu_free_alloc_info(ai);
-	return ret;
+	return rc;
 }
 #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
 
@@ -1977,20 +1978,18 @@ static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai)
  * pcpu_lpage_remapped().
  *
  * RETURNS:
- * The determined pcpu_unit_size which can be used to initialize
- * percpu access on success, -errno on failure.
+ * 0 on success, -errno on failure.
  */
-ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
-				      pcpu_fc_alloc_fn_t alloc_fn,
-				      pcpu_fc_free_fn_t free_fn,
-				      pcpu_fc_map_fn_t map_fn)
+int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
+				  pcpu_fc_alloc_fn_t alloc_fn,
+				  pcpu_fc_free_fn_t free_fn,
+				  pcpu_fc_map_fn_t map_fn)
 {
 	static struct vm_struct vm;
 	const size_t lpage_size = ai->atom_size;
 	size_t chunk_size, map_size;
 	unsigned int cpu;
-	ssize_t ret;
-	int i, j, unit, nr_units;
+	int i, j, unit, nr_units, rc;
 
 	nr_units = 0;
 	for (i = 0; i < ai->nr_groups; i++)
@@ -2070,7 +2069,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 		vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size,
 		ai->unit_size);
 
-	ret = pcpu_setup_first_chunk(ai, vm.addr);
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
 
 	/*
 	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
@@ -2094,7 +2093,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
 	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
 		pcpul_nr_lpages--;
 
-	return ret;
+	return rc;
 
 enomem:
 	for (i = 0; i < pcpul_nr_lpages; i++)
@@ -2166,21 +2165,21 @@ EXPORT_SYMBOL(__per_cpu_offset);
 
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t unit_size;
 	unsigned long delta;
 	unsigned int cpu;
+	int rc;
 
 	/*
 	 * Always reserve area for module percpu variables.  That's
 	 * what the legacy allocator did.
 	 */
-	unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
-					   PERCPU_DYNAMIC_RESERVE);
-	if (unit_size < 0)
+	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+				    PERCPU_DYNAMIC_RESERVE);
+	if (rc < 0)
 		panic("Failed to initialized percpu areas.");
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu)
-		__per_cpu_offset[cpu] = delta + cpu * unit_size;
+		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
 }
 #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
-- 
cgit v1.2.3-70-g09d2


From bcb2107fdbecef3de55d597d23453747af81ba88 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Aug 2009 15:00:53 +0900
Subject: sparc64: use embedding percpu first chunk allocator

sparc64 currently allocates a large page for each cpu and partially
remap them into vmalloc area much like what lpage first chunk
allocator did.  As a 4M page is used for each cpu, this results in
very large unit size and also adds TLB pressure due to the double
mapping of pages in the first chunk.

This patch converts sparc64 to use the embedding percpu first chunk
allocator which now knows how to handle NUMA configurations.  This
simplifies the code a lot, doesn't incur any extra TLB pressure and
results in better utilization of address space.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/Kconfig         |   3 ++
 arch/sparc/kernel/smp_64.c | 128 +++++++--------------------------------------
 2 files changed, 21 insertions(+), 110 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 4f6ed0f113f..fbd1233b392 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -95,6 +95,9 @@ config AUDIT_ARCH
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y if SPARC64
 
+config NEED_PER_CPU_EMBED_FIRST_CHUNK
+	def_bool y if SPARC64
+
 config GENERIC_HARDIRQS_NO__DO_IRQ
 	bool
 	def_bool y if SPARC64
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index b03fd362c62..ff68373ce6d 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1389,8 +1389,8 @@ void smp_send_stop(void)
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
-					unsigned long align)
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
+					size_t align)
 {
 	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
 #ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -1415,123 +1415,31 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 #endif
 }
 
-#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
-
-static void __init pcpu_map_range(unsigned long start, unsigned long end,
-				  struct page *page)
+static void __init pcpu_free_bootmem(void *ptr, size_t size)
 {
-	unsigned long pfn = page_to_pfn(page);
-	unsigned long pte_base;
-
-	BUG_ON((pfn<<PAGE_SHIFT)&(PCPU_CHUNK_SIZE - 1UL));
-
-	pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
-		    _PAGE_CP_4U | _PAGE_CV_4U |
-		    _PAGE_P_4U | _PAGE_W_4U);
-	if (tlb_type == hypervisor)
-		pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
-			    _PAGE_CP_4V | _PAGE_CV_4V |
-			    _PAGE_P_4V | _PAGE_W_4V);
-
-	while (start < end) {
-		pgd_t *pgd = pgd_offset_k(start);
-		unsigned long this_end;
-		pud_t *pud;
-		pmd_t *pmd;
-		pte_t *pte;
-
-		pud = pud_offset(pgd, start);
-		if (pud_none(*pud)) {
-			pmd_t *new;
-
-			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
-			pud_populate(&init_mm, pud, new);
-		}
-
-		pmd = pmd_offset(pud, start);
-		if (!pmd_present(*pmd)) {
-			pte_t *new;
-
-			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
-			pmd_populate_kernel(&init_mm, pmd, new);
-		}
-
-		pte = pte_offset_kernel(pmd, start);
-		this_end = (start + PMD_SIZE) & PMD_MASK;
-		if (this_end > end)
-			this_end = end;
-
-		while (start < this_end) {
-			unsigned long paddr = pfn << PAGE_SHIFT;
-
-			pte_val(*pte) = (paddr | pte_base);
+	free_bootmem(__pa(ptr), size);
+}
 
-			start += PAGE_SIZE;
-			pte++;
-			pfn++;
-		}
-	}
+static int pcpu_cpu_distance(unsigned int from, unsigned int to)
+{
+	if (cpu_to_node(from) == cpu_to_node(to))
+		return LOCAL_DISTANCE;
+	else
+		return REMOTE_DISTANCE;
 }
 
 void __init setup_per_cpu_areas(void)
 {
-	static struct vm_struct vm;
-	struct pcpu_alloc_info *ai;
-	unsigned long delta, cpu;
-	size_t size_sum;
-	size_t ptrs_size;
-	void **ptrs;
+	unsigned long delta;
+	unsigned int cpu;
 	int rc;
 
-	ai = pcpu_alloc_alloc_info(1, nr_cpu_ids);
-
-	ai->static_size = __per_cpu_end - __per_cpu_start;
-	ai->reserved_size = PERCPU_MODULE_RESERVE;
-
-	size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size +
-			     PERCPU_DYNAMIC_RESERVE);
-
-	ai->dyn_size = size_sum - ai->static_size - ai->reserved_size;
-	ai->unit_size = PCPU_CHUNK_SIZE;
-	ai->atom_size = PCPU_CHUNK_SIZE;
-	ai->alloc_size = PCPU_CHUNK_SIZE;
-	ai->groups[0].nr_units = nr_cpu_ids;
-
-	for_each_possible_cpu(cpu)
-		ai->groups[0].cpu_map[cpu] = cpu;
-
-	ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
-	ptrs = alloc_bootmem(ptrs_size);
-
-	for_each_possible_cpu(cpu) {
-		ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
-					       PCPU_CHUNK_SIZE);
-
-		free_bootmem(__pa(ptrs[cpu] + size_sum),
-			     PCPU_CHUNK_SIZE - size_sum);
-
-		memcpy(ptrs[cpu], __per_cpu_load, ai->static_size);
-	}
-
-	/* allocate address and map */
-	vm.flags = VM_ALLOC;
-	vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE;
-	vm_area_register_early(&vm, PCPU_CHUNK_SIZE);
-
-	for_each_possible_cpu(cpu) {
-		unsigned long start = (unsigned long) vm.addr;
-		unsigned long end;
-
-		start += cpu * PCPU_CHUNK_SIZE;
-		end = start + PCPU_CHUNK_SIZE;
-		pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
-	}
-
-	rc = pcpu_setup_first_chunk(ai, vm.addr);
+	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+				    PERCPU_DYNAMIC_RESERVE, 4 << 20,
+				    pcpu_cpu_distance, pcpu_alloc_bootmem,
+				    pcpu_free_bootmem);
 	if (rc)
-		panic("failed to setup percpu first chunk (%d)", rc);
-
-	free_bootmem(__pa(ptrs), ptrs_size);
+		panic("failed to initialize first chunk (%d)", rc);
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu)
-- 
cgit v1.2.3-70-g09d2


From 48e46b7b311c54525712c28004f0a59f2c931d30 Mon Sep 17 00:00:00 2001
From: Jurij Smakov <jurij@wooyd.org>
Date: Sun, 16 Aug 2009 18:21:47 -0700
Subject: sparc64: build compressed image (zImage) by default

Besides creating the uncompressed vmlinux image for sparc64, also
create a compressed zImage. This is more consistent with other
architectures and required to make the 'deb-pkg' target work.

Signed-off-by: Jurij Smakov <jurij@wooyd.org>
Signed-off-by: Frans Pop <elendil@planet.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/Makefile      | 12 ++++++------
 arch/sparc/boot/Makefile |  3 +++
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile
index 2003ded054c..467221dd570 100644
--- a/arch/sparc/Makefile
+++ b/arch/sparc/Makefile
@@ -38,10 +38,6 @@ CPPFLAGS_vmlinux.lds += -m32
 #  Actual linking is done with "make image".
 LDFLAGS_vmlinux = -r
 
-# Default target
-all: zImage
-
-
 else
 #####
 # sparc64
@@ -91,6 +87,9 @@ endif
 
 boot := arch/sparc/boot
 
+# Default target
+all: zImage
+
 image zImage tftpboot.img vmlinux.aout: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
@@ -109,8 +108,9 @@ define archhelp
 endef
 else
 define archhelp
-  echo  '* vmlinux       - Standard sparc64 kernel'
-  echo  '  vmlinux.aout  - a.out kernel for sparc64'
+  echo  '* vmlinux      - standard sparc64 kernel'
+  echo  '* zImage       - stripped and compressed sparc64 kernel ($(boot)/zImage)'
+  echo  '  vmlinux.aout - a.out kernel for sparc64'
   echo  '  tftpboot.img - image prepared for tftp'
 endef
 endif
diff --git a/arch/sparc/boot/Makefile b/arch/sparc/boot/Makefile
index 1ff0fd92475..97e3feb9ff1 100644
--- a/arch/sparc/boot/Makefile
+++ b/arch/sparc/boot/Makefile
@@ -79,6 +79,9 @@ $(obj)/image: vmlinux FORCE
 	$(call if_changed,strip)
 	@echo '  kernel: $@ is ready'
 
+$(obj)/zImage: $(obj)/image
+	$(call if_changed,gzip)
+
 $(obj)/tftpboot.img: $(obj)/image $(obj)/piggyback_64 System.map $(ROOT_IMG) FORCE
 	$(call if_changed,elftoaout)
 	$(call if_changed,piggy)
-- 
cgit v1.2.3-70-g09d2


From 3f389635102e0dd2f85ebfe7fd549942b5bbc33b Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 12 Aug 2009 22:49:47 +0000
Subject: SPARC: fix duplicate declaration

Only difference for 32 and 64 bit version is dma64_addr_t and rest is same.

Also fixed the following 'make includecheck' warning:

  arch/sparc/include/asm/types.h: asm-generic/int-ll64.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/types.h | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/types.h b/arch/sparc/include/asm/types.h
index de671d73bae..09c79a9c851 100644
--- a/arch/sparc/include/asm/types.h
+++ b/arch/sparc/include/asm/types.h
@@ -8,9 +8,8 @@
  * need to be careful to avoid a name clashes.
  */
 
-#if defined(__sparc__) && defined(__arch64__)
+#if defined(__sparc__)
 
-/*** SPARC 64 bit ***/
 #include <asm-generic/int-ll64.h>
 
 #ifndef __ASSEMBLY__
@@ -26,33 +25,21 @@ typedef unsigned short umode_t;
 /* Dma addresses come in generic and 64-bit flavours.  */
 
 typedef u32 dma_addr_t;
-typedef u64 dma64_addr_t;
 
-#endif /* __ASSEMBLY__ */
+#if defined(__arch64__)
 
-#endif /* __KERNEL__ */
+/*** SPARC 64 bit ***/
+typedef u64 dma64_addr_t;
 #else
-
 /*** SPARC 32 bit ***/
-#include <asm-generic/int-ll64.h>
-
-#ifndef __ASSEMBLY__
-
-typedef unsigned short umode_t;
-
-#endif /* __ASSEMBLY__ */
-
-#ifdef __KERNEL__
-
-#ifndef __ASSEMBLY__
-
-typedef u32 dma_addr_t;
 typedef u32 dma64_addr_t;
 
+#endif /* defined(__arch64__) */
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
 
-#endif /* defined(__sparc__) && defined(__arch64__) */
+#endif /* defined(__sparc__) */
 
 #endif /* defined(_SPARC_TYPES_H) */
-- 
cgit v1.2.3-70-g09d2


From 8abf9196008a7f06841b10769d1aed3f28c1d314 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 13 Aug 2009 10:05:43 +0000
Subject: sparc64: cheaper asm/uaccess.h inclusion

sched.h inclusion is definitely not needed like in 32-bit version,
remove it, fixup compilation.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/uaccess_64.h | 2 +-
 arch/sparc/kernel/sysfs.c           | 1 +
 samples/kprobes/kretprobe_example.c | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index a38c0323891..9ea271e19c7 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -7,8 +7,8 @@
 
 #ifdef __KERNEL__
 #include <linux/compiler.h>
-#include <linux/sched.h>
 #include <linux/string.h>
+#include <linux/thread_info.h>
 #include <asm/asi.h>
 #include <asm/system.h>
 #include <asm/spitfire.h>
diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index d28f496f466..ca39c606fe8 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
  */
+#include <linux/sched.h>
 #include <linux/sysdev.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c
index 4e764b317d6..1041b673159 100644
--- a/samples/kprobes/kretprobe_example.c
+++ b/samples/kprobes/kretprobe_example.c
@@ -23,6 +23,7 @@
 #include <linux/kprobes.h>
 #include <linux/ktime.h>
 #include <linux/limits.h>
+#include <linux/sched.h>
 
 static char func_name[NAME_MAX] = "do_fork";
 module_param_string(func, func_name, NAME_MAX, S_IRUGO);
-- 
cgit v1.2.3-70-g09d2


From 5213a780295895630530aebacdd19217a5379c9a Mon Sep 17 00:00:00 2001
From: Konrad Eisele <konrad@gaisler.com>
Date: Mon, 17 Aug 2009 00:13:29 +0000
Subject: sparc,leon: CONFIG_SPARC_LEON option and leon specific files.

The macro CONFIG_SPARC_LEON will shield, if undefined, the sun-sparc
code from LEON specific code. In
particular include/asm/leon.h will get empty through #ifdef and
leon_kernel.c and leon_mm.c will not be compiled.

Signed-off-by: Konrad Eisele <konrad@gaisler.com>
Reviewed-by:   Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/Kconfig                 |  11 ++
 arch/sparc/include/asm/leon.h      | 362 +++++++++++++++++++++++++++++++++++++
 arch/sparc/include/asm/leon_amba.h | 263 +++++++++++++++++++++++++++
 arch/sparc/kernel/leon_kernel.c    | 203 +++++++++++++++++++++
 arch/sparc/mm/leon_mm.c            | 260 ++++++++++++++++++++++++++
 5 files changed, 1099 insertions(+)
 create mode 100644 arch/sparc/include/asm/leon.h
 create mode 100644 arch/sparc/include/asm/leon_amba.h
 create mode 100644 arch/sparc/kernel/leon_kernel.c
 create mode 100644 arch/sparc/mm/leon_mm.c

(limited to 'arch/sparc')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eab..b847f880eec 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -437,6 +437,17 @@ config SERIAL_CONSOLE
 
 	  If unsure, say N.
 
+config SPARC_LEON
+	bool "Sparc Leon processor family"
+	depends on SPARC32
+	---help---
+	  If you say Y here if you are running on a SPARC-LEON processor.
+	  The LEON processor is a synthesizable VHDL model of the
+	  SPARC-v8 standard. LEON is  part of the GRLIB collection of
+	  IP cores that are distributed under GPL. GRLIB can be downloaded
+	  from www.gaisler.com. You can download a sparc-linux cross-compilation
+	  toolchain at www.gaisler.com.
+
 endmenu
 
 menu "Bus options (PCI etc.)"
diff --git a/arch/sparc/include/asm/leon.h b/arch/sparc/include/asm/leon.h
new file mode 100644
index 00000000000..28a42b73f64
--- /dev/null
+++ b/arch/sparc/include/asm/leon.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright (C) 2004 Konrad Eisele (eiselekd@web.de,konrad@gaisler.com) Gaisler Research
+ * Copyright (C) 2004 Stefan Holst (mail@s-holst.de) Uni-Stuttgart
+ * Copyright (C) 2009 Daniel Hellstrom (daniel@gaisler.com) Aeroflex Gaisler AB
+ * Copyright (C) 2009 Konrad Eisele (konrad@gaisler.com) Aeroflex Gaisler AB
+ */
+
+#ifndef LEON_H_INCLUDE
+#define LEON_H_INCLUDE
+
+#ifdef CONFIG_SPARC_LEON
+
+#define ASI_LEON_NOCACHE	0x01
+
+#define ASI_LEON_DCACHE_MISS	0x1
+
+#define ASI_LEON_CACHEREGS	0x02
+#define ASI_LEON_IFLUSH		0x10
+#define ASI_LEON_DFLUSH		0x11
+
+#define ASI_LEON_MMUFLUSH	0x18
+#define ASI_LEON_MMUREGS	0x19
+#define ASI_LEON_BYPASS		0x1c
+#define ASI_LEON_FLUSH_PAGE	0x10
+
+/* mmu register access, ASI_LEON_MMUREGS */
+#define LEON_CNR_CTRL		0x000
+#define LEON_CNR_CTXP		0x100
+#define LEON_CNR_CTX		0x200
+#define LEON_CNR_F		0x300
+#define LEON_CNR_FADDR		0x400
+
+#define LEON_CNR_CTX_NCTX	256	/*number of MMU ctx */
+
+#define LEON_CNR_CTRL_TLBDIS	0x80000000
+
+#define LEON_MMUTLB_ENT_MAX	64
+
+/*
+ * diagnostic access from mmutlb.vhd:
+ * 0: pte address
+ * 4: pte
+ * 8: additional flags
+ */
+#define LEON_DIAGF_LVL		0x3
+#define LEON_DIAGF_WR		0x8
+#define LEON_DIAGF_WR_SHIFT	3
+#define LEON_DIAGF_HIT		0x10
+#define LEON_DIAGF_HIT_SHIFT	4
+#define LEON_DIAGF_CTX		0x1fe0
+#define LEON_DIAGF_CTX_SHIFT	5
+#define LEON_DIAGF_VALID	0x2000
+#define LEON_DIAGF_VALID_SHIFT	13
+
+/*
+ *  Interrupt Sources
+ *
+ *  The interrupt source numbers directly map to the trap type and to
+ *  the bits used in the Interrupt Clear, Interrupt Force, Interrupt Mask,
+ *  and the Interrupt Pending Registers.
+ */
+#define LEON_INTERRUPT_CORRECTABLE_MEMORY_ERROR	1
+#define LEON_INTERRUPT_UART_1_RX_TX		2
+#define LEON_INTERRUPT_UART_0_RX_TX		3
+#define LEON_INTERRUPT_EXTERNAL_0		4
+#define LEON_INTERRUPT_EXTERNAL_1		5
+#define LEON_INTERRUPT_EXTERNAL_2		6
+#define LEON_INTERRUPT_EXTERNAL_3		7
+#define LEON_INTERRUPT_TIMER1			8
+#define LEON_INTERRUPT_TIMER2			9
+#define LEON_INTERRUPT_EMPTY1			10
+#define LEON_INTERRUPT_EMPTY2			11
+#define LEON_INTERRUPT_OPEN_ETH			12
+#define LEON_INTERRUPT_EMPTY4			13
+#define LEON_INTERRUPT_EMPTY5			14
+#define LEON_INTERRUPT_EMPTY6			15
+
+/* irq masks */
+#define LEON_HARD_INT(x)	(1 << (x))	/* irq 0-15 */
+#define LEON_IRQMASK_R		0x0000fffe	/* bit 15- 1 of lregs.irqmask */
+#define LEON_IRQPRIO_R		0xfffe0000	/* bit 31-17 of lregs.irqmask */
+
+/* leon uart register definitions */
+#define LEON_OFF_UDATA	0x0
+#define LEON_OFF_USTAT	0x4
+#define LEON_OFF_UCTRL	0x8
+#define LEON_OFF_USCAL	0xc
+
+#define LEON_UCTRL_RE	0x01
+#define LEON_UCTRL_TE	0x02
+#define LEON_UCTRL_RI	0x04
+#define LEON_UCTRL_TI	0x08
+#define LEON_UCTRL_PS	0x10
+#define LEON_UCTRL_PE	0x20
+#define LEON_UCTRL_FL	0x40
+#define LEON_UCTRL_LB	0x80
+
+#define LEON_USTAT_DR	0x01
+#define LEON_USTAT_TS	0x02
+#define LEON_USTAT_TH	0x04
+#define LEON_USTAT_BR	0x08
+#define LEON_USTAT_OV	0x10
+#define LEON_USTAT_PE	0x20
+#define LEON_USTAT_FE	0x40
+
+#define LEON_MCFG2_SRAMDIS		0x00002000
+#define LEON_MCFG2_SDRAMEN		0x00004000
+#define LEON_MCFG2_SRAMBANKSZ		0x00001e00	/* [12-9] */
+#define LEON_MCFG2_SRAMBANKSZ_SHIFT	9
+#define LEON_MCFG2_SDRAMBANKSZ		0x03800000	/* [25-23] */
+#define LEON_MCFG2_SDRAMBANKSZ_SHIFT	23
+
+#define LEON_TCNT0_MASK	0x7fffff
+
+#define LEON_USTAT_ERROR (LEON_USTAT_OV | LEON_USTAT_PE | LEON_USTAT_FE)
+/* no break yet */
+
+#define ASI_LEON3_SYSCTRL		0x02
+#define ASI_LEON3_SYSCTRL_ICFG		0x08
+#define ASI_LEON3_SYSCTRL_DCFG		0x0c
+#define ASI_LEON3_SYSCTRL_CFG_SNOOPING (1 << 27)
+#define ASI_LEON3_SYSCTRL_CFG_SSIZE(c) (1 << ((c >> 20) & 0xf))
+
+#ifndef __ASSEMBLY__
+
+/* do a virtual address read without cache */
+static inline unsigned long leon_readnobuffer_reg(unsigned long paddr)
+{
+	unsigned long retval;
+	__asm__ __volatile__("lda [%1] %2, %0\n\t" :
+			     "=r"(retval) : "r"(paddr), "i"(ASI_LEON_NOCACHE));
+	return retval;
+}
+
+/* do a physical address bypass write, i.e. for 0x80000000 */
+static inline void leon_store_reg(unsigned long paddr, unsigned long value)
+{
+	__asm__ __volatile__("sta %0, [%1] %2\n\t" : : "r"(value), "r"(paddr),
+			     "i"(ASI_LEON_BYPASS) : "memory");
+}
+
+/* do a physical address bypass load, i.e. for 0x80000000 */
+static inline unsigned long leon_load_reg(unsigned long paddr)
+{
+	unsigned long retval;
+	__asm__ __volatile__("lda [%1] %2, %0\n\t" :
+			     "=r"(retval) : "r"(paddr), "i"(ASI_LEON_BYPASS));
+	return retval;
+}
+
+extern inline void leon_srmmu_disabletlb(void)
+{
+	unsigned int retval;
+	__asm__ __volatile__("lda [%%g0] %2, %0\n\t" : "=r"(retval) : "r"(0),
+			     "i"(ASI_LEON_MMUREGS));
+	retval |= LEON_CNR_CTRL_TLBDIS;
+	__asm__ __volatile__("sta %0, [%%g0] %2\n\t" : : "r"(retval), "r"(0),
+			     "i"(ASI_LEON_MMUREGS) : "memory");
+}
+
+extern inline void leon_srmmu_enabletlb(void)
+{
+	unsigned int retval;
+	__asm__ __volatile__("lda [%%g0] %2, %0\n\t" : "=r"(retval) : "r"(0),
+			     "i"(ASI_LEON_MMUREGS));
+	retval = retval & ~LEON_CNR_CTRL_TLBDIS;
+	__asm__ __volatile__("sta %0, [%%g0] %2\n\t" : : "r"(retval), "r"(0),
+			     "i"(ASI_LEON_MMUREGS) : "memory");
+}
+
+/* macro access for leon_load_reg() and leon_store_reg() */
+#define LEON3_BYPASS_LOAD_PA(x)	    (leon_load_reg((unsigned long)(x)))
+#define LEON3_BYPASS_STORE_PA(x, v) (leon_store_reg((unsigned long)(x), (unsigned long)(v)))
+#define LEON3_BYPASS_ANDIN_PA(x, v) LEON3_BYPASS_STORE_PA(x, LEON3_BYPASS_LOAD_PA(x) & v)
+#define LEON3_BYPASS_ORIN_PA(x, v)  LEON3_BYPASS_STORE_PA(x, LEON3_BYPASS_LOAD_PA(x) | v)
+#define LEON_BYPASS_LOAD_PA(x)      leon_load_reg((unsigned long)(x))
+#define LEON_BYPASS_STORE_PA(x, v)  leon_store_reg((unsigned long)(x), (unsigned long)(v))
+#define LEON_REGLOAD_PA(x)          leon_load_reg((unsigned long)(x)+LEON_PREGS)
+#define LEON_REGSTORE_PA(x, v)      leon_store_reg((unsigned long)(x)+LEON_PREGS, (unsigned long)(v))
+#define LEON_REGSTORE_OR_PA(x, v)   LEON_REGSTORE_PA(x, LEON_REGLOAD_PA(x) | (unsigned long)(v))
+#define LEON_REGSTORE_AND_PA(x, v)  LEON_REGSTORE_PA(x, LEON_REGLOAD_PA(x) & (unsigned long)(v))
+
+/* macro access for leon_readnobuffer_reg() */
+#define LEON_BYPASSCACHE_LOAD_VA(x) leon_readnobuffer_reg((unsigned long)(x))
+
+extern void sparc_leon_eirq_register(int eirq);
+extern void leon_init(void);
+extern void leon_switch_mm(void);
+extern void leon_init_IRQ(void);
+
+extern unsigned long last_valid_pfn;
+
+extern inline unsigned long sparc_leon3_get_dcachecfg(void)
+{
+	unsigned int retval;
+	__asm__ __volatile__("lda [%1] %2, %0\n\t" :
+			     "=r"(retval) :
+			     "r"(ASI_LEON3_SYSCTRL_DCFG),
+			     "i"(ASI_LEON3_SYSCTRL));
+	return retval;
+}
+
+/* enable snooping */
+extern inline void sparc_leon3_enable_snooping(void)
+{
+	__asm__ __volatile__ ("lda [%%g0] 2, %%l1\n\t"
+			  "set 0x800000, %%l2\n\t"
+			  "or  %%l2, %%l1, %%l2\n\t"
+			  "sta %%l2, [%%g0] 2\n\t" : : : "l1", "l2");
+};
+
+extern inline void sparc_leon3_disable_cache(void)
+{
+	__asm__ __volatile__ ("lda [%%g0] 2, %%l1\n\t"
+			  "set 0x00000f, %%l2\n\t"
+			  "andn  %%l2, %%l1, %%l2\n\t"
+			  "sta %%l2, [%%g0] 2\n\t" : : : "l1", "l2");
+};
+
+#endif /*!__ASSEMBLY__*/
+
+#ifdef CONFIG_SMP
+# define LEON3_IRQ_RESCHEDULE		13
+# define LEON3_IRQ_TICKER		(leon_percpu_timer_dev[0].irq)
+# define LEON3_IRQ_CROSS_CALL		15
+#endif
+
+#if defined(PAGE_SIZE_LEON_8K)
+#define LEON_PAGE_SIZE_LEON 1
+#elif defined(PAGE_SIZE_LEON_16K)
+#define LEON_PAGE_SIZE_LEON 2)
+#else
+#define LEON_PAGE_SIZE_LEON 0
+#endif
+
+#if LEON_PAGE_SIZE_LEON == 0
+/* [ 8, 6, 6 ] + 12 */
+#define LEON_PGD_SH    24
+#define LEON_PGD_M     0xff
+#define LEON_PMD_SH    18
+#define LEON_PMD_SH_V  (LEON_PGD_SH-2)
+#define LEON_PMD_M     0x3f
+#define LEON_PTE_SH    12
+#define LEON_PTE_M     0x3f
+#elif LEON_PAGE_SIZE_LEON == 1
+/* [ 7, 6, 6 ] + 13 */
+#define LEON_PGD_SH    25
+#define LEON_PGD_M     0x7f
+#define LEON_PMD_SH    19
+#define LEON_PMD_SH_V  (LEON_PGD_SH-1)
+#define LEON_PMD_M     0x3f
+#define LEON_PTE_SH    13
+#define LEON_PTE_M     0x3f
+#elif LEON_PAGE_SIZE_LEON == 2
+/* [ 6, 6, 6 ] + 14 */
+#define LEON_PGD_SH    26
+#define LEON_PGD_M     0x3f
+#define LEON_PMD_SH    20
+#define LEON_PMD_SH_V  (LEON_PGD_SH-0)
+#define LEON_PMD_M     0x3f
+#define LEON_PTE_SH    14
+#define LEON_PTE_M     0x3f
+#elif LEON_PAGE_SIZE_LEON == 3
+/* [ 4, 7, 6 ] + 15 */
+#define LEON_PGD_SH    28
+#define LEON_PGD_M     0x0f
+#define LEON_PMD_SH    21
+#define LEON_PMD_SH_V  (LEON_PGD_SH-0)
+#define LEON_PMD_M     0x7f
+#define LEON_PTE_SH    15
+#define LEON_PTE_M     0x3f
+#else
+#error cannot determine LEON_PAGE_SIZE_LEON
+#endif
+
+#define PAGE_MIN_SHIFT   (12)
+#define PAGE_MIN_SIZE    (1UL << PAGE_MIN_SHIFT)
+
+#define LEON3_XCCR_SETS_MASK  0x07000000UL
+#define LEON3_XCCR_SSIZE_MASK 0x00f00000UL
+
+#define LEON2_CCR_DSETS_MASK 0x03000000UL
+#define LEON2_CFG_SSIZE_MASK 0x00007000UL
+
+#ifndef __ASSEMBLY__
+extern unsigned long srmmu_swprobe(unsigned long vaddr, unsigned long *paddr);
+extern void leon_flush_icache_all(void);
+extern void leon_flush_dcache_all(void);
+extern void leon_flush_cache_all(void);
+extern void leon_flush_tlb_all(void);
+extern int leon_flush_during_switch;
+extern int leon_flush_needed(void);
+
+struct vm_area_struct;
+extern void leon_flush_icache_all(void);
+extern void leon_flush_dcache_all(void);
+extern void leon_flush_pcache_all(struct vm_area_struct *vma, unsigned long page);
+extern void leon_flush_cache_all(void);
+extern void leon_flush_tlb_all(void);
+extern int leon_flush_during_switch;
+extern int leon_flush_needed(void);
+extern void leon_flush_pcache_all(struct vm_area_struct *vma, unsigned long page);
+
+/* struct that hold LEON3 cache configuration registers */
+struct leon3_cacheregs {
+	unsigned long ccr;	/* 0x00 - Cache Control Register  */
+	unsigned long iccr;     /* 0x08 - Instruction Cache Configuration Register */
+	unsigned long dccr;	/* 0x0c - Data Cache Configuration Register */
+};
+
+/* struct that hold LEON2 cache configuration register
+ * & configuration register
+ */
+struct leon2_cacheregs {
+	unsigned long ccr, cfg;
+};
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+
+struct device_node;
+extern int sparc_leon_eirq_get(int eirq, int cpu);
+extern irqreturn_t sparc_leon_eirq_isr(int dummy, void *dev_id);
+extern void sparc_leon_eirq_register(int eirq);
+extern void leon_clear_clock_irq(void);
+extern void leon_load_profile_irq(int cpu, unsigned int limit);
+extern void leon_init_timers(irq_handler_t counter_fn);
+extern void leon_clear_clock_irq(void);
+extern void leon_load_profile_irq(int cpu, unsigned int limit);
+extern void leon_trans_init(struct device_node *dp);
+extern void leon_node_init(struct device_node *dp, struct device_node ***nextp);
+extern void leon_init_IRQ(void);
+extern void leon_init(void);
+extern unsigned long srmmu_swprobe(unsigned long vaddr, unsigned long *paddr);
+extern void init_leon(void);
+extern void poke_leonsparc(void);
+extern void leon3_getCacheRegs(struct leon3_cacheregs *regs);
+extern int leon_flush_needed(void);
+extern void leon_switch_mm(void);
+extern int srmmu_swprobe_trace;
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASSEMBLY__ */
+
+/* macros used in leon_mm.c */
+#define PFN(x)           ((x) >> PAGE_SHIFT)
+#define _pfn_valid(pfn)	 ((pfn < last_valid_pfn) && (pfn >= PFN(phys_base)))
+#define _SRMMU_PTE_PMASK_LEON 0xffffffff
+
+#else /* defined(CONFIG_SPARC_LEON) */
+
+/* nop definitions for !LEON case */
+#define leon_init() do {} while (0)
+#define leon_switch_mm() do {} while (0)
+#define leon_init_IRQ() do {} while (0)
+#define init_leon() do {} while (0)
+
+#endif /* !defined(CONFIG_SPARC_LEON) */
+
+#endif
diff --git a/arch/sparc/include/asm/leon_amba.h b/arch/sparc/include/asm/leon_amba.h
new file mode 100644
index 00000000000..618e8882179
--- /dev/null
+++ b/arch/sparc/include/asm/leon_amba.h
@@ -0,0 +1,263 @@
+/*
+*Copyright (C) 2004 Konrad Eisele (eiselekd@web.de,konrad@gaisler.com), Gaisler Research
+*Copyright (C) 2004 Stefan Holst (mail@s-holst.de), Uni-Stuttgart
+*Copyright (C) 2009 Daniel Hellstrom (daniel@gaisler.com),Konrad Eisele (konrad@gaisler.com) Aeroflex Gaisler AB
+*/
+
+#ifndef LEON_AMBA_H_INCLUDE
+#define LEON_AMBA_H_INCLUDE
+
+#ifndef __ASSEMBLY__
+
+struct amba_prom_registers {
+	unsigned int phys_addr;	/* The physical address of this register */
+	unsigned int reg_size;	/* How many bytes does this register take up? */
+};
+
+#endif
+
+/*
+ *  The following defines the bits in the LEON UART Status Registers.
+ */
+
+#define LEON_REG_UART_STATUS_DR   0x00000001	/* Data Ready */
+#define LEON_REG_UART_STATUS_TSE  0x00000002	/* TX Send Register Empty */
+#define LEON_REG_UART_STATUS_THE  0x00000004	/* TX Hold Register Empty */
+#define LEON_REG_UART_STATUS_BR   0x00000008	/* Break Error */
+#define LEON_REG_UART_STATUS_OE   0x00000010	/* RX Overrun Error */
+#define LEON_REG_UART_STATUS_PE   0x00000020	/* RX Parity Error */
+#define LEON_REG_UART_STATUS_FE   0x00000040	/* RX Framing Error */
+#define LEON_REG_UART_STATUS_ERR  0x00000078	/* Error Mask */
+
+/*
+ *  The following defines the bits in the LEON UART Ctrl Registers.
+ */
+
+#define LEON_REG_UART_CTRL_RE     0x00000001	/* Receiver enable */
+#define LEON_REG_UART_CTRL_TE     0x00000002	/* Transmitter enable */
+#define LEON_REG_UART_CTRL_RI     0x00000004	/* Receiver interrupt enable */
+#define LEON_REG_UART_CTRL_TI     0x00000008	/* Transmitter irq */
+#define LEON_REG_UART_CTRL_PS     0x00000010	/* Parity select */
+#define LEON_REG_UART_CTRL_PE     0x00000020	/* Parity enable */
+#define LEON_REG_UART_CTRL_FL     0x00000040	/* Flow control enable */
+#define LEON_REG_UART_CTRL_LB     0x00000080	/* Loop Back enable */
+
+#define LEON3_GPTIMER_EN 1
+#define LEON3_GPTIMER_RL 2
+#define LEON3_GPTIMER_LD 4
+#define LEON3_GPTIMER_IRQEN 8
+#define LEON3_GPTIMER_SEPIRQ 8
+
+#define LEON23_REG_TIMER_CONTROL_EN    0x00000001 /* 1 = enable counting */
+/* 0 = hold scalar and counter */
+#define LEON23_REG_TIMER_CONTROL_RL    0x00000002 /* 1 = reload at 0 */
+						  /* 0 = stop at 0 */
+#define LEON23_REG_TIMER_CONTROL_LD    0x00000004 /* 1 = load counter */
+						  /* 0 = no function */
+#define LEON23_REG_TIMER_CONTROL_IQ    0x00000008 /* 1 = irq enable */
+						  /* 0 = no function */
+
+/*
+ *  The following defines the bits in the LEON PS/2 Status Registers.
+ */
+
+#define LEON_REG_PS2_STATUS_DR   0x00000001	/* Data Ready */
+#define LEON_REG_PS2_STATUS_PE   0x00000002	/* Parity error */
+#define LEON_REG_PS2_STATUS_FE   0x00000004	/* Framing error */
+#define LEON_REG_PS2_STATUS_KI   0x00000008	/* Keyboard inhibit */
+#define LEON_REG_PS2_STATUS_RF   0x00000010	/* RX buffer full */
+#define LEON_REG_PS2_STATUS_TF   0x00000020	/* TX buffer full */
+
+/*
+ *  The following defines the bits in the LEON PS/2 Ctrl Registers.
+ */
+
+#define LEON_REG_PS2_CTRL_RE 0x00000001	/* Receiver enable */
+#define LEON_REG_PS2_CTRL_TE 0x00000002	/* Transmitter enable */
+#define LEON_REG_PS2_CTRL_RI 0x00000004	/* Keyboard receive irq  */
+#define LEON_REG_PS2_CTRL_TI 0x00000008	/* Keyboard transmit irq */
+
+#define LEON3_IRQMPSTATUS_CPUNR     28
+#define LEON3_IRQMPSTATUS_BROADCAST 27
+
+#define GPTIMER_CONFIG_IRQNT(a)          (((a) >> 3) & 0x1f)
+#define GPTIMER_CONFIG_ISSEP(a)          ((a) & (1 << 8))
+#define GPTIMER_CONFIG_NTIMERS(a)        ((a) & (0x7))
+#define LEON3_GPTIMER_CTRL_PENDING       0x10
+#define LEON3_GPTIMER_CONFIG_NRTIMERS(c) ((c)->config & 0x7)
+#define LEON3_GPTIMER_CTRL_ISPENDING(r)  (((r)&LEON3_GPTIMER_CTRL_PENDING) ? 1 : 0)
+
+#ifdef CONFIG_SPARC_LEON
+
+#ifndef __ASSEMBLY__
+
+struct leon3_irqctrl_regs_map {
+	u32 ilevel;
+	u32 ipend;
+	u32 iforce;
+	u32 iclear;
+	u32 mpstatus;
+	u32 mpbroadcast;
+	u32 notused02;
+	u32 notused03;
+	u32 notused10;
+	u32 notused11;
+	u32 notused12;
+	u32 notused13;
+	u32 notused20;
+	u32 notused21;
+	u32 notused22;
+	u32 notused23;
+	u32 mask[16];
+	u32 force[16];
+	/* Extended IRQ registers */
+	u32 intid[16];	/* 0xc0 */
+};
+
+struct leon3_apbuart_regs_map {
+	u32 data;
+	u32 status;
+	u32 ctrl;
+	u32 scaler;
+};
+
+struct leon3_gptimerelem_regs_map {
+	u32 val;
+	u32 rld;
+	u32 ctrl;
+	u32 unused;
+};
+
+struct leon3_gptimer_regs_map {
+	u32 scalar;
+	u32 scalar_reload;
+	u32 config;
+	u32 unused;
+	struct leon3_gptimerelem_regs_map e[8];
+};
+
+/*
+ *  Types and structure used for AMBA Plug & Play bus scanning
+ */
+
+#define AMBA_MAXAPB_DEVS 64
+#define AMBA_MAXAPB_DEVS_PERBUS 16
+
+struct amba_device_table {
+	int devnr;		   /* number of devices on AHB or APB bus */
+	unsigned int *addr[16];    /* addresses to the devices configuration tables */
+	unsigned int allocbits[1]; /* 0=unallocated, 1=allocated driver */
+};
+
+struct amba_apbslv_device_table {
+	int devnr;		                  /* number of devices on AHB or APB bus */
+	unsigned int *addr[AMBA_MAXAPB_DEVS];     /* addresses to the devices configuration tables */
+	unsigned int apbmst[AMBA_MAXAPB_DEVS];    /* apb master if a entry is a apb slave */
+	unsigned int apbmstidx[AMBA_MAXAPB_DEVS]; /* apb master idx if a entry is a apb slave */
+	unsigned int allocbits[4];                /* 0=unallocated, 1=allocated driver */
+};
+
+struct amba_confarea_type {
+	struct amba_confarea_type *next;/* next bus in chain */
+	struct amba_device_table ahbmst;
+	struct amba_device_table ahbslv;
+	struct amba_apbslv_device_table apbslv;
+	unsigned int apbmst;
+};
+
+/* collect apb slaves */
+struct amba_apb_device {
+	unsigned int start, irq, bus_id;
+	struct amba_confarea_type *bus;
+};
+
+/* collect ahb slaves */
+struct amba_ahb_device {
+	unsigned int start[4], irq, bus_id;
+	struct amba_confarea_type *bus;
+};
+
+struct device_node;
+void _amba_init(struct device_node *dp, struct device_node ***nextp);
+
+extern struct leon3_irqctrl_regs_map *leon3_irqctrl_regs;
+extern struct leon3_gptimer_regs_map *leon3_gptimer_regs;
+extern struct amba_apb_device leon_percpu_timer_dev[16];
+extern int leondebug_irq_disable;
+extern int leon_debug_irqout;
+extern unsigned long leon3_gptimer_irq;
+extern unsigned int sparc_leon_eirq;
+
+#endif /* __ASSEMBLY__ */
+
+#define LEON3_IO_AREA 0xfff00000
+#define LEON3_CONF_AREA 0xff000
+#define LEON3_AHB_SLAVE_CONF_AREA (1 << 11)
+
+#define LEON3_AHB_CONF_WORDS 8
+#define LEON3_APB_CONF_WORDS 2
+#define LEON3_AHB_MASTERS 16
+#define LEON3_AHB_SLAVES 16
+#define LEON3_APB_SLAVES 16
+#define LEON3_APBUARTS 8
+
+/* Vendor codes */
+#define VENDOR_GAISLER   1
+#define VENDOR_PENDER    2
+#define VENDOR_ESA       4
+#define VENDOR_OPENCORES 8
+
+/* Gaisler Research device id's */
+#define GAISLER_LEON3    0x003
+#define GAISLER_LEON3DSU 0x004
+#define GAISLER_ETHAHB   0x005
+#define GAISLER_APBMST   0x006
+#define GAISLER_AHBUART  0x007
+#define GAISLER_SRCTRL   0x008
+#define GAISLER_SDCTRL   0x009
+#define GAISLER_APBUART  0x00C
+#define GAISLER_IRQMP    0x00D
+#define GAISLER_AHBRAM   0x00E
+#define GAISLER_GPTIMER  0x011
+#define GAISLER_PCITRG   0x012
+#define GAISLER_PCISBRG  0x013
+#define GAISLER_PCIFBRG  0x014
+#define GAISLER_PCITRACE 0x015
+#define GAISLER_PCIDMA   0x016
+#define GAISLER_AHBTRACE 0x017
+#define GAISLER_ETHDSU   0x018
+#define GAISLER_PIOPORT  0x01A
+#define GAISLER_GRGPIO   0x01A
+#define GAISLER_AHBJTAG  0x01c
+#define GAISLER_ETHMAC   0x01D
+#define GAISLER_AHB2AHB  0x020
+#define GAISLER_USBDC    0x021
+#define GAISLER_ATACTRL  0x024
+#define GAISLER_DDRSPA   0x025
+#define GAISLER_USBEHC   0x026
+#define GAISLER_USBUHC   0x027
+#define GAISLER_I2CMST   0x028
+#define GAISLER_SPICTRL  0x02D
+#define GAISLER_DDR2SPA  0x02E
+#define GAISLER_SPIMCTRL 0x045
+#define GAISLER_LEON4    0x048
+#define GAISLER_LEON4DSU 0x049
+#define GAISLER_AHBSTAT  0x052
+#define GAISLER_FTMCTRL  0x054
+#define GAISLER_KBD      0x060
+#define GAISLER_VGA      0x061
+#define GAISLER_SVGA     0x063
+#define GAISLER_GRSYSMON 0x066
+#define GAISLER_GRACECTRL 0x067
+
+#define GAISLER_L2TIME   0xffd	/* internal device: leon2 timer */
+#define GAISLER_L2C      0xffe	/* internal device: leon2compat */
+#define GAISLER_PLUGPLAY 0xfff	/* internal device: plug & play configarea */
+
+#define amba_vendor(x) (((x) >> 24) & 0xff)
+
+#define amba_device(x) (((x) >> 12) & 0xfff)
+
+#endif /* !defined(CONFIG_SPARC_LEON) */
+
+#endif
diff --git a/arch/sparc/kernel/leon_kernel.c b/arch/sparc/kernel/leon_kernel.c
new file mode 100644
index 00000000000..54d8a5bd482
--- /dev/null
+++ b/arch/sparc/kernel/leon_kernel.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2009 Daniel Hellstrom (daniel@gaisler.com) Aeroflex Gaisler AB
+ * Copyright (C) 2009 Konrad Eisele (konrad@gaisler.com) Aeroflex Gaisler AB
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/interrupt.h>
+#include <linux/of_device.h>
+#include <asm/oplib.h>
+#include <asm/timer.h>
+#include <asm/prom.h>
+#include <asm/leon.h>
+#include <asm/leon_amba.h>
+
+#include "prom.h"
+#include "irq.h"
+
+struct leon3_irqctrl_regs_map *leon3_irqctrl_regs; /* interrupt controller base address, initialized by amba_init() */
+struct leon3_gptimer_regs_map *leon3_gptimer_regs; /* timer controller base address, initialized by amba_init() */
+struct amba_apb_device leon_percpu_timer_dev[16];
+
+int leondebug_irq_disable;
+int leon_debug_irqout;
+static int dummy_master_l10_counter;
+
+unsigned long leon3_gptimer_irq; /* interrupt controller irq number, initialized by amba_init() */
+unsigned int sparc_leon_eirq;
+#define LEON_IMASK ((&leon3_irqctrl_regs->mask[0]))
+
+/* Return the IRQ of the pending IRQ on the extended IRQ controller */
+int sparc_leon_eirq_get(int eirq, int cpu)
+{
+	return LEON3_BYPASS_LOAD_PA(&leon3_irqctrl_regs->intid[cpu]) & 0x1f;
+}
+
+irqreturn_t sparc_leon_eirq_isr(int dummy, void *dev_id)
+{
+	printk(KERN_ERR "sparc_leon_eirq_isr: ERROR EXTENDED IRQ\n");
+	return IRQ_HANDLED;
+}
+
+/* The extended IRQ controller has been found, this function registers it */
+void sparc_leon_eirq_register(int eirq)
+{
+	int irq;
+
+	/* Register a "BAD" handler for this interrupt, it should never happen */
+	irq = request_irq(eirq, sparc_leon_eirq_isr,
+			  (IRQF_DISABLED | SA_STATIC_ALLOC), "extirq", NULL);
+
+	if (irq) {
+		printk(KERN_ERR
+		       "sparc_leon_eirq_register: unable to attach IRQ%d\n",
+		       eirq);
+	} else {
+		sparc_leon_eirq = eirq;
+	}
+
+}
+
+static inline unsigned long get_irqmask(unsigned int irq)
+{
+	unsigned long mask;
+
+	if (!irq || ((irq > 0xf) && !sparc_leon_eirq)
+	    || ((irq > 0x1f) && sparc_leon_eirq)) {
+		printk(KERN_ERR
+		       "leon_get_irqmask: false irq number: %d\n", irq);
+		mask = 0;
+	} else {
+		mask = LEON_HARD_INT(irq);
+	}
+	return mask;
+}
+
+static void leon_enable_irq(unsigned int irq_nr)
+{
+	unsigned long mask, flags;
+	mask = get_irqmask(irq_nr);
+	local_irq_save(flags);
+	LEON3_BYPASS_STORE_PA(LEON_IMASK,
+			      (LEON3_BYPASS_LOAD_PA(LEON_IMASK) | (mask)));
+	local_irq_restore(flags);
+}
+
+static void leon_disable_irq(unsigned int irq_nr)
+{
+	unsigned long mask, flags;
+	mask = get_irqmask(irq_nr);
+	local_irq_save(flags);
+	LEON3_BYPASS_STORE_PA(LEON_IMASK,
+			      (LEON3_BYPASS_LOAD_PA(LEON_IMASK) & ~(mask)));
+	local_irq_restore(flags);
+
+}
+
+void __init leon_init_timers(irq_handler_t counter_fn)
+{
+	int irq;
+
+	leondebug_irq_disable = 0;
+	leon_debug_irqout = 0;
+	master_l10_counter = (unsigned int *)&dummy_master_l10_counter;
+	dummy_master_l10_counter = 0;
+
+	if (leon3_gptimer_regs && leon3_irqctrl_regs) {
+		LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[0].val, 0);
+		LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[0].rld,
+				      (((1000000 / 100) - 1)));
+		LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[0].ctrl, 0);
+
+	} else {
+		printk(KERN_ERR "No Timer/irqctrl found\n");
+		BUG();
+	}
+
+	irq = request_irq(leon3_gptimer_irq,
+			  counter_fn,
+			  (IRQF_DISABLED | SA_STATIC_ALLOC), "timer", NULL);
+
+	if (irq) {
+		printk(KERN_ERR "leon_time_init: unable to attach IRQ%d\n",
+		       LEON_INTERRUPT_TIMER1);
+		prom_halt();
+	}
+
+	if (leon3_gptimer_regs) {
+		LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[0].ctrl,
+				      LEON3_GPTIMER_EN |
+				      LEON3_GPTIMER_RL |
+				      LEON3_GPTIMER_LD | LEON3_GPTIMER_IRQEN);
+	}
+}
+
+void leon_clear_clock_irq(void)
+{
+}
+
+void leon_load_profile_irq(int cpu, unsigned int limit)
+{
+	BUG();
+}
+
+
+
+
+void __init leon_trans_init(struct device_node *dp)
+{
+	if (strcmp(dp->type, "cpu") == 0 && strcmp(dp->name, "<NULL>") == 0) {
+		struct property *p;
+		p = of_find_property(dp, "mid", (void *)0);
+		if (p) {
+			int mid;
+			dp->name = prom_early_alloc(5 + 1);
+			memcpy(&mid, p->value, p->length);
+			sprintf((char *)dp->name, "cpu%.2d", mid);
+		}
+	}
+}
+
+void __initdata (*prom_amba_init)(struct device_node *dp, struct device_node ***nextp) = 0;
+
+void __init leon_node_init(struct device_node *dp, struct device_node ***nextp)
+{
+	if (prom_amba_init &&
+	    strcmp(dp->type, "ambapp") == 0 &&
+	    strcmp(dp->name, "ambapp0") == 0) {
+		prom_amba_init(dp, nextp);
+	}
+}
+
+void __init leon_init_IRQ(void)
+{
+	sparc_init_timers = leon_init_timers;
+
+	BTFIXUPSET_CALL(enable_irq, leon_enable_irq, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(disable_irq, leon_disable_irq, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(enable_pil_irq, leon_enable_irq, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(disable_pil_irq, leon_disable_irq, BTFIXUPCALL_NORM);
+
+	BTFIXUPSET_CALL(clear_clock_irq, leon_clear_clock_irq,
+			BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(load_profile_irq, leon_load_profile_irq,
+			BTFIXUPCALL_NOP);
+
+#ifdef CONFIG_SMP
+	BTFIXUPSET_CALL(set_cpu_int, leon_set_cpu_int, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(clear_cpu_int, leon_clear_ipi, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(set_irq_udt, leon_set_udt, BTFIXUPCALL_NORM);
+#endif
+
+}
+
+void __init leon_init(void)
+{
+	prom_build_more = &leon_node_init;
+}
diff --git a/arch/sparc/mm/leon_mm.c b/arch/sparc/mm/leon_mm.c
new file mode 100644
index 00000000000..c0e01297e64
--- /dev/null
+++ b/arch/sparc/mm/leon_mm.c
@@ -0,0 +1,260 @@
+/*
+ *  linux/arch/sparc/mm/leon_m.c
+ *
+ * Copyright (C) 2004 Konrad Eisele (eiselekd@web.de, konrad@gaisler.com) Gaisler Research
+ * Copyright (C) 2009 Daniel Hellstrom (daniel@gaisler.com) Aeroflex Gaisler AB
+ * Copyright (C) 2009 Konrad Eisele (konrad@gaisler.com) Aeroflex Gaisler AB
+ *
+ * do srmmu probe in software
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/asi.h>
+#include <asm/leon.h>
+#include <asm/tlbflush.h>
+
+int leon_flush_during_switch = 1;
+int srmmu_swprobe_trace;
+
+unsigned long srmmu_swprobe(unsigned long vaddr, unsigned long *paddr)
+{
+
+	unsigned int ctxtbl;
+	unsigned int pgd, pmd, ped;
+	unsigned int ptr;
+	unsigned int lvl, pte, paddrbase;
+	unsigned int ctx;
+	unsigned int paddr_calc;
+
+	paddrbase = 0;
+
+	if (srmmu_swprobe_trace)
+		printk(KERN_INFO "swprobe: trace on\n");
+
+	ctxtbl = srmmu_get_ctable_ptr();
+	if (!(ctxtbl)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: srmmu_get_ctable_ptr returned 0=>0\n");
+		return 0;
+	}
+	if (!_pfn_valid(PFN(ctxtbl))) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO
+			       "swprobe: !_pfn_valid(%x)=>0\n",
+			       PFN(ctxtbl));
+		return 0;
+	}
+
+	ctx = srmmu_get_context();
+	if (srmmu_swprobe_trace)
+		printk(KERN_INFO "swprobe:  --- ctx (%x) ---\n", ctx);
+
+	pgd = LEON_BYPASS_LOAD_PA(ctxtbl + (ctx * 4));
+
+	if (((pgd & SRMMU_ET_MASK) == SRMMU_ET_PTE)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: pgd is entry level 3\n");
+		lvl = 3;
+		pte = pgd;
+		paddrbase = pgd & _SRMMU_PTE_PMASK_LEON;
+		goto ready;
+	}
+	if (((pgd & SRMMU_ET_MASK) != SRMMU_ET_PTD)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: pgd is invalid => 0\n");
+		return 0;
+	}
+
+	if (srmmu_swprobe_trace)
+		printk(KERN_INFO "swprobe:  --- pgd (%x) ---\n", pgd);
+
+	ptr = (pgd & SRMMU_PTD_PMASK) << 4;
+	ptr += ((((vaddr) >> LEON_PGD_SH) & LEON_PGD_M) * 4);
+	if (!_pfn_valid(PFN(ptr)))
+		return 0;
+
+	pmd = LEON_BYPASS_LOAD_PA(ptr);
+	if (((pmd & SRMMU_ET_MASK) == SRMMU_ET_PTE)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: pmd is entry level 2\n");
+		lvl = 2;
+		pte = pmd;
+		paddrbase = pmd & _SRMMU_PTE_PMASK_LEON;
+		goto ready;
+	}
+	if (((pmd & SRMMU_ET_MASK) != SRMMU_ET_PTD)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: pmd is invalid => 0\n");
+		return 0;
+	}
+
+	if (srmmu_swprobe_trace)
+		printk(KERN_INFO "swprobe:  --- pmd (%x) ---\n", pmd);
+
+	ptr = (pmd & SRMMU_PTD_PMASK) << 4;
+	ptr += (((vaddr >> LEON_PMD_SH) & LEON_PMD_M) * 4);
+	if (!_pfn_valid(PFN(ptr))) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: !_pfn_valid(%x)=>0\n",
+			       PFN(ptr));
+		return 0;
+	}
+
+	ped = LEON_BYPASS_LOAD_PA(ptr);
+
+	if (((ped & SRMMU_ET_MASK) == SRMMU_ET_PTE)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: ped is entry level 1\n");
+		lvl = 1;
+		pte = ped;
+		paddrbase = ped & _SRMMU_PTE_PMASK_LEON;
+		goto ready;
+	}
+	if (((ped & SRMMU_ET_MASK) != SRMMU_ET_PTD)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: ped is invalid => 0\n");
+		return 0;
+	}
+
+	if (srmmu_swprobe_trace)
+		printk(KERN_INFO "swprobe:  --- ped (%x) ---\n", ped);
+
+	ptr = (ped & SRMMU_PTD_PMASK) << 4;
+	ptr += (((vaddr >> LEON_PTE_SH) & LEON_PTE_M) * 4);
+	if (!_pfn_valid(PFN(ptr)))
+		return 0;
+
+	ptr = LEON_BYPASS_LOAD_PA(ptr);
+	if (((ptr & SRMMU_ET_MASK) == SRMMU_ET_PTE)) {
+		if (srmmu_swprobe_trace)
+			printk(KERN_INFO "swprobe: ptr is entry level 0\n");
+		lvl = 0;
+		pte = ptr;
+		paddrbase = ptr & _SRMMU_PTE_PMASK_LEON;
+		goto ready;
+	}
+	if (srmmu_swprobe_trace)
+		printk(KERN_INFO "swprobe: ptr is invalid => 0\n");
+	return 0;
+
+ready:
+	switch (lvl) {
+	case 0:
+		paddr_calc =
+		    (vaddr & ~(-1 << LEON_PTE_SH)) | ((pte & ~0xff) << 4);
+		break;
+	case 1:
+		paddr_calc =
+		    (vaddr & ~(-1 << LEON_PMD_SH)) | ((pte & ~0xff) << 4);
+		break;
+	case 2:
+		paddr_calc =
+		    (vaddr & ~(-1 << LEON_PGD_SH)) | ((pte & ~0xff) << 4);
+		break;
+	default:
+	case 3:
+		paddr_calc = vaddr;
+		break;
+	}
+	if (srmmu_swprobe_trace)
+		printk(KERN_INFO "swprobe: padde %x\n", paddr_calc);
+	if (paddr)
+		*paddr = paddr_calc;
+	return paddrbase;
+}
+
+void leon_flush_icache_all(void)
+{
+	__asm__ __volatile__(" flush ");	/*iflush*/
+}
+
+void leon_flush_dcache_all(void)
+{
+	__asm__ __volatile__("sta %%g0, [%%g0] %0\n\t" : :
+			     "i"(ASI_LEON_DFLUSH) : "memory");
+}
+
+void leon_flush_pcache_all(struct vm_area_struct *vma, unsigned long page)
+{
+	if (vma->vm_flags & VM_EXEC)
+		leon_flush_icache_all();
+	leon_flush_dcache_all();
+}
+
+void leon_flush_cache_all(void)
+{
+	__asm__ __volatile__(" flush ");	/*iflush*/
+	__asm__ __volatile__("sta %%g0, [%%g0] %0\n\t" : :
+			     "i"(ASI_LEON_DFLUSH) : "memory");
+}
+
+void leon_flush_tlb_all(void)
+{
+	leon_flush_cache_all();
+	__asm__ __volatile__("sta %%g0, [%0] %1\n\t" : : "r"(0x400),
+			     "i"(ASI_LEON_MMUFLUSH) : "memory");
+}
+
+/* get all cache regs */
+void leon3_getCacheRegs(struct leon3_cacheregs *regs)
+{
+	unsigned long ccr, iccr, dccr;
+
+	if (!regs)
+		return;
+	/* Get Cache regs from "Cache ASI" address 0x0, 0x8 and 0xC */
+	__asm__ __volatile__("lda [%%g0] %3, %0\n\t"
+			     "mov 0x08, %%g1\n\t"
+			     "lda [%%g1] %3, %1\n\t"
+			     "mov 0x0c, %%g1\n\t"
+			     "lda [%%g1] %3, %2\n\t"
+			     : "=r"(ccr), "=r"(iccr), "=r"(dccr)
+			       /* output */
+			     : "i"(ASI_LEON_CACHEREGS)	/* input */
+			     : "g1"	/* clobber list */
+	    );
+	regs->ccr = ccr;
+	regs->iccr = iccr;
+	regs->dccr = dccr;
+}
+
+/* Due to virtual cache we need to check cache configuration if
+ * it is possible to skip flushing in some cases.
+ *
+ * Leon2 and Leon3 differ in their way of telling cache information
+ *
+ */
+int leon_flush_needed(void)
+{
+	int flush_needed = -1;
+	unsigned int ssize, sets;
+	char *setStr[4] =
+	    { "direct mapped", "2-way associative", "3-way associative",
+		"4-way associative"
+	};
+	/* leon 3 */
+	struct leon3_cacheregs cregs;
+	leon3_getCacheRegs(&cregs);
+	sets = (cregs.dccr & LEON3_XCCR_SETS_MASK) >> 24;
+	/* (ssize=>realsize) 0=>1k, 1=>2k, 2=>4k, 3=>8k ... */
+	ssize = 1 << ((cregs.dccr & LEON3_XCCR_SSIZE_MASK) >> 20);
+
+	printk(KERN_INFO "CACHE: %s cache, set size %dk\n",
+	       sets > 3 ? "unknown" : setStr[sets], ssize);
+	if ((ssize <= (PAGE_SIZE / 1024)) && (sets == 0)) {
+		/* Set Size <= Page size  ==>
+		   flush on every context switch not needed. */
+		flush_needed = 0;
+		printk(KERN_INFO "CACHE: not flushing on every context switch\n");
+	}
+	return flush_needed;
+}
+
+void leon_switch_mm(void)
+{
+	flush_tlb_mm((void *)0);
+	if (leon_flush_during_switch)
+		leon_flush_cache_all();
+}
-- 
cgit v1.2.3-70-g09d2


From 97fb58fa9bb509b49090a1c62ed1b660d518c66b Mon Sep 17 00:00:00 2001
From: Konrad Eisele <konrad@gaisler.com>
Date: Mon, 17 Aug 2009 00:13:30 +0000
Subject: sparc,leon: Redefine MMU register access asi if CONFIG_LEON

SPARC-LEON has a different ASI for mmu register accesses.

Signed-off-by: Konrad Eisele <konrad@gaisler.com>
Reviewed-by:   Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/asi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/asi.h b/arch/sparc/include/asm/asi.h
index 74703c5ef98..b2e3db63a64 100644
--- a/arch/sparc/include/asm/asi.h
+++ b/arch/sparc/include/asm/asi.h
@@ -40,7 +40,11 @@
 #define ASI_M_UNA01         0x01   /* Same here... */
 #define ASI_M_MXCC          0x02   /* Access to TI VIKING MXCC registers */
 #define ASI_M_FLUSH_PROBE   0x03   /* Reference MMU Flush/Probe; rw, ss */
+#ifndef CONFIG_SPARC_LEON
 #define ASI_M_MMUREGS       0x04   /* MMU Registers; rw, ss */
+#else
+#define ASI_M_MMUREGS       0x19
+#endif /* CONFIG_SPARC_LEON */
 #define ASI_M_TLBDIAG       0x05   /* MMU TLB only Diagnostics */
 #define ASI_M_DIAGS         0x06   /* Reference MMU Diagnostics */
 #define ASI_M_IODIAG        0x07   /* MMU I/O TLB only Diagnostics */
-- 
cgit v1.2.3-70-g09d2


From 0fd7ef1fe0e6e70c7851ce65a2eb8a8d3f49147e Mon Sep 17 00:00:00 2001
From: Konrad Eisele <konrad@gaisler.com>
Date: Mon, 17 Aug 2009 00:13:31 +0000
Subject: sparc,leon: Introduce the sparc-leon CPU type.

Add sparc_leon enum, M_LEON|M_LEON3_SOC machine. Add compilation of
leon.c in mm and kernel
if CONFIG_SPARC_LEON is defined. Add sparc_leon dependent
initialization to switch statements + head.S.

Signed-off-by: Konrad Eisele <konrad@gaisler.com>
Reviewed-by:   Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/machines.h  | 6 +++++-
 arch/sparc/include/asm/pgtsrmmu.h  | 4 ++++
 arch/sparc/include/asm/prom.h      | 3 +++
 arch/sparc/include/asm/system_32.h | 1 +
 arch/sparc/kernel/Makefile         | 2 ++
 arch/sparc/kernel/cpu.c            | 5 +++++
 arch/sparc/kernel/head_32.S        | 5 +++++
 arch/sparc/kernel/idprom.c         | 2 ++
 arch/sparc/kernel/irq_32.c         | 5 +++++
 arch/sparc/kernel/setup_32.c       | 5 +++++
 arch/sparc/mm/Makefile             | 1 +
 arch/sparc/mm/init_32.c            | 4 ++++
 arch/sparc/mm/loadmmu.c            | 1 +
 13 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/machines.h b/arch/sparc/include/asm/machines.h
index c28c2f24879..cd9c099567e 100644
--- a/arch/sparc/include/asm/machines.h
+++ b/arch/sparc/include/asm/machines.h
@@ -15,7 +15,7 @@ struct Sun_Machine_Models {
 /* Current number of machines we know about that has an IDPROM
  * machtype entry including one entry for the 0x80 OBP machines.
  */
-#define NUM_SUN_MACHINES   15
+#define NUM_SUN_MACHINES   16
 
 /* The machine type in the idprom area looks like this:
  *
@@ -30,6 +30,7 @@ struct Sun_Machine_Models {
 
 #define SM_ARCH_MASK  0xf0
 #define SM_SUN4       0x20
+#define  M_LEON       0x30
 #define SM_SUN4C      0x50
 #define SM_SUN4M      0x70
 #define SM_SUN4M_OBP  0x80
@@ -41,6 +42,9 @@ struct Sun_Machine_Models {
 #define SM_4_330      0x03    /* Sun 4/300 series */
 #define SM_4_470      0x04    /* Sun 4/400 series */
 
+/* Leon machines */
+#define M_LEON3_SOC   0x02    /* Leon3 SoC */
+
 /* Sun4c machines                Full Name              - PROM NAME */
 #define SM_4C_SS1     0x01    /* Sun4c SparcStation 1   - Sun 4/60  */
 #define SM_4C_IPC     0x02    /* Sun4c SparcStation IPC - Sun 4/40  */
diff --git a/arch/sparc/include/asm/pgtsrmmu.h b/arch/sparc/include/asm/pgtsrmmu.h
index 808555fc1d5..1407c07bdad 100644
--- a/arch/sparc/include/asm/pgtsrmmu.h
+++ b/arch/sparc/include/asm/pgtsrmmu.h
@@ -267,6 +267,7 @@ static inline void srmmu_flush_tlb_page(unsigned long page)
 
 }
 
+#ifndef CONFIG_SPARC_LEON
 static inline unsigned long srmmu_hwprobe(unsigned long vaddr)
 {
 	unsigned long retval;
@@ -278,6 +279,9 @@ static inline unsigned long srmmu_hwprobe(unsigned long vaddr)
 
 	return retval;
 }
+#else
+#define srmmu_hwprobe(addr) (srmmu_swprobe(addr, 0) & SRMMU_PTE_PMASK)
+#endif
 
 static inline int
 srmmu_get_pte (unsigned long addr)
diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index be8d7aaeb60..82a190d7efc 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -118,5 +118,8 @@ extern struct device_node *of_console_device;
 extern char *of_console_path;
 extern char *of_console_options;
 
+extern void (*prom_build_more)(struct device_node *dp, struct device_node ***nextp);
+extern char *build_full_name(struct device_node *dp);
+
 #endif /* __KERNEL__ */
 #endif /* _SPARC_PROM_H */
diff --git a/arch/sparc/include/asm/system_32.h b/arch/sparc/include/asm/system_32.h
index 751c8c17f5a..890036b3689 100644
--- a/arch/sparc/include/asm/system_32.h
+++ b/arch/sparc/include/asm/system_32.h
@@ -32,6 +32,7 @@ enum sparc_cpu {
   sun4u       = 0x05, /* V8 ploos ploos */
   sun_unknown = 0x06,
   ap1000      = 0x07, /* almost a sun4m */
+  sparc_leon  = 0x08, /* Leon SoC */
 };
 
 /* Really, userland should not be looking at any of this... */
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 475ce4696ac..b0041756c9f 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -41,6 +41,8 @@ obj-y                   += of_device_common.o
 obj-y                   += of_device_$(BITS).o
 obj-$(CONFIG_SPARC64)   += prom_irqtrans.o
 
+obj-$(CONFIG_SPARC_LEON)+= leon_kernel.o
+
 obj-$(CONFIG_SPARC64)   += reboot.o
 obj-$(CONFIG_SPARC64)   += sysfs.o
 obj-$(CONFIG_SPARC64)   += iommu.o
diff --git a/arch/sparc/kernel/cpu.c b/arch/sparc/kernel/cpu.c
index d85c3dc4953..1446df90ef8 100644
--- a/arch/sparc/kernel/cpu.c
+++ b/arch/sparc/kernel/cpu.c
@@ -312,7 +312,12 @@ void __cpuinit cpu_probe(void)
 
 	psr = get_psr();
 	put_psr(psr | PSR_EF);
+#ifdef CONFIG_SPARC_LEON
+	fpu_vers = 7;
+#else
 	fpu_vers = ((get_fsr() >> 17) & 0x7);
+#endif
+
 	put_psr(psr);
 
 	set_cpu_and_fpu(psr_impl, psr_vers, fpu_vers);
diff --git a/arch/sparc/kernel/head_32.S b/arch/sparc/kernel/head_32.S
index 6b4d8acc4c8..439d82a95ac 100644
--- a/arch/sparc/kernel/head_32.S
+++ b/arch/sparc/kernel/head_32.S
@@ -809,6 +809,11 @@ found_version:
 		 nop
 
 got_prop:
+#ifdef CONFIG_SPARC_LEON
+	        /* no cpu-type check is needed, it is a SPARC-LEON */
+		ba sun4c_continue_boot
+		 nop
+#endif
 		set	cputypval, %o2
 		ldub	[%o2 + 0x4], %l1
 
diff --git a/arch/sparc/kernel/idprom.c b/arch/sparc/kernel/idprom.c
index 57922f69c3f..52a15fe2db1 100644
--- a/arch/sparc/kernel/idprom.c
+++ b/arch/sparc/kernel/idprom.c
@@ -31,6 +31,8 @@ static struct Sun_Machine_Models Sun_Machines[NUM_SUN_MACHINES] = {
 { .name = "Sun 4/200 Series",        .id_machtype = (SM_SUN4 | SM_4_260) },
 { .name = "Sun 4/300 Series",        .id_machtype = (SM_SUN4 | SM_4_330) },
 { .name = "Sun 4/400 Series",        .id_machtype = (SM_SUN4 | SM_4_470) },
+/* Now Leon */
+{ .name = "Leon3 System-on-a-Chip",  .id_machtype = (M_LEON | M_LEON3_SOC) },
 /* Now, Sun4c's */
 { .name = "Sun4c SparcStation 1",    .id_machtype = (SM_SUN4C | SM_4C_SS1) },
 { .name = "Sun4c SparcStation IPC",  .id_machtype = (SM_SUN4C | SM_4C_IPC) },
diff --git a/arch/sparc/kernel/irq_32.c b/arch/sparc/kernel/irq_32.c
index ad800b80c71..e1af4372832 100644
--- a/arch/sparc/kernel/irq_32.c
+++ b/arch/sparc/kernel/irq_32.c
@@ -45,6 +45,7 @@
 #include <asm/pcic.h>
 #include <asm/cacheflush.h>
 #include <asm/irq_regs.h>
+#include <asm/leon.h>
 
 #include "kernel.h"
 #include "irq.h"
@@ -661,6 +662,10 @@ void __init init_IRQ(void)
 		sun4d_init_IRQ();
 		break;
 
+	case sparc_leon:
+		leon_init_IRQ();
+		break;
+
 	default:
 		prom_printf("Cannot initialize IRQs on this Sun machine...");
 		break;
diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index 998cadb4e7f..16a47ffe03c 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -235,6 +235,8 @@ void __init setup_arch(char **cmdline_p)
 		sparc_cpu_model = sun4e;
 	if (!strcmp(&cputypval,"sun4u"))
 		sparc_cpu_model = sun4u;
+	if (!strncmp(&cputypval, "leon" , 4))
+		sparc_cpu_model = sparc_leon;
 
 	printk("ARCH: ");
 	switch(sparc_cpu_model) {
@@ -256,6 +258,9 @@ void __init setup_arch(char **cmdline_p)
 	case sun4u:
 		printk("SUN4U\n");
 		break;
+	case sparc_leon:
+		printk("LEON\n");
+		break;
 	default:
 		printk("UNKNOWN!\n");
 		break;
diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile
index 681abe0a459..79836a7dd00 100644
--- a/arch/sparc/mm/Makefile
+++ b/arch/sparc/mm/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_SPARC32)   += loadmmu.o
 obj-y                   += generic_$(BITS).o
 obj-$(CONFIG_SPARC32)   += extable.o btfixup.o srmmu.o iommu.o io-unit.o
 obj-$(CONFIG_SPARC32)   += hypersparc.o viking.o tsunami.o swift.o
+obj-$(CONFIG_SPARC_LEON)+= leon_mm.o
 
 # Only used by sparc64
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 26bb3919ff1..54114ad0bde 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -34,6 +34,7 @@
 #include <asm/pgalloc.h>	/* bug in asm-generic/tlb.h: check_pgt_cache */
 #include <asm/tlb.h>
 #include <asm/prom.h>
+#include <asm/leon.h>
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
@@ -326,6 +327,9 @@ void __init paging_init(void)
 		sparc_unmapped_base = 0xe0000000;
 		BTFIXUPSET_SETHI(sparc_unmapped_base, 0xe0000000);
 		break;
+	case sparc_leon:
+		leon_init();
+		/* fall through */
 	case sun4m:
 	case sun4d:
 		srmmu_paging_init();
diff --git a/arch/sparc/mm/loadmmu.c b/arch/sparc/mm/loadmmu.c
index 652be05acbe..82ec8f66603 100644
--- a/arch/sparc/mm/loadmmu.c
+++ b/arch/sparc/mm/loadmmu.c
@@ -33,6 +33,7 @@ void __init load_mmu(void)
 		break;
 	case sun4m:
 	case sun4d:
+	case sparc_leon:
 		ld_mmu_srmmu();
 		break;
 	default:
-- 
cgit v1.2.3-70-g09d2


From e63829de3d03f92cea2b26119e0aa9a7043b9913 Mon Sep 17 00:00:00 2001
From: Konrad Eisele <konrad@gaisler.com>
Date: Mon, 17 Aug 2009 00:13:32 +0000
Subject: sparc,leon: Added support for AMBAPP bus.

The device is a AMBA bus if it is a child of prom node "ambapp" (AMBA
plug and play). Two functions
leon_trans_init() and leon_node_init() (defined in
sparc/kernel/leon_kernel.c) are called in the
prom_build_tree() path if CONFIG_SPARC_LEON is
defined. leon_node_init() will build up the device
tree using AMBA plug and play. Also: a extra check was addes to
prom_common.c:build_one_prop()
in case a rom-node is undefined which can happen for SPARC-LEON
because it creates only a minimum
nodes to emulate sparc behaviour.

Signed-off-by: Konrad Eisele <konrad@gaisler.com>
Reviewed-by:   Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/of_device_32.c | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/sparc/kernel/prom_32.c      | 33 +++++++++++++++++++++++++++++++++
 arch/sparc/kernel/prom_common.c  | 10 ++++++++--
 3 files changed, 81 insertions(+), 2 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c
index 90396702ea2..4c26eb59e74 100644
--- a/arch/sparc/kernel/of_device_32.c
+++ b/arch/sparc/kernel/of_device_32.c
@@ -9,6 +9,8 @@
 #include <linux/irq.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
+#include <asm/leon.h>
+#include <asm/leon_amba.h>
 
 #include "of_device_common.h"
 
@@ -97,6 +99,35 @@ static unsigned long of_bus_sbus_get_flags(const u32 *addr, unsigned long flags)
 	return IORESOURCE_MEM;
 }
 
+ /*
+ * AMBAPP bus specific translator
+ */
+
+static int of_bus_ambapp_match(struct device_node *np)
+{
+	return !strcmp(np->name, "ambapp");
+}
+
+static void of_bus_ambapp_count_cells(struct device_node *child,
+				      int *addrc, int *sizec)
+{
+	if (addrc)
+		*addrc = 1;
+	if (sizec)
+		*sizec = 1;
+}
+
+static int of_bus_ambapp_map(u32 *addr, const u32 *range,
+			     int na, int ns, int pna)
+{
+	return of_bus_default_map(addr, range, na, ns, pna);
+}
+
+static unsigned long of_bus_ambapp_get_flags(const u32 *addr,
+					     unsigned long flags)
+{
+	return IORESOURCE_MEM;
+}
 
 /*
  * Array of bus specific translators
@@ -121,6 +152,15 @@ static struct of_bus of_busses[] = {
 		.map = of_bus_default_map,
 		.get_flags = of_bus_sbus_get_flags,
 	},
+	/* AMBA */
+	{
+		.name = "ambapp",
+		.addr_prop_name = "reg",
+		.match = of_bus_ambapp_match,
+		.count_cells = of_bus_ambapp_count_cells,
+		.map = of_bus_ambapp_map,
+		.get_flags = of_bus_ambapp_get_flags,
+	},
 	/* Default */
 	{
 		.name = "default",
diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c
index fe43e80772d..0a37e8cfd16 100644
--- a/arch/sparc/kernel/prom_32.c
+++ b/arch/sparc/kernel/prom_32.c
@@ -24,6 +24,8 @@
 
 #include <asm/prom.h>
 #include <asm/oplib.h>
+#include <asm/leon.h>
+#include <asm/leon_amba.h>
 
 #include "prom.h"
 
@@ -131,6 +133,35 @@ static void __init ebus_path_component(struct device_node *dp, char *tmp_buf)
 		regs->which_io, regs->phys_addr);
 }
 
+/* "name:vendor:device@irq,addrlo" */
+static void __init ambapp_path_component(struct device_node *dp, char *tmp_buf)
+{
+	struct amba_prom_registers *regs; unsigned int *intr;
+	unsigned int *device, *vendor;
+	struct property *prop;
+
+	prop = of_find_property(dp, "reg", NULL);
+	if (!prop)
+		return;
+	regs = prop->value;
+	prop = of_find_property(dp, "interrupts", NULL);
+	if (!prop)
+		return;
+	intr = prop->value;
+	prop = of_find_property(dp, "vendor", NULL);
+	if (!prop)
+		return;
+	vendor = prop->value;
+	prop = of_find_property(dp, "device", NULL);
+	if (!prop)
+		return;
+	device = prop->value;
+
+	sprintf(tmp_buf, "%s:%d:%d@%x,%x",
+		dp->name, *vendor, *device,
+		*intr, regs->phys_addr);
+}
+
 static void __init __build_path_component(struct device_node *dp, char *tmp_buf)
 {
 	struct device_node *parent = dp->parent;
@@ -143,6 +174,8 @@ static void __init __build_path_component(struct device_node *dp, char *tmp_buf)
 			return sbus_path_component(dp, tmp_buf);
 		if (!strcmp(parent->type, "ebus"))
 			return ebus_path_component(dp, tmp_buf);
+		if (!strcmp(parent->type, "ambapp"))
+			return ambapp_path_component(dp, tmp_buf);
 
 		/* "isa" is handled with platform naming */
 	}
diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c
index 0fb5789d43c..138910c6720 100644
--- a/arch/sparc/kernel/prom_common.c
+++ b/arch/sparc/kernel/prom_common.c
@@ -22,9 +22,12 @@
 #include <linux/of.h>
 #include <asm/prom.h>
 #include <asm/oplib.h>
+#include <asm/leon.h>
 
 #include "prom.h"
 
+void (*prom_build_more)(struct device_node *dp, struct device_node ***nextp);
+
 struct device_node *of_console_device;
 EXPORT_SYMBOL(of_console_device);
 
@@ -161,7 +164,7 @@ static struct property * __init build_one_prop(phandle node, char *prev,
 			name = prom_nextprop(node, prev, p->name);
 		}
 
-		if (strlen(name) == 0) {
+		if (!name || strlen(name) == 0) {
 			tmp = p;
 			return NULL;
 		}
@@ -242,7 +245,7 @@ static struct device_node * __init prom_create_node(phandle node,
 	return dp;
 }
 
-static char * __init build_full_name(struct device_node *dp)
+char * __init build_full_name(struct device_node *dp)
 {
 	int len, ourlen, plen;
 	char *n;
@@ -289,6 +292,9 @@ static struct device_node * __init prom_build_tree(struct device_node *parent,
 
 		dp->child = prom_build_tree(dp, prom_getchild(node), nextp);
 
+		if (prom_build_more)
+			prom_build_more(dp, nextp);
+
 		node = prom_getsibling(node);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 75d9e34698540e96b422293e1d76ab02cc7faefb Mon Sep 17 00:00:00 2001
From: Konrad Eisele <konrad@gaisler.com>
Date: Mon, 17 Aug 2009 00:13:33 +0000
Subject: sparc, leon: sparc-leon specific SRMMU initialization and bootup
 fixes.

The sparc-leon caches are virtually tagged so a flush is needed on ctx
switch.

Signed-off-by: Konrad Eisele <konrad@gaisler.com>
Reviewed-by:   Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/mm/srmmu.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index ade4eb373bd..509b1ffeba6 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -46,6 +46,7 @@
 #include <asm/tsunami.h>
 #include <asm/swift.h>
 #include <asm/turbosparc.h>
+#include <asm/leon.h>
 
 #include <asm/btfixup.h>
 
@@ -569,6 +570,9 @@ static void srmmu_switch_mm(struct mm_struct *old_mm, struct mm_struct *mm,
 		srmmu_ctxd_set(&srmmu_context_table[mm->context], mm->pgd);
 	}
 
+	if (sparc_cpu_model == sparc_leon)
+		leon_switch_mm();
+
 	if (is_hypersparc)
 		hyper_flush_whole_icache();
 
@@ -1977,6 +1981,45 @@ static void __init init_viking(void)
 	poke_srmmu = poke_viking;
 }
 
+#ifdef CONFIG_SPARC_LEON
+
+void __init poke_leonsparc(void)
+{
+}
+
+void __init init_leon(void)
+{
+
+	srmmu_name = "Leon";
+
+	BTFIXUPSET_CALL(flush_cache_all, leon_flush_cache_all,
+			BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(flush_cache_mm, leon_flush_cache_all,
+			BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(flush_cache_page, leon_flush_pcache_all,
+			BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(flush_cache_range, leon_flush_cache_all,
+			BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(flush_page_for_dma, leon_flush_dcache_all,
+			BTFIXUPCALL_NORM);
+
+	BTFIXUPSET_CALL(flush_tlb_all, leon_flush_tlb_all, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(flush_tlb_mm, leon_flush_tlb_all, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(flush_tlb_page, leon_flush_tlb_all, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(flush_tlb_range, leon_flush_tlb_all, BTFIXUPCALL_NORM);
+
+	BTFIXUPSET_CALL(__flush_page_to_ram, leon_flush_cache_all,
+			BTFIXUPCALL_NOP);
+	BTFIXUPSET_CALL(flush_sig_insns, leon_flush_cache_all, BTFIXUPCALL_NOP);
+
+	poke_srmmu = poke_leonsparc;
+
+	srmmu_cache_pagetables = 0;
+
+	leon_flush_during_switch = leon_flush_needed();
+}
+#endif
+
 /* Probe for the srmmu chip version. */
 static void __init get_srmmu_type(void)
 {
@@ -1992,7 +2035,15 @@ static void __init get_srmmu_type(void)
 	psr_typ = (psr >> 28) & 0xf;
 	psr_vers = (psr >> 24) & 0xf;
 
-	/* First, check for HyperSparc or Cypress. */
+	/* First, check for sparc-leon. */
+	if (sparc_cpu_model == sparc_leon) {
+		psr_typ = 0xf;	/* hardcoded ids for older models/simulators */
+		psr_vers = 2;
+		init_leon();
+		return;
+	}
+
+	/* Second, check for HyperSparc or Cypress. */
 	if(mod_typ == 1) {
 		switch(mod_rev) {
 		case 7:
-- 
cgit v1.2.3-70-g09d2


From 9f34ceb60357a7166c929d7b52bd057ad7ffc54b Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 31 Aug 2009 14:43:32 +0200
Subject: locking, sparc: Rename __spin_try_lock() and friends

Needed to avoid namespace conflicts when the common code
function bodies of _spin_try_lock() etc. are moved to a header
file where the function name would be __spin_try_lock().

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Horst Hartmann <horsth@linux.vnet.ibm.com>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David Miller <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <20090831124416.306495811@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/include/asm/spinlock_32.h | 12 ++++++------
 arch/sparc/include/asm/spinlock_64.h | 28 ++++++++++++++--------------
 2 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index 46f91ab66a5..857630cff63 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -76,7 +76,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
  *
  * Unfortunately this scheme limits us to ~16,000,000 cpus.
  */
-static inline void __read_lock(raw_rwlock_t *rw)
+static inline void arch_read_lock(raw_rwlock_t *rw)
 {
 	register raw_rwlock_t *lp asm("g1");
 	lp = rw;
@@ -92,11 +92,11 @@ static inline void __read_lock(raw_rwlock_t *rw)
 #define __raw_read_lock(lock) \
 do {	unsigned long flags; \
 	local_irq_save(flags); \
-	__read_lock(lock); \
+	arch_read_lock(lock); \
 	local_irq_restore(flags); \
 } while(0)
 
-static inline void __read_unlock(raw_rwlock_t *rw)
+static inline void arch_read_unlock(raw_rwlock_t *rw)
 {
 	register raw_rwlock_t *lp asm("g1");
 	lp = rw;
@@ -112,7 +112,7 @@ static inline void __read_unlock(raw_rwlock_t *rw)
 #define __raw_read_unlock(lock) \
 do {	unsigned long flags; \
 	local_irq_save(flags); \
-	__read_unlock(lock); \
+	arch_read_unlock(lock); \
 	local_irq_restore(flags); \
 } while(0)
 
@@ -150,7 +150,7 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 	return (val == 0);
 }
 
-static inline int __read_trylock(raw_rwlock_t *rw)
+static inline int arch_read_trylock(raw_rwlock_t *rw)
 {
 	register raw_rwlock_t *lp asm("g1");
 	register int res asm("o0");
@@ -169,7 +169,7 @@ static inline int __read_trylock(raw_rwlock_t *rw)
 ({	unsigned long flags; \
 	int res; \
 	local_irq_save(flags); \
-	res = __read_trylock(lock); \
+	res = arch_read_trylock(lock); \
 	local_irq_restore(flags); \
 	res; \
 })
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index f6b2b92ad8d..43e51478358 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -92,7 +92,7 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla
 
 /* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
 
-static void inline __read_lock(raw_rwlock_t *lock)
+static void inline arch_read_lock(raw_rwlock_t *lock)
 {
 	unsigned long tmp1, tmp2;
 
@@ -115,7 +115,7 @@ static void inline __read_lock(raw_rwlock_t *lock)
 	: "memory");
 }
 
-static int inline __read_trylock(raw_rwlock_t *lock)
+static int inline arch_read_trylock(raw_rwlock_t *lock)
 {
 	int tmp1, tmp2;
 
@@ -136,7 +136,7 @@ static int inline __read_trylock(raw_rwlock_t *lock)
 	return tmp1;
 }
 
-static void inline __read_unlock(raw_rwlock_t *lock)
+static void inline arch_read_unlock(raw_rwlock_t *lock)
 {
 	unsigned long tmp1, tmp2;
 
@@ -152,7 +152,7 @@ static void inline __read_unlock(raw_rwlock_t *lock)
 	: "memory");
 }
 
-static void inline __write_lock(raw_rwlock_t *lock)
+static void inline arch_write_lock(raw_rwlock_t *lock)
 {
 	unsigned long mask, tmp1, tmp2;
 
@@ -177,7 +177,7 @@ static void inline __write_lock(raw_rwlock_t *lock)
 	: "memory");
 }
 
-static void inline __write_unlock(raw_rwlock_t *lock)
+static void inline arch_write_unlock(raw_rwlock_t *lock)
 {
 	__asm__ __volatile__(
 "	stw		%%g0, [%0]"
@@ -186,7 +186,7 @@ static void inline __write_unlock(raw_rwlock_t *lock)
 	: "memory");
 }
 
-static int inline __write_trylock(raw_rwlock_t *lock)
+static int inline arch_write_trylock(raw_rwlock_t *lock)
 {
 	unsigned long mask, tmp1, tmp2, result;
 
@@ -210,14 +210,14 @@ static int inline __write_trylock(raw_rwlock_t *lock)
 	return result;
 }
 
-#define __raw_read_lock(p)	__read_lock(p)
-#define __raw_read_lock_flags(p, f) __read_lock(p)
-#define __raw_read_trylock(p)	__read_trylock(p)
-#define __raw_read_unlock(p)	__read_unlock(p)
-#define __raw_write_lock(p)	__write_lock(p)
-#define __raw_write_lock_flags(p, f) __write_lock(p)
-#define __raw_write_unlock(p)	__write_unlock(p)
-#define __raw_write_trylock(p)	__write_trylock(p)
+#define __raw_read_lock(p)	arch_read_lock(p)
+#define __raw_read_lock_flags(p, f) arch_read_lock(p)
+#define __raw_read_trylock(p)	arch_read_trylock(p)
+#define __raw_read_unlock(p)	arch_read_unlock(p)
+#define __raw_write_lock(p)	arch_write_lock(p)
+#define __raw_write_lock_flags(p, f) arch_write_lock(p)
+#define __raw_write_unlock(p)	arch_write_unlock(p)
+#define __raw_write_trylock(p)	arch_write_trylock(p)
 
 #define __raw_read_can_lock(rw)		(!((rw)->lock & 0x80000000UL))
 #define __raw_write_can_lock(rw)	(!(rw)->lock)
-- 
cgit v1.2.3-70-g09d2


From e7a088f935180b90cfe6ab0aaae8a556f46885fe Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 1 Sep 2009 17:54:07 -0700
Subject: sparc: convert /proc/io_map, /proc/dvma_map to seq_file

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/ioport.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 87ea0d03d97..e71ce79d8c1 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -35,6 +35,7 @@
 #include <linux/slab.h>
 #include <linux/pci.h>		/* struct pci_dev */
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/scatterlist.h>
 #include <linux/of_device.h>
 
@@ -663,26 +664,33 @@ EXPORT_SYMBOL(pci_dma_sync_sg_for_device);
 
 #ifdef CONFIG_PROC_FS
 
-static int
-_sparc_io_get_info(char *buf, char **start, off_t fpos, int length, int *eof,
-    void *data)
+static int sparc_io_proc_show(struct seq_file *m, void *v)
 {
-	char *p = buf, *e = buf + length;
-	struct resource *r;
+	struct resource *root = m->private, *r;
 	const char *nm;
 
-	for (r = ((struct resource *)data)->child; r != NULL; r = r->sibling) {
-		if (p + 32 >= e)	/* Better than nothing */
-			break;
+	for (r = root->child; r != NULL; r = r->sibling) {
 		if ((nm = r->name) == 0) nm = "???";
-		p += sprintf(p, "%016llx-%016llx: %s\n",
+		seq_printf(m, "%016llx-%016llx: %s\n",
 				(unsigned long long)r->start,
 				(unsigned long long)r->end, nm);
 	}
 
-	return p-buf;
+	return 0;
 }
 
+static int sparc_io_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sparc_io_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations sparc_io_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= sparc_io_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 #endif /* CONFIG_PROC_FS */
 
 /*
@@ -707,7 +715,7 @@ static struct resource *_sparc_find_resource(struct resource *root,
 static void register_proc_sparc_ioport(void)
 {
 #ifdef CONFIG_PROC_FS
-	create_proc_read_entry("io_map",0,NULL,_sparc_io_get_info,&sparc_iomap);
-	create_proc_read_entry("dvma_map",0,NULL,_sparc_io_get_info,&_sparc_dvma);
+	proc_create_data("io_map", 0, NULL, &sparc_io_proc_fops, &sparc_iomap);
+	proc_create_data("dvma_map", 0, NULL, &sparc_io_proc_fops, &_sparc_dvma);
 #endif
 }
-- 
cgit v1.2.3-70-g09d2


From ee18d64c1f632043a02e6f5ba5e045bb26a5465f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 2 Sep 2009 09:14:21 +0100
Subject: KEYS: Add a keyctl to install a process's session keyring on its
 parent [try #6]

Add a keyctl to install a process's session keyring onto its parent.  This
replaces the parent's session keyring.  Because the COW credential code does
not permit one process to change another process's credentials directly, the
change is deferred until userspace next starts executing again.  Normally this
will be after a wait*() syscall.

To support this, three new security hooks have been provided:
cred_alloc_blank() to allocate unset security creds, cred_transfer() to fill in
the blank security creds and key_session_to_parent() - which asks the LSM if
the process may replace its parent's session keyring.

The replacement may only happen if the process has the same ownership details
as its parent, and the process has LINK permission on the session keyring, and
the session keyring is owned by the process, and the LSM permits it.

Note that this requires alteration to each architecture's notify_resume path.
This has been done for all arches barring blackfin, m68k* and xtensa, all of
which need assembly alteration to support TIF_NOTIFY_RESUME.  This allows the
replacement to be performed at the point the parent process resumes userspace
execution.

This allows the userspace AFS pioctl emulation to fully emulate newpag() and
the VIOCSETTOK and VIOCSETTOK2 pioctls, all of which require the ability to
alter the parent process's PAG membership.  However, since kAFS doesn't use
PAGs per se, but rather dumps the keys into the session keyring, the session
keyring of the parent must be replaced if, for example, VIOCSETTOK is passed
the newpag flag.

This can be tested with the following program:

	#include <stdio.h>
	#include <stdlib.h>
	#include <keyutils.h>

	#define KEYCTL_SESSION_TO_PARENT	18

	#define OSERROR(X, S) do { if ((long)(X) == -1) { perror(S); exit(1); } } while(0)

	int main(int argc, char **argv)
	{
		key_serial_t keyring, key;
		long ret;

		keyring = keyctl_join_session_keyring(argv[1]);
		OSERROR(keyring, "keyctl_join_session_keyring");

		key = add_key("user", "a", "b", 1, keyring);
		OSERROR(key, "add_key");

		ret = keyctl(KEYCTL_SESSION_TO_PARENT);
		OSERROR(ret, "KEYCTL_SESSION_TO_PARENT");

		return 0;
	}

Compiled and linked with -lkeyutils, you should see something like:

	[dhowells@andromeda ~]$ keyctl show
	Session Keyring
	       -3 --alswrv   4043  4043  keyring: _ses
	355907932 --alswrv   4043    -1   \_ keyring: _uid.4043
	[dhowells@andromeda ~]$ /tmp/newpag
	[dhowells@andromeda ~]$ keyctl show
	Session Keyring
	       -3 --alswrv   4043  4043  keyring: _ses
	1055658746 --alswrv   4043  4043   \_ user: a
	[dhowells@andromeda ~]$ /tmp/newpag hello
	[dhowells@andromeda ~]$ keyctl show
	Session Keyring
	       -3 --alswrv   4043  4043  keyring: hello
	340417692 --alswrv   4043  4043   \_ user: a

Where the test program creates a new session keyring, sticks a user key named
'a' into it and then installs it on its parent.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 Documentation/keys.txt        |  20 +++++++++
 arch/alpha/kernel/signal.c    |   2 +
 arch/arm/kernel/signal.c      |   2 +
 arch/avr32/kernel/signal.c    |   2 +
 arch/cris/kernel/ptrace.c     |   2 +
 arch/frv/kernel/signal.c      |   2 +
 arch/h8300/kernel/signal.c    |   2 +
 arch/ia64/kernel/process.c    |   2 +
 arch/m32r/kernel/signal.c     |   2 +
 arch/mips/kernel/signal.c     |   2 +
 arch/mn10300/kernel/signal.c  |   2 +
 arch/parisc/kernel/signal.c   |   2 +
 arch/s390/kernel/signal.c     |   2 +
 arch/sh/kernel/signal_32.c    |   2 +
 arch/sh/kernel/signal_64.c    |   2 +
 arch/sparc/kernel/signal_32.c |   2 +
 arch/sparc/kernel/signal_64.c |   3 ++
 arch/x86/kernel/signal.c      |   2 +
 include/linux/cred.h          |   1 +
 include/linux/key.h           |   3 ++
 include/linux/keyctl.h        |   1 +
 include/linux/sched.h         |   1 +
 include/linux/security.h      |  38 ++++++++++++++++
 kernel/cred.c                 |  43 ++++++++++++++++++
 security/capability.c         |  19 ++++++++
 security/keys/compat.c        |   3 ++
 security/keys/gc.c            |   1 +
 security/keys/internal.h      |   1 +
 security/keys/keyctl.c        | 102 ++++++++++++++++++++++++++++++++++++++++++
 security/keys/process_keys.c  |  49 ++++++++++++++++++++
 security/security.c           |  17 +++++++
 security/selinux/hooks.c      |  28 ++++++++++++
 security/smack/smack_lsm.c    |  30 +++++++++++++
 security/tomoyo/tomoyo.c      |  17 +++++++
 34 files changed, 409 insertions(+)

(limited to 'arch/sparc')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 203487e9b1d..e4dbbdb1bd9 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -757,6 +757,26 @@ The keyctl syscall functions are:
      successful.
 
 
+ (*) Install the calling process's session keyring on its parent.
+
+	long keyctl(KEYCTL_SESSION_TO_PARENT);
+
+     This functions attempts to install the calling process's session keyring
+     on to the calling process's parent, replacing the parent's current session
+     keyring.
+
+     The calling process must have the same ownership as its parent, the
+     keyring must have the same ownership as the calling process, the calling
+     process must have LINK permission on the keyring and the active LSM module
+     mustn't deny permission, otherwise error EPERM will be returned.
+
+     Error ENOMEM will be returned if there was insufficient memory to complete
+     the operation, otherwise 0 will be returned to indicate success.
+
+     The keyring will be replaced next time the parent process leaves the
+     kernel and resumes executing userspace.
+
+
 ===============
 KERNEL SERVICES
 ===============
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 04e17c1f0f1..d91aaa74705 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -687,5 +687,7 @@ do_notify_resume(struct pt_regs *regs, struct switch_stack *sw,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 13dec276927..ea4ad3a43c8 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -711,5 +711,7 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 	if (thread_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index 62d242e2d03..de9f7fe2aef 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -326,5 +326,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
 	if (ti->flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c
index 4f06d7fb43d..32e9d5ee895 100644
--- a/arch/cris/kernel/ptrace.c
+++ b/arch/cris/kernel/ptrace.c
@@ -40,5 +40,7 @@ void do_notify_resume(int canrestart, struct pt_regs *regs,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 4a7a62c6e78..6b0a2b6fed6 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -572,6 +572,8 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(__frame);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 
 } /* end do_notify_resume() */
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index 56b3ab7dbbb..abac3ee8c52 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -556,5 +556,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 5d7c0e5b9e7..89969e95004 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -192,6 +192,8 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
 	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(&scr->pt);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 
 	/* copy user rbs to kernel rbs */
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 3220258be18..f80bac17c65 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -411,6 +411,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 
 	clear_thread_flag(TIF_IRET);
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index a3d1015471d..c2acf31874a 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -704,5 +704,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index feb2f2e810d..a21f43bc68e 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -568,5 +568,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(__frame);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index b3bfc432670..5ca1c02b805 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -649,5 +649,7 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall)
 	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 062bd64e65f..6b4fef877f9 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -536,4 +536,6 @@ void do_notify_resume(struct pt_regs *regs)
 {
 	clear_thread_flag(TIF_NOTIFY_RESUME);
 	tracehook_notify_resume(regs);
+	if (current->replacement_session_keyring)
+		key_replace_session_keyring();
 }
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index b5afbec1db5..04a21883f32 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -640,5 +640,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 0663a0ee602..9e5c9b1d7e9 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -772,5 +772,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 181d069a2d4..7ce1a1005b1 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -590,6 +590,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0,
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
 
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index ec82d76dc6f..647afbda7ae 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -613,5 +613,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 }
+
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c578751e94..81e58238c4c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
+		if (current->replacement_session_keyring)
+			key_replace_session_keyring();
 	}
 
 #ifdef CONFIG_X86_32
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 85439abdbc8..24520a539c6 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -152,6 +152,7 @@ struct cred {
 extern void __put_cred(struct cred *);
 extern void exit_creds(struct task_struct *);
 extern int copy_creds(struct task_struct *, unsigned long);
+extern struct cred *cred_alloc_blank(void);
 extern struct cred *prepare_creds(void);
 extern struct cred *prepare_exec_creds(void);
 extern struct cred *prepare_usermodehelper_creds(void);
diff --git a/include/linux/key.h b/include/linux/key.h
index 33e0165de10..cd50dfa1d4c 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -278,6 +278,8 @@ static inline key_serial_t key_serial(struct key *key)
 extern ctl_table key_sysctls[];
 #endif
 
+extern void key_replace_session_keyring(void);
+
 /*
  * the userspace interface
  */
@@ -300,6 +302,7 @@ extern void key_init(void);
 #define key_fsuid_changed(t)		do { } while(0)
 #define key_fsgid_changed(t)		do { } while(0)
 #define key_init()			do { } while(0)
+#define key_replace_session_keyring()	do { } while(0)
 
 #endif /* CONFIG_KEYS */
 #endif /* __KERNEL__ */
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index c0688eb7209..bd383f1944f 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -52,5 +52,6 @@
 #define KEYCTL_SET_TIMEOUT		15	/* set key timeout */
 #define KEYCTL_ASSUME_AUTHORITY		16	/* assume request_key() authorisation */
 #define KEYCTL_GET_SECURITY		17	/* get key security label */
+#define KEYCTL_SESSION_TO_PARENT	18	/* apply session keyring to parent process */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c7ce13c169..9304027673b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1292,6 +1292,7 @@ struct task_struct {
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
 					 * (notably. ptrace) */
+	struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
 				     - access with [gs]et_task_comm (which lock
diff --git a/include/linux/security.h b/include/linux/security.h
index 40ba39ea68c..97de3fe3dd0 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -653,6 +653,11 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	manual page for definitions of the @clone_flags.
  *	@clone_flags contains the flags indicating what should be shared.
  *	Return 0 if permission is granted.
+ * @cred_alloc_blank:
+ *	@cred points to the credentials.
+ *	@gfp indicates the atomicity of any memory allocations.
+ *	Only allocate sufficient memory and attach to @cred such that
+ *	cred_transfer() will not get ENOMEM.
  * @cred_free:
  *	@cred points to the credentials.
  *	Deallocate and clear the cred->security field in a set of credentials.
@@ -665,6 +670,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@new points to the new credentials.
  *	@old points to the original credentials.
  *	Install a new set of credentials.
+ * @cred_transfer:
+ *	@new points to the new credentials.
+ *	@old points to the original credentials.
+ *	Transfer data from original creds to new creds
  * @kernel_act_as:
  *	Set the credentials for a kernel service to act as (subjective context).
  *	@new points to the credentials to be modified.
@@ -1103,6 +1112,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Return the length of the string (including terminating NUL) or -ve if
  *      an error.
  *	May also return 0 (and a NULL buffer pointer) if there is no label.
+ * @key_session_to_parent:
+ *	Forcibly assign the session keyring from a process to its parent
+ *	process.
+ *	@cred: Pointer to process's credentials
+ *	@parent_cred: Pointer to parent process's credentials
+ *	@keyring: Proposed new session keyring
+ *	Return 0 if permission is granted, -ve error otherwise.
  *
  * Security hooks affecting all System V IPC operations.
  *
@@ -1498,10 +1514,12 @@ struct security_operations {
 	int (*dentry_open) (struct file *file, const struct cred *cred);
 
 	int (*task_create) (unsigned long clone_flags);
+	int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp);
 	void (*cred_free) (struct cred *cred);
 	int (*cred_prepare)(struct cred *new, const struct cred *old,
 			    gfp_t gfp);
 	void (*cred_commit)(struct cred *new, const struct cred *old);
+	void (*cred_transfer)(struct cred *new, const struct cred *old);
 	int (*kernel_act_as)(struct cred *new, u32 secid);
 	int (*kernel_create_files_as)(struct cred *new, struct inode *inode);
 	int (*kernel_module_request)(void);
@@ -1639,6 +1657,9 @@ struct security_operations {
 			       const struct cred *cred,
 			       key_perm_t perm);
 	int (*key_getsecurity)(struct key *key, char **_buffer);
+	int (*key_session_to_parent)(const struct cred *cred,
+				     const struct cred *parent_cred,
+				     struct key *key);
 #endif	/* CONFIG_KEYS */
 
 #ifdef CONFIG_AUDIT
@@ -1755,9 +1776,11 @@ int security_file_send_sigiotask(struct task_struct *tsk,
 int security_file_receive(struct file *file);
 int security_dentry_open(struct file *file, const struct cred *cred);
 int security_task_create(unsigned long clone_flags);
+int security_cred_alloc_blank(struct cred *cred, gfp_t gfp);
 void security_cred_free(struct cred *cred);
 int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp);
 void security_commit_creds(struct cred *new, const struct cred *old);
+void security_transfer_creds(struct cred *new, const struct cred *old);
 int security_kernel_act_as(struct cred *new, u32 secid);
 int security_kernel_create_files_as(struct cred *new, struct inode *inode);
 int security_kernel_module_request(void);
@@ -2286,6 +2309,9 @@ static inline int security_task_create(unsigned long clone_flags)
 	return 0;
 }
 
+static inline void security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
+{ }
+
 static inline void security_cred_free(struct cred *cred)
 { }
 
@@ -2301,6 +2327,11 @@ static inline void security_commit_creds(struct cred *new,
 {
 }
 
+static inline void security_transfer_creds(struct cred *new,
+					   const struct cred *old)
+{
+}
+
 static inline int security_kernel_act_as(struct cred *cred, u32 secid)
 {
 	return 0;
@@ -2923,6 +2954,9 @@ void security_key_free(struct key *key);
 int security_key_permission(key_ref_t key_ref,
 			    const struct cred *cred, key_perm_t perm);
 int security_key_getsecurity(struct key *key, char **_buffer);
+int security_key_session_to_parent(const struct cred *cred,
+				   const struct cred *parent_cred,
+				   struct key *key);
 
 #else
 
@@ -2950,6 +2984,10 @@ static inline int security_key_getsecurity(struct key *key, char **_buffer)
 	return 0;
 }
 
+static inline int security_key_session_to_parent(const struct cred *cred,
+						 const struct cred *parent_cred,
+						 struct key *key);
+
 #endif
 #endif /* CONFIG_KEYS */
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 24dd2f5104b..006fcab009d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -199,6 +199,49 @@ void exit_creds(struct task_struct *tsk)
 	validate_creds(cred);
 	alter_cred_subscribers(cred, -1);
 	put_cred(cred);
+
+	cred = (struct cred *) tsk->replacement_session_keyring;
+	if (cred) {
+		tsk->replacement_session_keyring = NULL;
+		validate_creds(cred);
+		put_cred(cred);
+	}
+}
+
+/*
+ * Allocate blank credentials, such that the credentials can be filled in at a
+ * later date without risk of ENOMEM.
+ */
+struct cred *cred_alloc_blank(void)
+{
+	struct cred *new;
+
+	new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+#ifdef CONFIG_KEYS
+	new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
+	if (!new->tgcred) {
+		kfree(new);
+		return NULL;
+	}
+	atomic_set(&new->tgcred->usage, 1);
+#endif
+
+	atomic_set(&new->usage, 1);
+
+	if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+		goto error;
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	new->magic = CRED_MAGIC;
+#endif
+	return new;
+
+error:
+	abort_creds(new);
+	return NULL;
 }
 
 /**
diff --git a/security/capability.c b/security/capability.c
index 06400cf0775..93a2ffe6590 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -373,6 +373,11 @@ static int cap_task_create(unsigned long clone_flags)
 	return 0;
 }
 
+static int cap_cred_alloc_blank(struct cred *cred, gfp_t gfp)
+{
+	return 0;
+}
+
 static void cap_cred_free(struct cred *cred)
 {
 }
@@ -386,6 +391,10 @@ static void cap_cred_commit(struct cred *new, const struct cred *old)
 {
 }
 
+static void cap_cred_transfer(struct cred *new, const struct cred *old)
+{
+}
+
 static int cap_kernel_act_as(struct cred *new, u32 secid)
 {
 	return 0;
@@ -836,6 +845,13 @@ static int cap_key_getsecurity(struct key *key, char **_buffer)
 	return 0;
 }
 
+static int cap_key_session_to_parent(const struct cred *cred,
+				     const struct cred *parent_cred,
+				     struct key *key)
+{
+	return 0;
+}
+
 #endif /* CONFIG_KEYS */
 
 #ifdef CONFIG_AUDIT
@@ -961,9 +977,11 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, file_receive);
 	set_to_cap_if_null(ops, dentry_open);
 	set_to_cap_if_null(ops, task_create);
+	set_to_cap_if_null(ops, cred_alloc_blank);
 	set_to_cap_if_null(ops, cred_free);
 	set_to_cap_if_null(ops, cred_prepare);
 	set_to_cap_if_null(ops, cred_commit);
+	set_to_cap_if_null(ops, cred_transfer);
 	set_to_cap_if_null(ops, kernel_act_as);
 	set_to_cap_if_null(ops, kernel_create_files_as);
 	set_to_cap_if_null(ops, kernel_module_request);
@@ -1063,6 +1081,7 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, key_free);
 	set_to_cap_if_null(ops, key_permission);
 	set_to_cap_if_null(ops, key_getsecurity);
+	set_to_cap_if_null(ops, key_session_to_parent);
 #endif	/* CONFIG_KEYS */
 #ifdef CONFIG_AUDIT
 	set_to_cap_if_null(ops, audit_rule_init);
diff --git a/security/keys/compat.c b/security/keys/compat.c
index c766c68a63b..792c0a611a6 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -82,6 +82,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_GET_SECURITY:
 		return keyctl_get_security(arg2, compat_ptr(arg3), arg4);
 
+	case KEYCTL_SESSION_TO_PARENT:
+		return keyctl_session_to_parent();
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 44adc325e15..1e616aef55f 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -65,6 +65,7 @@ static void key_gc_timer_func(unsigned long data)
  * - return true if we altered the keyring
  */
 static bool key_gc_keyring(struct key *keyring, time_t limit)
+	__releases(key_serial_lock)
 {
 	struct keyring_list *klist;
 	struct key *key;
diff --git a/security/keys/internal.h b/security/keys/internal.h
index fb830514c33..24ba0307b7a 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -201,6 +201,7 @@ extern long keyctl_set_timeout(key_serial_t, unsigned);
 extern long keyctl_assume_authority(key_serial_t);
 extern long keyctl_get_security(key_serial_t keyid, char __user *buffer,
 				size_t buflen);
+extern long keyctl_session_to_parent(void);
 
 /*
  * debugging key validation
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 736d7800f97..74c96852459 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1228,6 +1228,105 @@ long keyctl_get_security(key_serial_t keyid,
 	return ret;
 }
 
+/*
+ * attempt to install the calling process's session keyring on the process's
+ * parent process
+ * - the keyring must exist and must grant us LINK permission
+ * - implements keyctl(KEYCTL_SESSION_TO_PARENT)
+ */
+long keyctl_session_to_parent(void)
+{
+	struct task_struct *me, *parent;
+	const struct cred *mycred, *pcred;
+	struct cred *cred, *oldcred;
+	key_ref_t keyring_r;
+	int ret;
+
+	keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_LINK);
+	if (IS_ERR(keyring_r))
+		return PTR_ERR(keyring_r);
+
+	/* our parent is going to need a new cred struct, a new tgcred struct
+	 * and new security data, so we allocate them here to prevent ENOMEM in
+	 * our parent */
+	ret = -ENOMEM;
+	cred = cred_alloc_blank();
+	if (!cred)
+		goto error_keyring;
+
+	cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r);
+	keyring_r = NULL;
+
+	me = current;
+	write_lock_irq(&tasklist_lock);
+
+	parent = me->real_parent;
+	ret = -EPERM;
+
+	/* the parent mustn't be init and mustn't be a kernel thread */
+	if (parent->pid <= 1 || !parent->mm)
+		goto not_permitted;
+
+	/* the parent must be single threaded */
+	if (atomic_read(&parent->signal->count) != 1)
+		goto not_permitted;
+
+	/* the parent and the child must have different session keyrings or
+	 * there's no point */
+	mycred = current_cred();
+	pcred = __task_cred(parent);
+	if (mycred == pcred ||
+	    mycred->tgcred->session_keyring == pcred->tgcred->session_keyring)
+		goto already_same;
+
+	/* the parent must have the same effective ownership and mustn't be
+	 * SUID/SGID */
+	if (pcred-> uid	!= mycred->euid	||
+	    pcred->euid	!= mycred->euid	||
+	    pcred->suid	!= mycred->euid	||
+	    pcred-> gid	!= mycred->egid	||
+	    pcred->egid	!= mycred->egid	||
+	    pcred->sgid	!= mycred->egid)
+		goto not_permitted;
+
+	/* the keyrings must have the same UID */
+	if (pcred ->tgcred->session_keyring->uid != mycred->euid ||
+	    mycred->tgcred->session_keyring->uid != mycred->euid)
+		goto not_permitted;
+
+	/* the LSM must permit the replacement of the parent's keyring with the
+	 * keyring from this process */
+	ret = security_key_session_to_parent(mycred, pcred,
+					     key_ref_to_ptr(keyring_r));
+	if (ret < 0)
+		goto not_permitted;
+
+	/* if there's an already pending keyring replacement, then we replace
+	 * that */
+	oldcred = parent->replacement_session_keyring;
+
+	/* the replacement session keyring is applied just prior to userspace
+	 * restarting */
+	parent->replacement_session_keyring = cred;
+	cred = NULL;
+	set_ti_thread_flag(task_thread_info(parent), TIF_NOTIFY_RESUME);
+
+	write_unlock_irq(&tasklist_lock);
+	if (oldcred)
+		put_cred(oldcred);
+	return 0;
+
+already_same:
+	ret = 0;
+not_permitted:
+	put_cred(cred);
+	return ret;
+
+error_keyring:
+	key_ref_put(keyring_r);
+	return ret;
+}
+
 /*****************************************************************************/
 /*
  * the key control system call
@@ -1313,6 +1412,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
 					   (char __user *) arg3,
 					   (size_t) arg4);
 
+	case KEYCTL_SESSION_TO_PARENT:
+		return keyctl_session_to_parent();
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 4739cfbb41b..5c23afb31ec 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/security.h>
 #include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -768,3 +769,51 @@ error:
 	abort_creds(new);
 	return ret;
 }
+
+/*
+ * Replace a process's session keyring when that process resumes userspace on
+ * behalf of one of its children
+ */
+void key_replace_session_keyring(void)
+{
+	const struct cred *old;
+	struct cred *new;
+
+	if (!current->replacement_session_keyring)
+		return;
+
+	write_lock_irq(&tasklist_lock);
+	new = current->replacement_session_keyring;
+	current->replacement_session_keyring = NULL;
+	write_unlock_irq(&tasklist_lock);
+
+	if (!new)
+		return;
+
+	old = current_cred();
+	new->  uid	= old->  uid;
+	new-> euid	= old-> euid;
+	new-> suid	= old-> suid;
+	new->fsuid	= old->fsuid;
+	new->  gid	= old->  gid;
+	new-> egid	= old-> egid;
+	new-> sgid	= old-> sgid;
+	new->fsgid	= old->fsgid;
+	new->user	= get_uid(old->user);
+	new->group_info	= get_group_info(old->group_info);
+
+	new->securebits	= old->securebits;
+	new->cap_inheritable	= old->cap_inheritable;
+	new->cap_permitted	= old->cap_permitted;
+	new->cap_effective	= old->cap_effective;
+	new->cap_bset		= old->cap_bset;
+
+	new->jit_keyring	= old->jit_keyring;
+	new->thread_keyring	= key_get(old->thread_keyring);
+	new->tgcred->tgid	= old->tgcred->tgid;
+	new->tgcred->process_keyring = key_get(old->tgcred->process_keyring);
+
+	security_transfer_creds(new, old);
+
+	commit_creds(new);
+}
diff --git a/security/security.c b/security/security.c
index f88eaf6b14c..d8b727637f0 100644
--- a/security/security.c
+++ b/security/security.c
@@ -684,6 +684,11 @@ int security_task_create(unsigned long clone_flags)
 	return security_ops->task_create(clone_flags);
 }
 
+int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
+{
+	return security_ops->cred_alloc_blank(cred, gfp);
+}
+
 void security_cred_free(struct cred *cred)
 {
 	security_ops->cred_free(cred);
@@ -699,6 +704,11 @@ void security_commit_creds(struct cred *new, const struct cred *old)
 	security_ops->cred_commit(new, old);
 }
 
+void security_transfer_creds(struct cred *new, const struct cred *old)
+{
+	security_ops->cred_transfer(new, old);
+}
+
 int security_kernel_act_as(struct cred *new, u32 secid)
 {
 	return security_ops->kernel_act_as(new, secid);
@@ -1241,6 +1251,13 @@ int security_key_getsecurity(struct key *key, char **_buffer)
 	return security_ops->key_getsecurity(key, _buffer);
 }
 
+int security_key_session_to_parent(const struct cred *cred,
+				   const struct cred *parent_cred,
+				   struct key *key)
+{
+	return security_ops->key_session_to_parent(cred, parent_cred, key);
+}
+
 #endif	/* CONFIG_KEYS */
 
 #ifdef CONFIG_AUDIT
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c3bb31ecc5a..134a9c0d200 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3232,6 +3232,21 @@ static int selinux_task_create(unsigned long clone_flags)
 	return current_has_perm(current, PROCESS__FORK);
 }
 
+/*
+ * allocate the SELinux part of blank credentials
+ */
+static int selinux_cred_alloc_blank(struct cred *cred, gfp_t gfp)
+{
+	struct task_security_struct *tsec;
+
+	tsec = kzalloc(sizeof(struct task_security_struct), gfp);
+	if (!tsec)
+		return -ENOMEM;
+
+	cred->security = tsec;
+	return 0;
+}
+
 /*
  * detach and free the LSM part of a set of credentials
  */
@@ -3263,6 +3278,17 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old,
 	return 0;
 }
 
+/*
+ * transfer the SELinux data to a blank set of creds
+ */
+static void selinux_cred_transfer(struct cred *new, const struct cred *old)
+{
+	const struct task_security_struct *old_tsec = old->security;
+	struct task_security_struct *tsec = new->security;
+
+	*tsec = *old_tsec;
+}
+
 /*
  * set the security data for a kernel service
  * - all the creation contexts are set to unlabelled
@@ -5469,8 +5495,10 @@ static struct security_operations selinux_ops = {
 	.dentry_open =			selinux_dentry_open,
 
 	.task_create =			selinux_task_create,
+	.cred_alloc_blank =		selinux_cred_alloc_blank,
 	.cred_free =			selinux_cred_free,
 	.cred_prepare =			selinux_cred_prepare,
+	.cred_transfer =		selinux_cred_transfer,
 	.kernel_act_as =		selinux_kernel_act_as,
 	.kernel_create_files_as =	selinux_kernel_create_files_as,
 	.kernel_module_request =	selinux_kernel_module_request,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index c243a2b2583..969f5fee190 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1079,6 +1079,22 @@ static int smack_file_receive(struct file *file)
  * Task hooks
  */
 
+/**
+ * smack_cred_alloc_blank - "allocate" blank task-level security credentials
+ * @new: the new credentials
+ * @gfp: the atomicity of any memory allocations
+ *
+ * Prepare a blank set of credentials for modification.  This must allocate all
+ * the memory the LSM module might require such that cred_transfer() can
+ * complete without error.
+ */
+static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp)
+{
+	cred->security = NULL;
+	return 0;
+}
+
+
 /**
  * smack_cred_free - "free" task-level security credentials
  * @cred: the credentials in question
@@ -1116,6 +1132,18 @@ static void smack_cred_commit(struct cred *new, const struct cred *old)
 {
 }
 
+/**
+ * smack_cred_transfer - Transfer the old credentials to the new credentials
+ * @new: the new credentials
+ * @old: the original credentials
+ *
+ * Fill in a set of blank credentials from another set of credentials.
+ */
+static void smack_cred_transfer(struct cred *new, const struct cred *old)
+{
+	new->security = old->security;
+}
+
 /**
  * smack_kernel_act_as - Set the subjective context in a set of credentials
  * @new: points to the set of credentials to be modified.
@@ -3073,9 +3101,11 @@ struct security_operations smack_ops = {
 	.file_send_sigiotask = 		smack_file_send_sigiotask,
 	.file_receive = 		smack_file_receive,
 
+	.cred_alloc_blank =		smack_cred_alloc_blank,
 	.cred_free =			smack_cred_free,
 	.cred_prepare =			smack_cred_prepare,
 	.cred_commit =			smack_cred_commit,
+	.cred_transfer =		smack_cred_transfer,
 	.kernel_act_as =		smack_kernel_act_as,
 	.kernel_create_files_as =	smack_kernel_create_files_as,
 	.task_setpgid = 		smack_task_setpgid,
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 35a13e7915e..9548a0984cc 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -14,6 +14,12 @@
 #include "tomoyo.h"
 #include "realpath.h"
 
+static int tomoyo_cred_alloc_blank(struct cred *new, gfp_t gfp)
+{
+	new->security = NULL;
+	return 0;
+}
+
 static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
 			       gfp_t gfp)
 {
@@ -25,6 +31,15 @@ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
 	return 0;
 }
 
+static void tomoyo_cred_transfer(struct cred *new, const struct cred *old)
+{
+	/*
+	 * Since "struct tomoyo_domain_info *" is a sharable pointer,
+	 * we don't need to duplicate.
+	 */
+	new->security = old->security;
+}
+
 static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
 {
 	int rc;
@@ -262,7 +277,9 @@ static int tomoyo_dentry_open(struct file *f, const struct cred *cred)
  */
 static struct security_operations tomoyo_security_ops = {
 	.name                = "tomoyo",
+	.cred_alloc_blank    = tomoyo_cred_alloc_blank,
 	.cred_prepare        = tomoyo_cred_prepare,
+	.cred_transfer	     = tomoyo_cred_transfer,
 	.bprm_set_creds      = tomoyo_bprm_set_creds,
 	.bprm_check_security = tomoyo_bprm_check_security,
 #ifdef CONFIG_SYSCTL
-- 
cgit v1.2.3-70-g09d2


From 825c9fb47a0837db12fecf8d360e0e1d284ddb49 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 4 Sep 2009 02:56:22 -0700
Subject: sparc: add basic support for 'perf'

This wires up the perf_counter_open() syscall so that basic
software support for perf is working.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/Kconfig                    | 2 ++
 arch/sparc/include/asm/perf_counter.h | 8 ++++++++
 arch/sparc/include/asm/unistd.h       | 3 ++-
 arch/sparc/kernel/systbls_32.S        | 2 +-
 arch/sparc/kernel/systbls_64.S        | 4 ++--
 tools/perf/perf.h                     | 6 ++++++
 6 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 arch/sparc/include/asm/perf_counter.h

(limited to 'arch/sparc')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index b847f880eec..2aa7cd39b48 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,6 +25,7 @@ config SPARC
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select RTC_CLASS
 	select RTC_DRV_M48T59
+	select HAVE_PERF_COUNTERS
 
 config SPARC32
 	def_bool !64BIT
@@ -44,6 +45,7 @@ config SPARC64
 	select RTC_DRV_BQ4802
 	select RTC_DRV_SUN4V
 	select RTC_DRV_STARFIRE
+	select HAVE_PERF_COUNTERS
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h
new file mode 100644
index 00000000000..f07c587a830
--- /dev/null
+++ b/arch/sparc/include/asm/perf_counter.h
@@ -0,0 +1,8 @@
+#ifndef __ASM_SPARC_PERF_COUNTER_H
+#define __ASM_SPARC_PERF_COUNTER_H
+
+static inline void set_perf_counter_pending(void) { }
+
+#define	PERF_COUNTER_INDEX_OFFSET	0
+
+#endif
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index b2c406de7d4..706df669f3b 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -395,8 +395,9 @@
 #define __NR_preadv		324
 #define __NR_pwritev		325
 #define __NR_rt_tgsigqueueinfo	326
+#define __NR_perf_counter_open	327
 
-#define NR_SYSCALLS		327
+#define NR_SYSCALLS		328
 
 #ifdef __32bit_syscall_numbers__
 /* Sparc 32-bit only has the "setresuid32", "getresuid32" variants,
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index 69090165729..04181577cb6 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -82,5 +82,5 @@ sys_call_table:
 /*310*/	.long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 /*315*/	.long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo
+/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open
 
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 2ee7250ba7a..91b06b7f7ac 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -83,7 +83,7 @@ sys_call_table32:
 /*310*/	.word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
 	.word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv
-	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo
+	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_counter_open
 
 #endif /* CONFIG_COMPAT */
 
@@ -158,4 +158,4 @@ sys_call_table:
 /*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 	.word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-	.word sys_pwritev, sys_rt_tgsigqueueinfo
+	.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index e5148e2b613..2abeb20d0bf 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -41,6 +41,12 @@
 #define cpu_relax()	asm volatile("" ::: "memory");
 #endif
 
+#ifdef __sparc__
+#include "../../arch/sparc/include/asm/unistd.h"
+#define rmb()		asm volatile("":::"memory")
+#define cpu_relax()	asm volatile("":::"memory")
+#endif
+
 #include <time.h>
 #include <unistd.h>
 #include <sys/types.h>
-- 
cgit v1.2.3-70-g09d2


From a8f22264550e64c0cd11fb6647284b0bd6407f9c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 8 Sep 2009 23:16:06 -0700
Subject: sparc64: Manage NMI watchdog enabling like x86.

Use a per-cpu 'wd_enabled' boolean and a global atomic_t count
of watchdog NMI enabled cpus which is set to '-1' if something
is wrong with the watchdog and it can't be used.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/nmi.h |  2 +-
 arch/sparc/kernel/nmi.c      | 60 +++++++++++++++++++++++++++++++-------------
 arch/sparc/oprofile/init.c   |  2 +-
 3 files changed, 45 insertions(+), 19 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/nmi.h b/arch/sparc/include/asm/nmi.h
index fbd546dd4fe..c7d11e435df 100644
--- a/arch/sparc/include/asm/nmi.h
+++ b/arch/sparc/include/asm/nmi.h
@@ -5,6 +5,6 @@ extern int __init nmi_init(void);
 extern void perfctr_irq(int irq, struct pt_regs *regs);
 extern void nmi_adjust_hz(unsigned int new_hz);
 
-extern int nmi_usable;
+extern atomic_t nmi_active;
 
 #endif /* __NMI_H */
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index 2c0cc72d295..d1614e8384a 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -34,10 +34,17 @@
 static int nmi_watchdog_active;
 static int panic_on_timeout;
 
-int nmi_usable;
-EXPORT_SYMBOL_GPL(nmi_usable);
+/* nmi_active:
+ * >0: the NMI watchdog is active, but can be disabled
+ * <0: the NMI watchdog has not been set up, and cannot be enabled
+ *  0: the NMI watchdog is disabled, but can be enabled
+ */
+atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
 
 static unsigned int nmi_hz = HZ;
+static DEFINE_PER_CPU(short, wd_enabled);
+static int endflag __initdata;
 
 static DEFINE_PER_CPU(unsigned int, last_irq_sum);
 static DEFINE_PER_CPU(local_t, alert_counter);
@@ -110,7 +117,7 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 		__get_cpu_var(last_irq_sum) = sum;
 		local_set(&__get_cpu_var(alert_counter), 0);
 	}
-	if (nmi_usable) {
+	if (__get_cpu_var(wd_enabled)) {
 		write_pic(picl_value(nmi_hz));
 		pcr_ops->write(pcr_enable);
 	}
@@ -121,8 +128,6 @@ static inline unsigned int get_nmi_count(int cpu)
 	return cpu_data(cpu).__nmi_count;
 }
 
-static int endflag __initdata;
-
 static __init void nmi_cpu_busy(void *data)
 {
 	local_irq_enable_in_hardirq();
@@ -143,12 +148,15 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
 	printk(KERN_WARNING
 		"and attach the output of the 'dmesg' command.\n");
 
-	nmi_usable = 0;
+	per_cpu(wd_enabled, cpu) = 0;
+	atomic_dec(&nmi_active);
 }
 
-static void stop_watchdog(void *unused)
+static void stop_nmi_watchdog(void *unused)
 {
 	pcr_ops->write(PCR_PIC_PRIV);
+	__get_cpu_var(wd_enabled) = 0;
+	atomic_dec(&nmi_active);
 }
 
 static int __init check_nmi_watchdog(void)
@@ -156,6 +164,9 @@ static int __init check_nmi_watchdog(void)
 	unsigned int *prev_nmi_count;
 	int cpu, err;
 
+	if (!atomic_read(&nmi_active))
+		return 0;
+
 	prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(unsigned int), GFP_KERNEL);
 	if (!prev_nmi_count) {
 		err = -ENOMEM;
@@ -172,12 +183,15 @@ static int __init check_nmi_watchdog(void)
 	mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
 
 	for_each_online_cpu(cpu) {
+		if (!per_cpu(wd_enabled, cpu))
+			continue;
 		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
 			report_broken_nmi(cpu, prev_nmi_count);
 	}
 	endflag = 1;
-	if (!nmi_usable) {
+	if (!atomic_read(&nmi_active)) {
 		kfree(prev_nmi_count);
+		atomic_set(&nmi_active, -1);
 		err = -ENODEV;
 		goto error;
 	}
@@ -188,12 +202,26 @@ static int __init check_nmi_watchdog(void)
 	kfree(prev_nmi_count);
 	return 0;
 error:
-	on_each_cpu(stop_watchdog, NULL, 1);
+	on_each_cpu(stop_nmi_watchdog, NULL, 1);
 	return err;
 }
 
-static void start_watchdog(void *unused)
+static void start_nmi_watchdog(void *unused)
+{
+	__get_cpu_var(wd_enabled) = 1;
+	atomic_inc(&nmi_active);
+
+	pcr_ops->write(PCR_PIC_PRIV);
+	write_pic(picl_value(nmi_hz));
+
+	pcr_ops->write(pcr_enable);
+}
+
+static void nmi_adjust_hz_one(void *unused)
 {
+	if (!__get_cpu_var(wd_enabled))
+		return;
+
 	pcr_ops->write(PCR_PIC_PRIV);
 	write_pic(picl_value(nmi_hz));
 
@@ -203,13 +231,13 @@ static void start_watchdog(void *unused)
 void nmi_adjust_hz(unsigned int new_hz)
 {
 	nmi_hz = new_hz;
-	on_each_cpu(start_watchdog, NULL, 1);
+	on_each_cpu(nmi_adjust_hz_one, NULL, 1);
 }
 EXPORT_SYMBOL_GPL(nmi_adjust_hz);
 
 static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
 {
-	on_each_cpu(stop_watchdog, NULL, 1);
+	on_each_cpu(stop_nmi_watchdog, NULL, 1);
 	return 0;
 }
 
@@ -221,16 +249,14 @@ int __init nmi_init(void)
 {
 	int err;
 
-	nmi_usable = 1;
-
-	on_each_cpu(start_watchdog, NULL, 1);
+	on_each_cpu(start_nmi_watchdog, NULL, 1);
 
 	err = check_nmi_watchdog();
 	if (!err) {
 		err = register_reboot_notifier(&nmi_reboot_notifier);
 		if (err) {
-			nmi_usable = 0;
-			on_each_cpu(stop_watchdog, NULL, 1);
+			on_each_cpu(stop_nmi_watchdog, NULL, 1);
+			atomic_set(&nmi_active, -1);
 		}
 	}
 	return err;
diff --git a/arch/sparc/oprofile/init.c b/arch/sparc/oprofile/init.c
index d172f86439b..9ce34fd294c 100644
--- a/arch/sparc/oprofile/init.c
+++ b/arch/sparc/oprofile/init.c
@@ -57,7 +57,7 @@ static void timer_stop(void)
 
 static int op_nmi_timer_init(struct oprofile_operations *ops)
 {
-	if (!nmi_usable)
+	if (atomic_read(&nmi_active) <= 0)
 		return -ENODEV;
 
 	ops->start = timer_start;
-- 
cgit v1.2.3-70-g09d2


From 4e85f5915dedb7c852c1e1189aa4133c87798a5e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 8 Sep 2009 23:20:39 -0700
Subject: sparc64: Kill unnecessary cast in profile_timer_exceptions_notify().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/oprofile/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/oprofile/init.c b/arch/sparc/oprofile/init.c
index 9ce34fd294c..f97cb8b6ee5 100644
--- a/arch/sparc/oprofile/init.c
+++ b/arch/sparc/oprofile/init.c
@@ -21,7 +21,7 @@
 static int profile_timer_exceptions_notify(struct notifier_block *self,
 					   unsigned long val, void *data)
 {
-	struct die_args *args = (struct die_args *)data;
+	struct die_args *args = data;
 	int ret = NOTIFY_DONE;
 
 	switch (val) {
-- 
cgit v1.2.3-70-g09d2


From d89be56b2153ccad7e8cd9e71b5d8b686632e8c2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 8 Sep 2009 23:29:16 -0700
Subject: sparc64: Make touch_nmi_watchdog() actually work.

It guards it's actions on nmi_watchdog_active, but nothing ever
sets that and it's initial value is zero.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/nmi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index d1614e8384a..47a465fe3c3 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -31,7 +31,6 @@
  * level 14 as our IRQ off level.
  */
 
-static int nmi_watchdog_active;
 static int panic_on_timeout;
 
 /* nmi_active:
@@ -52,7 +51,7 @@ static DEFINE_PER_CPU(int, nmi_touch);
 
 void touch_nmi_watchdog(void)
 {
-	if (nmi_watchdog_active) {
+	if (atomic_read(&nmi_active)) {
 		int cpu;
 
 		for_each_present_cpu(cpu) {
-- 
cgit v1.2.3-70-g09d2


From 76c36d016a3f958d6231dcdef54a132614c3b578 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 05:55:17 -0700
Subject: sparc64: Provide extern decls for sparc_??u_type strings.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/system_64.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/system_64.h b/arch/sparc/include/asm/system_64.h
index 6c077816ab2..25e848f0cad 100644
--- a/arch/sparc/include/asm/system_64.h
+++ b/arch/sparc/include/asm/system_64.h
@@ -29,6 +29,10 @@ enum sparc_cpu {
 /* This cannot ever be a sun4c :) That's just history. */
 #define ARCH_SUN4C 0
 
+extern const char *sparc_cpu_type;
+extern const char *sparc_fpu_type;
+extern const char *sparc_pmu_type;
+
 extern char reboot_command[];
 
 /* These are here in an effort to more fully work around Spitfire Errata
-- 
cgit v1.2.3-70-g09d2


From 2d0740c4562493b60f59ca9b0330a2d5e01d43ec Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 05:56:16 -0700
Subject: sparc64: Use nmi_enter() and nmi_exit(), as needed.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/nmi.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index 47a465fe3c3..7de19dd30f4 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -84,6 +84,7 @@ static void die_nmi(const char *str, struct pt_regs *regs, int do_panic)
 	if (do_panic || panic_on_oops)
 		panic("Non maskable interrupt");
 
+	nmi_exit();
 	local_irq_enable();
 	do_exit(SIGBUS);
 }
@@ -98,6 +99,8 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 
 	local_cpu_data().__nmi_count++;
 
+	nmi_enter();
+
 	if (notify_die(DIE_NMI, "nmi", regs, 0,
 		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
 		touched = 1;
@@ -120,6 +123,8 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 		write_pic(picl_value(nmi_hz));
 		pcr_ops->write(pcr_enable);
 	}
+
+	nmi_exit();
 }
 
 static inline unsigned int get_nmi_count(int cpu)
-- 
cgit v1.2.3-70-g09d2


From 5686f9c3d67d5a20108fa26105c98b042df13123 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 05:59:24 -0700
Subject: sparc64: Implement a real set_perf_counter_pending().

When the perf counter subsystem needs to reschedule work out
from an NMI, it invokes set_perf_counter_pending().

This triggers a non-NMI irq which should invoke
perf_counter_do_pending().

Currently this won't trigger because sparc64 won't trigger
the perf counter subsystem from NMIs, but when the HW counter
support is added it will.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/perf_counter.h |  2 +-
 arch/sparc/kernel/pcr.c               | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h
index f07c587a830..38d64454643 100644
--- a/arch/sparc/include/asm/perf_counter.h
+++ b/arch/sparc/include/asm/perf_counter.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_SPARC_PERF_COUNTER_H
 #define __ASM_SPARC_PERF_COUNTER_H
 
-static inline void set_perf_counter_pending(void) { }
+extern void set_perf_counter_pending(void);
 
 #define	PERF_COUNTER_INDEX_OFFSET	0
 
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index 1ae8cdd7e70..68ff0010707 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -7,6 +7,8 @@
 #include <linux/init.h>
 #include <linux/irq.h>
 
+#include <linux/perf_counter.h>
+
 #include <asm/pil.h>
 #include <asm/pcr.h>
 #include <asm/nmi.h>
@@ -34,10 +36,20 @@ unsigned int picl_shift;
  */
 void deferred_pcr_work_irq(int irq, struct pt_regs *regs)
 {
+	struct pt_regs *old_regs;
+
 	clear_softint(1 << PIL_DEFERRED_PCR_WORK);
+
+	old_regs = set_irq_regs(regs);
+	irq_enter();
+#ifdef CONFIG_PERF_COUNTERS
+	perf_counter_do_pending();
+#endif
+	irq_exit();
+	set_irq_regs(old_regs);
 }
 
-void schedule_deferred_pcr_work(void)
+void set_perf_counter_pending(void)
 {
 	set_softint(1 << PIL_DEFERRED_PCR_WORK);
 }
-- 
cgit v1.2.3-70-g09d2


From 59abbd1e7cfd6018fb8e58a96aa562aaff8711e7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 06:28:20 -0700
Subject: sparc64: Initial hw perf counter support.

Only supports one simple counter and only UltraSPARC-IIIi chips.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/nmi.h          |   3 +
 arch/sparc/include/asm/perf_counter.h |   6 +
 arch/sparc/kernel/Makefile            |   3 +
 arch/sparc/kernel/nmi.c               |   8 +-
 arch/sparc/kernel/perf_counter.c      | 520 ++++++++++++++++++++++++++++++++++
 5 files changed, 538 insertions(+), 2 deletions(-)
 create mode 100644 arch/sparc/kernel/perf_counter.c

(limited to 'arch/sparc')

diff --git a/arch/sparc/include/asm/nmi.h b/arch/sparc/include/asm/nmi.h
index c7d11e435df..72e6500e7ab 100644
--- a/arch/sparc/include/asm/nmi.h
+++ b/arch/sparc/include/asm/nmi.h
@@ -7,4 +7,7 @@ extern void nmi_adjust_hz(unsigned int new_hz);
 
 extern atomic_t nmi_active;
 
+extern void start_nmi_watchdog(void *unused);
+extern void stop_nmi_watchdog(void *unused);
+
 #endif /* __NMI_H */
diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h
index 38d64454643..5d7a8ca0e49 100644
--- a/arch/sparc/include/asm/perf_counter.h
+++ b/arch/sparc/include/asm/perf_counter.h
@@ -5,4 +5,10 @@ extern void set_perf_counter_pending(void);
 
 #define	PERF_COUNTER_INDEX_OFFSET	0
 
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+#else
+static inline void init_hw_perf_counters(void)	{ }
+#endif
+
 #endif
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index b0041756c9f..f96dc5761f7 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -103,3 +103,6 @@ obj-$(CONFIG_SUN_LDOMS) += ldc.o vio.o viohs.o ds.o
 obj-$(CONFIG_AUDIT)     += audit.o
 audit--$(CONFIG_AUDIT)  := compat_audit.o
 obj-$(CONFIG_COMPAT)    += $(audit--y)
+
+pc--$(CONFIG_PERF_COUNTERS) := perf_counter.o
+obj-$(CONFIG_SPARC64)	+= $(pc--y)
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index 7de19dd30f4..391a6ed9a18 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -19,6 +19,7 @@
 #include <linux/delay.h>
 #include <linux/smp.h>
 
+#include <asm/perf_counter.h>
 #include <asm/ptrace.h>
 #include <asm/local.h>
 #include <asm/pcr.h>
@@ -156,7 +157,7 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
 	atomic_dec(&nmi_active);
 }
 
-static void stop_nmi_watchdog(void *unused)
+void stop_nmi_watchdog(void *unused)
 {
 	pcr_ops->write(PCR_PIC_PRIV);
 	__get_cpu_var(wd_enabled) = 0;
@@ -210,7 +211,7 @@ error:
 	return err;
 }
 
-static void start_nmi_watchdog(void *unused)
+void start_nmi_watchdog(void *unused)
 {
 	__get_cpu_var(wd_enabled) = 1;
 	atomic_inc(&nmi_active);
@@ -263,6 +264,9 @@ int __init nmi_init(void)
 			atomic_set(&nmi_active, -1);
 		}
 	}
+	if (!err)
+		init_hw_perf_counters();
+
 	return err;
 }
 
diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c
new file mode 100644
index 00000000000..f2c781450d0
--- /dev/null
+++ b/arch/sparc/kernel/perf_counter.c
@@ -0,0 +1,520 @@
+/* Performance counter support for sparc64.
+ *
+ * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
+ *
+ * This code is based almost entirely upon the x86 perf counter
+ * code, which is:
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/kprobes.h>
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+
+#include <asm/cpudata.h>
+#include <asm/atomic.h>
+#include <asm/nmi.h>
+#include <asm/pcr.h>
+
+/* Sparc64 chips have two performance counters, 32-bits each, with
+ * overflow interrupts generated on transition from 0xffffffff to 0.
+ * The counters are accessed in one go using a 64-bit register.
+ *
+ * Both counters are controlled using a single control register.  The
+ * only way to stop all sampling is to clear all of the context (user,
+ * supervisor, hypervisor) sampling enable bits.  But these bits apply
+ * to both counters, thus the two counters can't be enabled/disabled
+ * individually.
+ *
+ * The control register has two event fields, one for each of the two
+ * counters.  It's thus nearly impossible to have one counter going
+ * while keeping the other one stopped.  Therefore it is possible to
+ * get overflow interrupts for counters not currently "in use" and
+ * that condition must be checked in the overflow interrupt handler.
+ *
+ * So we use a hack, in that we program inactive counters with the
+ * "sw_count0" and "sw_count1" events.  These count how many times
+ * the instruction "sethi %hi(0xfc000), %g0" is executed.  It's an
+ * unusual way to encode a NOP and therefore will not trigger in
+ * normal code.
+ */
+
+#define MAX_HWCOUNTERS			2
+#define MAX_PERIOD			((1UL << 32) - 1)
+
+#define PIC_UPPER_INDEX			0
+#define PIC_LOWER_INDEX			1
+
+#define PIC_UPPER_NOP			0x1c
+#define PIC_LOWER_NOP			0x14
+
+struct cpu_hw_counters {
+	struct perf_counter	*counters[MAX_HWCOUNTERS];
+	unsigned long		used_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)];
+	unsigned long		active_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)];
+	int enabled;
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, };
+
+struct perf_event_map {
+	u16	encoding;
+	u8	pic_mask;
+#define PIC_NONE	0x00
+#define PIC_UPPER	0x01
+#define PIC_LOWER	0x02
+};
+
+struct sparc_pmu {
+	const struct perf_event_map	*(*event_map)(int);
+	int				max_events;
+	int				upper_shift;
+	int				lower_shift;
+	int				event_mask;
+};
+
+static const struct perf_event_map ultra3i_perfmon_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER },
+};
+
+static const struct perf_event_map *ultra3i_event_map(int event)
+{
+	return &ultra3i_perfmon_event_map[event];
+}
+
+static const struct sparc_pmu ultra3i_pmu = {
+	.event_map	= ultra3i_event_map,
+	.max_events	= ARRAY_SIZE(ultra3i_perfmon_event_map),
+	.upper_shift	= 11,
+	.lower_shift	= 4,
+	.event_mask	= 0x3f,
+};
+
+static const struct sparc_pmu *sparc_pmu __read_mostly;
+
+static u64 event_encoding(u64 event, int idx)
+{
+	if (idx == PIC_UPPER_INDEX)
+		event <<= sparc_pmu->upper_shift;
+	else
+		event <<= sparc_pmu->lower_shift;
+	return event;
+}
+
+static u64 mask_for_index(int idx)
+{
+	return event_encoding(sparc_pmu->event_mask, idx);
+}
+
+static u64 nop_for_index(int idx)
+{
+	return event_encoding(idx == PIC_UPPER_INDEX ?
+			      PIC_UPPER_NOP : PIC_LOWER_NOP, idx);
+}
+
+static inline void sparc_pmu_enable_counter(struct hw_perf_counter *hwc,
+					    int idx)
+{
+	u64 val, mask = mask_for_index(idx);
+
+	val = pcr_ops->read();
+	pcr_ops->write((val & ~mask) | hwc->config);
+}
+
+static inline void sparc_pmu_disable_counter(struct hw_perf_counter *hwc,
+					     int idx)
+{
+	u64 mask = mask_for_index(idx);
+	u64 nop = nop_for_index(idx);
+	u64 val = pcr_ops->read();
+
+	pcr_ops->write((val & ~mask) | nop);
+}
+
+void hw_perf_enable(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	u64 val;
+	int i;
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
+	val = pcr_ops->read();
+
+	for (i = 0; i < MAX_HWCOUNTERS; i++) {
+		struct perf_counter *cp = cpuc->counters[i];
+		struct hw_perf_counter *hwc;
+
+		if (!cp)
+			continue;
+		hwc = &cp->hw;
+		val |= hwc->config_base;
+	}
+
+	pcr_ops->write(val);
+}
+
+void hw_perf_disable(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	u64 val;
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+
+	val = pcr_ops->read();
+	val &= ~(PCR_UTRACE | PCR_STRACE);
+	pcr_ops->write(val);
+}
+
+static u32 read_pmc(int idx)
+{
+	u64 val;
+
+	read_pic(val);
+	if (idx == PIC_UPPER_INDEX)
+		val >>= 32;
+
+	return val & 0xffffffff;
+}
+
+static void write_pmc(int idx, u64 val)
+{
+	u64 shift, mask, pic;
+
+	shift = 0;
+	if (idx == PIC_UPPER_INDEX)
+		shift = 32;
+
+	mask = ((u64) 0xffffffff) << shift;
+	val <<= shift;
+
+	read_pic(pic);
+	pic &= ~mask;
+	pic |= val;
+	write_pic(pic);
+}
+
+static int sparc_perf_counter_set_period(struct perf_counter *counter,
+					 struct hw_perf_counter *hwc, int idx)
+{
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int ret = 0;
+
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+	if (left > MAX_PERIOD)
+		left = MAX_PERIOD;
+
+	atomic64_set(&hwc->prev_count, (u64)-left);
+
+	write_pmc(idx, (u64)(-left) & 0xffffffff);
+
+	perf_counter_update_userpage(counter);
+
+	return ret;
+}
+
+static int sparc_pmu_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	if (test_and_set_bit(idx, cpuc->used_mask))
+		return -EAGAIN;
+
+	sparc_pmu_disable_counter(hwc, idx);
+
+	cpuc->counters[idx] = counter;
+	set_bit(idx, cpuc->active_mask);
+
+	sparc_perf_counter_set_period(counter, hwc, idx);
+	sparc_pmu_enable_counter(hwc, idx);
+	perf_counter_update_userpage(counter);
+	return 0;
+}
+
+static u64 sparc_perf_counter_update(struct perf_counter *counter,
+				     struct hw_perf_counter *hwc, int idx)
+{
+	int shift = 64 - 32;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
+
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	new_raw_count = read_pmc(idx);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+			     new_raw_count) != prev_raw_count)
+		goto again;
+
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
+}
+
+static void sparc_pmu_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	clear_bit(idx, cpuc->active_mask);
+	sparc_pmu_disable_counter(hwc, idx);
+
+	barrier();
+
+	sparc_perf_counter_update(counter, hwc, idx);
+	cpuc->counters[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+
+	perf_counter_update_userpage(counter);
+}
+
+static void sparc_pmu_read(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	sparc_perf_counter_update(counter, hwc, hwc->idx);
+}
+
+static void sparc_pmu_unthrottle(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	sparc_pmu_enable_counter(hwc, hwc->idx);
+}
+
+static atomic_t active_counters = ATOMIC_INIT(0);
+static DEFINE_MUTEX(pmc_grab_mutex);
+
+void perf_counter_grab_pmc(void)
+{
+	if (atomic_inc_not_zero(&active_counters))
+		return;
+
+	mutex_lock(&pmc_grab_mutex);
+	if (atomic_read(&active_counters) == 0) {
+		if (atomic_read(&nmi_active) > 0) {
+			on_each_cpu(stop_nmi_watchdog, NULL, 1);
+			BUG_ON(atomic_read(&nmi_active) != 0);
+		}
+		atomic_inc(&active_counters);
+	}
+	mutex_unlock(&pmc_grab_mutex);
+}
+
+void perf_counter_release_pmc(void)
+{
+	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_grab_mutex)) {
+		if (atomic_read(&nmi_active) == 0)
+			on_each_cpu(start_nmi_watchdog, NULL, 1);
+		mutex_unlock(&pmc_grab_mutex);
+	}
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	perf_counter_release_pmc();
+}
+
+static int __hw_perf_counter_init(struct perf_counter *counter)
+{
+	struct perf_counter_attr *attr = &counter->attr;
+	struct hw_perf_counter *hwc = &counter->hw;
+	const struct perf_event_map *pmap;
+	u64 enc;
+
+	if (atomic_read(&nmi_active) < 0)
+		return -ENODEV;
+
+	if (attr->type != PERF_TYPE_HARDWARE)
+		return -EOPNOTSUPP;
+
+	if (attr->config >= sparc_pmu->max_events)
+		return -EINVAL;
+
+	perf_counter_grab_pmc();
+	counter->destroy = hw_perf_counter_destroy;
+
+	/* We save the enable bits in the config_base.  So to
+	 * turn off sampling just write 'config', and to enable
+	 * things write 'config | config_base'.
+	 */
+	hwc->config_base = 0;
+	if (!attr->exclude_user)
+		hwc->config_base |= PCR_UTRACE;
+	if (!attr->exclude_kernel)
+		hwc->config_base |= PCR_STRACE;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = MAX_PERIOD;
+		hwc->last_period = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	}
+
+	pmap = sparc_pmu->event_map(attr->config);
+
+	enc = pmap->encoding;
+	if (pmap->pic_mask & PIC_UPPER) {
+		hwc->idx = PIC_UPPER_INDEX;
+		enc <<= sparc_pmu->upper_shift;
+	} else {
+		hwc->idx = PIC_LOWER_INDEX;
+		enc <<= sparc_pmu->lower_shift;
+	}
+
+	hwc->config |= enc;
+	return 0;
+}
+
+static const struct pmu pmu = {
+	.enable		= sparc_pmu_enable,
+	.disable	= sparc_pmu_disable,
+	.read		= sparc_pmu_read,
+	.unthrottle	= sparc_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+	int err = __hw_perf_counter_init(counter);
+
+	if (err)
+		return ERR_PTR(err);
+	return &pmu;
+}
+
+void perf_counter_print_debug(void)
+{
+	unsigned long flags;
+	u64 pcr, pic;
+	int cpu;
+
+	if (!sparc_pmu)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+
+	pcr = pcr_ops->read();
+	read_pic(pic);
+
+	pr_info("\n");
+	pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n",
+		cpu, pcr, pic);
+
+	local_irq_restore(flags);
+}
+
+static int __kprobes perf_counter_nmi_handler(struct notifier_block *self,
+					      unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct perf_sample_data data;
+	struct cpu_hw_counters *cpuc;
+	struct pt_regs *regs;
+	int idx;
+
+	if (!atomic_read(&active_counters))
+		return NOTIFY_DONE;
+
+	switch (cmd) {
+	case DIE_NMI:
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	regs = args->regs;
+
+	data.regs = regs;
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_counters);
+	for (idx = 0; idx < MAX_HWCOUNTERS; idx++) {
+		struct perf_counter *counter = cpuc->counters[idx];
+		struct hw_perf_counter *hwc;
+		u64 val;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		hwc = &counter->hw;
+		val = sparc_perf_counter_update(counter, hwc, idx);
+		if (val & (1ULL << 31))
+			continue;
+
+		data.period = counter->hw.last_period;
+		if (!sparc_perf_counter_set_period(counter, hwc, idx))
+			continue;
+
+		if (perf_counter_overflow(counter, 1, &data))
+			sparc_pmu_disable_counter(hwc, idx);
+	}
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+	.notifier_call		= perf_counter_nmi_handler,
+};
+
+static bool __init supported_pmu(void)
+{
+	if (!strcmp(sparc_pmu_type, "ultra3i")) {
+		sparc_pmu = &ultra3i_pmu;
+		return true;
+	}
+	return false;
+}
+
+void __init init_hw_perf_counters(void)
+{
+	pr_info("Performance counters: ");
+
+	if (!supported_pmu()) {
+		pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
+		return;
+	}
+
+	pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
+
+	/* All sparc64 PMUs currently have 2 counters.  But this simple
+	 * driver only supports one active counter at a time.
+	 */
+	perf_max_counters = 1;
+
+	register_die_notifier(&perf_counter_nmi_notifier);
+}
-- 
cgit v1.2.3-70-g09d2


From 91b9286d819b821fd742c0053fe0748818374198 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 07:09:06 -0700
Subject: sparc64: Provide hypervisor tracing bit support for perf counters.

A PMU need only specify which bit in the PCR enabled hypervisor
tracing in order to enable this.

This will be used in Niagara-2 perf counter support.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c
index f2c781450d0..d86009fa6f8 100644
--- a/arch/sparc/kernel/perf_counter.c
+++ b/arch/sparc/kernel/perf_counter.c
@@ -77,6 +77,7 @@ struct sparc_pmu {
 	int				upper_shift;
 	int				lower_shift;
 	int				event_mask;
+	int				hv_bit;
 };
 
 static const struct perf_event_map ultra3i_perfmon_event_map[] = {
@@ -178,7 +179,7 @@ void hw_perf_disable(void)
 	cpuc->enabled = 0;
 
 	val = pcr_ops->read();
-	val &= ~(PCR_UTRACE | PCR_STRACE);
+	val &= ~(PCR_UTRACE | PCR_STRACE | sparc_pmu->hv_bit);
 	pcr_ops->write(val);
 }
 
@@ -377,6 +378,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->config_base |= PCR_UTRACE;
 	if (!attr->exclude_kernel)
 		hwc->config_base |= PCR_STRACE;
+	if (!attr->exclude_hv)
+		hwc->config_base |= sparc_pmu->hv_bit;
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = MAX_PERIOD;
-- 
cgit v1.2.3-70-g09d2


From 496c07e3b43475124d7f2d77fafbc1f5055abfee Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 07:10:59 -0700
Subject: sparc64: Provide a way to specify a perf counter overflow IRQ enable
 bit.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/perf_counter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c
index d86009fa6f8..0cd5487b341 100644
--- a/arch/sparc/kernel/perf_counter.c
+++ b/arch/sparc/kernel/perf_counter.c
@@ -78,6 +78,7 @@ struct sparc_pmu {
 	int				lower_shift;
 	int				event_mask;
 	int				hv_bit;
+	int				irq_bit;
 };
 
 static const struct perf_event_map ultra3i_perfmon_event_map[] = {
@@ -179,7 +180,8 @@ void hw_perf_disable(void)
 	cpuc->enabled = 0;
 
 	val = pcr_ops->read();
-	val &= ~(PCR_UTRACE | PCR_STRACE | sparc_pmu->hv_bit);
+	val &= ~(PCR_UTRACE | PCR_STRACE |
+		 sparc_pmu->hv_bit | sparc_pmu->irq_bit);
 	pcr_ops->write(val);
 }
 
@@ -373,7 +375,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * turn off sampling just write 'config', and to enable
 	 * things write 'config | config_base'.
 	 */
-	hwc->config_base = 0;
+	hwc->config_base = sparc_pmu->irq_bit;
 	if (!attr->exclude_user)
 		hwc->config_base |= PCR_UTRACE;
 	if (!attr->exclude_kernel)
-- 
cgit v1.2.3-70-g09d2


From 660d13765f342ecb6e774fcbc35b64beb0a069da Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 07:13:26 -0700
Subject: sparc64: Perf counter 'nop' event is not constant.

On Niagara-2, for example, it's going to be different.  So make
it something specified in sparc_pmu.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/perf_counter.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c
index 0cd5487b341..f3b8ccca3c9 100644
--- a/arch/sparc/kernel/perf_counter.c
+++ b/arch/sparc/kernel/perf_counter.c
@@ -52,9 +52,6 @@
 #define PIC_UPPER_INDEX			0
 #define PIC_LOWER_INDEX			1
 
-#define PIC_UPPER_NOP			0x1c
-#define PIC_LOWER_NOP			0x14
-
 struct cpu_hw_counters {
 	struct perf_counter	*counters[MAX_HWCOUNTERS];
 	unsigned long		used_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)];
@@ -79,6 +76,8 @@ struct sparc_pmu {
 	int				event_mask;
 	int				hv_bit;
 	int				irq_bit;
+	int				upper_nop;
+	int				lower_nop;
 };
 
 static const struct perf_event_map ultra3i_perfmon_event_map[] = {
@@ -99,6 +98,8 @@ static const struct sparc_pmu ultra3i_pmu = {
 	.upper_shift	= 11,
 	.lower_shift	= 4,
 	.event_mask	= 0x3f,
+	.upper_nop	= 0x1c,
+	.lower_nop	= 0x14,
 };
 
 static const struct sparc_pmu *sparc_pmu __read_mostly;
@@ -120,7 +121,8 @@ static u64 mask_for_index(int idx)
 static u64 nop_for_index(int idx)
 {
 	return event_encoding(idx == PIC_UPPER_INDEX ?
-			      PIC_UPPER_NOP : PIC_LOWER_NOP, idx);
+			      sparc_pmu->upper_nop :
+			      sparc_pmu->lower_nop, idx);
 }
 
 static inline void sparc_pmu_enable_counter(struct hw_perf_counter *hwc,
-- 
cgit v1.2.3-70-g09d2


From b73d884756303316ead4cd7dad51236b2a515a1a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 10 Sep 2009 07:22:18 -0700
Subject: sparc64: Initial niagara2 perf counter support.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/perf_counter.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'arch/sparc')

diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c
index f3b8ccca3c9..09de4035eaa 100644
--- a/arch/sparc/kernel/perf_counter.c
+++ b/arch/sparc/kernel/perf_counter.c
@@ -102,6 +102,32 @@ static const struct sparc_pmu ultra3i_pmu = {
 	.lower_nop	= 0x14,
 };
 
+static const struct perf_event_map niagara2_perfmon_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER },
+	[PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER },
+};
+
+static const struct perf_event_map *niagara2_event_map(int event)
+{
+	return &niagara2_perfmon_event_map[event];
+}
+
+static const struct sparc_pmu niagara2_pmu = {
+	.event_map	= niagara2_event_map,
+	.max_events	= ARRAY_SIZE(niagara2_perfmon_event_map),
+	.upper_shift	= 19,
+	.lower_shift	= 6,
+	.event_mask	= 0xfff,
+	.hv_bit		= 0x8,
+	.irq_bit	= 0x03,
+	.upper_nop	= 0x220,
+	.lower_nop	= 0x220,
+};
+
 static const struct sparc_pmu *sparc_pmu __read_mostly;
 
 static u64 event_encoding(u64 event, int idx)
@@ -504,6 +530,10 @@ static bool __init supported_pmu(void)
 		sparc_pmu = &ultra3i_pmu;
 		return true;
 	}
+	if (!strcmp(sparc_pmu_type, "niagara2")) {
+		sparc_pmu = &niagara2_pmu;
+		return true;
+	}
 	return false;
 }
 
-- 
cgit v1.2.3-70-g09d2