From b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 18 Jul 2008 17:27:28 +0200
Subject: nohz: prevent tick stop outside of the idle loop

Jack Ren and Eric Miao tracked down the following long standing
problem in the NOHZ code:

	scheduler switch to idle task
	enable interrupts

Window starts here

	----> interrupt happens (does not set NEED_RESCHED)
	      	irq_exit() stops the tick

	----> interrupt happens (does set NEED_RESCHED)

	return from schedule()

	cpu_idle(): preempt_disable();

Window ends here

The interrupts can happen at any point inside the race window. The
first interrupt stops the tick, the second one causes the scheduler to
rerun and switch away from idle again and we end up with the tick
disabled.

The fact that it needs two interrupts where the first one does not set
NEED_RESCHED and the second one does made the bug obscure and extremly
hard to reproduce and analyse. Kudos to Jack and Eric.

Solution: Limit the NOHZ functionality to the idle loop to make sure
that we can not run into such a situation ever again.

cpu_idle()
{
	preempt_disable();

	while(1) {
		 tick_nohz_stop_sched_tick(1); <- tell NOHZ code that we
		 			          are in the idle loop

		 while (!need_resched())
		       halt();

		 tick_nohz_restart_sched_tick(); <- disables NOHZ mode
		 preempt_enable_no_resched();
		 schedule();
		 preempt_disable();
	}
}

In hindsight we should have done this forever, but ...

/me grabs a large brown paperbag.

Debugged-by: Jack Ren <jack.ren@marvell.com>,
Debugged-by: eric miao <eric.y.miao@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/powerpc/kernel/idle.c             | 2 +-
 arch/powerpc/platforms/iseries/setup.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index c3cf0e8f3ac..d308a9f70f1 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -60,7 +60,7 @@ void cpu_idle(void)
 
 	set_thread_flag(TIF_POLLING_NRFLAG);
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		while (!need_resched() && !cpu_should_die()) {
 			ppc64_runlatch_off();
 
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index b72120751bb..70b688c1aef 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -561,7 +561,7 @@ static void yield_shared_processor(void)
 static void iseries_shared_idle(void)
 {
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		while (!need_resched() && !hvlpevent_is_pending()) {
 			local_irq_disable();
 			ppc64_runlatch_off();
@@ -591,7 +591,7 @@ static void iseries_dedicated_idle(void)
 	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		if (!need_resched()) {
 			while (!need_resched()) {
 				ppc64_runlatch_off();
-- 
cgit v1.2.3-70-g09d2


From 78f56bd3d2dbe173bf1a946b353bf72ab9c0b94e Mon Sep 17 00:00:00 2001
From: Jon Smirl <jonsmirl@gmail.com>
Date: Sun, 20 Jul 2008 11:30:08 -0400
Subject: powerpc/mpc5200: Remove fsl-soc.c from mpc5200 build, it is not
 needed.

Signed-off-by: Jon Smirl <jonsmirl@gmail.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/platforms/52xx/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index acd2fc8cf49..981b84b7599 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -1,7 +1,6 @@
 config PPC_MPC52xx
 	bool "52xx-based boards"
 	depends on PPC_MULTIPLATFORM && PPC32
-	select FSL_SOC
 	select PPC_CLOCK
 
 config PPC_MPC5200_SIMPLE
-- 
cgit v1.2.3-70-g09d2


From 6d5509babce654fd9ce0ff6689dbdf6ce56c43ae Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 May 2008 17:04:53 -0600
Subject: powerpc/mpc5200: Make mpc5200 GPIO driver select the GENERIC_GPIO
 config

CONFIG_GENERIC_GPIO is needed for the gpio driver to work.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/platforms/52xx/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index 981b84b7599..c1ca4f3ee5d 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -46,6 +46,7 @@ config PPC_MPC5200_BUGFIX
 config PPC_MPC5200_GPIO
 	bool "MPC5200 GPIO support"
 	depends on PPC_MPC52xx
+	select GENERIC_GPIO
 	select HAVE_GPIO_LIB
 	help
 	  Enable gpiolib support for mpc5200 based boards
-- 
cgit v1.2.3-70-g09d2


From 9bcab8405c98c34849c5795c717b7e6a3e2d3875 Mon Sep 17 00:00:00 2001
From: Milton Miller <miltonm@bga.com>
Date: Fri, 11 Jul 2008 09:03:09 +1000
Subject: powerpc/spufs: correct kcalloc usage

kcalloc is supposed to be called with the count as its first argument and
the element size as the second.

Signed-off-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
---
 arch/powerpc/platforms/cell/spufs/sputrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c
index 8c0e95766a6..92d20e993ed 100644
--- a/arch/powerpc/platforms/cell/spufs/sputrace.c
+++ b/arch/powerpc/platforms/cell/spufs/sputrace.c
@@ -196,8 +196,7 @@ static int __init sputrace_init(void)
 	struct proc_dir_entry *entry;
 	int i, error = -ENOMEM;
 
-	sputrace_log = kcalloc(sizeof(struct sputrace),
-				bufsize, GFP_KERNEL);
+	sputrace_log = kcalloc(bufsize, sizeof(struct sputrace), GFP_KERNEL);
 	if (!sputrace_log)
 		goto out;
 
-- 
cgit v1.2.3-70-g09d2


From 0855b543222e79cbbd9d66dd56cb54740e7d524f Mon Sep 17 00:00:00 2001
From: Andre Detsch <adetsch@br.ibm.com>
Date: Thu, 24 Jul 2008 10:57:26 +1000
Subject: powerpc/spufs: fix aff_mutex and cbe_spu_info[n].list_mutex deadlock

Currenlt,, it is possible to lock aff_mutex and
cbe_spu_info[n].list_mutex in different orders, allowing a deadlock to
occur. With this change, aff_mutex is not taken within a list_mutex
critical section anymore.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
---
 arch/powerpc/platforms/cell/spufs/sched.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 34654743363..f293963cd85 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -389,6 +389,9 @@ static int has_affinity(struct spu_context *ctx)
 	if (list_empty(&ctx->aff_list))
 		return 0;
 
+	if (atomic_read(&ctx->gang->aff_sched_count) == 0)
+		ctx->gang->aff_ref_spu = NULL;
+
 	if (!gang->aff_ref_spu) {
 		if (!(gang->aff_flags & AFF_MERGED))
 			aff_merge_remaining_ctxs(gang);
@@ -416,14 +419,8 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
  	if (spu->ctx->flags & SPU_CREATE_NOSCHED)
 		atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
 
-	if (ctx->gang){
-		mutex_lock(&ctx->gang->aff_mutex);
-		if (has_affinity(ctx)) {
-			if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
-				ctx->gang->aff_ref_spu = NULL;
-		}
-		mutex_unlock(&ctx->gang->aff_mutex);
-	}
+	if (ctx->gang)
+		atomic_dec_if_positive(&ctx->gang->aff_sched_count);
 
 	spu_switch_notify(spu, NULL);
 	spu_unmap_mappings(ctx);
@@ -562,10 +559,7 @@ static struct spu *spu_get_idle(struct spu_context *ctx)
 				goto found;
 			mutex_unlock(&cbe_spu_info[node].list_mutex);
 
-			mutex_lock(&ctx->gang->aff_mutex);
-			if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
-				ctx->gang->aff_ref_spu = NULL;
-			mutex_unlock(&ctx->gang->aff_mutex);
+			atomic_dec(&ctx->gang->aff_sched_count);
 			goto not_found;
 		}
 		mutex_unlock(&ctx->gang->aff_mutex);
-- 
cgit v1.2.3-70-g09d2


From ad1ede127760d6ca4903f44dfe1a8a38b3bfb36c Mon Sep 17 00:00:00 2001
From: Andre Detsch <adetsch@br.ibm.com>
Date: Thu, 24 Jul 2008 11:01:54 +1000
Subject: powerpc/spufs: better placement of spu affinity reference context

This patch adjusts the placement of a reference context from
a spu affinity chain. The reference context can now be placed
only on nodes that have enough spus not intended to be used by
another gang (already running on the node).

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
---
 arch/powerpc/platforms/cell/spufs/sched.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index f293963cd85..2deeeba7ecc 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -312,10 +312,27 @@ static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
 	 */
 	node = cpu_to_node(raw_smp_processor_id());
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
+		int available_spus;
+
 		node = (node < MAX_NUMNODES) ? node : 0;
 		if (!node_allowed(ctx, node))
 			continue;
+
+		available_spus = 0;
 		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if (spu->ctx && spu->ctx->gang
+					&& spu->ctx->aff_offset == 0)
+				available_spus -=
+					(spu->ctx->gang->contexts - 1);
+			else
+				available_spus++;
+		}
+		if (available_spus < ctx->gang->contexts) {
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
+			continue;
+		}
+
 		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			if ((!mem_aff || spu->has_mem_affinity) &&
 							sched_spu(spu)) {
-- 
cgit v1.2.3-70-g09d2


From 9483a578df27fe7603605d565eefe039c1ba5845 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Wed, 23 Jul 2008 21:26:48 -0700
Subject: add HAVE_CLK to Kconfig, for driver dependencies

Flag platforms as HAVE_CLK (or not) in Kconfig, based on whether they
support <linux/clk.h> calls, so that otherwise portable drivers which need
those calls can list that dependency.

Something like this is a prerequisite for merging the musb_hdrc driver,
currently used on platforms including Davinci, OMAP2430, OMAP3xx ...  and
the discrete TUSB6010 chip, which doesn't have a natural platform
dependency.  (Used with OMAP 2420 in current Nokia N8x0 tablets.)

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Russell King <rmk@arm.linux.org.uk>
Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig         |  7 +++++++
 arch/arm/Kconfig     | 13 +++++++++++++
 arch/avr32/Kconfig   |  1 +
 arch/powerpc/Kconfig |  1 +
 arch/sh/Kconfig      |  1 +
 5 files changed, 23 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/Kconfig b/arch/Kconfig
index ad89a33d8c6..4d5ebbc1e72 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -42,3 +42,10 @@ config HAVE_DMA_ATTRS
 
 config USE_GENERIC_SMP_HELPERS
 	def_bool n
+
+config HAVE_CLK
+	def_bool n
+	help
+	  The <linux/clk.h> calls support software clock gating and
+	  thus are a key power management tool on many systems.
+
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d048f6887d0..6fb4f03369f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -198,12 +198,14 @@ choice
 config ARCH_AAEC2000
 	bool "Agilent AAEC-2000 based"
 	select ARM_AMBA
+	select HAVE_CLK
 	help
 	  This enables support for systems based on the Agilent AAEC-2000
 
 config ARCH_INTEGRATOR
 	bool "ARM Ltd. Integrator family"
 	select ARM_AMBA
+	select HAVE_CLK
 	select ICST525
 	help
 	  Support for ARM's Integrator platform.
@@ -211,6 +213,7 @@ config ARCH_INTEGRATOR
 config ARCH_REALVIEW
 	bool "ARM Ltd. RealView family"
 	select ARM_AMBA
+	select HAVE_CLK
 	select ICST307
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
@@ -221,6 +224,7 @@ config ARCH_VERSATILE
 	bool "ARM Ltd. Versatile family"
 	select ARM_AMBA
 	select ARM_VIC
+	select HAVE_CLK
 	select ICST307
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
@@ -262,6 +266,8 @@ config ARCH_EP93XX
 	select ARM_AMBA
 	select ARM_VIC
 	select GENERIC_GPIO
+	select HAVE_CLK
+	select HAVE_CLK
 	select HAVE_GPIO_LIB
 	help
 	  This enables support for the Cirrus EP93xx series of CPUs.
@@ -381,6 +387,7 @@ config ARCH_NS9XXX
 	select GENERIC_GPIO
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
+	select HAVE_CLK
 	help
 	  Say Y here if you intend to run this kernel on a NetSilicon NS9xxx
 	  System.
@@ -430,6 +437,7 @@ config ARCH_ORION5X
 
 config ARCH_PNX4008
 	bool "Philips Nexperia PNX4008 Mobile"
+	select HAVE_CLK
 	help
 	  This enables support for Philips PNX4008 mobile platform.
 
@@ -438,6 +446,7 @@ config ARCH_PXA
 	depends on MMU
 	select ARCH_MTD_XIP
 	select GENERIC_GPIO
+	select HAVE_CLK
 	select HAVE_GPIO_LIB
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
@@ -468,6 +477,7 @@ config ARCH_SA1100
 	select GENERIC_GPIO
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
+	select HAVE_CLK
 	select TICK_ONESHOT
 	select HAVE_GPIO_LIB
 	help
@@ -476,6 +486,7 @@ config ARCH_SA1100
 config ARCH_S3C2410
 	bool "Samsung S3C2410, S3C2412, S3C2413, S3C2440, S3C2442, S3C2443"
 	select GENERIC_GPIO
+	select HAVE_CLK
 	help
 	  Samsung S3C2410X CPU based systems, such as the Simtec Electronics
 	  BAST (<http://www.simtec.co.uk/products/EB110ITX/>), the IPAQ 1940 or
@@ -503,12 +514,14 @@ config ARCH_DAVINCI
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_GPIO
+	select HAVE_CLK
 	help
 	  Support for TI's DaVinci platform.
 
 config ARCH_OMAP
 	bool "TI OMAP"
 	select GENERIC_GPIO
+	select HAVE_CLK
 	select HAVE_GPIO_LIB
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 45d63c98601..df4adefedb4 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -10,6 +10,7 @@ config AVR32
 	# With EMBEDDED=n, we get lots of stuff automatically selected
 	# that we usually don't need on AVR32.
 	select EMBEDDED
+	select HAVE_CLK
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	help
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 737ebf9d12b..4d7e2ba10ba 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -843,6 +843,7 @@ source "crypto/Kconfig"
 config PPC_CLOCK
 	bool
 	default n
+	select HAVE_CLK
 
 config PPC_LIB_RHEAP
 	bool
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 3e7384f4619..8879938f335 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -8,6 +8,7 @@ mainmenu "Linux/SuperH Kernel Configuration"
 config SUPERH
 	def_bool y
 	select EMBEDDED
+	select HAVE_CLK
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	help
-- 
cgit v1.2.3-70-g09d2


From b61bfa3c462671c48a51fb5c31af337c5a996a04 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@saeurebad.de>
Date: Wed, 23 Jul 2008 21:26:55 -0700
Subject: mm: move bootmem descriptors definition to a single place

There are a lot of places that define either a single bootmem descriptor or an
array of them.  Use only one central array with MAX_NUMNODES items instead.

Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Kyle McMartin <kyle@parisc-linux.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/mm/numa.c             |  8 ++++----
 arch/arm/mm/discontig.c          | 34 ++++++++++++++++------------------
 arch/ia64/mm/discontig.c         | 11 +++++------
 arch/m32r/mm/discontig.c         |  4 +---
 arch/m68k/mm/init.c              |  4 +---
 arch/mips/sgi-ip27/ip27-memory.c |  4 +---
 arch/parisc/mm/init.c            |  3 +--
 arch/powerpc/mm/numa.c           |  3 +--
 arch/sh/mm/numa.c                |  5 ++---
 arch/sparc64/mm/init.c           |  3 +--
 arch/x86/mm/discontig_32.c       |  3 +--
 arch/x86/mm/numa_64.c            |  4 +---
 include/linux/bootmem.h          |  2 ++
 mm/bootmem.c                     |  2 ++
 mm/page_alloc.c                  |  4 +---
 15 files changed, 40 insertions(+), 54 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index 10ab7833e83..a53fda0481c 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -19,7 +19,6 @@
 #include <asm/pgalloc.h>
 
 pg_data_t node_data[MAX_NUMNODES];
-bootmem_data_t node_bdata[MAX_NUMNODES];
 EXPORT_SYMBOL(node_data);
 
 #undef DEBUG_DISCONTIG
@@ -141,7 +140,7 @@ setup_memory_node(int nid, void *kernel_end)
 		printk(" not enough mem to reserve NODE_DATA");
 		return;
 	}
-	NODE_DATA(nid)->bdata = &node_bdata[nid];
+	NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
 
 	printk(" Detected node memory:   start %8lu, end %8lu\n",
 	       node_min_pfn, node_max_pfn);
@@ -304,8 +303,9 @@ void __init paging_init(void)
 	dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
 	for_each_online_node(nid) {
-		unsigned long start_pfn = node_bdata[nid].node_boot_start >> PAGE_SHIFT;
-		unsigned long end_pfn = node_bdata[nid].node_low_pfn;
+		bootmem_data_t *bdata = &bootmem_node_data[nid];
+		unsigned long start_pfn = bdata->node_boot_start >> PAGE_SHIFT;
+		unsigned long end_pfn = bdata->node_low_pfn;
 
 		if (dma_local_pfn >= end_pfn - start_pfn)
 			zones_size[ZONE_DMA] = end_pfn - start_pfn;
diff --git a/arch/arm/mm/discontig.c b/arch/arm/mm/discontig.c
index 1e560218950..c8c0c4b0f0a 100644
--- a/arch/arm/mm/discontig.c
+++ b/arch/arm/mm/discontig.c
@@ -21,26 +21,24 @@
  * Our node_data structure for discontiguous memory.
  */
 
-static bootmem_data_t node_bootmem_data[MAX_NUMNODES];
-
 pg_data_t discontig_node_data[MAX_NUMNODES] = {
-  { .bdata = &node_bootmem_data[0] },
-  { .bdata = &node_bootmem_data[1] },
-  { .bdata = &node_bootmem_data[2] },
-  { .bdata = &node_bootmem_data[3] },
+  { .bdata = &bootmem_node_data[0] },
+  { .bdata = &bootmem_node_data[1] },
+  { .bdata = &bootmem_node_data[2] },
+  { .bdata = &bootmem_node_data[3] },
 #if MAX_NUMNODES == 16
-  { .bdata = &node_bootmem_data[4] },
-  { .bdata = &node_bootmem_data[5] },
-  { .bdata = &node_bootmem_data[6] },
-  { .bdata = &node_bootmem_data[7] },
-  { .bdata = &node_bootmem_data[8] },
-  { .bdata = &node_bootmem_data[9] },
-  { .bdata = &node_bootmem_data[10] },
-  { .bdata = &node_bootmem_data[11] },
-  { .bdata = &node_bootmem_data[12] },
-  { .bdata = &node_bootmem_data[13] },
-  { .bdata = &node_bootmem_data[14] },
-  { .bdata = &node_bootmem_data[15] },
+  { .bdata = &bootmem_node_data[4] },
+  { .bdata = &bootmem_node_data[5] },
+  { .bdata = &bootmem_node_data[6] },
+  { .bdata = &bootmem_node_data[7] },
+  { .bdata = &bootmem_node_data[8] },
+  { .bdata = &bootmem_node_data[9] },
+  { .bdata = &bootmem_node_data[10] },
+  { .bdata = &bootmem_node_data[11] },
+  { .bdata = &bootmem_node_data[12] },
+  { .bdata = &bootmem_node_data[13] },
+  { .bdata = &bootmem_node_data[14] },
+  { .bdata = &bootmem_node_data[15] },
 #endif
 };
 
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 544dc420c65..2fcf8464331 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -36,7 +36,6 @@ struct early_node_data {
 	struct ia64_node_data *node_data;
 	unsigned long pernode_addr;
 	unsigned long pernode_size;
-	struct bootmem_data bootmem_data;
 	unsigned long num_physpages;
 #ifdef CONFIG_ZONE_DMA
 	unsigned long num_dma_physpages;
@@ -76,7 +75,7 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
 				  int node)
 {
 	unsigned long cstart, epfn, end = start + len;
-	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+	struct bootmem_data *bdp = &bootmem_node_data[node];
 
 	epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
 	cstart = GRANULEROUNDDOWN(start);
@@ -167,7 +166,7 @@ static void __init fill_pernode(int node, unsigned long pernode,
 {
 	void *cpu_data;
 	int cpus = early_nr_cpus_node(node);
-	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+	struct bootmem_data *bdp = &bootmem_node_data[node];
 
 	mem_data[node].pernode_addr = pernode;
 	mem_data[node].pernode_size = pernodesize;
@@ -224,7 +223,7 @@ static int __init find_pernode_space(unsigned long start, unsigned long len,
 {
 	unsigned long epfn;
 	unsigned long pernodesize = 0, pernode, pages, mapsize;
-	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+	struct bootmem_data *bdp = &bootmem_node_data[node];
 
 	epfn = (start + len) >> PAGE_SHIFT;
 
@@ -440,7 +439,7 @@ void __init find_memory(void)
 	efi_memmap_walk(find_max_min_low_pfn, NULL);
 
 	for_each_online_node(node)
-		if (mem_data[node].bootmem_data.node_low_pfn) {
+		if (bootmem_node_data[node].node_low_pfn) {
 			node_clear(node, memory_less_mask);
 			mem_data[node].min_pfn = ~0UL;
 		}
@@ -460,7 +459,7 @@ void __init find_memory(void)
 		else if (node_isset(node, memory_less_mask))
 			continue;
 
-		bdp = &mem_data[node].bootmem_data;
+		bdp = &bootmem_node_data[node];
 		pernode = mem_data[node].pernode_addr;
 		pernodesize = mem_data[node].pernode_size;
 		map = pernode + pernodesize;
diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c
index 07c1af7dc0e..aa9145ef6cc 100644
--- a/arch/m32r/mm/discontig.c
+++ b/arch/m32r/mm/discontig.c
@@ -20,7 +20,6 @@ extern char _end[];
 
 struct pglist_data *node_data[MAX_NUMNODES];
 EXPORT_SYMBOL(node_data);
-static bootmem_data_t node_bdata[MAX_NUMNODES] __initdata;
 
 pg_data_t m32r_node_data[MAX_NUMNODES];
 
@@ -81,7 +80,7 @@ unsigned long __init setup_memory(void)
 	for_each_online_node(nid) {
 		mp = &mem_prof[nid];
 		NODE_DATA(nid)=(pg_data_t *)&m32r_node_data[nid];
-		NODE_DATA(nid)->bdata = &node_bdata[nid];
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
 		min_pfn = mp->start_pfn;
 		max_pfn = mp->start_pfn + mp->pages;
 		bootmap_size = init_bootmem_node(NODE_DATA(nid), mp->free_pfn,
@@ -163,4 +162,3 @@ unsigned long __init zone_sizes_init(void)
 
 	return holes;
 }
-
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index d8fb9c5303c..79f5f94d480 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -32,8 +32,6 @@
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-static bootmem_data_t __initdata bootmem_data[MAX_NUMNODES];
-
 pg_data_t pg_data_map[MAX_NUMNODES];
 EXPORT_SYMBOL(pg_data_map);
 
@@ -58,7 +56,7 @@ void __init m68k_setup_node(int node)
 		pg_data_table[i] = pg_data_map + node;
 	}
 #endif
-	pg_data_map[node].bdata = bootmem_data + node;
+	pg_data_map[node].bdata = bootmem_node_data + node;
 	node_set_online(node);
 }
 
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 42cd1095630..060d853d7b3 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -33,8 +33,6 @@
 #define SLOT_PFNSHIFT           (SLOT_SHIFT - PAGE_SHIFT)
 #define PFN_NASIDSHFT           (NASID_SHFT - PAGE_SHIFT)
 
-static struct bootmem_data __initdata plat_node_bdata[MAX_COMPACT_NODES];
-
 struct node_data *__node_data[MAX_COMPACT_NODES];
 
 EXPORT_SYMBOL(__node_data);
@@ -403,7 +401,7 @@ static void __init node_mem_init(cnodeid_t node)
 	 */
 	__node_data[node] = __va(slot_freepfn << PAGE_SHIFT);
 
-	NODE_DATA(node)->bdata = &plat_node_bdata[node];
+	NODE_DATA(node)->bdata = &bootmem_node_data[node];
 	NODE_DATA(node)->node_start_pfn = start_pfn;
 	NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn;
 
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index b4d6c8777ed..0ddf4904640 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -36,7 +36,6 @@ extern int  data_start;
 
 #ifdef CONFIG_DISCONTIGMEM
 struct node_map_data node_data[MAX_NUMNODES] __read_mostly;
-bootmem_data_t bmem_data[MAX_NUMNODES] __read_mostly;
 unsigned char pfnnid_map[PFNNID_MAP_MAX] __read_mostly;
 #endif
 
@@ -262,7 +261,7 @@ static void __init setup_bootmem(void)
 #ifdef CONFIG_DISCONTIGMEM
 	for (i = 0; i < MAX_PHYSMEM_RANGES; i++) {
 		memset(NODE_DATA(i), 0, sizeof(pg_data_t));
-		NODE_DATA(i)->bdata = &bmem_data[i];
+		NODE_DATA(i)->bdata = &bootmem_node_data[i];
 	}
 	memset(pfnnid_map, 0xff, sizeof(pfnnid_map));
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index cf4bffba6f7..d9a18135133 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -39,7 +39,6 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(numa_cpumask_lookup_table);
 EXPORT_SYMBOL(node_data);
 
-static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
 
@@ -816,7 +815,7 @@ void __init do_init_bootmem(void)
   		dbg("node %d\n", nid);
 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
 
-		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
 		NODE_DATA(nid)->node_start_pfn = start_pfn;
 		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
 
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index 1663199ce88..095d93bec7c 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -14,7 +14,6 @@
 #include <linux/pfn.h>
 #include <asm/sections.h>
 
-static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL_GPL(node_data);
 
@@ -35,7 +34,7 @@ void __init setup_memory(void)
 	NODE_DATA(0) = pfn_to_kaddr(free_pfn);
 	memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
 	free_pfn += PFN_UP(sizeof(struct pglist_data));
-	NODE_DATA(0)->bdata = &plat_node_bdata[0];
+	NODE_DATA(0)->bdata = &bootmem_node_data[0];
 
 	/* Set up node 0 */
 	setup_bootmem_allocator(free_pfn);
@@ -66,7 +65,7 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 	free_pfn += PFN_UP(sizeof(struct pglist_data));
 	memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-	NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+	NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
 	NODE_DATA(nid)->node_start_pfn = start_pfn;
 	NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
 
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 84898c44dd4..71329747395 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -788,7 +788,6 @@ int numa_cpu_lookup_table[NR_CPUS];
 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
 struct mdesc_mblock {
 	u64	base;
@@ -871,7 +870,7 @@ static void __init allocate_node_data(int nid)
 	NODE_DATA(nid) = __va(paddr);
 	memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-	NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+	NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
 #endif
 
 	p = NODE_DATA(nid);
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 5dfef9fa061..62fa440678d 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -42,7 +42,6 @@
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
-static bootmem_data_t node0_bdata;
 
 /*
  * numa interface - we expect the numa architecture specific code to have
@@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn,
 	for_each_online_node(nid)
 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-	NODE_DATA(0)->bdata = &node0_bdata;
+	NODE_DATA(0)->bdata = &bootmem_node_data[0];
 	setup_bootmem_allocator();
 }
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9782f42dd31..a4dd793d600 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -23,8 +23,6 @@
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
-
 struct memnode memnode;
 
 s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
@@ -198,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 		nodedata_phys + pgdat_size - 1);
 
 	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
-	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
+	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
 	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index a1d9b79078e..2599c741405 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -38,6 +38,8 @@ typedef struct bootmem_data {
 	struct list_head list;
 } bootmem_data_t;
 
+extern bootmem_data_t bootmem_node_data[];
+
 extern unsigned long bootmem_bootmap_pages(unsigned long);
 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
 extern void free_bootmem(unsigned long addr, unsigned long size);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9f4bbc5da73..35b3cb66703 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -36,6 +36,8 @@ static LIST_HEAD(bdata_list);
 unsigned long saved_max_pfn;
 #endif
 
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
+
 /* return the number of _pages_ that will be allocated for the boot bitmap */
 unsigned long __init bootmem_bootmap_pages(unsigned long pages)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9ece07ce65b..e089b92cdff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4040,9 +4040,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-static bootmem_data_t contig_bootmem_data;
-struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
-
+struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
 EXPORT_SYMBOL(contig_page_data);
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From a1f242ff460e4b50a045fa237c3c56cce9eabf83 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 23 Jul 2008 21:27:08 -0700
Subject: powerpc ioremap_prot

This adds ioremap_prot and pte_pgprot() so that one can extract protection
bits from a PTE and use them to ioremap_prot() (in order to support ptrace
of VM_IO | VM_PFNMAP as per Rik's patch).

This moves a couple of flag checks around in the ioremap implementations
of arch/powerpc.  There's a side effect of allowing non-cacheable and
non-guarded mappings on ppc32 which before would always have _PAGE_GUARDED
set whenever _PAGE_NO_CACHE is.

(standard ioremap will still set _PAGE_GUARDED, but ioremap_prot will be
capable of setting such a non guarded mapping).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Dave Airlie <airlied@linux.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/Kconfig                |  1 +
 arch/powerpc/mm/pgtable_32.c        | 22 ++++++++++++++++------
 arch/powerpc/mm/pgtable_64.c        | 16 ++++++++++++++++
 include/asm-powerpc/io.h            |  5 ++++-
 include/asm-powerpc/pgtable-4k.h    |  3 +++
 include/asm-powerpc/pgtable-ppc32.h | 16 ++++++++++++++++
 include/asm-powerpc/pgtable-ppc64.h |  8 ++++++++
 7 files changed, 64 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 4d7e2ba10ba..a487671c282 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -111,6 +111,7 @@ config PPC
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
 	select HAVE_IDE
+	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select HAVE_ARCH_KGDB
 	select HAVE_KRETPROBES
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c7584072dfc..2001abdb191 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -145,13 +145,20 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 void __iomem *
 ioremap(phys_addr_t addr, unsigned long size)
 {
-	return __ioremap(addr, size, _PAGE_NO_CACHE);
+	return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
 }
 EXPORT_SYMBOL(ioremap);
 
 void __iomem *
 ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
 {
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC);
+
 	return __ioremap(addr, size, flags);
 }
 EXPORT_SYMBOL(ioremap_flags);
@@ -163,6 +170,14 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 	phys_addr_t p;
 	int err;
 
+	/* Make sure we have the base flags */
+	if ((flags & _PAGE_PRESENT) == 0)
+		flags |= _PAGE_KERNEL;
+
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
 	/*
 	 * Choose an address to map it to.
 	 * Once the vmalloc system is running, we use it.
@@ -219,11 +234,6 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
 		v = (ioremap_bot -= size);
 	}
 
-	if ((flags & _PAGE_PRESENT) == 0)
-		flags |= _PAGE_KERNEL;
-	if (flags & _PAGE_NO_CACHE)
-		flags |= _PAGE_GUARDED;
-
 	/*
 	 * Should check if it is a candidate for a BAT mapping
 	 */
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3ef0ad2f9ca..365e61ae5db 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -107,9 +107,18 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
 {
 	unsigned long i;
 
+	/* Make sure we have the base flags */
 	if ((flags & _PAGE_PRESENT) == 0)
 		flags |= pgprot_val(PAGE_KERNEL);
 
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
+	/* We don't support the 4K PFN hack with ioremap */
+	if (flags & _PAGE_4K_PFN)
+		return NULL;
+
 	WARN_ON(pa & ~PAGE_MASK);
 	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 	WARN_ON(size & ~PAGE_MASK);
@@ -190,6 +199,13 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
 			     unsigned long flags)
 {
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
 	if (ppc_md.ioremap)
 		return ppc_md.ioremap(addr, size, flags);
 	return __ioremap(addr, size, flags);
diff --git a/include/asm-powerpc/io.h b/include/asm-powerpc/io.h
index 8b627823f5f..77c7fa025e6 100644
--- a/include/asm-powerpc/io.h
+++ b/include/asm-powerpc/io.h
@@ -617,7 +617,8 @@ static inline void iosync(void)
  *   and can be hooked by the platform via ppc_md
  *
  * * ioremap_flags allows to specify the page flags as an argument and can
- *   also be hooked by the platform via ppc_md
+ *   also be hooked by the platform via ppc_md. ioremap_prot is the exact
+ *   same thing as ioremap_flags.
  *
  * * ioremap_nocache is identical to ioremap
  *
@@ -639,6 +640,8 @@ extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
 extern void __iomem *ioremap_flags(phys_addr_t address, unsigned long size,
 				   unsigned long flags);
 #define ioremap_nocache(addr, size)	ioremap((addr), (size))
+#define ioremap_prot(addr, size, prot)	ioremap_flags((addr), (size), (prot))
+
 extern void iounmap(volatile void __iomem *addr);
 
 extern void __iomem *__ioremap(phys_addr_t, unsigned long size,
diff --git a/include/asm-powerpc/pgtable-4k.h b/include/asm-powerpc/pgtable-4k.h
index fd2090dc1dc..c9601dfb4a1 100644
--- a/include/asm-powerpc/pgtable-4k.h
+++ b/include/asm-powerpc/pgtable-4k.h
@@ -51,6 +51,9 @@
 #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \
 			 _PAGE_SECONDARY | _PAGE_GROUP_IX)
 
+/* There is no 4K PFN hack on 4K pages */
+#define _PAGE_4K_PFN	0
+
 /* PAGE_MASK gives the right answer below, but only by accident */
 /* It should be preserving the high 48 bits and then specifically */
 /* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
diff --git a/include/asm-powerpc/pgtable-ppc32.h b/include/asm-powerpc/pgtable-ppc32.h
index 3a96d001cb7..bdbab72f3eb 100644
--- a/include/asm-powerpc/pgtable-ppc32.h
+++ b/include/asm-powerpc/pgtable-ppc32.h
@@ -395,6 +395,12 @@ extern int icache_44x_need_flush;
 #ifndef _PAGE_EXEC
 #define _PAGE_EXEC	0
 #endif
+#ifndef _PAGE_ENDIAN
+#define _PAGE_ENDIAN	0
+#endif
+#ifndef _PAGE_COHERENT
+#define _PAGE_COHERENT	0
+#endif
 #ifndef _PMD_PRESENT_MASK
 #define _PMD_PRESENT_MASK	_PMD_PRESENT
 #endif
@@ -405,6 +411,12 @@ extern int icache_44x_need_flush;
 
 #define _PAGE_CHG_MASK	(PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
 
+
+#define PAGE_PROT_BITS	__pgprot(_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
+				 _PAGE_WRITETHRU | _PAGE_ENDIAN | \
+				 _PAGE_USER | _PAGE_ACCESSED | \
+				 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | \
+				 _PAGE_EXEC | _PAGE_HWEXEC)
 /*
  * Note: the _PAGE_COHERENT bit automatically gets set in the hardware
  * PTE if CONFIG_SMP is defined (hash_page does this); there is no need
@@ -538,6 +550,10 @@ static inline pte_t pte_mkyoung(pte_t pte) {
 	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkspecial(pte_t pte) {
 	return pte; }
+static inline unsigned long pte_pgprot(pte_t pte)
+{
+	return __pgprot(pte_val(pte)) & PAGE_PROT_BITS;
+}
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
diff --git a/include/asm-powerpc/pgtable-ppc64.h b/include/asm-powerpc/pgtable-ppc64.h
index ab98a9c80b2..ba8000352b9 100644
--- a/include/asm-powerpc/pgtable-ppc64.h
+++ b/include/asm-powerpc/pgtable-ppc64.h
@@ -117,6 +117,10 @@
 #define PAGE_AGP	__pgprot(_PAGE_BASE | _PAGE_WRENABLE | _PAGE_NO_CACHE)
 #define HAVE_PAGE_AGP
 
+#define PAGE_PROT_BITS	__pgprot(_PAGE_GUARDED | _PAGE_COHERENT | \
+				 _PAGE_NO_CACHE | _PAGE_WRITETHRU | \
+				 _PAGE_4K_PFN | _PAGE_RW | _PAGE_USER | \
+ 				 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_EXEC)
 /* PTEIDX nibble */
 #define _PTEIDX_SECONDARY	0x8
 #define _PTEIDX_GROUP_IX	0x7
@@ -262,6 +266,10 @@ static inline pte_t pte_mkhuge(pte_t pte) {
 	return pte; }
 static inline pte_t pte_mkspecial(pte_t pte) {
 	return pte; }
+static inline unsigned long pte_pgprot(pte_t pte)
+{
+	return __pgprot(pte_val(pte)) & PAGE_PROT_BITS;
+}
 
 /* Atomic PTE updates */
 static inline unsigned long pte_update(struct mm_struct *mm,
-- 
cgit v1.2.3-70-g09d2


From a352894d07059649398c4769dc8b645e1a1dad88 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 23 Jul 2008 21:27:09 -0700
Subject: spufs: use new vm_ops->access to allow local state access from gdb

This uses the new vm_ops->access to allow gdb to access the SPU local
store.  We currently prevent access to problem state registers, this can
be done later if really needed but it's safer not to.

[akpm@linux-foundation.org: fix typo]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Dave Airlie <airlied@linux.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/cell/spufs/file.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 99c73066b82..010a51f5979 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -288,9 +288,32 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return VM_FAULT_NOPAGE;
 }
 
+static int spufs_mem_mmap_access(struct vm_area_struct *vma,
+				unsigned long address,
+				void *buf, int len, int write)
+{
+	struct spu_context *ctx = vma->vm_file->private_data;
+	unsigned long offset = address - vma->vm_start;
+	char *local_store;
+
+	if (write && !(vma->vm_flags & VM_WRITE))
+		return -EACCES;
+	if (spu_acquire(ctx))
+		return -EINTR;
+	if ((offset + len) > vma->vm_end)
+		len = vma->vm_end - offset;
+	local_store = ctx->ops->get_ls(ctx);
+	if (write)
+		memcpy_toio(local_store + offset, buf, len);
+	else
+		memcpy_fromio(buf, local_store + offset, len);
+	spu_release(ctx);
+	return len;
+}
 
 static struct vm_operations_struct spufs_mem_mmap_vmops = {
 	.fault = spufs_mem_mmap_fault,
+	.access = spufs_mem_mmap_access,
 };
 
 static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
-- 
cgit v1.2.3-70-g09d2


From 42b7772812d15b86543a23b82bd6070eef9a08b1 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Wed, 23 Jul 2008 21:27:10 -0700
Subject: mm: remove double indirection on tlb parameter to free_pgd_range() &
 Co

The double indirection here is not needed anywhere and hence (at least)
confusing.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/hugetlbpage.c    |  2 +-
 arch/powerpc/mm/hugetlbpage.c |  8 ++++----
 fs/exec.c                     |  4 ++--
 include/asm-ia64/hugetlb.h    |  2 +-
 include/asm-powerpc/hugetlb.h |  2 +-
 include/asm-sh/hugetlb.h      |  2 +-
 include/asm-sparc/hugetlb.h   |  2 +-
 include/asm-x86/hugetlb.h     |  2 +-
 include/linux/mm.h            |  4 +---
 mm/internal.h                 |  3 +++
 mm/memory.c                   | 10 ++++++----
 mm/mmap.c                     |  6 ++++--
 12 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index d3ce8f3bcaa..cd49e2860ee 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -112,7 +112,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
 	return NULL;
 }
 
-void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
 			unsigned long floor, unsigned long ceiling)
 {
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0d12fba31bc..1a96cc891cf 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -255,7 +255,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 			    unsigned long addr, unsigned long end,
 			    unsigned long floor, unsigned long ceiling)
 {
@@ -315,13 +315,13 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb,
 		return;
 
 	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
+	pgd = pgd_offset(tlb->mm, addr);
 	do {
-		BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
+		BUG_ON(get_slice_psize(tlb->mm, addr) != mmu_huge_psize);
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+		hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
diff --git a/fs/exec.c b/fs/exec.c
index fd9234379e8..190ed1f9277 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -541,7 +541,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		/*
 		 * when the old and new regions overlap clear from new_end.
 		 */
-		free_pgd_range(&tlb, new_end, old_end, new_end,
+		free_pgd_range(tlb, new_end, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	} else {
 		/*
@@ -550,7 +550,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * have constraints on va-space that make this illegal (IA64) -
 		 * for the others its just a little faster.
 		 */
-		free_pgd_range(&tlb, old_start, old_end, new_end,
+		free_pgd_range(tlb, old_start, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	}
 	tlb_finish_mmu(tlb, new_end, old_end);
diff --git a/include/asm-ia64/hugetlb.h b/include/asm-ia64/hugetlb.h
index f28a9701f1c..e9d1e5e2382 100644
--- a/include/asm-ia64/hugetlb.h
+++ b/include/asm-ia64/hugetlb.h
@@ -4,7 +4,7 @@
 #include <asm/page.h>
 
 
-void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 			    unsigned long end, unsigned long floor,
 			    unsigned long ceiling);
 
diff --git a/include/asm-powerpc/hugetlb.h b/include/asm-powerpc/hugetlb.h
index be32ff02f4a..0a37aa5ecaa 100644
--- a/include/asm-powerpc/hugetlb.h
+++ b/include/asm-powerpc/hugetlb.h
@@ -7,7 +7,7 @@
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len);
 
-void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 			    unsigned long end, unsigned long floor,
 			    unsigned long ceiling);
 
diff --git a/include/asm-sh/hugetlb.h b/include/asm-sh/hugetlb.h
index 02402303d89..fb30018938c 100644
--- a/include/asm-sh/hugetlb.h
+++ b/include/asm-sh/hugetlb.h
@@ -26,7 +26,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
 static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
 }
 
-static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 					  unsigned long addr, unsigned long end,
 					  unsigned long floor,
 					  unsigned long ceiling)
diff --git a/include/asm-sparc/hugetlb.h b/include/asm-sparc/hugetlb.h
index 412af58926a..aeb92374ca3 100644
--- a/include/asm-sparc/hugetlb.h
+++ b/include/asm-sparc/hugetlb.h
@@ -31,7 +31,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 					  unsigned long addr, unsigned long end,
 					  unsigned long floor,
 					  unsigned long ceiling)
diff --git a/include/asm-x86/hugetlb.h b/include/asm-x86/hugetlb.h
index 14171a4924f..7eed6e0883b 100644
--- a/include/asm-x86/hugetlb.h
+++ b/include/asm-x86/hugetlb.h
@@ -26,7 +26,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
 static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
 }
 
-static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 					  unsigned long addr, unsigned long end,
 					  unsigned long floor,
 					  unsigned long ceiling)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5c7f8f64f70..f8071097302 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -769,10 +769,8 @@ struct mm_walk {
 
 int walk_page_range(unsigned long addr, unsigned long end,
 		struct mm_walk *walk);
-void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
-		unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
 void unmap_mapping_range(struct address_space *mapping,
diff --git a/mm/internal.h b/mm/internal.h
index 50807e12490..858ad01864d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,9 @@
 
 #include <linux/mm.h>
 
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+		unsigned long floor, unsigned long ceiling);
+
 static inline void set_page_count(struct page *page, int v)
 {
 	atomic_set(&page->_count, v);
diff --git a/mm/memory.c b/mm/memory.c
index 87350321e66..82f3f1c5cf1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
 			unsigned long floor, unsigned long ceiling)
 {
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
 		return;
 
 	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
+	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd..75e0d0673d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,8 @@
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
 
+#include "internal.h"
+
 #ifndef arch_mmap_check
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
@@ -1763,7 +1765,7 @@ static void unmap_region(struct mm_struct *mm,
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
 }
@@ -2063,7 +2065,7 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From a5516438959d90b071ff0a484ce4f3f523dc3152 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 23 Jul 2008 21:27:41 -0700
Subject: hugetlb: modular state for hugetlb page size

The goal of this patchset is to support multiple hugetlb page sizes.  This
is achieved by introducing a new struct hstate structure, which
encapsulates the important hugetlb state and constants (eg.  huge page
size, number of huge pages currently allocated, etc).

The hstate structure is then passed around the code which requires these
fields, they will do the right thing regardless of the exact hstate they
are operating on.

This patch adds the hstate structure, with a single global instance of it
(default_hstate), and does the basic work of converting hugetlb to use the
hstate.

Future patches will add more hstate structures to allow for different
hugetlbfs mounts to have different page sizes.

[akpm@linux-foundation.org: coding-style fixes]
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/hugetlbpage.c    |   7 +-
 arch/powerpc/mm/hugetlbpage.c |   3 +-
 arch/s390/mm/hugetlbpage.c    |   3 +-
 arch/sh/mm/hugetlbpage.c      |   3 +-
 arch/sparc64/mm/hugetlbpage.c |   5 +-
 arch/x86/mm/hugetlbpage.c     |   5 +-
 fs/hugetlbfs/inode.c          |  52 +++---
 include/asm-ia64/hugetlb.h    |   3 +-
 include/asm-powerpc/hugetlb.h |   3 +-
 include/asm-s390/hugetlb.h    |   3 +-
 include/asm-sh/hugetlb.h      |   3 +-
 include/asm-sparc/hugetlb.h   |   3 +-
 include/asm-x86/hugetlb.h     |   8 +-
 include/linux/hugetlb.h       |  88 +++++++++-
 ipc/shm.c                     |   3 +-
 mm/hugetlb.c                  | 368 +++++++++++++++++++++++-------------------
 mm/memory.c                   |   2 +-
 mm/mempolicy.c                |   9 +-
 mm/mmap.c                     |   3 +-
 19 files changed, 356 insertions(+), 218 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index cd49e2860ee..6170f097d25 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -24,7 +24,7 @@
 unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
 
 pte_t *
-huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
+huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
 	unsigned long taddr = htlbpage_to_page(addr);
 	pgd_t *pgd;
@@ -75,7 +75,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
  * Don't actually need to do any preparation, but need to make sure
  * the address is in the right region.
  */
-int prepare_hugepage_range(unsigned long addr, unsigned long len)
+int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
 {
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
@@ -149,7 +150,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u
 
 	/* Handle MAP_FIXED */
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len))
+		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
 	}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1a96cc891cf..c94dc71af98 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -128,7 +128,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return NULL;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pg;
 	pud_t *pu;
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index f4b6124fdb7..9162dc84f77 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -72,7 +72,8 @@ void arch_release_hugepage(struct page *page)
 	page[1].index = 0;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index ae8c321d6e2..2f9dbe0ef4a 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -22,7 +22,8 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index ebefd2a1437..1307b23f6a7 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -175,7 +175,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len))
+		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
 	}
@@ -195,7 +195,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 				pgoff, flags);
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686..52476fde899 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 	return 1;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -368,7 +369,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len))
+		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 428eff5b73f..516c581b537 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -80,6 +80,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	loff_t len, vma_len;
 	int ret;
+	struct hstate *h = hstate_file(file);
 
 	/*
 	 * vma address alignment (but not the pgoff alignment) has
@@ -92,7 +93,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
 	vma->vm_ops = &hugetlb_vm_ops;
 
-	if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT))
+	if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT))
 		return -EINVAL;
 
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
@@ -104,8 +105,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
 	if (hugetlb_reserve_pages(inode,
-				vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
-				len >> HPAGE_SHIFT, vma))
+				vma->vm_pgoff >> huge_page_order(h),
+				len >> huge_page_shift(h), vma))
 		goto out;
 
 	ret = 0;
@@ -130,20 +131,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
+	struct hstate *h = hstate_file(file);
 
-	if (len & ~HPAGE_MASK)
+	if (len & ~huge_page_mask(h))
 		return -EINVAL;
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len))
+		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
 	}
 
 	if (addr) {
-		addr = ALIGN(addr, HPAGE_SIZE);
+		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
@@ -156,7 +158,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		start_addr = TASK_UNMAPPED_BASE;
 
 full_search:
-	addr = ALIGN(start_addr, HPAGE_SIZE);
+	addr = ALIGN(start_addr, huge_page_size(h));
 
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
@@ -174,7 +176,7 @@ full_search:
 
 		if (!vma || addr + len <= vma->vm_start)
 			return addr;
-		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+		addr = ALIGN(vma->vm_end, huge_page_size(h));
 	}
 }
 #endif
@@ -225,10 +227,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 			      size_t len, loff_t *ppos)
 {
+	struct hstate *h = hstate_file(filp);
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	unsigned long index = *ppos >> HPAGE_SHIFT;
-	unsigned long offset = *ppos & ~HPAGE_MASK;
+	unsigned long index = *ppos >> huge_page_shift(h);
+	unsigned long offset = *ppos & ~huge_page_mask(h);
 	unsigned long end_index;
 	loff_t isize;
 	ssize_t retval = 0;
@@ -243,17 +246,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 	if (!isize)
 		goto out;
 
-	end_index = (isize - 1) >> HPAGE_SHIFT;
+	end_index = (isize - 1) >> huge_page_shift(h);
 	for (;;) {
 		struct page *page;
-		int nr, ret;
+		unsigned long nr, ret;
 
 		/* nr is the maximum number of bytes to copy from this page */
-		nr = HPAGE_SIZE;
+		nr = huge_page_size(h);
 		if (index >= end_index) {
 			if (index > end_index)
 				goto out;
-			nr = ((isize - 1) & ~HPAGE_MASK) + 1;
+			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
 			if (nr <= offset) {
 				goto out;
 			}
@@ -287,8 +290,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 		offset += ret;
 		retval += ret;
 		len -= ret;
-		index += offset >> HPAGE_SHIFT;
-		offset &= ~HPAGE_MASK;
+		index += offset >> huge_page_shift(h);
+		offset &= ~huge_page_mask(h);
 
 		if (page)
 			page_cache_release(page);
@@ -298,7 +301,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 			break;
 	}
 out:
-	*ppos = ((loff_t)index << HPAGE_SHIFT) + offset;
+	*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
 	mutex_unlock(&inode->i_mutex);
 	return retval;
 }
@@ -339,8 +342,9 @@ static void truncate_huge_page(struct page *page)
 
 static void truncate_hugepages(struct inode *inode, loff_t lstart)
 {
+	struct hstate *h = hstate_inode(inode);
 	struct address_space *mapping = &inode->i_data;
-	const pgoff_t start = lstart >> HPAGE_SHIFT;
+	const pgoff_t start = lstart >> huge_page_shift(h);
 	struct pagevec pvec;
 	pgoff_t next;
 	int i, freed = 0;
@@ -449,8 +453,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;
 	struct address_space *mapping = inode->i_mapping;
+	struct hstate *h = hstate_inode(inode);
 
-	BUG_ON(offset & ~HPAGE_MASK);
+	BUG_ON(offset & ~huge_page_mask(h));
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
@@ -465,6 +470,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
+	struct hstate *h = hstate_inode(inode);
 	int error;
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -476,7 +482,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (ia_valid & ATTR_SIZE) {
 		error = -EINVAL;
-		if (!(attr->ia_size & ~HPAGE_MASK))
+		if (!(attr->ia_size & ~huge_page_mask(h)))
 			error = hugetlb_vmtruncate(inode, attr->ia_size);
 		if (error)
 			goto out;
@@ -610,9 +616,10 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
+	struct hstate *h = hstate_inode(dentry->d_inode);
 
 	buf->f_type = HUGETLBFS_MAGIC;
-	buf->f_bsize = HPAGE_SIZE;
+	buf->f_bsize = huge_page_size(h);
 	if (sbinfo) {
 		spin_lock(&sbinfo->stat_lock);
 		/* If no limits set, just report 0 for max/free/used
@@ -942,7 +949,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 		goto out_dentry;
 
 	error = -ENOMEM;
-	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL))
+	if (hugetlb_reserve_pages(inode, 0,
+			size >> huge_page_shift(hstate_inode(inode)), NULL))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/include/asm-ia64/hugetlb.h b/include/asm-ia64/hugetlb.h
index e9d1e5e2382..da55c63728e 100644
--- a/include/asm-ia64/hugetlb.h
+++ b/include/asm-ia64/hugetlb.h
@@ -8,7 +8,8 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 			    unsigned long end, unsigned long floor,
 			    unsigned long ceiling);
 
-int prepare_hugepage_range(unsigned long addr, unsigned long len);
+int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len);
 
 static inline int is_hugepage_only_range(struct mm_struct *mm,
 					 unsigned long addr,
diff --git a/include/asm-powerpc/hugetlb.h b/include/asm-powerpc/hugetlb.h
index 0a37aa5ecaa..ca37c4af27b 100644
--- a/include/asm-powerpc/hugetlb.h
+++ b/include/asm-powerpc/hugetlb.h
@@ -21,7 +21,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
  * If the arch doesn't supply something else, assume that hugepage
  * size aligned regions are ok without further preparation.
  */
-static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
+static inline int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
 {
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
diff --git a/include/asm-s390/hugetlb.h b/include/asm-s390/hugetlb.h
index 600a776f8f7..670a1d1745d 100644
--- a/include/asm-s390/hugetlb.h
+++ b/include/asm-s390/hugetlb.h
@@ -22,7 +22,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
  * If the arch doesn't supply something else, assume that hugepage
  * size aligned regions are ok without further preparation.
  */
-static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
+static inline int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
 {
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
diff --git a/include/asm-sh/hugetlb.h b/include/asm-sh/hugetlb.h
index fb30018938c..967068fb79a 100644
--- a/include/asm-sh/hugetlb.h
+++ b/include/asm-sh/hugetlb.h
@@ -14,7 +14,8 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
  * If the arch doesn't supply something else, assume that hugepage
  * size aligned regions are ok without further preparation.
  */
-static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
+static inline int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
 {
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
diff --git a/include/asm-sparc/hugetlb.h b/include/asm-sparc/hugetlb.h
index aeb92374ca3..177061064ee 100644
--- a/include/asm-sparc/hugetlb.h
+++ b/include/asm-sparc/hugetlb.h
@@ -22,7 +22,8 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
  * If the arch doesn't supply something else, assume that hugepage
  * size aligned regions are ok without further preparation.
  */
-static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
+static inline int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
 {
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
diff --git a/include/asm-x86/hugetlb.h b/include/asm-x86/hugetlb.h
index 7eed6e0883b..439a9acc132 100644
--- a/include/asm-x86/hugetlb.h
+++ b/include/asm-x86/hugetlb.h
@@ -14,11 +14,13 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
  * If the arch doesn't supply something else, assume that hugepage
  * size aligned regions are ok without further preparation.
  */
-static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
+static inline int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
 {
-	if (len & ~HPAGE_MASK)
+	struct hstate *h = hstate_file(file);
+	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
+	if (addr & ~huge_page_mask(h))
 		return -EINVAL;
 	return 0;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index abbc187193a..ad2271e11f9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -8,7 +8,6 @@
 #include <linux/mempolicy.h>
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
-#include <asm/hugetlb.h>
 
 struct ctl_table;
 
@@ -45,7 +44,8 @@ extern int sysctl_hugetlb_shm_group;
 
 /* arch callbacks */
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz);
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
@@ -80,7 +80,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
-#define prepare_hugepage_range(addr,len)	(-EINVAL)
+#define prepare_hugepage_range(file, addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
@@ -134,8 +134,6 @@ struct file *hugetlb_file_setup(const char *name, size_t);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
 
-#define BLOCKS_PER_HUGEPAGE	(HPAGE_SIZE / 512)
-
 static inline int is_file_hugepages(struct file *file)
 {
 	if (file->f_op == &hugetlbfs_file_operations)
@@ -164,4 +162,84 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long flags);
 #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
 
+#ifdef CONFIG_HUGETLB_PAGE
+
+/* Defines one hugetlb page size */
+struct hstate {
+	int hugetlb_next_nid;
+	unsigned int order;
+	unsigned long mask;
+	unsigned long max_huge_pages;
+	unsigned long nr_huge_pages;
+	unsigned long free_huge_pages;
+	unsigned long resv_huge_pages;
+	unsigned long surplus_huge_pages;
+	unsigned long nr_overcommit_huge_pages;
+	struct list_head hugepage_freelists[MAX_NUMNODES];
+	unsigned int nr_huge_pages_node[MAX_NUMNODES];
+	unsigned int free_huge_pages_node[MAX_NUMNODES];
+	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+};
+
+extern struct hstate default_hstate;
+
+static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
+{
+	return &default_hstate;
+}
+
+static inline struct hstate *hstate_file(struct file *f)
+{
+	return &default_hstate;
+}
+
+static inline struct hstate *hstate_inode(struct inode *i)
+{
+	return &default_hstate;
+}
+
+static inline unsigned long huge_page_size(struct hstate *h)
+{
+	return (unsigned long)PAGE_SIZE << h->order;
+}
+
+static inline unsigned long huge_page_mask(struct hstate *h)
+{
+	return h->mask;
+}
+
+static inline unsigned int huge_page_order(struct hstate *h)
+{
+	return h->order;
+}
+
+static inline unsigned huge_page_shift(struct hstate *h)
+{
+	return h->order + PAGE_SHIFT;
+}
+
+static inline unsigned int pages_per_huge_page(struct hstate *h)
+{
+	return 1 << h->order;
+}
+
+static inline unsigned int blocks_per_huge_page(struct hstate *h)
+{
+	return huge_page_size(h) / 512;
+}
+
+#include <asm/hugetlb.h>
+
+#else
+struct hstate {};
+#define hstate_file(f) NULL
+#define hstate_vma(v) NULL
+#define hstate_inode(i) NULL
+#define huge_page_size(h) PAGE_SIZE
+#define huge_page_mask(h) PAGE_MASK
+#define huge_page_order(h) 0
+#define huge_page_shift(h) PAGE_SHIFT
+#define pages_per_huge_page(h) 1
+#endif
+
 #endif /* _LINUX_HUGETLB_H */
diff --git a/ipc/shm.c b/ipc/shm.c
index 790240cd067..a726aebce7d 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -577,7 +577,8 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
 
 		if (is_file_hugepages(shp->shm_file)) {
 			struct address_space *mapping = inode->i_mapping;
-			*rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages;
+			struct hstate *h = hstate_file(shp->shm_file);
+			*rss += pages_per_huge_page(h) * mapping->nrpages;
 		} else {
 			struct shmem_inode_info *info = SHMEM_I(inode);
 			spin_lock(&info->lock);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 32dff4290c6..0d8153e25f0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,18 +22,12 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
-static unsigned long surplus_huge_pages;
-static unsigned long nr_overcommit_huge_pages;
 unsigned long max_huge_pages;
 unsigned long sysctl_overcommit_huge_pages;
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static unsigned int nr_huge_pages_node[MAX_NUMNODES];
-static unsigned int free_huge_pages_node[MAX_NUMNODES];
-static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-static int hugetlb_next_nid;
+
+struct hstate default_hstate;
 
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t)
  * Convert the address within this vma to the page offset within
  * the mapping, in pagecache page units; huge pages here.
  */
-static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma,
-					unsigned long address)
+static pgoff_t vma_hugecache_offset(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
 {
-	return ((address - vma->vm_start) >> HPAGE_SHIFT) +
-			(vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+	return ((address - vma->vm_start) >> huge_page_shift(h)) +
+			(vma->vm_pgoff >> huge_page_order(h));
 }
 
 /*
@@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 }
 
 /* Decrement the reserved pages in the hugepage pool by one */
-static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+static void decrement_hugepage_resv_vma(struct hstate *h,
+			struct vm_area_struct *vma)
 {
 	if (vma->vm_flags & VM_NORESERVE)
 		return;
 
 	if (vma->vm_flags & VM_SHARED) {
 		/* Shared mappings always use reserves */
-		resv_huge_pages--;
+		h->resv_huge_pages--;
 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		/*
 		 * Only the process that called mmap() has reserves for
 		 * private mappings.
 		 */
-		resv_huge_pages--;
+		h->resv_huge_pages--;
 	}
 }
 
@@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma)
 	return 1;
 }
 
-static void clear_huge_page(struct page *page, unsigned long addr)
+static void clear_huge_page(struct page *page,
+			unsigned long addr, unsigned long sz)
 {
 	int i;
 
 	might_sleep();
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+	for (i = 0; i < sz/PAGE_SIZE; i++) {
 		cond_resched();
 		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
 	}
@@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
+	struct hstate *h = hstate_vma(vma);
 
 	might_sleep();
-	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
 		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
 
-static void enqueue_huge_page(struct page *page)
+static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &hugepage_freelists[nid]);
-	free_huge_pages++;
-	free_huge_pages_node[nid]++;
+	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	h->free_huge_pages++;
+	h->free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct hstate *h)
 {
 	int nid;
 	struct page *page = NULL;
 
 	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
-		if (!list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
+		if (!list_empty(&h->hugepage_freelists[nid])) {
+			page = list_entry(h->hugepage_freelists[nid].next,
 					  struct page, lru);
 			list_del(&page->lru);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
+			h->free_huge_pages--;
+			h->free_huge_pages_node[nid]--;
 			break;
 		}
 	}
 	return page;
 }
 
-static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
+static struct page *dequeue_huge_page_vma(struct hstate *h,
+				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
 	int nid;
@@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	 * not "stolen". The child may still get SIGKILLed
 	 */
 	if (!vma_has_private_reserves(vma) &&
-			free_huge_pages - resv_huge_pages == 0)
+			h->free_huge_pages - h->resv_huge_pages == 0)
 		return NULL;
 
 	/* If reserves cannot be used, ensure enough pages are in the pool */
-	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
 		return NULL;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-		    !list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
+		    !list_empty(&h->hugepage_freelists[nid])) {
+			page = list_entry(h->hugepage_freelists[nid].next,
 					  struct page, lru);
 			list_del(&page->lru);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
+			h->free_huge_pages--;
+			h->free_huge_pages_node[nid]--;
 
 			if (!avoid_reserve)
-				decrement_hugepage_resv_vma(vma);
+				decrement_hugepage_resv_vma(h, vma);
 
 			break;
 		}
@@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	return page;
 }
 
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
-	nr_huge_pages--;
-	nr_huge_pages_node[page_to_nid(page)]--;
-	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+
+	h->nr_huge_pages--;
+	h->nr_huge_pages_node[page_to_nid(page)]--;
+	for (i = 0; i < pages_per_huge_page(h); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1<< PG_writeback);
@@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page)
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
+	__free_pages(page, huge_page_order(h));
 }
 
 static void free_huge_page(struct page *page)
 {
+	/*
+	 * Can't pass hstate in here because it is called from the
+	 * compound page destructor.
+	 */
+	struct hstate *h = &default_hstate;
 	int nid = page_to_nid(page);
 	struct address_space *mapping;
 
@@ -466,12 +470,12 @@ static void free_huge_page(struct page *page)
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
-	if (surplus_huge_pages_node[nid]) {
-		update_and_free_page(page);
-		surplus_huge_pages--;
-		surplus_huge_pages_node[nid]--;
+	if (h->surplus_huge_pages_node[nid]) {
+		update_and_free_page(h, page);
+		h->surplus_huge_pages--;
+		h->surplus_huge_pages_node[nid]--;
 	} else {
-		enqueue_huge_page(page);
+		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
 	if (mapping)
@@ -483,7 +487,7 @@ static void free_huge_page(struct page *page)
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
-static int adjust_pool_surplus(int delta)
+static int adjust_pool_surplus(struct hstate *h, int delta)
 {
 	static int prev_nid;
 	int nid = prev_nid;
@@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta)
 			nid = first_node(node_online_map);
 
 		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !surplus_huge_pages_node[nid])
+		if (delta < 0 && !h->surplus_huge_pages_node[nid])
 			continue;
 		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && surplus_huge_pages_node[nid] >=
-						nr_huge_pages_node[nid])
+		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+						h->nr_huge_pages_node[nid])
 			continue;
 
-		surplus_huge_pages += delta;
-		surplus_huge_pages_node[nid] += delta;
+		h->surplus_huge_pages += delta;
+		h->surplus_huge_pages_node[nid] += delta;
 		ret = 1;
 		break;
 	} while (nid != prev_nid);
@@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta)
 	return ret;
 }
 
-static void prep_new_huge_page(struct page *page, int nid)
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
-	nr_huge_pages++;
-	nr_huge_pages_node[nid]++;
+	h->nr_huge_pages++;
+	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
 	put_page(page); /* free it into the hugepage allocator */
 }
 
-static struct page *alloc_fresh_huge_page_node(int nid)
+static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
 
 	page = alloc_pages_node(nid,
 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
 						__GFP_REPEAT|__GFP_NOWARN,
-		HUGETLB_PAGE_ORDER);
+		huge_page_order(h));
 	if (page) {
 		if (arch_prepare_hugepage(page)) {
 			__free_pages(page, HUGETLB_PAGE_ORDER);
 			return NULL;
 		}
-		prep_new_huge_page(page, nid);
+		prep_new_huge_page(h, page, nid);
 	}
 
 	return page;
 }
 
-static int alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(struct hstate *h)
 {
 	struct page *page;
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = hugetlb_next_nid;
+	start_nid = h->hugetlb_next_nid;
 
 	do {
-		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
+		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
 		if (page)
 			ret = 1;
 		/*
@@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void)
 		 * if we just successfully allocated a hugepage so that
 		 * the next caller gets hugepages on the next node.
 		 */
-		next_nid = next_node(hugetlb_next_nid, node_online_map);
+		next_nid = next_node(h->hugetlb_next_nid, node_online_map);
 		if (next_nid == MAX_NUMNODES)
 			next_nid = first_node(node_online_map);
-		hugetlb_next_nid = next_nid;
-	} while (!page && hugetlb_next_nid != start_nid);
+		h->hugetlb_next_nid = next_nid;
+	} while (!page && h->hugetlb_next_nid != start_nid);
 
 	if (ret)
 		count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void)
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
-						unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
 {
 	struct page *page;
 	unsigned int nid;
@@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 	 * per-node value is checked there.
 	 */
 	spin_lock(&hugetlb_lock);
-	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
 	} else {
-		nr_huge_pages++;
-		surplus_huge_pages++;
+		h->nr_huge_pages++;
+		h->surplus_huge_pages++;
 	}
 	spin_unlock(&hugetlb_lock);
 
 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
 					__GFP_REPEAT|__GFP_NOWARN,
-					HUGETLB_PAGE_ORDER);
+					huge_page_order(h));
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
@@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 		/*
 		 * We incremented the global counters already
 		 */
-		nr_huge_pages_node[nid]++;
-		surplus_huge_pages_node[nid]++;
+		h->nr_huge_pages_node[nid]++;
+		h->surplus_huge_pages_node[nid]++;
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	} else {
-		nr_huge_pages--;
-		surplus_huge_pages--;
+		h->nr_huge_pages--;
+		h->surplus_huge_pages--;
 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 	}
 	spin_unlock(&hugetlb_lock);
@@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
  * Increase the hugetlb pool such that it can accomodate a reservation
  * of size 'delta'.
  */
-static int gather_surplus_pages(int delta)
+static int gather_surplus_pages(struct hstate *h, int delta)
 {
 	struct list_head surplus_list;
 	struct page *page, *tmp;
 	int ret, i;
 	int needed, allocated;
 
-	needed = (resv_huge_pages + delta) - free_huge_pages;
+	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
 	if (needed <= 0) {
-		resv_huge_pages += delta;
+		h->resv_huge_pages += delta;
 		return 0;
 	}
 
@@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(NULL, 0);
+		page = alloc_buddy_huge_page(h, NULL, 0);
 		if (!page) {
 			/*
 			 * We were not able to allocate enough pages to
@@ -694,7 +698,8 @@ retry:
 	 * because either resv_huge_pages or free_huge_pages may have changed.
 	 */
 	spin_lock(&hugetlb_lock);
-	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
+	needed = (h->resv_huge_pages + delta) -
+			(h->free_huge_pages + allocated);
 	if (needed > 0)
 		goto retry;
 
@@ -707,7 +712,7 @@ retry:
 	 * before they are reserved.
 	 */
 	needed += allocated;
-	resv_huge_pages += delta;
+	h->resv_huge_pages += delta;
 	ret = 0;
 free:
 	/* Free the needed pages to the hugetlb pool */
@@ -715,7 +720,7 @@ free:
 		if ((--needed) < 0)
 			break;
 		list_del(&page->lru);
-		enqueue_huge_page(page);
+		enqueue_huge_page(h, page);
 	}
 
 	/* Free unnecessary surplus pages to the buddy allocator */
@@ -743,7 +748,8 @@ free:
  * allocated to satisfy the reservation must be explicitly freed if they were
  * never used.
  */
-static void return_unused_surplus_pages(unsigned long unused_resv_pages)
+static void return_unused_surplus_pages(struct hstate *h,
+					unsigned long unused_resv_pages)
 {
 	static int nid = -1;
 	struct page *page;
@@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 	unsigned long remaining_iterations = num_online_nodes();
 
 	/* Uncommit the reservation */
-	resv_huge_pages -= unused_resv_pages;
+	h->resv_huge_pages -= unused_resv_pages;
 
-	nr_pages = min(unused_resv_pages, surplus_huge_pages);
+	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
 	while (remaining_iterations-- && nr_pages) {
 		nid = next_node(nid, node_online_map);
 		if (nid == MAX_NUMNODES)
 			nid = first_node(node_online_map);
 
-		if (!surplus_huge_pages_node[nid])
+		if (!h->surplus_huge_pages_node[nid])
 			continue;
 
-		if (!list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
+		if (!list_empty(&h->hugepage_freelists[nid])) {
+			page = list_entry(h->hugepage_freelists[nid].next,
 					  struct page, lru);
 			list_del(&page->lru);
-			update_and_free_page(page);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
-			surplus_huge_pages--;
-			surplus_huge_pages_node[nid]--;
+			update_and_free_page(h, page);
+			h->free_huge_pages--;
+			h->free_huge_pages_node[nid]--;
+			h->surplus_huge_pages--;
+			h->surplus_huge_pages_node[nid]--;
 			nr_pages--;
 			remaining_iterations = num_online_nodes();
 		}
@@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
  * an instantiated the change should be committed via vma_commit_reservation.
  * No action is required on failure.
  */
-static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
+static int vma_needs_reservation(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 
 	if (vma->vm_flags & VM_SHARED) {
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		return region_chg(&inode->i_mapping->private_list,
 							idx, idx + 1);
 
@@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
 
 	} else  {
 		int err;
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
 		err = region_chg(&reservations->regions, idx, idx + 1);
@@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
 		return 0;
 	}
 }
-static void vma_commit_reservation(struct vm_area_struct *vma,
-							unsigned long addr)
+static void vma_commit_reservation(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 
 	if (vma->vm_flags & VM_SHARED) {
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		region_add(&inode->i_mapping->private_list, idx, idx + 1);
 
 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
 		/* Mark this page used in the map. */
@@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
@@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * MAP_NORESERVE mappings may also need pages and quota allocated
 	 * if no reserve mapping overlaps.
 	 */
-	chg = vma_needs_reservation(vma, addr);
+	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(chg);
 	if (chg)
@@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 			return ERR_PTR(-ENOSPC);
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
+	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
-		page = alloc_buddy_huge_page(vma, addr);
+		page = alloc_buddy_huge_page(h, vma, addr);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_OOM);
@@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	set_page_refcounted(page);
 	set_page_private(page, (unsigned long) mapping);
 
-	vma_commit_reservation(vma, addr);
+	vma_commit_reservation(h, vma, addr);
 
 	return page;
 }
@@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
+	struct hstate *h = &default_hstate;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
 
+	if (!h->order) {
+		h->order = HPAGE_SHIFT - PAGE_SHIFT;
+		h->mask = HPAGE_MASK;
+	}
+
 	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
+		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
 
-	hugetlb_next_nid = first_node(node_online_map);
+	h->hugetlb_next_nid = first_node(node_online_map);
 
 	for (i = 0; i < max_huge_pages; ++i) {
-		if (!alloc_fresh_huge_page())
+		if (!alloc_fresh_huge_page(h))
 			break;
 	}
-	max_huge_pages = free_huge_pages = nr_huge_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
+	max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
+	printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n",
+			h->free_huge_pages);
 	return 0;
 }
 module_init(hugetlb_init);
@@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count)
 {
 	int i;
 
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
-		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
-			if (count >= nr_huge_pages)
+		struct list_head *freel = &h->hugepage_freelists[i];
+		list_for_each_entry_safe(page, next, freel, lru) {
+			if (count >= h->nr_huge_pages)
 				return;
 			if (PageHighMem(page))
 				continue;
 			list_del(&page->lru);
 			update_and_free_page(page);
-			free_huge_pages--;
-			free_huge_pages_node[page_to_nid(page)]--;
+			h->free_huge_pages--;
+			h->free_huge_pages_node[page_to_nid(page)]--;
 		}
 	}
 }
 #else
-static inline void try_to_free_low(unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count)
 {
 }
 #endif
 
-#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
+#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	unsigned long min_count, ret;
+	struct hstate *h = &default_hstate;
 
 	/*
 	 * Increase the pool size
@@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * within all the constraints specified by the sysctls.
 	 */
 	spin_lock(&hugetlb_lock);
-	while (surplus_huge_pages && count > persistent_huge_pages) {
-		if (!adjust_pool_surplus(-1))
+	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
+		if (!adjust_pool_surplus(h, -1))
 			break;
 	}
 
-	while (count > persistent_huge_pages) {
+	while (count > persistent_huge_pages(h)) {
 		/*
 		 * If this allocation races such that we no longer need the
 		 * page, free_huge_page will handle it by freeing the page
 		 * and reducing the surplus.
 		 */
 		spin_unlock(&hugetlb_lock);
-		ret = alloc_fresh_huge_page();
+		ret = alloc_fresh_huge_page(h);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
@@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * and won't grow the pool anywhere else. Not until one of the
 	 * sysctls are changed, or the surplus pages go out of use.
 	 */
-	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
+	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 	min_count = max(count, min_count);
-	try_to_free_low(min_count);
-	while (min_count < persistent_huge_pages) {
-		struct page *page = dequeue_huge_page();
+	try_to_free_low(h, min_count);
+	while (min_count < persistent_huge_pages(h)) {
+		struct page *page = dequeue_huge_page(h);
 		if (!page)
 			break;
-		update_and_free_page(page);
+		update_and_free_page(h, page);
 	}
-	while (count < persistent_huge_pages) {
-		if (!adjust_pool_surplus(1))
+	while (count < persistent_huge_pages(h)) {
+		if (!adjust_pool_surplus(h, 1))
 			break;
 	}
 out:
-	ret = persistent_huge_pages;
+	ret = persistent_huge_pages(h);
 	spin_unlock(&hugetlb_lock);
 	return ret;
 }
@@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 			struct file *file, void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
+	struct hstate *h = &default_hstate;
 	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
 	spin_lock(&hugetlb_lock);
-	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
+	h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
 	spin_unlock(&hugetlb_lock);
 	return 0;
 }
@@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 int hugetlb_report_meminfo(char *buf)
 {
+	struct hstate *h = &default_hstate;
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
 			"HugePages_Rsvd:  %5lu\n"
 			"HugePages_Surp:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
-			nr_huge_pages,
-			free_huge_pages,
-			resv_huge_pages,
-			surplus_huge_pages,
-			HPAGE_SIZE/1024);
+			h->nr_huge_pages,
+			h->free_huge_pages,
+			h->resv_huge_pages,
+			h->surplus_huge_pages,
+			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
 }
 
 int hugetlb_report_node_meminfo(int nid, char *buf)
 {
+	struct hstate *h = &default_hstate;
 	return sprintf(buf,
 		"Node %d HugePages_Total: %5u\n"
 		"Node %d HugePages_Free:  %5u\n"
 		"Node %d HugePages_Surp:  %5u\n",
-		nid, nr_huge_pages_node[nid],
-		nid, free_huge_pages_node[nid],
-		nid, surplus_huge_pages_node[nid]);
+		nid, h->nr_huge_pages_node[nid],
+		nid, h->free_huge_pages_node[nid],
+		nid, h->surplus_huge_pages_node[nid]);
 }
 
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+	struct hstate *h = &default_hstate;
+	return h->nr_huge_pages * pages_per_huge_page(h);
 }
 
-static int hugetlb_acct_memory(long delta)
+static int hugetlb_acct_memory(struct hstate *h, long delta)
 {
 	int ret = -ENOMEM;
 
@@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta)
 	 * semantics that cpuset has.
 	 */
 	if (delta > 0) {
-		if (gather_surplus_pages(delta) < 0)
+		if (gather_surplus_pages(h, delta) < 0)
 			goto out;
 
-		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
-			return_unused_surplus_pages(delta);
+		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+			return_unused_surplus_pages(h, delta);
 			goto out;
 		}
 	}
 
 	ret = 0;
 	if (delta < 0)
-		return_unused_surplus_pages((unsigned long) -delta);
+		return_unused_surplus_pages(h, (unsigned long) -delta);
 
 out:
 	spin_unlock(&hugetlb_lock);
@@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
 
 	if (reservations) {
-		start = vma_hugecache_offset(vma, vma->vm_start);
-		end = vma_hugecache_offset(vma, vma->vm_end);
+		start = vma_hugecache_offset(h, vma, vma->vm_start);
+		end = vma_hugecache_offset(h, vma, vma->vm_end);
 
 		reserve = (end - start) -
 			region_count(&reservations->regions, start, end);
@@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		kref_put(&reservations->refs, resv_map_release);
 
 		if (reserve)
-			hugetlb_acct_memory(-reserve);
+			hugetlb_acct_memory(h, -reserve);
 	}
 }
 
@@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
 
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
-		dst_pte = huge_pte_alloc(dst, addr);
+		dst_pte = huge_pte_alloc(dst, addr, sz);
 		if (!dst_pte)
 			goto nomem;
 
@@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	pte_t pte;
 	struct page *page;
 	struct page *tmp;
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
+
 	/*
 	 * A page gathering list, protected by per file i_mmap_lock. The
 	 * lock is used to avoid list corruption from multiple unmapping
@@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	LIST_HEAD(page_list);
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON(start & ~HPAGE_MASK);
-	BUG_ON(end & ~HPAGE_MASK);
+	BUG_ON(start & ~huge_page_mask(h));
+	BUG_ON(end & ~huge_page_mask(h));
 
 	spin_lock(&mm->page_table_lock);
-	for (address = start; address < end; address += HPAGE_SIZE) {
+	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
@@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
 			struct page *pagecache_page)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct page *old_page, *new_page;
 	int avoidcopy;
 	int outside_reserve = 0;
@@ -1443,7 +1471,7 @@ retry_avoidcopy:
 	__SetPageUptodate(new_page);
 	spin_lock(&mm->page_table_lock);
 
-	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		/* Break COW */
 		huge_ptep_clear_flush(vma, address, ptep);
@@ -1458,14 +1486,14 @@ retry_avoidcopy:
 }
 
 /* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
-			unsigned long address)
+static struct page *hugetlbfs_pagecache_page(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
 {
 	struct address_space *mapping;
 	pgoff_t idx;
 
 	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(vma, address);
+	idx = vma_hugecache_offset(h, vma, address);
 
 	return find_lock_page(mapping, idx);
 }
@@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, int write_access)
 {
+	struct hstate *h = hstate_vma(vma);
 	int ret = VM_FAULT_SIGBUS;
 	pgoff_t idx;
 	unsigned long size;
@@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(vma, address);
+	idx = vma_hugecache_offset(h, vma, address);
 
 	/*
 	 * Use page lock to guard against racing truncation
@@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
-		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		if (idx >= size)
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
@@ -1510,7 +1539,7 @@ retry:
 			ret = -PTR_ERR(page);
 			goto out;
 		}
-		clear_huge_page(page, address);
+		clear_huge_page(page, address, huge_page_size(h));
 		__SetPageUptodate(page);
 
 		if (vma->vm_flags & VM_SHARED) {
@@ -1526,14 +1555,14 @@ retry:
 			}
 
 			spin_lock(&inode->i_lock);
-			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
+			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
 		} else
 			lock_page(page);
 	}
 
 	spin_lock(&mm->page_table_lock);
-	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
 		goto backout;
 
@@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+	struct hstate *h = hstate_vma(vma);
 
-	ptep = huge_pte_alloc(mm, address);
+	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
 	if (!ptep)
 		return VM_FAULT_OOM;
 
@@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
 		if (write_access && !pte_write(entry)) {
 			struct page *page;
-			page = hugetlbfs_pagecache_page(vma, address);
+			page = hugetlbfs_pagecache_page(h, vma, address);
 			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
 			if (page) {
 				unlock_page(page);
@@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
+	struct hstate *h = hstate_vma(vma);
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
@@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * each hugepage.  We have to make * sure we get the
 		 * first, for the page indexing below to work.
 		 */
-		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
 
 		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
 		    (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
@@ -1660,7 +1691,7 @@ same_page:
 		--remainder;
 		++i;
 		if (vaddr < vma->vm_end && remainder &&
-				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+				pfn_offset < pages_per_huge_page(h)) {
 			/*
 			 * We use pfn_offset to avoid touching the pageframes
 			 * of this compound page.
@@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	unsigned long start = address;
 	pte_t *ptep;
 	pte_t pte;
+	struct hstate *h = hstate_vma(vma);
 
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
 	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
 	spin_lock(&mm->page_table_lock);
-	for (; address < end; address += HPAGE_SIZE) {
+	for (; address < end; address += huge_page_size(h)) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
@@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 					struct vm_area_struct *vma)
 {
 	long ret, chg;
+	struct hstate *h = hstate_inode(inode);
 
 	if (vma && vma->vm_flags & VM_NORESERVE)
 		return 0;
@@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 
 	if (hugetlb_get_quota(inode->i_mapping, chg))
 		return -ENOSPC;
-	ret = hugetlb_acct_memory(chg);
+	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
 		return ret;
@@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode,
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
+	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
 
 	spin_lock(&inode->i_lock);
-	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
+	inode->i_blocks -= blocks_per_huge_page(h);
 	spin_unlock(&inode->i_lock);
 
 	hugetlb_put_quota(inode->i_mapping, (chg - freed));
-	hugetlb_acct_memory(-(chg - freed));
+	hugetlb_acct_memory(h, -(chg - freed));
 }
diff --git a/mm/memory.c b/mm/memory.c
index 72932489a08..c1c1d6d8c22 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -903,7 +903,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 			if (unlikely(is_vm_hugetlb_page(vma))) {
 				unmap_hugepage_range(vma, start, end, NULL);
 				zap_work -= (end - start) /
-						(HPAGE_SIZE / PAGE_SIZE);
+					pages_per_huge_page(hstate_vma(vma));
 				start = end;
 			} else
 				start = unmap_page_range(*tlbp, vma,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c..e550bec2058 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 
 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
-						HPAGE_SHIFT), gfp_flags);
+				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
 		zl = policy_zonelist(gfp_flags, *mpol);
 		if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma,
 {
 	unsigned long addr;
 	struct page *page;
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
 
-	for (addr = start; addr < end; addr += HPAGE_SIZE) {
-		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+	for (addr = start; addr < end; addr += sz) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm,
+						addr & huge_page_mask(h));
 		pte_t pte;
 
 		if (!ptep)
diff --git a/mm/mmap.c b/mm/mmap.c
index 57d3b6097de..5e0cc99e9cd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1812,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	struct mempolicy *pol;
 	struct vm_area_struct *new;
 
-	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+	if (is_vm_hugetlb_page(vma) && (addr &
+					~(huge_page_mask(hstate_vma(vma)))))
 		return -EINVAL;
 
 	if (mm->map_count >= sysctl_max_map_count)
-- 
cgit v1.2.3-70-g09d2


From ceb868796181dc95ea01a110e123afd391639873 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 23 Jul 2008 21:27:50 -0700
Subject: hugetlb: introduce pud_huge

Straight forward extensions for huge pages located in the PUD instead of
PMDs.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/hugetlbpage.c    |  6 ++++++
 arch/powerpc/mm/hugetlbpage.c |  5 +++++
 arch/s390/mm/hugetlbpage.c    |  5 +++++
 arch/sh/mm/hugetlbpage.c      |  5 +++++
 arch/sparc64/mm/hugetlbpage.c |  5 +++++
 arch/x86/mm/hugetlbpage.c     | 25 ++++++++++++++++++++++++-
 include/linux/hugetlb.h       |  5 +++++
 mm/hugetlb.c                  |  9 +++++++++
 mm/memory.c                   | 15 +++++++++++----
 9 files changed, 75 insertions(+), 5 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 6170f097d25..c45fc7f5a97 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -107,6 +107,12 @@ int pmd_huge(pmd_t pmd)
 {
 	return 0;
 }
+
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
 {
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index c94dc71af98..63db7adce71 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -369,6 +369,11 @@ int pmd_huge(pmd_t pmd)
 	return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 9162dc84f77..f28c43d2f61 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -120,6 +120,11 @@ int pmd_huge(pmd_t pmd)
 	return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE);
 }
 
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmdp, int write)
 {
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 2f9dbe0ef4a..9304117039c 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -79,6 +79,11 @@ int pmd_huge(pmd_t pmd)
 	return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmd, int write)
 {
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 1307b23f6a7..f27d10369e0 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -295,6 +295,11 @@ int pmd_huge(pmd_t pmd)
 	return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 			     pmd_t *pmd, int write)
 {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 52476fde899..a4789e87a31 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -189,6 +189,11 @@ int pmd_huge(pmd_t pmd)
 	return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
@@ -209,6 +214,11 @@ int pmd_huge(pmd_t pmd)
 	return !!(pmd_val(pmd) & _PAGE_PSE);
 }
 
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
@@ -217,9 +227,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 
 	page = pte_page(*(pte_t *)pmd);
 	if (page)
-		page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
 	return page;
 }
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+		pud_t *pud, int write)
+{
+	struct page *page;
+
+	page = pte_page(*(pte_t *)pud);
+	if (page)
+		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+	return page;
+}
+
 #endif
 
 /* x86_64 also uses this file */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 58c0de32e7f..b2c17f62cac 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -50,7 +50,10 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
+struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
+				pud_t *pud, int write);
 int pmd_huge(pmd_t pmd);
+int pud_huge(pud_t pmd);
 void hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot);
 
@@ -78,8 +81,10 @@ static inline unsigned long hugetlb_total_pages(void)
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
+#define follow_huge_pud(mm, addr, pud, write)	NULL
 #define prepare_hugepage_range(file, addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
+#define pud_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0c74c14dd2f..107c1ce223c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1996,6 +1996,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+	       pud_t *pud, int write)
+{
+	BUG();
+	return NULL;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
diff --git a/mm/memory.c b/mm/memory.c
index 02fc6b1047b..262e3eb6601 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -998,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+	if (pud_none(*pud))
+		goto no_page_table;
+	if (pud_huge(*pud)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+		goto out;
+	}
+	if (unlikely(pud_bad(*pud)))
 		goto no_page_table;
-	
+
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-
 	if (pmd_huge(*pmd)) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
-
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1567,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	unsigned long next;
 	int err;
 
+	BUG_ON(pud_huge(*pud));
+
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
-- 
cgit v1.2.3-70-g09d2


From ec4b2c0c8312d1118c2acd00c89988ecf955d5cc Mon Sep 17 00:00:00 2001
From: Jon Tollefson <kniht@linux.vnet.ibm.com>
Date: Wed, 23 Jul 2008 21:27:53 -0700
Subject: powerpc: function to allocate gigantic hugepages

The 16G page locations have been saved during early boot in an array.  The
alloc_bootmem_huge_page() function adds a page from here to the
huge_boot_pages list.

Acked-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/hugetlbpage.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 63db7adce71..5df82186fc9 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -29,6 +29,12 @@
 
 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#define MAX_NUMBER_GPAGES	1024
+
+/* Tracks the 16G pages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  */
+static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+static unsigned nr_gpages;
 
 unsigned int hugepte_shift;
 #define PTRS_PER_HUGEPTE	(1 << hugepte_shift)
@@ -104,6 +110,21 @@ pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
 }
 #endif
 
+/* Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.  */
+int alloc_bootmem_huge_page(struct hstate *h)
+{
+	struct huge_bootmem_page *m;
+	if (nr_gpages == 0)
+		return 0;
+	m = phys_to_virt(gpage_freearray[--nr_gpages]);
+	gpage_freearray[nr_gpages] = 0;
+	list_add(&m->list, &huge_boot_pages);
+	m->hstate = h;
+	return 1;
+}
+
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-- 
cgit v1.2.3-70-g09d2


From 658013e93eb70494f7300bc90457b09a807232a4 Mon Sep 17 00:00:00 2001
From: Jon Tollefson <kniht@linux.vnet.ibm.com>
Date: Wed, 23 Jul 2008 21:27:54 -0700
Subject: powerpc: scan device tree for gigantic pages

The 16G huge pages have to be reserved in the HMC prior to boot.  The
location of the pages are placed in the device tree.  This patch adds code
to scan the device tree during very early boot and save these page
locations until hugetlbfs is ready for them.

Acked-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/hash_utils_64.c  | 44 +++++++++++++++++++++++++++++++++++++++-
 arch/powerpc/mm/hugetlbpage.c    | 16 +++++++++++++++
 include/asm-powerpc/mmu-hash64.h |  2 ++
 3 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8d3b58ebd38..ae4c717243a 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -68,6 +68,7 @@
 
 #define KB (1024)
 #define MB (1024*KB)
+#define GB (1024L*MB)
 
 /*
  * Note:  pte   --> Linux PTE
@@ -329,6 +330,44 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
 	return 0;
 }
 
+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+					const char *uname, int depth,
+					void *data) {
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	unsigned long *addr_prop;
+	u32 *page_count_prop;
+	unsigned int expected_pages;
+	long unsigned int phys_addr;
+	long unsigned int block_size;
+
+	/* We are scanning "memory" nodes only */
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	/* This property is the log base 2 of the number of virtual pages that
+	 * will represent this memory block. */
+	page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+	if (page_count_prop == NULL)
+		return 0;
+	expected_pages = (1 << page_count_prop[0]);
+	addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+	if (addr_prop == NULL)
+		return 0;
+	phys_addr = addr_prop[0];
+	block_size = addr_prop[1];
+	if (block_size != (16 * GB))
+		return 0;
+	printk(KERN_INFO "Huge page(16GB) memory: "
+			"addr = 0x%lX size = 0x%lX pages = %d\n",
+			phys_addr, block_size, expected_pages);
+	lmb_reserve(phys_addr, block_size * expected_pages);
+	add_gpage(phys_addr, block_size, expected_pages);
+	return 0;
+}
+
 static void __init htab_init_page_sizes(void)
 {
 	int rc;
@@ -418,7 +457,10 @@ static void __init htab_init_page_sizes(void)
 	       );
 
 #ifdef CONFIG_HUGETLB_PAGE
-	/* Init large page size. Currently, we pick 16M or 1M depending
+	/* Reserve 16G huge page memory sections for huge pages */
+	of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+
+/* Init large page size. Currently, we pick 16M or 1M depending
 	 * on what is available
 	 */
 	if (mmu_psize_defs[MMU_PAGE_16M].shift)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 5df82186fc9..e2a650a9e53 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -110,6 +110,22 @@ pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
 }
 #endif
 
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(unsigned long addr, unsigned long page_size,
+	unsigned long number_of_pages)
+{
+	if (!addr)
+		return;
+	while (number_of_pages > 0) {
+		gpage_freearray[nr_gpages] = addr;
+		nr_gpages++;
+		number_of_pages--;
+		addr += page_size;
+	}
+}
+
 /* Moves the gigantic page addresses from the temporary list to the
  * huge_boot_pages list.  */
 int alloc_bootmem_huge_page(struct hstate *h)
diff --git a/include/asm-powerpc/mmu-hash64.h b/include/asm-powerpc/mmu-hash64.h
index d1dc16afb11..b61181aa774 100644
--- a/include/asm-powerpc/mmu-hash64.h
+++ b/include/asm-powerpc/mmu-hash64.h
@@ -281,6 +281,8 @@ extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 			     unsigned long pstart, unsigned long mode,
 			     int psize, int ssize);
 extern void set_huge_psize(int psize);
+extern void add_gpage(unsigned long addr, unsigned long page_size,
+			  unsigned long number_of_pages);
 extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr);
 
 extern void htab_initialize(void);
-- 
cgit v1.2.3-70-g09d2


From 91224346aa8c1cdaa660300a98e0b074a3a95030 Mon Sep 17 00:00:00 2001
From: Jon Tollefson <kniht@linux.vnet.ibm.com>
Date: Wed, 23 Jul 2008 21:27:55 -0700
Subject: powerpc: define support for 16G hugepages

The huge page size is defined for 16G pages.  If a hugepagesz of 16G is
specified at boot-time then it becomes the huge page size instead of the
default 16M.

The change in pgtable-64K.h is to the macro pte_iterate_hashed_subpages to
make the increment to va (the 1 being shifted) be a long so that it is not
shifted to 0.  Otherwise it would create an infinite loop when the shift
value is for a 16G page (when base page size is 64K).

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/hugetlbpage.c     | 62 +++++++++++++++++++++++++++------------
 include/asm-powerpc/pgtable-64k.h |  2 +-
 2 files changed, 45 insertions(+), 19 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e2a650a9e53..19b1a9cec6d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,8 +24,9 @@
 #include <asm/cputable.h>
 #include <asm/spu.h>
 
-#define HPAGE_SHIFT_64K	16
-#define HPAGE_SHIFT_16M	24
+#define PAGE_SHIFT_64K	16
+#define PAGE_SHIFT_16M	24
+#define PAGE_SHIFT_16G	34
 
 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
@@ -95,7 +96,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 static inline
 pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
 {
-	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+	if (HPAGE_SHIFT == PAGE_SHIFT_64K)
 		return pmd_offset(pud, addr);
 	else
 		return (pmd_t *) pud;
@@ -103,7 +104,7 @@ pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
 static inline
 pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
 {
-	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+	if (HPAGE_SHIFT == PAGE_SHIFT_64K)
 		return pmd_alloc(mm, pud, addr);
 	else
 		return (pmd_t *) pud;
@@ -260,7 +261,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 			continue;
 		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 #else
-		if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
+		if (HPAGE_SHIFT == PAGE_SHIFT_64K) {
 			if (pud_none_or_clear_bad(pud))
 				continue;
 			hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
@@ -592,20 +593,40 @@ void set_huge_psize(int psize)
 {
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable limits. */
-	if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
+	if (mmu_psize_defs[psize].shift &&
+		mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
 		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-			mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
+		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
+		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
+		/* Return if huge page size is the same as the
+		 * base page size. */
+		if (mmu_psize_defs[psize].shift == PAGE_SHIFT)
+			return;
+
 		HPAGE_SHIFT = mmu_psize_defs[psize].shift;
 		mmu_huge_psize = psize;
-#ifdef CONFIG_PPC_64K_PAGES
-		hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
-#else
-		if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
-			hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
-		else
-			hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
-#endif
 
+		switch (HPAGE_SHIFT) {
+		case PAGE_SHIFT_64K:
+		    /* We only allow 64k hpages with 4k base page,
+		     * which was checked above, and always put them
+		     * at the PMD */
+		    hugepte_shift = PMD_SHIFT;
+		    break;
+		case PAGE_SHIFT_16M:
+		    /* 16M pages can be at two different levels
+		     * of pagestables based on base page size */
+		    if (PAGE_SHIFT == PAGE_SHIFT_64K)
+			    hugepte_shift = PMD_SHIFT;
+		    else /* 4k base page */
+			    hugepte_shift = PUD_SHIFT;
+		    break;
+		case PAGE_SHIFT_16G:
+		    /* 16G pages are always at PGD level */
+		    hugepte_shift = PGDIR_SHIFT;
+		    break;
+		}
+		hugepte_shift -= HPAGE_SHIFT;
 	} else
 		HPAGE_SHIFT = 0;
 }
@@ -621,17 +642,22 @@ static int __init hugepage_setup_sz(char *str)
 	shift = __ffs(size);
 	switch (shift) {
 #ifndef CONFIG_PPC_64K_PAGES
-	case HPAGE_SHIFT_64K:
+	case PAGE_SHIFT_64K:
 		mmu_psize = MMU_PAGE_64K;
 		break;
 #endif
-	case HPAGE_SHIFT_16M:
+	case PAGE_SHIFT_16M:
 		mmu_psize = MMU_PAGE_16M;
 		break;
+	case PAGE_SHIFT_16G:
+		mmu_psize = MMU_PAGE_16G;
+		break;
 	}
 
-	if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)
+	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) {
 		set_huge_psize(mmu_psize);
+		hugetlb_add_hstate(shift - PAGE_SHIFT);
+	}
 	else
 		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 
diff --git a/include/asm-powerpc/pgtable-64k.h b/include/asm-powerpc/pgtable-64k.h
index c5007712473..7e54adb3559 100644
--- a/include/asm-powerpc/pgtable-64k.h
+++ b/include/asm-powerpc/pgtable-64k.h
@@ -138,7 +138,7 @@ static inline struct subpage_prot_table *pgd_subpage_prot(pgd_t *pgd)
                 unsigned __split = (psize == MMU_PAGE_4K ||                 \
 				    psize == MMU_PAGE_64K_AP);              \
                 shift = mmu_psize_defs[psize].shift;                        \
-	        for (index = 0; va < __end; index++, va += (1 << shift)) {  \
+		for (index = 0; va < __end; index++, va += (1L << shift)) { \
 		        if (!__split || __rpte_sub_valid(rpte, index)) do { \
 
 #define pte_iterate_hashed_end() } while(0); } } while(0)
-- 
cgit v1.2.3-70-g09d2


From 0d9ea75443dc7e37843e656b8ebc947a6d16d618 Mon Sep 17 00:00:00 2001
From: Jon Tollefson <kniht@linux.vnet.ibm.com>
Date: Wed, 23 Jul 2008 21:27:56 -0700
Subject: powerpc: support multiple hugepage sizes

Instead of using the variable mmu_huge_psize to keep track of the huge
page size we use an array of MMU_PAGE_* values.  For each supported huge
page size we need to know the hugepte_shift value and have a
pgtable_cache.  The hstate or an mmu_huge_psizes index is passed to
functions so that they know which huge page size they should use.

The hugepage sizes 16M and 64K are setup(if available on the hardware) so
that they don't have to be set on the boot cmd line in order to use them.
The number of 16G pages have to be specified at boot-time though (e.g.
hugepagesz=16G hugepages=5).

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kernel-parameters.txt |  10 +-
 arch/powerpc/mm/hash_utils_64.c     |   9 +-
 arch/powerpc/mm/hugetlbpage.c       | 274 +++++++++++++++++++++++-------------
 arch/powerpc/mm/init_64.c           |   8 +-
 arch/powerpc/mm/tlb_64.c            |   2 +-
 include/asm-powerpc/hugetlb.h       |   5 +-
 include/asm-powerpc/mmu-hash64.h    |   4 +-
 include/asm-powerpc/page_64.h       |   1 +
 include/asm-powerpc/pgalloc-64.h    |   4 +-
 9 files changed, 199 insertions(+), 118 deletions(-)

(limited to 'arch/powerpc')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 30278e9e521..01a2992b575 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -776,11 +776,11 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	hugepages=	[HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
 	hugepagesz=	[HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
-			On x86 this option can be specified multiple times
-			interleaved with hugepages= to reserve huge pages
-			of different sizes. Valid pages sizes on x86-64
-			are 2M (when the CPU supports "pse") and 1G (when the
-			CPU supports the "pdpe1gb" cpuinfo flag)
+			On x86-64 and powerpc, this option can be specified
+			multiple times interleaved with hugepages= to reserve
+			huge pages of different sizes. Valid pages sizes on
+			x86-64 are 2M (when the CPU supports "pse") and 1G
+			(when the CPU supports the "pdpe1gb" cpuinfo flag)
 			Note that 1GB pages can only be allocated at boot time
 			using hugepages= and not freed afterwards.
 	default_hugepagesz=
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index ae4c717243a..5ce5a4dcd00 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -103,7 +103,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M;
 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
 u16 mmu_slb_size = 64;
 #ifdef CONFIG_HUGETLB_PAGE
-int mmu_huge_psize = MMU_PAGE_16M;
 unsigned int HPAGE_SHIFT;
 #endif
 #ifdef CONFIG_PPC_64K_PAGES
@@ -460,15 +459,15 @@ static void __init htab_init_page_sizes(void)
 	/* Reserve 16G huge page memory sections for huge pages */
 	of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
 
-/* Init large page size. Currently, we pick 16M or 1M depending
+/* Set default large page size. Currently, we pick 16M or 1M depending
 	 * on what is available
 	 */
 	if (mmu_psize_defs[MMU_PAGE_16M].shift)
-		set_huge_psize(MMU_PAGE_16M);
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
 	/* With 4k/4level pagetables, we can't (for now) cope with a
 	 * huge page size < PMD_SIZE */
 	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-		set_huge_psize(MMU_PAGE_1M);
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
@@ -889,7 +888,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 
 #ifdef CONFIG_HUGETLB_PAGE
 	/* Handle hugepage regions */
-	if (HPAGE_SHIFT && psize == mmu_huge_psize) {
+	if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
 		DBG_LOW(" -> huge page !\n");
 		return hash_huge_page(mm, access, ea, vsid, local, trap);
 	}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 19b1a9cec6d..fb42c4dd321 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -37,15 +37,30 @@
 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
 
-unsigned int hugepte_shift;
-#define PTRS_PER_HUGEPTE	(1 << hugepte_shift)
-#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << hugepte_shift)
+/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
+ * stored for the huge page sizes that are valid.
+ */
+unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
+
+#define hugepte_shift			mmu_huge_psizes
+#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
+#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])
+
+#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
+						+ hugepte_shift[psize])
+#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
+#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
 
-#define HUGEPD_SHIFT		(HPAGE_SHIFT + hugepte_shift)
-#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
-#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))
+/* Subtract one from array size because we don't need a cache for 4K since
+ * is not a huge page size */
+#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
+							+ psize-1])
+#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
 
-#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
+static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
+	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
+	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
+};
 
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
@@ -56,24 +71,49 @@ typedef struct { unsigned long pd; } hugepd_t;
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
+static inline int shift_to_mmu_psize(unsigned int shift)
+{
+	switch (shift) {
+#ifndef CONFIG_PPC_64K_PAGES
+	case PAGE_SHIFT_64K:
+	    return MMU_PAGE_64K;
+#endif
+	case PAGE_SHIFT_16M:
+	    return MMU_PAGE_16M;
+	case PAGE_SHIFT_16G:
+	    return MMU_PAGE_16G;
+	}
+	return -1;
+}
+
+static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
+{
+	if (mmu_psize_defs[mmu_psize].shift)
+		return mmu_psize_defs[mmu_psize].shift;
+	BUG();
+}
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
 	BUG_ON(!(hpd.pd & HUGEPD_OK));
 	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+				    struct hstate *hstate)
 {
-	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+	unsigned int shift = huge_page_shift(hstate);
+	int psize = shift_to_mmu_psize(shift);
+	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
 	pte_t *dir = hugepd_page(*hpdp);
 
 	return dir + idx;
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-			   unsigned long address)
+			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+	pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize),
 				      GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
@@ -81,7 +121,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(huge_pgtable_cache, new);
+		kmem_cache_free(huge_pgtable_cache(psize), new);
 	else
 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
 	spin_unlock(&mm->page_table_lock);
@@ -90,21 +130,22 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 
 /* Base page size affects how we walk hugetlb page tables */
 #ifdef CONFIG_PPC_64K_PAGES
-#define hpmd_offset(pud, addr)		pmd_offset(pud, addr)
-#define hpmd_alloc(mm, pud, addr)	pmd_alloc(mm, pud, addr)
+#define hpmd_offset(pud, addr, h)	pmd_offset(pud, addr)
+#define hpmd_alloc(mm, pud, addr, h)	pmd_alloc(mm, pud, addr)
 #else
 static inline
-pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
+pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
 {
-	if (HPAGE_SHIFT == PAGE_SHIFT_64K)
+	if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
 		return pmd_offset(pud, addr);
 	else
 		return (pmd_t *) pud;
 }
 static inline
-pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
+pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
+		  struct hstate *hstate)
 {
-	if (HPAGE_SHIFT == PAGE_SHIFT_64K)
+	if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
 		return pmd_alloc(mm, pud, addr);
 	else
 		return (pmd_t *) pud;
@@ -128,8 +169,9 @@ void add_gpage(unsigned long addr, unsigned long page_size,
 }
 
 /* Moves the gigantic page addresses from the temporary list to the
- * huge_boot_pages list.  */
-int alloc_bootmem_huge_page(struct hstate *h)
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
 {
 	struct huge_bootmem_page *m;
 	if (nr_gpages == 0)
@@ -137,7 +179,7 @@ int alloc_bootmem_huge_page(struct hstate *h)
 	m = phys_to_virt(gpage_freearray[--nr_gpages]);
 	gpage_freearray[nr_gpages] = 0;
 	list_add(&m->list, &huge_boot_pages);
-	m->hstate = h;
+	m->hstate = hstate;
 	return 1;
 }
 
@@ -149,17 +191,25 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	pud_t *pu;
 	pmd_t *pm;
 
-	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
+	unsigned int psize;
+	unsigned int shift;
+	unsigned long sz;
+	struct hstate *hstate;
+	psize = get_slice_psize(mm, addr);
+	shift = mmu_psize_to_shift(psize);
+	sz = ((1UL) << shift);
+	hstate = size_to_hstate(sz);
 
-	addr &= HPAGE_MASK;
+	addr &= hstate->mask;
 
 	pg = pgd_offset(mm, addr);
 	if (!pgd_none(*pg)) {
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
-			pm = hpmd_offset(pu, addr);
+			pm = hpmd_offset(pu, addr, hstate);
 			if (!pmd_none(*pm))
-				return hugepte_offset((hugepd_t *)pm, addr);
+				return hugepte_offset((hugepd_t *)pm, addr,
+						      hstate);
 		}
 	}
 
@@ -173,16 +223,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	pud_t *pu;
 	pmd_t *pm;
 	hugepd_t *hpdp = NULL;
+	struct hstate *hstate;
+	unsigned int psize;
+	hstate = size_to_hstate(sz);
 
-	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
+	psize = get_slice_psize(mm, addr);
+	BUG_ON(!mmu_huge_psizes[psize]);
 
-	addr &= HPAGE_MASK;
+	addr &= hstate->mask;
 
 	pg = pgd_offset(mm, addr);
 	pu = pud_alloc(mm, pg, addr);
 
 	if (pu) {
-		pm = hpmd_alloc(mm, pu, addr);
+		pm = hpmd_alloc(mm, pu, addr, hstate);
 		if (pm)
 			hpdp = (hugepd_t *)pm;
 	}
@@ -190,10 +244,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	if (! hpdp)
 		return NULL;
 
-	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
 		return NULL;
 
-	return hugepte_offset(hpdp, addr);
+	return hugepte_offset(hpdp, addr, hstate);
 }
 
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
@@ -201,19 +255,22 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 	return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
+			       unsigned int psize)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
+						 HUGEPTE_CACHE_NUM+psize-1,
 						 PGF_CACHENUM_MASK));
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				   unsigned long addr, unsigned long end,
-				   unsigned long floor, unsigned long ceiling)
+				   unsigned long floor, unsigned long ceiling,
+				   unsigned int psize)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -225,7 +282,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd))
 			continue;
-		free_hugepte_range(tlb, (hugepd_t *)pmd);
+		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -251,6 +308,9 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud_t *pud;
 	unsigned long next;
 	unsigned long start;
+	unsigned int shift;
+	unsigned int psize = get_slice_psize(tlb->mm, addr);
+	shift = mmu_psize_to_shift(psize);
 
 	start = addr;
 	pud = pud_offset(pgd, addr);
@@ -259,16 +319,18 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 #ifdef CONFIG_PPC_64K_PAGES
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling,
+				       psize);
 #else
-		if (HPAGE_SHIFT == PAGE_SHIFT_64K) {
+		if (shift == PAGE_SHIFT_64K) {
 			if (pud_none_or_clear_bad(pud))
 				continue;
-			hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
+					       ceiling, psize);
 		} else {
 			if (pud_none(*pud))
 				continue;
-			free_hugepte_range(tlb, (hugepd_t *)pud);
+			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
 		}
 #endif
 	} while (pud++, addr = next, addr != end);
@@ -336,27 +398,29 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	 * now has no other vmas using it, so can be freed, we don't
 	 * bother to round floor or end up - the tests don't need that.
 	 */
+	unsigned int psize = get_slice_psize(tlb->mm, addr);
 
-	addr &= HUGEPD_MASK;
+	addr &= HUGEPD_MASK(psize);
 	if (addr < floor) {
-		addr += HUGEPD_SIZE;
+		addr += HUGEPD_SIZE(psize);
 		if (!addr)
 			return;
 	}
 	if (ceiling) {
-		ceiling &= HUGEPD_MASK;
+		ceiling &= HUGEPD_MASK(psize);
 		if (!ceiling)
 			return;
 	}
 	if (end - 1 > ceiling - 1)
-		end -= HUGEPD_SIZE;
+		end -= HUGEPD_SIZE(psize);
 	if (addr > end - 1)
 		return;
 
 	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
-		BUG_ON(get_slice_psize(tlb->mm, addr) != mmu_huge_psize);
+		psize = get_slice_psize(tlb->mm, addr);
+		BUG_ON(!mmu_huge_psizes[psize]);
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
@@ -373,7 +437,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		 * necessary anymore if we make hpte_need_flush() get the
 		 * page size from the slices
 		 */
-		pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
+		unsigned int psize = get_slice_psize(mm, addr);
+		unsigned int shift = mmu_psize_to_shift(psize);
+		unsigned long sz = ((1UL) << shift);
+		struct hstate *hstate = size_to_hstate(sz);
+		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
 	}
 	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 }
@@ -390,14 +458,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
 	pte_t *ptep;
 	struct page *page;
+	unsigned int mmu_psize = get_slice_psize(mm, address);
 
-	if (get_slice_psize(mm, address) != mmu_huge_psize)
+	/* Verify it is a huge page else bail. */
+	if (!mmu_huge_psizes[mmu_psize])
 		return ERR_PTR(-EINVAL);
 
 	ptep = huge_pte_offset(mm, address);
 	page = pte_page(*ptep);
-	if (page)
-		page += (address % HPAGE_SIZE) / PAGE_SIZE;
+	if (page) {
+		unsigned int shift = mmu_psize_to_shift(mmu_psize);
+		unsigned long sz = ((1UL) << shift);
+		page += (address % sz) / PAGE_SIZE;
+	}
 
 	return page;
 }
@@ -425,15 +498,16 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
-	return slice_get_unmapped_area(addr, len, flags,
-				       mmu_huge_psize, 1, 0);
+	struct hstate *hstate = hstate_file(file);
+	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 }
 
 /*
  * Called by asm hashtable.S for doing lazy icache flush
  */
 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-						  pte_t pte, int trap)
+					pte_t pte, int trap, unsigned long sz)
 {
 	struct page *page;
 	int i;
@@ -446,7 +520,7 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 	/* page is dirty */
 	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 		if (trap == 0x400) {
-			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
+			for (i = 0; i < (sz / PAGE_SIZE); i++)
 				__flush_dcache_icache(page_address(page+i));
 			set_bit(PG_arch_1, &page->flags);
 		} else {
@@ -462,11 +536,16 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 {
 	pte_t *ptep;
 	unsigned long old_pte, new_pte;
-	unsigned long va, rflags, pa;
+	unsigned long va, rflags, pa, sz;
 	long slot;
 	int err = 1;
 	int ssize = user_segment_size(ea);
+	unsigned int mmu_psize;
+	int shift;
+	mmu_psize = get_slice_psize(mm, ea);
 
+	if (!mmu_huge_psizes[mmu_psize])
+		goto out;
 	ptep = huge_pte_offset(mm, ea);
 
 	/* Search the Linux page table for a match with va */
@@ -509,30 +588,32 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
 	rflags = 0x2 | (!(new_pte & _PAGE_RW));
  	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	shift = mmu_psize_to_shift(mmu_psize);
+	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		/* No CPU has hugepages but lacks no execute, so we
 		 * don't need to worry about that case */
 		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-						       trap);
+						       trap, sz);
 
 	/* Check if pte already has an hpte (case 2) */
 	if (unlikely(old_pte & _PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
 		unsigned long hash, slot;
 
-		hash = hpt_hash(va, HPAGE_SHIFT, ssize);
+		hash = hpt_hash(va, shift, ssize);
 		if (old_pte & _PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
-		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
 					 ssize, local) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
 	if (likely(!(old_pte & _PAGE_HASHPTE))) {
-		unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
+		unsigned long hash = hpt_hash(va, shift, ssize);
 		unsigned long hpte_group;
 
 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
@@ -553,7 +634,7 @@ repeat:
 
 		/* Insert into the hash table, primary slot */
 		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
-					  mmu_huge_psize, ssize);
+					  mmu_psize, ssize);
 
 		/* Primary is full, try the secondary */
 		if (unlikely(slot == -1)) {
@@ -561,7 +642,7 @@ repeat:
 				      HPTES_PER_GROUP) & ~0x7UL; 
 			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 						  HPTE_V_SECONDARY,
-						  mmu_huge_psize, ssize);
+						  mmu_psize, ssize);
 			if (slot == -1) {
 				if (mftb() & 0x1)
 					hpte_group = ((hash & htab_hash_mask) *
@@ -598,66 +679,50 @@ void set_huge_psize(int psize)
 		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
 		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
 		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
-		/* Return if huge page size is the same as the
-		 * base page size. */
-		if (mmu_psize_defs[psize].shift == PAGE_SHIFT)
+		/* Return if huge page size has already been setup or is the
+		 * same as the base page size. */
+		if (mmu_huge_psizes[psize] ||
+		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
 			return;
+		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
-		HPAGE_SHIFT = mmu_psize_defs[psize].shift;
-		mmu_huge_psize = psize;
-
-		switch (HPAGE_SHIFT) {
+		switch (mmu_psize_defs[psize].shift) {
 		case PAGE_SHIFT_64K:
 		    /* We only allow 64k hpages with 4k base page,
 		     * which was checked above, and always put them
 		     * at the PMD */
-		    hugepte_shift = PMD_SHIFT;
+		    hugepte_shift[psize] = PMD_SHIFT;
 		    break;
 		case PAGE_SHIFT_16M:
 		    /* 16M pages can be at two different levels
 		     * of pagestables based on base page size */
 		    if (PAGE_SHIFT == PAGE_SHIFT_64K)
-			    hugepte_shift = PMD_SHIFT;
+			    hugepte_shift[psize] = PMD_SHIFT;
 		    else /* 4k base page */
-			    hugepte_shift = PUD_SHIFT;
+			    hugepte_shift[psize] = PUD_SHIFT;
 		    break;
 		case PAGE_SHIFT_16G:
 		    /* 16G pages are always at PGD level */
-		    hugepte_shift = PGDIR_SHIFT;
+		    hugepte_shift[psize] = PGDIR_SHIFT;
 		    break;
 		}
-		hugepte_shift -= HPAGE_SHIFT;
+		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
 	} else
-		HPAGE_SHIFT = 0;
+		hugepte_shift[psize] = 0;
 }
 
 static int __init hugepage_setup_sz(char *str)
 {
 	unsigned long long size;
-	int mmu_psize = -1;
+	int mmu_psize;
 	int shift;
 
 	size = memparse(str, &str);
 
 	shift = __ffs(size);
-	switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
-	case PAGE_SHIFT_64K:
-		mmu_psize = MMU_PAGE_64K;
-		break;
-#endif
-	case PAGE_SHIFT_16M:
-		mmu_psize = MMU_PAGE_16M;
-		break;
-	case PAGE_SHIFT_16G:
-		mmu_psize = MMU_PAGE_16G;
-		break;
-	}
-
-	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) {
+	mmu_psize = shift_to_mmu_psize(shift);
+	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
 		set_huge_psize(mmu_psize);
-		hugetlb_add_hstate(shift - PAGE_SHIFT);
-	}
 	else
 		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 
@@ -672,16 +737,31 @@ static void zero_ctor(struct kmem_cache *cache, void *addr)
 
 static int __init hugetlbpage_init(void)
 {
+	unsigned int psize;
+
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
-
-	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
-					       HUGEPTE_TABLE_SIZE,
-					       HUGEPTE_TABLE_SIZE,
-					       0,
-					       zero_ctor);
-	if (! huge_pgtable_cache)
-		panic("hugetlbpage_init(): could not create hugepte cache\n");
+	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
+	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
+	 * sizes changes.
+	 */
+	set_huge_psize(MMU_PAGE_16M);
+	set_huge_psize(MMU_PAGE_64K);
+	set_huge_psize(MMU_PAGE_16G);
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		if (mmu_huge_psizes[psize]) {
+			huge_pgtable_cache(psize) = kmem_cache_create(
+						HUGEPTE_CACHE_NAME(psize),
+						HUGEPTE_TABLE_SIZE(psize),
+						HUGEPTE_TABLE_SIZE(psize),
+						0,
+						zero_ctor);
+			if (!huge_pgtable_cache(psize))
+				panic("hugetlbpage_init(): could not create %s"\
+				      "\n", HUGEPTE_CACHE_NAME(psize));
+		}
+	}
 
 	return 0;
 }
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 6ef63caca68..a41bc5aa204 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -153,10 +153,10 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 };
 
 #ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
- * can't put into the tables above, because HPAGE_SHIFT is not compile
- * time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+/* Hugepages need an extra cache per hugepagesize, initialized in
+ * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
+ * is not compile time constant. */
+struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
 #else
 struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 #endif
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index a01b5c608ff..409fcc7b63c 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -147,7 +147,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	 */
 	if (huge) {
 #ifdef CONFIG_HUGETLB_PAGE
-		psize = mmu_huge_psize;
+		psize = get_slice_psize(mm, addr);;
 #else
 		BUG();
 		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
diff --git a/include/asm-powerpc/hugetlb.h b/include/asm-powerpc/hugetlb.h
index ca37c4af27b..26f0d0ab27a 100644
--- a/include/asm-powerpc/hugetlb.h
+++ b/include/asm-powerpc/hugetlb.h
@@ -24,9 +24,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 static inline int prepare_hugepage_range(struct file *file,
 			unsigned long addr, unsigned long len)
 {
-	if (len & ~HPAGE_MASK)
+	struct hstate *h = hstate_file(file);
+	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
+	if (addr & ~huge_page_mask(h))
 		return -EINVAL;
 	return 0;
 }
diff --git a/include/asm-powerpc/mmu-hash64.h b/include/asm-powerpc/mmu-hash64.h
index b61181aa774..19c7a940349 100644
--- a/include/asm-powerpc/mmu-hash64.h
+++ b/include/asm-powerpc/mmu-hash64.h
@@ -194,9 +194,9 @@ extern int mmu_ci_restrictions;
 
 #ifdef CONFIG_HUGETLB_PAGE
 /*
- * The page size index of the huge pages for use by hugetlbfs
+ * The page size indexes of the huge pages for use by hugetlbfs
  */
-extern int mmu_huge_psize;
+extern unsigned int mmu_huge_psizes[MMU_PAGE_COUNT];
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/asm-powerpc/page_64.h b/include/asm-powerpc/page_64.h
index 02fd80710e9..043bfdfe4f7 100644
--- a/include/asm-powerpc/page_64.h
+++ b/include/asm-powerpc/page_64.h
@@ -90,6 +90,7 @@ extern unsigned int HPAGE_SHIFT;
 #define HPAGE_SIZE		((1UL) << HPAGE_SHIFT)
 #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
+#define HUGE_MAX_HSTATE		3
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/include/asm-powerpc/pgalloc-64.h b/include/asm-powerpc/pgalloc-64.h
index 68980990f62..812a1d8f35c 100644
--- a/include/asm-powerpc/pgalloc-64.h
+++ b/include/asm-powerpc/pgalloc-64.h
@@ -22,7 +22,7 @@ extern struct kmem_cache *pgtable_cache[];
 #define PUD_CACHE_NUM		1
 #define PMD_CACHE_NUM		1
 #define HUGEPTE_CACHE_NUM	2
-#define PTE_NONCACHE_NUM	3  /* from GFP rather than kmem_cache */
+#define PTE_NONCACHE_NUM	7  /* from GFP rather than kmem_cache */
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
@@ -119,7 +119,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 	__free_page(ptepage);
 }
 
-#define PGF_CACHENUM_MASK	0x3
+#define PGF_CACHENUM_MASK	0x7
 
 typedef struct pgtable_free {
 	unsigned long val;
-- 
cgit v1.2.3-70-g09d2


From 27ac792ca0b0a1e7e65f20342260650516c95864 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Wed, 23 Jul 2008 21:28:13 -0700
Subject: PAGE_ALIGN(): correctly handle 64-bit values on 32-bit architectures

On 32-bit architectures PAGE_ALIGN() truncates 64-bit values to the 32-bit
boundary. For example:

	u64 val = PAGE_ALIGN(size);

always returns a value < 4GB even if size is greater than 4GB.

The problem resides in PAGE_MASK definition (from include/asm-x86/page.h for
example):

#define PAGE_SHIFT      12
#define PAGE_SIZE       (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE-1))
...
#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)

The "~" is performed on a 32-bit value, so everything in "and" with
PAGE_MASK greater than 4GB will be truncated to the 32-bit boundary.
Using the ALIGN() macro seems to be the right way, because it uses
typeof(addr) for the mask.

Also move the PAGE_ALIGN() definitions out of include/asm-*/page.h in
include/linux/mm.h.

See also lkml discussion: http://lkml.org/lkml/2008/6/11/237

[akpm@linux-foundation.org: fix drivers/media/video/uvc/uvc_queue.c]
[akpm@linux-foundation.org: fix v850]
[akpm@linux-foundation.org: fix powerpc]
[akpm@linux-foundation.org: fix arm]
[akpm@linux-foundation.org: fix mips]
[akpm@linux-foundation.org: fix drivers/media/video/pvrusb2/pvrusb2-dvb.c]
[akpm@linux-foundation.org: fix drivers/mtd/maps/uclinux.c]
[akpm@linux-foundation.org: fix powerpc]
Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/module.c                     | 1 +
 arch/arm/plat-omap/fb.c                      | 1 +
 arch/avr32/mm/ioremap.c                      | 1 +
 arch/h8300/kernel/setup.c                    | 1 +
 arch/m68k/amiga/chipram.c                    | 1 +
 arch/m68knommu/kernel/setup.c                | 1 +
 arch/mips/kernel/module.c                    | 1 +
 arch/mips/sgi-ip27/ip27-klnuma.c             | 1 +
 arch/powerpc/kernel/suspend.c                | 1 +
 arch/powerpc/lib/code-patching.c             | 1 +
 arch/sparc64/kernel/iommu_common.h           | 2 +-
 arch/x86/kernel/module_64.c                  | 1 +
 arch/xtensa/kernel/setup.c                   | 1 +
 drivers/char/random.c                        | 1 +
 drivers/ieee1394/iso.c                       | 1 +
 drivers/media/video/pvrusb2/pvrusb2-dvb.c    | 1 +
 drivers/media/video/pvrusb2/pvrusb2-ioread.c | 1 +
 drivers/media/video/uvc/uvc_queue.c          | 1 +
 drivers/media/video/videobuf-core.c          | 1 +
 drivers/mtd/maps/uclinux.c                   | 1 +
 drivers/net/mlx4/eq.c                        | 1 +
 drivers/pcmcia/electra_cf.c                  | 1 +
 drivers/scsi/sun_esp.c                       | 1 +
 drivers/video/acornfb.c                      | 1 +
 drivers/video/imxfb.c                        | 1 +
 drivers/video/omap/dispc.c                   | 1 +
 drivers/video/omap/omapfb_main.c             | 1 +
 drivers/video/pxafb.c                        | 1 +
 drivers/video/sa1100fb.c                     | 1 +
 include/asm-alpha/page.h                     | 3 ---
 include/asm-arm/page-nommu.h                 | 4 +---
 include/asm-arm/page.h                       | 3 ---
 include/asm-avr32/page.h                     | 3 ---
 include/asm-blackfin/page.h                  | 3 ---
 include/asm-cris/page.h                      | 3 ---
 include/asm-frv/page.h                       | 3 ---
 include/asm-h8300/page.h                     | 3 ---
 include/asm-ia64/page.h                      | 1 -
 include/asm-m32r/page.h                      | 3 ---
 include/asm-m68k/dvma.h                      | 2 +-
 include/asm-m68k/page.h                      | 3 ---
 include/asm-m68knommu/page.h                 | 3 ---
 include/asm-mips/page.h                      | 3 ---
 include/asm-mips/processor.h                 | 2 +-
 include/asm-mn10300/page.h                   | 3 ---
 include/asm-parisc/page.h                    | 4 ----
 include/asm-powerpc/page.h                   | 3 ---
 include/asm-s390/page.h                      | 3 ---
 include/asm-sh/page.h                        | 3 ---
 include/asm-sparc/page_32.h                  | 3 ---
 include/asm-sparc/page_64.h                  | 3 ---
 include/asm-um/page.h                        | 3 ---
 include/asm-v850/page.h                      | 4 ----
 include/asm-x86/page.h                       | 3 ---
 include/asm-xtensa/page.h                    | 2 --
 include/linux/mm.h                           | 3 +++
 sound/core/info.c                            | 1 +
 57 files changed, 36 insertions(+), 74 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 79b7e5cf541..a68259a0ccc 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/elf.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
diff --git a/arch/arm/plat-omap/fb.c b/arch/arm/plat-omap/fb.c
index 96d6f061973..5d107520e6b 100644
--- a/arch/arm/plat-omap/fb.c
+++ b/arch/arm/plat-omap/fb.c
@@ -23,6 +23,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/bootmem.h>
diff --git a/arch/avr32/mm/ioremap.c b/arch/avr32/mm/ioremap.c
index 3437c82434a..f03b79f0e0a 100644
--- a/arch/avr32/mm/ioremap.c
+++ b/arch/avr32/mm/ioremap.c
@@ -6,6 +6,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/io.h>
 
diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c
index b1f25c20a5d..7fda657110e 100644
--- a/arch/h8300/kernel/setup.c
+++ b/arch/h8300/kernel/setup.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/fb.h>
 #include <linux/console.h>
diff --git a/arch/m68k/amiga/chipram.c b/arch/m68k/amiga/chipram.c
index cbe36538af4..61df1d33c05 100644
--- a/arch/m68k/amiga/chipram.c
+++ b/arch/m68k/amiga/chipram.c
@@ -9,6 +9,7 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/slab.h>
diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c
index 03f4fe6a2fc..5985f198902 100644
--- a/arch/m68knommu/kernel/setup.c
+++ b/arch/m68knommu/kernel/setup.c
@@ -22,6 +22,7 @@
 #include <linux/interrupt.h>
 #include <linux/fb.h>
 #include <linux/module.h>
+#include <linux/mm.h>
 #include <linux/console.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index e7ed0ac4853..1f60e27523d 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -22,6 +22,7 @@
 
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
+#include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/arch/mips/sgi-ip27/ip27-klnuma.c b/arch/mips/sgi-ip27/ip27-klnuma.c
index 48932ce1d73..d9c79d8be81 100644
--- a/arch/mips/sgi-ip27/ip27-klnuma.c
+++ b/arch/mips/sgi-ip27/ip27-klnuma.c
@@ -4,6 +4,7 @@
  * Copyright 2000 - 2001 Kanoj Sarcar (kanoj@sgi.com)
  */
 #include <linux/init.h>
+#include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/kernel.h>
 #include <linux/nodemask.h>
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index 8cee5710754..6fc6328dc62 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -7,6 +7,7 @@
  * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
  */
 
+#include <linux/mm.h>
 #include <asm/page.h>
 
 /* References to section boundaries */
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0559fe086eb..7c975d43e3f 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
+#include <linux/mm.h>
 #include <asm/page.h>
 #include <asm/code-patching.h>
 
diff --git a/arch/sparc64/kernel/iommu_common.h b/arch/sparc64/kernel/iommu_common.h
index f3575a614fa..53b19c8231a 100644
--- a/arch/sparc64/kernel/iommu_common.h
+++ b/arch/sparc64/kernel/iommu_common.h
@@ -23,7 +23,7 @@
 #define IO_PAGE_SHIFT			13
 #define IO_PAGE_SIZE			(1UL << IO_PAGE_SHIFT)
 #define IO_PAGE_MASK			(~(IO_PAGE_SIZE-1))
-#define IO_PAGE_ALIGN(addr)		(((addr)+IO_PAGE_SIZE-1)&IO_PAGE_MASK)
+#define IO_PAGE_ALIGN(addr)		ALIGN(addr, IO_PAGE_SIZE)
 
 #define IO_TSB_ENTRIES			(128*1024)
 #define IO_TSB_SIZE			(IO_TSB_ENTRIES * 8)
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 0e867676b5a..6ba87830d4b 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/bug.h>
 
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index 5e6d75c9f92..a00359e8f7a 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -16,6 +16,7 @@
 
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/screen_info.h>
 #include <linux/bootmem.h>
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 0cf98bd4f2d..e0d0e371909 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -236,6 +236,7 @@
 #include <linux/fs.h>
 #include <linux/genhd.h>
 #include <linux/interrupt.h>
+#include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/percpu.h>
 #include <linux/cryptohash.h>
diff --git a/drivers/ieee1394/iso.c b/drivers/ieee1394/iso.c
index 07ca35c98f9..1cf6487b65b 100644
--- a/drivers/ieee1394/iso.c
+++ b/drivers/ieee1394/iso.c
@@ -11,6 +11,7 @@
 
 #include <linux/pci.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
 #include <linux/slab.h>
 
 #include "hosts.h"
diff --git a/drivers/media/video/pvrusb2/pvrusb2-dvb.c b/drivers/media/video/pvrusb2/pvrusb2-dvb.c
index 6ec4bf81fc7..77b3c338506 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-dvb.c
+++ b/drivers/media/video/pvrusb2/pvrusb2-dvb.c
@@ -20,6 +20,7 @@
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/mm.h>
 #include "dvbdev.h"
 #include "pvrusb2-debug.h"
 #include "pvrusb2-hdw-internal.h"
diff --git a/drivers/media/video/pvrusb2/pvrusb2-ioread.c b/drivers/media/video/pvrusb2/pvrusb2-ioread.c
index 05a1376405e..b4824782d85 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-ioread.c
+++ b/drivers/media/video/pvrusb2/pvrusb2-ioread.c
@@ -22,6 +22,7 @@
 #include "pvrusb2-debug.h"
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <asm/uaccess.h>
diff --git a/drivers/media/video/uvc/uvc_queue.c b/drivers/media/video/uvc/uvc_queue.c
index 7388d0cee3d..5646a6a3293 100644
--- a/drivers/media/video/uvc/uvc_queue.c
+++ b/drivers/media/video/uvc/uvc_queue.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/version.h>
+#include <linux/mm.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/usb.h>
diff --git a/drivers/media/video/videobuf-core.c b/drivers/media/video/videobuf-core.c
index 0a88c44ace0..b7b05842cf2 100644
--- a/drivers/media/video/videobuf-core.c
+++ b/drivers/media/video/videobuf-core.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 
diff --git a/drivers/mtd/maps/uclinux.c b/drivers/mtd/maps/uclinux.c
index c42f4b83f68..3fcf92130aa 100644
--- a/drivers/mtd/maps/uclinux.c
+++ b/drivers/mtd/maps/uclinux.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/mm.h>
 #include <linux/major.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index e141a1513f0..ea3a09aaa84 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -33,6 +33,7 @@
 
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/mm.h>
 #include <linux/dma-mapping.h>
 
 #include <linux/mlx4/cmd.h>
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c
index c21f9a9c3e3..a34284b1482 100644
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/of_platform.h>
 
diff --git a/drivers/scsi/sun_esp.c b/drivers/scsi/sun_esp.c
index 2c87db98cdf..f9cf7015136 100644
--- a/drivers/scsi/sun_esp.c
+++ b/drivers/scsi/sun_esp.c
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/mm.h>
 #include <linux/init.h>
 
 #include <asm/irq.h>
diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c
index eedb8285e32..017233d0c48 100644
--- a/drivers/video/acornfb.c
+++ b/drivers/video/acornfb.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/fb.h>
 #include <linux/platform_device.h>
diff --git a/drivers/video/imxfb.c b/drivers/video/imxfb.c
index 94e4d3ac1a0..0c5a475c1ca 100644
--- a/drivers/video/imxfb.c
+++ b/drivers/video/imxfb.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/fb.h>
 #include <linux/delay.h>
 #include <linux/init.h>
diff --git a/drivers/video/omap/dispc.c b/drivers/video/omap/dispc.c
index ab32ceb0617..ab77c51fe9d 100644
--- a/drivers/video/omap/dispc.c
+++ b/drivers/video/omap/dispc.c
@@ -20,6 +20,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/dma-mapping.h>
+#include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/clk.h>
 #include <linux/io.h>
diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c
index 14d0f7a1114..f85af5c4fa6 100644
--- a/drivers/video/omap/omapfb_main.c
+++ b/drivers/video/omap/omapfb_main.c
@@ -25,6 +25,7 @@
  * 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  */
 #include <linux/platform_device.h>
+#include <linux/mm.h>
 #include <linux/uaccess.h>
 
 #include <asm/mach-types.h>
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index bb251436950..5e8a140399f 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -30,6 +30,7 @@
 #include <linux/string.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/fb.h>
 #include <linux/delay.h>
 #include <linux/init.h>
diff --git a/drivers/video/sa1100fb.c b/drivers/video/sa1100fb.c
index ab2b2110478..4a9f7e12180 100644
--- a/drivers/video/sa1100fb.c
+++ b/drivers/video/sa1100fb.c
@@ -167,6 +167,7 @@
 #include <linux/string.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/fb.h>
 #include <linux/delay.h>
 #include <linux/init.h>
diff --git a/include/asm-alpha/page.h b/include/asm-alpha/page.h
index 22ff9762d17..0995f9d1341 100644
--- a/include/asm-alpha/page.h
+++ b/include/asm-alpha/page.h
@@ -80,9 +80,6 @@ typedef struct page *pgtable_t;
 
 #endif /* !__ASSEMBLY__ */
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #define __pa(x)			((unsigned long) (x) - PAGE_OFFSET)
 #define __va(x)			((void *)((unsigned long) (x) + PAGE_OFFSET))
 #ifndef CONFIG_DISCONTIGMEM
diff --git a/include/asm-arm/page-nommu.h b/include/asm-arm/page-nommu.h
index a1bcad06048..ea1cde84f50 100644
--- a/include/asm-arm/page-nommu.h
+++ b/include/asm-arm/page-nommu.h
@@ -7,6 +7,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+
 #ifndef _ASMARM_PAGE_NOMMU_H
 #define _ASMARM_PAGE_NOMMU_H
 
@@ -42,9 +43,6 @@ typedef unsigned long pgprot_t;
 #define __pmd(x)        (x)
 #define __pgprot(x)     (x)
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 extern unsigned long memory_start;
 extern unsigned long memory_end;
 
diff --git a/include/asm-arm/page.h b/include/asm-arm/page.h
index 8e05bdb5f12..7c5fc5582e5 100644
--- a/include/asm-arm/page.h
+++ b/include/asm-arm/page.h
@@ -15,9 +15,6 @@
 #define PAGE_SIZE		(1UL << PAGE_SHIFT)
 #define PAGE_MASK		(~(PAGE_SIZE-1))
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #ifndef __ASSEMBLY__
 
 #ifndef CONFIG_MMU
diff --git a/include/asm-avr32/page.h b/include/asm-avr32/page.h
index cbbc5ca9728..f805d1cb11b 100644
--- a/include/asm-avr32/page.h
+++ b/include/asm-avr32/page.h
@@ -57,9 +57,6 @@ static inline int get_order(unsigned long size)
 
 #endif /* !__ASSEMBLY__ */
 
-/* Align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
-
 /*
  * The hardware maps the virtual addresses 0x80000000 -> 0x9fffffff
  * permanently to the physical addresses 0x00000000 -> 0x1fffffff when
diff --git a/include/asm-blackfin/page.h b/include/asm-blackfin/page.h
index c7db0220fbd..344f6a8c1f2 100644
--- a/include/asm-blackfin/page.h
+++ b/include/asm-blackfin/page.h
@@ -51,9 +51,6 @@ typedef struct page *pgtable_t;
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 extern unsigned long memory_start;
 extern unsigned long memory_end;
 
diff --git a/include/asm-cris/page.h b/include/asm-cris/page.h
index c45bb1ef397..d19272ba6b6 100644
--- a/include/asm-cris/page.h
+++ b/include/asm-cris/page.h
@@ -60,9 +60,6 @@ typedef struct page *pgtable_t;
 
 #define page_to_phys(page)     __pa((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #ifndef __ASSEMBLY__
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/asm-frv/page.h b/include/asm-frv/page.h
index c2c1e89e747..bd9c220094c 100644
--- a/include/asm-frv/page.h
+++ b/include/asm-frv/page.h
@@ -40,9 +40,6 @@ typedef struct page *pgtable_t;
 #define __pgprot(x)	((pgprot_t) { (x) } )
 #define PTE_MASK	PAGE_MASK
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
-
 #define devmem_is_allowed(pfn)	1
 
 #define __pa(vaddr)		virt_to_phys((void *) (unsigned long) (vaddr))
diff --git a/include/asm-h8300/page.h b/include/asm-h8300/page.h
index d6a3eaf3b27..0b6acf0b03a 100644
--- a/include/asm-h8300/page.h
+++ b/include/asm-h8300/page.h
@@ -43,9 +43,6 @@ typedef struct page *pgtable_t;
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 extern unsigned long memory_start;
 extern unsigned long memory_end;
 
diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 36f39321b76..5f271bc712e 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -40,7 +40,6 @@
 
 #define PAGE_SIZE		(__IA64_UL_CONST(1) << PAGE_SHIFT)
 #define PAGE_MASK		(~(PAGE_SIZE - 1))
-#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
 
 #define PERCPU_PAGE_SHIFT	16	/* log2() of max. size of per-CPU area */
 #define PERCPU_PAGE_SIZE	(__IA64_UL_CONST(1) << PERCPU_PAGE_SHIFT)
diff --git a/include/asm-m32r/page.h b/include/asm-m32r/page.h
index 8a677f3fca6..c9333089fe1 100644
--- a/include/asm-m32r/page.h
+++ b/include/asm-m32r/page.h
@@ -41,9 +41,6 @@ typedef struct page *pgtable_t;
 
 #endif /* !__ASSEMBLY__ */
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
-
 /*
  * This handles the memory map.. We could make this a config
  * option, but too many people screw it up, and too few need
diff --git a/include/asm-m68k/dvma.h b/include/asm-m68k/dvma.h
index 4fff408d015..890bbf7e775 100644
--- a/include/asm-m68k/dvma.h
+++ b/include/asm-m68k/dvma.h
@@ -13,7 +13,7 @@
 #define DVMA_PAGE_SHIFT	13
 #define DVMA_PAGE_SIZE	(1UL << DVMA_PAGE_SHIFT)
 #define DVMA_PAGE_MASK	(~(DVMA_PAGE_SIZE-1))
-#define DVMA_PAGE_ALIGN(addr)	(((addr)+DVMA_PAGE_SIZE-1)&DVMA_PAGE_MASK)
+#define DVMA_PAGE_ALIGN(addr)	ALIGN(addr, DVMA_PAGE_SIZE)
 
 extern void dvma_init(void);
 extern int dvma_map_iommu(unsigned long kaddr, unsigned long baddr,
diff --git a/include/asm-m68k/page.h b/include/asm-m68k/page.h
index 880c2cbff8a..a34b8bad784 100644
--- a/include/asm-m68k/page.h
+++ b/include/asm-m68k/page.h
@@ -103,9 +103,6 @@ typedef struct page *pgtable_t;
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #endif /* !__ASSEMBLY__ */
 
 #include <asm/page_offset.h>
diff --git a/include/asm-m68knommu/page.h b/include/asm-m68knommu/page.h
index 1e82ebb7d64..3a1ede4544c 100644
--- a/include/asm-m68knommu/page.h
+++ b/include/asm-m68knommu/page.h
@@ -43,9 +43,6 @@ typedef struct page *pgtable_t;
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 extern unsigned long memory_start;
 extern unsigned long memory_end;
 
diff --git a/include/asm-mips/page.h b/include/asm-mips/page.h
index 494f00ba954..fe7a88ea066 100644
--- a/include/asm-mips/page.h
+++ b/include/asm-mips/page.h
@@ -137,9 +137,6 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 
 #endif /* !__ASSEMBLY__ */
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
-
 /*
  * __pa()/__va() should be used only during mem init.
  */
diff --git a/include/asm-mips/processor.h b/include/asm-mips/processor.h
index 58cbac5a64e..a1e4453469f 100644
--- a/include/asm-mips/processor.h
+++ b/include/asm-mips/processor.h
@@ -45,7 +45,7 @@ extern unsigned int vced_count, vcei_count;
  * This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
-#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
+#define TASK_UNMAPPED_BASE	((TASK_SIZE / 3) & ~(PAGE_SIZE))
 #endif
 
 #ifdef CONFIG_64BIT
diff --git a/include/asm-mn10300/page.h b/include/asm-mn10300/page.h
index 124971b9fb9..8288e124165 100644
--- a/include/asm-mn10300/page.h
+++ b/include/asm-mn10300/page.h
@@ -61,9 +61,6 @@ typedef struct page *pgtable_t;
 
 #endif /* !__ASSEMBLY__ */
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
-
 /*
  * This handles the memory map.. We could make this a config
  * option, but too many people screw it up, and too few need
diff --git a/include/asm-parisc/page.h b/include/asm-parisc/page.h
index 27d50b85954..c3941f09a87 100644
--- a/include/asm-parisc/page.h
+++ b/include/asm-parisc/page.h
@@ -119,10 +119,6 @@ extern int npmem_ranges;
 #define PMD_ENTRY_SIZE	(1UL << BITS_PER_PMD_ENTRY)
 #define PTE_ENTRY_SIZE	(1UL << BITS_PER_PTE_ENTRY)
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
-
 #define LINUX_GATEWAY_SPACE     0
 
 /* This governs the relationship between virtual and physical addresses.
diff --git a/include/asm-powerpc/page.h b/include/asm-powerpc/page.h
index cffdf0eb0df..e088545cb3f 100644
--- a/include/asm-powerpc/page.h
+++ b/include/asm-powerpc/page.h
@@ -119,9 +119,6 @@ extern phys_addr_t kernstart_addr;
 /* align addr on a size boundary - adjust address up if needed */
 #define _ALIGN(addr,size)     _ALIGN_UP(addr,size)
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	_ALIGN(addr, PAGE_SIZE)
-
 /*
  * Don't compare things with KERNELBASE or PAGE_OFFSET to test for
  * "kernelness", use is_kernel_addr() - it should do what you want.
diff --git a/include/asm-s390/page.h b/include/asm-s390/page.h
index 12fd9c4f0f1..991ba939408 100644
--- a/include/asm-s390/page.h
+++ b/include/asm-s390/page.h
@@ -138,9 +138,6 @@ void arch_alloc_page(struct page *page, int order);
 
 #endif /* !__ASSEMBLY__ */
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)        (((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #define __PAGE_OFFSET           0x0UL
 #define PAGE_OFFSET             0x0UL
 #define __pa(x)                 (unsigned long)(x)
diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h
index 304c30b5d94..5dc01d2fcc4 100644
--- a/include/asm-sh/page.h
+++ b/include/asm-sh/page.h
@@ -22,9 +22,6 @@
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 #define PTE_MASK	PAGE_MASK
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
 #define HPAGE_SHIFT	16
 #elif defined(CONFIG_HUGETLB_PAGE_SIZE_256K)
diff --git a/include/asm-sparc/page_32.h b/include/asm-sparc/page_32.h
index 14de518cc38..cf5fb70ca1c 100644
--- a/include/asm-sparc/page_32.h
+++ b/include/asm-sparc/page_32.h
@@ -134,9 +134,6 @@ BTFIXUPDEF_SETHI(sparc_unmapped_base)
 
 #endif /* !(__ASSEMBLY__) */
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)  (((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #define PAGE_OFFSET	0xf0000000
 #ifndef __ASSEMBLY__
 extern unsigned long phys_base;
diff --git a/include/asm-sparc/page_64.h b/include/asm-sparc/page_64.h
index a8a2bba032c..b579b910ef5 100644
--- a/include/asm-sparc/page_64.h
+++ b/include/asm-sparc/page_64.h
@@ -106,9 +106,6 @@ typedef struct page *pgtable_t;
 
 #endif /* !(__ASSEMBLY__) */
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 /* We used to stick this into a hard-coded global register (%g4)
  * but that does not make sense anymore.
  */
diff --git a/include/asm-um/page.h b/include/asm-um/page.h
index 916e1a61999..335c57383c0 100644
--- a/include/asm-um/page.h
+++ b/include/asm-um/page.h
@@ -92,9 +92,6 @@ typedef struct page *pgtable_t;
 #define __pgd(x) ((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 extern unsigned long uml_physmem;
 
 #define PAGE_OFFSET (uml_physmem)
diff --git a/include/asm-v850/page.h b/include/asm-v850/page.h
index 74a539a9bd5..f9de35d873f 100644
--- a/include/asm-v850/page.h
+++ b/include/asm-v850/page.h
@@ -94,10 +94,6 @@ typedef unsigned long pgprot_t;
 #endif /* !__ASSEMBLY__ */
 
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
-
-
 /* No current v850 processor has virtual memory.  */
 #define __virt_to_phys(addr)	(addr)
 #define __phys_to_virt(addr)	(addr)
diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h
index 6e02098b160..49982110e4d 100644
--- a/include/asm-x86/page.h
+++ b/include/asm-x86/page.h
@@ -34,9 +34,6 @@
 
 #define HUGE_MAX_HSTATE 2
 
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 #endif
diff --git a/include/asm-xtensa/page.h b/include/asm-xtensa/page.h
index 80a6ae0dd25..11f7dc2dbec 100644
--- a/include/asm-xtensa/page.h
+++ b/include/asm-xtensa/page.h
@@ -26,13 +26,11 @@
 
 /*
  * PAGE_SHIFT determines the page size
- * PAGE_ALIGN(x) aligns the pointer to the (next) page boundary
  */
 
 #define PAGE_SHIFT		12
 #define PAGE_SIZE		(__XTENSA_UL_CONST(1) << PAGE_SHIFT)
 #define PAGE_MASK		(~(PAGE_SIZE-1))
-#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE - 1) & PAGE_MASK)
 
 #define PAGE_OFFSET		XCHAL_KSEG_CACHED_VADDR
 #define MAX_MEM_PFN		XCHAL_KSEG_SIZE
diff --git a/include/linux/mm.h b/include/linux/mm.h
index df322fb4df3..d87a5a5fe87 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -41,6 +41,9 @@ extern unsigned long mmap_min_addr;
 
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
 
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
+
 /*
  * Linux kernel virtual memory manager primitives.
  * The idea being to have a "virtual" mm in the same way
diff --git a/sound/core/info.c b/sound/core/info.c
index cb5ead3e202..c67773ad929 100644
--- a/sound/core/info.c
+++ b/sound/core/info.c
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/time.h>
+#include <linux/mm.h>
 #include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <sound/core.h>
-- 
cgit v1.2.3-70-g09d2


From 9115d13453dee22473a1e8cacc90a8d64a9c4bc9 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <ntl@pobox.com>
Date: Wed, 16 Jul 2008 09:58:51 +1000
Subject: powerpc: Enable AT_BASE_PLATFORM aux vector

Stash the first platform string matched by identify_cpu() in
powerpc_base_platform, and supply that to the ELF loader for the value
of AT_BASE_PLATFORM.

Signed-off-by: Nathan Lynch <ntl@pobox.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/cputable.c | 11 +++++++++++
 include/asm-powerpc/cputable.h |  2 ++
 include/asm-powerpc/elf.h      |  8 ++++++++
 3 files changed, 21 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index b936a1dd0a5..25a052c1675 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -23,6 +23,9 @@
 struct cpu_spec* cur_cpu_spec = NULL;
 EXPORT_SYMBOL(cur_cpu_spec);
 
+/* The platform string corresponding to the real PVR */
+const char *powerpc_base_platform;
+
 /* NOTE:
  * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
  * the responsibility of the appropriate CPU save/restore functions to
@@ -1652,6 +1655,14 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
 			} else
 				*t = *s;
 			*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
+
+			/*
+			 * Set the base platform string once; assumes
+			 * we're called with real pvr first.
+			 */
+			if (powerpc_base_platform == NULL)
+				powerpc_base_platform = t->platform;
+
 #if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
 			/* ppc64 and booke expect identify_cpu to also call
 			 * setup_cpu for that processor. I will consolidate
diff --git a/include/asm-powerpc/cputable.h b/include/asm-powerpc/cputable.h
index 2a3e9075a5a..ef8a248dfd5 100644
--- a/include/asm-powerpc/cputable.h
+++ b/include/asm-powerpc/cputable.h
@@ -127,6 +127,8 @@ extern struct cpu_spec *identify_cpu(unsigned long offset, unsigned int pvr);
 extern void do_feature_fixups(unsigned long value, void *fixup_start,
 			      void *fixup_end);
 
+extern const char *powerpc_base_platform;
+
 #endif /* __ASSEMBLY__ */
 
 /* CPU kernel features */
diff --git a/include/asm-powerpc/elf.h b/include/asm-powerpc/elf.h
index 89664675b46..80d1f399ee5 100644
--- a/include/asm-powerpc/elf.h
+++ b/include/asm-powerpc/elf.h
@@ -217,6 +217,14 @@ typedef elf_vrregset_t elf_fpxregset_t;
 
 #define ELF_PLATFORM	(cur_cpu_spec->platform)
 
+/* While ELF_PLATFORM indicates the ISA supported by the platform, it
+ * may not accurately reflect the underlying behavior of the hardware
+ * (as in the case of running in Power5+ compatibility mode on a
+ * Power6 machine).  ELF_BASE_PLATFORM allows ld.so to load libraries
+ * that are tuned for the real hardware.
+ */
+#define ELF_BASE_PLATFORM (powerpc_base_platform)
+
 #ifdef __powerpc64__
 # define ELF_PLAT_INIT(_r, load_addr)	do {	\
 	_r->gpr[2] = load_addr; 		\
-- 
cgit v1.2.3-70-g09d2


From 00bf6e906156b07cd641fe154ad0efe78f989692 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 23 Jul 2008 10:44:58 +1000
Subject: powerpc: Fallout from sysdev API changes

A struct sysdev_attribute * parameter was added to the show routine by
commit 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed "sysdev: Pass the
attribute to the low level sysdev show/store function".

This eliminates a warning:

arch/powerpc/kernel/sysfs.c:538: warning: initialization from incompatible pointer type

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/sysfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index aba0ba95f06..800e5e9a087 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -529,7 +529,8 @@ static void register_nodes(void)
 #endif
 
 /* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev, char *buf)
+static ssize_t show_physical_id(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
-- 
cgit v1.2.3-70-g09d2


From d6a61bfc06d6f2248f3e75f208d64e794082013c Mon Sep 17 00:00:00 2001
From: Luis Machado <luisgpm@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 02:10:41 +1000
Subject: powerpc: BookE hardware watchpoint support

This patch implements support for HW based watchpoint via the
DBSR_DAC (Data Address Compare) facility of the BookE processors.

It does so by interfacing with the existing DABR breakpoint code
and adding the necessary bits and pieces for the new bits to
be properly set or cleared

Signed-off-by: Luis Machado <luisgpm@br.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/entry_32.S |  6 ++--
 arch/powerpc/kernel/process.c  | 46 +++++++++++++++++++++++++++
 arch/powerpc/kernel/ptrace.c   | 72 ++++++++++++++++++++++++++++++++++++++----
 arch/powerpc/kernel/signal.c   |  6 +++-
 arch/powerpc/kernel/traps.c    | 16 ++++++++++
 arch/powerpc/mm/fault.c        | 25 ---------------
 include/asm-powerpc/system.h   |  2 ++
 7 files changed, 138 insertions(+), 35 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index da52269aec1..81c8324a4a3 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -148,7 +148,7 @@ transfer_to_handler:
 	/* Check to see if the dbcr0 register is set up to debug.  Use the
 	   internal debug mode bit to do this. */
 	lwz	r12,THREAD_DBCR0(r12)
-	andis.	r12,r12,DBCR0_IDM@h
+	andis.	r12,r12,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	beq+	3f
 	/* From user and task is ptraced - load up global dbcr0 */
 	li	r12,-1			/* clear all pending debug events */
@@ -292,7 +292,7 @@ syscall_exit_cont:
 	/* If the process has its own DBCR0 value, load it up.  The internal
 	   debug mode bit tells us that dbcr0 should be loaded. */
 	lwz	r0,THREAD+THREAD_DBCR0(r2)
-	andis.	r10,r0,DBCR0_IDM@h
+	andis.	r10,r0,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	bnel-	load_dbcr0
 #endif
 #ifdef CONFIG_44x
@@ -720,7 +720,7 @@ restore_user:
 	/* Check whether this process has its own DBCR0 value.  The internal
 	   debug mode bit tells us that dbcr0 should be loaded. */
 	lwz	r0,THREAD+THREAD_DBCR0(r2)
-	andis.	r10,r0,DBCR0_IDM@h
+	andis.	r10,r0,(DBCR0_IDM  | DBSR_DAC1R | DBSR_DAC1W)@h
 	bnel-	load_dbcr0
 #endif
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 219f3634115..db2497ccc11 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -47,6 +47,8 @@
 #ifdef CONFIG_PPC64
 #include <asm/firmware.h>
 #endif
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
 
 extern unsigned long _get_SP(void);
 
@@ -239,6 +241,35 @@ void discard_lazy_cpu_state(void)
 }
 #endif /* CONFIG_SMP */
 
+void do_dabr(struct pt_regs *regs, unsigned long address,
+		    unsigned long error_code)
+{
+	siginfo_t info;
+
+	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+			11, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (debugger_dabr_match(regs))
+		return;
+
+	/* Clear the DAC and struct entries.  One shot trigger */
+#if (defined(CONFIG_44x) || defined(CONFIG_BOOKE))
+	mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | DBSR_DAC1W
+							| DBCR0_IDM));
+#endif
+
+	/* Clear the DABR */
+	set_dabr(0);
+
+	/* Deliver the signal to userspace */
+	info.si_signo = SIGTRAP;
+	info.si_errno = 0;
+	info.si_code = TRAP_HWBKPT;
+	info.si_addr = (void __user *)address;
+	force_sig_info(SIGTRAP, &info, current);
+}
+
 static DEFINE_PER_CPU(unsigned long, current_dabr);
 
 int set_dabr(unsigned long dabr)
@@ -254,6 +285,11 @@ int set_dabr(unsigned long dabr)
 #if defined(CONFIG_PPC64) || defined(CONFIG_6xx)
 	mtspr(SPRN_DABR, dabr);
 #endif
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	mtspr(SPRN_DAC1, dabr);
+#endif
+
 	return 0;
 }
 
@@ -337,6 +373,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
 		set_dabr(new->thread.dabr);
 
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	/* If new thread DAC (HW breakpoint) is the same then leave it */
+	if (new->thread.dabr)
+		set_dabr(new->thread.dabr);
+#endif
+
 	new_thread = &new->thread;
 	old_thread = &current->thread;
 
@@ -525,6 +567,10 @@ void flush_thread(void)
 	if (current->thread.dabr) {
 		current->thread.dabr = 0;
 		set_dabr(0);
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+		current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W);
+#endif
 	}
 }
 
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 8feb93e7890..a5d0e78779c 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -703,7 +703,7 @@ void user_enable_single_step(struct task_struct *task)
 
 	if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-		task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC;
+		task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
 		regs->msr |= MSR_DE;
 #else
 		regs->msr |= MSR_SE;
@@ -716,9 +716,16 @@ void user_disable_single_step(struct task_struct *task)
 {
 	struct pt_regs *regs = task->thread.regs;
 
+
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+	/* If DAC then do not single step, skip */
+	if (task->thread.dabr)
+		return;
+#endif
+
 	if (regs != NULL) {
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
-		task->thread.dbcr0 = 0;
+		task->thread.dbcr0 &= ~(DBCR0_IC | DBCR0_IDM);
 		regs->msr &= ~MSR_DE;
 #else
 		regs->msr &= ~MSR_SE;
@@ -727,22 +734,75 @@ void user_disable_single_step(struct task_struct *task)
 	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
 }
 
-static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
+int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 			       unsigned long data)
 {
-	/* We only support one DABR and no IABRS at the moment */
+	/* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
+	 *  For embedded processors we support one DAC and no IAC's at the
+	 *  moment.
+	 */
 	if (addr > 0)
 		return -EINVAL;
 
-	/* The bottom 3 bits are flags */
 	if ((data & ~0x7UL) >= TASK_SIZE)
 		return -EIO;
 
-	/* Ensure translation is on */
+#ifdef CONFIG_PPC64
+
+	/* For processors using DABR (i.e. 970), the bottom 3 bits are flags.
+	 *  It was assumed, on previous implementations, that 3 bits were
+	 *  passed together with the data address, fitting the design of the
+	 *  DABR register, as follows:
+	 *
+	 *  bit 0: Read flag
+	 *  bit 1: Write flag
+	 *  bit 2: Breakpoint translation
+	 *
+	 *  Thus, we use them here as so.
+	 */
+
+	/* Ensure breakpoint translation bit is set */
 	if (data && !(data & DABR_TRANSLATION))
 		return -EIO;
 
+	/* Move contents to the DABR register */
 	task->thread.dabr = data;
+
+#endif
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+
+	/* As described above, it was assumed 3 bits were passed with the data
+	 *  address, but we will assume only the mode bits will be passed
+	 *  as to not cause alignment restrictions for DAC-based processors.
+	 */
+
+	/* DAC's hold the whole address without any mode flags */
+	task->thread.dabr = data & ~0x3UL;
+
+	if (task->thread.dabr == 0) {
+		task->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | DBCR0_IDM);
+		task->thread.regs->msr &= ~MSR_DE;
+		return 0;
+	}
+
+	/* Read or Write bits must be set */
+
+	if (!(data & 0x3UL))
+		return -EINVAL;
+
+	/* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
+	   register */
+	task->thread.dbcr0 = DBCR0_IDM;
+
+	/* Check for write and read flags and set DBCR0
+	   accordingly */
+	if (data & 0x1UL)
+		task->thread.dbcr0 |= DBSR_DAC1R;
+	if (data & 0x2UL)
+		task->thread.dbcr0 |= DBSR_DAC1W;
+
+	task->thread.regs->msr |= MSR_DE;
+#endif
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index ad55488939c..7aada783ec6 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -145,8 +145,12 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
 	 * user space. The DABR will have been cleared if it
 	 * triggered inside the kernel.
 	 */
-	if (current->thread.dabr)
+	if (current->thread.dabr) {
 		set_dabr(current->thread.dabr);
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
+		mtspr(SPRN_DBCR0, current->thread.dbcr0);
+#endif
+	}
 
 	if (is32) {
         	if (ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 878fbddb6ae..81ccb8dd1a5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1067,6 +1067,22 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
 		}
 
 		_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+	} else if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
+		regs->msr &= ~MSR_DE;
+
+		if (user_mode(regs)) {
+			current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W |
+								DBCR0_IDM);
+		} else {
+			/* Disable DAC interupts */
+			mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R |
+						DBSR_DAC1W | DBCR0_IDM));
+
+			/* Clear the DAC event */
+			mtspr(SPRN_DBSR, (DBSR_DAC1R | DBSR_DAC1W));
+		}
+		/* Setup and send the trap to the handler */
+		do_dabr(regs, mfspr(SPRN_DAC1), debug_status);
 	}
 }
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 1707d00331f..565b7a237c8 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -100,31 +100,6 @@ static int store_updates_sp(struct pt_regs *regs)
 	return 0;
 }
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-static void do_dabr(struct pt_regs *regs, unsigned long address,
-		    unsigned long error_code)
-{
-	siginfo_t info;
-
-	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
-			11, SIGSEGV) == NOTIFY_STOP)
-		return;
-
-	if (debugger_dabr_match(regs))
-		return;
-
-	/* Clear the DABR */
-	set_dabr(0);
-
-	/* Deliver the signal to userspace */
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code = TRAP_HWBKPT;
-	info.si_addr = (void __user *)address;
-	force_sig_info(SIGTRAP, &info, current);
-}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
-
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
  * for a data fault, SRR1 for an instruction fault. For 400-family processors
diff --git a/include/asm-powerpc/system.h b/include/asm-powerpc/system.h
index e6e25e2364e..d6648c14332 100644
--- a/include/asm-powerpc/system.h
+++ b/include/asm-powerpc/system.h
@@ -110,6 +110,8 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
 extern int set_dabr(unsigned long dabr);
+extern void do_dabr(struct pt_regs *regs, unsigned long address,
+		    unsigned long error_code);
 extern void print_backtrace(unsigned long *);
 extern void show_regs(struct pt_regs * regs);
 extern void flush_instruction_cache(void);
-- 
cgit v1.2.3-70-g09d2


From 7886250e9d71b24d0205ac6798ee855fb3836318 Mon Sep 17 00:00:00 2001
From: Mark Nelson <markn@au1.ibm.com>
Date: Thu, 24 Jul 2008 14:28:48 +1000
Subject: powerpc/cell: Fixed IOMMU mapping uses weak ordering for a pcie
 endpoint

At the moment the fixed mapping is by default strongly ordered (the
iommu_fixed=weak boot option must be used to make the fixed mapping weakly
ordered). If we're on a setup where the southbridge is being used in
endpoint mode (triblade and CAB boards) the default should be a weakly
ordered fixed mapping.

This adds a check so that if a node of type pcie-endpoint can be found in
the device tree the fixed mapping is set to be weak by default (but can be
overridden using iommu_fixed=strong).

Signed-off-by: Mark Nelson <markn@au1.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/cell/iommu.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 208005ca262..031124a8e37 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -1150,12 +1150,23 @@ static int iommu_fixed_disabled;
 
 static int __init setup_iommu_fixed(char *str)
 {
+	struct device_node *pciep;
+
 	if (strcmp(str, "off") == 0)
 		iommu_fixed_disabled = 1;
 
-	else if (strcmp(str, "weak") == 0)
+	/* If we can find a pcie-endpoint in the device tree assume that
+	 * we're on a triblade or a CAB so by default the fixed mapping
+	 * should be set to be weakly ordered; but only if the boot
+	 * option WASN'T set for strong ordering
+	 */
+	pciep = of_find_node_by_type(NULL, "pcie-endpoint");
+
+	if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
 		iommu_fixed_is_weak = 1;
 
+	of_node_put(pciep);
+
 	return 1;
 }
 __setup("iommu_fixed=", setup_iommu_fixed);
-- 
cgit v1.2.3-70-g09d2


From 80c60bf9b96f6108c630d90efc073cd520801e6c Mon Sep 17 00:00:00 2001
From: Segher Boessenkool <segher@kernel.crashing.org>
Date: Fri, 25 Jul 2008 10:08:41 +1000
Subject: powerpc: Fix compile error with binutils 2.15

My previous patch to fix compilation with binutils-2.17 causes
a "file truncated" build error from ld with binutils 2.15 (and
possibly older), and a warning with 2.16 and 2.17.

This fixes it.

Signed-off-by: Segher Boessenkool <segher@kernel.crashing.org>
Acked-by: Chuck Meade <chuckmeade@mindspring.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/vmlinux.lds.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index a914411bced..4a8ce62fe11 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -85,7 +85,7 @@ SECTIONS
 
 	/* The dummy segment contents for the bug workaround mentioned above
 	   near PHDRS.  */
-	.dummy : {
+	.dummy : AT(ADDR(.dummy) - LOAD_OFFSET) {
 		LONG(0xf177)
 	} :kernel :dummy
 
-- 
cgit v1.2.3-70-g09d2


From 545500b307658ad5783e0f3a52a32b97b2dfaed2 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@austin.ibm.com>
Date: Thu, 24 Jul 2008 04:25:00 +1000
Subject: powerpc/pseries: Remove extraneous error reporting for hcall failures
 in lparcfg

Remove the extraneous error reporting used when a hcall made from lparcfg fails.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/lparcfg.c | 32 --------------------------------
 1 file changed, 32 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 827a5726a03..20278ece31d 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -129,33 +129,6 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v)
 /*
  * Methods used to fetch LPAR data when running on a pSeries platform.
  */
-static void log_plpar_hcall_return(unsigned long rc, char *tag)
-{
-	switch(rc) {
-	case 0:
-		return;
-	case H_HARDWARE:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Hardware fault\n", tag);
-		return;
-	case H_FUNCTION:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Function not allowed\n", tag);
-		return;
-	case H_AUTHORITY:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Not authorized to this function\n", tag);
-		return;
-	case H_PARAMETER:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Bad parameter(s)\n",tag);
-		return;
-	default:
-		printk(KERN_INFO "plpar-hcall (%s) "
-				"Unexpected rc(0x%lx)\n", tag, rc);
-	}
-}
-
 /*
  * H_GET_PPP hcall returns info in 4 parms.
  *  entitled_capacity,unallocated_capacity,
@@ -191,8 +164,6 @@ static unsigned int h_get_ppp(unsigned long *entitled,
 	*aggregation = retbuf[2];
 	*resource = retbuf[3];
 
-	log_plpar_hcall_return(rc, "H_GET_PPP");
-
 	return rc;
 }
 
@@ -205,9 +176,6 @@ static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
 
 	*pool_idle_time = retbuf[0];
 	*num_procs = retbuf[1];
-
-	if (rc != H_AUTHORITY)
-		log_plpar_hcall_return(rc, "H_PIC");
 }
 
 #define SPLPAR_CHARACTERISTICS_TOKEN 20
-- 
cgit v1.2.3-70-g09d2


From 11529396ea3190113173f7a15e59a58dbcaa36c8 Mon Sep 17 00:00:00 2001
From: Nathan Fotenot <nfont@austin.ibm.com>
Date: Thu, 24 Jul 2008 04:25:16 +1000
Subject: powerpc/pseries: Split processor entitlement retrieval and gathering
 to helper routines

Split the retrieval and setting of processor entitlement and weight into
helper routines.  This also removes the printing of the raw values
returned from h_get_ppp, the values are already parsed and printed.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/lparcfg.c | 166 ++++++++++++++++++++++--------------------
 1 file changed, 88 insertions(+), 78 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 20278ece31d..a0ca90ab5e3 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -167,7 +167,8 @@ static unsigned int h_get_ppp(unsigned long *entitled,
 	return rc;
 }
 
-static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
+static unsigned h_pic(unsigned long *pool_idle_time,
+		      unsigned long *num_procs)
 {
 	unsigned long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
@@ -176,6 +177,51 @@ static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
 
 	*pool_idle_time = retbuf[0];
 	*num_procs = retbuf[1];
+
+	return rc;
+}
+
+/*
+ * parse_ppp_data
+ * Parse out the data returned from h_get_ppp and h_pic
+ */
+static void parse_ppp_data(struct seq_file *m)
+{
+	unsigned long h_entitled, h_unallocated;
+	unsigned long h_aggregation, h_resource;
+	int rc;
+
+	rc = h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
+		       &h_resource);
+	if (rc)
+		return;
+
+	seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
+	seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
+	seq_printf(m, "system_active_processors=%ld\n",
+		   (h_resource >> 0 * 8) & 0xffff);
+
+	/* pool related entries are apropriate for shared configs */
+	if (lppaca[0].shared_proc) {
+		unsigned long pool_idle_time, pool_procs;
+
+		seq_printf(m, "pool=%ld\n", (h_aggregation >> 0 * 8) & 0xffff);
+
+		/* report pool_capacity in percentage */
+		seq_printf(m, "pool_capacity=%ld\n",
+			   ((h_resource >> 2 * 8) & 0xffff) * 100);
+
+		h_pic(&pool_idle_time, &pool_procs);
+		seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+		seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+	}
+
+	seq_printf(m, "unallocated_capacity_weight=%ld\n",
+		   (h_resource >> 4 * 8) & 0xFF);
+
+	seq_printf(m, "capacity_weight=%ld\n", (h_resource >> 5 * 8) & 0xFF);
+	seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
+	seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
 }
 
 #define SPLPAR_CHARACTERISTICS_TOKEN 20
@@ -302,60 +348,11 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 	partition_active_processors = lparcfg_count_active_processors();
 
 	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-		unsigned long h_entitled, h_unallocated;
-		unsigned long h_aggregation, h_resource;
-		unsigned long pool_idle_time, pool_procs;
-		unsigned long purr;
-
-		h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
-			  &h_resource);
-
-		seq_printf(m, "R4=0x%lx\n", h_entitled);
-		seq_printf(m, "R5=0x%lx\n", h_unallocated);
-		seq_printf(m, "R6=0x%lx\n", h_aggregation);
-		seq_printf(m, "R7=0x%lx\n", h_resource);
-
-		purr = get_purr();
-
 		/* this call handles the ibm,get-system-parameter contents */
 		parse_system_parameter_string(m);
+		parse_ppp_data(m);
 
-		seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
-
-		seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
-
-		seq_printf(m, "system_active_processors=%ld\n",
-			   (h_resource >> 0 * 8) & 0xffff);
-
-		/* pool related entries are apropriate for shared configs */
-		if (lppaca[0].shared_proc) {
-
-			h_pic(&pool_idle_time, &pool_procs);
-
-			seq_printf(m, "pool=%ld\n",
-				   (h_aggregation >> 0 * 8) & 0xffff);
-
-			/* report pool_capacity in percentage */
-			seq_printf(m, "pool_capacity=%ld\n",
-				   ((h_resource >> 2 * 8) & 0xffff) * 100);
-
-			seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
-
-			seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
-		}
-
-		seq_printf(m, "unallocated_capacity_weight=%ld\n",
-			   (h_resource >> 4 * 8) & 0xFF);
-
-		seq_printf(m, "capacity_weight=%ld\n",
-			   (h_resource >> 5 * 8) & 0xFF);
-
-		seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
-
-		seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
-
-		seq_printf(m, "purr=%ld\n", purr);
-
+		seq_printf(m, "purr=%ld\n", get_purr());
 	} else {		/* non SPLPAR case */
 
 		seq_printf(m, "system_active_processors=%d\n",
@@ -382,6 +379,41 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 	return 0;
 }
 
+static ssize_t update_ppp(u64 *entitlement, u8 *weight)
+{
+	unsigned long current_entitled;
+	unsigned long dummy;
+	unsigned long resource;
+	u8 current_weight, new_weight;
+	u64 new_entitled;
+	ssize_t retval;
+
+	/* Get our current parameters */
+	retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
+	if (retval)
+		return retval;
+
+	current_weight = (resource >> 5 * 8) & 0xFF;
+
+	if (entitlement) {
+		new_weight = current_weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = current_entitled;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+		 __FUNCTION__, current_entitled, current_weight);
+
+	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+		 __FUNCTION__, new_entitled, new_weight);
+
+	retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
+	return retval;
+}
+
 /*
  * Interface for changing system parameters (variable capacity weight
  * and entitled capacity).  Format of input is "param_name=value";
@@ -399,12 +431,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 	char *tmp;
 	u64 new_entitled, *new_entitled_ptr = &new_entitled;
 	u8 new_weight, *new_weight_ptr = &new_weight;
-
-	unsigned long current_entitled;	/* parameters for h_get_ppp */
-	unsigned long dummy;
-	unsigned long resource;
-	u8 current_weight;
-
 	ssize_t retval = -ENOMEM;
 
 	if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
@@ -432,33 +458,17 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
 			goto out;
-		new_weight_ptr = &current_weight;
+
+		retval = update_ppp(new_entitled_ptr, NULL);
 	} else if (!strcmp(kbuf, "capacity_weight")) {
 		char *endp;
 		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
 			goto out;
-		new_entitled_ptr = &current_entitled;
-	} else
-		goto out;
 
-	/* Get our current parameters */
-	retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
-	if (retval) {
-		retval = -EIO;
+		retval = update_ppp(NULL, new_weight_ptr);
+	} else
 		goto out;
-	}
-
-	current_weight = (resource >> 5 * 8) & 0xFF;
-
-	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
-		 __func__, current_entitled, current_weight);
-
-	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
-		 __func__, *new_entitled_ptr, *new_weight_ptr);
-
-	retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr,
-				    *new_weight_ptr);
 
 	if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
 		retval = count;
-- 
cgit v1.2.3-70-g09d2


From dfc3403f0e5ffb94ee29942f313b87d4061d951b Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@austin.ibm.com>
Date: Thu, 24 Jul 2008 04:27:30 +1000
Subject: powerpc/pseries: Add memory entitlement capabilities to
 /proc/ppc64/lparcfg

Update /proc/ppc64/lparcfg to display Cooperative Memory
Overcommitment statistics as reported by the H_GET_MPP hcall.  This
also updates the lparcfg interface to allow setting memory entitlement
and weight.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/lparcfg.c | 121 +++++++++++++++++++++++++++++++++++++++++-
 include/asm-powerpc/hvcall.h  |  18 ++++++-
 2 files changed, 137 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index a0ca90ab5e3..86e5b3ed10d 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -35,7 +35,7 @@
 #include <asm/prom.h>
 #include <asm/vdso_datapage.h>
 
-#define MODULE_VERS "1.7"
+#define MODULE_VERS "1.8"
 #define MODULE_NAME "lparcfg"
 
 /* #define LPARCFG_DEBUG */
@@ -129,6 +129,35 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v)
 /*
  * Methods used to fetch LPAR data when running on a pSeries platform.
  */
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
+{
+	int rc;
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+	mpp_data->entitled_mem = retbuf[0];
+	mpp_data->mapped_mem = retbuf[1];
+
+	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	mpp_data->pool_num = retbuf[2] & 0xffff;
+
+	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
+
+	mpp_data->pool_size = retbuf[4];
+	mpp_data->loan_request = retbuf[5];
+	mpp_data->backing_mem = retbuf[6];
+
+	return rc;
+}
+EXPORT_SYMBOL(h_get_mpp);
+
 /*
  * H_GET_PPP hcall returns info in 4 parms.
  *  entitled_capacity,unallocated_capacity,
@@ -224,6 +253,44 @@ static void parse_ppp_data(struct seq_file *m)
 	seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
 }
 
+/**
+ * parse_mpp_data
+ * Parse out data returned from h_get_mpp
+ */
+static void parse_mpp_data(struct seq_file *m)
+{
+	struct hvcall_mpp_data mpp_data;
+	int rc;
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return;
+
+	seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
+
+	if (mpp_data.mapped_mem != -1)
+		seq_printf(m, "mapped_entitled_memory=%ld\n",
+		           mpp_data.mapped_mem);
+
+	seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
+	seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
+
+	seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
+	seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
+	           mpp_data.unallocated_mem_weight);
+	seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
+	           mpp_data.unallocated_entitlement);
+
+	if (mpp_data.pool_size != -1)
+		seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
+		           mpp_data.pool_size);
+
+	seq_printf(m, "entitled_memory_loan_request=%ld\n",
+	           mpp_data.loan_request);
+
+	seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
+}
+
 #define SPLPAR_CHARACTERISTICS_TOKEN 20
 #define SPLPAR_MAXLENGTH 1026*(sizeof(char))
 
@@ -351,6 +418,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 		/* this call handles the ibm,get-system-parameter contents */
 		parse_system_parameter_string(m);
 		parse_ppp_data(m);
+		parse_mpp_data(m);
 
 		seq_printf(m, "purr=%ld\n", get_purr());
 	} else {		/* non SPLPAR case */
@@ -414,6 +482,43 @@ static ssize_t update_ppp(u64 *entitlement, u8 *weight)
 	return retval;
 }
 
+/**
+ * update_mpp
+ *
+ * Update the memory entitlement and weight for the partition.  Caller must
+ * specify either a new entitlement or weight, not both, to be updated
+ * since the h_set_mpp call takes both entitlement and weight as parameters.
+ */
+static ssize_t update_mpp(u64 *entitlement, u8 *weight)
+{
+	struct hvcall_mpp_data mpp_data;
+	u64 new_entitled;
+	u8 new_weight;
+	ssize_t rc;
+
+	rc = h_get_mpp(&mpp_data);
+	if (rc)
+		return rc;
+
+	if (entitlement) {
+		new_weight = mpp_data.mem_weight;
+		new_entitled = *entitlement;
+	} else if (weight) {
+		new_weight = *weight;
+		new_entitled = mpp_data.entitled_mem;
+	} else
+		return -EINVAL;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
+	         __FUNCTION__, mpp_data.entitled_mem, mpp_data.mem_weight);
+
+	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
+	         __FUNCTION__, new_entitled, new_weight);
+
+	rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
+	return rc;
+}
+
 /*
  * Interface for changing system parameters (variable capacity weight
  * and entitled capacity).  Format of input is "param_name=value";
@@ -467,6 +572,20 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 			goto out;
 
 		retval = update_ppp(NULL, new_weight_ptr);
+	} else if (!strcmp(kbuf, "entitled_memory")) {
+		char *endp;
+		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			goto out;
+
+		retval = update_mpp(new_entitled_ptr, NULL);
+	} else if (!strcmp(kbuf, "entitled_memory_weight")) {
+		char *endp;
+		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			goto out;
+
+		retval = update_mpp(NULL, new_weight_ptr);
 	} else
 		goto out;
 
diff --git a/include/asm-powerpc/hvcall.h b/include/asm-powerpc/hvcall.h
index bf6cd7cb996..46e76456cbb 100644
--- a/include/asm-powerpc/hvcall.h
+++ b/include/asm-powerpc/hvcall.h
@@ -210,7 +210,9 @@
 #define H_JOIN			0x298
 #define H_VASI_STATE            0x2A4
 #define H_ENABLE_CRQ		0x2B0
-#define MAX_HCALL_OPCODE	H_ENABLE_CRQ
+#define H_SET_MPP		0x2D0
+#define H_GET_MPP		0x2D4
+#define MAX_HCALL_OPCODE	H_GET_MPP
 
 #ifndef __ASSEMBLY__
 
@@ -270,6 +272,20 @@ struct hcall_stats {
 };
 #define HCALL_STAT_ARRAY_SIZE	((MAX_HCALL_OPCODE >> 2) + 1)
 
+struct hvcall_mpp_data {
+	unsigned long entitled_mem;
+	unsigned long mapped_mem;
+	unsigned short group_num;
+	unsigned short pool_num;
+	unsigned char mem_weight;
+	unsigned char unallocated_mem_weight;
+	unsigned long unallocated_entitlement;  /* value in bytes */
+	unsigned long pool_size;
+	signed long loan_request;
+	unsigned long backing_mem;
+};
+
+int h_get_mpp(struct hvcall_mpp_data *);
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
-- 
cgit v1.2.3-70-g09d2


From 398778f78b76fb72cb18411487af01dea202709e Mon Sep 17 00:00:00 2001
From: Robert Jennings <rcj@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 04:28:05 +1000
Subject: powerpc/pseries: Split retrieval of processor entitlement data into a
 helper routine

Split the retrieval of processor entitlement data returned in the H_GET_PPP
hcall into its own helper routine.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/lparcfg.c | 81 ++++++++++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 35 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 86e5b3ed10d..d82e1fa5ce2 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -158,6 +158,18 @@ int h_get_mpp(struct hvcall_mpp_data *mpp_data)
 }
 EXPORT_SYMBOL(h_get_mpp);
 
+struct hvcall_ppp_data {
+	u64	entitlement;
+	u64	unallocated_entitlement;
+	u16	group_num;
+	u16	pool_num;
+	u8	capped;
+	u8	weight;
+	u8	unallocated_weight;
+	u16	active_procs_in_pool;
+	u16	active_system_procs;
+};
+
 /*
  * H_GET_PPP hcall returns info in 4 parms.
  *  entitled_capacity,unallocated_capacity,
@@ -178,20 +190,24 @@ EXPORT_SYMBOL(h_get_mpp);
  *              XXXX - Active processors in Physical Processor Pool.
  *                  XXXX  - Processors active on platform.
  */
-static unsigned int h_get_ppp(unsigned long *entitled,
-			      unsigned long *unallocated,
-			      unsigned long *aggregation,
-			      unsigned long *resource)
+static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
 {
 	unsigned long rc;
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 
 	rc = plpar_hcall(H_GET_PPP, retbuf);
 
-	*entitled = retbuf[0];
-	*unallocated = retbuf[1];
-	*aggregation = retbuf[2];
-	*resource = retbuf[3];
+	ppp_data->entitlement = retbuf[0];
+	ppp_data->unallocated_entitlement = retbuf[1];
+
+	ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+	ppp_data->pool_num = retbuf[2] & 0xffff;
+
+	ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
+	ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
+	ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
+	ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
+	ppp_data->active_system_procs = retbuf[3] & 0xffff;
 
 	return rc;
 }
@@ -216,41 +232,40 @@ static unsigned h_pic(unsigned long *pool_idle_time,
  */
 static void parse_ppp_data(struct seq_file *m)
 {
-	unsigned long h_entitled, h_unallocated;
-	unsigned long h_aggregation, h_resource;
+	struct hvcall_ppp_data ppp_data;
 	int rc;
 
-	rc = h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
-		       &h_resource);
+	rc = h_get_ppp(&ppp_data);
 	if (rc)
 		return;
 
-	seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
-	seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
-	seq_printf(m, "system_active_processors=%ld\n",
-		   (h_resource >> 0 * 8) & 0xffff);
+	seq_printf(m, "partition_entitled_capacity=%ld\n",
+	           ppp_data.entitlement);
+	seq_printf(m, "group=%d\n", ppp_data.group_num);
+	seq_printf(m, "system_active_processors=%d\n",
+	           ppp_data.active_system_procs);
 
 	/* pool related entries are apropriate for shared configs */
 	if (lppaca[0].shared_proc) {
 		unsigned long pool_idle_time, pool_procs;
 
-		seq_printf(m, "pool=%ld\n", (h_aggregation >> 0 * 8) & 0xffff);
+		seq_printf(m, "pool=%d\n", ppp_data.pool_num);
 
 		/* report pool_capacity in percentage */
-		seq_printf(m, "pool_capacity=%ld\n",
-			   ((h_resource >> 2 * 8) & 0xffff) * 100);
+		seq_printf(m, "pool_capacity=%d\n",
+			   ppp_data.active_procs_in_pool * 100);
 
 		h_pic(&pool_idle_time, &pool_procs);
 		seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
 		seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
 	}
 
-	seq_printf(m, "unallocated_capacity_weight=%ld\n",
-		   (h_resource >> 4 * 8) & 0xFF);
-
-	seq_printf(m, "capacity_weight=%ld\n", (h_resource >> 5 * 8) & 0xFF);
-	seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
-	seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
+	seq_printf(m, "unallocated_capacity_weight=%d\n",
+		   ppp_data.unallocated_weight);
+	seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
+	seq_printf(m, "capped=%d\n", ppp_data.capped);
+	seq_printf(m, "unallocated_capacity=%ld\n",
+		   ppp_data.unallocated_entitlement);
 }
 
 /**
@@ -449,31 +464,27 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 
 static ssize_t update_ppp(u64 *entitlement, u8 *weight)
 {
-	unsigned long current_entitled;
-	unsigned long dummy;
-	unsigned long resource;
-	u8 current_weight, new_weight;
+	struct hvcall_ppp_data ppp_data;
+	u8 new_weight;
 	u64 new_entitled;
 	ssize_t retval;
 
 	/* Get our current parameters */
-	retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
+	retval = h_get_ppp(&ppp_data);
 	if (retval)
 		return retval;
 
-	current_weight = (resource >> 5 * 8) & 0xFF;
-
 	if (entitlement) {
-		new_weight = current_weight;
+		new_weight = ppp_data.weight;
 		new_entitled = *entitlement;
 	} else if (weight) {
 		new_weight = *weight;
-		new_entitled = current_entitled;
+		new_entitled = ppp_data.entitlement;
 	} else
 		return -EINVAL;
 
 	pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
-		 __FUNCTION__, current_entitled, current_weight);
+	         __FUNCTION__, ppp_data.entitlement, ppp_data.weight);
 
 	pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
 		 __FUNCTION__, new_entitled, new_weight);
-- 
cgit v1.2.3-70-g09d2


From e46de429cb954d30a5642fba81d516ede518c65e Mon Sep 17 00:00:00 2001
From: Robert Jennings <rcj@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 04:29:03 +1000
Subject: powerpc/pseries: Enable CMO feature during platform setup

For Cooperative Memory Overcommitment (CMO), set the FW_FEATURE_CMO
flag in powerpc_firmware_features from the rtas ibm,get-system-parameters
table prior to calling iommu_init_early_pSeries.

With this, any CMO specific functionality can be controlled by checking:
 firmware_has_feature(FW_FEATURE_CMO)

Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/setup.c | 71 ++++++++++++++++++++++++++++++++++
 include/asm-powerpc/firmware.h         |  3 +-
 2 files changed, 73 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 90beb444e1d..063a0d2fba3 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -314,6 +314,76 @@ static int pseries_set_xdabr(unsigned long dabr)
 			H_DABRX_KERNEL | H_DABRX_USER);
 }
 
+#define CMO_CHARACTERISTICS_TOKEN 44
+#define CMO_MAXLENGTH 1026
+
+/**
+ * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
+ * handle that here. (Stolen from parse_system_parameter_string)
+ */
+void pSeries_cmo_feature_init(void)
+{
+	char *ptr, *key, *value, *end;
+	int call_status;
+	int PrPSP = -1;
+	int SecPSP = -1;
+
+	pr_debug(" -> fw_cmo_feature_init()\n");
+	spin_lock(&rtas_data_buf_lock);
+	memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				NULL,
+				CMO_CHARACTERISTICS_TOKEN,
+				__pa(rtas_data_buf),
+				RTAS_DATA_BUF_SIZE);
+
+	if (call_status != 0) {
+		spin_unlock(&rtas_data_buf_lock);
+		pr_debug("CMO not available\n");
+		pr_debug(" <- fw_cmo_feature_init()\n");
+		return;
+	}
+
+	end = rtas_data_buf + CMO_MAXLENGTH - 2;
+	ptr = rtas_data_buf + 2;	/* step over strlen value */
+	key = value = ptr;
+
+	while (*ptr && (ptr <= end)) {
+		/* Separate the key and value by replacing '=' with '\0' and
+		 * point the value at the string after the '='
+		 */
+		if (ptr[0] == '=') {
+			ptr[0] = '\0';
+			value = ptr + 1;
+		} else if (ptr[0] == '\0' || ptr[0] == ',') {
+			/* Terminate the string containing the key/value pair */
+			ptr[0] = '\0';
+
+			if (key == value) {
+				pr_debug("Malformed key/value pair\n");
+				/* Never found a '=', end processing */
+				break;
+			}
+
+			if (0 == strcmp(key, "PrPSP"))
+				PrPSP = simple_strtol(value, NULL, 10);
+			else if (0 == strcmp(key, "SecPSP"))
+				SecPSP = simple_strtol(value, NULL, 10);
+			value = key = ptr + 1;
+		}
+		ptr++;
+	}
+
+	if (PrPSP != -1 || SecPSP != -1) {
+		pr_info("CMO enabled\n");
+		pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
+		powerpc_firmware_features |= FW_FEATURE_CMO;
+	} else
+		pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
+	spin_unlock(&rtas_data_buf_lock);
+	pr_debug(" <- fw_cmo_feature_init()\n");
+}
+
 /*
  * Early initialization.  Relocation is on but do not reference unbolted pages
  */
@@ -329,6 +399,7 @@ static void __init pSeries_init_early(void)
 	else if (firmware_has_feature(FW_FEATURE_XDABR))
 		ppc_md.set_dabr = pseries_set_xdabr;
 
+	pSeries_cmo_feature_init();
 	iommu_init_early_pSeries();
 
 	pr_debug(" <- pSeries_init_early()\n");
diff --git a/include/asm-powerpc/firmware.h b/include/asm-powerpc/firmware.h
index ef328995ba9..3a179827528 100644
--- a/include/asm-powerpc/firmware.h
+++ b/include/asm-powerpc/firmware.h
@@ -46,6 +46,7 @@
 #define FW_FEATURE_PS3_LV1	ASM_CONST(0x0000000000800000)
 #define FW_FEATURE_BEAT		ASM_CONST(0x0000000001000000)
 #define FW_FEATURE_BULK_REMOVE	ASM_CONST(0x0000000002000000)
+#define FW_FEATURE_CMO		ASM_CONST(0x0000000004000000)
 
 #ifndef __ASSEMBLY__
 
@@ -58,7 +59,7 @@ enum {
 		FW_FEATURE_MIGRATE | FW_FEATURE_PERFMON | FW_FEATURE_CRQ |
 		FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN |
 		FW_FEATURE_BULK | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE |
-		FW_FEATURE_SPLPAR | FW_FEATURE_LPAR,
+		FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
 	FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES | FW_FEATURE_LPAR,
 	FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES | FW_FEATURE_LPAR,
-- 
cgit v1.2.3-70-g09d2


From 86630a32320f83736c4c24e2c8bae218e4c56c7c Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 04:29:16 +1000
Subject: powerpc/pseries: Utilities to set firmware page state

Newer versions of firmware support page states, which are used by the
collaborative memory manager (future patch) to "loan" pages to the
hypervisor for use by other partitions.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/plpar_wrappers.h | 10 ++++++++++
 include/asm-powerpc/hvcall.h                    |  5 +++++
 2 files changed, 15 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
index d8680b589dc..a437267c6bf 100644
--- a/arch/powerpc/platforms/pseries/plpar_wrappers.h
+++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h
@@ -42,6 +42,16 @@ static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
 	return vpa_call(0x3, cpu, vpa);
 }
 
+static inline long plpar_page_set_loaned(unsigned long vpa)
+{
+	return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa, 0);
+}
+
+static inline long plpar_page_set_active(unsigned long vpa)
+{
+	return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa, 0);
+}
+
 extern void vpa_init(int cpu);
 
 static inline long plpar_pte_enter(unsigned long flags,
diff --git a/include/asm-powerpc/hvcall.h b/include/asm-powerpc/hvcall.h
index 46e76456cbb..fbe2932fa9e 100644
--- a/include/asm-powerpc/hvcall.h
+++ b/include/asm-powerpc/hvcall.h
@@ -92,6 +92,11 @@
 #define H_EXACT			(1UL<<(63-24))	/* Use exact PTE or return H_PTEG_FULL */
 #define H_R_XLATE		(1UL<<(63-25))	/* include a valid logical page num in the pte if the valid bit is set */
 #define H_READ_4		(1UL<<(63-26))	/* Return 4 PTEs */
+#define H_PAGE_STATE_CHANGE	(1UL<<(63-28))
+#define H_PAGE_UNUSED		((1UL<<(63-29)) | (1UL<<(63-30)))
+#define H_PAGE_SET_UNUSED	(H_PAGE_STATE_CHANGE | H_PAGE_UNUSED)
+#define H_PAGE_SET_LOANED	(H_PAGE_SET_UNUSED | (1UL<<(63-31)))
+#define H_PAGE_SET_ACTIVE	H_PAGE_STATE_CHANGE
 #define H_AVPN			(1UL<<(63-32))	/* An avpn is provided as a sanity test */
 #define H_ANDCOND		(1UL<<(63-33))
 #define H_ICACHE_INVALIDATE	(1UL<<(63-40))	/* icbi, etc.  (ignored for IO pages) */
-- 
cgit v1.2.3-70-g09d2


From 84af458bb23bf5f0ba1af4320dd2a57f7c4363e5 Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 04:30:29 +1000
Subject: powerpc/pseries: Add collaborative memory manager

Adds a collaborative memory manager, which acts as a simple balloon driver
for System p machines that support cooperative memory overcommitment
(CMO).

Adds a platform configuration option for CMO called PPC_SMLPAR.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/pseries/Kconfig  |  23 ++
 arch/powerpc/platforms/pseries/Makefile |   1 +
 arch/powerpc/platforms/pseries/cmm.c    | 468 ++++++++++++++++++++++++++++++++
 3 files changed, 492 insertions(+)
 create mode 100644 arch/powerpc/platforms/pseries/cmm.c

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 757c0296e0b..97619fd51e3 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -40,3 +40,26 @@ config PPC_PSERIES_DEBUG
 	depends on PPC_PSERIES && PPC_EARLY_DEBUG
 	bool "Enable extra debug logging in platforms/pseries"
 	default y
+
+config PPC_SMLPAR
+	bool "Support for shared-memory logical partitions"
+	depends on PPC_PSERIES
+	select LPARCFG
+	default n
+	help
+	  Select this option to enable shared memory partition support.
+	  With this option a system running in an LPAR can be given more
+	  memory than physically available and will allow firmware to
+	  balance memory across many LPARs.
+
+config CMM
+	tristate "Collaborative memory management"
+	depends on PPC_SMLPAR
+	default y
+	help
+	  Select this option, if you want to enable the kernel interface
+	  to reduce the memory size of the system. This is accomplished
+	  by allocating pages of memory and put them "on hold". This only
+	  makes sense for a system running in an LPAR where the unused pages
+	  will be reused for other LPARs. The interface allows firmware to
+	  balance memory across many LPARs.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 554c6e42ef2..dfe574af2dc 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
 obj-$(CONFIG_HVCS)		+= hvcserver.o
 obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst.o
 obj-$(CONFIG_PHYP_DUMP)	+= phyp_dump.o
+obj-$(CONFIG_CMM)		+= cmm.o
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
new file mode 100644
index 00000000000..c6b3be03168
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -0,0 +1,468 @@
+/*
+ * Collaborative memory management interface.
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Author(s): Brian King (brking@linux.vnet.ibm.com),
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/sched.h>
+#include <linux/stringify.h>
+#include <linux/swap.h>
+#include <linux/sysdev.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+
+#include "plpar_wrappers.h"
+
+#define CMM_DRIVER_VERSION	"1.0.0"
+#define CMM_DEFAULT_DELAY	1
+#define CMM_DEBUG			0
+#define CMM_DISABLE		0
+#define CMM_OOM_KB		1024
+#define CMM_MIN_MEM_MB		256
+#define KB2PAGES(_p)		((_p)>>(PAGE_SHIFT-10))
+#define PAGES2KB(_p)		((_p)<<(PAGE_SHIFT-10))
+
+static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int oom_kb = CMM_OOM_KB;
+static unsigned int cmm_debug = CMM_DEBUG;
+static unsigned int cmm_disabled = CMM_DISABLE;
+static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
+static struct sys_device cmm_sysdev;
+
+MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(CMM_DRIVER_VERSION);
+
+module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
+		 "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
+		 "[Default=" __stringify(CMM_OOM_KB) "]");
+module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
+		 "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
+module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
+		 "[Default=" __stringify(CMM_DEBUG) "]");
+
+#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
+
+#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
+
+struct cmm_page_array {
+	struct cmm_page_array *next;
+	unsigned long index;
+	unsigned long page[CMM_NR_PAGES];
+};
+
+static unsigned long loaned_pages;
+static unsigned long loaned_pages_target;
+static unsigned long oom_freed_pages;
+
+static struct cmm_page_array *cmm_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
+
+static struct task_struct *cmm_thread_ptr;
+
+/**
+ * cmm_alloc_pages - Allocate pages and mark them as loaned
+ * @nr:	number of pages to allocate
+ *
+ * Return value:
+ * 	number of pages requested to be allocated which were not
+ **/
+static long cmm_alloc_pages(long nr)
+{
+	struct cmm_page_array *pa, *npa;
+	unsigned long addr;
+	long rc;
+
+	cmm_dbg("Begin request for %ld pages\n", nr);
+
+	while (nr) {
+		addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
+				       __GFP_NORETRY | __GFP_NOMEMALLOC);
+		if (!addr)
+			break;
+		spin_lock(&cmm_lock);
+		pa = cmm_page_list;
+		if (!pa || pa->index >= CMM_NR_PAGES) {
+			/* Need a new page for the page list. */
+			spin_unlock(&cmm_lock);
+			npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
+								       __GFP_NORETRY | __GFP_NOMEMALLOC);
+			if (!npa) {
+				pr_info("%s: Can not allocate new page list\n", __FUNCTION__);
+				free_page(addr);
+				break;
+			}
+			spin_lock(&cmm_lock);
+			pa = cmm_page_list;
+
+			if (!pa || pa->index >= CMM_NR_PAGES) {
+				npa->next = pa;
+				npa->index = 0;
+				pa = npa;
+				cmm_page_list = pa;
+			} else
+				free_page((unsigned long) npa);
+		}
+
+		if ((rc = plpar_page_set_loaned(__pa(addr)))) {
+			pr_err("%s: Can not set page to loaned. rc=%ld\n", __FUNCTION__, rc);
+			spin_unlock(&cmm_lock);
+			free_page(addr);
+			break;
+		}
+
+		pa->page[pa->index++] = addr;
+		loaned_pages++;
+		totalram_pages--;
+		spin_unlock(&cmm_lock);
+		nr--;
+	}
+
+	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+	return nr;
+}
+
+/**
+ * cmm_free_pages - Free pages and mark them as active
+ * @nr:	number of pages to free
+ *
+ * Return value:
+ * 	number of pages requested to be freed which were not
+ **/
+static long cmm_free_pages(long nr)
+{
+	struct cmm_page_array *pa;
+	unsigned long addr;
+
+	cmm_dbg("Begin free of %ld pages.\n", nr);
+	spin_lock(&cmm_lock);
+	pa = cmm_page_list;
+	while (nr) {
+		if (!pa || pa->index <= 0)
+			break;
+		addr = pa->page[--pa->index];
+
+		if (pa->index == 0) {
+			pa = pa->next;
+			free_page((unsigned long) cmm_page_list);
+			cmm_page_list = pa;
+		}
+
+		plpar_page_set_active(__pa(addr));
+		free_page(addr);
+		loaned_pages--;
+		nr--;
+		totalram_pages++;
+	}
+	spin_unlock(&cmm_lock);
+	cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+	return nr;
+}
+
+/**
+ * cmm_oom_notify - OOM notifier
+ * @self:	notifier block struct
+ * @dummy:	not used
+ * @parm:	returned - number of pages freed
+ *
+ * Return value:
+ * 	NOTIFY_OK
+ **/
+static int cmm_oom_notify(struct notifier_block *self,
+			  unsigned long dummy, void *parm)
+{
+	unsigned long *freed = parm;
+	long nr = KB2PAGES(oom_kb);
+
+	cmm_dbg("OOM processing started\n");
+	nr = cmm_free_pages(nr);
+	loaned_pages_target = loaned_pages;
+	*freed += KB2PAGES(oom_kb) - nr;
+	oom_freed_pages += KB2PAGES(oom_kb) - nr;
+	cmm_dbg("OOM processing complete\n");
+	return NOTIFY_OK;
+}
+
+/**
+ * cmm_get_mpp - Read memory performance parameters
+ *
+ * Makes hcall to query the current page loan request from the hypervisor.
+ *
+ * Return value:
+ * 	nothing
+ **/
+static void cmm_get_mpp(void)
+{
+	int rc;
+	struct hvcall_mpp_data mpp_data;
+	unsigned long active_pages_target;
+	signed long page_loan_request;
+
+	rc = h_get_mpp(&mpp_data);
+
+	if (rc != H_SUCCESS)
+		return;
+
+	page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
+	loaned_pages_target = page_loan_request + loaned_pages;
+	if (loaned_pages_target > oom_freed_pages)
+		loaned_pages_target -= oom_freed_pages;
+	else
+		loaned_pages_target = 0;
+
+	active_pages_target = totalram_pages + loaned_pages - loaned_pages_target;
+
+	if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE))
+		loaned_pages_target = totalram_pages + loaned_pages -
+			((min_mem_mb * 1024 * 1024) / PAGE_SIZE);
+
+	cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
+		page_loan_request, loaned_pages, loaned_pages_target,
+		oom_freed_pages, totalram_pages);
+}
+
+static struct notifier_block cmm_oom_nb = {
+	.notifier_call = cmm_oom_notify
+};
+
+/**
+ * cmm_thread - CMM task thread
+ * @dummy:	not used
+ *
+ * Return value:
+ * 	0
+ **/
+static int cmm_thread(void *dummy)
+{
+	unsigned long timeleft;
+
+	while (1) {
+		timeleft = msleep_interruptible(delay * 1000);
+
+		if (kthread_should_stop() || timeleft) {
+			loaned_pages_target = loaned_pages;
+			break;
+		}
+
+		cmm_get_mpp();
+
+		if (loaned_pages_target > loaned_pages) {
+			if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
+				loaned_pages_target = loaned_pages;
+		} else if (loaned_pages_target < loaned_pages)
+			cmm_free_pages(loaned_pages - loaned_pages_target);
+	}
+	return 0;
+}
+
+#define CMM_SHOW(name, format, args...)			\
+	static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+	{							\
+		return sprintf(buf, format, ##args);		\
+	}							\
+	static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
+
+CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
+CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
+
+static ssize_t show_oom_pages(struct sys_device *dev, char *buf)
+{
+	return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
+}
+
+static ssize_t store_oom_pages(struct sys_device *dev,
+			       const char *buf, size_t count)
+{
+	unsigned long val = simple_strtoul (buf, NULL, 10);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (val != 0)
+		return -EBADMSG;
+
+	oom_freed_pages = 0;
+	return count;
+}
+
+static SYSDEV_ATTR(oom_freed_kb, S_IWUSR| S_IRUGO,
+		   show_oom_pages, store_oom_pages);
+
+static struct sysdev_attribute *cmm_attrs[] = {
+	&attr_loaned_kb,
+	&attr_loaned_target_kb,
+	&attr_oom_freed_kb,
+};
+
+static struct sysdev_class cmm_sysdev_class = {
+	.name = "cmm",
+};
+
+/**
+ * cmm_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_sysfs_register(struct sys_device *sysdev)
+{
+	int i, rc;
+
+	if ((rc = sysdev_class_register(&cmm_sysdev_class)))
+		return rc;
+
+	sysdev->id = 0;
+	sysdev->cls = &cmm_sysdev_class;
+
+	if ((rc = sysdev_register(sysdev)))
+		goto class_unregister;
+
+	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
+		if ((rc = sysdev_create_file(sysdev, cmm_attrs[i])))
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	while (--i >= 0)
+		sysdev_remove_file(sysdev, cmm_attrs[i]);
+	sysdev_unregister(sysdev);
+class_unregister:
+	sysdev_class_unregister(&cmm_sysdev_class);
+	return rc;
+}
+
+/**
+ * cmm_unregister_sysfs - Unregister from sysfs
+ *
+ **/
+static void cmm_unregister_sysfs(struct sys_device *sysdev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
+		sysdev_remove_file(sysdev, cmm_attrs[i]);
+	sysdev_unregister(sysdev);
+	sysdev_class_unregister(&cmm_sysdev_class);
+}
+
+/**
+ * cmm_init - Module initialization
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_init(void)
+{
+	int rc = -ENOMEM;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return -EOPNOTSUPP;
+
+	if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
+		return rc;
+
+	if ((rc = cmm_sysfs_register(&cmm_sysdev)))
+		goto out_oom_notifier;
+
+	if (cmm_disabled)
+		return rc;
+
+	cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+	if (IS_ERR(cmm_thread_ptr)) {
+		rc = PTR_ERR(cmm_thread_ptr);
+		goto out_unregister_sysfs;
+	}
+
+	return rc;
+
+out_unregister_sysfs:
+	cmm_unregister_sysfs(&cmm_sysdev);
+out_oom_notifier:
+	unregister_oom_notifier(&cmm_oom_nb);
+	return rc;
+}
+
+/**
+ * cmm_exit - Module exit
+ *
+ * Return value:
+ * 	nothing
+ **/
+static void cmm_exit(void)
+{
+	if (cmm_thread_ptr)
+		kthread_stop(cmm_thread_ptr);
+	unregister_oom_notifier(&cmm_oom_nb);
+	cmm_free_pages(loaned_pages);
+	cmm_unregister_sysfs(&cmm_sysdev);
+}
+
+/**
+ * cmm_set_disable - Disable/Enable CMM
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int cmm_set_disable(const char *val, struct kernel_param *kp)
+{
+	int disable = simple_strtoul(val, NULL, 10);
+
+	if (disable != 0 && disable != 1)
+		return -EINVAL;
+
+	if (disable && !cmm_disabled) {
+		if (cmm_thread_ptr)
+			kthread_stop(cmm_thread_ptr);
+		cmm_thread_ptr = NULL;
+		cmm_free_pages(loaned_pages);
+	} else if (!disable && cmm_disabled) {
+		cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+		if (IS_ERR(cmm_thread_ptr))
+			return PTR_ERR(cmm_thread_ptr);
+	}
+
+	cmm_disabled = disable;
+	return 0;
+}
+
+module_param_call(disable, cmm_set_disable, param_get_uint,
+		  &cmm_disabled, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
+		 "[Default=" __stringify(CMM_DISABLE) "]");
+
+module_init(cmm_init);
+module_exit(cmm_exit);
-- 
cgit v1.2.3-70-g09d2


From ffa5abbd0c399b32fc13a1b4718d87ee7a716999 Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 04:30:58 +1000
Subject: powerpc/pseries: Add CMO paging statistics

With the addition of Cooperative Memory Overcommitment (CMO) support
for IBM Power Systems, two fields have been added to the VPA to report
paging statistics.  Add support in lparcfg to report them to userspace.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/lparcfg.c | 20 ++++++++++++++++++++
 include/asm-powerpc/lppaca.h  |  5 ++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index d82e1fa5ce2..848c3e5a637 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -409,6 +409,25 @@ static int lparcfg_count_active_processors(void)
 	return count;
 }
 
+static void pseries_cmo_data(struct seq_file *m)
+{
+	int cpu;
+	unsigned long cmo_faults = 0;
+	unsigned long cmo_fault_time = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		cmo_faults += lppaca[cpu].cmo_faults;
+		cmo_fault_time += lppaca[cpu].cmo_fault_time;
+	}
+
+	seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
+	seq_printf(m, "cmo_fault_time_usec=%lu\n",
+		   cmo_fault_time / tb_ticks_per_usec);
+}
+
 static int pseries_lparcfg_data(struct seq_file *m, void *v)
 {
 	int partition_potential_processors;
@@ -434,6 +453,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 		parse_system_parameter_string(m);
 		parse_ppp_data(m);
 		parse_mpp_data(m);
+		pseries_cmo_data(m);
 
 		seq_printf(m, "purr=%ld\n", get_purr());
 	} else {		/* non SPLPAR case */
diff --git a/include/asm-powerpc/lppaca.h b/include/asm-powerpc/lppaca.h
index 567ed92cd91..2fe268b1033 100644
--- a/include/asm-powerpc/lppaca.h
+++ b/include/asm-powerpc/lppaca.h
@@ -125,7 +125,10 @@ struct lppaca {
 	// NOTE: This value will ALWAYS be zero for dedicated processors and
 	// will NEVER be zero for shared processors (ie, initialized to a 1).
 	volatile u32 yield_count;	// PLIC increments each dispatchx00-x03
-	u8	reserved6[124];		// Reserved                     x04-x7F
+	u32 reserved6;
+	volatile u64 cmo_faults;	// CMO page fault count         x08-x0F
+	volatile u64 cmo_fault_time;	// CMO page fault time          x10-x17
+	u8	reserved7[104];		// Reserved                     x18-x7F
 
 //=============================================================================
 // CACHE_LINE_4-5 0x0180 - 0x027F Contains PMC interrupt data
-- 
cgit v1.2.3-70-g09d2


From 6490c4903d12f242bec4454301f76f6a7520e399 Mon Sep 17 00:00:00 2001
From: Robert Jennings <rcj@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 04:31:16 +1000
Subject: powerpc/pseries: iommu enablement for CMO

To support Cooperative Memory Overcommitment (CMO), we need to check
for failure from some of the tce hcalls.

These changes for the pseries platform affect the powerpc architecture;
patches for the other affected platforms are included in this patch.

pSeries platform IOMMU code changes:
 * platform TCE functions must handle H_NOT_ENOUGH_RESOURCES errors and
   return an error.

Architecture IOMMU code changes:
 * Calls to ppc_md.tce_build need to check return values and return
   DMA_MAPPING_ERROR for transient errors.

Architecture changes:
 * struct machdep_calls for tce_build*_pSeriesLP functions need to change
   to indicate failure.
 * all other platforms will need updates to iommu functions to match the new
   calling semantics; they will return 0 on success.  The other platforms
   default configs have been built, but no further testing was performed.

Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Olof Johansson <olof@lixom.net>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/iommu.c            | 28 +++++++++++++++++++----
 arch/powerpc/platforms/cell/iommu.c    |  3 ++-
 arch/powerpc/platforms/iseries/iommu.c |  3 ++-
 arch/powerpc/platforms/pasemi/iommu.c  |  3 ++-
 arch/powerpc/platforms/pseries/iommu.c | 42 ++++++++++++++++++++++++++--------
 arch/powerpc/sysdev/dart_iommu.c       |  3 ++-
 include/asm-powerpc/machdep.h          |  2 +-
 7 files changed, 64 insertions(+), 20 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2385f68c175..550a19399bf 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -49,6 +49,8 @@ static int novmerge = 1;
 
 static int protect4gb = 1;
 
+static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
+
 static inline unsigned long iommu_num_pages(unsigned long vaddr,
 					    unsigned long slen)
 {
@@ -191,6 +193,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 {
 	unsigned long entry, flags;
 	dma_addr_t ret = DMA_ERROR_CODE;
+	int build_fail;
 
 	spin_lock_irqsave(&(tbl->it_lock), flags);
 
@@ -205,9 +208,21 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
-	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
-			 direction, attrs);
+	build_fail = ppc_md.tce_build(tbl, entry, npages,
+	                              (unsigned long)page & IOMMU_PAGE_MASK,
+	                              direction, attrs);
+
+	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	 * Clean up the table bitmap in this case and return
+	 * DMA_ERROR_CODE. For all other errors the functionality is
+	 * not altered.
+	 */
+	if (unlikely(build_fail)) {
+		__iommu_free(tbl, ret, npages);
 
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return DMA_ERROR_CODE;
+	}
 
 	/* Flush/invalidate TLB caches if necessary */
 	if (ppc_md.tce_flush)
@@ -276,7 +291,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	dma_addr_t dma_next = 0, dma_addr;
 	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
-	int outcount, incount, i;
+	int outcount, incount, i, build_fail = 0;
 	unsigned int align;
 	unsigned long handle;
 	unsigned int max_seg_size;
@@ -337,8 +352,11 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
-		ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK,
-				 direction, attrs);
+		build_fail = ppc_md.tce_build(tbl, entry, npages,
+		                              vaddr & IOMMU_PAGE_MASK,
+		                              direction, attrs);
+		if(unlikely(build_fail))
+			goto failure;
 
 		/* If we are in an open segment, try merging */
 		if (segstart != s) {
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 031124a8e37..e06420af5fe 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -172,7 +172,7 @@ static void invalidate_tce_cache(struct cbe_iommu *iommu, unsigned long *pte,
 	}
 }
 
-static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
+static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs)
 {
@@ -213,6 +213,7 @@ static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
 
 	pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n",
 		 index, npages, direction, base_pte);
+	return 0;
 }
 
 static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c
index bc818e4e203..bb464d1211b 100644
--- a/arch/powerpc/platforms/iseries/iommu.c
+++ b/arch/powerpc/platforms/iseries/iommu.c
@@ -41,7 +41,7 @@
 #include <asm/iseries/hv_call_event.h>
 #include <asm/iseries/iommu.h>
 
-static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
+static int tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs)
 {
@@ -71,6 +71,7 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 		index++;
 		uaddr += TCE_PAGE_SIZE;
 	}
+	return 0;
 }
 
 static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 70541b7a501..a0ff03a3d8d 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -83,7 +83,7 @@ static u32 *iob_l2_base;
 static struct iommu_table iommu_table_iobmap;
 static int iommu_table_iobmap_inited;
 
-static void iobmap_build(struct iommu_table *tbl, long index,
+static int iobmap_build(struct iommu_table *tbl, long index,
 			 long npages, unsigned long uaddr,
 			 enum dma_data_direction direction,
 			 struct dma_attrs *attrs)
@@ -108,6 +108,7 @@ static void iobmap_build(struct iommu_table *tbl, long index,
 		uaddr += IOBMAP_PAGE_SIZE;
 		bus_addr += IOBMAP_PAGE_SIZE;
 	}
+	return 0;
 }
 
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 5377dd4b849..a8c446697f9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -48,7 +48,7 @@
 #include "plpar_wrappers.h"
 
 
-static void tce_build_pSeries(struct iommu_table *tbl, long index,
+static int tce_build_pSeries(struct iommu_table *tbl, long index,
 			      long npages, unsigned long uaddr,
 			      enum dma_data_direction direction,
 			      struct dma_attrs *attrs)
@@ -72,6 +72,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index,
 		uaddr += TCE_PAGE_SIZE;
 		tcep++;
 	}
+	return 0;
 }
 
 
@@ -94,14 +95,19 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
 	return *tcep;
 }
 
-static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static void tce_free_pSeriesLP(struct iommu_table*, long, long);
+static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
+
+static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				long npages, unsigned long uaddr,
 				enum dma_data_direction direction,
 				struct dma_attrs *attrs)
 {
-	u64 rc;
+	u64 rc = 0;
 	u64 proto_tce, tce;
 	u64 rpn;
+	int ret = 0;
+	long tcenum_start = tcenum, npages_start = npages;
 
 	rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
 	proto_tce = TCE_PCI_READ;
@@ -112,6 +118,13 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
 		rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
 
+		if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+			ret = (int)rc;
+			tce_free_pSeriesLP(tbl, tcenum_start,
+			                   (npages_start - (npages + 1)));
+			break;
+		}
+
 		if (rc && printk_ratelimit()) {
 			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
 			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
@@ -123,25 +136,27 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcenum++;
 		rpn++;
 	}
+	return ret;
 }
 
 static DEFINE_PER_CPU(u64 *, tce_page) = NULL;
 
-static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 				     long npages, unsigned long uaddr,
 				     enum dma_data_direction direction,
 				     struct dma_attrs *attrs)
 {
-	u64 rc;
+	u64 rc = 0;
 	u64 proto_tce;
 	u64 *tcep;
 	u64 rpn;
 	long l, limit;
+	long tcenum_start = tcenum, npages_start = npages;
+	int ret = 0;
 
 	if (npages == 1) {
-		tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
-				    direction, attrs);
-		return;
+		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+		                           direction, attrs);
 	}
 
 	tcep = __get_cpu_var(tce_page);
@@ -153,9 +168,8 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcep = (u64 *)__get_free_page(GFP_ATOMIC);
 		/* If allocation fails, fall back to the loop implementation */
 		if (!tcep) {
-			tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+			return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 					    direction, attrs);
-			return;
 		}
 		__get_cpu_var(tce_page) = tcep;
 	}
@@ -187,6 +201,13 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		tcenum += limit;
 	} while (npages > 0 && !rc);
 
+	if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+		ret = (int)rc;
+		tce_freemulti_pSeriesLP(tbl, tcenum_start,
+		                        (npages_start - (npages + limit)));
+		return ret;
+	}
+
 	if (rc && printk_ratelimit()) {
 		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
 		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
@@ -194,6 +215,7 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 		printk("\ttce[0] val = 0x%lx\n", tcep[0]);
 		show_stack(current, (unsigned long *)__get_SP());
 	}
+	return ret;
 }
 
 static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index de8c8b542cf..89639ecbf38 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -147,7 +147,7 @@ static void dart_flush(struct iommu_table *tbl)
 	}
 }
 
-static void dart_build(struct iommu_table *tbl, long index,
+static int dart_build(struct iommu_table *tbl, long index,
 		       long npages, unsigned long uaddr,
 		       enum dma_data_direction direction,
 		       struct dma_attrs *attrs)
@@ -184,6 +184,7 @@ static void dart_build(struct iommu_table *tbl, long index,
 	} else {
 		dart_dirty = 1;
 	}
+	return 0;
 }
 
 
diff --git a/include/asm-powerpc/machdep.h b/include/asm-powerpc/machdep.h
index 1233d735fd2..893aafd87fd 100644
--- a/include/asm-powerpc/machdep.h
+++ b/include/asm-powerpc/machdep.h
@@ -76,7 +76,7 @@ struct machdep_calls {
 	 * destroyed as well */
 	void		(*hpte_clear_all)(void);
 
-	void		(*tce_build)(struct iommu_table * tbl,
+	int		(*tce_build)(struct iommu_table *tbl,
 				     long index,
 				     long npages,
 				     unsigned long uaddr,
-- 
cgit v1.2.3-70-g09d2


From a90ab95a9576d35de0d05f9f4fc435edcccafaa9 Mon Sep 17 00:00:00 2001
From: Robert Jennings <rcj@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 04:31:33 +1000
Subject: powerpc/pseries: vio bus support for CMO

This is a large patch but the normal code path is not affected.  For
non-pSeries platforms the code is ifdef'ed out and for non-CMO enabled
pSeries systems this does not affect the normal code path.  Devices that
do not perform DMA operations do not need modification with this patch.
The function get_desired_dma was renamed from get_io_entitlement for
clarity.

Overview

Cooperative Memory Overcommitment (CMO) allows for a set of OS partitions
to be run with less RAM than the aggregate needs of the group of
partitions.  The firmware will balance memory between the partitions
and page in/out memory as needed.  Based on the number and type of IO
adpaters preset each partition is allocated an amount of memory for
DMA operations and this allocation will be guaranteed to the partition;
this is referred to as the partition's 'entitlement'.

Partitions running in a CMO environment can only have virtual IO devices
present.  The VIO bus layer will manage the IO entitlement for the system.
Accounting, at a system and per-device level, is tracked in the VIO bus
code and exposed via sysfs.  A set of dma_ops functions are added to
the bus to allow for this accounting.

Bus initialization

At initialization, the bus will calculate the minimum needs of the system
based on providing each device present with a standard minimum entitlement
along with a spare allocation for the bus to handle hotplug events.
If the minimum needs can not be met the system boot will be halted.

Device changes

The significant changes for devices while running under CMO are that the
devices must specify how much dedicated IO entitlement they desire and
must also handle DMA mapping errors that can occur due to constrained
IO memory.  The virtual IO drivers are modified to silence errors when
DMA mappings fail for CMO and handle these failures gracefully.

Each devices will be guaranteed a minimum entitlement that can always
be mapped.  Devices will specify how much entitlement they desire and
the VIO bus will attempt to provide for this.  Devices can change their
desired entitlement level at any point in time to address particular needs
(via vio_cmo_set_dev_desired()), not just at device probe time.

VIO bus changes

The system will have a particular entitlement level available from which
it can provide memory to the devices.  The bus defines two pools of memory
within this entitlement, the reserved and excess pools.  Each device is
provided with it's own entitlement no less than a system defined minimum
entitlement and no greater than what the device has specified as it's
desired entitlement.  The entitlement provided to devices comes from the
reserve pool.  The reserve pool can also contain a spare allocation as
large as the system defined minimum entitlement which is used for device
hotplug events.  Any entitlement not needed to fulfill the needs of a
reserve pool is placed in the excess pool.  Each device is guaranteed
that it can map up to it's entitled level; additional mapping are possible
as long as there is unmapped memory in the excess pool.

Bus probe

As the system starts, each device is given an entitlement equal only
to the system defined minimum entitlement.  The reserve pool is equal
to the sum of these entitlements, plus a spare allocation.  The VIO bus
also tracks the aggregate desired entitlement of all the devices.  If the
system desired entitlement is greater than the size of the reserve pool,
when devices unmap IO memory it will be reserved and a balance operation
will be scheduled for some time in the future.

Entitlement balancing

The balance function tries to fairly distribute entitlement between the
devices in the system with the goal of providing each device with it's
desired amount of entitlement.  Devices using more than what would be
ideal will have their entitled set-point adjusted; this will effectively
set a goal for lower IO memory usage as future mappings can fail and
deallocations will trigger a balance operation to distribute the newly
unmapped memory.  A fair distribution of entitlement can take several
balance operations to achieve.  Entitlement changes and device DLPAR
events will alter the state of CMO and will trigger balance operations.

Hotplug events

The VIO bus allows for changes in system entitlement at run-time via
'vio_cmo_entitlement_update()'.  When devices are added the hotplug
device event will be preceded by a system entitlement increase and this
is reversed when devices are removed.

The following changes are made that the VIO bus layer for CMO:
 * add IO memory accounting per device structure.
 * add IO memory entitlement query function to driver structure.
 * during vio bus probe, if CMO is enabled, check that driver has
   memory entitlement query function defined.  Fail if function not defined.
 * fail to register driver if io entitlement function not defined.
 * create set of dma_ops at vio level for CMO that will track allocations
   and return DMA failures once entitlement is reached.  Entitlement will
   limited by overall system entitlement.  Devices will have a reserved
   quantity of memory that is guaranteed, the rest can be used as available.
 * expose entitlement, current allocation, desired allocation, and the
   allocation error counter for devices to the user through sysfs
 * provide mechanism for changing a device's desired entitlement at run time
   for devices as an exported function and sysfs tunable
 * track any DMA failures for entitled IO memory for each vio device.
 * check entitlement against available system entitlement on device add
 * track entitlement metrics (high water mark, current usage)
 * provide function to reset high water mark
 * provide minimum and desired entitlement numbers at a bus level
 * provide drivers with a minimum guaranteed entitlement
 * balance available entitlement between devices to satisfy their needs
 * handle system entitlement changes and device hotplug

Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/vio.c | 1033 ++++++++++++++++++++++++++++++++++++++++++++-
 include/asm-powerpc/vio.h |   27 +-
 2 files changed, 1052 insertions(+), 8 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b77f8af7ddd..ade8aeaa2e7 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1,11 +1,12 @@
 /*
  * IBM PowerPC Virtual I/O Infrastructure Support.
  *
- *    Copyright (c) 2003-2005 IBM Corp.
+ *    Copyright (c) 2003,2008 IBM Corp.
  *     Dave Engebretsen engebret@us.ibm.com
  *     Santiago Leon santil@us.ibm.com
  *     Hollis Blanchard <hollisb@us.ibm.com>
  *     Stephen Rothwell
+ *     Robert Jennings <rcjenn@us.ibm.com>
  *
  *      This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -46,6 +47,996 @@ static struct vio_dev vio_bus_device  = { /* fake "parent" device */
 	.dev.bus = &vio_bus_type,
 };
 
+#ifdef CONFIG_PPC_SMLPAR
+/**
+ * vio_cmo_pool - A pool of IO memory for CMO use
+ *
+ * @size: The size of the pool in bytes
+ * @free: The amount of free memory in the pool
+ */
+struct vio_cmo_pool {
+	size_t size;
+	size_t free;
+};
+
+/* How many ms to delay queued balance work */
+#define VIO_CMO_BALANCE_DELAY 100
+
+/* Portion out IO memory to CMO devices by this chunk size */
+#define VIO_CMO_BALANCE_CHUNK 131072
+
+/**
+ * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
+ *
+ * @vio_dev: struct vio_dev pointer
+ * @list: pointer to other devices on bus that are being tracked
+ */
+struct vio_cmo_dev_entry {
+	struct vio_dev *viodev;
+	struct list_head list;
+};
+
+/**
+ * vio_cmo - VIO bus accounting structure for CMO entitlement
+ *
+ * @lock: spinlock for entire structure
+ * @balance_q: work queue for balancing system entitlement
+ * @device_list: list of CMO-enabled devices requiring entitlement
+ * @entitled: total system entitlement in bytes
+ * @reserve: pool of memory from which devices reserve entitlement, incl. spare
+ * @excess: pool of excess entitlement not needed for device reserves or spare
+ * @spare: IO memory for device hotplug functionality
+ * @min: minimum necessary for system operation
+ * @desired: desired memory for system operation
+ * @curr: bytes currently allocated
+ * @high: high water mark for IO data usage
+ */
+struct vio_cmo {
+	spinlock_t lock;
+	struct delayed_work balance_q;
+	struct list_head device_list;
+	size_t entitled;
+	struct vio_cmo_pool reserve;
+	struct vio_cmo_pool excess;
+	size_t spare;
+	size_t min;
+	size_t desired;
+	size_t curr;
+	size_t high;
+} vio_cmo;
+
+/**
+ * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows
+ */
+static int vio_cmo_num_OF_devs(void)
+{
+	struct device_node *node_vroot;
+	int count = 0;
+
+	/*
+	 * Count the number of vdevice entries with an
+	 * ibm,my-dma-window OF property
+	 */
+	node_vroot = of_find_node_by_name(NULL, "vdevice");
+	if (node_vroot) {
+		struct device_node *of_node;
+		struct property *prop;
+
+		for_each_child_of_node(node_vroot, of_node) {
+			prop = of_find_property(of_node, "ibm,my-dma-window",
+			                       NULL);
+			if (prop)
+				count++;
+		}
+	}
+	of_node_put(node_vroot);
+	return count;
+}
+
+/**
+ * vio_cmo_alloc - allocate IO memory for CMO-enable devices
+ *
+ * @viodev: VIO device requesting IO memory
+ * @size: size of allocation requested
+ *
+ * Allocations come from memory reserved for the devices and any excess
+ * IO memory available to all devices.  The spare pool used to service
+ * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
+ * made available.
+ *
+ * Return codes:
+ *  0 for successful allocation and -ENOMEM for a failure
+ */
+static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
+{
+	unsigned long flags;
+	size_t reserve_free = 0;
+	size_t excess_free = 0;
+	int ret = -ENOMEM;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Determine the amount of free entitlement available in reserve */
+	if (viodev->cmo.entitled > viodev->cmo.allocated)
+		reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
+
+	/* If spare is not fulfilled, the excess pool can not be used. */
+	if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
+		excess_free = vio_cmo.excess.free;
+
+	/* The request can be satisfied */
+	if ((reserve_free + excess_free) >= size) {
+		vio_cmo.curr += size;
+		if (vio_cmo.curr > vio_cmo.high)
+			vio_cmo.high = vio_cmo.curr;
+		viodev->cmo.allocated += size;
+		size -= min(reserve_free, size);
+		vio_cmo.excess.free -= size;
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return ret;
+}
+
+/**
+ * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices
+ * @viodev: VIO device freeing IO memory
+ * @size: size of deallocation
+ *
+ * IO memory is freed by the device back to the correct memory pools.
+ * The spare pool is replenished first from either memory pool, then
+ * the reserve pool is used to reduce device entitlement, the excess
+ * pool is used to increase the reserve pool toward the desired entitlement
+ * target, and then the remaining memory is returned to the pools.
+ *
+ */
+static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
+{
+	unsigned long flags;
+	size_t spare_needed = 0;
+	size_t excess_freed = 0;
+	size_t reserve_freed = size;
+	size_t tmp;
+	int balance = 0;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	vio_cmo.curr -= size;
+
+	/* Amount of memory freed from the excess pool */
+	if (viodev->cmo.allocated > viodev->cmo.entitled) {
+		excess_freed = min(reserve_freed, (viodev->cmo.allocated -
+		                                   viodev->cmo.entitled));
+		reserve_freed -= excess_freed;
+	}
+
+	/* Remove allocation from device */
+	viodev->cmo.allocated -= (reserve_freed + excess_freed);
+
+	/* Spare is a subset of the reserve pool, replenish it first. */
+	spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
+
+	/*
+	 * Replenish the spare in the reserve pool from the excess pool.
+	 * This moves entitlement into the reserve pool.
+	 */
+	if (spare_needed && excess_freed) {
+		tmp = min(excess_freed, spare_needed);
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+		vio_cmo.spare += tmp;
+		excess_freed -= tmp;
+		spare_needed -= tmp;
+		balance = 1;
+	}
+
+	/*
+	 * Replenish the spare in the reserve pool from the reserve pool.
+	 * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
+	 * if needed, and gives it to the spare pool. The amount of used
+	 * memory in this pool does not change.
+	 */
+	if (spare_needed && reserve_freed) {
+		tmp = min(spare_needed, min(reserve_freed,
+		                            (viodev->cmo.entitled -
+		                             VIO_CMO_MIN_ENT)));
+
+		vio_cmo.spare += tmp;
+		viodev->cmo.entitled -= tmp;
+		reserve_freed -= tmp;
+		spare_needed -= tmp;
+		balance = 1;
+	}
+
+	/*
+	 * Increase the reserve pool until the desired allocation is met.
+	 * Move an allocation freed from the excess pool into the reserve
+	 * pool and schedule a balance operation.
+	 */
+	if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
+		tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
+
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+		excess_freed -= tmp;
+		balance = 1;
+	}
+
+	/* Return memory from the excess pool to that pool */
+	if (excess_freed)
+		vio_cmo.excess.free += excess_freed;
+
+	if (balance)
+		schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_entitlement_update - Manage system entitlement changes
+ *
+ * @new_entitlement: new system entitlement to attempt to accommodate
+ *
+ * Increases in entitlement will be used to fulfill the spare entitlement
+ * and the rest is given to the excess pool.  Decreases, if they are
+ * possible, come from the excess pool and from unused device entitlement
+ *
+ * Returns: 0 on success, -ENOMEM when change can not be made
+ */
+int vio_cmo_entitlement_update(size_t new_entitlement)
+{
+	struct vio_dev *viodev;
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t avail, delta, tmp;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Entitlement increases */
+	if (new_entitlement > vio_cmo.entitled) {
+		delta = new_entitlement - vio_cmo.entitled;
+
+		/* Fulfill spare allocation */
+		if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
+			tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
+			vio_cmo.spare += tmp;
+			vio_cmo.reserve.size += tmp;
+			delta -= tmp;
+		}
+
+		/* Remaining new allocation goes to the excess pool */
+		vio_cmo.entitled += delta;
+		vio_cmo.excess.size += delta;
+		vio_cmo.excess.free += delta;
+
+		goto out;
+	}
+
+	/* Entitlement decreases */
+	delta = vio_cmo.entitled - new_entitlement;
+	avail = vio_cmo.excess.free;
+
+	/*
+	 * Need to check how much unused entitlement each device can
+	 * sacrifice to fulfill entitlement change.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		if (avail >= delta)
+			break;
+
+		viodev = dev_ent->viodev;
+		if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+		    (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+				avail += viodev->cmo.entitled -
+				         max_t(size_t, viodev->cmo.allocated,
+				               VIO_CMO_MIN_ENT);
+	}
+
+	if (delta <= avail) {
+		vio_cmo.entitled -= delta;
+
+		/* Take entitlement from the excess pool first */
+		tmp = min(vio_cmo.excess.free, delta);
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.excess.free -= tmp;
+		delta -= tmp;
+
+		/*
+		 * Remove all but VIO_CMO_MIN_ENT bytes from devices
+		 * until entitlement change is served
+		 */
+		list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+			if (!delta)
+				break;
+
+			viodev = dev_ent->viodev;
+			tmp = 0;
+			if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+			    (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+				tmp = viodev->cmo.entitled -
+				      max_t(size_t, viodev->cmo.allocated,
+				            VIO_CMO_MIN_ENT);
+			viodev->cmo.entitled -= min(tmp, delta);
+			delta -= min(tmp, delta);
+		}
+	} else {
+		spin_unlock_irqrestore(&vio_cmo.lock, flags);
+		return -ENOMEM;
+	}
+
+out:
+	schedule_delayed_work(&vio_cmo.balance_q, 0);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return 0;
+}
+
+/**
+ * vio_cmo_balance - Balance entitlement among devices
+ *
+ * @work: work queue structure for this operation
+ *
+ * Any system entitlement above the minimum needed for devices, or
+ * already allocated to devices, can be distributed to the devices.
+ * The list of devices is iterated through to recalculate the desired
+ * entitlement level and to determine how much entitlement above the
+ * minimum entitlement is allocated to devices.
+ *
+ * Small chunks of the available entitlement are given to devices until
+ * their requirements are fulfilled or there is no entitlement left to give.
+ * Upon completion sizes of the reserve and excess pools are calculated.
+ *
+ * The system minimum entitlement level is also recalculated here.
+ * Entitlement will be reserved for devices even after vio_bus_remove to
+ * accommodate reloading the driver.  The OF tree is walked to count the
+ * number of devices present and this will remove entitlement for devices
+ * that have actually left the system after having vio_bus_remove called.
+ */
+static void vio_cmo_balance(struct work_struct *work)
+{
+	struct vio_cmo *cmo;
+	struct vio_dev *viodev;
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t avail = 0, level, chunk, need;
+	int devcount = 0, fulfilled;
+
+	cmo = container_of(work, struct vio_cmo, balance_q.work);
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+
+	/* Calculate minimum entitlement and fulfill spare */
+	cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
+	BUG_ON(cmo->min > cmo->entitled);
+	cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
+	cmo->min += cmo->spare;
+	cmo->desired = cmo->min;
+
+	/*
+	 * Determine how much entitlement is available and reset device
+	 * entitlements
+	 */
+	avail = cmo->entitled - cmo->spare;
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		viodev = dev_ent->viodev;
+		devcount++;
+		viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+		cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+		avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
+	}
+
+	/*
+	 * Having provided each device with the minimum entitlement, loop
+	 * over the devices portioning out the remaining entitlement
+	 * until there is nothing left.
+	 */
+	level = VIO_CMO_MIN_ENT;
+	while (avail) {
+		fulfilled = 0;
+		list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+			viodev = dev_ent->viodev;
+
+			if (viodev->cmo.desired <= level) {
+				fulfilled++;
+				continue;
+			}
+
+			/*
+			 * Give the device up to VIO_CMO_BALANCE_CHUNK
+			 * bytes of entitlement, but do not exceed the
+			 * desired level of entitlement for the device.
+			 */
+			chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
+			chunk = min(chunk, (viodev->cmo.desired -
+			                    viodev->cmo.entitled));
+			viodev->cmo.entitled += chunk;
+
+			/*
+			 * If the memory for this entitlement increase was
+			 * already allocated to the device it does not come
+			 * from the available pool being portioned out.
+			 */
+			need = max(viodev->cmo.allocated, viodev->cmo.entitled)-
+			       max(viodev->cmo.allocated, level);
+			avail -= need;
+
+		}
+		if (fulfilled == devcount)
+			break;
+		level += VIO_CMO_BALANCE_CHUNK;
+	}
+
+	/* Calculate new reserve and excess pool sizes */
+	cmo->reserve.size = cmo->min;
+	cmo->excess.free = 0;
+	cmo->excess.size = 0;
+	need = 0;
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+		viodev = dev_ent->viodev;
+		/* Calculated reserve size above the minimum entitlement */
+		if (viodev->cmo.entitled)
+			cmo->reserve.size += (viodev->cmo.entitled -
+			                      VIO_CMO_MIN_ENT);
+		/* Calculated used excess entitlement */
+		if (viodev->cmo.allocated > viodev->cmo.entitled)
+			need += viodev->cmo.allocated - viodev->cmo.entitled;
+	}
+	cmo->excess.size = cmo->entitled - cmo->reserve.size;
+	cmo->excess.free = cmo->excess.size - need;
+
+	cancel_delayed_work(container_of(work, struct delayed_work, work));
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
+                                          dma_addr_t *dma_handle, gfp_t flag)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	void *ret;
+
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return NULL;
+	}
+
+	ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag);
+	if (unlikely(ret == NULL)) {
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	return ret;
+}
+
+static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
+                                        void *vaddr, dma_addr_t dma_handle)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+
+	dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle);
+
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr,
+                                           size_t size,
+                                           enum dma_data_direction direction,
+                                           struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	dma_addr_t ret = DMA_ERROR_CODE;
+
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return ret;
+	}
+
+	ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs);
+	if (unlikely(dma_mapping_error(ret))) {
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	return ret;
+}
+
+static void vio_dma_iommu_unmap_single(struct device *dev,
+		dma_addr_t dma_handle, size_t size,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+
+	dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs);
+
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+}
+
+static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+                                int nelems, enum dma_data_direction direction,
+                                struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct scatterlist *sgl;
+	int ret, count = 0;
+	size_t alloc_size = 0;
+
+	for (sgl = sglist; count < nelems; count++, sgl++)
+		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+
+	if (vio_cmo_alloc(viodev, alloc_size)) {
+		atomic_inc(&viodev->cmo.allocs_failed);
+		return 0;
+	}
+
+	ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
+
+	if (unlikely(!ret)) {
+		vio_cmo_dealloc(viodev, alloc_size);
+		atomic_inc(&viodev->cmo.allocs_failed);
+	}
+
+	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+	if (alloc_size)
+		vio_cmo_dealloc(viodev, alloc_size);
+
+	return ret;
+}
+
+static void vio_dma_iommu_unmap_sg(struct device *dev,
+		struct scatterlist *sglist, int nelems,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct scatterlist *sgl;
+	size_t alloc_size = 0;
+	int count = 0;
+
+	for (sgl = sglist; count < nelems; count++, sgl++)
+		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+
+	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
+
+	vio_cmo_dealloc(viodev, alloc_size);
+}
+
+struct dma_mapping_ops vio_dma_mapping_ops = {
+	.alloc_coherent = vio_dma_iommu_alloc_coherent,
+	.free_coherent  = vio_dma_iommu_free_coherent,
+	.map_single     = vio_dma_iommu_map_single,
+	.unmap_single   = vio_dma_iommu_unmap_single,
+	.map_sg         = vio_dma_iommu_map_sg,
+	.unmap_sg       = vio_dma_iommu_unmap_sg,
+};
+
+/**
+ * vio_cmo_set_dev_desired - Set desired entitlement for a device
+ *
+ * @viodev: struct vio_dev for device to alter
+ * @new_desired: new desired entitlement level in bytes
+ *
+ * For use by devices to request a change to their entitlement at runtime or
+ * through sysfs.  The desired entitlement level is changed and a balancing
+ * of system resources is scheduled to run in the future.
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+	unsigned long flags;
+	struct vio_cmo_dev_entry *dev_ent;
+	int found = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (desired < VIO_CMO_MIN_ENT)
+		desired = VIO_CMO_MIN_ENT;
+
+	/*
+	 * Changes will not be made for devices not in the device list.
+	 * If it is not in the device list, then no driver is loaded
+	 * for the device and it can not receive entitlement.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			found = 1;
+			break;
+		}
+	if (!found)
+		return;
+
+	/* Increase/decrease in desired device entitlement */
+	if (desired >= viodev->cmo.desired) {
+		/* Just bump the bus and device values prior to a balance*/
+		vio_cmo.desired += desired - viodev->cmo.desired;
+		viodev->cmo.desired = desired;
+	} else {
+		/* Decrease bus and device values for desired entitlement */
+		vio_cmo.desired -= viodev->cmo.desired - desired;
+		viodev->cmo.desired = desired;
+		/*
+		 * If less entitlement is desired than current entitlement, move
+		 * any reserve memory in the change region to the excess pool.
+		 */
+		if (viodev->cmo.entitled > desired) {
+			vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+			vio_cmo.excess.size += viodev->cmo.entitled - desired;
+			/*
+			 * If entitlement moving from the reserve pool to the
+			 * excess pool is currently unused, add to the excess
+			 * free counter.
+			 */
+			if (viodev->cmo.allocated < viodev->cmo.entitled)
+				vio_cmo.excess.free += viodev->cmo.entitled -
+				                       max(viodev->cmo.allocated, desired);
+			viodev->cmo.entitled = desired;
+		}
+	}
+	schedule_delayed_work(&vio_cmo.balance_q, 0);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Determine the devices IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ *          -ENOMEM when entitlement is not available for device or
+ *          device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+	struct vio_cmo_dev_entry *dev_ent;
+	struct device *dev = &viodev->dev;
+	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	unsigned long flags;
+	size_t size;
+
+	/*
+	 * Check to see that device has a DMA window and configure
+	 * entitlement for the device.
+	 */
+	if (of_get_property(viodev->dev.archdata.of_node,
+	                    "ibm,my-dma-window", NULL)) {
+		/* Check that the driver is CMO enabled and get desired DMA */
+		if (!viodrv->get_desired_dma) {
+			dev_err(dev, "%s: device driver does not support CMO\n",
+			        __func__);
+			return -EINVAL;
+		}
+
+		viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+		if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+			viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		size = VIO_CMO_MIN_ENT;
+
+		dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+		                  GFP_KERNEL);
+		if (!dev_ent)
+			return -ENOMEM;
+
+		dev_ent->viodev = viodev;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+		list_add(&dev_ent->list, &vio_cmo.device_list);
+	} else {
+		viodev->cmo.desired = 0;
+		size = 0;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+	}
+
+	/*
+	 * If the needs for vio_cmo.min have not changed since they
+	 * were last set, the number of devices in the OF tree has
+	 * been constant and the IO memory for this is already in
+	 * the reserve pool.
+	 */
+	if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+	                    VIO_CMO_MIN_ENT)) {
+		/* Updated desired entitlement if device requires it */
+		if (size)
+			vio_cmo.desired += (viodev->cmo.desired -
+		                        VIO_CMO_MIN_ENT);
+	} else {
+		size_t tmp;
+
+		tmp = vio_cmo.spare + vio_cmo.excess.free;
+		if (tmp < size) {
+			dev_err(dev, "%s: insufficient free "
+			        "entitlement to add device. "
+			        "Need %lu, have %lu\n", __func__,
+				size, (vio_cmo.spare + tmp));
+			spin_unlock_irqrestore(&vio_cmo.lock, flags);
+			return -ENOMEM;
+		}
+
+		/* Use excess pool first to fulfill request */
+		tmp = min(size, vio_cmo.excess.free);
+		vio_cmo.excess.free -= tmp;
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+
+		/* Use spare if excess pool was insufficient */
+		vio_cmo.spare -= size - tmp;
+
+		/* Update bus accounting */
+		vio_cmo.min += size;
+		vio_cmo.desired += viodev->cmo.desired;
+	}
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return 0;
+}
+
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list.  The minimum entitlement
+ * will be reserved for the device as long as it is in the system.  The
+ * rest of the entitlement the device had been allocated will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t tmp;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (viodev->cmo.allocated) {
+		dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+		        "allocated after remove operation.\n",
+		        __func__, viodev->cmo.allocated);
+		BUG();
+	}
+
+	/*
+	 * Remove the device from the device list being maintained for
+	 * CMO enabled devices.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			list_del(&dev_ent->list);
+			kfree(dev_ent);
+			break;
+		}
+
+	/*
+	 * Devices may not require any entitlement and they do not need
+	 * to be processed.  Otherwise, return the device's entitlement
+	 * back to the pools.
+	 */
+	if (viodev->cmo.entitled) {
+		/*
+		 * This device has not yet left the OF tree, it's
+		 * minimum entitlement remains in vio_cmo.min and
+		 * vio_cmo.desired
+		 */
+		vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+		/*
+		 * Save min allocation for device in reserve as long
+		 * as it exists in OF tree as determined by later
+		 * balance operation
+		 */
+		viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+		/* Replenish spare from freed reserve pool */
+		if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+			tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+			                                 vio_cmo.spare));
+			vio_cmo.spare += tmp;
+			viodev->cmo.entitled -= tmp;
+		}
+
+		/* Remaining reserve goes to excess pool */
+		vio_cmo.excess.size += viodev->cmo.entitled;
+		vio_cmo.excess.free += viodev->cmo.entitled;
+		vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+		/*
+		 * Until the device is removed it will keep a
+		 * minimum entitlement; this will guarantee that
+		 * a module unload/load will result in a success.
+		 */
+		viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+		viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		atomic_set(&viodev->cmo.allocs_failed, 0);
+	}
+
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+	vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported;
+	viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops;
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */
+static void vio_cmo_bus_init(void)
+{
+	struct hvcall_mpp_data mpp_data;
+	int err;
+
+	memset(&vio_cmo, 0, sizeof(struct vio_cmo));
+	spin_lock_init(&vio_cmo.lock);
+	INIT_LIST_HEAD(&vio_cmo.device_list);
+	INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
+
+	/* Get current system entitlement */
+	err = h_get_mpp(&mpp_data);
+
+	/*
+	 * On failure, continue with entitlement set to 0, will panic()
+	 * later when spare is reserved.
+	 */
+	if (err != H_SUCCESS) {
+		printk(KERN_ERR "%s: unable to determine system IO "\
+		       "entitlement. (%d)\n", __func__, err);
+		vio_cmo.entitled = 0;
+	} else {
+		vio_cmo.entitled = mpp_data.entitled_mem;
+	}
+
+	/* Set reservation and check against entitlement */
+	vio_cmo.spare = VIO_CMO_MIN_ENT;
+	vio_cmo.reserve.size = vio_cmo.spare;
+	vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
+	                         VIO_CMO_MIN_ENT);
+	if (vio_cmo.reserve.size > vio_cmo.entitled) {
+		printk(KERN_ERR "%s: insufficient system entitlement\n",
+		       __func__);
+		panic("%s: Insufficient system entitlement", __func__);
+	}
+
+	/* Set the remaining accounting variables */
+	vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
+	vio_cmo.excess.free = vio_cmo.excess.size;
+	vio_cmo.min = vio_cmo.reserve.size;
+	vio_cmo.desired = vio_cmo.reserve.size;
+}
+
+/* sysfs device functions and data structures for CMO */
+
+#define viodev_cmo_rd_attr(name)                                        \
+static ssize_t viodev_cmo_##name##_show(struct device *dev,             \
+                                        struct device_attribute *attr,  \
+                                         char *buf)                     \
+{                                                                       \
+	return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name);        \
+}
+
+static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
+}
+
+static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	atomic_set(&viodev->cmo.allocs_failed, 0);
+	return count;
+}
+
+static ssize_t viodev_cmo_desired_set(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	size_t new_desired;
+	int ret;
+
+	ret = strict_strtoul(buf, 10, &new_desired);
+	if (ret)
+		return ret;
+
+	vio_cmo_set_dev_desired(viodev, new_desired);
+	return count;
+}
+
+viodev_cmo_rd_attr(desired);
+viodev_cmo_rd_attr(entitled);
+viodev_cmo_rd_attr(allocated);
+
+static ssize_t name_show(struct device *, struct device_attribute *, char *);
+static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
+static struct device_attribute vio_cmo_dev_attrs[] = {
+	__ATTR_RO(name),
+	__ATTR_RO(devspec),
+	__ATTR(cmo_desired,       S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viodev_cmo_desired_show, viodev_cmo_desired_set),
+	__ATTR(cmo_entitled,      S_IRUGO, viodev_cmo_entitled_show,      NULL),
+	__ATTR(cmo_allocated,     S_IRUGO, viodev_cmo_allocated_show,     NULL),
+	__ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
+	__ATTR_NULL
+};
+
+/* sysfs bus functions and data structures for CMO */
+
+#define viobus_cmo_rd_attr(name)                                        \
+static ssize_t                                                          \
+viobus_cmo_##name##_show(struct bus_type *bt, char *buf)                \
+{                                                                       \
+	return sprintf(buf, "%lu\n", vio_cmo.name);                     \
+}
+
+#define viobus_cmo_pool_rd_attr(name, var)                              \
+static ssize_t                                                          \
+viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf)     \
+{                                                                       \
+	return sprintf(buf, "%lu\n", vio_cmo.name.var);                 \
+}
+
+static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
+                                     size_t count)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	vio_cmo.high = vio_cmo.curr;
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+
+	return count;
+}
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_rd_attr(high);
+
+static struct bus_attribute vio_cmo_bus_attrs[] = {
+	__ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL),
+	__ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL),
+	__ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL),
+	__ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL),
+	__ATTR(cmo_spare,   S_IRUGO, viobus_cmo_spare_show,   NULL),
+	__ATTR(cmo_min,     S_IRUGO, viobus_cmo_min_show,     NULL),
+	__ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL),
+	__ATTR(cmo_curr,    S_IRUGO, viobus_cmo_curr_show,    NULL),
+	__ATTR(cmo_high,    S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+	       viobus_cmo_high_show, viobus_cmo_high_reset),
+	__ATTR_NULL
+};
+
+static void vio_cmo_sysfs_init(void)
+{
+	vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
+	vio_bus_type.bus_attrs = vio_cmo_bus_attrs;
+}
+#else /* CONFIG_PPC_SMLPAR */
+/* Dummy functions for iSeries platform */
+int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
+static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
+static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
+static void vio_cmo_bus_init() {}
+static void vio_cmo_sysfs_init() { }
+#endif /* CONFIG_PPC_SMLPAR */
+EXPORT_SYMBOL(vio_cmo_entitlement_update);
+EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
 static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 {
 	const unsigned char *dma_window;
@@ -114,8 +1105,17 @@ static int vio_bus_probe(struct device *dev)
 		return error;
 
 	id = vio_match_device(viodrv->id_table, viodev);
-	if (id)
+	if (id) {
+		memset(&viodev->cmo, 0, sizeof(viodev->cmo));
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			error = vio_cmo_bus_probe(viodev);
+			if (error)
+				return error;
+		}
 		error = viodrv->probe(viodev, id);
+		if (error)
+			vio_cmo_bus_remove(viodev);
+	}
 
 	return error;
 }
@@ -125,12 +1125,23 @@ static int vio_bus_remove(struct device *dev)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
 	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	struct device *devptr;
+	int ret = 1;
+
+	/*
+	 * Hold a reference to the device after the remove function is called
+	 * to allow for CMO accounting cleanup for the device.
+	 */
+	devptr = get_device(dev);
 
 	if (viodrv->remove)
-		return viodrv->remove(viodev);
+		ret = viodrv->remove(viodev);
+
+	if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_bus_remove(viodev);
 
-	/* driver can't remove */
-	return 1;
+	put_device(devptr);
+	return ret;
 }
 
 /**
@@ -215,7 +1226,11 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
 			viodev->unit_address = *unit_address;
 	}
 	viodev->dev.archdata.of_node = of_node_get(of_node);
-	viodev->dev.archdata.dma_ops = &dma_iommu_ops;
+
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_set_dma_ops(viodev);
+	else
+		viodev->dev.archdata.dma_ops = &dma_iommu_ops;
 	viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev);
 	viodev->dev.archdata.numa_node = of_node_to_nid(of_node);
 
@@ -245,6 +1260,9 @@ static int __init vio_bus_init(void)
 	int err;
 	struct device_node *node_vroot;
 
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_sysfs_init();
+
 	err = bus_register(&vio_bus_type);
 	if (err) {
 		printk(KERN_ERR "failed to register VIO bus\n");
@@ -262,6 +1280,9 @@ static int __init vio_bus_init(void)
 		return err;
 	}
 
+	if (firmware_has_feature(FW_FEATURE_CMO))
+		vio_cmo_bus_init();
+
 	node_vroot = of_find_node_by_name(NULL, "vdevice");
 	if (node_vroot) {
 		struct device_node *of_node;
diff --git a/include/asm-powerpc/vio.h b/include/asm-powerpc/vio.h
index 56512a968da..0a290a19594 100644
--- a/include/asm-powerpc/vio.h
+++ b/include/asm-powerpc/vio.h
@@ -39,16 +39,32 @@
 #define VIO_IRQ_DISABLE		0UL
 #define VIO_IRQ_ENABLE		1UL
 
+/*
+ * VIO CMO minimum entitlement for all devices and spare entitlement
+ */
+#define VIO_CMO_MIN_ENT 1562624
+
 struct iommu_table;
 
-/*
- * The vio_dev structure is used to describe virtual I/O devices.
+/**
+ * vio_dev - This structure is used to describe virtual I/O devices.
+ *
+ * @desired: set from return of driver's get_desired_dma() function
+ * @entitled: bytes of IO data that has been reserved for this device.
+ * @allocated: bytes of IO data currently in use by the device.
+ * @allocs_failed: number of DMA failures due to insufficient entitlement.
  */
 struct vio_dev {
 	const char *name;
 	const char *type;
 	uint32_t unit_address;
 	unsigned int irq;
+	struct {
+		size_t desired;
+		size_t entitled;
+		size_t allocated;
+		atomic_t allocs_failed;
+	} cmo;
 	struct device dev;
 };
 
@@ -56,12 +72,19 @@ struct vio_driver {
 	const struct vio_device_id *id_table;
 	int (*probe)(struct vio_dev *dev, const struct vio_device_id *id);
 	int (*remove)(struct vio_dev *dev);
+	/* A driver must have a get_desired_dma() function to
+	 * be loaded in a CMO environment if it uses DMA.
+	 */
+	unsigned long (*get_desired_dma)(struct vio_dev *dev);
 	struct device_driver driver;
 };
 
 extern int vio_register_driver(struct vio_driver *drv);
 extern void vio_unregister_driver(struct vio_driver *drv);
 
+extern int vio_cmo_entitlement_update(size_t);
+extern void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired);
+
 extern void __devinit vio_unregister_device(struct vio_dev *dev);
 
 struct device_node;
-- 
cgit v1.2.3-70-g09d2


From 22e1a4dd3f2a9009d1d8896a5e833b6094877008 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@austin.ibm.com>
Date: Thu, 24 Jul 2008 04:31:52 +1000
Subject: powerpc/pseries: Verify CMO memory entitlement updates with virtual
 I/O

Verify memory entitlement updates can be handled by vio.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/lparcfg.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 848c3e5a637..64381a204a5 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -34,6 +34,7 @@
 #include <asm/time.h>
 #include <asm/prom.h>
 #include <asm/vdso_datapage.h>
+#include <asm/vio.h>
 
 #define MODULE_VERS "1.8"
 #define MODULE_NAME "lparcfg"
@@ -527,6 +528,15 @@ static ssize_t update_mpp(u64 *entitlement, u8 *weight)
 	u8 new_weight;
 	ssize_t rc;
 
+	if (entitlement) {
+		/* Check with vio to ensure the new memory entitlement
+		 * can be handled.
+		 */
+		rc = vio_cmo_entitlement_update(*entitlement);
+		if (rc)
+			return rc;
+	}
+
 	rc = h_get_mpp(&mpp_data);
 	if (rc)
 		return rc;
-- 
cgit v1.2.3-70-g09d2


From 8391e42a5c1f3d757faa5e7f46a4a68f9aa6cb12 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@austin.ibm.com>
Date: Thu, 24 Jul 2008 04:36:38 +1000
Subject: powerpc/pseries: Update arch vector to indicate support for CMO

Update the architecture vector to indicate that Cooperative Memory
Overcommitment is supported if CONFIG_PPC_SMLPAR is set.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/prom_init.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1ea8c8d3ce8..c4ab2195b9c 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -643,6 +643,11 @@ static void __init early_cmdline_parse(void)
 #else
 #define OV5_MSI			0x00
 #endif /* CONFIG_PCI_MSI */
+#ifdef CONFIG_PPC_SMLPAR
+#define OV5_CMO			0x80	/* Cooperative Memory Overcommitment */
+#else
+#define OV5_CMO			0x00
+#endif
 
 /*
  * The architecture vector has an array of PVR mask/value pairs,
@@ -687,10 +692,12 @@ static unsigned char ibm_architecture_vec[] = {
 	0,				/* don't halt */
 
 	/* option vector 5: PAPR/OF options */
-	3 - 2,				/* length */
+	5 - 2,				/* length */
 	0,				/* don't ignore, don't halt */
 	OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY |
 	OV5_DONATE_DEDICATE_CPU | OV5_MSI,
+	0,
+	OV5_CMO,
 };
 
 /* Old method - ELF header with PT_NOTE sections */
-- 
cgit v1.2.3-70-g09d2


From 16c14b4621c7b6fc4611abf1f86cd78cdb1b2b03 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@austin.ibm.com>
Date: Thu, 24 Jul 2008 05:10:46 +1000
Subject: powerpc/pseries: Remove kmalloc call in handling writes to lparcfg

There are only 4 valid name=value pairs for writes to
/proc/ppc64/lparcfg.  Current code allocates a buffer to copy
this information in from the user.  Since the longest name=value
pair will easily fit into a buffer of 64 characters, simply
put the buffer on the stack instead of allocating the buffer.

Signed-off-by: Nathan Fotenot <nfont@austin.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/lparcfg.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 64381a204a5..9f856a0c3e3 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -573,29 +573,27 @@ static ssize_t update_mpp(u64 *entitlement, u8 *weight)
 static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 			     size_t count, loff_t * off)
 {
-	char *kbuf;
+	int kbuf_sz = 64;
+	char kbuf[kbuf_sz];
 	char *tmp;
 	u64 new_entitled, *new_entitled_ptr = &new_entitled;
 	u8 new_weight, *new_weight_ptr = &new_weight;
-	ssize_t retval = -ENOMEM;
+	ssize_t retval;
 
 	if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
 			firmware_has_feature(FW_FEATURE_ISERIES))
 		return -EINVAL;
 
-	kbuf = kmalloc(count, GFP_KERNEL);
-	if (!kbuf)
-		goto out;
+	if (count > kbuf_sz)
+		return -EINVAL;
 
-	retval = -EFAULT;
 	if (copy_from_user(kbuf, buf, count))
-		goto out;
+		return -EFAULT;
 
-	retval = -EINVAL;
 	kbuf[count - 1] = '\0';
 	tmp = strchr(kbuf, '=');
 	if (!tmp)
-		goto out;
+		return -EINVAL;
 
 	*tmp++ = '\0';
 
@@ -603,32 +601,32 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 		char *endp;
 		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
+			return -EINVAL;
 
 		retval = update_ppp(new_entitled_ptr, NULL);
 	} else if (!strcmp(kbuf, "capacity_weight")) {
 		char *endp;
 		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
+			return -EINVAL;
 
 		retval = update_ppp(NULL, new_weight_ptr);
 	} else if (!strcmp(kbuf, "entitled_memory")) {
 		char *endp;
 		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
+			return -EINVAL;
 
 		retval = update_mpp(new_entitled_ptr, NULL);
 	} else if (!strcmp(kbuf, "entitled_memory_weight")) {
 		char *endp;
 		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
 		if (endp == tmp)
-			goto out;
+			return -EINVAL;
 
 		retval = update_mpp(NULL, new_weight_ptr);
 	} else
-		goto out;
+		return -EINVAL;
 
 	if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
 		retval = count;
@@ -644,8 +642,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
 		retval = -EIO;
 	}
 
-out:
-	kfree(kbuf);
 	return retval;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 58340a07c194e0aed7bc58b61ff24330bb2a409f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 25 Jul 2008 01:45:33 -0700
Subject: introduce HAVE_EFFICIENT_UNALIGNED_ACCESS Kconfig symbol

In many cases, especially in networking, it can be beneficial to know at
compile time whether the architecture can do unaligned accesses efficiently.
This patch introduces a new Kconfig symbol

	HAVE_EFFICIENT_UNALIGNED_ACCESS

for that purpose and adds it to the powerpc and x86 architectures.  Also add
some documentation about alignment and networking, and especially one intended
use of this symbol.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: Ingo Molnar <mingo@elte.hu> [x86 architecture part]
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/unaligned-memory-access.txt | 32 ++++++++++++++++++++++++++++---
 arch/Kconfig                              | 19 ++++++++++++++++++
 arch/powerpc/Kconfig                      |  1 +
 arch/x86/Kconfig                          |  1 +
 4 files changed, 50 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/Documentation/unaligned-memory-access.txt b/Documentation/unaligned-memory-access.txt
index b0472ac5226..f866c72291b 100644
--- a/Documentation/unaligned-memory-access.txt
+++ b/Documentation/unaligned-memory-access.txt
@@ -218,9 +218,35 @@ If use of such macros is not convenient, another option is to use memcpy(),
 where the source or destination (or both) are of type u8* or unsigned char*.
 Due to the byte-wise nature of this operation, unaligned accesses are avoided.
 
+
+Alignment vs. Networking
+========================
+
+On architectures that require aligned loads, networking requires that the IP
+header is aligned on a four-byte boundary to optimise the IP stack. For
+regular ethernet hardware, the constant NET_IP_ALIGN is used. On most
+architectures this constant has the value 2 because the normal ethernet
+header is 14 bytes long, so in order to get proper alignment one needs to
+DMA to an address which can be expressed as 4*n + 2. One notable exception
+here is powerpc which defines NET_IP_ALIGN to 0 because DMA to unaligned
+addresses can be very expensive and dwarf the cost of unaligned loads.
+
+For some ethernet hardware that cannot DMA to unaligned addresses like
+4*n+2 or non-ethernet hardware, this can be a problem, and it is then
+required to copy the incoming frame into an aligned buffer. Because this is
+unnecessary on architectures that can do unaligned accesses, the code can be
+made dependent on CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS like so:
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	skb = original skb
+#else
+	skb = copy skb
+#endif
+
 --
-Author: Daniel Drake <dsd@gentoo.org>
+Authors: Daniel Drake <dsd@gentoo.org>,
+         Johannes Berg <johannes@sipsolutions.net>
 With help from: Alan Cox, Avuton Olrich, Heikki Orsila, Jan Engelhardt,
-Johannes Berg, Kyle McMartin, Kyle Moffett, Randy Dunlap, Robert Hancock,
-Uli Kunitz, Vadim Lobanov
+Kyle McMartin, Kyle Moffett, Randy Dunlap, Robert Hancock, Uli Kunitz,
+Vadim Lobanov
 
diff --git a/arch/Kconfig b/arch/Kconfig
index 6093c0be58b..b0fabfa864f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -27,6 +27,25 @@ config KPROBES
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
 
+config HAVE_EFFICIENT_UNALIGNED_ACCESS
+	def_bool n
+	help
+	  Some architectures are unable to perform unaligned accesses
+	  without the use of get_unaligned/put_unaligned. Others are
+	  unable to perform such accesses efficiently (e.g. trap on
+	  unaligned access and require fixing it up in the exception
+	  handler.)
+
+	  This symbol should be selected by an architecture if it can
+	  perform unaligned accesses efficiently to allow different
+	  code paths to be selected for these cases. Some network
+	  drivers, for example, could opt to not fix up alignment
+	  problems with received packets if doing so would not help
+	  much.
+
+	  See Documentation/unaligned-memory-access.txt for more
+	  information on the topic of unaligned memory accesses.
+
 config KRETPROBES
 	def_bool y
 	depends on KPROBES && HAVE_KRETPROBES
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a487671c282..de6b49cd6be 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -112,6 +112,7 @@ config PPC
 	select HAVE_FTRACE
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
+	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_KPROBES
 	select HAVE_ARCH_KGDB
 	select HAVE_KRETPROBES
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b2ddfcf0172..66f3ab05b18 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
 	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
+	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 
 config ARCH_DEFCONFIG
 	string
-- 
cgit v1.2.3-70-g09d2


From ef53d9c5e4da147ecaa43c44c5e5945eb83970a2 Mon Sep 17 00:00:00 2001
From: Srinivasa D S <srinivasa@in.ibm.com>
Date: Fri, 25 Jul 2008 01:46:04 -0700
Subject: kprobes: improve kretprobe scalability with hashed locking

Currently list of kretprobe instances are stored in kretprobe object (as
used_instances,free_instances) and in kretprobe hash table.  We have one
global kretprobe lock to serialise the access to these lists.  This causes
only one kretprobe handler to execute at a time.  Hence affects system
performance, particularly on SMP systems and when return probe is set on
lot of functions (like on all systemcalls).

Solution proposed here gives fine-grain locks that performs better on SMP
system compared to present kretprobe implementation.

Solution:

 1) Instead of having one global lock to protect kretprobe instances
    present in kretprobe object and kretprobe hash table.  We will have
    two locks, one lock for protecting kretprobe hash table and another
    lock for kretporbe object.

 2) We hold lock present in kretprobe object while we modify kretprobe
    instance in kretprobe object and we hold per-hash-list lock while
    modifying kretprobe instances present in that hash list.  To prevent
    deadlock, we never grab a per-hash-list lock while holding a kretprobe
    lock.

 3) We can remove used_instances from struct kretprobe, as we can
    track used instances of kretprobe instances using kretprobe hash
    table.

Time duration for kernel compilation ("make -j 8") on a 8-way ppc64 system
with return probes set on all systemcalls looks like this.

cacheline              non-cacheline             Un-patched kernel
aligned patch 	       aligned patch
===============================================================================
real    9m46.784s       9m54.412s                  10m2.450s
user    40m5.715s       40m7.142s                  40m4.273s
sys     2m57.754s       2m58.583s                  3m17.430s
===========================================================

Time duration for kernel compilation ("make -j 8) on the same system, when
kernel is not probed.
=========================
real    9m26.389s
user    40m8.775s
sys     2m7.283s
=========================

Signed-off-by: Srinivasa DS <srinivasa@in.ibm.com>
Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/kprobes.c     |   6 +-
 arch/ia64/kernel/kprobes.c    |   6 +-
 arch/powerpc/kernel/kprobes.c |   6 +-
 arch/s390/kernel/kprobes.c    |   6 +-
 arch/sparc64/kernel/kprobes.c |  11 ++--
 arch/x86/kernel/kprobes.c     |   6 +-
 include/linux/kprobes.h       |   7 ++-
 kernel/kprobes.c              | 127 +++++++++++++++++++++++++++++-------------
 8 files changed, 108 insertions(+), 67 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c
index 5ee39e10c8d..d28513f14d0 100644
--- a/arch/arm/kernel/kprobes.c
+++ b/arch/arm/kernel/kprobes.c
@@ -296,8 +296,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -337,7 +336,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 	}
 
 	kretprobe_assert(ri, orig_ret_address, trampoline_address);
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
 		hlist_del(&ri->hlist);
@@ -347,7 +346,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 	return (void *)orig_ret_address;
 }
 
-/* Called with kretprobe_lock held. */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 233434f4f88..f07688da947 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -429,8 +429,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 		((struct fnptr *)kretprobe_trampoline)->ip;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -485,7 +484,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	kretprobe_assert(ri, orig_ret_address, trampoline_address);
 
 	reset_current_kprobe();
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
@@ -500,7 +499,6 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	return 1;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 4ba2af12545..de79915452c 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -144,7 +144,6 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 	kcb->kprobe_saved_msr = regs->msr;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
@@ -312,8 +311,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -352,7 +350,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	regs->nip = orig_ret_address;
 
 	reset_current_kprobe();
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 288ad490a6d..4f82e5b5f87 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -270,7 +270,6 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 	__ctl_store(kcb->kprobe_saved_ctl, 9, 11);
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 					struct pt_regs *regs)
 {
@@ -377,8 +376,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -417,7 +415,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	regs->psw.addr = orig_ret_address | PSW_ADDR_AMODE;
 
 	reset_current_kprobe();
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index f43b5d75535..201a6e547e4 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -478,9 +478,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-/* Called with kretprobe_lock held.  The value stored in the return
- * address register is actually 2 instructions before where the
- * callee will return to.  Sequences usually look something like this
+/* The value stored in the return address register is actually 2
+ * instructions before where the callee will return to.
+ * Sequences usually look something like this
  *
  *		call	some_function	<--- return register points here
  *		 nop			<--- call delay slot
@@ -512,8 +512,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 
 	/*
 	 * It is possible to have multiple instances associated with a given
@@ -553,7 +552,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	regs->tnpc = orig_ret_address + 4;
 
 	reset_current_kprobe();
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 	preempt_enable_no_resched();
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 43c019f85f0..6c27679ec6a 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 		regs->ip = (unsigned long)p->ainsn.insn;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 	/* fixup registers */
 #ifdef CONFIG_X86_64
 	regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 
 	kretprobe_assert(ri, orig_ret_address, trampoline_address);
 
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
 		hlist_del(&ri->hlist);
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 04a3556bdea..0be7795655f 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -157,11 +157,10 @@ struct kretprobe {
 	int nmissed;
 	size_t data_size;
 	struct hlist_head free_instances;
-	struct hlist_head used_instances;
+	spinlock_t lock;
 };
 
 struct kretprobe_instance {
-	struct hlist_node uflist; /* either on free list or used list */
 	struct hlist_node hlist;
 	struct kretprobe *rp;
 	kprobe_opcode_t *ret_addr;
@@ -201,7 +200,6 @@ static inline int init_test_probes(void)
 }
 #endif /* CONFIG_KPROBES_SANITY_TEST */
 
-extern spinlock_t kretprobe_lock;
 extern struct mutex kprobe_mutex;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
@@ -214,6 +212,9 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p);
 
 /* Get the kprobe at this addr (if any) - called with preemption disabled */
 struct kprobe *get_kprobe(void *addr);
+void kretprobe_hash_lock(struct task_struct *tsk,
+			 struct hlist_head **head, unsigned long *flags);
+void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags);
 struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
 /* kprobe_running() will just return the current_kprobe on this CPU */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e0..cb0b3bde361 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
 	addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
 #endif
 
+static int kprobes_initialized;
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 static bool kprobe_enabled;
 
 DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
-DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct {
+	spinlock_t lock ____cacheline_aligned;
+} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+
+static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+{
+	return &(kretprobe_table_locks[hash].lock);
+}
 
 /*
  * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
 	return;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 				struct hlist_head *head)
 {
+	struct kretprobe *rp = ri->rp;
+
 	/* remove rp inst off the rprobe_inst_table */
 	hlist_del(&ri->hlist);
-	if (ri->rp) {
-		/* remove rp inst off the used list */
-		hlist_del(&ri->uflist);
-		/* put rp inst back onto the free list */
-		INIT_HLIST_NODE(&ri->uflist);
-		hlist_add_head(&ri->uflist, &ri->rp->free_instances);
+	INIT_HLIST_NODE(&ri->hlist);
+	if (likely(rp)) {
+		spin_lock(&rp->lock);
+		hlist_add_head(&ri->hlist, &rp->free_instances);
+		spin_unlock(&rp->lock);
 	} else
 		/* Unregistering */
 		hlist_add_head(&ri->hlist, head);
 }
 
-struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
+void kretprobe_hash_lock(struct task_struct *tsk,
+			 struct hlist_head **head, unsigned long *flags)
+{
+	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+	spinlock_t *hlist_lock;
+
+	*head = &kretprobe_inst_table[hash];
+	hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_lock_irqsave(hlist_lock, *flags);
+}
+
+void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
 {
-	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_lock_irqsave(hlist_lock, *flags);
+}
+
+void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
+{
+	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+	spinlock_t *hlist_lock;
+
+	hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_unlock_irqrestore(hlist_lock, *flags);
+}
+
+void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+{
+	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
 /*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
 	struct kretprobe_instance *ri;
 	struct hlist_head *head, empty_rp;
 	struct hlist_node *node, *tmp;
-	unsigned long flags = 0;
+	unsigned long hash, flags = 0;
 
-	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(tk);
+	if (unlikely(!kprobes_initialized))
+		/* Early boot.  kretprobe_table_locks not yet initialized. */
+		return;
+
+	hash = hash_ptr(tk, KPROBE_HASH_BITS);
+	head = &kretprobe_inst_table[hash];
+	kretprobe_table_lock(hash, &flags);
 	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
 		if (ri->task == tk)
 			recycle_rp_inst(ri, &empty_rp);
 	}
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
-
+	kretprobe_table_unlock(hash, &flags);
+	INIT_HLIST_HEAD(&empty_rp);
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
 		hlist_del(&ri->hlist);
 		kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
 	struct kretprobe_instance *ri;
 	struct hlist_node *pos, *next;
 
-	hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
-		hlist_del(&ri->uflist);
+	hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+		hlist_del(&ri->hlist);
 		kfree(ri);
 	}
 }
 
 static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
 {
-	unsigned long flags;
+	unsigned long flags, hash;
 	struct kretprobe_instance *ri;
 	struct hlist_node *pos, *next;
+	struct hlist_head *head;
+
 	/* No race here */
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
-		ri->rp = NULL;
-		hlist_del(&ri->uflist);
+	for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
+		kretprobe_table_lock(hash, &flags);
+		head = &kretprobe_inst_table[hash];
+		hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+			if (ri->rp == rp)
+				ri->rp = NULL;
+		}
+		kretprobe_table_unlock(hash, &flags);
 	}
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
 	free_rp_inst(rp);
 }
 
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 					   struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
-	unsigned long flags = 0;
+	unsigned long hash, flags = 0;
+	struct kretprobe_instance *ri;
 
 	/*TODO: consider to only swap the RA after the last pre_handler fired */
-	spin_lock_irqsave(&kretprobe_lock, flags);
+	hash = hash_ptr(current, KPROBE_HASH_BITS);
+	spin_lock_irqsave(&rp->lock, flags);
 	if (!hlist_empty(&rp->free_instances)) {
-		struct kretprobe_instance *ri;
-
 		ri = hlist_entry(rp->free_instances.first,
-				 struct kretprobe_instance, uflist);
+				struct kretprobe_instance, hlist);
+		hlist_del(&ri->hlist);
+		spin_unlock_irqrestore(&rp->lock, flags);
+
 		ri->rp = rp;
 		ri->task = current;
 
 		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-			spin_unlock_irqrestore(&kretprobe_lock, flags);
+			spin_unlock_irqrestore(&rp->lock, flags);
 			return 0;
 		}
 
 		arch_prepare_kretprobe(ri, regs);
 
 		/* XXX(hch): why is there no hlist_move_head? */
-		hlist_del(&ri->uflist);
-		hlist_add_head(&ri->uflist, &ri->rp->used_instances);
-		hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
-	} else
+		INIT_HLIST_NODE(&ri->hlist);
+		kretprobe_table_lock(hash, &flags);
+		hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+		kretprobe_table_unlock(hash, &flags);
+	} else {
 		rp->nmissed++;
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+		spin_unlock_irqrestore(&rp->lock, flags);
+	}
 	return 0;
 }
 
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
 		rp->maxactive = NR_CPUS;
 #endif
 	}
-	INIT_HLIST_HEAD(&rp->used_instances);
+	spin_lock_init(&rp->lock);
 	INIT_HLIST_HEAD(&rp->free_instances);
 	for (i = 0; i < rp->maxactive; i++) {
 		inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
 			free_rp_inst(rp);
 			return -ENOMEM;
 		}
-		INIT_HLIST_NODE(&inst->uflist);
-		hlist_add_head(&inst->uflist, &rp->free_instances);
+		INIT_HLIST_NODE(&inst->hlist);
+		hlist_add_head(&inst->hlist, &rp->free_instances);
 	}
 
 	rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		INIT_HLIST_HEAD(&kprobe_table[i]);
 		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+		spin_lock_init(&(kretprobe_table_locks[i].lock));
 	}
 
 	/*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
 	err = arch_init_kprobes();
 	if (!err)
 		err = register_die_notifier(&kprobe_exceptions_nb);
+	kprobes_initialized = (err == 0);
 
 	if (!err)
 		init_test_probes();
-- 
cgit v1.2.3-70-g09d2


From 7444a72effa632fcd8edc566f880d96fe213c73b Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Fri, 25 Jul 2008 01:46:11 -0700
Subject: gpiolib: allow user-selection

This patch adds functionality to the gpio-lib subsystem to make it
possible to enable the gpio-lib code even if the architecture code didn't
request to get it built in.

The archtitecture code does still need to implement the gpiolib accessor
functions in its asm/gpio.h file.  This patch adds the implementations for
x86 and PPC.

With these changes it is possible to run generic GPIO expansion cards on
every architecture that implements the trivial wrapper functions.  Support
for more architectures can easily be added.

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: David Brownell <david-b@pacbell.net>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Samuel Ortiz <sameo@openedhand.com>
Cc: Kumar Gala <galak@gate.crashing.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/gpio.txt               | 12 +++++++-
 arch/arm/Kconfig                     |  8 +++---
 arch/avr32/Kconfig                   |  2 +-
 arch/mips/Kconfig                    |  2 +-
 arch/powerpc/Kconfig                 |  1 +
 arch/powerpc/platforms/52xx/Kconfig  |  2 +-
 arch/powerpc/sysdev/qe_lib/Kconfig   |  2 +-
 arch/x86/Kconfig                     |  1 +
 drivers/Makefile                     |  2 +-
 drivers/gpio/Kconfig                 | 33 ++++++++++++++++++---
 drivers/gpio/Makefile                |  2 +-
 drivers/i2c/chips/Kconfig            |  2 +-
 drivers/mfd/Kconfig                  |  4 +--
 drivers/of/Kconfig                   |  2 +-
 include/asm-generic/gpio.h           |  2 +-
 include/asm-mips/mach-generic/gpio.h |  2 +-
 include/asm-powerpc/gpio.h           |  4 +--
 include/asm-x86/gpio.h               | 56 ++++++++++++++++++++++++++++++++++++
 18 files changed, 116 insertions(+), 23 deletions(-)

(limited to 'arch/powerpc')

diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 8b69811a964..18022e249c5 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -389,11 +389,21 @@ either NULL or the label associated with that GPIO when it was requested.
 
 Platform Support
 ----------------
-To support this framework, a platform's Kconfig will "select HAVE_GPIO_LIB"
+To support this framework, a platform's Kconfig will "select" either
+ARCH_REQUIRE_GPIOLIB or ARCH_WANT_OPTIONAL_GPIOLIB
 and arrange that its <asm/gpio.h> includes <asm-generic/gpio.h> and defines
 three functions: gpio_get_value(), gpio_set_value(), and gpio_cansleep().
 They may also want to provide a custom value for ARCH_NR_GPIOS.
 
+ARCH_REQUIRE_GPIOLIB means that the gpio-lib code will always get compiled
+into the kernel on that architecture.
+
+ARCH_WANT_OPTIONAL_GPIOLIB means the gpio-lib code defaults to off and the user
+can enable it and build it into the kernel optionally.
+
+If neither of these options are selected, the platform does not support
+GPIOs through GPIO-lib and the code cannot be enabled by the user.
+
 Trivial implementations of those functions can directly use framework
 code, which always dispatches through the gpio_chip:
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 6fb4f03369f..dabb015aa40 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -268,7 +268,7 @@ config ARCH_EP93XX
 	select GENERIC_GPIO
 	select HAVE_CLK
 	select HAVE_CLK
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	help
 	  This enables support for the Cirrus EP93xx series of CPUs.
 
@@ -447,7 +447,7 @@ config ARCH_PXA
 	select ARCH_MTD_XIP
 	select GENERIC_GPIO
 	select HAVE_CLK
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
 	select TICK_ONESHOT
@@ -479,7 +479,7 @@ config ARCH_SA1100
 	select GENERIC_CLOCKEVENTS
 	select HAVE_CLK
 	select TICK_ONESHOT
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Support for StrongARM 11x0 based boards.
 
@@ -522,7 +522,7 @@ config ARCH_OMAP
 	bool "TI OMAP"
 	select GENERIC_GPIO
 	select HAVE_CLK
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
 	help
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index df4adefedb4..7c239a91627 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -88,7 +88,7 @@ config PLATFORM_AT32AP
 	select SUBARCH_AVR32B
 	select MMU
 	select PERFORMANCE_COUNTERS
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	select GENERIC_ALLOCATOR
 
 #
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index b9c754f4070..b4c4eaa5dd2 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -713,7 +713,7 @@ config CSRC_SB1250
 
 config GPIO_TXX9
 	select GENERIC_GPIO
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	bool
 
 config CFE
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index de6b49cd6be..fe88418167c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -110,6 +110,7 @@ config PPC
 	default y
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
+	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index d664b1bce38..ccbd4958412 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -48,6 +48,6 @@ config PPC_MPC5200_BUGFIX
 config PPC_MPC5200_GPIO
 	bool "MPC5200 GPIO support"
 	depends on PPC_MPC52xx
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Enable gpiolib support for mpc5200 based boards
diff --git a/arch/powerpc/sysdev/qe_lib/Kconfig b/arch/powerpc/sysdev/qe_lib/Kconfig
index 4bb18f57901..1ce546462be 100644
--- a/arch/powerpc/sysdev/qe_lib/Kconfig
+++ b/arch/powerpc/sysdev/qe_lib/Kconfig
@@ -29,7 +29,7 @@ config QE_GPIO
 	bool "QE GPIO support"
 	depends on QUICC_ENGINE
 	select GENERIC_GPIO
-	select HAVE_GPIO_LIB
+	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Say Y here if you're going to use hardware that connects to the
 	  QE GPIOs.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 66f3ab05b18..e3cba0b4560 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
 	select HAVE_OPROFILE
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
+	select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X
 	select HAVE_KRETPROBES
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
diff --git a/drivers/Makefile b/drivers/Makefile
index 808e0ae66aa..54ec5e718c0 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -5,7 +5,7 @@
 # Rewritten to use lists instead of if-statements.
 #
 
-obj-$(CONFIG_HAVE_GPIO_LIB)	+= gpio/
+obj-y				+= gpio/
 obj-$(CONFIG_PCI)		+= pci/
 obj-$(CONFIG_PARISC)		+= parisc/
 obj-$(CONFIG_RAPIDIO)		+= rapidio/
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index de202dbe530..5a355f82916 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -2,15 +2,40 @@
 # GPIO infrastructure and expanders
 #
 
-config HAVE_GPIO_LIB
+config ARCH_WANT_OPTIONAL_GPIOLIB
 	bool
+	help
+	  Select this config option from the architecture Kconfig, if
+	  it is possible to use gpiolib on the architecture, but let the
+	  user decide whether to actually build it or not.
+	  Select this instead of ARCH_REQUIRE_GPIOLIB, if your architecture does
+	  not depend on GPIOs being available, but rather let the user
+	  decide whether he needs it or not.
+
+config ARCH_REQUIRE_GPIOLIB
+	bool
+	select GPIOLIB
 	help
 	  Platforms select gpiolib if they use this infrastructure
 	  for all their GPIOs, usually starting with ones integrated
 	  into SOC processors.
+	  Selecting this from the architecture code will cause the gpiolib
+	  code to always get built in.
+
+
+
+menuconfig GPIOLIB
+	bool "GPIO Support"
+	depends on ARCH_WANT_OPTIONAL_GPIOLIB || ARCH_REQUIRE_GPIOLIB
+	select GENERIC_GPIO
+	help
+	  This enables GPIO support through the generic GPIO library.
+	  You only need to enable this, if you also want to enable
+	  one or more of the GPIO expansion card drivers below.
+
+	  If unsure, say N.
 
-menu "GPIO Support"
-	depends on HAVE_GPIO_LIB
+if GPIOLIB
 
 config DEBUG_GPIO
 	bool "Debug GPIO calls"
@@ -116,4 +141,4 @@ config GPIO_MCP23S08
 	  SPI driver for Microchip MCP23S08 I/O expander.  This provides
 	  a GPIO interface supporting inputs and outputs.
 
-endmenu
+endif
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index eeb2f2b2028..8c45948d1fe 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -2,7 +2,7 @@
 
 ccflags-$(CONFIG_DEBUG_GPIO)	+= -DDEBUG
 
-obj-$(CONFIG_HAVE_GPIO_LIB)	+= gpiolib.o
+obj-$(CONFIG_GPIOLIB)		+= gpiolib.o
 
 obj-$(CONFIG_GPIO_MAX7301)	+= max7301.o
 obj-$(CONFIG_GPIO_MCP23S08)	+= mcp23s08.o
diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig
index 50e0a465374..a95cb9465d6 100644
--- a/drivers/i2c/chips/Kconfig
+++ b/drivers/i2c/chips/Kconfig
@@ -126,7 +126,7 @@ config ISP1301_OMAP
 
 config TPS65010
 	tristate "TPS6501x Power Management chips"
-	depends on HAVE_GPIO_LIB
+	depends on GPIOLIB
 	default y if MACH_OMAP_H2 || MACH_OMAP_H3 || MACH_OMAP_OSK
 	help
 	  If you say yes here you get support for the TPS6501x series of
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index bac9e973ece..1f57a99fd96 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -36,7 +36,7 @@ config MFD_ASIC3
 
 config HTC_EGPIO
 	bool "HTC EGPIO support"
-	depends on GENERIC_HARDIRQS && HAVE_GPIO_LIB && ARM
+	depends on GENERIC_HARDIRQS && GPIOLIB && ARM
 	help
 	    This driver supports the CPLD egpio chip present on
 	    several HTC phones.  It provides basic support for input
@@ -52,7 +52,7 @@ config HTC_PASIC3
 
 config MFD_TC6393XB
 	bool "Support Toshiba TC6393XB"
-	depends on HAVE_GPIO_LIB
+	depends on GPIOLIB
 	select MFD_CORE
 	help
 	  Support for Toshiba Mobile IO Controller TC6393XB
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 3a7a11a75fb..1d7ec312934 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -4,7 +4,7 @@ config OF_DEVICE
 
 config OF_GPIO
 	def_bool y
-	depends on OF && PPC_OF && HAVE_GPIO_LIB
+	depends on OF && PPC_OF && GPIOLIB
 	help
 	  OpenFirmware GPIO accessors
 
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 1beff5166e5..a3034d20ebd 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -3,7 +3,7 @@
 
 #include <linux/types.h>
 
-#ifdef CONFIG_HAVE_GPIO_LIB
+#ifdef CONFIG_GPIOLIB
 
 #include <linux/compiler.h>
 
diff --git a/include/asm-mips/mach-generic/gpio.h b/include/asm-mips/mach-generic/gpio.h
index e6b376bd9d0..b4e70208da6 100644
--- a/include/asm-mips/mach-generic/gpio.h
+++ b/include/asm-mips/mach-generic/gpio.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_MACH_GENERIC_GPIO_H
 #define __ASM_MACH_GENERIC_GPIO_H
 
-#ifdef CONFIG_HAVE_GPIO_LIB
+#ifdef CONFIG_GPIOLIB
 #define gpio_get_value	__gpio_get_value
 #define gpio_set_value	__gpio_set_value
 #define gpio_cansleep	__gpio_cansleep
diff --git a/include/asm-powerpc/gpio.h b/include/asm-powerpc/gpio.h
index 77ad3a890f3..ea04632399d 100644
--- a/include/asm-powerpc/gpio.h
+++ b/include/asm-powerpc/gpio.h
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <asm-generic/gpio.h>
 
-#ifdef CONFIG_HAVE_GPIO_LIB
+#ifdef CONFIG_GPIOLIB
 
 /*
  * We don't (yet) implement inlined/rapid versions for on-chip gpios.
@@ -51,6 +51,6 @@ static inline int irq_to_gpio(unsigned int irq)
 	return -EINVAL;
 }
 
-#endif /* CONFIG_HAVE_GPIO_LIB */
+#endif /* CONFIG_GPIOLIB */
 
 #endif /* __ASM_POWERPC_GPIO_H */
diff --git a/include/asm-x86/gpio.h b/include/asm-x86/gpio.h
index ff87fca0caf..116e9147fe6 100644
--- a/include/asm-x86/gpio.h
+++ b/include/asm-x86/gpio.h
@@ -1,6 +1,62 @@
+/*
+ * Generic GPIO API implementation for x86.
+ *
+ * Derived from the generic GPIO API for powerpc:
+ *
+ * Copyright (c) 2007-2008  MontaVista Software, Inc.
+ *
+ * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
 #ifndef _ASM_I386_GPIO_H
 #define _ASM_I386_GPIO_H
 
+#ifdef CONFIG_X86_RDC321X
 #include <gpio.h>
+#else /* CONFIG_X86_RDC321X */
+
+#include <asm-generic/gpio.h>
+
+#ifdef CONFIG_GPIOLIB
+
+/*
+ * Just call gpiolib.
+ */
+static inline int gpio_get_value(unsigned int gpio)
+{
+	return __gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned int gpio, int value)
+{
+	__gpio_set_value(gpio, value);
+}
+
+static inline int gpio_cansleep(unsigned int gpio)
+{
+	return __gpio_cansleep(gpio);
+}
+
+/*
+ * Not implemented, yet.
+ */
+static inline int gpio_to_irq(unsigned int gpio)
+{
+	return -ENOSYS;
+}
+
+static inline int irq_to_gpio(unsigned int irq)
+{
+	return -EINVAL;
+}
+
+#endif /* CONFIG_GPIOLIB */
+
+#endif /* CONFIG_X86_RDC321X */
 
 #endif /* _ASM_I386_GPIO_H */
-- 
cgit v1.2.3-70-g09d2


From fc532f810832beb3306b42526a78f411972281c7 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <ntl@pobox.com>
Date: Fri, 25 Jul 2008 17:50:30 -0500
Subject: powerpc: Fix boot problem due to AT_BASE_PLATFORM change

Commit 9115d13453dee22473a1e8cacc90a8d64a9c4bc9 ("powerpc: Enable
AT_BASE_PLATFORM aux vector") broke boot on 32-bit powerpc systems; we
have to use PTRRELOC to initialize powerpc_base_platform this early in
boot.

Bug reported by Jon Smirl.

Signed-off-by: Nathan Lynch <ntl@pobox.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/cputable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 25a052c1675..25c273c761d 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -1660,8 +1660,8 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
 			 * Set the base platform string once; assumes
 			 * we're called with real pvr first.
 			 */
-			if (powerpc_base_platform == NULL)
-				powerpc_base_platform = t->platform;
+			if (*PTRRELOC(&powerpc_base_platform) == NULL)
+				*PTRRELOC(&powerpc_base_platform) = t->platform;
 
 #if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
 			/* ppc64 and booke expect identify_cpu to also call
-- 
cgit v1.2.3-70-g09d2


From 8d8bb39b9eba32dd70e87fd5ad5c5dd4ba118e06 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 25 Jul 2008 19:44:49 -0700
Subject: dma-mapping: add the device argument to dma_mapping_error()

Add per-device dma_mapping_ops support for CONFIG_X86_64 as POWER
architecture does:

This enables us to cleanly fix the Calgary IOMMU issue that some devices
are not behind the IOMMU (http://lkml.org/lkml/2008/5/8/423).

I think that per-device dma_mapping_ops support would be also helpful for
KVM people to support PCI passthrough but Andi thinks that this makes it
difficult to support the PCI passthrough (see the above thread).  So I
CC'ed this to KVM camp.  Comments are appreciated.

A pointer to dma_mapping_ops to struct dev_archdata is added.  If the
pointer is non NULL, DMA operations in asm/dma-mapping.h use it.  If it's
NULL, the system-wide dma_ops pointer is used as before.

If it's useful for KVM people, I plan to implement a mechanism to register
a hook called when a new pci (or dma capable) device is created (it works
with hot plugging).  It enables IOMMUs to set up an appropriate
dma_mapping_ops per device.

The major obstacle is that dma_mapping_error doesn't take a pointer to the
device unlike other DMA operations.  So x86 can't have dma_mapping_ops per
device.  Note all the POWER IOMMUs use the same dma_mapping_error function
so this is not a problem for POWER but x86 IOMMUs use different
dma_mapping_error functions.

The first patch adds the device argument to dma_mapping_error.  The patch
is trivial but large since it touches lots of drivers and dma-mapping.h in
all the architecture.

This patch:

dma_mapping_error() doesn't take a pointer to the device unlike other DMA
operations.  So we can't have dma_mapping_ops per device.

Note that POWER already has dma_mapping_ops per device but all the POWER
IOMMUs use the same dma_mapping_error function.  x86 IOMMUs use device
argument.

[akpm@linux-foundation.org: fix sge]
[akpm@linux-foundation.org: fix svc_rdma]
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: fix bnx2x]
[akpm@linux-foundation.org: fix s2io]
[akpm@linux-foundation.org: fix pasemi_mac]
[akpm@linux-foundation.org: fix sdhci]
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: fix sparc]
[akpm@linux-foundation.org: fix ibmvscsi]
Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Avi Kivity <avi@qumranet.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DMA-API.txt                      |  4 +-
 arch/arm/common/dmabounce.c                    |  2 +-
 arch/ia64/hp/common/hwsw_iommu.c               |  5 +-
 arch/ia64/hp/common/sba_iommu.c                |  2 +-
 arch/ia64/sn/pci/pci_dma.c                     |  2 +-
 arch/mips/mm/dma-default.c                     |  2 +-
 arch/powerpc/platforms/cell/celleb_scc_pciex.c |  2 +-
 arch/powerpc/platforms/cell/spider-pci.c       |  2 +-
 arch/powerpc/platforms/iseries/mf.c            |  2 +-
 arch/x86/kernel/pci-calgary_64.c               |  2 +-
 arch/x86/kernel/pci-dma.c                      | 27 ++++---
 arch/x86/kernel/pci-gart_64.c                  |  3 +-
 arch/x86/kernel/pci-nommu.c                    | 14 +---
 arch/x86/kernel/pci-swiotlb_64.c               |  2 +-
 drivers/firewire/fw-iso.c                      |  2 +-
 drivers/firewire/fw-ohci.c                     |  2 +-
 drivers/firewire/fw-sbp2.c                     |  8 +--
 drivers/infiniband/hw/ipath/ipath_sdma.c       |  2 +-
 drivers/infiniband/hw/ipath/ipath_user_sdma.c  |  6 +-
 drivers/infiniband/hw/mthca/mthca_eq.c         |  2 +-
 drivers/media/dvb/pluto2/pluto2.c              |  2 +-
 drivers/mmc/host/sdhci.c                       |  4 +-
 drivers/net/arm/ep93xx_eth.c                   |  4 +-
 drivers/net/bnx2x_main.c                       |  4 +-
 drivers/net/cxgb3/sge.c                        |  2 +-
 drivers/net/e100.c                             |  2 +-
 drivers/net/e1000e/ethtool.c                   |  4 +-
 drivers/net/e1000e/netdev.c                    | 11 +--
 drivers/net/ibmveth.c                          | 38 +++++-----
 drivers/net/iseries_veth.c                     |  4 +-
 drivers/net/mlx4/eq.c                          |  2 +-
 drivers/net/pasemi_mac.c                       |  6 +-
 drivers/net/qla3xxx.c                          | 12 ++--
 drivers/net/s2io.c                             | 48 +++++++------
 drivers/net/sfc/rx.c                           |  4 +-
 drivers/net/sfc/tx.c                           |  7 +-
 drivers/net/spider_net.c                       |  4 +-
 drivers/net/tc35815.c                          |  4 +-
 drivers/net/wireless/ath5k/base.c              |  4 +-
 drivers/scsi/ibmvscsi/ibmvfc.c                 |  4 +-
 drivers/scsi/ibmvscsi/ibmvscsi.c               |  4 +-
 drivers/scsi/ibmvscsi/ibmvstgt.c               |  2 +-
 drivers/scsi/ibmvscsi/rpa_vscsi.c              |  2 +-
 drivers/spi/atmel_spi.c                        |  4 +-
 drivers/spi/au1550_spi.c                       |  6 +-
 drivers/spi/omap2_mcspi.c                      |  4 +-
 drivers/spi/pxa2xx_spi.c                       |  4 +-
 drivers/spi/spi_imx.c                          |  6 +-
 include/asm-alpha/dma-mapping.h                |  6 +-
 include/asm-alpha/pci.h                        |  2 +-
 include/asm-arm/dma-mapping.h                  |  2 +-
 include/asm-avr32/dma-mapping.h                |  2 +-
 include/asm-cris/dma-mapping.h                 |  2 +-
 include/asm-frv/dma-mapping.h                  |  2 +-
 include/asm-generic/dma-mapping-broken.h       |  2 +-
 include/asm-generic/dma-mapping.h              |  4 +-
 include/asm-generic/pci-dma-compat.h           |  4 +-
 include/asm-ia64/machvec.h                     |  2 +-
 include/asm-m68k/dma-mapping.h                 |  2 +-
 include/asm-mips/dma-mapping.h                 |  2 +-
 include/asm-mn10300/dma-mapping.h              |  2 +-
 include/asm-parisc/dma-mapping.h               |  2 +-
 include/asm-powerpc/dma-mapping.h              |  2 +-
 include/asm-sh/dma-mapping.h                   |  2 +-
 include/asm-sparc/dma-mapping_64.h             |  2 +-
 include/asm-sparc/pci_32.h                     |  3 +-
 include/asm-sparc/pci_64.h                     |  5 +-
 include/asm-x86/device.h                       |  3 +
 include/asm-x86/dma-mapping.h                  | 99 ++++++++++++++++++--------
 include/asm-x86/swiotlb.h                      |  2 +-
 include/asm-xtensa/dma-mapping.h               |  2 +-
 include/linux/i2o.h                            |  2 +-
 include/linux/ssb/ssb.h                        |  4 +-
 include/rdma/ib_verbs.h                        |  2 +-
 lib/swiotlb.c                                  |  4 +-
 net/sunrpc/xprtrdma/svc_rdma_sendto.c          |  3 +-
 76 files changed, 256 insertions(+), 210 deletions(-)

(limited to 'arch/powerpc')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 80d150458c8..d8b63d164e4 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -298,10 +298,10 @@ recommended that you never use these unless you really know what the
 cache width is.
 
 int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 
 int
-pci_dma_mapping_error(dma_addr_t dma_addr)
+pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr)
 
 In some circumstances dma_map_single and dma_map_page will fail to create
 a mapping. A driver can check for these errors by testing the returned
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index dd294734260..69130f36590 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -280,7 +280,7 @@ unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 	/*
 	 * Trying to unmap an invalid mapping
 	 */
-	if (dma_mapping_error(dma_addr)) {
+	if (dma_mapping_error(dev, dma_addr)) {
 		dev_err(dev, "Trying to unmap invalid mapping\n");
 		return;
 	}
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index 1c44ec2a1d5..88b6e6f3fd8 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -186,9 +186,10 @@ hwsw_dma_supported (struct device *dev, u64 mask)
 }
 
 int
-hwsw_dma_mapping_error (dma_addr_t dma_addr)
+hwsw_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
-	return hwiommu_dma_mapping_error (dma_addr) || swiotlb_dma_mapping_error(dma_addr);
+	return hwiommu_dma_mapping_error(dev, dma_addr) ||
+		swiotlb_dma_mapping_error(dev, dma_addr);
 }
 
 EXPORT_SYMBOL(hwsw_dma_mapping_error);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 34421aed1e2..4956be40d7b 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2147,7 +2147,7 @@ sba_dma_supported (struct device *dev, u64 mask)
 }
 
 int
-sba_dma_mapping_error (dma_addr_t dma_addr)
+sba_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index 52175af299a..53ebb648449 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -350,7 +350,7 @@ void sn_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(sn_dma_sync_sg_for_device);
 
-int sn_dma_mapping_error(dma_addr_t dma_addr)
+int sn_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index ae39dd88b9a..891312f8e5a 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -348,7 +348,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nele
 
 EXPORT_SYMBOL(dma_sync_sg_for_device);
 
-int dma_mapping_error(dma_addr_t dma_addr)
+int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/arch/powerpc/platforms/cell/celleb_scc_pciex.c b/arch/powerpc/platforms/cell/celleb_scc_pciex.c
index 0e04f8fb152..3e7e0f1568e 100644
--- a/arch/powerpc/platforms/cell/celleb_scc_pciex.c
+++ b/arch/powerpc/platforms/cell/celleb_scc_pciex.c
@@ -281,7 +281,7 @@ static int __init scc_pciex_iowa_init(struct iowa_bus *bus, void *data)
 
 	dummy_page_da = dma_map_single(bus->phb->parent, dummy_page_va,
 				       PAGE_SIZE, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dummy_page_da)) {
+	if (dma_mapping_error(bus->phb->parent, dummy_page_da)) {
 		pr_err("PCIEX:Map dummy page failed.\n");
 		kfree(dummy_page_va);
 		return -1;
diff --git a/arch/powerpc/platforms/cell/spider-pci.c b/arch/powerpc/platforms/cell/spider-pci.c
index 418b605ac35..5122ec14527 100644
--- a/arch/powerpc/platforms/cell/spider-pci.c
+++ b/arch/powerpc/platforms/cell/spider-pci.c
@@ -111,7 +111,7 @@ static int __init spiderpci_pci_setup_chip(struct pci_controller *phb,
 
 	dummy_page_da = dma_map_single(phb->parent, dummy_page_va,
 				       PAGE_SIZE, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dummy_page_da)) {
+	if (dma_mapping_error(phb->parent, dummy_page_da)) {
 		pr_err("SPIDER-IOWA:Map dummy page filed.\n");
 		kfree(dummy_page_va);
 		return -1;
diff --git a/arch/powerpc/platforms/iseries/mf.c b/arch/powerpc/platforms/iseries/mf.c
index 1dc7295746d..731d7b15774 100644
--- a/arch/powerpc/platforms/iseries/mf.c
+++ b/arch/powerpc/platforms/iseries/mf.c
@@ -871,7 +871,7 @@ static int proc_mf_dump_cmdline(char *page, char **start, off_t off,
 		count = 256 - off;
 
 	dma_addr = iseries_hv_map(page, off + count, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dma_addr))
+	if (dma_mapping_error(NULL, dma_addr))
 		return -ENOMEM;
 	memset(page, 0, off + count);
 	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 19e7fc7c2c4..1eb86be93d7 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -544,7 +544,7 @@ error:
 	return ret;
 }
 
-static const struct dma_mapping_ops calgary_dma_ops = {
+static struct dma_mapping_ops calgary_dma_ops = {
 	.alloc_coherent = calgary_alloc_coherent,
 	.map_single = calgary_map_single,
 	.unmap_single = calgary_unmap_single,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cbecb05551b..37544123896 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,7 +11,7 @@
 
 static int forbid_dac __read_mostly;
 
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -312,6 +312,8 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr)
 
 int dma_supported(struct device *dev, u64 mask)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 #ifdef CONFIG_PCI
 	if (mask > 0xffffffff && forbid_dac > 0) {
 		dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -319,8 +321,8 @@ int dma_supported(struct device *dev, u64 mask)
 	}
 #endif
 
-	if (dma_ops->dma_supported)
-		return dma_ops->dma_supported(dev, mask);
+	if (ops->dma_supported)
+		return ops->dma_supported(dev, mask);
 
 	/* Copied from i386. Doesn't make much sense, because it will
 	   only work for pci_alloc_coherent.
@@ -367,6 +369,7 @@ void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		   gfp_t gfp)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
 	void *memory = NULL;
 	struct page *page;
 	unsigned long dma_mask = 0;
@@ -435,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 			/* Let low level make its own zone decisions */
 			gfp &= ~(GFP_DMA32|GFP_DMA);
 
-			if (dma_ops->alloc_coherent)
-				return dma_ops->alloc_coherent(dev, size,
+			if (ops->alloc_coherent)
+				return ops->alloc_coherent(dev, size,
 							   dma_handle, gfp);
 			return NULL;
 		}
@@ -448,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		}
 	}
 
-	if (dma_ops->alloc_coherent) {
+	if (ops->alloc_coherent) {
 		free_pages((unsigned long)memory, get_order(size));
 		gfp &= ~(GFP_DMA|GFP_DMA32);
-		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+		return ops->alloc_coherent(dev, size, dma_handle, gfp);
 	}
 
-	if (dma_ops->map_simple) {
-		*dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
+	if (ops->map_simple) {
+		*dma_handle = ops->map_simple(dev, virt_to_phys(memory),
 					      size,
 					      PCI_DMA_BIDIRECTIONAL);
 		if (*dma_handle != bad_dma_address)
@@ -477,12 +480,14 @@ EXPORT_SYMBOL(dma_alloc_coherent);
 void dma_free_coherent(struct device *dev, size_t size,
 			 void *vaddr, dma_addr_t bus)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 	int order = get_order(size);
 	WARN_ON(irqs_disabled());	/* for portability */
 	if (dma_release_coherent(dev, order, vaddr))
 		return;
-	if (dma_ops->unmap_single)
-		dma_ops->unmap_single(dev, bus, size, 0);
+	if (ops->unmap_single)
+		ops->unmap_single(dev, bus, size, 0);
 	free_pages((unsigned long)vaddr, order);
 }
 EXPORT_SYMBOL(dma_free_coherent);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index df5f142657d..744126e6495 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -692,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 extern int agp_amd64_init(void);
 
-static const struct dma_mapping_ops gart_dma_ops = {
-	.mapping_error			= NULL,
+static struct dma_mapping_ops gart_dma_ops = {
 	.map_single			= gart_map_single,
 	.map_simple			= gart_map_simple,
 	.unmap_single			= gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 792b9179eff..3f91f71cdc3 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
 	return nents;
 }
 
-/* Make sure we keep the same behaviour */
-static int nommu_mapping_error(dma_addr_t dma_addr)
-{
-#ifdef CONFIG_X86_32
-	return 0;
-#else
-	return (dma_addr == bad_dma_address);
-#endif
-}
-
-
-const struct dma_mapping_ops nommu_dma_ops = {
+struct dma_mapping_ops nommu_dma_ops = {
 	.map_single = nommu_map_single,
 	.map_sg = nommu_map_sg,
-	.mapping_error = nommu_mapping_error,
 	.is_phys = 1,
 };
 
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 20df839b9c2..c4ce0332759 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
 	return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
 }
 
-const struct dma_mapping_ops swiotlb_dma_ops = {
+struct dma_mapping_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
diff --git a/drivers/firewire/fw-iso.c b/drivers/firewire/fw-iso.c
index bcbe794a3ea..e14c03dc006 100644
--- a/drivers/firewire/fw-iso.c
+++ b/drivers/firewire/fw-iso.c
@@ -50,7 +50,7 @@ fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card,
 
 		address = dma_map_page(card->device, buffer->pages[i],
 				       0, PAGE_SIZE, direction);
-		if (dma_mapping_error(address)) {
+		if (dma_mapping_error(card->device, address)) {
 			__free_page(buffer->pages[i]);
 			goto out_pages;
 		}
diff --git a/drivers/firewire/fw-ohci.c b/drivers/firewire/fw-ohci.c
index 333b12544dd..566672e0bcf 100644
--- a/drivers/firewire/fw-ohci.c
+++ b/drivers/firewire/fw-ohci.c
@@ -953,7 +953,7 @@ at_context_queue_packet(struct context *ctx, struct fw_packet *packet)
 		payload_bus =
 			dma_map_single(ohci->card.device, packet->payload,
 				       packet->payload_length, DMA_TO_DEVICE);
-		if (dma_mapping_error(payload_bus)) {
+		if (dma_mapping_error(ohci->card.device, payload_bus)) {
 			packet->ack = RCODE_SEND_ERROR;
 			return -1;
 		}
diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c
index 53fc5a641e6..aaff50ebba1 100644
--- a/drivers/firewire/fw-sbp2.c
+++ b/drivers/firewire/fw-sbp2.c
@@ -543,7 +543,7 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id,
 	orb->response_bus =
 		dma_map_single(device->card->device, &orb->response,
 			       sizeof(orb->response), DMA_FROM_DEVICE);
-	if (dma_mapping_error(orb->response_bus))
+	if (dma_mapping_error(device->card->device, orb->response_bus))
 		goto fail_mapping_response;
 
 	orb->request.response.high = 0;
@@ -577,7 +577,7 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id,
 	orb->base.request_bus =
 		dma_map_single(device->card->device, &orb->request,
 			       sizeof(orb->request), DMA_TO_DEVICE);
-	if (dma_mapping_error(orb->base.request_bus))
+	if (dma_mapping_error(device->card->device, orb->base.request_bus))
 		goto fail_mapping_request;
 
 	sbp2_send_orb(&orb->base, lu, node_id, generation,
@@ -1424,7 +1424,7 @@ sbp2_map_scatterlist(struct sbp2_command_orb *orb, struct fw_device *device,
 	orb->page_table_bus =
 		dma_map_single(device->card->device, orb->page_table,
 			       sizeof(orb->page_table), DMA_TO_DEVICE);
-	if (dma_mapping_error(orb->page_table_bus))
+	if (dma_mapping_error(device->card->device, orb->page_table_bus))
 		goto fail_page_table;
 
 	/*
@@ -1509,7 +1509,7 @@ static int sbp2_scsi_queuecommand(struct scsi_cmnd *cmd, scsi_done_fn_t done)
 	orb->base.request_bus =
 		dma_map_single(device->card->device, &orb->request,
 			       sizeof(orb->request), DMA_TO_DEVICE);
-	if (dma_mapping_error(orb->base.request_bus))
+	if (dma_mapping_error(device->card->device, orb->base.request_bus))
 		goto out;
 
 	sbp2_send_orb(&orb->base, lu, lu->tgt->node_id, lu->generation,
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
index eaba03273e4..284c9bca517 100644
--- a/drivers/infiniband/hw/ipath/ipath_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_sdma.c
@@ -698,7 +698,7 @@ retry:
 
 	addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
 			      tx->map_len, DMA_TO_DEVICE);
-	if (dma_mapping_error(addr)) {
+	if (dma_mapping_error(&dd->pcidev->dev, addr)) {
 		ret = -EIO;
 		goto unlock;
 	}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
index 86e016916cd..82d9a0b5ca2 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -206,7 +206,7 @@ static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
 
 	dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
 				DMA_TO_DEVICE);
-	if (dma_mapping_error(dma_addr)) {
+	if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
 		ret = -ENOMEM;
 		goto free_unmap;
 	}
@@ -301,7 +301,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
 				     pages[j], 0, flen, DMA_TO_DEVICE);
 		unsigned long fofs = addr & ~PAGE_MASK;
 
-		if (dma_mapping_error(dma_addr)) {
+		if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
 			ret = -ENOMEM;
 			goto done;
 		}
@@ -508,7 +508,7 @@ static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
 		if (page) {
 			dma_addr = dma_map_page(&dd->pcidev->dev,
 						page, 0, len, DMA_TO_DEVICE);
-			if (dma_mapping_error(dma_addr)) {
+			if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
 				ret = -ENOMEM;
 				goto free_pbc;
 			}
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
index 4e36aa7cb3d..cc6858f0b65 100644
--- a/drivers/infiniband/hw/mthca/mthca_eq.c
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -780,7 +780,7 @@ int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
 		return -ENOMEM;
 	dev->eq_table.icm_dma  = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
 					      PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
-	if (pci_dma_mapping_error(dev->eq_table.icm_dma)) {
+	if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) {
 		__free_page(dev->eq_table.icm_page);
 		return -ENOMEM;
 	}
diff --git a/drivers/media/dvb/pluto2/pluto2.c b/drivers/media/dvb/pluto2/pluto2.c
index 1360403b88b..a9653c63f4d 100644
--- a/drivers/media/dvb/pluto2/pluto2.c
+++ b/drivers/media/dvb/pluto2/pluto2.c
@@ -242,7 +242,7 @@ static int __devinit pluto_dma_map(struct pluto *pluto)
 	pluto->dma_addr = pci_map_single(pluto->pdev, pluto->dma_buf,
 			TS_DMA_BYTES, PCI_DMA_FROMDEVICE);
 
-	return pci_dma_mapping_error(pluto->dma_addr);
+	return pci_dma_mapping_error(pluto->pdev, pluto->dma_addr);
 }
 
 static void pluto_dma_unmap(struct pluto *pluto)
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index c3a5db72ddd..5f95e10229b 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -337,7 +337,7 @@ static int sdhci_adma_table_pre(struct sdhci_host *host,
 
 	host->align_addr = dma_map_single(mmc_dev(host->mmc),
 		host->align_buffer, 128 * 4, direction);
-	if (dma_mapping_error(host->align_addr))
+	if (dma_mapping_error(mmc_dev(host->mmc), host->align_addr))
 		goto fail;
 	BUG_ON(host->align_addr & 0x3);
 
@@ -439,7 +439,7 @@ static int sdhci_adma_table_pre(struct sdhci_host *host,
 
 	host->adma_addr = dma_map_single(mmc_dev(host->mmc),
 		host->adma_desc, (128 * 2 + 1) * 4, DMA_TO_DEVICE);
-	if (dma_mapping_error(host->align_addr))
+	if (dma_mapping_error(mmc_dev(host->mmc), host->align_addr))
 		goto unmap_entries;
 	BUG_ON(host->adma_addr & 0x3);
 
diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 7a14980f347..18d3eeb7eab 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -482,7 +482,7 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 			goto err;
 
 		d = dma_map_single(NULL, page, PAGE_SIZE, DMA_FROM_DEVICE);
-		if (dma_mapping_error(d)) {
+		if (dma_mapping_error(NULL, d)) {
 			free_page((unsigned long)page);
 			goto err;
 		}
@@ -505,7 +505,7 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 			goto err;
 
 		d = dma_map_single(NULL, page, PAGE_SIZE, DMA_TO_DEVICE);
-		if (dma_mapping_error(d)) {
+		if (dma_mapping_error(NULL, d)) {
 			free_page((unsigned long)page);
 			goto err;
 		}
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 0263bef9cc6..c7cc760a177 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -1020,7 +1020,7 @@ static inline int bnx2x_alloc_rx_sge(struct bnx2x *bp,
 
 	mapping = pci_map_page(bp->pdev, page, 0, BCM_PAGE_SIZE*PAGES_PER_SGE,
 			       PCI_DMA_FROMDEVICE);
-	if (unlikely(dma_mapping_error(mapping))) {
+	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
 		__free_pages(page, PAGES_PER_SGE_SHIFT);
 		return -ENOMEM;
 	}
@@ -1048,7 +1048,7 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
 
 	mapping = pci_map_single(bp->pdev, skb->data, bp->rx_buf_use_size,
 				 PCI_DMA_FROMDEVICE);
-	if (unlikely(dma_mapping_error(mapping))) {
+	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
 		dev_kfree_skb(skb);
 		return -ENOMEM;
 	}
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index a96331c875e..1b0861d73ab 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -386,7 +386,7 @@ static inline int add_one_rx_buf(void *va, unsigned int len,
 	dma_addr_t mapping;
 
 	mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE);
-	if (unlikely(pci_dma_mapping_error(mapping)))
+	if (unlikely(pci_dma_mapping_error(pdev, mapping)))
 		return -ENOMEM;
 
 	pci_unmap_addr_set(sd, dma_addr, mapping);
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 1037b133231..19d32a227be 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -1790,7 +1790,7 @@ static int e100_rx_alloc_skb(struct nic *nic, struct rx *rx)
 	rx->dma_addr = pci_map_single(nic->pdev, rx->skb->data,
 		RFD_BUF_LEN, PCI_DMA_BIDIRECTIONAL);
 
-	if (pci_dma_mapping_error(rx->dma_addr)) {
+	if (pci_dma_mapping_error(nic->pdev, rx->dma_addr)) {
 		dev_kfree_skb_any(rx->skb);
 		rx->skb = NULL;
 		rx->dma_addr = 0;
diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c
index a14561f40db..9350564065e 100644
--- a/drivers/net/e1000e/ethtool.c
+++ b/drivers/net/e1000e/ethtool.c
@@ -1090,7 +1090,7 @@ static int e1000_setup_desc_rings(struct e1000_adapter *adapter)
 		tx_ring->buffer_info[i].dma =
 			pci_map_single(pdev, skb->data, skb->len,
 				       PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(tx_ring->buffer_info[i].dma)) {
+		if (pci_dma_mapping_error(pdev, tx_ring->buffer_info[i].dma)) {
 			ret_val = 4;
 			goto err_nomem;
 		}
@@ -1153,7 +1153,7 @@ static int e1000_setup_desc_rings(struct e1000_adapter *adapter)
 		rx_ring->buffer_info[i].dma =
 			pci_map_single(pdev, skb->data, 2048,
 				       PCI_DMA_FROMDEVICE);
-		if (pci_dma_mapping_error(rx_ring->buffer_info[i].dma)) {
+		if (pci_dma_mapping_error(pdev, rx_ring->buffer_info[i].dma)) {
 			ret_val = 8;
 			goto err_nomem;
 		}
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 9c0f56b3c51..d1367789976 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -195,7 +195,7 @@ map_skb:
 		buffer_info->dma = pci_map_single(pdev, skb->data,
 						  adapter->rx_buffer_len,
 						  PCI_DMA_FROMDEVICE);
-		if (pci_dma_mapping_error(buffer_info->dma)) {
+		if (pci_dma_mapping_error(pdev, buffer_info->dma)) {
 			dev_err(&pdev->dev, "RX DMA map failed\n");
 			adapter->rx_dma_failed++;
 			break;
@@ -265,7 +265,7 @@ static void e1000_alloc_rx_buffers_ps(struct e1000_adapter *adapter,
 						   ps_page->page,
 						   0, PAGE_SIZE,
 						   PCI_DMA_FROMDEVICE);
-				if (pci_dma_mapping_error(ps_page->dma)) {
+				if (pci_dma_mapping_error(pdev, ps_page->dma)) {
 					dev_err(&adapter->pdev->dev,
 					  "RX DMA page map failed\n");
 					adapter->rx_dma_failed++;
@@ -300,7 +300,7 @@ static void e1000_alloc_rx_buffers_ps(struct e1000_adapter *adapter,
 		buffer_info->dma = pci_map_single(pdev, skb->data,
 						  adapter->rx_ps_bsize0,
 						  PCI_DMA_FROMDEVICE);
-		if (pci_dma_mapping_error(buffer_info->dma)) {
+		if (pci_dma_mapping_error(pdev, buffer_info->dma)) {
 			dev_err(&pdev->dev, "RX DMA map failed\n");
 			adapter->rx_dma_failed++;
 			/* cleanup skb */
@@ -3344,7 +3344,7 @@ static int e1000_tx_map(struct e1000_adapter *adapter,
 				skb->data + offset,
 				size,
 				PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(buffer_info->dma)) {
+		if (pci_dma_mapping_error(adapter->pdev, buffer_info->dma)) {
 			dev_err(&adapter->pdev->dev, "TX DMA map failed\n");
 			adapter->tx_dma_failed++;
 			return -1;
@@ -3382,7 +3382,8 @@ static int e1000_tx_map(struct e1000_adapter *adapter,
 					offset,
 					size,
 					PCI_DMA_TODEVICE);
-			if (pci_dma_mapping_error(buffer_info->dma)) {
+			if (pci_dma_mapping_error(adapter->pdev,
+						  buffer_info->dma)) {
 				dev_err(&adapter->pdev->dev,
 					"TX DMA page map failed\n");
 				adapter->tx_dma_failed++;
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index e5a6e2e8454..91ec9fdc718 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -260,7 +260,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
 		dma_addr = dma_map_single(&adapter->vdev->dev, skb->data,
 				pool->buff_size, DMA_FROM_DEVICE);
 
-		if (dma_mapping_error(dma_addr))
+		if (dma_mapping_error((&adapter->vdev->dev, dma_addr))
 			goto failure;
 
 		pool->free_map[free_index] = IBM_VETH_INVALID_MAP;
@@ -294,7 +294,7 @@ failure:
 		pool->consumer_index = pool->size - 1;
 	else
 		pool->consumer_index--;
-	if (!dma_mapping_error(dma_addr))
+	if (!dma_mapping_error((&adapter->vdev->dev, dma_addr))
 		dma_unmap_single(&adapter->vdev->dev,
 		                 pool->dma_addr[index], pool->buff_size,
 		                 DMA_FROM_DEVICE);
@@ -448,11 +448,11 @@ static void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter)
 static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 {
 	int i;
+	struct device *dev = &adapter->vdev->dev;
 
 	if(adapter->buffer_list_addr != NULL) {
-		if(!dma_mapping_error(adapter->buffer_list_dma)) {
-			dma_unmap_single(&adapter->vdev->dev,
-					adapter->buffer_list_dma, 4096,
+		if (!dma_mapping_error(dev, adapter->buffer_list_dma)) {
+			dma_unmap_single(dev, adapter->buffer_list_dma, 4096,
 					DMA_BIDIRECTIONAL);
 			adapter->buffer_list_dma = DMA_ERROR_CODE;
 		}
@@ -461,9 +461,8 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 	}
 
 	if(adapter->filter_list_addr != NULL) {
-		if(!dma_mapping_error(adapter->filter_list_dma)) {
-			dma_unmap_single(&adapter->vdev->dev,
-					adapter->filter_list_dma, 4096,
+		if (!dma_mapping_error(dev, adapter->filter_list_dma)) {
+			dma_unmap_single(dev, adapter->filter_list_dma, 4096,
 					DMA_BIDIRECTIONAL);
 			adapter->filter_list_dma = DMA_ERROR_CODE;
 		}
@@ -472,8 +471,8 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 	}
 
 	if(adapter->rx_queue.queue_addr != NULL) {
-		if(!dma_mapping_error(adapter->rx_queue.queue_dma)) {
-			dma_unmap_single(&adapter->vdev->dev,
+		if (!dma_mapping_error(dev, adapter->rx_queue.queue_dma)) {
+			dma_unmap_single(dev,
 					adapter->rx_queue.queue_dma,
 					adapter->rx_queue.queue_len,
 					DMA_BIDIRECTIONAL);
@@ -535,6 +534,7 @@ static int ibmveth_open(struct net_device *netdev)
 	int rc;
 	union ibmveth_buf_desc rxq_desc;
 	int i;
+	struct device *dev;
 
 	ibmveth_debug_printk("open starting\n");
 
@@ -563,17 +563,19 @@ static int ibmveth_open(struct net_device *netdev)
 		return -ENOMEM;
 	}
 
-	adapter->buffer_list_dma = dma_map_single(&adapter->vdev->dev,
+	dev = &adapter->vdev->dev;
+
+	adapter->buffer_list_dma = dma_map_single(dev,
 			adapter->buffer_list_addr, 4096, DMA_BIDIRECTIONAL);
-	adapter->filter_list_dma = dma_map_single(&adapter->vdev->dev,
+	adapter->filter_list_dma = dma_map_single(dev,
 			adapter->filter_list_addr, 4096, DMA_BIDIRECTIONAL);
-	adapter->rx_queue.queue_dma = dma_map_single(&adapter->vdev->dev,
+	adapter->rx_queue.queue_dma = dma_map_single(dev,
 			adapter->rx_queue.queue_addr,
 			adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL);
 
-	if((dma_mapping_error(adapter->buffer_list_dma) ) ||
-	   (dma_mapping_error(adapter->filter_list_dma)) ||
-	   (dma_mapping_error(adapter->rx_queue.queue_dma))) {
+	if ((dma_mapping_error(dev, adapter->buffer_list_dma)) ||
+	    (dma_mapping_error(dev, adapter->filter_list_dma)) ||
+	    (dma_mapping_error(dev, adapter->rx_queue.queue_dma))) {
 		ibmveth_error_printk("unable to map filter or buffer list pages\n");
 		ibmveth_cleanup(adapter);
 		napi_disable(&adapter->napi);
@@ -645,7 +647,7 @@ static int ibmveth_open(struct net_device *netdev)
 	adapter->bounce_buffer_dma =
 	    dma_map_single(&adapter->vdev->dev, adapter->bounce_buffer,
 			   netdev->mtu + IBMVETH_BUFF_OH, DMA_BIDIRECTIONAL);
-	if (dma_mapping_error(adapter->bounce_buffer_dma)) {
+	if (dma_mapping_error(dev, adapter->bounce_buffer_dma)) {
 		ibmveth_error_printk("unable to map bounce buffer\n");
 		ibmveth_cleanup(adapter);
 		napi_disable(&adapter->napi);
@@ -922,7 +924,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 		buf[1] = 0;
 	}
 
-	if (dma_mapping_error(data_dma_addr)) {
+	if (dma_mapping_error((&adapter->vdev->dev, data_dma_addr)) {
 		if (!firmware_has_feature(FW_FEATURE_CMO))
 			ibmveth_error_printk("tx: unable to map xmit buffer\n");
 		skb_copy_from_linear_data(skb, adapter->bounce_buffer,
diff --git a/drivers/net/iseries_veth.c b/drivers/net/iseries_veth.c
index b8d0639c1cd..c46864d626b 100644
--- a/drivers/net/iseries_veth.c
+++ b/drivers/net/iseries_veth.c
@@ -1128,7 +1128,7 @@ static int veth_transmit_to_one(struct sk_buff *skb, HvLpIndex rlp,
 	msg->data.addr[0] = dma_map_single(port->dev, skb->data,
 				skb->len, DMA_TO_DEVICE);
 
-	if (dma_mapping_error(msg->data.addr[0]))
+	if (dma_mapping_error(port->dev, msg->data.addr[0]))
 		goto recycle_and_drop;
 
 	msg->dev = port->dev;
@@ -1226,7 +1226,7 @@ static void veth_recycle_msg(struct veth_lpar_connection *cnx,
 		dma_address = msg->data.addr[0];
 		dma_length = msg->data.len[0];
 
-		if (!dma_mapping_error(dma_address))
+		if (!dma_mapping_error(msg->dev, dma_address))
 			dma_unmap_single(msg->dev, dma_address, dma_length,
 					DMA_TO_DEVICE);
 
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index ea3a09aaa84..7df928d3a3d 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -526,7 +526,7 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt)
 		return -ENOMEM;
 	priv->eq_table.icm_dma  = pci_map_page(dev->pdev, priv->eq_table.icm_page, 0,
 					       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
-	if (pci_dma_mapping_error(priv->eq_table.icm_dma)) {
+	if (pci_dma_mapping_error(dev->pdev, priv->eq_table.icm_dma)) {
 		__free_page(priv->eq_table.icm_page);
 		return -ENOMEM;
 	}
diff --git a/drivers/net/pasemi_mac.c b/drivers/net/pasemi_mac.c
index 993d87c9296..edc0fd58898 100644
--- a/drivers/net/pasemi_mac.c
+++ b/drivers/net/pasemi_mac.c
@@ -650,7 +650,7 @@ static void pasemi_mac_replenish_rx_ring(const struct net_device *dev,
 				     mac->bufsz - LOCAL_SKB_ALIGN,
 				     PCI_DMA_FROMDEVICE);
 
-		if (unlikely(dma_mapping_error(dma))) {
+		if (unlikely(pci_dma_mapping_error(mac->dma_pdev, dma))) {
 			dev_kfree_skb_irq(info->skb);
 			break;
 		}
@@ -1519,7 +1519,7 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev)
 	map[0] = pci_map_single(mac->dma_pdev, skb->data, skb_headlen(skb),
 				PCI_DMA_TODEVICE);
 	map_size[0] = skb_headlen(skb);
-	if (dma_mapping_error(map[0]))
+	if (pci_dma_mapping_error(mac->dma_pdev, map[0]))
 		goto out_err_nolock;
 
 	for (i = 0; i < nfrags; i++) {
@@ -1529,7 +1529,7 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev)
 					frag->page_offset, frag->size,
 					PCI_DMA_TODEVICE);
 		map_size[i+1] = frag->size;
-		if (dma_mapping_error(map[i+1])) {
+		if (pci_dma_mapping_error(mac->dma_pdev, map[i+1])) {
 			nfrags = i;
 			goto out_err_nolock;
 		}
diff --git a/drivers/net/qla3xxx.c b/drivers/net/qla3xxx.c
index e7d48a352be..e82b37bbd6c 100644
--- a/drivers/net/qla3xxx.c
+++ b/drivers/net/qla3xxx.c
@@ -328,7 +328,7 @@ static void ql_release_to_lrg_buf_free_list(struct ql3_adapter *qdev,
 					     qdev->lrg_buffer_len -
 					     QL_HEADER_SPACE,
 					     PCI_DMA_FROMDEVICE);
-			err = pci_dma_mapping_error(map);
+			err = pci_dma_mapping_error(qdev->pdev, map);
 			if(err) {
 				printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 				       qdev->ndev->name, err);
@@ -1919,7 +1919,7 @@ static int ql_populate_free_queue(struct ql3_adapter *qdev)
 						     QL_HEADER_SPACE,
 						     PCI_DMA_FROMDEVICE);
 
-				err = pci_dma_mapping_error(map);
+				err = pci_dma_mapping_error(qdev->pdev, map);
 				if(err) {
 					printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 					       qdev->ndev->name, err);
@@ -2454,7 +2454,7 @@ static int ql_send_map(struct ql3_adapter *qdev,
 	 */
 	map = pci_map_single(qdev->pdev, skb->data, len, PCI_DMA_TODEVICE);
 
-	err = pci_dma_mapping_error(map);
+	err = pci_dma_mapping_error(qdev->pdev, map);
 	if(err) {
 		printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 		       qdev->ndev->name, err);
@@ -2487,7 +2487,7 @@ static int ql_send_map(struct ql3_adapter *qdev,
 						     sizeof(struct oal),
 						     PCI_DMA_TODEVICE);
 
-				err = pci_dma_mapping_error(map);
+				err = pci_dma_mapping_error(qdev->pdev, map);
 				if(err) {
 
 					printk(KERN_ERR "%s: PCI mapping outbound address list with error: %d\n",
@@ -2514,7 +2514,7 @@ static int ql_send_map(struct ql3_adapter *qdev,
 					 frag->page_offset, frag->size,
 					 PCI_DMA_TODEVICE);
 
-			err = pci_dma_mapping_error(map);
+			err = pci_dma_mapping_error(qdev->pdev, map);
 			if(err) {
 				printk(KERN_ERR "%s: PCI mapping frags failed with error: %d\n",
 				       qdev->ndev->name, err);
@@ -2916,7 +2916,7 @@ static int ql_alloc_large_buffers(struct ql3_adapter *qdev)
 					     QL_HEADER_SPACE,
 					     PCI_DMA_FROMDEVICE);
 
-			err = pci_dma_mapping_error(map);
+			err = pci_dma_mapping_error(qdev->pdev, map);
 			if(err) {
 				printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 				       qdev->ndev->name, err);
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index 9dae40ccf04..86d77d05190 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -2512,8 +2512,8 @@ static void stop_nic(struct s2io_nic *nic)
  *   Return Value:
  *  SUCCESS on success or an appropriate -ve value on failure.
  */
-
-static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
+static int fill_rx_buffers(struct s2io_nic *nic, struct ring_info *ring,
+				int from_card_up)
 {
 	struct sk_buff *skb;
 	struct RxD_t *rxdp;
@@ -2602,7 +2602,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 			rxdp1->Buffer0_ptr = pci_map_single
 			    (ring->pdev, skb->data, size - NET_IP_ALIGN,
 				PCI_DMA_FROMDEVICE);
-			if(pci_dma_mapping_error(rxdp1->Buffer0_ptr))
+			if (pci_dma_mapping_error(nic->pdev,
+						rxdp1->Buffer0_ptr))
 				goto pci_map_failed;
 
 			rxdp->Control_2 =
@@ -2636,7 +2637,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 				rxdp3->Buffer0_ptr =
 				   pci_map_single(ring->pdev, ba->ba_0,
 					BUF0_LEN, PCI_DMA_FROMDEVICE);
-				if (pci_dma_mapping_error(rxdp3->Buffer0_ptr))
+			if (pci_dma_mapping_error(nic->pdev,
+						rxdp3->Buffer0_ptr))
 					goto pci_map_failed;
 			} else
 				pci_dma_sync_single_for_device(ring->pdev,
@@ -2655,7 +2657,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 				(ring->pdev, skb->data, ring->mtu + 4,
 						PCI_DMA_FROMDEVICE);
 
-				if (pci_dma_mapping_error(rxdp3->Buffer2_ptr))
+				if (pci_dma_mapping_error(nic->pdev,
+							rxdp3->Buffer2_ptr))
 					goto pci_map_failed;
 
 				if (from_card_up) {
@@ -2664,8 +2667,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 						ba->ba_1, BUF1_LEN,
 						PCI_DMA_FROMDEVICE);
 
-					if (pci_dma_mapping_error
-						(rxdp3->Buffer1_ptr)) {
+					if (pci_dma_mapping_error(nic->pdev,
+						rxdp3->Buffer1_ptr)) {
 						pci_unmap_single
 							(ring->pdev,
 						    (dma_addr_t)(unsigned long)
@@ -2806,9 +2809,9 @@ static void free_rx_buffers(struct s2io_nic *sp)
 	}
 }
 
-static int s2io_chk_rx_buffers(struct ring_info *ring)
+static int s2io_chk_rx_buffers(struct s2io_nic *nic, struct ring_info *ring)
 {
-	if (fill_rx_buffers(ring, 0) == -ENOMEM) {
+	if (fill_rx_buffers(nic, ring, 0) == -ENOMEM) {
 		DBG_PRINT(INFO_DBG, "%s:Out of memory", ring->dev->name);
 		DBG_PRINT(INFO_DBG, " in Rx Intr!!\n");
 	}
@@ -2848,7 +2851,7 @@ static int s2io_poll_msix(struct napi_struct *napi, int budget)
 		return 0;
 
 	pkts_processed = rx_intr_handler(ring, budget);
-	s2io_chk_rx_buffers(ring);
+	s2io_chk_rx_buffers(nic, ring);
 
 	if (pkts_processed < budget_org) {
 		netif_rx_complete(dev, napi);
@@ -2882,7 +2885,7 @@ static int s2io_poll_inta(struct napi_struct *napi, int budget)
 	for (i = 0; i < config->rx_ring_num; i++) {
 		ring = &mac_control->rings[i];
 		ring_pkts_processed = rx_intr_handler(ring, budget);
-		s2io_chk_rx_buffers(ring);
+		s2io_chk_rx_buffers(nic, ring);
 		pkts_processed += ring_pkts_processed;
 		budget -= ring_pkts_processed;
 		if (budget <= 0)
@@ -2939,7 +2942,8 @@ static void s2io_netpoll(struct net_device *dev)
 		rx_intr_handler(&mac_control->rings[i], 0);
 
 	for (i = 0; i < config->rx_ring_num; i++) {
-		if (fill_rx_buffers(&mac_control->rings[i], 0) == -ENOMEM) {
+		if (fill_rx_buffers(nic, &mac_control->rings[i], 0) ==
+				-ENOMEM) {
 			DBG_PRINT(INFO_DBG, "%s:Out of memory", dev->name);
 			DBG_PRINT(INFO_DBG, " in Rx Netpoll!!\n");
 			break;
@@ -4235,14 +4239,14 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 		txdp->Buffer_Pointer = pci_map_single(sp->pdev,
 					fifo->ufo_in_band_v,
 					sizeof(u64), PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(txdp->Buffer_Pointer))
+		if (pci_dma_mapping_error(sp->pdev, txdp->Buffer_Pointer))
 			goto pci_map_failed;
 		txdp++;
 	}
 
 	txdp->Buffer_Pointer = pci_map_single
 	    (sp->pdev, skb->data, frg_len, PCI_DMA_TODEVICE);
-	if (pci_dma_mapping_error(txdp->Buffer_Pointer))
+	if (pci_dma_mapping_error(sp->pdev, txdp->Buffer_Pointer))
 		goto pci_map_failed;
 
 	txdp->Host_Control = (unsigned long) skb;
@@ -4345,7 +4349,7 @@ static irqreturn_t s2io_msix_ring_handle(int irq, void *dev_id)
 		netif_rx_schedule(dev, &ring->napi);
 	} else {
 		rx_intr_handler(ring, 0);
-		s2io_chk_rx_buffers(ring);
+		s2io_chk_rx_buffers(sp, ring);
 	}
 
 	return IRQ_HANDLED;
@@ -4826,7 +4830,7 @@ static irqreturn_t s2io_isr(int irq, void *dev_id)
 		 */
 		if (!config->napi) {
 			for (i = 0; i < config->rx_ring_num; i++)
-				s2io_chk_rx_buffers(&mac_control->rings[i]);
+				s2io_chk_rx_buffers(sp, &mac_control->rings[i]);
 		}
 		writeq(sp->general_int_mask, &bar0->general_int_mask);
 		readl(&bar0->general_int_status);
@@ -6859,7 +6863,7 @@ static int set_rxd_buffer_pointer(struct s2io_nic *sp, struct RxD_t *rxdp,
 				pci_map_single( sp->pdev, (*skb)->data,
 					size - NET_IP_ALIGN,
 					PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp1->Buffer0_ptr))
+			if (pci_dma_mapping_error(sp->pdev, rxdp1->Buffer0_ptr))
 				goto memalloc_failed;
 			rxdp->Host_Control = (unsigned long) (*skb);
 		}
@@ -6886,12 +6890,13 @@ static int set_rxd_buffer_pointer(struct s2io_nic *sp, struct RxD_t *rxdp,
 				pci_map_single(sp->pdev, (*skb)->data,
 					       dev->mtu + 4,
 					       PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp3->Buffer2_ptr))
+			if (pci_dma_mapping_error(sp->pdev, rxdp3->Buffer2_ptr))
 				goto memalloc_failed;
 			rxdp3->Buffer0_ptr = *temp0 =
 				pci_map_single( sp->pdev, ba->ba_0, BUF0_LEN,
 						PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp3->Buffer0_ptr)) {
+			if (pci_dma_mapping_error(sp->pdev,
+						rxdp3->Buffer0_ptr)) {
 				pci_unmap_single (sp->pdev,
 					(dma_addr_t)rxdp3->Buffer2_ptr,
 					dev->mtu + 4, PCI_DMA_FROMDEVICE);
@@ -6903,7 +6908,8 @@ static int set_rxd_buffer_pointer(struct s2io_nic *sp, struct RxD_t *rxdp,
 			rxdp3->Buffer1_ptr = *temp1 =
 				pci_map_single(sp->pdev, ba->ba_1, BUF1_LEN,
 						PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp3->Buffer1_ptr)) {
+			if (pci_dma_mapping_error(sp->pdev,
+						rxdp3->Buffer1_ptr)) {
 				pci_unmap_single (sp->pdev,
 					(dma_addr_t)rxdp3->Buffer0_ptr,
 					BUF0_LEN, PCI_DMA_FROMDEVICE);
@@ -7187,7 +7193,7 @@ static int s2io_card_up(struct s2io_nic * sp)
 
 	for (i = 0; i < config->rx_ring_num; i++) {
 		mac_control->rings[i].mtu = dev->mtu;
-		ret = fill_rx_buffers(&mac_control->rings[i], 1);
+		ret = fill_rx_buffers(sp, &mac_control->rings[i], 1);
 		if (ret) {
 			DBG_PRINT(ERR_DBG, "%s: Out of memory in Open\n",
 				  dev->name);
diff --git a/drivers/net/sfc/rx.c b/drivers/net/sfc/rx.c
index 601b001437c..0d27dd39bc0 100644
--- a/drivers/net/sfc/rx.c
+++ b/drivers/net/sfc/rx.c
@@ -233,7 +233,7 @@ static inline int efx_init_rx_buffer_skb(struct efx_rx_queue *rx_queue,
 					  rx_buf->data, rx_buf->len,
 					  PCI_DMA_FROMDEVICE);
 
-	if (unlikely(pci_dma_mapping_error(rx_buf->dma_addr))) {
+	if (unlikely(pci_dma_mapping_error(efx->pci_dev, rx_buf->dma_addr))) {
 		dev_kfree_skb_any(rx_buf->skb);
 		rx_buf->skb = NULL;
 		return -EIO;
@@ -275,7 +275,7 @@ static inline int efx_init_rx_buffer_page(struct efx_rx_queue *rx_queue,
 					0, efx_rx_buf_size(efx),
 					PCI_DMA_FROMDEVICE);
 
-		if (unlikely(pci_dma_mapping_error(dma_addr))) {
+		if (unlikely(pci_dma_mapping_error(efx->pci_dev, dma_addr))) {
 			__free_pages(rx_buf->page, efx->rx_buffer_order);
 			rx_buf->page = NULL;
 			return -EIO;
diff --git a/drivers/net/sfc/tx.c b/drivers/net/sfc/tx.c
index 5cdd082ab8f..5e8374ab28e 100644
--- a/drivers/net/sfc/tx.c
+++ b/drivers/net/sfc/tx.c
@@ -172,7 +172,7 @@ static inline int efx_enqueue_skb(struct efx_tx_queue *tx_queue,
 
 	/* Process all fragments */
 	while (1) {
-		if (unlikely(pci_dma_mapping_error(dma_addr)))
+		if (unlikely(pci_dma_mapping_error(pci_dev, dma_addr)))
 			goto pci_err;
 
 		/* Store fields for marking in the per-fragment final
@@ -661,7 +661,8 @@ efx_tsoh_heap_alloc(struct efx_tx_queue *tx_queue, size_t header_len)
 	tsoh->dma_addr = pci_map_single(tx_queue->efx->pci_dev,
 					TSOH_BUFFER(tsoh), header_len,
 					PCI_DMA_TODEVICE);
-	if (unlikely(pci_dma_mapping_error(tsoh->dma_addr))) {
+	if (unlikely(pci_dma_mapping_error(tx_queue->efx->pci_dev,
+					   tsoh->dma_addr))) {
 		kfree(tsoh);
 		return NULL;
 	}
@@ -863,7 +864,7 @@ static inline int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
 
 	st->ifc.unmap_addr = pci_map_page(efx->pci_dev, page, page_off,
 					  len, PCI_DMA_TODEVICE);
-	if (likely(!pci_dma_mapping_error(st->ifc.unmap_addr))) {
+	if (likely(!pci_dma_mapping_error(efx->pci_dev, st->ifc.unmap_addr))) {
 		st->ifc.unmap_len = len;
 		st->ifc.len = len;
 		st->ifc.dma_addr = st->ifc.unmap_addr;
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
index 00aa0b108cb..b6435d0d71f 100644
--- a/drivers/net/spider_net.c
+++ b/drivers/net/spider_net.c
@@ -452,7 +452,7 @@ spider_net_prepare_rx_descr(struct spider_net_card *card,
 	/* iommu-map the skb */
 	buf = pci_map_single(card->pdev, descr->skb->data,
 			SPIDER_NET_MAX_FRAME, PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(buf)) {
+	if (pci_dma_mapping_error(card->pdev, buf)) {
 		dev_kfree_skb_any(descr->skb);
 		descr->skb = NULL;
 		if (netif_msg_rx_err(card) && net_ratelimit())
@@ -691,7 +691,7 @@ spider_net_prepare_tx_descr(struct spider_net_card *card,
 	unsigned long flags;
 
 	buf = pci_map_single(card->pdev, skb->data, skb->len, PCI_DMA_TODEVICE);
-	if (pci_dma_mapping_error(buf)) {
+	if (pci_dma_mapping_error(card->pdev, buf)) {
 		if (netif_msg_tx_err(card) && net_ratelimit())
 			dev_err(&card->netdev->dev, "could not iommu-map packet (%p, %i). "
 				  "Dropping packet\n", skb->data, skb->len);
diff --git a/drivers/net/tc35815.c b/drivers/net/tc35815.c
index a645e5028c1..8487ace9d2e 100644
--- a/drivers/net/tc35815.c
+++ b/drivers/net/tc35815.c
@@ -506,7 +506,7 @@ static void *alloc_rxbuf_page(struct pci_dev *hwdev, dma_addr_t *dma_handle)
 		return NULL;
 	*dma_handle = pci_map_single(hwdev, buf, PAGE_SIZE,
 				     PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(*dma_handle)) {
+	if (pci_dma_mapping_error(hwdev, *dma_handle)) {
 		free_page((unsigned long)buf);
 		return NULL;
 	}
@@ -536,7 +536,7 @@ static struct sk_buff *alloc_rxbuf_skb(struct net_device *dev,
 		return NULL;
 	*dma_handle = pci_map_single(hwdev, skb->data, RX_BUF_SIZE,
 				     PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(*dma_handle)) {
+	if (pci_dma_mapping_error(hwdev, *dma_handle)) {
 		dev_kfree_skb_any(skb);
 		return NULL;
 	}
diff --git a/drivers/net/wireless/ath5k/base.c b/drivers/net/wireless/ath5k/base.c
index 217d506527a..d9769c52734 100644
--- a/drivers/net/wireless/ath5k/base.c
+++ b/drivers/net/wireless/ath5k/base.c
@@ -1166,7 +1166,7 @@ ath5k_rxbuf_setup(struct ath5k_softc *sc, struct ath5k_buf *bf)
 		bf->skb = skb;
 		bf->skbaddr = pci_map_single(sc->pdev,
 			skb->data, sc->rxbufsize, PCI_DMA_FROMDEVICE);
-		if (unlikely(pci_dma_mapping_error(bf->skbaddr))) {
+		if (unlikely(pci_dma_mapping_error(sc->pdev, bf->skbaddr))) {
 			ATH5K_ERR(sc, "%s: DMA mapping failed\n", __func__);
 			dev_kfree_skb(skb);
 			bf->skb = NULL;
@@ -1918,7 +1918,7 @@ ath5k_beacon_setup(struct ath5k_softc *sc, struct ath5k_buf *bf)
 	ATH5K_DBG(sc, ATH5K_DEBUG_BEACON, "skb %p [data %p len %u] "
 			"skbaddr %llx\n", skb, skb->data, skb->len,
 			(unsigned long long)bf->skbaddr);
-	if (pci_dma_mapping_error(bf->skbaddr)) {
+	if (pci_dma_mapping_error(sc->pdev, bf->skbaddr)) {
 		ATH5K_ERR(sc, "beacon DMA mapping failed\n");
 		return -EIO;
 	}
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index c4a7c06793c..61f8fdea2d9 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3525,7 +3525,7 @@ static int ibmvfc_init_crq(struct ibmvfc_host *vhost)
 	crq->msg_token = dma_map_single(dev, crq->msgs,
 					PAGE_SIZE, DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(crq->msg_token))
+	if (dma_mapping_error(dev, crq->msg_token))
 		goto map_failed;
 
 	retrc = rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address,
@@ -3618,7 +3618,7 @@ static int ibmvfc_alloc_mem(struct ibmvfc_host *vhost)
 					    async_q->size * sizeof(*async_q->msgs),
 					    DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(async_q->msg_token)) {
+	if (dma_mapping_error(dev, async_q->msg_token)) {
 		dev_err(dev, "Failed to map async queue\n");
 		goto free_async_crq;
 	}
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 20000ec79b0..6b24b9cdb04 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -859,7 +859,7 @@ static void send_mad_adapter_info(struct ibmvscsi_host_data *hostdata)
 					    sizeof(hostdata->madapter_info),
 					    DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(req->buffer)) {
+	if (dma_mapping_error(hostdata->dev, req->buffer)) {
 		if (!firmware_has_feature(FW_FEATURE_CMO))
 			dev_err(hostdata->dev,
 			        "Unable to map request_buffer for "
@@ -1407,7 +1407,7 @@ static int ibmvscsi_do_host_config(struct ibmvscsi_host_data *hostdata,
 						    length,
 						    DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(host_config->buffer)) {
+	if (dma_mapping_error(hostdata->dev, host_config->buffer)) {
 		if (!firmware_has_feature(FW_FEATURE_CMO))
 			dev_err(hostdata->dev,
 			        "dma_mapping error getting host config\n");
diff --git a/drivers/scsi/ibmvscsi/ibmvstgt.c b/drivers/scsi/ibmvscsi/ibmvstgt.c
index 3b9514c8f1f..2e13ec00172 100644
--- a/drivers/scsi/ibmvscsi/ibmvstgt.c
+++ b/drivers/scsi/ibmvscsi/ibmvstgt.c
@@ -564,7 +564,7 @@ static int crq_queue_create(struct crq_queue *queue, struct srp_target *target)
 					  queue->size * sizeof(*queue->msgs),
 					  DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(queue->msg_token))
+	if (dma_mapping_error(target->dev, queue->msg_token))
 		goto map_failed;
 
 	err = h_reg_crq(vport->dma_dev->unit_address, queue->msg_token,
diff --git a/drivers/scsi/ibmvscsi/rpa_vscsi.c b/drivers/scsi/ibmvscsi/rpa_vscsi.c
index 182146100dc..462a8574dad 100644
--- a/drivers/scsi/ibmvscsi/rpa_vscsi.c
+++ b/drivers/scsi/ibmvscsi/rpa_vscsi.c
@@ -253,7 +253,7 @@ static int rpavscsi_init_crq_queue(struct crq_queue *queue,
 					  queue->size * sizeof(*queue->msgs),
 					  DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(queue->msg_token))
+	if (dma_mapping_error(hostdata->dev, queue->msg_token))
 		goto map_failed;
 
 	gather_partition_info();
diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c
index e81d59d7891..0c716566085 100644
--- a/drivers/spi/atmel_spi.c
+++ b/drivers/spi/atmel_spi.c
@@ -313,14 +313,14 @@ atmel_spi_dma_map_xfer(struct atmel_spi *as, struct spi_transfer *xfer)
 		xfer->tx_dma = dma_map_single(dev,
 				(void *) xfer->tx_buf, xfer->len,
 				DMA_TO_DEVICE);
-		if (dma_mapping_error(xfer->tx_dma))
+		if (dma_mapping_error(dev, xfer->tx_dma))
 			return -ENOMEM;
 	}
 	if (xfer->rx_buf) {
 		xfer->rx_dma = dma_map_single(dev,
 				xfer->rx_buf, xfer->len,
 				DMA_FROM_DEVICE);
-		if (dma_mapping_error(xfer->rx_dma)) {
+		if (dma_mapping_error(dev, xfer->rx_dma)) {
 			if (xfer->tx_buf)
 				dma_unmap_single(dev,
 						xfer->tx_dma, xfer->len,
diff --git a/drivers/spi/au1550_spi.c b/drivers/spi/au1550_spi.c
index 9149689c79d..87b73e0169c 100644
--- a/drivers/spi/au1550_spi.c
+++ b/drivers/spi/au1550_spi.c
@@ -334,7 +334,7 @@ static int au1550_spi_dma_rxtmp_alloc(struct au1550_spi *hw, unsigned size)
 	hw->dma_rx_tmpbuf_size = size;
 	hw->dma_rx_tmpbuf_addr = dma_map_single(hw->dev, hw->dma_rx_tmpbuf,
 			size, DMA_FROM_DEVICE);
-	if (dma_mapping_error(hw->dma_rx_tmpbuf_addr)) {
+	if (dma_mapping_error(hw->dev, hw->dma_rx_tmpbuf_addr)) {
 		kfree(hw->dma_rx_tmpbuf);
 		hw->dma_rx_tmpbuf = 0;
 		hw->dma_rx_tmpbuf_size = 0;
@@ -378,7 +378,7 @@ static int au1550_spi_dma_txrxb(struct spi_device *spi, struct spi_transfer *t)
 			dma_rx_addr = dma_map_single(hw->dev,
 					(void *)t->rx_buf,
 					t->len, DMA_FROM_DEVICE);
-			if (dma_mapping_error(dma_rx_addr))
+			if (dma_mapping_error(hw->dev, dma_rx_addr))
 				dev_err(hw->dev, "rx dma map error\n");
 		}
 	} else {
@@ -401,7 +401,7 @@ static int au1550_spi_dma_txrxb(struct spi_device *spi, struct spi_transfer *t)
 			dma_tx_addr = dma_map_single(hw->dev,
 					(void *)t->tx_buf,
 					t->len, DMA_TO_DEVICE);
-			if (dma_mapping_error(dma_tx_addr))
+			if (dma_mapping_error(hw->dev, dma_tx_addr))
 				dev_err(hw->dev, "tx dma map error\n");
 		}
 	} else {
diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c
index b1cc148036c..f6f987bb71c 100644
--- a/drivers/spi/omap2_mcspi.c
+++ b/drivers/spi/omap2_mcspi.c
@@ -836,7 +836,7 @@ static int omap2_mcspi_transfer(struct spi_device *spi, struct spi_message *m)
 		if (tx_buf != NULL) {
 			t->tx_dma = dma_map_single(&spi->dev, (void *) tx_buf,
 					len, DMA_TO_DEVICE);
-			if (dma_mapping_error(t->tx_dma)) {
+			if (dma_mapping_error(&spi->dev, t->tx_dma)) {
 				dev_dbg(&spi->dev, "dma %cX %d bytes error\n",
 						'T', len);
 				return -EINVAL;
@@ -845,7 +845,7 @@ static int omap2_mcspi_transfer(struct spi_device *spi, struct spi_message *m)
 		if (rx_buf != NULL) {
 			t->rx_dma = dma_map_single(&spi->dev, rx_buf, t->len,
 					DMA_FROM_DEVICE);
-			if (dma_mapping_error(t->rx_dma)) {
+			if (dma_mapping_error(&spi->dev, t->rx_dma)) {
 				dev_dbg(&spi->dev, "dma %cX %d bytes error\n",
 						'R', len);
 				if (tx_buf != NULL)
diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c
index 0c452c46ab0..067299d6d19 100644
--- a/drivers/spi/pxa2xx_spi.c
+++ b/drivers/spi/pxa2xx_spi.c
@@ -353,7 +353,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 	drv_data->rx_dma = dma_map_single(dev, drv_data->rx,
 						drv_data->rx_map_len,
 						DMA_FROM_DEVICE);
-	if (dma_mapping_error(drv_data->rx_dma))
+	if (dma_mapping_error(dev, drv_data->rx_dma))
 		return 0;
 
 	/* Stream map the tx buffer */
@@ -361,7 +361,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 						drv_data->tx_map_len,
 						DMA_TO_DEVICE);
 
-	if (dma_mapping_error(drv_data->tx_dma)) {
+	if (dma_mapping_error(dev, drv_data->tx_dma)) {
 		dma_unmap_single(dev, drv_data->rx_dma,
 					drv_data->rx_map_len, DMA_FROM_DEVICE);
 		return 0;
diff --git a/drivers/spi/spi_imx.c b/drivers/spi/spi_imx.c
index 54ac7bea5f8..6fb77fcc497 100644
--- a/drivers/spi/spi_imx.c
+++ b/drivers/spi/spi_imx.c
@@ -491,7 +491,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 							buf,
 							drv_data->tx_map_len,
 							DMA_TO_DEVICE);
-			if (dma_mapping_error(drv_data->tx_dma))
+			if (dma_mapping_error(dev, drv_data->tx_dma))
 				return -1;
 
 			drv_data->tx_dma_needs_unmap = 1;
@@ -516,7 +516,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 					buf,
 					drv_data->len,
 					DMA_FROM_DEVICE);
-		if (dma_mapping_error(drv_data->rx_dma))
+		if (dma_mapping_error(dev, drv_data->rx_dma))
 			return -1;
 		drv_data->rx_dma_needs_unmap = 1;
 	}
@@ -534,7 +534,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 					buf,
 					drv_data->tx_map_len,
 					DMA_TO_DEVICE);
-	if (dma_mapping_error(drv_data->tx_dma)) {
+	if (dma_mapping_error(dev, drv_data->tx_dma)) {
 		if (drv_data->rx_dma) {
 			dma_unmap_single(dev,
 					drv_data->rx_dma,
diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h
index db351d1296f..a5801ae02e4 100644
--- a/include/asm-alpha/dma-mapping.h
+++ b/include/asm-alpha/dma-mapping.h
@@ -24,8 +24,8 @@
 		pci_unmap_sg(alpha_gendev_to_pci(dev), sg, nents, dir)
 #define dma_supported(dev, mask)			\
 		pci_dma_supported(alpha_gendev_to_pci(dev), mask)
-#define dma_mapping_error(addr)				\
-		pci_dma_mapping_error(addr)
+#define dma_mapping_error(dev, addr)				\
+		pci_dma_mapping_error(alpha_gendev_to_pci(dev), addr)
 
 #else	/* no PCI - no IOMMU. */
 
@@ -45,7 +45,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 #define dma_unmap_page(dev, addr, size, dir)	((void)0)
 #define dma_unmap_sg(dev, sg, nents, dir)	((void)0)
 
-#define dma_mapping_error(addr)  (0)
+#define dma_mapping_error(dev, addr)  (0)
 
 #endif	/* !CONFIG_PCI */
 
diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h
index d31fd49ff79..2a14302c17a 100644
--- a/include/asm-alpha/pci.h
+++ b/include/asm-alpha/pci.h
@@ -106,7 +106,7 @@ extern dma_addr_t pci_map_page(struct pci_dev *, struct page *,
 /* Test for pci_map_single or pci_map_page having generated an error.  */
 
 static inline int
-pci_dma_mapping_error(dma_addr_t dma_addr)
+pci_dma_mapping_error(struct pci_dev *pdev, dma_addr_t dma_addr)
 {
 	return dma_addr == 0;
 }
diff --git a/include/asm-arm/dma-mapping.h b/include/asm-arm/dma-mapping.h
index e99406a7bec..f41335ba633 100644
--- a/include/asm-arm/dma-mapping.h
+++ b/include/asm-arm/dma-mapping.h
@@ -56,7 +56,7 @@ static inline int dma_is_consistent(struct device *dev, dma_addr_t handle)
 /*
  * DMA errors are defined by all-bits-set in the DMA address.
  */
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return dma_addr == ~0;
 }
diff --git a/include/asm-avr32/dma-mapping.h b/include/asm-avr32/dma-mapping.h
index 57dc672bab8..0399359ab5d 100644
--- a/include/asm-avr32/dma-mapping.h
+++ b/include/asm-avr32/dma-mapping.h
@@ -35,7 +35,7 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask)
 /*
  * dma_map_single can't fail as it is implemented now.
  */
-static inline int dma_mapping_error(dma_addr_t addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t addr)
 {
 	return 0;
 }
diff --git a/include/asm-cris/dma-mapping.h b/include/asm-cris/dma-mapping.h
index edc8d1bfaae..cb2fb25ff8d 100644
--- a/include/asm-cris/dma-mapping.h
+++ b/include/asm-cris/dma-mapping.h
@@ -120,7 +120,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
 }
 
 static inline int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h
index 2e8966ca030..b2898877c07 100644
--- a/include/asm-frv/dma-mapping.h
+++ b/include/asm-frv/dma-mapping.h
@@ -126,7 +126,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nele
 }
 
 static inline
-int dma_mapping_error(dma_addr_t dma_addr)
+int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/asm-generic/dma-mapping-broken.h b/include/asm-generic/dma-mapping-broken.h
index e2468f894d2..82cd0cb1c3f 100644
--- a/include/asm-generic/dma-mapping-broken.h
+++ b/include/asm-generic/dma-mapping-broken.h
@@ -61,7 +61,7 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
 #define dma_sync_sg_for_device dma_sync_sg_for_cpu
 
 extern int
-dma_mapping_error(dma_addr_t dma_addr);
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 
 extern int
 dma_supported(struct device *dev, u64 mask);
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 783ab9944d7..189486c3f92 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -144,9 +144,9 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
 }
 
 static inline int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
-	return pci_dma_mapping_error(dma_addr);
+	return pci_dma_mapping_error(to_pci_dev(dev), dma_addr);
 }
 
 
diff --git a/include/asm-generic/pci-dma-compat.h b/include/asm-generic/pci-dma-compat.h
index 25c10e96b2b..37b3706226e 100644
--- a/include/asm-generic/pci-dma-compat.h
+++ b/include/asm-generic/pci-dma-compat.h
@@ -99,9 +99,9 @@ pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg,
 }
 
 static inline int
-pci_dma_mapping_error(dma_addr_t dma_addr)
+pci_dma_mapping_error(struct pci_dev *pdev, dma_addr_t dma_addr)
 {
-	return dma_mapping_error(dma_addr);
+	return dma_mapping_error(&pdev->dev, dma_addr);
 }
 
 #endif
diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
index 0721a5e8271..a6d50c77b6b 100644
--- a/include/asm-ia64/machvec.h
+++ b/include/asm-ia64/machvec.h
@@ -54,7 +54,7 @@ typedef void ia64_mv_dma_sync_single_for_cpu (struct device *, dma_addr_t, size_
 typedef void ia64_mv_dma_sync_sg_for_cpu (struct device *, struct scatterlist *, int, int);
 typedef void ia64_mv_dma_sync_single_for_device (struct device *, dma_addr_t, size_t, int);
 typedef void ia64_mv_dma_sync_sg_for_device (struct device *, struct scatterlist *, int, int);
-typedef int ia64_mv_dma_mapping_error (dma_addr_t dma_addr);
+typedef int ia64_mv_dma_mapping_error(struct device *, dma_addr_t dma_addr);
 typedef int ia64_mv_dma_supported (struct device *, u64);
 
 typedef dma_addr_t ia64_mv_dma_map_single_attrs (struct device *, void *, size_t, int, struct dma_attrs *);
diff --git a/include/asm-m68k/dma-mapping.h b/include/asm-m68k/dma-mapping.h
index a26cdeb46a5..91f7944333d 100644
--- a/include/asm-m68k/dma-mapping.h
+++ b/include/asm-m68k/dma-mapping.h
@@ -84,7 +84,7 @@ static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *s
 {
 }
 
-static inline int dma_mapping_error(dma_addr_t handle)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t handle)
 {
 	return 0;
 }
diff --git a/include/asm-mips/dma-mapping.h b/include/asm-mips/dma-mapping.h
index 230b3f1b69b..c64afb40cd0 100644
--- a/include/asm-mips/dma-mapping.h
+++ b/include/asm-mips/dma-mapping.h
@@ -42,7 +42,7 @@ extern void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	int nelems, enum dma_data_direction direction);
 extern void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	int nelems, enum dma_data_direction direction);
-extern int dma_mapping_error(dma_addr_t dma_addr);
+extern int dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 extern int dma_supported(struct device *dev, u64 mask);
 
 static inline int
diff --git a/include/asm-mn10300/dma-mapping.h b/include/asm-mn10300/dma-mapping.h
index 7c882fca9ec..ccae8f6c632 100644
--- a/include/asm-mn10300/dma-mapping.h
+++ b/include/asm-mn10300/dma-mapping.h
@@ -182,7 +182,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 }
 
 static inline
-int dma_mapping_error(dma_addr_t dma_addr)
+int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/asm-parisc/dma-mapping.h b/include/asm-parisc/dma-mapping.h
index c6c0e9ff6bd..53af696f23d 100644
--- a/include/asm-parisc/dma-mapping.h
+++ b/include/asm-parisc/dma-mapping.h
@@ -248,6 +248,6 @@ void * sba_get_iommu(struct parisc_device *dev);
 #endif
 
 /* At the moment, we panic on error for IOMMU resource exaustion */
-#define dma_mapping_error(x)	0
+#define dma_mapping_error(dev, x)	0
 
 #endif
diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h
index 74c54978098..c7ca45f97dd 100644
--- a/include/asm-powerpc/dma-mapping.h
+++ b/include/asm-powerpc/dma-mapping.h
@@ -415,7 +415,7 @@ static inline void dma_sync_sg_for_device(struct device *dev,
 		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
 }
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 #ifdef CONFIG_PPC64
 	return (dma_addr == DMA_ERROR_CODE);
diff --git a/include/asm-sh/dma-mapping.h b/include/asm-sh/dma-mapping.h
index 22cc419389f..6c0b8a2de14 100644
--- a/include/asm-sh/dma-mapping.h
+++ b/include/asm-sh/dma-mapping.h
@@ -171,7 +171,7 @@ static inline int dma_get_cache_alignment(void)
 	return L1_CACHE_BYTES;
 }
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return dma_addr == 0;
 }
diff --git a/include/asm-sparc/dma-mapping_64.h b/include/asm-sparc/dma-mapping_64.h
index 38cbec76a33..bfa64f9702d 100644
--- a/include/asm-sparc/dma-mapping_64.h
+++ b/include/asm-sparc/dma-mapping_64.h
@@ -135,7 +135,7 @@ static inline void dma_sync_sg_for_device(struct device *dev,
 	/* No flushing needed to sync cpu writes to the device.  */
 }
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return (dma_addr == DMA_ERROR_CODE);
 }
diff --git a/include/asm-sparc/pci_32.h b/include/asm-sparc/pci_32.h
index b93b6c79e08..0ee949d220c 100644
--- a/include/asm-sparc/pci_32.h
+++ b/include/asm-sparc/pci_32.h
@@ -154,7 +154,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 
 #define PCI_DMA_ERROR_CODE      (~(dma_addr_t)0x0)
 
-static inline int pci_dma_mapping_error(dma_addr_t dma_addr)
+static inline int pci_dma_mapping_error(struct pci_dev *pdev,
+					dma_addr_t dma_addr)
 {
         return (dma_addr == PCI_DMA_ERROR_CODE);
 }
diff --git a/include/asm-sparc/pci_64.h b/include/asm-sparc/pci_64.h
index f59f2571295..4f79a54948f 100644
--- a/include/asm-sparc/pci_64.h
+++ b/include/asm-sparc/pci_64.h
@@ -140,9 +140,10 @@ extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
 #define PCI64_REQUIRED_MASK	(~(dma64_addr_t)0)
 #define PCI64_ADDR_BASE		0xfffc000000000000UL
 
-static inline int pci_dma_mapping_error(dma_addr_t dma_addr)
+static inline int pci_dma_mapping_error(struct pci_dev *pdev,
+					dma_addr_t dma_addr)
 {
-	return dma_mapping_error(dma_addr);
+	return dma_mapping_error(&pdev->dev, dma_addr);
 }
 
 #ifdef CONFIG_PCI
diff --git a/include/asm-x86/device.h b/include/asm-x86/device.h
index 87a715367a1..3c034f48fdb 100644
--- a/include/asm-x86/device.h
+++ b/include/asm-x86/device.h
@@ -5,6 +5,9 @@ struct dev_archdata {
 #ifdef CONFIG_ACPI
 	void	*acpi_handle;
 #endif
+#ifdef CONFIG_X86_64
+struct dma_mapping_ops *dma_ops;
+#endif
 #ifdef CONFIG_DMAR
 	void *iommu; /* hook for IOMMU specific extension */
 #endif
diff --git a/include/asm-x86/dma-mapping.h b/include/asm-x86/dma-mapping.h
index c2ddd3d1b88..0eaa9bf6011 100644
--- a/include/asm-x86/dma-mapping.h
+++ b/include/asm-x86/dma-mapping.h
@@ -17,7 +17,8 @@ extern int panic_on_overflow;
 extern int force_iommu;
 
 struct dma_mapping_ops {
-	int             (*mapping_error)(dma_addr_t dma_addr);
+	int             (*mapping_error)(struct device *dev,
+					 dma_addr_t dma_addr);
 	void*           (*alloc_coherent)(struct device *dev, size_t size,
 				dma_addr_t *dma_handle, gfp_t gfp);
 	void            (*free_coherent)(struct device *dev, size_t size,
@@ -56,14 +57,32 @@ struct dma_mapping_ops {
 	int		is_phys;
 };
 
-extern const struct dma_mapping_ops *dma_ops;
+extern struct dma_mapping_ops *dma_ops;
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
 {
-	if (dma_ops->mapping_error)
-		return dma_ops->mapping_error(dma_addr);
+#ifdef CONFIG_X86_32
+	return dma_ops;
+#else
+	if (unlikely(!dev) || !dev->archdata.dma_ops)
+		return dma_ops;
+	else
+		return dev->archdata.dma_ops;
+#endif
+}
+
+/* Make sure we keep the same behaviour */
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+#ifdef CONFIG_X86_32
+	return 0;
+#else
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	if (ops->mapping_error)
+		return ops->mapping_error(dev, dma_addr);
 
 	return (dma_addr == bad_dma_address);
+#endif
 }
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -83,44 +102,53 @@ static inline dma_addr_t
 dma_map_single(struct device *hwdev, void *ptr, size_t size,
 	       int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	return dma_ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
+	return ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
 }
 
 static inline void
 dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
 		 int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->unmap_single)
-		dma_ops->unmap_single(dev, addr, size, direction);
+	if (ops->unmap_single)
+		ops->unmap_single(dev, addr, size, direction);
 }
 
 static inline int
 dma_map_sg(struct device *hwdev, struct scatterlist *sg,
 	   int nents, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	return dma_ops->map_sg(hwdev, sg, nents, direction);
+	return ops->map_sg(hwdev, sg, nents, direction);
 }
 
 static inline void
 dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
 	     int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->unmap_sg)
-		dma_ops->unmap_sg(hwdev, sg, nents, direction);
+	if (ops->unmap_sg)
+		ops->unmap_sg(hwdev, sg, nents, direction);
 }
 
 static inline void
 dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
 			size_t size, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_for_cpu)
-		dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
-					     direction);
+	if (ops->sync_single_for_cpu)
+		ops->sync_single_for_cpu(hwdev, dma_handle, size, direction);
 	flush_write_buffers();
 }
 
@@ -128,10 +156,11 @@ static inline void
 dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
 			   size_t size, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_for_device)
-		dma_ops->sync_single_for_device(hwdev, dma_handle, size,
-						direction);
+	if (ops->sync_single_for_device)
+		ops->sync_single_for_device(hwdev, dma_handle, size, direction);
 	flush_write_buffers();
 }
 
@@ -139,11 +168,12 @@ static inline void
 dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
 			      unsigned long offset, size_t size, int direction)
 {
-	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_range_for_cpu)
-		dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
-						   size, direction);
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
 
+	BUG_ON(!valid_dma_direction(direction));
+	if (ops->sync_single_range_for_cpu)
+		ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
+					       size, direction);
 	flush_write_buffers();
 }
 
@@ -152,11 +182,12 @@ dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
 				 unsigned long offset, size_t size,
 				 int direction)
 {
-	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_range_for_device)
-		dma_ops->sync_single_range_for_device(hwdev, dma_handle,
-						      offset, size, direction);
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
 
+	BUG_ON(!valid_dma_direction(direction));
+	if (ops->sync_single_range_for_device)
+		ops->sync_single_range_for_device(hwdev, dma_handle,
+						  offset, size, direction);
 	flush_write_buffers();
 }
 
@@ -164,9 +195,11 @@ static inline void
 dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
 		    int nelems, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_sg_for_cpu)
-		dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+	if (ops->sync_sg_for_cpu)
+		ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
 	flush_write_buffers();
 }
 
@@ -174,9 +207,11 @@ static inline void
 dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 		       int nelems, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_sg_for_device)
-		dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+	if (ops->sync_sg_for_device)
+		ops->sync_sg_for_device(hwdev, sg, nelems, direction);
 
 	flush_write_buffers();
 }
@@ -185,9 +220,11 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 				      size_t offset, size_t size,
 				      int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	return dma_ops->map_single(dev, page_to_phys(page)+offset,
-				   size, direction);
+	return ops->map_single(dev, page_to_phys(page) + offset,
+			       size, direction);
 }
 
 static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
diff --git a/include/asm-x86/swiotlb.h b/include/asm-x86/swiotlb.h
index c706a744263..2730b351afc 100644
--- a/include/asm-x86/swiotlb.h
+++ b/include/asm-x86/swiotlb.h
@@ -35,7 +35,7 @@ extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
 			  int nents, int direction);
 extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
 			     int nents, int direction);
-extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
+extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
 extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
 				  void *vaddr, dma_addr_t dma_handle);
 extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
diff --git a/include/asm-xtensa/dma-mapping.h b/include/asm-xtensa/dma-mapping.h
index 3c7d537dd15..51882ae3db4 100644
--- a/include/asm-xtensa/dma-mapping.h
+++ b/include/asm-xtensa/dma-mapping.h
@@ -139,7 +139,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
 		consistent_sync(sg_virt(sg), sg->length, dir);
 }
 static inline int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 7d51cbca49a..75ae6d8aba4 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -758,7 +758,7 @@ static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
 	}
 
 	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
-	if (!dma_mapping_error(dma_addr)) {
+	if (!dma_mapping_error(&c->pdev->dev, dma_addr)) {
 #ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
 		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
 			*mptr++ = cpu_to_le32(0x7C020002);
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 4bf8cade9db..e530026eedf 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -427,9 +427,9 @@ static inline int ssb_dma_mapping_error(struct ssb_device *dev, dma_addr_t addr)
 {
 	switch (dev->bus->bustype) {
 	case SSB_BUSTYPE_PCI:
-		return pci_dma_mapping_error(addr);
+		return pci_dma_mapping_error(dev->bus->host_pci, addr);
 	case SSB_BUSTYPE_SSB:
-		return dma_mapping_error(addr);
+		return dma_mapping_error(dev->dev, addr);
 	default:
 		__ssb_dma_not_implemented(dev);
 	}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 90b529f7a15..936e333e7ce 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1590,7 +1590,7 @@ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
 {
 	if (dev->dma_ops)
 		return dev->dma_ops->mapping_error(dev, dma_addr);
-	return dma_mapping_error(dma_addr);
+	return dma_mapping_error(dev->dma_device, dma_addr);
 }
 
 /**
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index d568894df8c..977edbdbc1d 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -492,7 +492,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		 */
 		dma_addr_t handle;
 		handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE);
-		if (swiotlb_dma_mapping_error(handle))
+		if (swiotlb_dma_mapping_error(hwdev, handle))
 			return NULL;
 
 		ret = bus_to_virt(handle);
@@ -824,7 +824,7 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 }
 
 int
-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
 	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a19b22b452a..84d328329d9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -169,7 +169,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 					  (void *)
 					  vec->sge[xdr_sge_no].iov_base + sge_off,
 					  sge_bytes, DMA_TO_DEVICE);
-		if (dma_mapping_error(sge[sge_no].addr))
+		if (dma_mapping_error(xprt->sc_cm_id->device->dma_device,
+					sge[sge_no].addr))
 			goto err;
 		sge_off = 0;
 		sge_no++;
-- 
cgit v1.2.3-70-g09d2


From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 25 Jul 2008 19:45:07 -0700
Subject: kexec jump

This patch provides an enhancement to kexec/kdump.  It implements the
following features:

- Backup/restore memory used by the original kernel before/after
  kexec.

- Save/restore CPU state before/after kexec.

The features of this patch can be used as a general method to call program in
physical mode (paging turning off).  This can be used to call BIOS code under
Linux.

kexec-tools needs to be patched to support kexec jump. The patches and
the precompiled kexec can be download from the following URL:

       source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2
       patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2
       binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10

Usage example of calling some physical mode code and return:

1. Compile and install patched kernel with following options selected:

CONFIG_X86_32=y
CONFIG_KEXEC=y
CONFIG_PM=y
CONFIG_KEXEC_JUMP=y

2. Build patched kexec-tool or download the pre-built one.

3. Build some physical mode executable named such as "phy_mode"

4. Boot kernel compiled in step 1.

5. Load physical mode executable with /sbin/kexec. The shell command
   line can be as follow:

   /sbin/kexec --load-preserve-context --args-none phy_mode

6. Call physical mode executable with following shell command line:

   /sbin/kexec -e

Implementation point:

To support jumping without reserving memory.  One shadow backup page (source
page) is allocated for each page used by kexeced code image (destination
page).  When do kexec_load, the image of kexeced code is loaded into source
pages, and before executing, the destination pages and the source pages are
swapped, so the contents of destination pages are backupped.  Before jumping
to the kexeced code image and after jumping back to the original kernel, the
destination pages and the source pages are swapped too.

C ABI (calling convention) is used as communication protocol between
kernel and called code.

A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to
indicate that the loaded kernel image is used for jumping back.

Now, only the i386 architecture is supported.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/machine_kexec.c  |   2 +-
 arch/sh/kernel/machine_kexec.c       |   2 +-
 arch/x86/Kconfig                     |   7 ++
 arch/x86/kernel/machine_kexec_32.c   |  27 ++++--
 arch/x86/kernel/machine_kexec_64.c   |   2 +-
 arch/x86/kernel/relocate_kernel_32.S | 174 ++++++++++++++++++++++++++++++-----
 include/asm-x86/kexec.h              |  18 ++--
 include/linux/kexec.h                |  17 +++-
 kernel/kexec.c                       |  57 ++++++++++++
 kernel/sys.c                         |  31 ++-----
 10 files changed, 269 insertions(+), 68 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 29a0e039d43..aab76887a84 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -48,7 +48,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	if (ppc_md.machine_kexec)
 		ppc_md.machine_kexec(image);
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 5c17de51987..ec1eadce4aa 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -70,7 +70,7 @@ static void kexec_info(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 
 	unsigned long page_list;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e3cba0b4560..7ecb679f013 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1279,6 +1279,13 @@ config CRASH_DUMP
 	  (CONFIG_RELOCATABLE=y).
 	  For more details see Documentation/kdump/kdump.txt
 
+config KEXEC_JUMP
+	bool "kexec jump (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on KEXEC && PM_SLEEP && X86_32
+	help
+	  Invoke code in physical address mode via KEXEC
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
 	default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55a..2b67609d0a1 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -22,6 +22,7 @@
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
 #include <asm/system.h>
+#include <asm/cacheflush.h>
 
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
 static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -85,10 +86,12 @@ static void load_segments(void)
  * reboot code buffer to allow us to avoid allocations
  * later.
  *
- * Currently nothing.
+ * Make control page executable.
  */
 int machine_kexec_prepare(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_x(image->control_code_page, 1);
 	return 0;
 }
 
@@ -98,16 +101,24 @@ int machine_kexec_prepare(struct kimage *image)
  */
 void machine_kexec_cleanup(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_nx(image->control_code_page, 1);
 }
 
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	asmlinkage unsigned long
+		(*relocate_kernel_ptr)(unsigned long indirection_page,
+				       unsigned long control_page,
+				       unsigned long start_address,
+				       unsigned int has_pae,
+				       unsigned int preserve_context);
 
 	tracer_disable();
 
@@ -115,10 +126,11 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	local_irq_disable();
 
 	control_page = page_address(image->control_code_page);
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
 
+	relocate_kernel_ptr = control_page;
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
-	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_PGD] = __pa(kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
 #ifdef CONFIG_X86_PAE
@@ -131,6 +143,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
 	page_list[PA_PTE_1] = __pa(kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+	page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
 
 	/* The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -149,8 +162,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	set_idt(phys_to_virt(0),0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start, cpu_has_pae);
+	image->start = relocate_kernel_ptr((unsigned long)image->head,
+					   (unsigned long)page_list,
+					   image->start, cpu_has_pae,
+					   image->preserve_context);
 }
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a..c43caa3a91f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470..703310a9902 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define PAE_PGD_ATTR (_PAGE_PRESENT)
 
+/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
+ * used to save some data for jumping back
+ */
+#define DATA(offset)		(PAGE_SIZE/2+(offset))
+
+/* Minimal CPU state */
+#define ESP			DATA(0x0)
+#define CR0			DATA(0x4)
+#define CR3			DATA(0x8)
+#define CR4			DATA(0xc)
+
+/* other data */
+#define CP_VA_CONTROL_PAGE	DATA(0x10)
+#define CP_PA_PGD		DATA(0x14)
+#define CP_PA_SWAP_PAGE		DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x1c)
+
 	.text
 	.align PAGE_SIZE
 	.globl relocate_kernel
 relocate_kernel:
-	movl	8(%esp), %ebp /* list of pages */
+	/* Save the CPU context, used for jumping back */
+
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushf
+
+	movl	20+8(%esp), %ebp /* list of pages */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%esp, ESP(%edi)
+	movl	%cr0, %eax
+	movl	%eax, CR0(%edi)
+	movl	%cr3, %eax
+	movl	%eax, CR3(%edi)
+	movl	%cr4, %eax
+	movl	%eax, CR4(%edi)
 
 #ifdef CONFIG_X86_PAE
 	/* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
 
 relocate_new_kernel:
 	/* read the arguments and say goodbye to the stack */
-	movl  4(%esp), %ebx /* page_list */
-	movl  8(%esp), %ebp /* list of pages */
-	movl  12(%esp), %edx /* start address */
-	movl  16(%esp), %ecx /* cpu_has_pae */
+	movl  20+4(%esp), %ebx /* page_list */
+	movl  20+8(%esp), %ebp /* list of pages */
+	movl  20+12(%esp), %edx /* start address */
+	movl  20+16(%esp), %ecx /* cpu_has_pae */
+	movl  20+20(%esp), %esi /* preserve_context */
 
 	/* zero out flags, and disable interrupts */
 	pushl $0
 	popfl
 
+	/* save some information for jumping back */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%edi, CP_VA_CONTROL_PAGE(%edi)
+	movl	PTR(PA_PGD)(%ebp), %eax
+	movl	%eax, CP_PA_PGD(%edi)
+	movl	PTR(PA_SWAP_PAGE)(%ebp), %eax
+	movl	%eax, CP_PA_SWAP_PAGE(%edi)
+	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
+
 	/* get physical address of control page now */
 	/* this is impossible after page table switch */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
+	movl	CP_PA_SWAP_PAGE(%edi), %eax
+	pushl	%eax
+	pushl	%ebx
+	call	swap_pages
+	addl	$8, %esp
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB, it's handy, and not processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/* set all of the registers to known values */
+	/* leave %esp alone */
+
+	testl	%esi, %esi
+	jnz 1f
+	xorl	%edi, %edi
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+	xorl    %ecx, %ecx
+	xorl    %edx, %edx
+	xorl    %esi, %esi
+	xorl    %ebp, %ebp
+	ret
+1:
+	popl	%edx
+	movl	CP_PA_SWAP_PAGE(%edi), %esp
+	addl	$PAGE_SIZE, %esp
+2:
+	call	*%edx
+
+	/* get the re-entry point of the peer system */
+	movl	0(%esp), %ebp
+	call	1f
+1:
+	popl	%ebx
+	subl	$(1b - relocate_kernel), %ebx
+	movl	CP_VA_CONTROL_PAGE(%ebx), %edi
+	lea	PAGE_SIZE(%ebx), %esp
+	movl	CP_PA_SWAP_PAGE(%ebx), %eax
+	movl	CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+	pushl	%eax
+	pushl	%edx
+	call	swap_pages
+	addl	$8, %esp
+	movl	CP_PA_PGD(%ebx), %eax
+	movl	%eax, %cr3
+	movl	%cr0, %eax
+	orl	$(1<<31), %eax
+	movl	%eax, %cr0
+	lea	PAGE_SIZE(%edi), %esp
+	movl	%edi, %eax
+	addl	$(virtual_mapped - relocate_kernel), %eax
+	pushl	%eax
+	ret
+
+virtual_mapped:
+	movl	CR4(%edi), %eax
+	movl	%eax, %cr4
+	movl	CR3(%edi), %eax
+	movl	%eax, %cr3
+	movl	CR0(%edi), %eax
+	movl	%eax, %cr0
+	movl	ESP(%edi), %esp
+	movl	%ebp, %eax
+
+	popf
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	ret
+
 	/* Do the copies */
-	movl	%ebx, %ecx
+swap_pages:
+	movl	8(%esp), %edx
+	movl	4(%esp), %ecx
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%edi
+	pushl	%esi
+	movl	%ecx, %ebx
 	jmp	1f
 
 0:	/* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
 	movl    %ecx,   %esi /* For every source page do a copy */
 	andl    $0xfffff000, %esi
 
+	movl	%edi, %eax
+	movl	%esi, %ebp
+
+	movl	%edx, %edi
 	movl    $1024, %ecx
 	rep ; movsl
-	jmp     0b
 
-3:
-
-	/* To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB, it's handy, and not processor dependent.
-	 */
-	xorl	%eax, %eax
-	movl	%eax, %cr3
+	movl	%ebp, %edi
+	movl	%eax, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	movl	%eax, %edi
+	movl	%edx, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	xorl	%eax, %eax
-	xorl	%ebx, %ebx
-	xorl    %ecx, %ecx
-	xorl    %edx, %edx
-	xorl    %esi, %esi
-	xorl    %edi, %edi
-	xorl    %ebp, %ebp
+	lea	PAGE_SIZE(%ebp), %esi
+	jmp     0b
+3:
+	popl	%esi
+	popl	%edi
+	popl	%ebx
+	popl	%ebp
 	ret
diff --git a/include/asm-x86/kexec.h b/include/asm-x86/kexec.h
index 8f855a15f64..c0e52a14fd4 100644
--- a/include/asm-x86/kexec.h
+++ b/include/asm-x86/kexec.h
@@ -10,14 +10,15 @@
 # define VA_PTE_0		5
 # define PA_PTE_1		6
 # define VA_PTE_1		7
+# define PA_SWAP_PAGE		8
 # ifdef CONFIG_X86_PAE
-#  define PA_PMD_0		8
-#  define VA_PMD_0		9
-#  define PA_PMD_1		10
-#  define VA_PMD_1		11
-#  define PAGES_NR		12
+#  define PA_PMD_0		9
+#  define VA_PMD_0		10
+#  define PA_PMD_1		11
+#  define VA_PMD_1		12
+#  define PAGES_NR		13
 # else
-#  define PAGES_NR		8
+#  define PAGES_NR		9
 # endif
 #else
 # define PA_CONTROL_PAGE	0
@@ -152,11 +153,12 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
 }
 
 #ifdef CONFIG_X86_32
-asmlinkage NORET_TYPE void
+asmlinkage unsigned long
 relocate_kernel(unsigned long indirection_page,
 		unsigned long control_page,
 		unsigned long start_address,
-		unsigned int has_pae) ATTRIB_NORET;
+		unsigned int has_pae,
+		unsigned int preserve_context);
 #else
 NORET_TYPE void
 relocate_kernel(unsigned long indirection_page,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 3265968cd2c..82f88a8a827 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -83,6 +83,7 @@ struct kimage {
 
 	unsigned long start;
 	struct page *control_code_page;
+	struct page *swap_page;
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
@@ -98,18 +99,20 @@ struct kimage {
 	unsigned int type : 1;
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
+	unsigned int preserve_context : 1;
 };
 
 
 /* kexec interface functions */
-extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+extern void machine_kexec(struct kimage *image);
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
 extern asmlinkage long sys_kexec_load(unsigned long entry,
 					unsigned long nr_segments,
 					struct kexec_segment __user *segments,
 					unsigned long flags);
+extern int kernel_kexec(void);
 #ifdef CONFIG_COMPAT
 extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
 				unsigned long nr_segments,
@@ -156,8 +159,9 @@ extern struct kimage *kexec_crash_image;
 #define kexec_flush_icache_page(page)
 #endif
 
-#define KEXEC_ON_CRASH  0x00000001
-#define KEXEC_ARCH_MASK 0xffff0000
+#define KEXEC_ON_CRASH		0x00000001
+#define KEXEC_PRESERVE_CONTEXT	0x00000002
+#define KEXEC_ARCH_MASK		0xffff0000
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.
@@ -174,7 +178,12 @@ extern struct kimage *kexec_crash_image;
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS    ( 8 << 16)
 
-#define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
+/* List of defined/legal kexec flags */
+#ifndef CONFIG_KEXEC_JUMP
+#define KEXEC_FLAGS    KEXEC_ON_CRASH
+#else
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
+#endif
 
 #define VMCOREINFO_BYTES           (4096)
 #define VMCOREINFO_NOTE_NAME       "VMCOREINFO"
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6db42ff8d52..a0d920915b3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -24,6 +24,8 @@
 #include <linux/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -242,6 +244,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 		goto out;
 	}
 
+	image->swap_page = kimage_alloc_control_pages(image, 0);
+	if (!image->swap_page) {
+		printk(KERN_ERR "Could not allocate swap buffer\n");
+		goto out;
+	}
+
 	result = 0;
  out:
 	if (result == 0)
@@ -986,6 +994,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 		if (result)
 			goto out;
 
+		if (flags & KEXEC_PRESERVE_CONTEXT)
+			image->preserve_context = 1;
 		result = machine_kexec_prepare(image);
 		if (result)
 			goto out;
@@ -1411,3 +1421,50 @@ static int __init crash_save_vmcoreinfo_init(void)
 }
 
 module_init(crash_save_vmcoreinfo_init)
+
+/**
+ *	kernel_kexec - reboot the system
+ *
+ *	Move into place and start executing a preloaded standalone
+ *	executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+	int error = 0;
+
+	if (xchg(&kexec_lock, 1))
+		return -EBUSY;
+	if (!kexec_image) {
+		error = -EINVAL;
+		goto Unlock;
+	}
+
+	if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+		local_irq_disable();
+		save_processor_state();
+#endif
+	} else {
+		blocking_notifier_call_chain(&reboot_notifier_list,
+					     SYS_RESTART, NULL);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		sysdev_shutdown();
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_shutdown();
+	}
+
+	machine_kexec(kexec_image);
+
+	if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+		restore_processor_state();
+		local_irq_enable();
+#endif
+	}
+
+ Unlock:
+	xchg(&kexec_lock, 0);
+
+	return error;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 0c9d3fa1f5f..c01858090a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
 
-/**
- *	kernel_kexec - reboot the system
- *
- *	Move into place and start executing a preloaded standalone
- *	executable.  If nothing was preloaded return an error.
- */
-static void kernel_kexec(void)
-{
-#ifdef CONFIG_KEXEC
-	struct kimage *image;
-	image = xchg(&kexec_image, NULL);
-	if (!image)
-		return;
-	kernel_restart_prepare(NULL);
-	printk(KERN_EMERG "Starting new kernel\n");
-	machine_shutdown();
-	machine_kexec(image);
-#endif
-}
-
 static void kernel_shutdown_prepare(enum system_states state)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		kernel_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
 	case LINUX_REBOOT_CMD_KEXEC:
-		kernel_kexec();
-		unlock_kernel();
-		return -EINVAL;
+		{
+			int ret;
+			ret = kernel_kexec();
+			unlock_kernel();
+			return ret;
+		}
+#endif
 
 #ifdef CONFIG_HIBERNATION
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
-- 
cgit v1.2.3-70-g09d2


From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 25 Jul 2008 19:45:34 -0700
Subject: SL*B: drop kmem cache argument from constructor

Kmem cache passed to constructor is only needed for constructors that are
themselves multiplexeres.  Nobody uses this "feature", nor does anybody uses
passed kmem cache in non-trivial way, so pass only pointer to object.

Non-trivial places are:
	arch/powerpc/mm/init_64.c
	arch/powerpc/mm/hugetlbpage.c

This is flag day, yes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jon Tollefson <kniht@linux.vnet.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Matt Mackall <mpm@selenic.com>
[akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c]
[akpm@linux-foundation.org: fix mm/slab.c]
[akpm@linux-foundation.org: fix ubifs]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/plat-s3c24xx/dma.c               |  2 +-
 arch/powerpc/kernel/rtas_flash.c          |  2 +-
 arch/powerpc/mm/hugetlbpage.c             |  9 ++-------
 arch/powerpc/mm/init_64.c                 | 24 +++++++++---------------
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 arch/sh/mm/pmb.c                          |  2 +-
 arch/xtensa/mm/init.c                     |  2 +-
 drivers/usb/mon/mon_text.c                |  4 ++--
 fs/adfs/super.c                           |  2 +-
 fs/affs/super.c                           |  2 +-
 fs/afs/super.c                            |  4 ++--
 fs/befs/linuxvfs.c                        |  2 +-
 fs/bfs/inode.c                            |  2 +-
 fs/block_dev.c                            |  2 +-
 fs/buffer.c                               |  2 +-
 fs/cifs/cifsfs.c                          |  2 +-
 fs/coda/inode.c                           |  2 +-
 fs/ecryptfs/main.c                        |  4 ++--
 fs/efs/super.c                            |  2 +-
 fs/ext2/super.c                           |  2 +-
 fs/ext3/super.c                           |  2 +-
 fs/ext4/super.c                           |  2 +-
 fs/fat/cache.c                            |  2 +-
 fs/fat/inode.c                            |  2 +-
 fs/fuse/inode.c                           |  2 +-
 fs/gfs2/main.c                            |  4 ++--
 fs/hfs/super.c                            |  2 +-
 fs/hfsplus/super.c                        |  2 +-
 fs/hpfs/super.c                           |  2 +-
 fs/hugetlbfs/inode.c                      |  2 +-
 fs/inode.c                                |  2 +-
 fs/isofs/inode.c                          |  2 +-
 fs/jffs2/super.c                          |  2 +-
 fs/jfs/jfs_metapage.c                     |  2 +-
 fs/jfs/super.c                            |  2 +-
 fs/locks.c                                |  2 +-
 fs/minix/inode.c                          |  2 +-
 fs/ncpfs/inode.c                          |  2 +-
 fs/nfs/inode.c                            |  2 +-
 fs/ntfs/super.c                           |  2 +-
 fs/ocfs2/dlm/dlmfs.c                      |  3 +--
 fs/ocfs2/super.c                          |  2 +-
 fs/openpromfs/inode.c                     |  2 +-
 fs/proc/inode.c                           |  2 +-
 fs/qnx4/inode.c                           |  2 +-
 fs/reiserfs/super.c                       |  2 +-
 fs/romfs/inode.c                          |  2 +-
 fs/smbfs/inode.c                          |  2 +-
 fs/sysv/inode.c                           |  2 +-
 fs/ubifs/super.c                          |  2 +-
 fs/udf/super.c                            |  2 +-
 fs/ufs/super.c                            |  2 +-
 fs/xfs/linux-2.6/kmem.h                   |  2 +-
 fs/xfs/linux-2.6/xfs_super.c              |  1 -
 include/linux/slab.h                      |  2 +-
 include/linux/slub_def.h                  |  2 +-
 ipc/mqueue.c                              |  2 +-
 kernel/fork.c                             |  2 +-
 lib/idr.c                                 |  2 +-
 lib/radix-tree.c                          |  2 +-
 mm/rmap.c                                 |  2 +-
 mm/shmem.c                                |  2 +-
 mm/slab.c                                 | 11 +++++------
 mm/slob.c                                 |  7 +++----
 mm/slub.c                                 | 13 ++++++-------
 net/socket.c                              |  2 +-
 net/sunrpc/rpc_pipe.c                     |  2 +-
 67 files changed, 90 insertions(+), 106 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c
index 60f162dc4fa..8c5e656d5d8 100644
--- a/arch/arm/plat-s3c24xx/dma.c
+++ b/arch/arm/plat-s3c24xx/dma.c
@@ -1304,7 +1304,7 @@ struct sysdev_class dma_sysclass = {
 
 /* kmem cache implementation */
 
-static void s3c2410_dma_cache_ctor(struct kmem_cache *c, void *p)
+static void s3c2410_dma_cache_ctor(void *p)
 {
 	memset(p, 0, sizeof(struct s3c2410_dma_buf));
 }
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 09ded5c424a..149cb112cd1 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -286,7 +286,7 @@ static ssize_t rtas_flash_read(struct file *file, char __user *buf,
 }
 
 /* constructor for flash_block_cache */
-void rtas_block_ctor(struct kmem_cache *cache, void *ptr)
+void rtas_block_ctor(void *ptr)
 {
 	memset(ptr, 0, RTAS_BLK_SIZE);
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fb42c4dd321..ed0aab0208a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -113,7 +113,7 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize),
+	pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
 				      GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
@@ -730,11 +730,6 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
-{
-	memset(addr, 0, kmem_cache_size(cache));
-}
-
 static int __init hugetlbpage_init(void)
 {
 	unsigned int psize;
@@ -756,7 +751,7 @@ static int __init hugetlbpage_init(void)
 						HUGEPTE_TABLE_SIZE(psize),
 						HUGEPTE_TABLE_SIZE(psize),
 						0,
-						zero_ctor);
+						NULL);
 			if (!huge_pgtable_cache(psize))
 				panic("hugetlbpage_init(): could not create %s"\
 				      "\n", HUGEPTE_CACHE_NAME(psize));
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a41bc5aa204..4f7df85129d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -136,9 +136,14 @@ static int __init setup_kcore(void)
 module_init(setup_kcore);
 #endif
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+static void pgd_ctor(void *addr)
 {
-	memset(addr, 0, kmem_cache_size(cache));
+	memset(addr, 0, PGD_TABLE_SIZE);
+}
+
+static void pmd_ctor(void *addr)
+{
+	memset(addr, 0, PMD_TABLE_SIZE);
 }
 
 static const unsigned int pgtable_cache_size[2] = {
@@ -163,19 +168,8 @@ struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 
 void pgtable_cache_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
-		int size = pgtable_cache_size[i];
-		const char *name = pgtable_cache_name[i];
-
-		pr_debug("Allocating page table cache %s (#%d) "
-			"for size: %08x...\n", name, i, size);
-		pgtable_cache[i] = kmem_cache_create(name,
-						     size, size,
-						     SLAB_PANIC,
-						     zero_ctor);
-	}
+	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
+	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7123472801d..690ca7b0dcf 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -78,7 +78,7 @@ spufs_destroy_inode(struct inode *inode)
 }
 
 static void
-spufs_init_once(struct kmem_cache *cachep, void *p)
+spufs_init_once(void *p)
 {
 	struct spufs_inode_info *ei = p;
 
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index 0b0ec6e0475..46911bcbf17 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -293,7 +293,7 @@ void pmb_unmap(unsigned long addr)
 	} while (pmbe);
 }
 
-static void pmb_cache_ctor(struct kmem_cache *cachep, void *pmb)
+static void pmb_cache_ctor(void *pmb)
 {
 	struct pmb_entry *pmbe = pmb;
 
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 81d0560eaea..ee261005b36 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -309,7 +309,7 @@ void show_mem(void)
 
 struct kmem_cache *pgtable_cache __read_mostly;
 
-static void pgd_ctor(struct kmem_cache *cache, void* addr)
+static void pgd_ctor(void* addr)
 {
 	pte_t* ptep = (pte_t*)addr;
 	int i;
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index 5e3e4e9b6c7..1f715436d6d 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -87,7 +87,7 @@ struct mon_reader_text {
 
 static struct dentry *mon_dir;		/* Usually /sys/kernel/debug/usbmon */
 
-static void mon_text_ctor(struct kmem_cache *, void *);
+static void mon_text_ctor(void *);
 
 struct mon_text_ptr {
 	int cnt, limit;
@@ -720,7 +720,7 @@ void mon_text_del(struct mon_bus *mbus)
 /*
  * Slab interface: constructor.
  */
-static void mon_text_ctor(struct kmem_cache *slab, void *mem)
+static void mon_text_ctor(void *mem)
 {
 	/*
 	 * Nothing to initialize. No, really!
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9e421eeb672..26f3b43726b 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4e030956640..3a89094f93d 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode)
 	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7e3faeef681..250d8c4d66e 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -27,7 +27,7 @@
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
-static void afs_i_init_once(struct kmem_cache *cachep, void *foo);
+static void afs_i_init_once(void *foo);
 static int afs_get_sb(struct file_system_type *fs_type,
 		      int flags, const char *dev_name,
 		      void *data, struct vfsmount *mnt);
@@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb)
 /*
  * initialise an inode cache slab element prior to any use
  */
-static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode)
+static void afs_i_init_once(void *_vnode)
 {
 	struct afs_vnode *vnode = _vnode;
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e8717de3bab..02c6e62b72f 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -289,7 +289,7 @@ befs_destroy_inode(struct inode *inode)
         kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
         struct befs_inode_info *bi = (struct befs_inode_info *) foo;
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 053e690ec9e..0ed57b5ee01 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -264,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct bfs_inode_info *bi = foo;
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 10d8a0aa871..dcf37cada36 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode)
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
 	struct block_device *bdev = &ei->bdev;
diff --git a/fs/buffer.c b/fs/buffer.c
index 109b261192d..5fd497cdd6f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3272,7 +3272,7 @@ int bh_submit_read(struct buffer_head *bh)
 EXPORT_SYMBOL(bh_submit_read);
 
 static void
-init_buffer_head(struct kmem_cache *cachep, void *data)
+init_buffer_head(void *data)
 {
 	struct buffer_head *bh = data;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 22857c639df..fe5f6809cba 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -766,7 +766,7 @@ const struct file_operations cifs_dir_ops = {
 };
 
 static void
-cifs_init_once(struct kmem_cache *cachep, void *inode)
+cifs_init_once(void *inode)
 {
 	struct cifsInodeInfo *cifsi = inode;
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2f58dfc7008..830f51abb97 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode)
 	kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 6f403cfba14..448dfd597b5 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -578,7 +578,7 @@ static struct file_system_type ecryptfs_fs_type = {
  * Initializes the ecryptfs_inode_info_cache when it is created
  */
 static void
-inode_info_init_once(struct kmem_cache *cachep, void *vptr)
+inode_info_init_once(void *vptr)
 {
 	struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
 
@@ -589,7 +589,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
-	void (*ctor)(struct kmem_cache *cache, void *obj);
+	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
 		.cache = &ecryptfs_auth_tok_list_item_cache,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index d733531b55e..567b134fa1f 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode)
 	kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 31308a3b0b8..fd88c7b43e6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -159,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 615788c6843..8ddced38467 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -472,7 +472,7 @@ static void ext3_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1cb371dcd60..b5479b1dff1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -595,7 +595,7 @@ static void ext4_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3a9ecac8d61..3222f51c41c 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode)
 
 static struct kmem_cache *fat_cache_cachep;
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct fat_cache *cache = (struct fat_cache *)foo;
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 23676f9d79c..6d266d793e2 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -498,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode)
 	kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d2f7d6e22e..d2249f174e2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -956,7 +956,7 @@ static inline void unregister_fuseblk(void)
 }
 #endif
 
-static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void fuse_inode_init_once(void *foo)
 {
 	struct inode * inode = foo;
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bcc668d0fad..bb2cc303ac2 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,7 +24,7 @@
 #include "util.h"
 #include "glock.h"
 
-static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_inode_once(void *foo)
 {
 	struct gfs2_inode *ip = foo;
 
@@ -33,7 +33,7 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
 	ip->i_alloc = NULL;
 }
 
-static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_glock_once(void *foo)
 {
 	struct gfs2_glock *gl = foo;
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ac2ec5ef66e..4abb1047c68 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfs_init_once(struct kmem_cache *cachep, void *p)
+static void hfs_init_once(void *p)
 {
 	struct hfs_inode_info *i = p;
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3859118531c..e834e578c93 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfsplus_init_once(struct kmem_cache *cachep, void *p)
+static void hfsplus_init_once(void *p)
 {
 	struct hfsplus_inode_info *i = p;
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f63a699ec65..b8ae9c90ada 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dbd01d262ca..3f58923fb39 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -705,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 };
 
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
 
diff --git a/fs/inode.c b/fs/inode.c
index 35b6414522e..b6726f64453 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -224,7 +224,7 @@ void inode_init_once(struct inode *inode)
 
 EXPORT_SYMBOL(inode_init_once);
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct inode * inode = (struct inode *) foo;
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 044a254d526..26948a6033b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode)
 	kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct iso_inode_info *ei = foo;
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7da69eae49e..efd401257ed 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode)
 	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
 }
 
-static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo)
+static void jffs2_i_init_once(void *foo)
 {
 	struct jffs2_inode_info *f = foo;
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 854ff0ec574..c350057087d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -182,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 
 #endif
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct metapage *mp = (struct metapage *)foo;
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 359c091d896..3630718be39 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -760,7 +760,7 @@ static struct file_system_type jfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
 
diff --git a/fs/locks.c b/fs/locks.c
index 01490300f7c..5eb259e3cd3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock);
  * Initialises the fields of the file lock which are invariant for
  * free file_locks.
  */
-static void init_once(struct kmem_cache *cache, void *foo)
+static void init_once(void *foo)
 {
 	struct file_lock *lock = (struct file_lock *) foo;
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 523d7371341..d1d1eb84679 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode)
 	kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2e5ab1204de..d642f0e5b36 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode)
 	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index df23f987da6..52daefa2f52 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1242,7 +1242,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 #endif
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 3e76f3b216b..4a46743b507 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache;
 struct kmem_cache *ntfs_big_inode_cache;
 
 /* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void ntfs_big_inode_init_once(void *foo)
 {
 	ntfs_inode *ni = (ntfs_inode *)foo;
 
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index e48aba698b7..533a789c3ef 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 	return writelen;
 }
 
-static void dlmfs_init_once(struct kmem_cache *cachep,
-			    void *foo)
+static void dlmfs_init_once(void *foo)
 {
 	struct dlmfs_inode_private *ip =
 		(struct dlmfs_inode_private *) foo;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ccecfe5094f..2560b33889a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1118,7 +1118,7 @@ bail:
 	return status;
 }
 
-static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
+static void ocfs2_inode_init_once(void *data)
 {
 	struct ocfs2_inode_info *oi = data;
 
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d17b4fd204e..9f5b054f06b 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -430,7 +430,7 @@ static struct file_system_type openprom_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static void op_inode_init_once(struct kmem_cache * cachep, void *data)
+static void op_inode_init_once(void *data)
 {
 	struct op_inode_info *oi = (struct op_inode_info *) data;
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 02eca2ed9dd..b37f25dc45a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -94,7 +94,7 @@ static void proc_destroy_inode(struct inode *inode)
 	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index b31ab78052b..2aad1044b84 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode)
 	kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2ec748ba0bd..879e54d35c2 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -521,7 +521,7 @@ static void reiserfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
 
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 3f13d491c7c..8e51a2aaa97 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -577,7 +577,7 @@ static void romfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct romfs_inode_info *ei = foo;
 
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 376ef3ee6ed..3528f40ffb0 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode)
 	kmem_cache_free(smb_inode_cachep, SMB_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
 
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c5d60de0658..df0d435baa4 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode)
 	kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *p)
+static void init_once(void *p)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 00eb9c68ad0..ca1e2d4e03c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1841,7 +1841,7 @@ static struct file_system_type ubifs_fs_type = {
 /*
  * Inode slab cache constructor.
  */
-static void inode_slab_ctor(struct kmem_cache *cachep, void *obj)
+static void inode_slab_ctor(void *obj)
 {
 	struct ubifs_inode *ui = obj;
 	inode_init_once(&ui->vfs_inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 44cc702f96c..5698bbf83bb 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode)
 	kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *)foo;
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 227c9d70004..3e30e40aa24 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1302,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode)
 	kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
 
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 5e956490297..a20683cf74d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name)
 
 static inline kmem_zone_t *
 kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
-		     void (*construct)(kmem_zone_t *, void *))
+		     void (*construct)(void *))
 {
 	return kmem_cache_create(zone_name, size, 0, flags, construct);
 }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 742b2c7852c..943381284e2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -843,7 +843,6 @@ xfs_fs_destroy_inode(
 
 STATIC void
 xfs_fs_inode_init_once(
-	kmem_zone_t		*zonep,
 	void			*vnode)
 {
 	inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 41103910f8a..9ff8e849940 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -58,7 +58,7 @@ int slab_is_available(void);
 
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
-			void (*)(struct kmem_cache *, void *));
+			void (*)(void *));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
 void kmem_cache_free(struct kmem_cache *, void *);
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d117ea2825a..5bad61a93f6 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -85,7 +85,7 @@ struct kmem_cache {
 	struct kmem_cache_order_objects min;
 	gfp_t allocflags;	/* gfp flags to use on each alloc */
 	int refcount;		/* Refcount for slab cache destroy */
-	void (*ctor)(struct kmem_cache *, void *);
+	void (*ctor)(void *);
 	int inuse;		/* Offset to metadata */
 	int align;		/* Alignment */
 	const char *name;	/* Name (only for display!) */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 1fdc2eb2f6d..474984f9e03 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -207,7 +207,7 @@ static int mqueue_get_sb(struct file_system_type *fs_type,
 	return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index b99d73e971a..80e83e459b1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1442,7 +1442,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
-static void sighand_ctor(struct kmem_cache *cachep, void *data)
+static void sighand_ctor(void *data)
 {
 	struct sighand_struct *sighand = data;
 
diff --git a/lib/idr.c b/lib/idr.c
index 3476f8203e9..e728c7fccc4 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -607,7 +607,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 }
 EXPORT_SYMBOL(idr_replace);
 
-static void idr_cache_ctor(struct kmem_cache *idr_layer_cache, void *idr_layer)
+static void idr_cache_ctor(void *idr_layer)
 {
 	memset(idr_layer, 0, sizeof(struct idr_layer));
 }
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9c4f1ffa286..be86b32bc87 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1183,7 +1183,7 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(struct kmem_cache *cachep, void *node)
+radix_tree_node_ctor(void *node)
 {
 	memset(node, 0, sizeof(struct radix_tree_node));
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index abbd29f7c43..39ae5a9bf38 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -138,7 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		anon_vma_free(anon_vma);
 }
 
-static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
+static void anon_vma_ctor(void *data)
 {
 	struct anon_vma *anon_vma = data;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 1089092aeca..952d361774b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2352,7 +2352,7 @@ static void shmem_destroy_inode(struct inode *inode)
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537..918f04f7fef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
 	unsigned int dflags;		/* dynamic flags */
 
 	/* constructor func */
-	void (*ctor)(struct kmem_cache *, void *);
+	void (*ctor)(void *obj);
 
 /* 5) cache creation/removal */
 	const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
  */
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
-	unsigned long flags,
-	void (*ctor)(struct kmem_cache *, void *))
+	unsigned long flags, void (*ctor)(void *))
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		 * They must also be threaded.
 		 */
 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
-			cachep->ctor(cachep, objp + obj_offset(cachep));
+			cachep->ctor(objp + obj_offset(cachep));
 
 		if (cachep->flags & SLAB_RED_ZONE) {
 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 					 cachep->buffer_size / PAGE_SIZE, 0);
 #else
 		if (cachep->ctor)
-			cachep->ctor(cachep, objp);
+			cachep->ctor(objp);
 #endif
 		slab_bufctl(slabp)[i] = i + 1;
 	}
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 #endif
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
-		cachep->ctor(cachep, objp);
+		cachep->ctor(objp);
 #if ARCH_SLAB_MINALIGN
 	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
diff --git a/mm/slob.c b/mm/slob.c
index de268eb7ac7..d8fbd4d1bfa 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -525,12 +525,11 @@ struct kmem_cache {
 	unsigned int size, align;
 	unsigned long flags;
 	const char *name;
-	void (*ctor)(struct kmem_cache *, void *);
+	void (*ctor)(void *);
 };
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
-	size_t align, unsigned long flags,
-	void (*ctor)(struct kmem_cache *, void *))
+	size_t align, unsigned long flags, void (*ctor)(void *))
 {
 	struct kmem_cache *c;
 
@@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 		b = slob_new_page(flags, get_order(c->size), node);
 
 	if (c->ctor)
-		c->ctor(c, b);
+		c->ctor(b);
 
 	return b;
 }
diff --git a/mm/slub.c b/mm/slub.c
index 77c21cf53ff..b7e2cd5d82d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
 
 static unsigned long kmem_cache_flags(unsigned long objsize,
 	unsigned long flags, const char *name,
-	void (*ctor)(struct kmem_cache *, void *))
+	void (*ctor)(void *))
 {
 	/*
 	 * Enable debugging if selected on the kernel commandline.
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
 static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
 static inline unsigned long kmem_cache_flags(unsigned long objsize,
 	unsigned long flags, const char *name,
-	void (*ctor)(struct kmem_cache *, void *))
+	void (*ctor)(void *))
 {
 	return flags;
 }
@@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 {
 	setup_object_debug(s, page, object);
 	if (unlikely(s->ctor))
-		s->ctor(s, object);
+		s->ctor(object);
 }
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
 		const char *name, size_t size,
 		size_t align, unsigned long flags,
-		void (*ctor)(struct kmem_cache *, void *))
+		void (*ctor)(void *))
 {
 	memset(s, 0, kmem_size);
 	s->name = name;
@@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s)
 
 static struct kmem_cache *find_mergeable(size_t size,
 		size_t align, unsigned long flags, const char *name,
-		void (*ctor)(struct kmem_cache *, void *))
+		void (*ctor)(void *))
 {
 	struct kmem_cache *s;
 
@@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size,
 }
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
-		size_t align, unsigned long flags,
-		void (*ctor)(struct kmem_cache *, void *))
+		size_t align, unsigned long flags, void (*ctor)(void *))
 {
 	struct kmem_cache *s;
 
diff --git a/net/socket.c b/net/socket.c
index 1310a82cbba..8ef8ba81b9e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -265,7 +265,7 @@ static void sock_destroy_inode(struct inode *inode)
 			container_of(inode, struct socket_alloc, vfs_inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct socket_alloc *ei = (struct socket_alloc *)foo;
 
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 5a9b0e7828c..23a2b8f6dc4 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -897,7 +897,7 @@ static struct file_system_type rpc_pipe_fs_type = {
 };
 
 static void
-init_once(struct kmem_cache * cachep, void *foo)
+init_once(void *foo)
 {
 	struct rpc_inode *rpci = (struct rpc_inode *) foo;
 
-- 
cgit v1.2.3-70-g09d2


From bda2fa535564ace56a395d5b65c6dc81305401fa Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@saeurebad.de>
Date: Fri, 25 Jul 2008 19:46:14 -0700
Subject: powerpc: use generic show_mem()

Remove arch-specific show_mem() in favor of the generic version.

This also removes the following redundant information display:

	- pages in swapcache, printed by show_swap_cache_info()

where show_mem() calls show_free_areas(), which calls
show_swap_cache_info().

Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/mem.c | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1ca2235f096..702691cb9e8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -186,45 +186,6 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
 }
 EXPORT_SYMBOL_GPL(walk_memory_resource);
 
-void show_mem(void)
-{
-	unsigned long total = 0, reserved = 0;
-	unsigned long shared = 0, cached = 0;
-	unsigned long highmem = 0;
-	struct page *page;
-	pg_data_t *pgdat;
-	unsigned long i;
-
-	printk("Mem-info:\n");
-	show_free_areas();
-	for_each_online_pgdat(pgdat) {
-		unsigned long flags;
-		pgdat_resize_lock(pgdat, &flags);
-		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			if (!pfn_valid(pgdat->node_start_pfn + i))
-				continue;
-			page = pgdat_page_nr(pgdat, i);
-			total++;
-			if (PageHighMem(page))
-				highmem++;
-			if (PageReserved(page))
-				reserved++;
-			else if (PageSwapCache(page))
-				cached++;
-			else if (page_count(page))
-				shared += page_count(page) - 1;
-		}
-		pgdat_resize_unlock(pgdat, &flags);
-	}
-	printk("%ld pages of RAM\n", total);
-#ifdef CONFIG_HIGHMEM
-	printk("%ld pages of HIGHMEM\n", highmem);
-#endif
-	printk("%ld reserved pages\n", reserved);
-	printk("%ld pages shared\n", shared);
-	printk("%ld pages swap cached\n", cached);
-}
-
 /*
  * Initialize the bootmem system and give it all the memory we
  * have available.  If we are using highmem, we only put the
-- 
cgit v1.2.3-70-g09d2