author     Artem Bityutskiy <Artem.Bityutskiy@nokia.com>  2011-03-25 17:41:20 +0200
committer  Artem Bityutskiy <Artem.Bityutskiy@nokia.com>  2011-03-25 17:41:20 +0200
commit     7bf7e370d5919112c223a269462cd0b546903829 (patch)
tree       03ccc715239df14ae168277dbccc9d9cf4d8a2c8 /arch/powerpc/platforms/pseries
parent     68b1a1e786f29c900fa1c516a402e24f0ece622a (diff)
parent     d39dd11c3e6a7af5c20bfac40594db36cf270f42 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus-1
* 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6: (9356 commits)
  [media] rc: update for bitop name changes
  fs: simplify iget & friends
  fs: pull inode->i_lock up out of writeback_single_inode
  fs: rename inode_lock to inode_hash_lock
  fs: move i_wb_list out from under inode_lock
  fs: move i_sb_list out from under inode_lock
  fs: remove inode_lock from iput_final and prune_icache
  fs: Lock the inode LRU list separately
  fs: factor inode disposal
  fs: protect inode->i_state with inode->i_lock
  lib, arch: add filter argument to show_mem and fix private implementations
  SLUB: Write to per cpu data when allocating it
  slub: Fix debugobjects with lockless fastpath
  autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd()
  autofs4 - remove autofs4_lock
  autofs4 - fix d_manage() return on rcu-walk
  autofs4 - fix autofs4_expire_indirect() traversal
  autofs4 - fix dentry leak in autofs4_expire_direct()
  autofs4 - reinstate last used update on access
  vfs - check non-mountpoint dentry might block in __follow_mount_rcu()
  ...

NOTE! This merge commit was created to fix a compilation error. The block
tree was merged upstream and removed the 'elv_queue_empty()' function which
the new 'mtdswap' driver is using. So a simple merge of the mtd tree with
upstream does not compile. And the mtd tree has already been published, so
re-basing it is not an option.

To fix this unfortunate situation, I had to merge upstream into the
mtd-2.6.git tree without committing, put the fixup patch on top of this,
and then commit this. The result is that we do not have commits which do
not compile.

In other words, this merge commit "merges" 3 things: the MTD tree, the
upstream tree, and the fixup patch.
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r--arch/powerpc/platforms/pseries/cmm.c14
-rw-r--r--arch/powerpc/platforms/pseries/eeh.c2
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c66
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c587
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c37
-rw-r--r--arch/powerpc/platforms/pseries/msi.c14
-rw-r--r--arch/powerpc/platforms/pseries/nvram.c255
-rw-r--r--arch/powerpc/platforms/pseries/pci_dlpar.c2
-rw-r--r--arch/powerpc/platforms/pseries/setup.c5
-rw-r--r--arch/powerpc/platforms/pseries/xics.c89
10 files changed, 940 insertions, 131 deletions
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index f4803868642..3cafc306b97 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -508,12 +508,7 @@ static int cmm_memory_isolate_cb(struct notifier_block *self,
if (action == MEM_ISOLATE_COUNT)
ret = cmm_count_pages(arg);
- if (ret)
- ret = notifier_from_errno(ret);
- else
- ret = NOTIFY_OK;
-
- return ret;
+ return notifier_from_errno(ret);
}
static struct notifier_block cmm_mem_isolate_nb = {
@@ -635,12 +630,7 @@ static int cmm_memory_cb(struct notifier_block *self,
break;
}
- if (ret)
- ret = notifier_from_errno(ret);
- else
- ret = NOTIFY_OK;
-
- return ret;
+ return notifier_from_errno(ret);
}
static struct notifier_block cmm_mem_nb = {
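
The simplification in both hunks leans on notifier_from_errno() already handling the zero case: it returns NOTIFY_OK for 0 and an encoded stop value for a negative errno, so the open-coded if/else was redundant. Roughly, the helper in include/linux/notifier.h looks like:

	static inline int notifier_from_errno(int err)
	{
		if (err)
			return NOTIFY_STOP_MASK | (NOTIFY_BAD - err);
		return NOTIFY_OK;
	}
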
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 17a11c82e6f..3cc4d102b1f 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -876,7 +876,7 @@ void eeh_restore_bars(struct pci_dn *pdn)
*
* Save the values of the device bars. Unlike the restore
* routine, this routine is *not* recursive. This is because
- * PCI devices are added individuallly; but, for the restore,
+ * PCI devices are added individually; but, for the restore,
* an entire slot is reset at a time.
*/
static void eeh_save_bars(struct pci_dn *pdn)
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index bc880366414..33867ec4a23 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -17,6 +17,54 @@
#include <asm/pSeries_reconfig.h>
#include <asm/sparsemem.h>
+static unsigned long get_memblock_size(void)
+{
+ struct device_node *np;
+ unsigned int memblock_size = 0;
+
+ np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (np) {
+ const unsigned long *size;
+
+ size = of_get_property(np, "ibm,lmb-size", NULL);
+ memblock_size = size ? *size : 0;
+
+ of_node_put(np);
+ } else {
+ unsigned int memzero_size = 0;
+ const unsigned int *regs;
+
+ np = of_find_node_by_path("/memory@0");
+ if (np) {
+ regs = of_get_property(np, "reg", NULL);
+ memzero_size = regs ? regs[3] : 0;
+ of_node_put(np);
+ }
+
+ if (memzero_size) {
+ /* We now know the size of memory@0, use this to find
+ * the first memoryblock and get its size.
+ */
+ char buf[64];
+
+ sprintf(buf, "/memory@%x", memzero_size);
+ np = of_find_node_by_path(buf);
+ if (np) {
+ regs = of_get_property(np, "reg", NULL);
+ memblock_size = regs ? regs[3] : 0;
+ of_node_put(np);
+ }
+ }
+ }
+
+ return memblock_size;
+}
+
+unsigned long memory_block_size_bytes(void)
+{
+ return get_memblock_size();
+}
+
static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
{
unsigned long start, start_pfn;
@@ -127,30 +175,22 @@ static int pseries_add_memory(struct device_node *np)
static int pseries_drconf_memory(unsigned long *base, unsigned int action)
{
- struct device_node *np;
- const unsigned long *lmb_size;
+ unsigned long memblock_size;
int rc;
- np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
- if (!np)
+ memblock_size = get_memblock_size();
+ if (!memblock_size)
return -EINVAL;
- lmb_size = of_get_property(np, "ibm,lmb-size", NULL);
- if (!lmb_size) {
- of_node_put(np);
- return -EINVAL;
- }
-
if (action == PSERIES_DRCONF_MEM_ADD) {
- rc = memblock_add(*base, *lmb_size);
+ rc = memblock_add(*base, memblock_size);
rc = (rc < 0) ? -EINVAL : 0;
} else if (action == PSERIES_DRCONF_MEM_REMOVE) {
- rc = pseries_remove_memblock(*base, *lmb_size);
+ rc = pseries_remove_memblock(*base, memblock_size);
} else {
rc = -EINVAL;
}
- of_node_put(np);
return rc;
}
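
A note on the regs[3] indexing in get_memblock_size() above: it assumes the usual 64-bit pseries layout of a /memory node's "reg" property, i.e. two address cells followed by two size cells. A sketch of that assumption:

	/*
	 * Assumed "reg" layout (#address-cells = 2, #size-cells = 2),
	 * seen as an array of u32 cells:
	 *
	 *   regs[0..1] = 64-bit base address
	 *   regs[2..3] = 64-bit size
	 *
	 * so regs[3] is the low word of the size, which is the whole
	 * size for any memory block smaller than 4GB.
	 */
	const unsigned int *regs = of_get_property(np, "reg", NULL);
	unsigned int size = regs ? regs[3] : 0;
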
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index edea60b7ee9..154c464cdca 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -33,6 +33,7 @@
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/crash_dump.h>
+#include <linux/memory.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/rtas.h>
@@ -45,6 +46,7 @@
#include <asm/tce.h>
#include <asm/ppc-pci.h>
#include <asm/udbg.h>
+#include <asm/mmzone.h>
#include "plpar_wrappers.h"
@@ -270,6 +272,152 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
return tce_ret;
}
+/* this is compatible with cells for the device tree property */
+struct dynamic_dma_window_prop {
+ __be32 liobn; /* tce table number */
+ __be64 dma_base; /* address hi,lo */
+ __be32 tce_shift; /* ilog2(tce_page_size) */
+ __be32 window_shift; /* ilog2(tce_window_size) */
+};
+
+struct direct_window {
+ struct device_node *device;
+ const struct dynamic_dma_window_prop *prop;
+ struct list_head list;
+};
+
+/* Dynamic DMA Window support */
+struct ddw_query_response {
+ u32 windows_available;
+ u32 largest_available_block;
+ u32 page_size;
+ u32 migration_capable;
+};
+
+struct ddw_create_response {
+ u32 liobn;
+ u32 addr_hi;
+ u32 addr_lo;
+};
+
+static LIST_HEAD(direct_window_list);
+/* prevents races between memory on/offline and window creation */
+static DEFINE_SPINLOCK(direct_window_list_lock);
+/* protects initializing window twice for same device */
+static DEFINE_MUTEX(direct_window_init_mutex);
+#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+
+static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
+ unsigned long num_pfn, const void *arg)
+{
+ const struct dynamic_dma_window_prop *maprange = arg;
+ int rc;
+ u64 tce_size, num_tce, dma_offset, next;
+ u32 tce_shift;
+ long limit;
+
+ tce_shift = be32_to_cpu(maprange->tce_shift);
+ tce_size = 1ULL << tce_shift;
+ next = start_pfn << PAGE_SHIFT;
+ num_tce = num_pfn << PAGE_SHIFT;
+
+ /* round back to the beginning of the tce page size */
+ num_tce += next & (tce_size - 1);
+ next &= ~(tce_size - 1);
+
+ /* convert to number of tces */
+ num_tce |= tce_size - 1;
+ num_tce >>= tce_shift;
+
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, num_tce, 512);
+ dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+ rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
+ dma_offset,
+ 0, limit);
+ num_tce -= limit;
+ } while (num_tce > 0 && !rc);
+
+ return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
+ unsigned long num_pfn, const void *arg)
+{
+ const struct dynamic_dma_window_prop *maprange = arg;
+ u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn;
+ u32 tce_shift;
+ u64 rc = 0;
+ long l, limit;
+
+ local_irq_disable(); /* to protect tcep and the page behind it */
+ tcep = __get_cpu_var(tce_page);
+
+ if (!tcep) {
+ tcep = (u64 *)__get_free_page(GFP_ATOMIC);
+ if (!tcep) {
+ local_irq_enable();
+ return -ENOMEM;
+ }
+ __get_cpu_var(tce_page) = tcep;
+ }
+
+ proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
+
+ liobn = (u64)be32_to_cpu(maprange->liobn);
+ tce_shift = be32_to_cpu(maprange->tce_shift);
+ tce_size = 1ULL << tce_shift;
+ next = start_pfn << PAGE_SHIFT;
+ num_tce = num_pfn << PAGE_SHIFT;
+
+ /* round back to the beginning of the tce page size */
+ num_tce += next & (tce_size - 1);
+ next &= ~(tce_size - 1);
+
+ /* convert to number of tces */
+ num_tce |= tce_size - 1;
+ num_tce >>= tce_shift;
+
+ /* We can map max one pageful of TCEs at a time */
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
+ dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+ for (l = 0; l < limit; l++) {
+ tcep[l] = proto_tce | next;
+ next += tce_size;
+ }
+
+ rc = plpar_tce_put_indirect(liobn,
+ dma_offset,
+ (u64)virt_to_abs(tcep),
+ limit);
+
+ num_tce -= limit;
+ } while (num_tce > 0 && !rc);
+
+ /* error cleanup: caller will clear whole range */
+
+ local_irq_enable();
+ return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
+ unsigned long num_pfn, void *arg)
+{
+ return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
+}
+
+
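
The alignment arithmetic shared by the two helpers above rounds the start down to a TCE-page boundary (growing the length by the same amount) and then converts the byte count into a TCE count. A worked example, assuming 16MB TCE pages (tce_shift = 24) and hypothetical inputs:

	/*
	 * next = 0x1010000 (unaligned), num_tce = 0xff0000 bytes,
	 * so the range ends at the aligned address 0x2000000:
	 *
	 *   num_tce += next & 0xffffff;  ->  0xff0000 + 0x10000 = 0x1000000
	 *   next    &= ~0xffffff;        ->  0x1000000 (aligned start)
	 *   num_tce |= 0xffffff;         ->  0x1ffffff
	 *   num_tce >>= 24;              ->  1 TCE, covering
	 *                                    [0x1000000, 0x2000000)
	 */
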
#ifdef CONFIG_PCI
static void iommu_table_setparms(struct pci_controller *phb,
struct device_node *dn,
@@ -495,6 +643,329 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
pci_name(dev));
}
+static int __read_mostly disable_ddw;
+
+static int __init disable_ddw_setup(char *str)
+{
+ disable_ddw = 1;
+ printk(KERN_INFO "ppc iommu: disabling ddw.\n");
+
+ return 0;
+}
+
+early_param("disable_ddw", disable_ddw_setup);
+
+static void remove_ddw(struct device_node *np)
+{
+ struct dynamic_dma_window_prop *dwp;
+ struct property *win64;
+ const u32 *ddr_avail;
+ u64 liobn;
+ int len, ret;
+
+ ddr_avail = of_get_property(np, "ibm,ddw-applicable", &len);
+ win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
+ if (!win64 || !ddr_avail || len < 3 * sizeof(u32))
+ return;
+
+ dwp = win64->value;
+ liobn = (u64)be32_to_cpu(dwp->liobn);
+
+ /* clear the whole window, note the arg is in kernel pages */
+ ret = tce_clearrange_multi_pSeriesLP(0,
+ 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
+ if (ret)
+ pr_warning("%s failed to clear tces in window.\n",
+ np->full_name);
+ else
+ pr_debug("%s successfully cleared tces in window.\n",
+ np->full_name);
+
+ ret = rtas_call(ddr_avail[2], 1, 1, NULL, liobn);
+ if (ret)
+ pr_warning("%s: failed to remove direct window: rtas returned "
+ "%d to ibm,remove-pe-dma-window(%x) %llx\n",
+ np->full_name, ret, ddr_avail[2], liobn);
+ else
+ pr_debug("%s: successfully removed direct window: rtas returned "
+ "%d to ibm,remove-pe-dma-window(%x) %llx\n",
+ np->full_name, ret, ddr_avail[2], liobn);
+}
+
+
+static int dupe_ddw_if_already_created(struct pci_dev *dev, struct device_node *pdn)
+{
+ struct device_node *dn;
+ struct pci_dn *pcidn;
+ struct direct_window *window;
+ const struct dynamic_dma_window_prop *direct64;
+ u64 dma_addr = 0;
+
+ dn = pci_device_to_OF_node(dev);
+ pcidn = PCI_DN(dn);
+ spin_lock(&direct_window_list_lock);
+ /* check if we already created a window and dupe that config if so */
+ list_for_each_entry(window, &direct_window_list, list) {
+ if (window->device == pdn) {
+ direct64 = window->prop;
+ dma_addr = direct64->dma_base;
+ break;
+ }
+ }
+ spin_unlock(&direct_window_list_lock);
+
+ return dma_addr;
+}
+
+static u64 dupe_ddw_if_kexec(struct pci_dev *dev, struct device_node *pdn)
+{
+ struct device_node *dn;
+ struct pci_dn *pcidn;
+ int len;
+ struct direct_window *window;
+ const struct dynamic_dma_window_prop *direct64;
+ u64 dma_addr = 0;
+
+ dn = pci_device_to_OF_node(dev);
+ pcidn = PCI_DN(dn);
+ direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+ if (direct64) {
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window) {
+ remove_ddw(pdn);
+ } else {
+ window->device = pdn;
+ window->prop = direct64;
+ spin_lock(&direct_window_list_lock);
+ list_add(&window->list, &direct_window_list);
+ spin_unlock(&direct_window_list_lock);
+ dma_addr = direct64->dma_base;
+ }
+ }
+
+ return dma_addr;
+}
+
+static int query_ddw(struct pci_dev *dev, const u32 *ddr_avail,
+ struct ddw_query_response *query)
+{
+ struct device_node *dn;
+ struct pci_dn *pcidn;
+ u32 cfg_addr;
+ u64 buid;
+ int ret;
+
+ /*
+ * Get the config address and phb buid of the PE window.
+ * Rely on eeh to retrieve this for us.
+ * Retrieve them from the pci device, not the node with the
+ * dma-window property
+ */
+ dn = pci_device_to_OF_node(dev);
+ pcidn = PCI_DN(dn);
+ cfg_addr = pcidn->eeh_config_addr;
+ if (pcidn->eeh_pe_config_addr)
+ cfg_addr = pcidn->eeh_pe_config_addr;
+ buid = pcidn->phb->buid;
+ ret = rtas_call(ddr_avail[0], 3, 5, (u32 *)query,
+ cfg_addr, BUID_HI(buid), BUID_LO(buid));
+ dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
+ " returned %d\n", ddr_avail[0], cfg_addr, BUID_HI(buid),
+ BUID_LO(buid), ret);
+ return ret;
+}
+
+static int create_ddw(struct pci_dev *dev, const u32 *ddr_avail,
+ struct ddw_create_response *create, int page_shift,
+ int window_shift)
+{
+ struct device_node *dn;
+ struct pci_dn *pcidn;
+ u32 cfg_addr;
+ u64 buid;
+ int ret;
+
+ /*
+ * Get the config address and phb buid of the PE window.
+ * Rely on eeh to retrieve this for us.
+ * Retrieve them from the pci device, not the node with the
+ * dma-window property
+ */
+ dn = pci_device_to_OF_node(dev);
+ pcidn = PCI_DN(dn);
+ cfg_addr = pcidn->eeh_config_addr;
+ if (pcidn->eeh_pe_config_addr)
+ cfg_addr = pcidn->eeh_pe_config_addr;
+ buid = pcidn->phb->buid;
+
+ do {
+ /* extra outputs are LIOBN and dma-addr (hi, lo) */
+ ret = rtas_call(ddr_avail[1], 5, 4, (u32 *)create, cfg_addr,
+ BUID_HI(buid), BUID_LO(buid), page_shift, window_shift);
+ } while (rtas_busy_delay(ret));
+ dev_info(&dev->dev,
+ "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
+ "(liobn = 0x%x starting addr = %x %x)\n", ddr_avail[1],
+ cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
+ window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
+
+ return ret;
+}
+
+/*
+ * If the PE supports dynamic dma windows, and there is space for a table
+ * that can map all pages in a linear offset, then setup such a table,
+ * and record the dma-offset in the struct device.
+ *
+ * dev: the pci device we are checking
+ * pdn: the parent pe node with the ibm,dma-window property
+ * Future: also check if we can remap the base window for our base page size
+ *
+ * returns the dma offset for use by dma_set_mask
+ */
+static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+{
+ int len, ret;
+ struct ddw_query_response query;
+ struct ddw_create_response create;
+ int page_shift;
+ u64 dma_addr, max_addr;
+ struct device_node *dn;
+ const u32 *uninitialized_var(ddr_avail);
+ struct direct_window *window;
+ struct property *uninitialized_var(win64);
+ struct dynamic_dma_window_prop *ddwprop;
+
+ mutex_lock(&direct_window_init_mutex);
+
+ dma_addr = dupe_ddw_if_already_created(dev, pdn);
+ if (dma_addr != 0)
+ goto out_unlock;
+
+ dma_addr = dupe_ddw_if_kexec(dev, pdn);
+ if (dma_addr != 0)
+ goto out_unlock;
+
+ /*
+ * the ibm,ddw-applicable property holds the tokens for:
+ * ibm,query-pe-dma-window
+ * ibm,create-pe-dma-window
+ * ibm,remove-pe-dma-window
+ * for the given node in that order.
+ * the property is actually in the parent, not the PE
+ */
+ ddr_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);
+ if (!ddr_avail || len < 3 * sizeof(u32))
+ goto out_unlock;
+
+ /*
+ * Query if there is a second window of size to map the
+ * whole partition. Query returns number of windows, largest
+ * block assigned to PE (partition endpoint), and two bitmasks
+ * of page sizes: supported and supported for migrate-dma.
+ */
+ dn = pci_device_to_OF_node(dev);
+ ret = query_ddw(dev, ddr_avail, &query);
+ if (ret != 0)
+ goto out_unlock;
+
+ if (query.windows_available == 0) {
+ /*
+ * no additional windows are available for this device.
+ * We might be able to reallocate the existing window,
+ * trading in for a larger page size.
+ */
+ dev_dbg(&dev->dev, "no free dynamic windows");
+ goto out_unlock;
+ }
+ if (query.page_size & 4) {
+ page_shift = 24; /* 16MB */
+ } else if (query.page_size & 2) {
+ page_shift = 16; /* 64kB */
+ } else if (query.page_size & 1) {
+ page_shift = 12; /* 4kB */
+ } else {
+ dev_dbg(&dev->dev, "no supported direct page size in mask %x",
+ query.page_size);
+ goto out_unlock;
+ }
+ /* verify the window * number of ptes will map the partition */
+ /* check largest block * page size > max memory hotplug addr */
+ max_addr = memory_hotplug_max();
+ if (query.largest_available_block < (max_addr >> page_shift)) {
+ dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
+ "%llu-sized pages\n", max_addr, query.largest_available_block,
+ 1ULL << page_shift);
+ goto out_unlock;
+ }
+ len = order_base_2(max_addr);
+ win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
+ if (!win64) {
+ dev_info(&dev->dev,
+ "couldn't allocate property for 64bit dma window\n");
+ goto out_unlock;
+ }
+ win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+ win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
+ if (!win64->name || !win64->value) {
+ dev_info(&dev->dev,
+ "couldn't allocate property name and value\n");
+ goto out_free_prop;
+ }
+
+ ret = create_ddw(dev, ddr_avail, &create, page_shift, len);
+ if (ret != 0)
+ goto out_free_prop;
+
+ ddwprop->liobn = cpu_to_be32(create.liobn);
+ ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));
+ ddwprop->tce_shift = cpu_to_be32(page_shift);
+ ddwprop->window_shift = cpu_to_be32(len);
+
+ dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",
+ create.liobn, dn->full_name);
+
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window)
+ goto out_clear_window;
+
+ ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
+ win64->value, tce_setrange_multi_pSeriesLP_walk);
+ if (ret) {
+ dev_info(&dev->dev, "failed to map direct window for %s: %d\n",
+ dn->full_name, ret);
+ goto out_clear_window;
+ }
+
+ ret = prom_add_property(pdn, win64);
+ if (ret) {
+ dev_err(&dev->dev, "unable to add dma window property for %s: %d",
+ pdn->full_name, ret);
+ goto out_clear_window;
+ }
+
+ window->device = pdn;
+ window->prop = ddwprop;
+ spin_lock(&direct_window_list_lock);
+ list_add(&window->list, &direct_window_list);
+ spin_unlock(&direct_window_list_lock);
+
+ dma_addr = of_read_number(&create.addr_hi, 2);
+ goto out_unlock;
+
+out_clear_window:
+ remove_ddw(pdn);
+
+out_free_prop:
+ kfree(win64->name);
+ kfree(win64->value);
+ kfree(win64);
+
+out_unlock:
+ mutex_unlock(&direct_window_init_mutex);
+ return dma_addr;
+}
+
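
For orientation, the DDW setup sequence implemented by enable_ddw() above, condensed (the three RTAS tokens come from "ibm,ddw-applicable", in query/create/remove order):

	/*
	 *  1. query_ddw()   - any windows free? largest block? page sizes?
	 *  2. pick page_shift (prefer 16MB, then 64kB, then 4kB) and
	 *     len = order_base_2(memory_hotplug_max())
	 *  3. create_ddw()  - returns the new window's liobn and dma base
	 *  4. walk_system_ram_range(..., tce_setrange_multi_pSeriesLP_walk)
	 *                   - linearly map all of RAM into the window
	 *  5. prom_add_property(pdn, win64)
	 *                   - persist the window so a kexec'd kernel can
	 *                     reuse it via dupe_ddw_if_kexec()
	 */
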
static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
{
struct device_node *pdn, *dn;
@@ -541,23 +1012,137 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
set_iommu_table_base(&dev->dev, pci->iommu_table);
}
+
+static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
+{
+ bool ddw_enabled = false;
+ struct device_node *pdn, *dn;
+ struct pci_dev *pdev;
+ const void *dma_window = NULL;
+ u64 dma_offset;
+
+ if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+ return -EIO;
+
+ /* only attempt to use a new window if 64-bit DMA is requested */
+ if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
+ pdev = to_pci_dev(dev);
+
+ dn = pci_device_to_OF_node(pdev);
+ dev_dbg(dev, "node is %s\n", dn->full_name);
+
+ /*
+ * the device tree might contain the dma-window properties
+ * per-device and not necessarily for the bus. So we need to
+ * search upwards in the tree until we either hit a dma-window
+ * property, OR find a parent with a table already allocated.
+ */
+ for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+ pdn = pdn->parent) {
+ dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ if (dma_window)
+ break;
+ }
+ if (pdn && PCI_DN(pdn)) {
+ dma_offset = enable_ddw(pdev, pdn);
+ if (dma_offset != 0) {
+ dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
+ set_dma_offset(dev, dma_offset);
+ set_dma_ops(dev, &dma_direct_ops);
+ ddw_enabled = true;
+ }
+ }
+ }
+
+ /* fall-through to iommu ops */
+ if (!ddw_enabled) {
+ dev_info(dev, "Using 32-bit DMA via iommu\n");
+ set_dma_ops(dev, &dma_iommu_ops);
+ }
+
+ *dev->dma_mask = dma_mask;
+ return 0;
+}
+
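
With ppc_md.dma_set_mask wired up (see the iommu_init_early_pSeries hunk below), a driver reaches this path simply by asking for a 64-bit mask. A minimal, hypothetical probe sketch (foo_probe is not from this patch):

	static int foo_probe(struct pci_dev *pdev)
	{
		/* a 64-bit mask routes us through enable_ddw() */
		if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) {
			/* no direct window: fall back to 32-bit iommu DMA */
			if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)))
				return -EIO;
		}
		return 0;
	}
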
#else /* CONFIG_PCI */
#define pci_dma_bus_setup_pSeries NULL
#define pci_dma_dev_setup_pSeries NULL
#define pci_dma_bus_setup_pSeriesLP NULL
#define pci_dma_dev_setup_pSeriesLP NULL
+#define dma_set_mask_pSeriesLP NULL
#endif /* !CONFIG_PCI */
+static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct direct_window *window;
+ struct memory_notify *arg = data;
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
+ arg->nr_pages, window->prop);
+ /* XXX log error */
+ }
+ spin_unlock(&direct_window_list_lock);
+ break;
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE:
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
+ arg->nr_pages, window->prop);
+ /* XXX log error */
+ }
+ spin_unlock(&direct_window_list_lock);
+ break;
+ default:
+ break;
+ }
+ if (ret && action != MEM_CANCEL_ONLINE)
+ return NOTIFY_BAD;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block iommu_mem_nb = {
+ .notifier_call = iommu_mem_notifier,
+};
+
static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
{
int err = NOTIFY_OK;
struct device_node *np = node;
struct pci_dn *pci = PCI_DN(np);
+ struct direct_window *window;
switch (action) {
case PSERIES_RECONFIG_REMOVE:
if (pci && pci->iommu_table)
iommu_free_table(pci->iommu_table, np->full_name);
+
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ if (window->device == np) {
+ list_del(&window->list);
+ kfree(window);
+ break;
+ }
+ }
+ spin_unlock(&direct_window_list_lock);
+
+ /*
+ * Because the notifier runs after isolation of the
+ * slot, we are guaranteed any DMA window has already
+ * been revoked and the TCEs have been marked invalid,
+ * so we don't need a call to remove_ddw(np). However,
+ * if an additional notifier action is added before the
+ * isolate call, we should update this code for
+ * completeness with such a call.
+ */
break;
default:
err = NOTIFY_DONE;
@@ -587,6 +1172,7 @@ void iommu_init_early_pSeries(void)
ppc_md.tce_get = tce_get_pSeriesLP;
ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
+ ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
} else {
ppc_md.tce_build = tce_build_pSeries;
ppc_md.tce_free = tce_free_pSeries;
@@ -597,6 +1183,7 @@ void iommu_init_early_pSeries(void)
pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
+ register_memory_notifier(&iommu_mem_nb);
set_pci_dma_ops(&dma_iommu_ops);
}
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 5d3ea9f60dd..ca5d5898d32 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -713,6 +713,13 @@ EXPORT_SYMBOL(arch_free_page);
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
extern long hcall_tracepoint_refcount;
+/*
+ * Since the tracing code might execute hcalls we need to guard against
+ * recursion. One example of this are spinlocks calling H_YIELD on
+ * shared processor partitions.
+ */
+static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
+
void hcall_tracepoint_regfunc(void)
{
hcall_tracepoint_refcount++;
@@ -725,12 +732,42 @@ void hcall_tracepoint_unregfunc(void)
void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
{
+ unsigned long flags;
+ unsigned int *depth;
+
+ local_irq_save(flags);
+
+ depth = &__get_cpu_var(hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
trace_hcall_entry(opcode, args);
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
}
void __trace_hcall_exit(long opcode, unsigned long retval,
unsigned long *retbuf)
{
+ unsigned long flags;
+ unsigned int *depth;
+
+ local_irq_save(flags);
+
+ depth = &__get_cpu_var(hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
trace_hcall_exit(opcode, retval, retbuf);
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
}
#endif
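
The per-cpu depth counter is a generic reentrancy guard: any hcall made while emitting the trace event re-enters this path, sees a non-zero depth, and bails out instead of recursing. The pattern in isolation (names are illustrative, not from the patch):

	static DEFINE_PER_CPU(unsigned int, trace_depth);

	static void guarded_trace(unsigned long opcode)
	{
		unsigned long flags;
		unsigned int *depth;

		local_irq_save(flags);	/* keep the depth coherent per cpu */
		depth = &__get_cpu_var(trace_depth);
		if (*depth)
			goto out;	/* nested call: skip tracing */
		(*depth)++;
		/* ... emit the trace event here ... */
		(*depth)--;
	out:
		local_irq_restore(flags);
	}
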
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 1164c3430f2..18ac801f8e9 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -93,8 +93,18 @@ static void rtas_disable_msi(struct pci_dev *pdev)
if (!pdn)
return;
- if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0)
- pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+ /*
+ * disabling MSI with the explicit interface also disables MSI-X
+ */
+ if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) {
+ /*
+ * may have failed because explicit interface is not
+ * present
+ */
+ if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) {
+ pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+ }
+ }
}
static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 7e828ba29bc..419707b0724 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -16,6 +16,8 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/kmsg_dump.h>
#include <asm/uaccess.h>
#include <asm/nvram.h>
#include <asm/rtas.h>
@@ -30,17 +32,54 @@ static int nvram_fetch, nvram_store;
static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */
static DEFINE_SPINLOCK(nvram_lock);
-static long nvram_error_log_index = -1;
-static long nvram_error_log_size = 0;
-
struct err_log_info {
int error_type;
unsigned int seq_num;
};
-#define NVRAM_MAX_REQ 2079
-#define NVRAM_MIN_REQ 1055
-#define NVRAM_LOG_PART_NAME "ibm,rtas-log"
+struct nvram_os_partition {
+ const char *name;
+ int req_size; /* desired size, in bytes */
+ int min_size; /* minimum acceptable size (0 means req_size) */
+ long size; /* size of data portion (excluding err_log_info) */
+ long index; /* offset of data portion of partition */
+};
+
+static struct nvram_os_partition rtas_log_partition = {
+ .name = "ibm,rtas-log",
+ .req_size = 2079,
+ .min_size = 1055,
+ .index = -1
+};
+
+static struct nvram_os_partition oops_log_partition = {
+ .name = "lnx,oops-log",
+ .req_size = 4000,
+ .min_size = 2000,
+ .index = -1
+};
+
+static const char *pseries_nvram_os_partitions[] = {
+ "ibm,rtas-log",
+ "lnx,oops-log",
+ NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *old_msgs, unsigned long old_len,
+ const char *new_msgs, unsigned long new_len);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+ .dump = oops_to_nvram
+};
+
+/* See clobbering_unread_rtas_event() */
+#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */
+static unsigned long last_unread_rtas_event; /* timestamp */
+
+/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */
+static char *oops_buf;
static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
{
@@ -134,7 +173,7 @@ static ssize_t pSeries_nvram_get_size(void)
}
-/* nvram_write_error_log
+/* nvram_write_os_partition, nvram_write_error_log
*
* We need to buffer the error logs into nvram to ensure that we have
* the failure information to decode. If we have a severe error there
@@ -156,48 +195,58 @@ static ssize_t pSeries_nvram_get_size(void)
* The 'data' section would look like (in bytes):
* +--------------+------------+-----------------------------------+
* | event_logged | sequence # | error log |
- * |0 3|4 7|8 nvram_error_log_size-1|
+ * |0 3|4 7|8 error_log_size-1|
* +--------------+------------+-----------------------------------+
*
* event_logged: 0 if event has not been logged to syslog, 1 if it has
* sequence #: The unique sequence # for each event. (until it wraps)
* error log: The error log from event_scan
*/
-int nvram_write_error_log(char * buff, int length,
- unsigned int err_type, unsigned int error_log_cnt)
+int nvram_write_os_partition(struct nvram_os_partition *part, char * buff,
+ int length, unsigned int err_type, unsigned int error_log_cnt)
{
int rc;
loff_t tmp_index;
struct err_log_info info;
- if (nvram_error_log_index == -1) {
+ if (part->index == -1) {
return -ESPIPE;
}
- if (length > nvram_error_log_size) {
- length = nvram_error_log_size;
+ if (length > part->size) {
+ length = part->size;
}
info.error_type = err_type;
info.seq_num = error_log_cnt;
- tmp_index = nvram_error_log_index;
+ tmp_index = part->index;
rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
if (rc <= 0) {
- printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
+ pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
return rc;
}
rc = ppc_md.nvram_write(buff, length, &tmp_index);
if (rc <= 0) {
- printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
+ pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
return rc;
}
return 0;
}
+int nvram_write_error_log(char * buff, int length,
+ unsigned int err_type, unsigned int error_log_cnt)
+{
+ int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
+ err_type, error_log_cnt);
+ if (!rc)
+ last_unread_rtas_event = get_seconds();
+ return rc;
+}
+
/* nvram_read_error_log
*
* Reads nvram for error log for at most 'length'
@@ -209,13 +258,13 @@ int nvram_read_error_log(char * buff, int length,
loff_t tmp_index;
struct err_log_info info;
- if (nvram_error_log_index == -1)
+ if (rtas_log_partition.index == -1)
return -1;
- if (length > nvram_error_log_size)
- length = nvram_error_log_size;
+ if (length > rtas_log_partition.size)
+ length = rtas_log_partition.size;
- tmp_index = nvram_error_log_index;
+ tmp_index = rtas_log_partition.index;
rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
if (rc <= 0) {
@@ -244,37 +293,40 @@ int nvram_clear_error_log(void)
int clear_word = ERR_FLAG_ALREADY_LOGGED;
int rc;
- if (nvram_error_log_index == -1)
+ if (rtas_log_partition.index == -1)
return -1;
- tmp_index = nvram_error_log_index;
+ tmp_index = rtas_log_partition.index;
rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
if (rc <= 0) {
printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
return rc;
}
+ last_unread_rtas_event = 0;
return 0;
}
-/* pseries_nvram_init_log_partition
+/* pseries_nvram_init_os_partition
*
- * This will setup the partition we need for buffering the
- * error logs and cleanup partitions if needed.
+ * This sets up a partition with an "OS" signature.
*
* The general strategy is the following:
- * 1.) If there is log partition large enough then use it.
- * 2.) If there is none large enough, search
- * for a free partition that is large enough.
- * 3.) If there is not a free partition large enough remove
- * _all_ OS partitions and consolidate the space.
- * 4.) Will first try getting a chunk that will satisfy the maximum
- * error log size (NVRAM_MAX_REQ).
- * 5.) If the max chunk cannot be allocated then try finding a chunk
- * that will satisfy the minum needed (NVRAM_MIN_REQ).
+ * 1.) If a partition with the indicated name already exists...
+ * - If it's large enough, use it.
+ * - Otherwise, recycle it and keep going.
+ * 2.) Search for a free partition that is large enough.
+ * 3.) If there's not a free partition large enough, recycle any obsolete
+ * OS partitions and try again.
+ * 4.) Will first try getting a chunk that will satisfy the requested size.
+ * 5.) If a chunk of the requested size cannot be allocated, then try finding
+ * a chunk that will satisfy the minimum needed.
+ *
+ * Returns 0 on success, else -1.
*/
-static int __init pseries_nvram_init_log_partition(void)
+static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
+ *part)
{
loff_t p;
int size;
@@ -282,47 +334,76 @@ static int __init pseries_nvram_init_log_partition(void)
/* Scan nvram for partitions */
nvram_scan_partitions();
- /* Lookg for ours */
- p = nvram_find_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS, &size);
+ /* Look for ours */
+ p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
/* Found one but too small, remove it */
- if (p && size < NVRAM_MIN_REQ) {
- pr_info("nvram: Found too small "NVRAM_LOG_PART_NAME" partition"
- ",removing it...");
- nvram_remove_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS);
+ if (p && size < part->min_size) {
+ pr_info("nvram: Found too small %s partition,"
+ " removing it...\n", part->name);
+ nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
p = 0;
}
/* Create one if we didn't find */
if (!p) {
- p = nvram_create_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS,
- NVRAM_MAX_REQ, NVRAM_MIN_REQ);
- /* No room for it, try to get rid of any OS partition
- * and try again
- */
+ p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+ part->req_size, part->min_size);
if (p == -ENOSPC) {
- pr_info("nvram: No room to create "NVRAM_LOG_PART_NAME
- " partition, deleting all OS partitions...");
- nvram_remove_partition(NULL, NVRAM_SIG_OS);
- p = nvram_create_partition(NVRAM_LOG_PART_NAME,
- NVRAM_SIG_OS, NVRAM_MAX_REQ,
- NVRAM_MIN_REQ);
+ pr_info("nvram: No room to create %s partition, "
+ "deleting any obsolete OS partitions...\n",
+ part->name);
+ nvram_remove_partition(NULL, NVRAM_SIG_OS,
+ pseries_nvram_os_partitions);
+ p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+ part->req_size, part->min_size);
}
}
if (p <= 0) {
- pr_err("nvram: Failed to find or create "NVRAM_LOG_PART_NAME
- " partition, err %d\n", (int)p);
- return 0;
+ pr_err("nvram: Failed to find or create %s"
+ " partition, err %d\n", part->name, (int)p);
+ return -1;
}
- nvram_error_log_index = p;
- nvram_error_log_size = nvram_get_partition_size(p) -
- sizeof(struct err_log_info);
+ part->index = p;
+ part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
return 0;
}
-machine_arch_initcall(pseries, pseries_nvram_init_log_partition);
+
+static void __init nvram_init_oops_partition(int rtas_partition_exists)
+{
+ int rc;
+
+ rc = pseries_nvram_init_os_partition(&oops_log_partition);
+ if (rc != 0) {
+ if (!rtas_partition_exists)
+ return;
+ pr_notice("nvram: Using %s partition to log both"
+ " RTAS errors and oops/panic reports\n",
+ rtas_log_partition.name);
+ memcpy(&oops_log_partition, &rtas_log_partition,
+ sizeof(rtas_log_partition));
+ }
+ oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+ rc = kmsg_dump_register(&nvram_kmsg_dumper);
+ if (rc != 0) {
+ pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
+ kfree(oops_buf);
+ return;
+ }
+}
+
+static int __init pseries_nvram_init_log_partitions(void)
+{
+ int rc;
+
+ rc = pseries_nvram_init_os_partition(&rtas_log_partition);
+ nvram_init_oops_partition(rc == 0);
+ return 0;
+}
+machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
int __init pSeries_nvram_init(void)
{
@@ -353,3 +434,59 @@ int __init pSeries_nvram_init(void)
return 0;
}
+
+/*
+ * Try to capture the last capture_len bytes of the printk buffer. Return
+ * the amount actually captured.
+ */
+static size_t capture_last_msgs(const char *old_msgs, size_t old_len,
+ const char *new_msgs, size_t new_len,
+ char *captured, size_t capture_len)
+{
+ if (new_len >= capture_len) {
+ memcpy(captured, new_msgs + (new_len - capture_len),
+ capture_len);
+ return capture_len;
+ } else {
+ /* Grab the end of old_msgs. */
+ size_t old_tail_len = min(old_len, capture_len - new_len);
+ memcpy(captured, old_msgs + (old_len - old_tail_len),
+ old_tail_len);
+ memcpy(captured + old_tail_len, new_msgs, new_len);
+ return old_tail_len + new_len;
+ }
+}
+
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports? And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process? Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+static int clobbering_unread_rtas_event(void)
+{
+ return (oops_log_partition.index == rtas_log_partition.index
+ && last_unread_rtas_event
+ && get_seconds() - last_unread_rtas_event <=
+ NVRAM_RTAS_READ_TIMEOUT);
+}
+
+/* our kmsg_dump callback */
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *old_msgs, unsigned long old_len,
+ const char *new_msgs, unsigned long new_len)
+{
+ static unsigned int oops_count = 0;
+ size_t text_len;
+
+ if (clobbering_unread_rtas_event())
+ return;
+
+ text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len,
+ oops_buf, oops_log_partition.size);
+ (void) nvram_write_os_partition(&oops_log_partition, oops_buf,
+ (int) text_len, ERR_TYPE_KERNEL_PANIC, ++oops_count);
+}
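
A worked example of capture_last_msgs() above, assuming a 4000-byte oops buffer (capture_len = 4000):

	/*
	 *  old_len = 3000, new_len = 1500:  new_len < capture_len, so take
	 *    old_tail_len = min(3000, 4000 - 1500) = 2500 bytes from the
	 *    end of old_msgs plus all 1500 new bytes -> 4000 captured.
	 *
	 *  old_len = 3000, new_len = 5000:  new_len >= capture_len, so
	 *    only the last 4000 bytes of new_msgs are kept.
	 */
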
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index 5fcc92a12d3..3bf4488aaec 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -149,7 +149,7 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
if (dn->child)
eeh_add_device_tree_early(dn);
- pcibios_scan_phb(phb, dn);
+ pcibios_scan_phb(phb);
pcibios_finish_adding_to_bus(phb->bus);
return phb;
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index d345bfd56bb..2a0089a2c82 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -114,10 +114,13 @@ static void __init fwnmi_init(void)
static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc)
{
+ struct irq_chip *chip = get_irq_desc_chip(desc);
unsigned int cascade_irq = i8259_irq();
+
if (cascade_irq != NO_IRQ)
generic_handle_irq(cascade_irq);
- desc->chip->eoi(irq);
+
+ chip->irq_eoi(&desc->irq_data);
}
static void __init pseries_setup_i8259_cascade(void)
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 7b96e5a270c..01fea46c033 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -202,20 +202,20 @@ static int get_irq_server(unsigned int virq, const struct cpumask *cpumask,
#define get_irq_server(virq, cpumask, strict_check) (default_server)
#endif
-static void xics_unmask_irq(unsigned int virq)
+static void xics_unmask_irq(struct irq_data *d)
{
unsigned int irq;
int call_status;
int server;
- pr_devel("xics: unmask virq %d\n", virq);
+ pr_devel("xics: unmask virq %d\n", d->irq);
- irq = (unsigned int)irq_map[virq].hwirq;
+ irq = (unsigned int)irq_map[d->irq].hwirq;
pr_devel(" -> map to hwirq 0x%x\n", irq);
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
return;
- server = get_irq_server(virq, irq_to_desc(virq)->affinity, 0);
+ server = get_irq_server(d->irq, d->affinity, 0);
call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server,
DEFAULT_PRIORITY);
@@ -235,61 +235,61 @@ static void xics_unmask_irq(unsigned int virq)
}
}
-static unsigned int xics_startup(unsigned int virq)
+static unsigned int xics_startup(struct irq_data *d)
{
/*
* The generic MSI code returns with the interrupt disabled on the
* card, using the MSI mask bits. Firmware doesn't appear to unmask
* at that level, so we do it here by hand.
*/
- if (irq_to_desc(virq)->msi_desc)
- unmask_msi_irq(irq_get_irq_data(virq));
+ if (d->msi_desc)
+ unmask_msi_irq(d);
/* unmask it */
- xics_unmask_irq(virq);
+ xics_unmask_irq(d);
return 0;
}
-static void xics_mask_real_irq(unsigned int irq)
+static void xics_mask_real_irq(struct irq_data *d)
{
int call_status;
- if (irq == XICS_IPI)
+ if (d->irq == XICS_IPI)
return;
- call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq);
+ call_status = rtas_call(ibm_int_off, 1, 1, NULL, d->irq);
if (call_status != 0) {
printk(KERN_ERR "%s: ibm_int_off irq=%u returned %d\n",
- __func__, irq, call_status);
+ __func__, d->irq, call_status);
return;
}
/* Have to set XIVE to 0xff to be able to remove a slot */
- call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq,
+ call_status = rtas_call(ibm_set_xive, 3, 1, NULL, d->irq,
default_server, 0xff);
if (call_status != 0) {
printk(KERN_ERR "%s: ibm_set_xive(0xff) irq=%u returned %d\n",
- __func__, irq, call_status);
+ __func__, d->irq, call_status);
return;
}
}
-static void xics_mask_irq(unsigned int virq)
+static void xics_mask_irq(struct irq_data *d)
{
unsigned int irq;
- pr_devel("xics: mask virq %d\n", virq);
+ pr_devel("xics: mask virq %d\n", d->irq);
- irq = (unsigned int)irq_map[virq].hwirq;
+ irq = (unsigned int)irq_map[d->irq].hwirq;
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
return;
- xics_mask_real_irq(irq);
+ xics_mask_real_irq(d);
}
static void xics_mask_unknown_vec(unsigned int vec)
{
printk(KERN_ERR "Interrupt %u (real) is invalid, disabling it.\n", vec);
- xics_mask_real_irq(vec);
+ xics_mask_real_irq(irq_get_irq_data(vec));
}
static inline unsigned int xics_xirr_vector(unsigned int xirr)
@@ -371,30 +371,31 @@ static unsigned char pop_cppr(void)
return os_cppr->stack[--os_cppr->index];
}
-static void xics_eoi_direct(unsigned int virq)
+static void xics_eoi_direct(struct irq_data *d)
{
- unsigned int irq = (unsigned int)irq_map[virq].hwirq;
+ unsigned int irq = (unsigned int)irq_map[d->irq].hwirq;
iosync();
direct_xirr_info_set((pop_cppr() << 24) | irq);
}
-static void xics_eoi_lpar(unsigned int virq)
+static void xics_eoi_lpar(struct irq_data *d)
{
- unsigned int irq = (unsigned int)irq_map[virq].hwirq;
+ unsigned int irq = (unsigned int)irq_map[d->irq].hwirq;
iosync();
lpar_xirr_info_set((pop_cppr() << 24) | irq);
}
-static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
+static int
+xics_set_affinity(struct irq_data *d, const struct cpumask *cpumask, bool force)
{
unsigned int irq;
int status;
int xics_status[2];
int irq_server;
- irq = (unsigned int)irq_map[virq].hwirq;
+ irq = (unsigned int)irq_map[d->irq].hwirq;
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
return -1;
@@ -406,13 +407,13 @@ static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
return -1;
}
- irq_server = get_irq_server(virq, cpumask, 1);
+ irq_server = get_irq_server(d->irq, cpumask, 1);
if (irq_server == -1) {
char cpulist[128];
cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask);
printk(KERN_WARNING
"%s: No online cpus in the mask %s for irq %d\n",
- __func__, cpulist, virq);
+ __func__, cpulist, d->irq);
return -1;
}
@@ -430,20 +431,20 @@ static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
static struct irq_chip xics_pic_direct = {
.name = "XICS",
- .startup = xics_startup,
- .mask = xics_mask_irq,
- .unmask = xics_unmask_irq,
- .eoi = xics_eoi_direct,
- .set_affinity = xics_set_affinity
+ .irq_startup = xics_startup,
+ .irq_mask = xics_mask_irq,
+ .irq_unmask = xics_unmask_irq,
+ .irq_eoi = xics_eoi_direct,
+ .irq_set_affinity = xics_set_affinity
};
static struct irq_chip xics_pic_lpar = {
.name = "XICS",
- .startup = xics_startup,
- .mask = xics_mask_irq,
- .unmask = xics_unmask_irq,
- .eoi = xics_eoi_lpar,
- .set_affinity = xics_set_affinity
+ .irq_startup = xics_startup,
+ .irq_mask = xics_mask_irq,
+ .irq_unmask = xics_unmask_irq,
+ .irq_eoi = xics_eoi_lpar,
+ .irq_set_affinity = xics_set_affinity
};
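
This file-wide rewrite tracks the genirq migration from irq-number callbacks to struct irq_data. The shape of the change for any one callback, sketched with a hypothetical foo_mask (the virq moves into d->irq, and affinity is reached via the irq_data rather than irq_to_desc()):

	/* old style */
	static void foo_mask(unsigned int virq)
	{
		unsigned int hwirq = (unsigned int)irq_map[virq].hwirq;
		/* ... mask hwirq ... */
	}

	/* new style */
	static void foo_mask(struct irq_data *d)
	{
		unsigned int hwirq = (unsigned int)irq_map[d->irq].hwirq;
		/* ... mask hwirq ... */
	}
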
@@ -890,6 +891,7 @@ void xics_migrate_irqs_away(void)
for_each_irq(virq) {
struct irq_desc *desc;
+ struct irq_chip *chip;
int xics_status[2];
int status;
unsigned long flags;
@@ -903,12 +905,15 @@ void xics_migrate_irqs_away(void)
/* We need to get IPIs still. */
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
continue;
+
desc = irq_to_desc(virq);
/* We only need to migrate enabled IRQS */
- if (desc == NULL || desc->chip == NULL
- || desc->action == NULL
- || desc->chip->set_affinity == NULL)
+ if (desc == NULL || desc->action == NULL)
+ continue;
+
+ chip = get_irq_desc_chip(desc);
+ if (chip == NULL || chip->irq_set_affinity == NULL)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -934,8 +939,8 @@ void xics_migrate_irqs_away(void)
virq, cpu);
/* Reset affinity to all cpus */
- cpumask_setall(irq_to_desc(virq)->affinity);
- desc->chip->set_affinity(virq, cpu_all_mask);
+ cpumask_setall(desc->irq_data.affinity);
+ chip->irq_set_affinity(&desc->irq_data, cpu_all_mask, true);
unlock:
raw_spin_unlock_irqrestore(&desc->lock, flags);
}