16 files changed, 5150 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
new file mode 100644
index 00000000000..e3fc3407bb1
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -0,0 +1,33 @@
+
+config PPC_SPLPAR
+	depends on PPC_PSERIES
+	bool "Support for shared-processor logical partitions"
+	default n
+	help
+	  Enabling this option will make the kernel run more efficiently
+	  on logically-partitioned pSeries systems which use shared
+	  processors, that is, which share physical processors between
+	  two or more partitions.
+
+config HMT
+	bool "Hardware multithreading"
+	depends on SMP && PPC_PSERIES && BROKEN
+	help
+	  This option enables hardware multithreading on RS64 cpus.
+	  pSeries systems p620 and p660 have such a cpu type.
+
+config EEH
+	bool "PCI Extended Error Handling (EEH)" if EMBEDDED
+	depends on PPC_PSERIES
+	default y if !EMBEDDED
+
+config SCANLOG
+	tristate "Scanlog dump interface"
+	depends on RTAS_PROC && PPC_PSERIES
+
+config LPARCFG
+	tristate "LPAR Configuration Data"
+	depends on PPC_PSERIES || PPC_ISERIES
+	help
+	Provide system capacity information via human readable
+	<key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
new file mode 100644
index 00000000000..b9938fece78
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -0,0 +1,5 @@
+obj-y			:= pci.o lpar.o hvCall.o nvram.o reconfig.o \
+			   setup.o iommu.o ras.o rtasd.o
+obj-$(CONFIG_SMP)	+= smp.o
+obj-$(CONFIG_IBMVIO)	+= vio.o
+obj-$(CONFIG_XICS)	+= xics.o
diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S
new file mode 100644
index 00000000000..176e8da7646
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvCall.S
@@ -0,0 +1,131 @@
+/*
+ * arch/ppc64/kernel/pSeries_hvCall.S
+ *
+ * This file contains the generic code to perform a call to the
+ * pSeries LPAR hypervisor.
+ * NOTE: this file will go away when we move to inline this work.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/hvcall.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+	
+#define STK_PARM(i)     (48 + ((i)-3)*8)
+
+	.text
+
+/* long plpar_hcall(unsigned long opcode,		R3
+			unsigned long arg1,		R4
+			unsigned long arg2,		R5
+			unsigned long arg3,		R6
+			unsigned long arg4,		R7
+			unsigned long *out1,		R8
+			unsigned long *out2,		R9
+			unsigned long *out3);		R10
+ */
+_GLOBAL(plpar_hcall)
+	HMT_MEDIUM
+
+	mfcr	r0
+
+	std	r8,STK_PARM(r8)(r1)	/* Save out ptrs */
+	std	r9,STK_PARM(r9)(r1)
+	std	r10,STK_PARM(r10)(r1)
+
+	stw	r0,8(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+
+	ld	r8,STK_PARM(r8)(r1)	/* Fetch r4-r6 ret args */
+	ld	r9,STK_PARM(r9)(r1)
+	ld	r10,STK_PARM(r10)(r1)
+	std	r4,0(r8)
+	std	r5,0(r9)
+	std	r6,0(r10)
+
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
+
+
+/* Simple interface with no output values (other than status) */
+_GLOBAL(plpar_hcall_norets)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
+
+
+/* long plpar_hcall_8arg_2ret(unsigned long opcode,	R3
+			unsigned long arg1,		R4
+			unsigned long arg2,		R5
+			unsigned long arg3,		R6
+			unsigned long arg4,		R7
+			unsigned long arg5,		R8
+			unsigned long arg6,		R9
+			unsigned long arg7,		R10
+			unsigned long arg8,		112(R1)
+			unsigned long *out1);		120(R1)
+ */
+_GLOBAL(plpar_hcall_8arg_2ret)
+	HMT_MEDIUM
+
+	mfcr	r0
+	ld	r11,STK_PARM(r11)(r1)	/* put arg8 in R11 */
+	stw	r0,8(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+	ld	r10,STK_PARM(r12)(r1)	/* Fetch r4 ret arg */
+	std	r4,0(r10)
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
+
+
+/* long plpar_hcall_4out(unsigned long opcode,		R3
+		 	unsigned long arg1,		R4
+		 	unsigned long arg2,		R5
+		 	unsigned long arg3,		R6
+		 	unsigned long arg4,		R7
+		 	unsigned long *out1,		R8
+		 	unsigned long *out2,		R9
+		 	unsigned long *out3,		R10
+		 	unsigned long *out4);		112(R1)
+ */
+_GLOBAL(plpar_hcall_4out)
+	HMT_MEDIUM
+
+	mfcr	r0
+	stw	r0,8(r1)
+
+	std	r8,STK_PARM(r8)(r1)	/* Save out ptrs */
+	std	r9,STK_PARM(r9)(r1)
+	std	r10,STK_PARM(r10)(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+
+	ld	r8,STK_PARM(r8)(r1)	/* Fetch r4-r7 ret args */
+	ld	r9,STK_PARM(r9)(r1)
+	ld	r10,STK_PARM(r10)(r1)
+	ld	r11,STK_PARM(r11)(r1)
+	std	r4,0(r8)
+	std	r5,0(r9)
+	std	r6,0(r10)
+	std	r7,0(r11)
+
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
new file mode 100644
index 00000000000..513e2723149
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -0,0 +1,607 @@
+/*
+ * arch/ppc64/kernel/pSeries_iommu.c
+ *
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ *
+ * Rewrite, cleanup: 
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@austin.ibm.com>, IBM Corporation
+ *
+ * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
+ *
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/ppcdebug.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/systemcfg.h>
+#include <asm/firmware.h>
+#include <asm/tce.h>
+#include <asm/ppc-pci.h>
+
+#include "plpar_wrappers.h"
+
+#define DBG(fmt...)
+
+extern int is_python(struct device_node *);
+
+static void tce_build_pSeries(struct iommu_table *tbl, long index, 
+			      long npages, unsigned long uaddr, 
+			      enum dma_data_direction direction)
+{
+	union tce_entry t;
+	union tce_entry *tp;
+
+	index <<= TCE_PAGE_FACTOR;
+	npages <<= TCE_PAGE_FACTOR;
+
+	t.te_word = 0;
+	t.te_rdwr = 1; // Read allowed 
+
+	if (direction != DMA_TO_DEVICE)
+		t.te_pciwr = 1;
+
+	tp = ((union tce_entry *)tbl->it_base) + index;
+
+	while (npages--) {
+		/* can't move this out since we might cross LMB boundary */
+		t.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+	
+		tp->te_word = t.te_word;
+
+		uaddr += TCE_PAGE_SIZE;
+		tp++;
+	}
+}
+
+
+static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
+{
+	union tce_entry t;
+	union tce_entry *tp;
+
+	npages <<= TCE_PAGE_FACTOR;
+	index <<= TCE_PAGE_FACTOR;
+
+	t.te_word = 0;
+	tp  = ((union tce_entry *)tbl->it_base) + index;
+		
+	while (npages--) {
+		tp->te_word = t.te_word;
+		
+		tp++;
+	}
+}
+
+
+static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+				long npages, unsigned long uaddr,
+				enum dma_data_direction direction)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	tce.te_word = 0;
+	tce.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+	tce.te_rdwr = 1;
+	if (direction != DMA_TO_DEVICE)
+		tce.te_pciwr = 1;
+
+	while (npages--) {
+		rc = plpar_tce_put((u64)tbl->it_index, 
+				   (u64)tcenum << 12, 
+				   tce.te_word );
+		
+		if (rc && printk_ratelimit()) {
+			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+			printk("\ttcenum  = 0x%lx\n", (u64)tcenum);
+			printk("\ttce val = 0x%lx\n", tce.te_word );
+			show_stack(current, (unsigned long *)__get_SP());
+		}
+			
+		tcenum++;
+		tce.te_rpn++;
+	}
+}
+
+static DEFINE_PER_CPU(void *, tce_page) = NULL;
+
+static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+				     long npages, unsigned long uaddr,
+				     enum dma_data_direction direction)
+{
+	u64 rc;
+	union tce_entry tce, *tcep;
+	long l, limit;
+
+	tcenum <<= TCE_PAGE_FACTOR;
+	npages <<= TCE_PAGE_FACTOR;
+
+	if (npages == 1)
+		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+					   direction);
+
+	tcep = __get_cpu_var(tce_page);
+
+	/* This is safe to do since interrupts are off when we're called
+	 * from iommu_alloc{,_sg}()
+	 */
+	if (!tcep) {
+		tcep = (void *)__get_free_page(GFP_ATOMIC);
+		/* If allocation fails, fall back to the loop implementation */
+		if (!tcep)
+			return tce_build_pSeriesLP(tbl, tcenum, npages,
+						   uaddr, direction);
+		__get_cpu_var(tce_page) = tcep;
+	}
+
+	tce.te_word = 0;
+	tce.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+	tce.te_rdwr = 1;
+	if (direction != DMA_TO_DEVICE)
+		tce.te_pciwr = 1;
+
+	/* We can map max one pageful of TCEs at a time */
+	do {
+		/*
+		 * Set up the page with TCE data, looping through and setting
+		 * the values.
+		 */
+		limit = min_t(long, npages, 4096/sizeof(union tce_entry));
+
+		for (l = 0; l < limit; l++) {
+			tcep[l] = tce;
+			tce.te_rpn++;
+		}
+
+		rc = plpar_tce_put_indirect((u64)tbl->it_index,
+					    (u64)tcenum << 12,
+					    (u64)virt_to_abs(tcep),
+					    limit);
+
+		npages -= limit;
+		tcenum += limit;
+	} while (npages > 0 && !rc);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+		printk("\tnpages  = 0x%lx\n", (u64)npages);
+		printk("\ttce[0] val = 0x%lx\n", tcep[0].te_word);
+		show_stack(current, (unsigned long *)__get_SP());
+	}
+}
+
+static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	tcenum <<= TCE_PAGE_FACTOR;
+	npages <<= TCE_PAGE_FACTOR;
+
+	tce.te_word = 0;
+
+	while (npages--) {
+		rc = plpar_tce_put((u64)tbl->it_index,
+				   (u64)tcenum << 12,
+				   tce.te_word);
+
+		if (rc && printk_ratelimit()) {
+			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+			printk("\ttcenum  = 0x%lx\n", (u64)tcenum);
+			printk("\ttce val = 0x%lx\n", tce.te_word );
+			show_stack(current, (unsigned long *)__get_SP());
+		}
+
+		tcenum++;
+	}
+}
+
+
+static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	tcenum <<= TCE_PAGE_FACTOR;
+	npages <<= TCE_PAGE_FACTOR;
+
+	tce.te_word = 0;
+
+	rc = plpar_tce_stuff((u64)tbl->it_index,
+			   (u64)tcenum << 12,
+			   tce.te_word,
+			   npages);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
+		printk("\trc      = %ld\n", rc);
+		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+		printk("\tnpages  = 0x%lx\n", (u64)npages);
+		printk("\ttce val = 0x%lx\n", tce.te_word );
+		show_stack(current, (unsigned long *)__get_SP());
+	}
+}
+
+static void iommu_table_setparms(struct pci_controller *phb,
+				 struct device_node *dn,
+				 struct iommu_table *tbl) 
+{
+	struct device_node *node;
+	unsigned long *basep;
+	unsigned int *sizep;
+
+	node = (struct device_node *)phb->arch_data;
+
+	basep = (unsigned long *)get_property(node, "linux,tce-base", NULL);
+	sizep = (unsigned int *)get_property(node, "linux,tce-size", NULL);
+	if (basep == NULL || sizep == NULL) {
+		printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "
+				"missing tce entries !\n", dn->full_name);
+		return;
+	}
+
+	tbl->it_base = (unsigned long)__va(*basep);
+	memset((void *)tbl->it_base, 0, *sizep);
+
+	tbl->it_busno = phb->bus->number;
+	
+	/* Units of tce entries */
+	tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
+	
+	/* Test if we are going over 2GB of DMA space */
+	if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
+		udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); 
+	}
+	
+	phb->dma_window_base_cur += phb->dma_window_size;
+
+	/* Set the tce table size - measured in entries */
+	tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
+
+	tbl->it_index = 0;
+	tbl->it_blocksize = 16;
+	tbl->it_type = TCE_PCI;
+}
+
+/*
+ * iommu_table_setparms_lpar
+ *
+ * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
+ *
+ * ToDo: properly interpret the ibm,dma-window property.  The definition is:
+ *	logical-bus-number	(1 word)
+ *	phys-address		(#address-cells words)
+ *	size			(#cell-size words)
+ *
+ * Currently we hard code these sizes (more or less).
+ */
+static void iommu_table_setparms_lpar(struct pci_controller *phb,
+				      struct device_node *dn,
+				      struct iommu_table *tbl,
+				      unsigned int *dma_window)
+{
+	tbl->it_busno  = PCI_DN(dn)->bussubno;
+
+	/* TODO: Parse field size properties properly. */
+	tbl->it_size   = (((unsigned long)dma_window[4] << 32) |
+			   (unsigned long)dma_window[5]) >> PAGE_SHIFT;
+	tbl->it_offset = (((unsigned long)dma_window[2] << 32) |
+			   (unsigned long)dma_window[3]) >> PAGE_SHIFT;
+	tbl->it_base   = 0;
+	tbl->it_index  = dma_window[0];
+	tbl->it_blocksize  = 16;
+	tbl->it_type = TCE_PCI;
+}
+
+static void iommu_bus_setup_pSeries(struct pci_bus *bus)
+{
+	struct device_node *dn;
+	struct iommu_table *tbl;
+	struct device_node *isa_dn, *isa_dn_orig;
+	struct device_node *tmp;
+	struct pci_dn *pci;
+	int children;
+
+	DBG("iommu_bus_setup_pSeries, bus %p, bus->self %p\n", bus, bus->self);
+
+	dn = pci_bus_to_OF_node(bus);
+	pci = PCI_DN(dn);
+
+	if (bus->self) {
+		/* This is not a root bus, any setup will be done for the
+		 * device-side of the bridge in iommu_dev_setup_pSeries().
+		 */
+		return;
+	}
+
+	/* Check if the ISA bus on the system is under
+	 * this PHB.
+	 */
+	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");
+
+	while (isa_dn && isa_dn != dn)
+		isa_dn = isa_dn->parent;
+
+	if (isa_dn_orig)
+		of_node_put(isa_dn_orig);
+
+	/* Count number of direct PCI children of the PHB.
+	 * All PCI device nodes have class-code property, so it's
+	 * an easy way to find them.
+	 */
+	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
+		if (get_property(tmp, "class-code", NULL))
+			children++;
+
+	DBG("Children: %d\n", children);
+
+	/* Calculate amount of DMA window per slot. Each window must be
+	 * a power of two (due to pci_alloc_consistent requirements).
+	 *
+	 * Keep 256MB aside for PHBs with ISA.
+	 */
+
+	if (!isa_dn) {
+		/* No ISA/IDE - just set window size and return */
+		pci->phb->dma_window_size = 0x80000000ul; /* To be divided */
+
+		while (pci->phb->dma_window_size * children > 0x80000000ul)
+			pci->phb->dma_window_size >>= 1;
+		DBG("No ISA/IDE, window size is 0x%lx\n",
+			pci->phb->dma_window_size);
+		pci->phb->dma_window_base_cur = 0;
+
+		return;
+	}
+
+	/* If we have ISA, then we probably have an IDE
+	 * controller too. Allocate a 128MB table but
+	 * skip the first 128MB to avoid stepping on ISA
+	 * space.
+	 */
+	pci->phb->dma_window_size = 0x8000000ul;
+	pci->phb->dma_window_base_cur = 0x8000000ul;
+
+	tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+	iommu_table_setparms(pci->phb, dn, tbl);
+	pci->iommu_table = iommu_init_table(tbl);
+
+	/* Divide the rest (1.75GB) among the children */
+	pci->phb->dma_window_size = 0x80000000ul;
+	while (pci->phb->dma_window_size * children > 0x70000000ul)
+		pci->phb->dma_window_size >>= 1;
+
+	DBG("ISA/IDE, window size is 0x%lx\n", pci->phb->dma_window_size);
+
+}
+
+
+static void iommu_bus_setup_pSeriesLP(struct pci_bus *bus)
+{
+	struct iommu_table *tbl;
+	struct device_node *dn, *pdn;
+	struct pci_dn *ppci;
+	unsigned int *dma_window = NULL;
+
+	DBG("iommu_bus_setup_pSeriesLP, bus %p, bus->self %p\n", bus, bus->self);
+
+	dn = pci_bus_to_OF_node(bus);
+
+	/* Find nearest ibm,dma-window, walking up the device tree */
+	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
+		dma_window = (unsigned int *)get_property(pdn, "ibm,dma-window", NULL);
+		if (dma_window != NULL)
+			break;
+	}
+
+	if (dma_window == NULL) {
+		DBG("iommu_bus_setup_pSeriesLP: bus %s seems to have no ibm,dma-window property\n", dn->full_name);
+		return;
+	}
+
+	ppci = pdn->data;
+	if (!ppci->iommu_table) {
+		/* Bussubno hasn't been copied yet.
+		 * Do it now because iommu_table_setparms_lpar needs it.
+		 */
+
+		ppci->bussubno = bus->number;
+
+		tbl = (struct iommu_table *)kmalloc(sizeof(struct iommu_table),
+						    GFP_KERNEL);
+	
+		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+
+		ppci->iommu_table = iommu_init_table(tbl);
+	}
+
+	if (pdn != dn)
+		PCI_DN(dn)->iommu_table = ppci->iommu_table;
+}
+
+
+static void iommu_dev_setup_pSeries(struct pci_dev *dev)
+{
+	struct device_node *dn, *mydn;
+	struct iommu_table *tbl;
+
+	DBG("iommu_dev_setup_pSeries, dev %p (%s)\n", dev, pci_name(dev));
+
+	mydn = dn = pci_device_to_OF_node(dev);
+
+	/* If we're the direct child of a root bus, then we need to allocate
+	 * an iommu table ourselves. The bus setup code should have setup
+	 * the window sizes already.
+	 */
+	if (!dev->bus->self) {
+		DBG(" --> first child, no bridge. Allocating iommu table.\n");
+		tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+		iommu_table_setparms(PCI_DN(dn)->phb, dn, tbl);
+		PCI_DN(mydn)->iommu_table = iommu_init_table(tbl);
+
+		return;
+	}
+
+	/* If this device is further down the bus tree, search upwards until
+	 * an already allocated iommu table is found and use that.
+	 */
+
+	while (dn && dn->data && PCI_DN(dn)->iommu_table == NULL)
+		dn = dn->parent;
+
+	if (dn && dn->data) {
+		PCI_DN(mydn)->iommu_table = PCI_DN(dn)->iommu_table;
+	} else {
+		DBG("iommu_dev_setup_pSeries, dev %p (%s) has no iommu table\n", dev, pci_name(dev));
+	}
+}
+
+static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	int err = NOTIFY_OK;
+	struct device_node *np = node;
+	struct pci_dn *pci = np->data;
+
+	switch (action) {
+	case PSERIES_RECONFIG_REMOVE:
+		if (pci && pci->iommu_table &&
+		    get_property(np, "ibm,dma-window", NULL))
+			iommu_free_table(np);
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block iommu_reconfig_nb = {
+	.notifier_call = iommu_reconfig_notifier,
+};
+
+static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev)
+{
+	struct device_node *pdn, *dn;
+	struct iommu_table *tbl;
+	int *dma_window = NULL;
+	struct pci_dn *pci;
+
+	DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev));
+
+	/* dev setup for LPAR is a little tricky, since the device tree might
+	 * contain the dma-window properties per-device and not neccesarily
+	 * for the bus. So we need to search upwards in the tree until we
+	 * either hit a dma-window property, OR find a parent with a table
+	 * already allocated.
+	 */
+	dn = pci_device_to_OF_node(dev);
+
+	for (pdn = dn; pdn && pdn->data && !PCI_DN(pdn)->iommu_table;
+	     pdn = pdn->parent) {
+		dma_window = (unsigned int *)
+			get_property(pdn, "ibm,dma-window", NULL);
+		if (dma_window)
+			break;
+	}
+
+	/* Check for parent == NULL so we don't try to setup the empty EADS
+	 * slots on POWER4 machines.
+	 */
+	if (dma_window == NULL || pdn->parent == NULL) {
+		DBG("No dma window for device, linking to parent\n");
+		PCI_DN(dn)->iommu_table = PCI_DN(pdn)->iommu_table;
+		return;
+	} else {
+		DBG("Found DMA window, allocating table\n");
+	}
+
+	pci = pdn->data;
+	if (!pci->iommu_table) {
+		/* iommu_table_setparms_lpar needs bussubno. */
+		pci->bussubno = pci->phb->bus->number;
+
+		tbl = (struct iommu_table *)kmalloc(sizeof(struct iommu_table),
+						    GFP_KERNEL);
+
+		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+
+		pci->iommu_table = iommu_init_table(tbl);
+	}
+
+	if (pdn != dn)
+		PCI_DN(dn)->iommu_table = pci->iommu_table;
+}
+
+static void iommu_bus_setup_null(struct pci_bus *b) { }
+static void iommu_dev_setup_null(struct pci_dev *d) { }
+
+/* These are called very early. */
+void iommu_init_early_pSeries(void)
+{
+	if (of_chosen && get_property(of_chosen, "linux,iommu-off", NULL)) {
+		/* Direct I/O, IOMMU off */
+		ppc_md.iommu_dev_setup = iommu_dev_setup_null;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_null;
+		pci_direct_iommu_init();
+
+		return;
+	}
+
+	if (systemcfg->platform & PLATFORM_LPAR) {
+		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
+			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
+			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
+		} else {
+			ppc_md.tce_build = tce_build_pSeriesLP;
+			ppc_md.tce_free	 = tce_free_pSeriesLP;
+		}
+		ppc_md.iommu_bus_setup = iommu_bus_setup_pSeriesLP;
+		ppc_md.iommu_dev_setup = iommu_dev_setup_pSeriesLP;
+	} else {
+		ppc_md.tce_build = tce_build_pSeries;
+		ppc_md.tce_free  = tce_free_pSeries;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_pSeries;
+		ppc_md.iommu_dev_setup = iommu_dev_setup_pSeries;
+	}
+
+
+	pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
+
+	pci_iommu_init();
+}
+
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
new file mode 100644
index 00000000000..e384a5a9179
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -0,0 +1,514 @@
+/*
+ * pSeries_lpar.c
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * pSeries LPAR support.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#define DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/mmu_context.h>
+#include <asm/ppcdebug.h>
+#include <asm/iommu.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/prom.h>
+#include <asm/abs_addr.h>
+#include <asm/cputable.h>
+
+#include "plpar_wrappers.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/* in pSeries_hvCall.S */
+EXPORT_SYMBOL(plpar_hcall);
+EXPORT_SYMBOL(plpar_hcall_4out);
+EXPORT_SYMBOL(plpar_hcall_norets);
+EXPORT_SYMBOL(plpar_hcall_8arg_2ret);
+
+extern void pSeries_find_serial_port(void);
+
+
+int vtermno;	/* virtual terminal# for udbg  */
+
+#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
+static void udbg_hvsi_putc(unsigned char c)
+{
+	/* packet's seqno isn't used anyways */
+	uint8_t packet[] __ALIGNED__ = { 0xff, 5, 0, 0, c };
+	int rc;
+
+	if (c == '\n')
+		udbg_hvsi_putc('\r');
+
+	do {
+		rc = plpar_put_term_char(vtermno, sizeof(packet), packet);
+	} while (rc == H_Busy);
+}
+
+static long hvsi_udbg_buf_len;
+static uint8_t hvsi_udbg_buf[256];
+
+static int udbg_hvsi_getc_poll(void)
+{
+	unsigned char ch;
+	int rc, i;
+
+	if (hvsi_udbg_buf_len == 0) {
+		rc = plpar_get_term_char(vtermno, &hvsi_udbg_buf_len, hvsi_udbg_buf);
+		if (rc != H_Success || hvsi_udbg_buf[0] != 0xff) {
+			/* bad read or non-data packet */
+			hvsi_udbg_buf_len = 0;
+		} else {
+			/* remove the packet header */
+			for (i = 4; i < hvsi_udbg_buf_len; i++)
+				hvsi_udbg_buf[i-4] = hvsi_udbg_buf[i];
+			hvsi_udbg_buf_len -= 4;
+		}
+	}
+
+	if (hvsi_udbg_buf_len <= 0 || hvsi_udbg_buf_len > 256) {
+		/* no data ready */
+		hvsi_udbg_buf_len = 0;
+		return -1;
+	}
+
+	ch = hvsi_udbg_buf[0];
+	/* shift remaining data down */
+	for (i = 1; i < hvsi_udbg_buf_len; i++) {
+		hvsi_udbg_buf[i-1] = hvsi_udbg_buf[i];
+	}
+	hvsi_udbg_buf_len--;
+
+	return ch;
+}
+
+static unsigned char udbg_hvsi_getc(void)
+{
+	int ch;
+	for (;;) {
+		ch = udbg_hvsi_getc_poll();
+		if (ch == -1) {
+			/* This shouldn't be needed...but... */
+			volatile unsigned long delay;
+			for (delay=0; delay < 2000000; delay++)
+				;
+		} else {
+			return ch;
+		}
+	}
+}
+
+static void udbg_putcLP(unsigned char c)
+{
+	char buf[16];
+	unsigned long rc;
+
+	if (c == '\n')
+		udbg_putcLP('\r');
+
+	buf[0] = c;
+	do {
+		rc = plpar_put_term_char(vtermno, 1, buf);
+	} while(rc == H_Busy);
+}
+
+/* Buffered chars getc */
+static long inbuflen;
+static long inbuf[2];	/* must be 2 longs */
+
+static int udbg_getc_pollLP(void)
+{
+	/* The interface is tricky because it may return up to 16 chars.
+	 * We save them statically for future calls to udbg_getc().
+	 */
+	char ch, *buf = (char *)inbuf;
+	int i;
+	long rc;
+	if (inbuflen == 0) {
+		/* get some more chars. */
+		inbuflen = 0;
+		rc = plpar_get_term_char(vtermno, &inbuflen, buf);
+		if (rc != H_Success)
+			inbuflen = 0;	/* otherwise inbuflen is garbage */
+	}
+	if (inbuflen <= 0 || inbuflen > 16) {
+		/* Catch error case as well as other oddities (corruption) */
+		inbuflen = 0;
+		return -1;
+	}
+	ch = buf[0];
+	for (i = 1; i < inbuflen; i++)	/* shuffle them down. */
+		buf[i-1] = buf[i];
+	inbuflen--;
+	return ch;
+}
+
+static unsigned char udbg_getcLP(void)
+{
+	int ch;
+	for (;;) {
+		ch = udbg_getc_pollLP();
+		if (ch == -1) {
+			/* This shouldn't be needed...but... */
+			volatile unsigned long delay;
+			for (delay=0; delay < 2000000; delay++)
+				;
+		} else {
+			return ch;
+		}
+	}
+}
+
+/* call this from early_init() for a working debug console on
+ * vterm capable LPAR machines
+ */
+void udbg_init_debug_lpar(void)
+{
+	vtermno = 0;
+	udbg_putc = udbg_putcLP;
+	udbg_getc = udbg_getcLP;
+	udbg_getc_poll = udbg_getc_pollLP;
+}
+
+/* returns 0 if couldn't find or use /chosen/stdout as console */
+int find_udbg_vterm(void)
+{
+	struct device_node *stdout_node;
+	u32 *termno;
+	char *name;
+	int found = 0;
+
+	/* find the boot console from /chosen/stdout */
+	if (!of_chosen)
+		return 0;
+	name = (char *)get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name == NULL)
+		return 0;
+	stdout_node = of_find_node_by_path(name);
+	if (!stdout_node)
+		return 0;
+
+	/* now we have the stdout node; figure out what type of device it is. */
+	name = (char *)get_property(stdout_node, "name", NULL);
+	if (!name) {
+		printk(KERN_WARNING "stdout node missing 'name' property!\n");
+		goto out;
+	}
+
+	if (strncmp(name, "vty", 3) == 0) {
+		if (device_is_compatible(stdout_node, "hvterm1")) {
+			termno = (u32 *)get_property(stdout_node, "reg", NULL);
+			if (termno) {
+				vtermno = termno[0];
+				udbg_putc = udbg_putcLP;
+				udbg_getc = udbg_getcLP;
+				udbg_getc_poll = udbg_getc_pollLP;
+				found = 1;
+			}
+		} else if (device_is_compatible(stdout_node, "hvterm-protocol")) {
+			termno = (u32 *)get_property(stdout_node, "reg", NULL);
+			if (termno) {
+				vtermno = termno[0];
+				udbg_putc = udbg_hvsi_putc;
+				udbg_getc = udbg_hvsi_getc;
+				udbg_getc_poll = udbg_hvsi_getc_poll;
+				found = 1;
+			}
+		}
+	} else if (strncmp(name, "serial", 6)) {
+		/* XXX fix ISA serial console */
+		printk(KERN_WARNING "serial stdout on LPAR ('%s')! "
+				"can't print udbg messages\n",
+		       stdout_node->full_name);
+	} else {
+		printk(KERN_WARNING "don't know how to print to stdout '%s'\n",
+		       stdout_node->full_name);
+	}
+
+out:
+	of_node_put(stdout_node);
+	return found;
+}
+
+void vpa_init(int cpu)
+{
+	int hwcpu = get_hard_smp_processor_id(cpu);
+	unsigned long vpa = __pa(&paca[cpu].lppaca);
+	long ret;
+
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		paca[cpu].lppaca.vmxregs_in_use = 1;
+
+	ret = register_vpa(hwcpu, vpa);
+
+	if (ret)
+		printk(KERN_ERR "WARNING: vpa_init: VPA registration for "
+				"cpu %d (hw %d) of area %lx returns %ld\n",
+				cpu, hwcpu, vpa, ret);
+}
+
+long pSeries_lpar_hpte_insert(unsigned long hpte_group,
+			      unsigned long va, unsigned long prpn,
+			      unsigned long vflags, unsigned long rflags)
+{
+	unsigned long lpar_rc;
+	unsigned long flags;
+	unsigned long slot;
+	unsigned long hpte_v, hpte_r;
+	unsigned long dummy0, dummy1;
+
+	hpte_v = ((va >> 23) << HPTE_V_AVPN_SHIFT) | vflags | HPTE_V_VALID;
+	if (vflags & HPTE_V_LARGE)
+		hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT);
+
+	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
+
+	/* Now fill in the actual HPTE */
+	/* Set CEC cookie to 0         */
+	/* Zero page = 0               */
+	/* I-cache Invalidate = 0      */
+	/* I-cache synchronize = 0     */
+	/* Exact = 0                   */
+	flags = 0;
+
+	/* XXX why is this here? - Anton */
+	if (rflags & (_PAGE_GUARDED|_PAGE_NO_CACHE))
+		hpte_r &= ~_PAGE_COHERENT;
+
+	lpar_rc = plpar_hcall(H_ENTER, flags, hpte_group, hpte_v,
+			      hpte_r, &slot, &dummy0, &dummy1);
+
+	if (unlikely(lpar_rc == H_PTEG_Full))
+		return -1;
+
+	/*
+	 * Since we try and ioremap PHBs we don't own, the pte insert
+	 * will fail. However we must catch the failure in hash_page
+	 * or we will loop forever, so return -2 in this case.
+	 */
+	if (unlikely(lpar_rc != H_Success))
+		return -2;
+
+	/* Because of iSeries, we have to pass down the secondary
+	 * bucket bit here as well
+	 */
+	return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
+
+static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
+{
+	unsigned long slot_offset;
+	unsigned long lpar_rc;
+	int i;
+	unsigned long dummy1, dummy2;
+
+	/* pick a random slot to start at */
+	slot_offset = mftb() & 0x7;
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+
+		/* don't remove a bolted entry */
+		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
+					   (0x1UL << 4), &dummy1, &dummy2);
+
+		if (lpar_rc == H_Success)
+			return i;
+
+		BUG_ON(lpar_rc != H_Not_Found);
+
+		slot_offset++;
+		slot_offset &= 0x7;
+	}
+
+	return -1;
+}
+
+static void pSeries_lpar_hptab_clear(void)
+{
+	unsigned long size_bytes = 1UL << ppc64_pft_size;
+	unsigned long hpte_count = size_bytes >> 4;
+	unsigned long dummy1, dummy2;
+	int i;
+
+	/* TODO: Use bulk call */
+	for (i = 0; i < hpte_count; i++)
+		plpar_pte_remove(0, i, 0, &dummy1, &dummy2);
+}
+
+/*
+ * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
+ * the low 3 bits of flags happen to line up.  So no transform is needed.
+ * We can probably optimize here and assume the high bits of newpp are
+ * already zero.  For now I am paranoid.
+ */
+static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp,
+				       unsigned long va, int large, int local)
+{
+	unsigned long lpar_rc;
+	unsigned long flags = (newpp & 7) | H_AVPN;
+	unsigned long avpn = va >> 23;
+
+	if (large)
+		avpn &= ~0x1UL;
+
+	lpar_rc = plpar_pte_protect(flags, slot, (avpn << 7));
+
+	if (lpar_rc == H_Not_Found)
+		return -1;
+
+	BUG_ON(lpar_rc != H_Success);
+
+	return 0;
+}
+
+static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
+{
+	unsigned long dword0;
+	unsigned long lpar_rc;
+	unsigned long dummy_word1;
+	unsigned long flags;
+
+	/* Read 1 pte at a time                        */
+	/* Do not need RPN to logical page translation */
+	/* No cross CEC PFT access                     */
+	flags = 0;
+
+	lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1);
+
+	BUG_ON(lpar_rc != H_Success);
+
+	return dword0;
+}
+
+static long pSeries_lpar_hpte_find(unsigned long vpn)
+{
+	unsigned long hash;
+	unsigned long i, j;
+	long slot;
+	unsigned long hpte_v;
+
+	hash = hpt_hash(vpn, 0);
+
+	for (j = 0; j < 2; j++) {
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		for (i = 0; i < HPTES_PER_GROUP; i++) {
+			hpte_v = pSeries_lpar_hpte_getword0(slot);
+
+			if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
+			    && (hpte_v & HPTE_V_VALID)
+			    && (!!(hpte_v & HPTE_V_SECONDARY) == j)) {
+				/* HPTE matches */
+				if (j)
+					slot = -slot;
+				return slot;
+			}
+			++slot;
+		}
+		hash = ~hash;
+	}
+
+	return -1;
+} 
+
+static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
+					     unsigned long ea)
+{
+	unsigned long lpar_rc;
+	unsigned long vsid, va, vpn, flags;
+	long slot;
+
+	vsid = get_kernel_vsid(ea);
+	va = (vsid << 28) | (ea & 0x0fffffff);
+	vpn = va >> PAGE_SHIFT;
+
+	slot = pSeries_lpar_hpte_find(vpn);
+	BUG_ON(slot == -1);
+
+	flags = newpp & 7;
+	lpar_rc = plpar_pte_protect(flags, slot, 0);
+
+	BUG_ON(lpar_rc != H_Success);
+}
+
+static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
+					 int large, int local)
+{
+	unsigned long avpn = va >> 23;
+	unsigned long lpar_rc;
+	unsigned long dummy1, dummy2;
+
+	if (large)
+		avpn &= ~0x1UL;
+
+	lpar_rc = plpar_pte_remove(H_AVPN, slot, (avpn << 7), &dummy1,
+				   &dummy2);
+
+	if (lpar_rc == H_Not_Found)
+		return;
+
+	BUG_ON(lpar_rc != H_Success);
+}
+
+/*
+ * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
+ * lock.
+ */
+void pSeries_lpar_flush_hash_range(unsigned long number, int local)
+{
+	int i;
+	unsigned long flags = 0;
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+	if (lock_tlbie)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+	for (i = 0; i < number; i++)
+		flush_hash_page(batch->vaddr[i], batch->pte[i], local);
+
+	if (lock_tlbie)
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+void hpte_init_lpar(void)
+{
+	ppc_md.hpte_invalidate	= pSeries_lpar_hpte_invalidate;
+	ppc_md.hpte_updatepp	= pSeries_lpar_hpte_updatepp;
+	ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
+	ppc_md.hpte_insert	= pSeries_lpar_hpte_insert;
+	ppc_md.hpte_remove	= pSeries_lpar_hpte_remove;
+	ppc_md.flush_hash_range	= pSeries_lpar_flush_hash_range;
+	ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
+
+	htab_finish_init();
+}
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
new file mode 100644
index 00000000000..18abfb1f4e2
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -0,0 +1,148 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ */
+
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+static int nvram_fetch, nvram_store;
+static char nvram_buf[NVRW_CNT];	/* assume this is in the first 4GB */
+static DEFINE_SPINLOCK(nvram_lock);
+
+
+static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
+{
+	unsigned int i;
+	unsigned long len;
+	int done;
+	unsigned long flags;
+	char *p = buf;
+
+
+	if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	i = *index;
+	if (i + count > nvram_size)
+		count = nvram_size - i;
+
+	spin_lock_irqsave(&nvram_lock, flags);
+
+	for (; count != 0; count -= len) {
+		len = count;
+		if (len > NVRW_CNT)
+			len = NVRW_CNT;
+		
+		if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf),
+			       len) != 0) || len != done) {
+			spin_unlock_irqrestore(&nvram_lock, flags);
+			return -EIO;
+		}
+		
+		memcpy(p, nvram_buf, len);
+
+		p += len;
+		i += len;
+	}
+
+	spin_unlock_irqrestore(&nvram_lock, flags);
+	
+	*index = i;
+	return p - buf;
+}
+
+static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
+{
+	unsigned int i;
+	unsigned long len;
+	int done;
+	unsigned long flags;
+	const char *p = buf;
+
+	if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	i = *index;
+	if (i + count > nvram_size)
+		count = nvram_size - i;
+
+	spin_lock_irqsave(&nvram_lock, flags);
+
+	for (; count != 0; count -= len) {
+		len = count;
+		if (len > NVRW_CNT)
+			len = NVRW_CNT;
+
+		memcpy(nvram_buf, p, len);
+
+		if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf),
+			       len) != 0) || len != done) {
+			spin_unlock_irqrestore(&nvram_lock, flags);
+			return -EIO;
+		}
+		
+		p += len;
+		i += len;
+	}
+	spin_unlock_irqrestore(&nvram_lock, flags);
+	
+	*index = i;
+	return p - buf;
+}
+
+static ssize_t pSeries_nvram_get_size(void)
+{
+	return nvram_size ? nvram_size : -ENODEV;
+}
+
+int __init pSeries_nvram_init(void)
+{
+	struct device_node *nvram;
+	unsigned int *nbytes_p, proplen;
+
+	nvram = of_find_node_by_type(NULL, "nvram");
+	if (nvram == NULL)
+		return -ENODEV;
+
+	nbytes_p = (unsigned int *)get_property(nvram, "#bytes", &proplen);
+	if (nbytes_p == NULL || proplen != sizeof(unsigned int))
+		return -EIO;
+
+	nvram_size = *nbytes_p;
+
+	nvram_fetch = rtas_token("nvram-fetch");
+	nvram_store = rtas_token("nvram-store");
+	printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
+	of_node_put(nvram);
+
+	ppc_md.nvram_read	= pSeries_nvram_read;
+	ppc_md.nvram_write	= pSeries_nvram_write;
+	ppc_md.nvram_size	= pSeries_nvram_get_size;
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
new file mode 100644
index 00000000000..c198656a3bb
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -0,0 +1,142 @@
+/*
+ * arch/ppc64/kernel/pSeries_pci.c
+ *
+ * Copyright (C) 2001 Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * pSeries specific routines for PCI.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *    
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/ppc-pci.h>
+
+static int __devinitdata s7a_workaround = -1;
+
+#if 0
+void pcibios_name_device(struct pci_dev *dev)
+{
+	struct device_node *dn;
+
+	/*
+	 * Add IBM loc code (slot) as a prefix to the device names for service
+	 */
+	dn = pci_device_to_OF_node(dev);
+	if (dn) {
+		char *loc_code = get_property(dn, "ibm,loc-code", 0);
+		if (loc_code) {
+			int loc_len = strlen(loc_code);
+			if (loc_len < sizeof(dev->dev.name)) {
+				memmove(dev->dev.name+loc_len+1, dev->dev.name,
+					sizeof(dev->dev.name)-loc_len-1);
+				memcpy(dev->dev.name, loc_code, loc_len);
+				dev->dev.name[loc_len] = ' ';
+				dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
+			}
+		}
+	}
+}   
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
+#endif
+
+static void __devinit check_s7a(void)
+{
+	struct device_node *root;
+	char *model;
+
+	s7a_workaround = 0;
+	root = of_find_node_by_path("/");
+	if (root) {
+		model = get_property(root, "model", NULL);
+		if (model && !strcmp(model, "IBM,7013-S7A"))
+			s7a_workaround = 1;
+		of_node_put(root);
+	}
+}
+
+void __devinit pSeries_irq_bus_setup(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	if (s7a_workaround < 0)
+		check_s7a();
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		pci_read_irq_line(dev);
+		if (s7a_workaround) {
+			if (dev->irq > 16) {
+				dev->irq -= 3;
+				pci_write_config_byte(dev, PCI_INTERRUPT_LINE,
+					dev->irq);
+			}
+		}
+	}
+}
+
+static void __init pSeries_request_regions(void)
+{
+	if (!isa_io_base)
+		return;
+
+	request_region(0x20,0x20,"pic1");
+	request_region(0xa0,0x20,"pic2");
+	request_region(0x00,0x20,"dma1");
+	request_region(0x40,0x20,"timer");
+	request_region(0x80,0x10,"dma page reg");
+	request_region(0xc0,0x20,"dma2");
+}
+
+void __init pSeries_final_fixup(void)
+{
+	phbs_remap_io();
+	pSeries_request_regions();
+
+	pci_addr_cache_build();
+}
+
+/*
+ * Assume the winbond 82c105 is the IDE controller on a
+ * p610.  We should probably be more careful in case
+ * someone tries to plug in a similar adapter.
+ */
+static void fixup_winbond_82c105(struct pci_dev* dev)
+{
+	int i;
+	unsigned int reg;
+
+	if (!(systemcfg->platform & PLATFORM_PSERIES))
+		return;
+
+	printk("Using INTC for W82c105 IDE controller.\n");
+	pci_read_config_dword(dev, 0x40, &reg);
+	/* Enable LEGIRQ to use INTC instead of ISA interrupts */
+	pci_write_config_dword(dev, 0x40, reg | (1<<11));
+
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
+		/* zap the 2nd function of the winbond chip */
+		if (dev->resource[i].flags & IORESOURCE_IO
+		    && dev->bus->number == 0 && dev->devfn == 0x81)
+			dev->resource[i].flags &= ~IORESOURCE_IO;
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
+			 fixup_winbond_82c105);
diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
new file mode 100644
index 00000000000..382f8c5b0e7
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h
@@ -0,0 +1,120 @@
+#ifndef _PSERIES_PLPAR_WRAPPERS_H
+#define _PSERIES_PLPAR_WRAPPERS_H
+
+#include <asm/hvcall.h>
+
+static inline long poll_pending(void)
+{
+	unsigned long dummy;
+	return plpar_hcall(H_POLL_PENDING, 0, 0, 0, 0, &dummy, &dummy, &dummy);
+}
+
+static inline long prod_processor(void)
+{
+	plpar_hcall_norets(H_PROD);
+	return 0;
+}
+
+static inline long cede_processor(void)
+{
+	plpar_hcall_norets(H_CEDE);
+	return 0;
+}
+
+static inline long vpa_call(unsigned long flags, unsigned long cpu,
+		unsigned long vpa)
+{
+	/* flags are in bits 16-18 (counting from most significant bit) */
+	flags = flags << (63 - 18);
+
+	return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
+}
+
+static inline long unregister_vpa(unsigned long cpu, unsigned long vpa)
+{
+	return vpa_call(0x5, cpu, vpa);
+}
+
+static inline long register_vpa(unsigned long cpu, unsigned long vpa)
+{
+	return vpa_call(0x1, cpu, vpa);
+}
+
+extern void vpa_init(int cpu);
+
+static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex,
+		unsigned long avpn, unsigned long *old_pteh_ret,
+		unsigned long *old_ptel_ret)
+{
+	unsigned long dummy;
+	return plpar_hcall(H_REMOVE, flags, ptex, avpn, 0, old_pteh_ret,
+			old_ptel_ret, &dummy);
+}
+
+static inline long plpar_pte_read(unsigned long flags, unsigned long ptex,
+		unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
+{
+	unsigned long dummy;
+	return plpar_hcall(H_READ, flags, ptex, 0, 0, old_pteh_ret,
+			old_ptel_ret, &dummy);
+}
+
+static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
+		unsigned long avpn)
+{
+	return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
+}
+
+static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba,
+		unsigned long *tce_ret)
+{
+	unsigned long dummy;
+	return plpar_hcall(H_GET_TCE, liobn, ioba, 0, 0, tce_ret, &dummy,
+			&dummy);
+}
+
+static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba,
+		unsigned long tceval)
+{
+	return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval);
+}
+
+static inline long plpar_tce_put_indirect(unsigned long liobn,
+		unsigned long ioba, unsigned long page, unsigned long count)
+{
+	return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count);
+}
+
+static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba,
+		unsigned long tceval, unsigned long count)
+{
+	return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count);
+}
+
+static inline long plpar_get_term_char(unsigned long termno,
+		unsigned long *len_ret, char *buf_ret)
+{
+	unsigned long *lbuf = (unsigned long *)buf_ret;	/* TODO: alignment? */
+	return plpar_hcall(H_GET_TERM_CHAR, termno, 0, 0, 0, len_ret,
+			lbuf + 0, lbuf + 1);
+}
+
+static inline long plpar_put_term_char(unsigned long termno, unsigned long len,
+		const char *buffer)
+{
+	unsigned long *lbuf = (unsigned long *)buffer;	/* TODO: alignment? */
+	return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0],
+			lbuf[1]);
+}
+
+static inline long plpar_set_xdabr(unsigned long address, unsigned long flags)
+{
+	return plpar_hcall_norets(H_SET_XDABR, address, flags);
+}
+
+static inline long plpar_set_dabr(unsigned long val)
+{
+	return plpar_hcall_norets(H_SET_DABR, val);
+}
+
+#endif /* _PSERIES_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
new file mode 100644
index 00000000000..6562ff4b0a8
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+/* Change Activity:
+ * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support.
+ * End Change Activity
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/kernel_stat.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/random.h>
+#include <linux/sysrq.h>
+#include <linux/bitops.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/cache.h>
+#include <asm/prom.h>
+#include <asm/ptrace.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/ppcdebug.h>
+
+static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(ras_log_buf_lock);
+
+char mce_data_buf[RTAS_ERROR_LOG_MAX]
+;
+/* This is true if we are using the firmware NMI handler (typically LPAR) */
+extern int fwnmi_active;
+
+static int ras_get_sensor_state_token;
+static int ras_check_exception_token;
+
+#define EPOW_SENSOR_TOKEN	9
+#define EPOW_SENSOR_INDEX	0
+#define RAS_VECTOR_OFFSET	0x500
+
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id,
+					struct pt_regs * regs);
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id,
+					struct pt_regs * regs);
+
+/* #define DEBUG */
+
+static void request_ras_irqs(struct device_node *np, char *propname,
+			irqreturn_t (*handler)(int, void *, struct pt_regs *),
+			const char *name)
+{
+	unsigned int *ireg, len, i;
+	int virq, n_intr;
+
+	ireg = (unsigned int *)get_property(np, propname, &len);
+	if (ireg == NULL)
+		return;
+	n_intr = prom_n_intr_cells(np);
+	len /= n_intr * sizeof(*ireg);
+
+	for (i = 0; i < len; i++) {
+		virq = virt_irq_create_mapping(*ireg);
+		if (virq == NO_IRQ) {
+			printk(KERN_ERR "Unable to allocate interrupt "
+			       "number for %s\n", np->full_name);
+			return;
+		}
+		if (request_irq(irq_offset_up(virq), handler, 0, name, NULL)) {
+			printk(KERN_ERR "Unable to request interrupt %d for "
+			       "%s\n", irq_offset_up(virq), np->full_name);
+			return;
+		}
+		ireg += n_intr;
+	}
+}
+
+/*
+ * Initialize handlers for the set of interrupts caused by hardware errors
+ * and power system events.
+ */
+static int __init init_ras_IRQ(void)
+{
+	struct device_node *np;
+
+	ras_get_sensor_state_token = rtas_token("get-sensor-state");
+	ras_check_exception_token = rtas_token("check-exception");
+
+	/* Internal Errors */
+	np = of_find_node_by_path("/event-sources/internal-errors");
+	if (np != NULL) {
+		request_ras_irqs(np, "open-pic-interrupt", ras_error_interrupt,
+				 "RAS_ERROR");
+		request_ras_irqs(np, "interrupts", ras_error_interrupt,
+				 "RAS_ERROR");
+		of_node_put(np);
+	}
+
+	/* EPOW Events */
+	np = of_find_node_by_path("/event-sources/epow-events");
+	if (np != NULL) {
+		request_ras_irqs(np, "open-pic-interrupt", ras_epow_interrupt,
+				 "RAS_EPOW");
+		request_ras_irqs(np, "interrupts", ras_epow_interrupt,
+				 "RAS_EPOW");
+		of_node_put(np);
+	}
+
+	return 1;
+}
+__initcall(init_ras_IRQ);
+
+/*
+ * Handle power subsystem events (EPOW).
+ *
+ * Presently we just log the event has occurred.  This should be fixed
+ * to examine the type of power failure and take appropriate action where
+ * the time horizon permits something useful to be done.
+ */
+static irqreturn_t
+ras_epow_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+	int status = 0xdeadbeef;
+	int state = 0;
+	int critical;
+
+	status = rtas_call(ras_get_sensor_state_token, 2, 2, &state,
+			   EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX);
+
+	if (state > 3)
+		critical = 1;  /* Time Critical */
+	else
+		critical = 0;
+
+	spin_lock(&ras_log_buf_lock);
+
+	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+			   RAS_VECTOR_OFFSET,
+			   virt_irq_to_real(irq_offset_down(irq)),
+			   RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS,
+			   critical, __pa(&ras_log_buf),
+				rtas_get_error_log_max());
+
+	udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n",
+		    *((unsigned long *)&ras_log_buf), status, state);
+	printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n",
+	       *((unsigned long *)&ras_log_buf), status, state);
+
+	/* format and print the extended information */
+	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
+
+	spin_unlock(&ras_log_buf_lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Handle hardware error interrupts.
+ *
+ * RTAS check-exception is called to collect data on the exception.  If
+ * the error is deemed recoverable, we log a warning and return.
+ * For nonrecoverable errors, an error is logged and we stop all processing
+ * as quickly as possible in order to prevent propagation of the failure.
+ */
+static irqreturn_t
+ras_error_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+	struct rtas_error_log *rtas_elog;
+	int status = 0xdeadbeef;
+	int fatal;
+
+	spin_lock(&ras_log_buf_lock);
+
+	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+			   RAS_VECTOR_OFFSET,
+			   virt_irq_to_real(irq_offset_down(irq)),
+			   RTAS_INTERNAL_ERROR, 1 /*Time Critical */,
+			   __pa(&ras_log_buf),
+				rtas_get_error_log_max());
+
+	rtas_elog = (struct rtas_error_log *)ras_log_buf;
+
+	if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC))
+		fatal = 1;
+	else
+		fatal = 0;
+
+	/* format and print the extended information */
+	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
+
+	if (fatal) {
+		udbg_printf("Fatal HW Error <0x%lx 0x%x>\n",
+			    *((unsigned long *)&ras_log_buf), status);
+		printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n",
+		       *((unsigned long *)&ras_log_buf), status);
+
+#ifndef DEBUG
+		/* Don't actually power off when debugging so we can test
+		 * without actually failing while injecting errors.
+		 * Error data will not be logged to syslog.
+		 */
+		ppc_md.power_off();
+#endif
+	} else {
+		udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n",
+			    *((unsigned long *)&ras_log_buf), status);
+		printk(KERN_WARNING
+		       "Warning: Recoverable hardware error <0x%lx 0x%x>\n",
+		       *((unsigned long *)&ras_log_buf), status);
+	}
+
+	spin_unlock(&ras_log_buf_lock);
+	return IRQ_HANDLED;
+}
+
+/* Get the error information for errors coming through the
+ * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
+ * the actual r3 if possible, and a ptr to the error log entry
+ * will be returned if found.
+ *
+ * The mce_data_buf does not have any locks or protection around it,
+ * if a second machine check comes in, or a system reset is done
+ * before we have logged the error, then we will get corruption in the
+ * error log.  This is preferable over holding off on calling
+ * ibm,nmi-interlock which would result in us checkstopping if a
+ * second machine check did come in.
+ */
+static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
+{
+	unsigned long errdata = regs->gpr[3];
+	struct rtas_error_log *errhdr = NULL;
+	unsigned long *savep;
+
+	if ((errdata >= 0x7000 && errdata < 0x7fff0) ||
+	    (errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) {
+		savep = __va(errdata);
+		regs->gpr[3] = savep[0];	/* restore original r3 */
+		memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+		memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX);
+		errhdr = (struct rtas_error_log *)mce_data_buf;
+	} else {
+		printk("FWNMI: corrupt r3\n");
+	}
+	return errhdr;
+}
+
+/* Call this when done with the data returned by FWNMI_get_errinfo.
+ * It will release the saved data area for other CPUs in the
+ * partition to receive FWNMI errors.
+ */
+static void fwnmi_release_errinfo(void)
+{
+	int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+	if (ret != 0)
+		printk("FWNMI: nmi-interlock failed: %d\n", ret);
+}
+
+void pSeries_system_reset_exception(struct pt_regs *regs)
+{
+	if (fwnmi_active) {
+		struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
+		if (errhdr) {
+			/* XXX Should look at FWNMI information */
+		}
+		fwnmi_release_errinfo();
+	}
+}
+
+/*
+ * See if we can recover from a machine check exception.
+ * This is only called on power4 (or above) and only via
+ * the Firmware Non-Maskable Interrupts (fwnmi) handler
+ * which provides the error analysis for us.
+ *
+ * Return 1 if corrected (or delivered a signal).
+ * Return 0 if there is nothing we can do.
+ */
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
+{
+	int nonfatal = 0;
+
+	if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+		/* Platform corrected itself */
+		nonfatal = 1;
+	} else if ((regs->msr & MSR_RI) &&
+		   user_mode(regs) &&
+		   err->severity == RTAS_SEVERITY_ERROR_SYNC &&
+		   err->disposition == RTAS_DISP_NOT_RECOVERED &&
+		   err->target == RTAS_TARGET_MEMORY &&
+		   err->type == RTAS_TYPE_ECC_UNCORR &&
+		   !(current->pid == 0 || current->pid == 1)) {
+		/* Kill off a user process with an ECC error */
+		printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
+		       current->pid);
+		/* XXX something better for ECC error? */
+		_exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
+		nonfatal = 1;
+	}
+
+	log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal);
+
+	return nonfatal;
+}
+
+/*
+ * Handle a machine check.
+ *
+ * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
+ * should be present.  If so the handler which called us tells us if the
+ * error was recovered (never true if RI=0).
+ *
+ * On hardware prior to Power 4 these exceptions were asynchronous which
+ * means we can't tell exactly where it occurred and so we can't recover.
+ */
+int pSeries_machine_check_exception(struct pt_regs *regs)
+{
+	struct rtas_error_log *errp;
+
+	if (fwnmi_active) {
+		errp = fwnmi_get_errinfo(regs);
+		fwnmi_release_errinfo();
+		if (errp && recover_mce(regs, errp))
+			return 1;
+	}
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
new file mode 100644
index 00000000000..58c61219d08
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -0,0 +1,426 @@
+/*
+ * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms).
+ *
+ * Copyright (C) 2005 Nathan Lynch
+ * Copyright (C) 2005 IBM Corporation
+ *
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+
+#include <asm/prom.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/uaccess.h>
+
+
+
+/*
+ * Routines for "runtime" addition and removal of device tree nodes.
+ */
+#ifdef CONFIG_PROC_DEVICETREE
+/*
+ * Add a node to /proc/device-tree.
+ */
+static void add_node_proc_entries(struct device_node *np)
+{
+	struct proc_dir_entry *ent;
+
+	ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde);
+	if (ent)
+		proc_device_tree_add_node(np, ent);
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+	struct property *pp = np->properties;
+	struct device_node *parent = np->parent;
+
+	while (pp) {
+		remove_proc_entry(pp->name, np->pde);
+		pp = pp->next;
+	}
+	if (np->pde)
+		remove_proc_entry(np->pde->name, parent->pde);
+}
+#else /* !CONFIG_PROC_DEVICETREE */
+static void add_node_proc_entries(struct device_node *np)
+{
+	return;
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+	return;
+}
+#endif /* CONFIG_PROC_DEVICETREE */
+
+/**
+ *	derive_parent - basically like dirname(1)
+ *	@path:  the full_name of a node to be added to the tree
+ *
+ *	Returns the node which should be the parent of the node
+ *	described by path.  E.g., for path = "/foo/bar", returns
+ *	the node with full_name = "/foo".
+ */
+static struct device_node *derive_parent(const char *path)
+{
+	struct device_node *parent = NULL;
+	char *parent_path = "/";
+	size_t parent_path_len = strrchr(path, '/') - path + 1;
+
+	/* reject if path is "/" */
+	if (!strcmp(path, "/"))
+		return ERR_PTR(-EINVAL);
+
+	if (strrchr(path, '/') != path) {
+		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+		if (!parent_path)
+			return ERR_PTR(-ENOMEM);
+		strlcpy(parent_path, path, parent_path_len);
+	}
+	parent = of_find_node_by_path(parent_path);
+	if (!parent)
+		return ERR_PTR(-EINVAL);
+	if (strcmp(parent_path, "/"))
+		kfree(parent_path);
+	return parent;
+}
+
+static struct notifier_block *pSeries_reconfig_chain;
+
+int pSeries_reconfig_notifier_register(struct notifier_block *nb)
+{
+	return notifier_chain_register(&pSeries_reconfig_chain, nb);
+}
+
+void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
+{
+	notifier_chain_unregister(&pSeries_reconfig_chain, nb);
+}
+
+static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
+{
+	struct device_node *np;
+	int err = -ENOMEM;
+
+	np = kzalloc(sizeof(*np), GFP_KERNEL);
+	if (!np)
+		goto out_err;
+
+	np->full_name = kmalloc(strlen(path) + 1, GFP_KERNEL);
+	if (!np->full_name)
+		goto out_err;
+
+	strcpy(np->full_name, path);
+
+	np->properties = proplist;
+	OF_MARK_DYNAMIC(np);
+	kref_init(&np->kref);
+
+	np->parent = derive_parent(path);
+	if (IS_ERR(np->parent)) {
+		err = PTR_ERR(np->parent);
+		goto out_err;
+	}
+
+	err = notifier_call_chain(&pSeries_reconfig_chain,
+				  PSERIES_RECONFIG_ADD, np);
+	if (err == NOTIFY_BAD) {
+		printk(KERN_ERR "Failed to add device node %s\n", path);
+		err = -ENOMEM; /* For now, safe to assume kmalloc failure */
+		goto out_err;
+	}
+
+	of_attach_node(np);
+
+	add_node_proc_entries(np);
+
+	of_node_put(np->parent);
+
+	return 0;
+
+out_err:
+	if (np) {
+		of_node_put(np->parent);
+		kfree(np->full_name);
+		kfree(np);
+	}
+	return err;
+}
+
+static int pSeries_reconfig_remove_node(struct device_node *np)
+{
+	struct device_node *parent, *child;
+
+	parent = of_get_parent(np);
+	if (!parent)
+		return -EINVAL;
+
+	if ((child = of_get_next_child(np, NULL))) {
+		of_node_put(child);
+		return -EBUSY;
+	}
+
+	remove_node_proc_entries(np);
+
+	notifier_call_chain(&pSeries_reconfig_chain,
+			    PSERIES_RECONFIG_REMOVE, np);
+	of_detach_node(np);
+
+	of_node_put(parent);
+	of_node_put(np); /* Must decrement the refcount */
+	return 0;
+}
+
+/*
+ * /proc/ppc64/ofdt - yucky binary interface for adding and removing
+ * OF device nodes.  Should be deprecated as soon as we get an
+ * in-kernel wrapper for the RTAS ibm,configure-connector call.
+ */
+
+static void release_prop_list(const struct property *prop)
+{
+	struct property *next;
+	for (; prop; prop = next) {
+		next = prop->next;
+		kfree(prop->name);
+		kfree(prop->value);
+		kfree(prop);
+	}
+
+}
+
+/**
+ * parse_next_property - process the next property from raw input buffer
+ * @buf: input buffer, must be nul-terminated
+ * @end: end of the input buffer + 1, for validation
+ * @name: return value; set to property name in buf
+ * @length: return value; set to length of value
+ * @value: return value; set to the property value in buf
+ *
+ * Note that the caller must make copies of the name and value returned,
+ * this function does no allocation or copying of the data.  Return value
+ * is set to the next name in buf, or NULL on error.
+ */
+static char * parse_next_property(char *buf, char *end, char **name, int *length,
+				  unsigned char **value)
+{
+	char *tmp;
+
+	*name = buf;
+
+	tmp = strchr(buf, ' ');
+	if (!tmp) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	*tmp = '\0';
+
+	if (++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the length */
+	*length = -1;
+	*length = simple_strtoul(tmp, &tmp, 10);
+	if (*length == -1) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	if (*tmp != ' ' || ++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the value */
+	*value = tmp;
+	tmp += *length;
+	if (tmp > end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	else if (tmp < end && *tmp != ' ' && *tmp != '\0') {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	tmp++;
+
+	/* and now we should be on the next name, or the end */
+	return tmp;
+}
+
+static struct property *new_property(const char *name, const int length,
+				     const unsigned char *value, struct property *last)
+{
+	struct property *new = kmalloc(sizeof(*new), GFP_KERNEL);
+
+	if (!new)
+		return NULL;
+	memset(new, 0, sizeof(*new));
+
+	if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL)))
+		goto cleanup;
+	if (!(new->value = kmalloc(length + 1, GFP_KERNEL)))
+		goto cleanup;
+
+	strcpy(new->name, name);
+	memcpy(new->value, value, length);
+	*(((char *)new->value) + length) = 0;
+	new->length = length;
+	new->next = last;
+	return new;
+
+cleanup:
+	if (new->name)
+		kfree(new->name);
+	if (new->value)
+		kfree(new->value);
+	kfree(new);
+	return NULL;
+}
+
+static int do_add_node(char *buf, size_t bufsize)
+{
+	char *path, *end, *name;
+	struct device_node *np;
+	struct property *prop = NULL;
+	unsigned char* value;
+	int length, rv = 0;
+
+	end = buf + bufsize;
+	path = buf;
+	buf = strchr(buf, ' ');
+	if (!buf)
+		return -EINVAL;
+	*buf = '\0';
+	buf++;
+
+	if ((np = of_find_node_by_path(path))) {
+		of_node_put(np);
+		return -EINVAL;
+	}
+
+	/* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
+	while (buf < end &&
+	       (buf = parse_next_property(buf, end, &name, &length, &value))) {
+		struct property *last = prop;
+
+		prop = new_property(name, length, value, last);
+		if (!prop) {
+			rv = -ENOMEM;
+			prop = last;
+			goto out;
+		}
+	}
+	if (!buf) {
+		rv = -EINVAL;
+		goto out;
+	}
+
+	rv = pSeries_reconfig_add_node(path, prop);
+
+out:
+	if (rv)
+		release_prop_list(prop);
+	return rv;
+}
+
+static int do_remove_node(char *buf)
+{
+	struct device_node *node;
+	int rv = -ENODEV;
+
+	if ((node = of_find_node_by_path(buf)))
+		rv = pSeries_reconfig_remove_node(node);
+
+	of_node_put(node);
+	return rv;
+}
+
+/**
+ * ofdt_write - perform operations on the Open Firmware device tree
+ *
+ * @file: not used
+ * @buf: command and arguments
+ * @count: size of the command buffer
+ * @off: not used
+ *
+ * Operations supported at this time are addition and removal of
+ * whole nodes along with their properties.  Operations on individual
+ * properties are not implemented (yet).
+ */
+static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
+			  loff_t *off)
+{
+	int rv = 0;
+	char *kbuf;
+	char *tmp;
+
+	if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) {
+		rv = -ENOMEM;
+		goto out;
+	}
+	if (copy_from_user(kbuf, buf, count)) {
+		rv = -EFAULT;
+		goto out;
+	}
+
+	kbuf[count] = '\0';
+
+	tmp = strchr(kbuf, ' ');
+	if (!tmp) {
+		rv = -EINVAL;
+		goto out;
+	}
+	*tmp = '\0';
+	tmp++;
+
+	if (!strcmp(kbuf, "add_node"))
+		rv = do_add_node(tmp, count - (tmp - kbuf));
+	else if (!strcmp(kbuf, "remove_node"))
+		rv = do_remove_node(tmp);
+	else
+		rv = -EINVAL;
+out:
+	kfree(kbuf);
+	return rv ? rv : count;
+}
+
+static struct file_operations ofdt_fops = {
+	.write = ofdt_write
+};
+
+/* create /proc/ppc64/ofdt write-only by root */
+static int proc_ppc64_create_ofdt(void)
+{
+	struct proc_dir_entry *ent;
+
+	if (!(systemcfg->platform & PLATFORM_PSERIES))
+		return 0;
+
+	ent = create_proc_entry("ppc64/ofdt", S_IWUSR, NULL);
+	if (ent) {
+		ent->nlink = 1;
+		ent->data = NULL;
+		ent->size = 0;
+		ent->proc_fops = &ofdt_fops;
+	}
+
+	return 0;
+}
+__initcall(proc_ppc64_create_ofdt);
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
new file mode 100644
index 00000000000..e26b0420b6d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Communication to userspace based on kernel/printk.c
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/nvram.h>
+#include <asm/atomic.h>
+#include <asm/systemcfg.h>
+
+#if 0
+#define DEBUG(A...)	printk(KERN_ERR A)
+#else
+#define DEBUG(A...)
+#endif
+
+static DEFINE_SPINLOCK(rtasd_log_lock);
+
+DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
+
+static char *rtas_log_buf;
+static unsigned long rtas_log_start;
+static unsigned long rtas_log_size;
+
+static int surveillance_timeout = -1;
+static unsigned int rtas_event_scan_rate;
+static unsigned int rtas_error_log_max;
+static unsigned int rtas_error_log_buffer_max;
+
+static int full_rtas_msgs = 0;
+
+extern int no_logging;
+
+volatile int error_log_cnt = 0;
+
+/*
+ * Since we use 32 bit RTAS, the physical address of this must be below
+ * 4G or else bad things happen. Allocate this in the kernel data and
+ * make it big enough.
+ */
+static unsigned char logdata[RTAS_ERROR_LOG_MAX];
+
+static int get_eventscan_parms(void);
+
+static char *rtas_type[] = {
+	"Unknown", "Retry", "TCE Error", "Internal Device Failure",
+	"Timeout", "Data Parity", "Address Parity", "Cache Parity",
+	"Address Invalid", "ECC Uncorrected", "ECC Corrupted",
+};
+
+static char *rtas_event_type(int type)
+{
+	if ((type > 0) && (type < 11))
+		return rtas_type[type];
+
+	switch (type) {
+		case RTAS_TYPE_EPOW:
+			return "EPOW";
+		case RTAS_TYPE_PLATFORM:
+			return "Platform Error";
+		case RTAS_TYPE_IO:
+			return "I/O Event";
+		case RTAS_TYPE_INFO:
+			return "Platform Information Event";
+		case RTAS_TYPE_DEALLOC:
+			return "Resource Deallocation Event";
+		case RTAS_TYPE_DUMP:
+			return "Dump Notification Event";
+	}
+
+	return rtas_type[0];
+}
+
+/* To see this info, grep RTAS /var/log/messages and each entry
+ * will be collected together with obvious begin/end.
+ * There will be a unique identifier on the begin and end lines.
+ * This will persist across reboots.
+ *
+ * format of error logs returned from RTAS:
+ * bytes	(size)	: contents
+ * --------------------------------------------------------
+ * 0-7		(8)	: rtas_error_log
+ * 8-47		(40)	: extended info
+ * 48-51	(4)	: vendor id
+ * 52-1023 (vendor specific) : location code and debug data
+ */
+static void printk_log_rtas(char *buf, int len)
+{
+
+	int i,j,n = 0;
+	int perline = 16;
+	char buffer[64];
+	char * str = "RTAS event";
+
+	if (full_rtas_msgs) {
+		printk(RTAS_DEBUG "%d -------- %s begin --------\n",
+		       error_log_cnt, str);
+
+		/*
+		 * Print perline bytes on each line, each line will start
+		 * with RTAS and a changing number, so syslogd will
+		 * print lines that are otherwise the same.  Separate every
+		 * 4 bytes with a space.
+		 */
+		for (i = 0; i < len; i++) {
+			j = i % perline;
+			if (j == 0) {
+				memset(buffer, 0, sizeof(buffer));
+				n = sprintf(buffer, "RTAS %d:", i/perline);
+			}
+
+			if ((i % 4) == 0)
+				n += sprintf(buffer+n, " ");
+
+			n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
+
+			if (j == (perline-1))
+				printk(KERN_DEBUG "%s\n", buffer);
+		}
+		if ((i % perline) != 0)
+			printk(KERN_DEBUG "%s\n", buffer);
+
+		printk(RTAS_DEBUG "%d -------- %s end ----------\n",
+		       error_log_cnt, str);
+	} else {
+		struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
+
+		printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
+		       error_log_cnt, rtas_event_type(errlog->type),
+		       errlog->severity);
+	}
+}
+
+static int log_rtas_len(char * buf)
+{
+	int len;
+	struct rtas_error_log *err;
+
+	/* rtas fixed header */
+	len = 8;
+	err = (struct rtas_error_log *)buf;
+	if (err->extended_log_length) {
+
+		/* extended header */
+		len += err->extended_log_length;
+	}
+
+	if (rtas_error_log_max == 0) {
+		get_eventscan_parms();
+	}
+	if (len > rtas_error_log_max)
+		len = rtas_error_log_max;
+
+	return len;
+}
+
+/*
+ * First write to nvram, if fatal error, that is the only
+ * place we log the info.  The error will be picked up
+ * on the next reboot by rtasd.  If not fatal, run the
+ * method for the type of error.  Currently, only RTAS
+ * errors have methods implemented, but in the future
+ * there might be a need to store data in nvram before a
+ * call to panic().
+ *
+ * XXX We write to nvram periodically, to indicate error has
+ * been written and sync'd, but there is a possibility
+ * that if we don't shutdown correctly, a duplicate error
+ * record will be created on next reboot.
+ */
+void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
+{
+	unsigned long offset;
+	unsigned long s;
+	int len = 0;
+
+	DEBUG("logging event\n");
+	if (buf == NULL)
+		return;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+
+	/* get length and increase count */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		len = log_rtas_len(buf);
+		if (!(err_type & ERR_FLAG_BOOT))
+			error_log_cnt++;
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+	/* Write error to NVRAM */
+	if (!no_logging && !(err_type & ERR_FLAG_BOOT))
+		nvram_write_error_log(buf, len, err_type);
+
+	/*
+	 * rtas errors can occur during boot, and we do want to capture
+	 * those somewhere, even if nvram isn't ready (why not?), and even
+	 * if rtasd isn't ready. Put them into the boot log, at least.
+	 */
+	if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
+		printk_log_rtas(buf, len);
+
+	/* Check to see if we need to or have stopped logging */
+	if (fatal || no_logging) {
+		no_logging = 1;
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+	/* call type specific method for error */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		offset = rtas_error_log_buffer_max *
+			((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
+
+		/* First copy over sequence number */
+		memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
+
+		/* Second copy over error log data */
+		offset += sizeof(int);
+		memcpy(&rtas_log_buf[offset], buf, len);
+
+		if (rtas_log_size < LOG_NUMBER)
+			rtas_log_size += 1;
+		else
+			rtas_log_start += 1;
+
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		wake_up_interruptible(&rtas_log_wait);
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+}
+
+
+static int rtas_log_open(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+static int rtas_log_release(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+/* This will check if all events are logged, if they are then, we
+ * know that we can safely clear the events in NVRAM.
+ * Next we'll sit and wait for something else to log.
+ */
+static ssize_t rtas_log_read(struct file * file, char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	int error;
+	char *tmp;
+	unsigned long s;
+	unsigned long offset;
+
+	if (!buf || count < rtas_error_log_buffer_max)
+		return -EINVAL;
+
+	count = rtas_error_log_buffer_max;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+	/* if it's 0, then we know we got the last one (the one in NVRAM) */
+	if (rtas_log_size == 0 && !no_logging)
+		nvram_clear_error_log();
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+
+	error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
+	if (error)
+		goto out;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+	offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
+	memcpy(tmp, &rtas_log_buf[offset], count);
+
+	rtas_log_start += 1;
+	rtas_log_size -= 1;
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+	error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
+out:
+	kfree(tmp);
+	return error;
+}
+
+static unsigned int rtas_log_poll(struct file *file, poll_table * wait)
+{
+	poll_wait(file, &rtas_log_wait, wait);
+	if (rtas_log_size)
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+struct file_operations proc_rtas_log_operations = {
+	.read =		rtas_log_read,
+	.poll =		rtas_log_poll,
+	.open =		rtas_log_open,
+	.release =	rtas_log_release,
+};
+
+static int enable_surveillance(int timeout)
+{
+	int error;
+
+	error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
+
+	if (error == 0)
+		return 0;
+
+	if (error == -EINVAL) {
+		printk(KERN_INFO "rtasd: surveillance not supported\n");
+		return 0;
+	}
+
+	printk(KERN_ERR "rtasd: could not update surveillance\n");
+	return -1;
+}
+
+static int get_eventscan_parms(void)
+{
+	struct device_node *node;
+	int *ip;
+
+	node = of_find_node_by_path("/rtas");
+
+	ip = (int *)get_property(node, "rtas-event-scan-rate", NULL);
+	if (ip == NULL) {
+		printk(KERN_ERR "rtasd: no rtas-event-scan-rate\n");
+		of_node_put(node);
+		return -1;
+	}
+	rtas_event_scan_rate = *ip;
+	DEBUG("rtas-event-scan-rate %d\n", rtas_event_scan_rate);
+
+	/* Make room for the sequence number */
+	rtas_error_log_max = rtas_get_error_log_max();
+	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
+
+	of_node_put(node);
+
+	return 0;
+}
+
+static void do_event_scan(int event_scan)
+{
+	int error;
+	do {
+		memset(logdata, 0, rtas_error_log_max);
+		error = rtas_call(event_scan, 4, 1, NULL,
+				  RTAS_EVENT_SCAN_ALL_EVENTS, 0,
+				  __pa(logdata), rtas_error_log_max);
+		if (error == -1) {
+			printk(KERN_ERR "event-scan failed\n");
+			break;
+		}
+
+		if (error == 0)
+			pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+
+	} while(error == 0);
+}
+
+static void do_event_scan_all_cpus(long delay)
+{
+	int cpu;
+
+	lock_cpu_hotplug();
+	cpu = first_cpu(cpu_online_map);
+	for (;;) {
+		set_cpus_allowed(current, cpumask_of_cpu(cpu));
+		do_event_scan(rtas_token("event-scan"));
+		set_cpus_allowed(current, CPU_MASK_ALL);
+
+		/* Drop hotplug lock, and sleep for the specified delay */
+		unlock_cpu_hotplug();
+		msleep_interruptible(delay);
+		lock_cpu_hotplug();
+
+		cpu = next_cpu(cpu, cpu_online_map);
+		if (cpu == NR_CPUS)
+			break;
+	}
+	unlock_cpu_hotplug();
+}
+
+static int rtasd(void *unused)
+{
+	unsigned int err_type;
+	int event_scan = rtas_token("event-scan");
+	int rc;
+
+	daemonize("rtasd");
+
+	if (event_scan == RTAS_UNKNOWN_SERVICE || get_eventscan_parms() == -1)
+		goto error;
+
+	rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
+	if (!rtas_log_buf) {
+		printk(KERN_ERR "rtasd: no memory\n");
+		goto error;
+	}
+
+	printk(KERN_INFO "RTAS daemon started\n");
+
+	DEBUG("will sleep for %d milliseconds\n", (30000/rtas_event_scan_rate));
+
+	/* See if we have any error stored in NVRAM */
+	memset(logdata, 0, rtas_error_log_max);
+
+	rc = nvram_read_error_log(logdata, rtas_error_log_max, &err_type);
+
+	/* We can use rtas_log_buf now */
+	no_logging = 0;
+
+	if (!rc) {
+		if (err_type != ERR_FLAG_ALREADY_LOGGED) {
+			pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
+		}
+	}
+
+	/* First pass. */
+	do_event_scan_all_cpus(1000);
+
+	if (surveillance_timeout != -1) {
+		DEBUG("enabling surveillance\n");
+		enable_surveillance(surveillance_timeout);
+		DEBUG("surveillance enabled\n");
+	}
+
+	/* Delay should be at least one second since some
+	 * machines have problems if we call event-scan too
+	 * quickly. */
+	for (;;)
+		do_event_scan_all_cpus(30000/rtas_event_scan_rate);
+
+error:
+	/* Should delete proc entries */
+	return -EINVAL;
+}
+
+static int __init rtas_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	/* No RTAS, only warn if we are on a pSeries box  */
+	if (rtas_token("event-scan") == RTAS_UNKNOWN_SERVICE) {
+		if (systemcfg->platform & PLATFORM_PSERIES)
+			printk(KERN_INFO "rtasd: no event-scan on system\n");
+		return 1;
+	}
+
+	entry = create_proc_entry("ppc64/rtas/error_log", S_IRUSR, NULL);
+	if (entry)
+		entry->proc_fops = &proc_rtas_log_operations;
+	else
+		printk(KERN_ERR "Failed to create error_log proc entry\n");
+
+	if (kernel_thread(rtasd, NULL, CLONE_FS) < 0)
+		printk(KERN_ERR "Failed to start RTAS daemon\n");
+
+	return 0;
+}
+
+static int __init surveillance_setup(char *str)
+{
+	int i;
+
+	if (get_option(&str,&i)) {
+		if (i >= 0 && i <= 255)
+			surveillance_timeout = i;
+	}
+
+	return 1;
+}
+
+static int __init rtasmsgs_setup(char *str)
+{
+	if (strcmp(str, "on") == 0)
+		full_rtas_msgs = 1;
+	else if (strcmp(str, "off") == 0)
+		full_rtas_msgs = 0;
+
+	return 1;
+}
+__initcall(rtas_init);
+__setup("surveillance=", surveillance_setup);
+__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
new file mode 100644
index 00000000000..65bee939eec
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -0,0 +1,618 @@
+/*
+ *  64-bit pSeries and RS/6000 setup code.
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Adapted from 'alpha' version by Gary Thomas
+ *  Modified by Cort Dougan (cort@cs.nmt.edu)
+ *  Modified by PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * bootup setup stuff..
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/tty.h>
+#include <linux/major.h>
+#include <linux/interrupt.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/adb.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+
+#include <asm/mmu.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+#include <asm/nvram.h>
+#include "xics.h"
+#include <asm/firmware.h>
+#include <asm/pmc.h>
+#include <asm/mpic.h>
+#include <asm/ppc-pci.h>
+#include <asm/i8259.h>
+#include <asm/udbg.h>
+
+#include "plpar_wrappers.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+extern void find_udbg_vterm(void);
+extern void system_reset_fwnmi(void);	/* from head.S */
+extern void machine_check_fwnmi(void);	/* from head.S */
+extern void generic_find_legacy_serial_ports(u64 *physport,
+		unsigned int *default_speed);
+
+int fwnmi_active;  /* TRUE if an FWNMI handler is present */
+
+extern void pSeries_system_reset_exception(struct pt_regs *regs);
+extern int pSeries_machine_check_exception(struct pt_regs *regs);
+
+static void pseries_shared_idle(void);
+static void pseries_dedicated_idle(void);
+
+struct mpic *pSeries_mpic;
+
+void pSeries_show_cpuinfo(struct seq_file *m)
+{
+	struct device_node *root;
+	const char *model = "";
+
+	root = of_find_node_by_path("/");
+	if (root)
+		model = get_property(root, "model", NULL);
+	seq_printf(m, "machine\t\t: CHRP %s\n", model);
+	of_node_put(root);
+}
+
+/* Initialize firmware assisted non-maskable interrupts if
+ * the firmware supports this feature.
+ *
+ */
+static void __init fwnmi_init(void)
+{
+	int ret;
+	int ibm_nmi_register = rtas_token("ibm,nmi-register");
+	if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
+		return;
+	ret = rtas_call(ibm_nmi_register, 2, 1, NULL,
+			__pa((unsigned long)system_reset_fwnmi),
+			__pa((unsigned long)machine_check_fwnmi));
+	if (ret == 0)
+		fwnmi_active = 1;
+}
+
+static void __init pSeries_init_mpic(void)
+{
+        unsigned int *addrp;
+	struct device_node *np;
+	unsigned long intack = 0;
+
+	/* All ISUs are setup, complete initialization */
+	mpic_init(pSeries_mpic);
+
+	/* Check what kind of cascade ACK we have */
+        if (!(np = of_find_node_by_name(NULL, "pci"))
+            || !(addrp = (unsigned int *)
+                 get_property(np, "8259-interrupt-acknowledge", NULL)))
+                printk(KERN_ERR "Cannot find pci to get ack address\n");
+        else
+		intack = addrp[prom_n_addr_cells(np)-1];
+	of_node_put(np);
+
+	/* Setup the legacy interrupts & controller */
+	i8259_init(intack, 0);
+
+	/* Hook cascade to mpic */
+	mpic_setup_cascade(NUM_ISA_INTERRUPTS, i8259_irq_cascade, NULL);
+}
+
+static void __init pSeries_setup_mpic(void)
+{
+	unsigned int *opprop;
+	unsigned long openpic_addr = 0;
+        unsigned char senses[NR_IRQS - NUM_ISA_INTERRUPTS];
+        struct device_node *root;
+	int irq_count;
+
+	/* Find the Open PIC if present */
+	root = of_find_node_by_path("/");
+	opprop = (unsigned int *) get_property(root, "platform-open-pic", NULL);
+	if (opprop != 0) {
+		int n = prom_n_addr_cells(root);
+
+		for (openpic_addr = 0; n > 0; --n)
+			openpic_addr = (openpic_addr << 32) + *opprop++;
+		printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
+	}
+	of_node_put(root);
+
+	BUG_ON(openpic_addr == 0);
+
+	/* Get the sense values from OF */
+	prom_get_irq_senses(senses, NUM_ISA_INTERRUPTS, NR_IRQS);
+	
+	/* Setup the openpic driver */
+	irq_count = NR_IRQS - NUM_ISA_INTERRUPTS - 4; /* leave room for IPIs */
+	pSeries_mpic = mpic_alloc(openpic_addr, MPIC_PRIMARY,
+				  16, 16, irq_count, /* isu size, irq offset, irq count */ 
+				  NR_IRQS - 4, /* ipi offset */
+				  senses, irq_count, /* sense & sense size */
+				  " MPIC     ");
+}
+
+static void pseries_lpar_enable_pmcs(void)
+{
+	unsigned long set, reset;
+
+	power4_enable_pmcs();
+
+	set = 1UL << 63;
+	reset = 0;
+	plpar_hcall_norets(H_PERFMON, set, reset);
+
+	/* instruct hypervisor to maintain PMCs */
+	if (firmware_has_feature(FW_FEATURE_SPLPAR))
+		get_paca()->lppaca.pmcregs_in_use = 1;
+}
+
+static void __init pSeries_setup_arch(void)
+{
+	/* Fixup ppc_md depending on the type of interrupt controller */
+	if (ppc64_interrupt_controller == IC_OPEN_PIC) {
+		ppc_md.init_IRQ       = pSeries_init_mpic;
+		ppc_md.get_irq        = mpic_get_irq;
+	 	ppc_md.cpu_irq_down   = mpic_teardown_this_cpu;
+		/* Allocate the mpic now, so that find_and_init_phbs() can
+		 * fill the ISUs */
+		pSeries_setup_mpic();
+	} else {
+		ppc_md.init_IRQ       = xics_init_IRQ;
+		ppc_md.get_irq        = xics_get_irq;
+		ppc_md.cpu_irq_down   = xics_teardown_cpu;
+	}
+
+#ifdef CONFIG_SMP
+	smp_init_pSeries();
+#endif
+	/* openpic global configuration register (64-bit format). */
+	/* openpic Interrupt Source Unit pointer (64-bit format). */
+	/* python0 facility area (mmio) (64-bit format) REAL address. */
+
+	/* init to some ~sane value until calibrate_delay() runs */
+	loops_per_jiffy = 50000000;
+
+	if (ROOT_DEV == 0) {
+		printk("No ramdisk, default root is /dev/sda2\n");
+		ROOT_DEV = Root_SDA2;
+	}
+
+	fwnmi_init();
+
+	/* Find and initialize PCI host bridges */
+	init_pci_config_tokens();
+	find_and_init_phbs();
+	eeh_init();
+
+	pSeries_nvram_init();
+
+	/* Choose an idle loop */
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		vpa_init(boot_cpuid);
+		if (get_paca()->lppaca.shared_proc) {
+			printk(KERN_INFO "Using shared processor idle loop\n");
+			ppc_md.idle_loop = pseries_shared_idle;
+		} else {
+			printk(KERN_INFO "Using dedicated idle loop\n");
+			ppc_md.idle_loop = pseries_dedicated_idle;
+		}
+	} else {
+		printk(KERN_INFO "Using default idle loop\n");
+		ppc_md.idle_loop = default_idle;
+	}
+
+	if (systemcfg->platform & PLATFORM_LPAR)
+		ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
+	else
+		ppc_md.enable_pmcs = power4_enable_pmcs;
+}
+
+static int __init pSeries_init_panel(void)
+{
+	/* Manually leave the kernel version on the panel. */
+	ppc_md.progress("Linux ppc64\n", 0);
+	ppc_md.progress(system_utsname.version, 0);
+
+	return 0;
+}
+arch_initcall(pSeries_init_panel);
+
+
+/* Build up the ppc64_firmware_features bitmask field
+ * using contents of device-tree/ibm,hypertas-functions.
+ * Ultimately this functionality may be moved into prom.c prom_init().
+ */
+static void __init fw_feature_init(void)
+{
+	struct device_node * dn;
+	char * hypertas;
+	unsigned int len;
+
+	DBG(" -> fw_feature_init()\n");
+
+	ppc64_firmware_features = 0;
+	dn = of_find_node_by_path("/rtas");
+	if (dn == NULL) {
+		printk(KERN_ERR "WARNING ! Cannot find RTAS in device-tree !\n");
+		goto no_rtas;
+	}
+
+	hypertas = get_property(dn, "ibm,hypertas-functions", &len);
+	if (hypertas) {
+		while (len > 0){
+			int i, hypertas_len;
+			/* check value against table of strings */
+			for(i=0; i < FIRMWARE_MAX_FEATURES ;i++) {
+				if ((firmware_features_table[i].name) &&
+				    (strcmp(firmware_features_table[i].name,hypertas))==0) {
+					/* we have a match */
+					ppc64_firmware_features |= 
+						(firmware_features_table[i].val);
+					break;
+				} 
+			}
+			hypertas_len = strlen(hypertas);
+			len -= hypertas_len +1;
+			hypertas+= hypertas_len +1;
+		}
+	}
+
+	of_node_put(dn);
+ no_rtas:
+	printk(KERN_INFO "firmware_features = 0x%lx\n", 
+	       ppc64_firmware_features);
+
+	DBG(" <- fw_feature_init()\n");
+}
+
+
+static  void __init pSeries_discover_pic(void)
+{
+	struct device_node *np;
+	char *typep;
+
+	/*
+	 * Setup interrupt mapping options that are needed for finish_device_tree
+	 * to properly parse the OF interrupt tree & do the virtual irq mapping
+	 */
+	__irq_offset_value = NUM_ISA_INTERRUPTS;
+	ppc64_interrupt_controller = IC_INVALID;
+	for (np = NULL; (np = of_find_node_by_name(np, "interrupt-controller"));) {
+		typep = (char *)get_property(np, "compatible", NULL);
+		if (strstr(typep, "open-pic"))
+			ppc64_interrupt_controller = IC_OPEN_PIC;
+		else if (strstr(typep, "ppc-xicp"))
+			ppc64_interrupt_controller = IC_PPC_XIC;
+		else
+			printk("pSeries_discover_pic: failed to recognize"
+			       " interrupt-controller\n");
+		break;
+	}
+}
+
+static void pSeries_mach_cpu_die(void)
+{
+	local_irq_disable();
+	idle_task_exit();
+	/* Some hardware requires clearing the CPPR, while other hardware does not
+	 * it is safe either way
+	 */
+	pSeriesLP_cppr_info(0, 0);
+	rtas_stop_self();
+	/* Should never get here... */
+	BUG();
+	for(;;);
+}
+
+static int pseries_set_dabr(unsigned long dabr)
+{
+	if (firmware_has_feature(FW_FEATURE_XDABR)) {
+		/* We want to catch accesses from kernel and userspace */
+		return plpar_set_xdabr(dabr, H_DABRX_KERNEL | H_DABRX_USER);
+	}
+
+	return plpar_set_dabr(dabr);
+}
+
+
+/*
+ * Early initialization.  Relocation is on but do not reference unbolted pages
+ */
+static void __init pSeries_init_early(void)
+{
+	void *comport;
+	int iommu_off = 0;
+	unsigned int default_speed;
+	u64 physport;
+
+	DBG(" -> pSeries_init_early()\n");
+
+	fw_feature_init();
+	
+	if (systemcfg->platform & PLATFORM_LPAR)
+		hpte_init_lpar();
+	else {
+		hpte_init_native();
+		iommu_off = (of_chosen &&
+			     get_property(of_chosen, "linux,iommu-off", NULL));
+	}
+
+	generic_find_legacy_serial_ports(&physport, &default_speed);
+
+	if (systemcfg->platform & PLATFORM_LPAR)
+		find_udbg_vterm();
+	else if (physport) {
+		/* Map the uart for udbg. */
+		comport = (void *)ioremap(physport, 16);
+		udbg_init_uart(comport, default_speed);
+
+		DBG("Hello World !\n");
+	}
+
+	if (firmware_has_feature(FW_FEATURE_XDABR | FW_FEATURE_DABR))
+		ppc_md.set_dabr = pseries_set_dabr;
+
+	iommu_init_early_pSeries();
+
+	pSeries_discover_pic();
+
+	DBG(" <- pSeries_init_early()\n");
+}
+
+
+static int pSeries_check_legacy_ioport(unsigned int baseport)
+{
+	struct device_node *np;
+
+#define I8042_DATA_REG	0x60
+#define FDC_BASE	0x3f0
+
+
+	switch(baseport) {
+	case I8042_DATA_REG:
+		np = of_find_node_by_type(NULL, "8042");
+		if (np == NULL)
+			return -ENODEV;
+		of_node_put(np);
+		break;
+	case FDC_BASE:
+		np = of_find_node_by_type(NULL, "fdc");
+		if (np == NULL)
+			return -ENODEV;
+		of_node_put(np);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+extern struct machdep_calls pSeries_md;
+
+static int __init pSeries_probe(int platform)
+{
+	if (platform != PLATFORM_PSERIES &&
+	    platform != PLATFORM_PSERIES_LPAR)
+		return 0;
+
+	/* if we have some ppc_md fixups for LPAR to do, do
+	 * it here ...
+	 */
+
+	return 1;
+}
+
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
+
+static inline void dedicated_idle_sleep(unsigned int cpu)
+{
+	struct paca_struct *ppaca = &paca[cpu ^ 1];
+
+	/* Only sleep if the other thread is not idle */
+	if (!(ppaca->lppaca.idle)) {
+		local_irq_disable();
+
+		/*
+		 * We are about to sleep the thread and so wont be polling any
+		 * more.
+		 */
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+
+		/*
+		 * SMT dynamic mode. Cede will result in this thread going
+		 * dormant, if the partner thread is still doing work.  Thread
+		 * wakes up if partner goes idle, an interrupt is presented, or
+		 * a prod occurs.  Returning from the cede enables external
+		 * interrupts.
+		 */
+		if (!need_resched())
+			cede_processor();
+		else
+			local_irq_enable();
+	} else {
+		/*
+		 * Give the HV an opportunity at the processor, since we are
+		 * not doing any work.
+		 */
+		poll_pending();
+	}
+}
+
+static void pseries_dedicated_idle(void)
+{ 
+	long oldval;
+	struct paca_struct *lpaca = get_paca();
+	unsigned int cpu = smp_processor_id();
+	unsigned long start_snooze;
+	unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
+
+	while (1) {
+		/*
+		 * Indicate to the HV that we are idle. Now would be
+		 * a good time to find other work to dispatch.
+		 */
+		lpaca->lppaca.idle = 1;
+
+		oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
+		if (!oldval) {
+			set_thread_flag(TIF_POLLING_NRFLAG);
+
+			start_snooze = __get_tb() +
+				*smt_snooze_delay * tb_ticks_per_usec;
+
+			while (!need_resched() && !cpu_is_offline(cpu)) {
+				ppc64_runlatch_off();
+
+				/*
+				 * Go into low thread priority and possibly
+				 * low power mode.
+				 */
+				HMT_low();
+				HMT_very_low();
+
+				if (*smt_snooze_delay != 0 &&
+				    __get_tb() > start_snooze) {
+					HMT_medium();
+					dedicated_idle_sleep(cpu);
+				}
+
+			}
+
+			HMT_medium();
+			clear_thread_flag(TIF_POLLING_NRFLAG);
+		} else {
+			set_need_resched();
+		}
+
+		lpaca->lppaca.idle = 0;
+		ppc64_runlatch_on();
+
+		schedule();
+
+		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+			cpu_die();
+	}
+}
+
+static void pseries_shared_idle(void)
+{
+	struct paca_struct *lpaca = get_paca();
+	unsigned int cpu = smp_processor_id();
+
+	while (1) {
+		/*
+		 * Indicate to the HV that we are idle. Now would be
+		 * a good time to find other work to dispatch.
+		 */
+		lpaca->lppaca.idle = 1;
+
+		while (!need_resched() && !cpu_is_offline(cpu)) {
+			local_irq_disable();
+			ppc64_runlatch_off();
+
+			/*
+			 * Yield the processor to the hypervisor.  We return if
+			 * an external interrupt occurs (which are driven prior
+			 * to returning here) or if a prod occurs from another
+			 * processor. When returning here, external interrupts
+			 * are enabled.
+			 *
+			 * Check need_resched() again with interrupts disabled
+			 * to avoid a race.
+			 */
+			if (!need_resched())
+				cede_processor();
+			else
+				local_irq_enable();
+
+			HMT_medium();
+		}
+
+		lpaca->lppaca.idle = 0;
+		ppc64_runlatch_on();
+
+		schedule();
+
+		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+			cpu_die();
+	}
+}
+
+static int pSeries_pci_probe_mode(struct pci_bus *bus)
+{
+	if (systemcfg->platform & PLATFORM_LPAR)
+		return PCI_PROBE_DEVTREE;
+	return PCI_PROBE_NORMAL;
+}
+
+struct machdep_calls __initdata pSeries_md = {
+	.probe			= pSeries_probe,
+	.setup_arch		= pSeries_setup_arch,
+	.init_early		= pSeries_init_early,
+	.show_cpuinfo		= pSeries_show_cpuinfo,
+	.log_error		= pSeries_log_error,
+	.pcibios_fixup		= pSeries_final_fixup,
+	.pci_probe_mode		= pSeries_pci_probe_mode,
+	.irq_bus_setup		= pSeries_irq_bus_setup,
+	.restart		= rtas_restart,
+	.power_off		= rtas_power_off,
+	.halt			= rtas_halt,
+	.panic			= rtas_os_term,
+	.cpu_die		= pSeries_mach_cpu_die,
+	.get_boot_time		= rtas_get_boot_time,
+	.get_rtc_time		= rtas_get_rtc_time,
+	.set_rtc_time		= rtas_set_rtc_time,
+	.calibrate_decr		= generic_calibrate_decr,
+	.progress		= rtas_progress,
+	.check_legacy_ioport	= pSeries_check_legacy_ioport,
+	.system_reset_exception = pSeries_system_reset_exception,
+	.machine_check_exception = pSeries_machine_check_exception,
+};
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
new file mode 100644
index 00000000000..7a243e8ccd7
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -0,0 +1,472 @@
+/*
+ * SMP support for pSeries machines.
+ *
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ * Plus various changes from other IBM teams...
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+
+#include <asm/ptrace.h>
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/time.h>
+#include <asm/machdep.h>
+#include "xics.h"
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/system.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/mpic.h>
+
+#include "plpar_wrappers.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * The primary thread of each non-boot processor is recorded here before
+ * smp init.
+ */
+static cpumask_t of_spin_map;
+
+extern void pSeries_secondary_smp_init(unsigned long);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Get state of physical CPU.
+ * Return codes:
+ *	0	- The processor is in the RTAS stopped state
+ *	1	- stop-self is in progress
+ *	2	- The processor is not in the RTAS stopped state
+ *	-1	- Hardware Error
+ *	-2	- Hardware Busy, Try again later.
+ */
+static int query_cpu_stopped(unsigned int pcpu)
+{
+	int cpu_status;
+	int status, qcss_tok;
+
+	qcss_tok = rtas_token("query-cpu-stopped-state");
+	if (qcss_tok == RTAS_UNKNOWN_SERVICE)
+		return -1;
+	status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
+	if (status != 0) {
+		printk(KERN_ERR
+		       "RTAS query-cpu-stopped-state failed: %i\n", status);
+		return status;
+	}
+
+	return cpu_status;
+}
+
+int pSeries_cpu_disable(void)
+{
+	int cpu = smp_processor_id();
+
+	cpu_clear(cpu, cpu_online_map);
+	systemcfg->processorCount--;
+
+	/*fix boot_cpuid here*/
+	if (cpu == boot_cpuid)
+		boot_cpuid = any_online_cpu(cpu_online_map);
+
+	/* FIXME: abstract this to not be platform specific later on */
+	xics_migrate_irqs_away();
+	return 0;
+}
+
+void pSeries_cpu_die(unsigned int cpu)
+{
+	int tries;
+	int cpu_status;
+	unsigned int pcpu = get_hard_smp_processor_id(cpu);
+
+	for (tries = 0; tries < 25; tries++) {
+		cpu_status = query_cpu_stopped(pcpu);
+		if (cpu_status == 0 || cpu_status == -1)
+			break;
+		msleep(200);
+	}
+	if (cpu_status != 0) {
+		printk("Querying DEAD? cpu %i (%i) shows %i\n",
+		       cpu, pcpu, cpu_status);
+	}
+
+	/* Isolation and deallocation are definatly done by
+	 * drslot_chrp_cpu.  If they were not they would be
+	 * done here.  Change isolate state to Isolate and
+	 * change allocation-state to Unusable.
+	 */
+	paca[cpu].cpu_start = 0;
+}
+
+/*
+ * Update cpu_present_map and paca(s) for a new cpu node.  The wrinkle
+ * here is that a cpu device node may represent up to two logical cpus
+ * in the SMT case.  We must honor the assumption in other code that
+ * the logical ids for sibling SMT threads x and y are adjacent, such
+ * that x^1 == y and y^1 == x.
+ */
+static int pSeries_add_processor(struct device_node *np)
+{
+	unsigned int cpu;
+	cpumask_t candidate_map, tmp = CPU_MASK_NONE;
+	int err = -ENOSPC, len, nthreads, i;
+	u32 *intserv;
+
+	intserv = (u32 *)get_property(np, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return 0;
+
+	nthreads = len / sizeof(u32);
+	for (i = 0; i < nthreads; i++)
+		cpu_set(i, tmp);
+
+	lock_cpu_hotplug();
+
+	BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));
+
+	/* Get a bitmap of unoccupied slots. */
+	cpus_xor(candidate_map, cpu_possible_map, cpu_present_map);
+	if (cpus_empty(candidate_map)) {
+		/* If we get here, it most likely means that NR_CPUS is
+		 * less than the partition's max processors setting.
+		 */
+		printk(KERN_ERR "Cannot add cpu %s; this system configuration"
+		       " supports %d logical cpus.\n", np->full_name,
+		       cpus_weight(cpu_possible_map));
+		goto out_unlock;
+	}
+
+	while (!cpus_empty(tmp))
+		if (cpus_subset(tmp, candidate_map))
+			/* Found a range where we can insert the new cpu(s) */
+			break;
+		else
+			cpus_shift_left(tmp, tmp, nthreads);
+
+	if (cpus_empty(tmp)) {
+		printk(KERN_ERR "Unable to find space in cpu_present_map for"
+		       " processor %s with %d thread(s)\n", np->name,
+		       nthreads);
+		goto out_unlock;
+	}
+
+	for_each_cpu_mask(cpu, tmp) {
+		BUG_ON(cpu_isset(cpu, cpu_present_map));
+		cpu_set(cpu, cpu_present_map);
+		set_hard_smp_processor_id(cpu, *intserv++);
+	}
+	err = 0;
+out_unlock:
+	unlock_cpu_hotplug();
+	return err;
+}
+
+/*
+ * Update the present map for a cpu node which is going away, and set
+ * the hard id in the paca(s) to -1 to be consistent with boot time
+ * convention for non-present cpus.
+ */
+static void pSeries_remove_processor(struct device_node *np)
+{
+	unsigned int cpu;
+	int len, nthreads, i;
+	u32 *intserv;
+
+	intserv = (u32 *)get_property(np, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return;
+
+	nthreads = len / sizeof(u32);
+
+	lock_cpu_hotplug();
+	for (i = 0; i < nthreads; i++) {
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != intserv[i])
+				continue;
+			BUG_ON(cpu_online(cpu));
+			cpu_clear(cpu, cpu_present_map);
+			set_hard_smp_processor_id(cpu, -1);
+			break;
+		}
+		if (cpu == NR_CPUS)
+			printk(KERN_WARNING "Could not find cpu to remove "
+			       "with physical id 0x%x\n", intserv[i]);
+	}
+	unlock_cpu_hotplug();
+}
+
+static int pSeries_smp_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	int err = NOTIFY_OK;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		if (pSeries_add_processor(node))
+			err = NOTIFY_BAD;
+		break;
+	case PSERIES_RECONFIG_REMOVE:
+		pSeries_remove_processor(node);
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block pSeries_smp_nb = {
+	.notifier_call = pSeries_smp_notifier,
+};
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/**
+ * smp_startup_cpu() - start the given cpu
+ *
+ * At boot time, there is nothing to do for primary threads which were
+ * started from Open Firmware.  For anything else, call RTAS with the
+ * appropriate start location.
+ *
+ * Returns:
+ *	0	- failure
+ *	1	- success
+ */
+static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+{
+	int status;
+	unsigned long start_here = __pa((u32)*((unsigned long *)
+					       pSeries_secondary_smp_init));
+	unsigned int pcpu;
+	int start_cpu;
+
+	if (cpu_isset(lcpu, of_spin_map))
+		/* Already started by OF and sitting in spin loop */
+		return 1;
+
+	pcpu = get_hard_smp_processor_id(lcpu);
+
+	/* Fixup atomic count: it exited inside IRQ handler. */
+	paca[lcpu].__current->thread_info->preempt_count	= 0;
+
+	/* 
+	 * If the RTAS start-cpu token does not exist then presume the
+	 * cpu is already spinning.
+	 */
+	start_cpu = rtas_token("start-cpu");
+	if (start_cpu == RTAS_UNKNOWN_SERVICE)
+		return 1;
+
+	status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, lcpu);
+	if (status != 0) {
+		printk(KERN_ERR "start-cpu failed: %i\n", status);
+		return 0;
+	}
+
+	return 1;
+}
+
+#ifdef CONFIG_XICS
+static inline void smp_xics_do_message(int cpu, int msg)
+{
+	set_bit(msg, &xics_ipi_message[cpu].value);
+	mb();
+	xics_cause_IPI(cpu);
+}
+
+static void smp_xics_message_pass(int target, int msg)
+{
+	unsigned int i;
+
+	if (target < NR_CPUS) {
+		smp_xics_do_message(target, msg);
+	} else {
+		for_each_online_cpu(i) {
+			if (target == MSG_ALL_BUT_SELF
+			    && i == smp_processor_id())
+				continue;
+			smp_xics_do_message(i, msg);
+		}
+	}
+}
+
+static int __init smp_xics_probe(void)
+{
+	xics_request_IPIs();
+
+	return cpus_weight(cpu_possible_map);
+}
+
+static void __devinit smp_xics_setup_cpu(int cpu)
+{
+	if (cpu != boot_cpuid)
+		xics_setup_cpu();
+
+	if (firmware_has_feature(FW_FEATURE_SPLPAR))
+		vpa_init(cpu);
+
+	cpu_clear(cpu, of_spin_map);
+
+}
+#endif /* CONFIG_XICS */
+
+static DEFINE_SPINLOCK(timebase_lock);
+static unsigned long timebase = 0;
+
+static void __devinit pSeries_give_timebase(void)
+{
+	spin_lock(&timebase_lock);
+	rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL);
+	timebase = get_tb();
+	spin_unlock(&timebase_lock);
+
+	while (timebase)
+		barrier();
+	rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL);
+}
+
+static void __devinit pSeries_take_timebase(void)
+{
+	while (!timebase)
+		barrier();
+	spin_lock(&timebase_lock);
+	set_tb(timebase >> 32, timebase & 0xffffffff);
+	timebase = 0;
+	spin_unlock(&timebase_lock);
+}
+
+static void __devinit smp_pSeries_kick_cpu(int nr)
+{
+	BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+	if (!smp_startup_cpu(nr))
+		return;
+
+	/*
+	 * The processor is currently spinning, waiting for the
+	 * cpu_start field to become non-zero After we set cpu_start,
+	 * the processor will continue on to secondary_start
+	 */
+	paca[nr].cpu_start = 1;
+}
+
+static int smp_pSeries_cpu_bootable(unsigned int nr)
+{
+	/* Special case - we inhibit secondary thread startup
+	 * during boot if the user requests it.  Odd-numbered
+	 * cpus are assumed to be secondary threads.
+	 */
+	if (system_state < SYSTEM_RUNNING &&
+	    cpu_has_feature(CPU_FTR_SMT) &&
+	    !smt_enabled_at_boot && nr % 2 != 0)
+		return 0;
+
+	return 1;
+}
+#ifdef CONFIG_MPIC
+static struct smp_ops_t pSeries_mpic_smp_ops = {
+	.message_pass	= smp_mpic_message_pass,
+	.probe		= smp_mpic_probe,
+	.kick_cpu	= smp_pSeries_kick_cpu,
+	.setup_cpu	= smp_mpic_setup_cpu,
+};
+#endif
+#ifdef CONFIG_XICS
+static struct smp_ops_t pSeries_xics_smp_ops = {
+	.message_pass	= smp_xics_message_pass,
+	.probe		= smp_xics_probe,
+	.kick_cpu	= smp_pSeries_kick_cpu,
+	.setup_cpu	= smp_xics_setup_cpu,
+	.cpu_bootable	= smp_pSeries_cpu_bootable,
+};
+#endif
+
+/* This is called very early */
+void __init smp_init_pSeries(void)
+{
+	int i;
+
+	DBG(" -> smp_init_pSeries()\n");
+
+	switch (ppc64_interrupt_controller) {
+#ifdef CONFIG_MPIC
+	case IC_OPEN_PIC:
+		smp_ops = &pSeries_mpic_smp_ops;
+		break;
+#endif
+#ifdef CONFIG_XICS
+	case IC_PPC_XIC:
+		smp_ops = &pSeries_xics_smp_ops;
+		break;
+#endif
+	default:
+		panic("Invalid interrupt controller");
+	}
+
+#ifdef CONFIG_HOTPLUG_CPU
+	smp_ops->cpu_disable = pSeries_cpu_disable;
+	smp_ops->cpu_die = pSeries_cpu_die;
+
+	/* Processors can be added/removed only on LPAR */
+	if (systemcfg->platform == PLATFORM_PSERIES_LPAR)
+		pSeries_reconfig_notifier_register(&pSeries_smp_nb);
+#endif
+
+	/* Mark threads which are still spinning in hold loops. */
+	if (cpu_has_feature(CPU_FTR_SMT)) {
+		for_each_present_cpu(i) { 
+			if (i % 2 == 0)
+				/*
+				 * Even-numbered logical cpus correspond to
+				 * primary threads.
+				 */
+				cpu_set(i, of_spin_map);
+		}
+	} else {
+		of_spin_map = cpu_present_map;
+	}
+
+	cpu_clear(boot_cpuid, of_spin_map);
+
+	/* Non-lpar has additional take/give timebase */
+	if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+		smp_ops->give_timebase = pSeries_give_timebase;
+		smp_ops->take_timebase = pSeries_take_timebase;
+	}
+
+	DBG(" <- smp_init_pSeries()\n");
+}
+
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
new file mode 100644
index 00000000000..866379b80c0
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -0,0 +1,274 @@
+/*
+ * IBM PowerPC pSeries Virtual I/O Infrastructure Support.
+ *
+ *    Copyright (c) 2003-2005 IBM Corp.
+ *     Dave Engebretsen engebret@us.ibm.com
+ *     Santiago Leon santil@us.ibm.com
+ *     Hollis Blanchard <hollisb@us.ibm.com>
+ *     Stephen Rothwell
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/kobject.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/prom.h>
+#include <asm/vio.h>
+#include <asm/hvcall.h>
+#include <asm/tce.h>
+
+extern struct subsystem devices_subsys; /* needed for vio_find_name() */
+
+static void probe_bus_pseries(void)
+{
+	struct device_node *node_vroot, *of_node;
+
+	node_vroot = find_devices("vdevice");
+	if ((node_vroot == NULL) || (node_vroot->child == NULL))
+		/* this machine doesn't do virtual IO, and that's ok */
+		return;
+
+	/*
+	 * Create struct vio_devices for each virtual device in the device tree.
+	 * Drivers will associate with them later.
+	 */
+	for (of_node = node_vroot->child; of_node != NULL;
+			of_node = of_node->sibling) {
+		printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node);
+		vio_register_device_node(of_node);
+	}
+}
+
+/**
+ * vio_match_device_pseries: - Tell if a pSeries VIO device matches a
+ *	vio_device_id
+ */
+static int vio_match_device_pseries(const struct vio_device_id *id,
+		const struct vio_dev *dev)
+{
+	return (strncmp(dev->type, id->type, strlen(id->type)) == 0) &&
+			device_is_compatible(dev->dev.platform_data, id->compat);
+}
+
+static void vio_release_device_pseries(struct device *dev)
+{
+	/* XXX free TCE table */
+	of_node_put(dev->platform_data);
+}
+
+static ssize_t viodev_show_devspec(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct device_node *of_node = dev->platform_data;
+
+	return sprintf(buf, "%s\n", of_node->full_name);
+}
+DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL);
+
+static void vio_unregister_device_pseries(struct vio_dev *viodev)
+{
+	device_remove_file(&viodev->dev, &dev_attr_devspec);
+}
+
+static struct vio_bus_ops vio_bus_ops_pseries = {
+	.match = vio_match_device_pseries,
+	.unregister_device = vio_unregister_device_pseries,
+	.release_device = vio_release_device_pseries,
+};
+
+/**
+ * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus
+ */
+static int __init vio_bus_init_pseries(void)
+{
+	int err;
+
+	err = vio_bus_init(&vio_bus_ops_pseries);
+	if (err == 0)
+		probe_bus_pseries();
+	return err;
+}
+
+__initcall(vio_bus_init_pseries);
+
+/**
+ * vio_build_iommu_table: - gets the dma information from OF and
+ *	builds the TCE tree.
+ * @dev: the virtual device.
+ *
+ * Returns a pointer to the built tce tree, or NULL if it can't
+ * find property.
+*/
+static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
+{
+	unsigned int *dma_window;
+	struct iommu_table *newTceTable;
+	unsigned long offset;
+	int dma_window_property_size;
+
+	dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size);
+	if(!dma_window) {
+		return NULL;
+	}
+
+	newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+	/*  There should be some code to extract the phys-encoded offset
+		using prom_n_addr_cells(). However, according to a comment
+		on earlier versions, it's always zero, so we don't bother */
+	offset = dma_window[1] >>  PAGE_SHIFT;
+
+	/* TCE table size - measured in tce entries */
+	newTceTable->it_size		= dma_window[4] >> PAGE_SHIFT;
+	/* offset for VIO should always be 0 */
+	newTceTable->it_offset		= offset;
+	newTceTable->it_busno		= 0;
+	newTceTable->it_index		= (unsigned long)dma_window[0];
+	newTceTable->it_type		= TCE_VB;
+
+	return iommu_init_table(newTceTable);
+}
+
+/**
+ * vio_register_device_node: - Register a new vio device.
+ * @of_node:	The OF node for this device.
+ *
+ * Creates and initializes a vio_dev structure from the data in
+ * of_node (dev.platform_data) and adds it to the list of virtual devices.
+ * Returns a pointer to the created vio_dev or NULL if node has
+ * NULL device_type or compatible fields.
+ */
+struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node)
+{
+	struct vio_dev *viodev;
+	unsigned int *unit_address;
+	unsigned int *irq_p;
+
+	/* we need the 'device_type' property, in order to match with drivers */
+	if ((NULL == of_node->type)) {
+		printk(KERN_WARNING
+			"%s: node %s missing 'device_type'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	unit_address = (unsigned int *)get_property(of_node, "reg", NULL);
+	if (!unit_address) {
+		printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	/* allocate a vio_dev for this node */
+	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
+	if (!viodev) {
+		return NULL;
+	}
+	memset(viodev, 0, sizeof(struct vio_dev));
+
+	viodev->dev.platform_data = of_node_get(of_node);
+
+	viodev->irq = NO_IRQ;
+	irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL);
+	if (irq_p) {
+		int virq = virt_irq_create_mapping(*irq_p);
+		if (virq == NO_IRQ) {
+			printk(KERN_ERR "Unable to allocate interrupt "
+			       "number for %s\n", of_node->full_name);
+		} else
+			viodev->irq = irq_offset_up(virq);
+	}
+
+	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address);
+	viodev->name = of_node->name;
+	viodev->type = of_node->type;
+	viodev->unit_address = *unit_address;
+	viodev->iommu_table = vio_build_iommu_table(viodev);
+
+	/* register with generic device framework */
+	if (vio_register_device(viodev) == NULL) {
+		/* XXX free TCE table */
+		kfree(viodev);
+		return NULL;
+	}
+	device_create_file(&viodev->dev, &dev_attr_devspec);
+
+	return viodev;
+}
+EXPORT_SYMBOL(vio_register_device_node);
+
+/**
+ * vio_get_attribute: - get attribute for virtual device
+ * @vdev:	The vio device to get property.
+ * @which:	The property/attribute to be extracted.
+ * @length:	Pointer to length of returned data size (unused if NULL).
+ *
+ * Calls prom.c's get_property() to return the value of the
+ * attribute specified by the preprocessor constant @which
+*/
+const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length)
+{
+	return get_property(vdev->dev.platform_data, (char*)which, length);
+}
+EXPORT_SYMBOL(vio_get_attribute);
+
+/* vio_find_name() - internal because only vio.c knows how we formatted the
+ * kobject name
+ * XXX once vio_bus_type.devices is actually used as a kset in
+ * drivers/base/bus.c, this function should be removed in favor of
+ * "device_find(kobj_name, &vio_bus_type)"
+ */
+static struct vio_dev *vio_find_name(const char *kobj_name)
+{
+	struct kobject *found;
+
+	found = kset_find_obj(&devices_subsys.kset, kobj_name);
+	if (!found)
+		return NULL;
+
+	return to_vio_dev(container_of(found, struct device, kobj));
+}
+
+/**
+ * vio_find_node - find an already-registered vio_dev
+ * @vnode: device_node of the virtual device we're looking for
+ */
+struct vio_dev *vio_find_node(struct device_node *vnode)
+{
+	uint32_t *unit_address;
+	char kobj_name[BUS_ID_SIZE];
+
+	/* construct the kobject name from the device node */
+	unit_address = (uint32_t *)get_property(vnode, "reg", NULL);
+	if (!unit_address)
+		return NULL;
+	snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address);
+
+	return vio_find_name(kobj_name);
+}
+EXPORT_SYMBOL(vio_find_node);
+
+int vio_enable_interrupts(struct vio_dev *dev)
+{
+	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
+	if (rc != H_Success)
+		printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(vio_enable_interrupts);
+
+int vio_disable_interrupts(struct vio_dev *dev)
+{
+	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
+	if (rc != H_Success)
+		printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(vio_disable_interrupts);
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
new file mode 100644
index 00000000000..c72c86f05cb
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -0,0 +1,747 @@
+/*
+ * arch/powerpc/platforms/pseries/xics.c
+ *
+ * Copyright 2000 IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/signal.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/radix-tree.h>
+#include <linux/cpu.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/smp.h>
+#include <asm/rtas.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/i8259.h>
+
+#include "xics.h"
+
+static unsigned int xics_startup(unsigned int irq);
+static void xics_enable_irq(unsigned int irq);
+static void xics_disable_irq(unsigned int irq);
+static void xics_mask_and_ack_irq(unsigned int irq);
+static void xics_end_irq(unsigned int irq);
+static void xics_set_affinity(unsigned int irq_nr, cpumask_t cpumask);
+
+static struct hw_interrupt_type xics_pic = {
+	.typename = " XICS     ",
+	.startup = xics_startup,
+	.enable = xics_enable_irq,
+	.disable = xics_disable_irq,
+	.ack = xics_mask_and_ack_irq,
+	.end = xics_end_irq,
+	.set_affinity = xics_set_affinity
+};
+
+static struct hw_interrupt_type xics_8259_pic = {
+	.typename = " XICS/8259",
+	.ack = xics_mask_and_ack_irq,
+};
+
+/* This is used to map real irq numbers to virtual */
+static struct radix_tree_root irq_map = RADIX_TREE_INIT(GFP_ATOMIC);
+
+#define XICS_IPI		2
+#define XICS_IRQ_SPURIOUS	0
+
+/* Want a priority other than 0.  Various HW issues require this. */
+#define	DEFAULT_PRIORITY	5
+
+/*
+ * Mark IPIs as higher priority so we can take them inside interrupts that
+ * arent marked SA_INTERRUPT
+ */
+#define IPI_PRIORITY		4
+
+struct xics_ipl {
+	union {
+		u32 word;
+		u8 bytes[4];
+	} xirr_poll;
+	union {
+		u32 word;
+		u8 bytes[4];
+	} xirr;
+	u32 dummy;
+	union {
+		u32 word;
+		u8 bytes[4];
+	} qirr;
+};
+
+static struct xics_ipl __iomem *xics_per_cpu[NR_CPUS];
+
+static int xics_irq_8259_cascade = 0;
+static int xics_irq_8259_cascade_real = 0;
+static unsigned int default_server = 0xFF;
+static unsigned int default_distrib_server = 0;
+static unsigned int interrupt_server_size = 8;
+
+/*
+ * XICS only has a single IPI, so encode the messages per CPU
+ */
+struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned;
+
+/* RTAS service tokens */
+static int ibm_get_xive;
+static int ibm_set_xive;
+static int ibm_int_on;
+static int ibm_int_off;
+
+typedef struct {
+	int (*xirr_info_get)(int cpu);
+	void (*xirr_info_set)(int cpu, int val);
+	void (*cppr_info)(int cpu, u8 val);
+	void (*qirr_info)(int cpu, u8 val);
+} xics_ops;
+
+
+/* SMP */
+
+static int pSeries_xirr_info_get(int n_cpu)
+{
+	return in_be32(&xics_per_cpu[n_cpu]->xirr.word);
+}
+
+static void pSeries_xirr_info_set(int n_cpu, int value)
+{
+	out_be32(&xics_per_cpu[n_cpu]->xirr.word, value);
+}
+
+static void pSeries_cppr_info(int n_cpu, u8 value)
+{
+	out_8(&xics_per_cpu[n_cpu]->xirr.bytes[0], value);
+}
+
+static void pSeries_qirr_info(int n_cpu, u8 value)
+{
+	out_8(&xics_per_cpu[n_cpu]->qirr.bytes[0], value);
+}
+
+static xics_ops pSeries_ops = {
+	pSeries_xirr_info_get,
+	pSeries_xirr_info_set,
+	pSeries_cppr_info,
+	pSeries_qirr_info
+};
+
+static xics_ops *ops = &pSeries_ops;
+
+
+/* LPAR */
+
+static inline long plpar_eoi(unsigned long xirr)
+{
+	return plpar_hcall_norets(H_EOI, xirr);
+}
+
+static inline long plpar_cppr(unsigned long cppr)
+{
+	return plpar_hcall_norets(H_CPPR, cppr);
+}
+
+static inline long plpar_ipi(unsigned long servernum, unsigned long mfrr)
+{
+	return plpar_hcall_norets(H_IPI, servernum, mfrr);
+}
+
+static inline long plpar_xirr(unsigned long *xirr_ret)
+{
+	unsigned long dummy;
+	return plpar_hcall(H_XIRR, 0, 0, 0, 0, xirr_ret, &dummy, &dummy);
+}
+
+static int pSeriesLP_xirr_info_get(int n_cpu)
+{
+	unsigned long lpar_rc;
+	unsigned long return_value;
+
+	lpar_rc = plpar_xirr(&return_value);
+	if (lpar_rc != H_Success)
+		panic(" bad return code xirr - rc = %lx \n", lpar_rc);
+	return (int)return_value;
+}
+
+static void pSeriesLP_xirr_info_set(int n_cpu, int value)
+{
+	unsigned long lpar_rc;
+	unsigned long val64 = value & 0xffffffff;
+
+	lpar_rc = plpar_eoi(val64);
+	if (lpar_rc != H_Success)
+		panic("bad return code EOI - rc = %ld, value=%lx\n", lpar_rc,
+		      val64);
+}
+
+void pSeriesLP_cppr_info(int n_cpu, u8 value)
+{
+	unsigned long lpar_rc;
+
+	lpar_rc = plpar_cppr(value);
+	if (lpar_rc != H_Success)
+		panic("bad return code cppr - rc = %lx\n", lpar_rc);
+}
+
+static void pSeriesLP_qirr_info(int n_cpu , u8 value)
+{
+	unsigned long lpar_rc;
+
+	lpar_rc = plpar_ipi(get_hard_smp_processor_id(n_cpu), value);
+	if (lpar_rc != H_Success)
+		panic("bad return code qirr - rc = %lx\n", lpar_rc);
+}
+
+xics_ops pSeriesLP_ops = {
+	pSeriesLP_xirr_info_get,
+	pSeriesLP_xirr_info_set,
+	pSeriesLP_cppr_info,
+	pSeriesLP_qirr_info
+};
+
+static unsigned int xics_startup(unsigned int virq)
+{
+	unsigned int irq;
+
+	irq = irq_offset_down(virq);
+	if (radix_tree_insert(&irq_map, virt_irq_to_real(irq),
+			      &virt_irq_to_real_map[irq]) == -ENOMEM)
+		printk(KERN_CRIT "Out of memory creating real -> virtual"
+		       " IRQ mapping for irq %u (real 0x%x)\n",
+		       virq, virt_irq_to_real(irq));
+	xics_enable_irq(virq);
+	return 0;	/* return value is ignored */
+}
+
+static unsigned int real_irq_to_virt(unsigned int real_irq)
+{
+	unsigned int *ptr;
+
+	ptr = radix_tree_lookup(&irq_map, real_irq);
+	if (ptr == NULL)
+		return NO_IRQ;
+	return ptr - virt_irq_to_real_map;
+}
+
+#ifdef CONFIG_SMP
+static int get_irq_server(unsigned int irq)
+{
+	unsigned int server;
+	/* For the moment only implement delivery to all cpus or one cpu */
+	cpumask_t cpumask = irq_affinity[irq];
+	cpumask_t tmp = CPU_MASK_NONE;
+
+	if (!distribute_irqs)
+		return default_server;
+
+	if (cpus_equal(cpumask, CPU_MASK_ALL)) {
+		server = default_distrib_server;
+	} else {
+		cpus_and(tmp, cpu_online_map, cpumask);
+
+		if (cpus_empty(tmp))
+			server = default_distrib_server;
+		else
+			server = get_hard_smp_processor_id(first_cpu(tmp));
+	}
+
+	return server;
+
+}
+#else
+static int get_irq_server(unsigned int irq)
+{
+	return default_server;
+}
+#endif
+
+static void xics_enable_irq(unsigned int virq)
+{
+	unsigned int irq;
+	int call_status;
+	unsigned int server;
+
+	irq = virt_irq_to_real(irq_offset_down(virq));
+	if (irq == XICS_IPI)
+		return;
+
+	server = get_irq_server(virq);
+	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server,
+				DEFAULT_PRIORITY);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_enable_irq: irq=%u: ibm_set_xive "
+		       "returned %d\n", irq, call_status);
+		printk("set_xive %x, server %x\n", ibm_set_xive, server);
+		return;
+	}
+
+	/* Now unmask the interrupt (often a no-op) */
+	call_status = rtas_call(ibm_int_on, 1, 1, NULL, irq);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_enable_irq: irq=%u: ibm_int_on "
+		       "returned %d\n", irq, call_status);
+		return;
+	}
+}
+
+static void xics_disable_real_irq(unsigned int irq)
+{
+	int call_status;
+	unsigned int server;
+
+	if (irq == XICS_IPI)
+		return;
+
+	call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_disable_real_irq: irq=%u: "
+		       "ibm_int_off returned %d\n", irq, call_status);
+		return;
+	}
+
+	server = get_irq_server(irq);
+	/* Have to set XIVE to 0xff to be able to remove a slot */
+	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server, 0xff);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_disable_irq: irq=%u: ibm_set_xive(0xff)"
+		       " returned %d\n", irq, call_status);
+		return;
+	}
+}
+
+static void xics_disable_irq(unsigned int virq)
+{
+	unsigned int irq;
+
+	irq = virt_irq_to_real(irq_offset_down(virq));
+	xics_disable_real_irq(irq);
+}
+
+static void xics_end_irq(unsigned int irq)
+{
+	int cpu = smp_processor_id();
+
+	iosync();
+	ops->xirr_info_set(cpu, ((0xff << 24) |
+				 (virt_irq_to_real(irq_offset_down(irq)))));
+
+}
+
+static void xics_mask_and_ack_irq(unsigned int irq)
+{
+	int cpu = smp_processor_id();
+
+	if (irq < irq_offset_value()) {
+		i8259_pic.ack(irq);
+		iosync();
+		ops->xirr_info_set(cpu, ((0xff<<24) |
+					 xics_irq_8259_cascade_real));
+		iosync();
+	}
+}
+
+int xics_get_irq(struct pt_regs *regs)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned int vec;
+	int irq;
+
+	vec = ops->xirr_info_get(cpu);
+	/*  (vec >> 24) == old priority */
+	vec &= 0x00ffffff;
+
+	/* for sanity, this had better be < NR_IRQS - 16 */
+	if (vec == xics_irq_8259_cascade_real) {
+		irq = i8259_irq(regs);
+		if (irq == -1) {
+			/* Spurious cascaded interrupt.  Still must ack xics */
+			xics_end_irq(irq_offset_up(xics_irq_8259_cascade));
+
+			irq = -1;
+		}
+	} else if (vec == XICS_IRQ_SPURIOUS) {
+		irq = -1;
+	} else {
+		irq = real_irq_to_virt(vec);
+		if (irq == NO_IRQ)
+			irq = real_irq_to_virt_slowpath(vec);
+		if (irq == NO_IRQ) {
+			printk(KERN_ERR "Interrupt %u (real) is invalid,"
+			       " disabling it.\n", vec);
+			xics_disable_real_irq(vec);
+		} else
+			irq = irq_offset_up(irq);
+	}
+	return irq;
+}
+
+#ifdef CONFIG_SMP
+
+irqreturn_t xics_ipi_action(int irq, void *dev_id, struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+
+	ops->qirr_info(cpu, 0xff);
+
+	WARN_ON(cpu_is_offline(cpu));
+
+	while (xics_ipi_message[cpu].value) {
+		if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_CALL_FUNCTION, regs);
+		}
+		if (test_and_clear_bit(PPC_MSG_RESCHEDULE,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_RESCHEDULE, regs);
+		}
+#if 0
+		if (test_and_clear_bit(PPC_MSG_MIGRATE_TASK,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_MIGRATE_TASK, regs);
+		}
+#endif
+#ifdef CONFIG_DEBUGGER
+		if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_DEBUGGER_BREAK, regs);
+		}
+#endif
+	}
+	return IRQ_HANDLED;
+}
+
+void xics_cause_IPI(int cpu)
+{
+	ops->qirr_info(cpu, IPI_PRIORITY);
+}
+#endif /* CONFIG_SMP */
+
+void xics_setup_cpu(void)
+{
+	int cpu = smp_processor_id();
+
+	ops->cppr_info(cpu, 0xff);
+	iosync();
+
+	/*
+	 * Put the calling processor into the GIQ.  This is really only
+	 * necessary from a secondary thread as the OF start-cpu interface
+	 * performs this function for us on primary threads.
+	 *
+	 * XXX: undo of teardown on kexec needs this too, as may hotplug
+	 */
+	rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE,
+		(1UL << interrupt_server_size) - 1 - default_distrib_server, 1);
+}
+
+void xics_init_IRQ(void)
+{
+	int i;
+	unsigned long intr_size = 0;
+	struct device_node *np;
+	uint *ireg, ilen, indx = 0;
+	unsigned long intr_base = 0;
+	struct xics_interrupt_node {
+		unsigned long addr;
+		unsigned long size;
+	} intnodes[NR_CPUS];
+
+	ppc64_boot_msg(0x20, "XICS Init");
+
+	ibm_get_xive = rtas_token("ibm,get-xive");
+	ibm_set_xive = rtas_token("ibm,set-xive");
+	ibm_int_on  = rtas_token("ibm,int-on");
+	ibm_int_off = rtas_token("ibm,int-off");
+
+	np = of_find_node_by_type(NULL, "PowerPC-External-Interrupt-Presentation");
+	if (!np)
+		panic("xics_init_IRQ: can't find interrupt presentation");
+
+nextnode:
+	ireg = (uint *)get_property(np, "ibm,interrupt-server-ranges", NULL);
+	if (ireg) {
+		/*
+		 * set node starting index for this node
+		 */
+		indx = *ireg;
+	}
+
+	ireg = (uint *)get_property(np, "reg", &ilen);
+	if (!ireg)
+		panic("xics_init_IRQ: can't find interrupt reg property");
+
+	while (ilen) {
+		intnodes[indx].addr = (unsigned long)*ireg++ << 32;
+		ilen -= sizeof(uint);
+		intnodes[indx].addr |= *ireg++;
+		ilen -= sizeof(uint);
+		intnodes[indx].size = (unsigned long)*ireg++ << 32;
+		ilen -= sizeof(uint);
+		intnodes[indx].size |= *ireg++;
+		ilen -= sizeof(uint);
+		indx++;
+		if (indx >= NR_CPUS) break;
+	}
+
+	np = of_find_node_by_type(np, "PowerPC-External-Interrupt-Presentation");
+	if ((indx < NR_CPUS) && np) goto nextnode;
+
+	/* Find the server numbers for the boot cpu. */
+	for (np = of_find_node_by_type(NULL, "cpu");
+	     np;
+	     np = of_find_node_by_type(np, "cpu")) {
+		ireg = (uint *)get_property(np, "reg", &ilen);
+		if (ireg && ireg[0] == boot_cpuid_phys) {
+			ireg = (uint *)get_property(np, "ibm,ppc-interrupt-gserver#s",
+						    &ilen);
+			i = ilen / sizeof(int);
+			if (ireg && i > 0) {
+				default_server = ireg[0];
+				default_distrib_server = ireg[i-1]; /* take last element */
+			}
+			ireg = (uint *)get_property(np,
+					"ibm,interrupt-server#-size", NULL);
+			if (ireg)
+				interrupt_server_size = *ireg;
+			break;
+		}
+	}
+	of_node_put(np);
+
+	intr_base = intnodes[0].addr;
+	intr_size = intnodes[0].size;
+
+	np = of_find_node_by_type(NULL, "interrupt-controller");
+	if (!np) {
+		printk(KERN_WARNING "xics: no ISA interrupt controller\n");
+		xics_irq_8259_cascade_real = -1;
+		xics_irq_8259_cascade = -1;
+	} else {
+		ireg = (uint *) get_property(np, "interrupts", NULL);
+		if (!ireg)
+			panic("xics_init_IRQ: can't find ISA interrupts property");
+
+		xics_irq_8259_cascade_real = *ireg;
+		xics_irq_8259_cascade
+			= virt_irq_create_mapping(xics_irq_8259_cascade_real);
+		of_node_put(np);
+	}
+
+	if (systemcfg->platform == PLATFORM_PSERIES) {
+#ifdef CONFIG_SMP
+		for_each_cpu(i) {
+			int hard_id;
+
+			/* FIXME: Do this dynamically! --RR */
+			if (!cpu_present(i))
+				continue;
+
+			hard_id = get_hard_smp_processor_id(i);
+			xics_per_cpu[i] = ioremap(intnodes[hard_id].addr,
+						  intnodes[hard_id].size);
+		}
+#else
+		xics_per_cpu[0] = ioremap(intr_base, intr_size);
+#endif /* CONFIG_SMP */
+	} else if (systemcfg->platform == PLATFORM_PSERIES_LPAR) {
+		ops = &pSeriesLP_ops;
+	}
+
+	xics_8259_pic.enable = i8259_pic.enable;
+	xics_8259_pic.disable = i8259_pic.disable;
+	for (i = 0; i < 16; ++i)
+		get_irq_desc(i)->handler = &xics_8259_pic;
+	for (; i < NR_IRQS; ++i)
+		get_irq_desc(i)->handler = &xics_pic;
+
+	xics_setup_cpu();
+
+	ppc64_boot_msg(0x21, "XICS Done");
+}
+
+/*
+ * We cant do this in init_IRQ because we need the memory subsystem up for
+ * request_irq()
+ */
+static int __init xics_setup_i8259(void)
+{
+	if (ppc64_interrupt_controller == IC_PPC_XIC &&
+	    xics_irq_8259_cascade != -1) {
+		if (request_irq(irq_offset_up(xics_irq_8259_cascade),
+				no_action, 0, "8259 cascade", NULL))
+			printk(KERN_ERR "xics_setup_i8259: couldn't get 8259 "
+					"cascade\n");
+		i8259_init(0, 0);
+	}
+	return 0;
+}
+arch_initcall(xics_setup_i8259);
+
+#ifdef CONFIG_SMP
+void xics_request_IPIs(void)
+{
+	virt_irq_to_real_map[XICS_IPI] = XICS_IPI;
+
+	/* IPIs are marked SA_INTERRUPT as they must run with irqs disabled */
+	request_irq(irq_offset_up(XICS_IPI), xics_ipi_action, SA_INTERRUPT,
+		    "IPI", NULL);
+	get_irq_desc(irq_offset_up(XICS_IPI))->status |= IRQ_PER_CPU;
+}
+#endif
+
+static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
+{
+	unsigned int irq;
+	int status;
+	int xics_status[2];
+	unsigned long newmask;
+	cpumask_t tmp = CPU_MASK_NONE;
+
+	irq = virt_irq_to_real(irq_offset_down(virq));
+	if (irq == XICS_IPI || irq == NO_IRQ)
+		return;
+
+	status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
+
+	if (status) {
+		printk(KERN_ERR "xics_set_affinity: irq=%u ibm,get-xive "
+		       "returns %d\n", irq, status);
+		return;
+	}
+
+	/* For the moment only implement delivery to all cpus or one cpu */
+	if (cpus_equal(cpumask, CPU_MASK_ALL)) {
+		newmask = default_distrib_server;
+	} else {
+		cpus_and(tmp, cpu_online_map, cpumask);
+		if (cpus_empty(tmp))
+			return;
+		newmask = get_hard_smp_processor_id(first_cpu(tmp));
+	}
+
+	status = rtas_call(ibm_set_xive, 3, 1, NULL,
+				irq, newmask, xics_status[1]);
+
+	if (status) {
+		printk(KERN_ERR "xics_set_affinity: irq=%u ibm,set-xive "
+		       "returns %d\n", irq, status);
+		return;
+	}
+}
+
+void xics_teardown_cpu(int secondary)
+{
+	int cpu = smp_processor_id();
+
+	ops->cppr_info(cpu, 0x00);
+	iosync();
+
+	/*
+	 * Some machines need to have at least one cpu in the GIQ,
+	 * so leave the master cpu in the group.
+	 */
+	if (secondary) {
+		/*
+		 * we need to EOI the IPI if we got here from kexec down IPI
+		 *
+		 * probably need to check all the other interrupts too
+		 * should we be flagging idle loop instead?
+		 * or creating some task to be scheduled?
+		 */
+		ops->xirr_info_set(cpu, XICS_IPI);
+		rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE,
+			(1UL << interrupt_server_size) - 1 -
+			default_distrib_server, 0);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Interrupts are disabled. */
+void xics_migrate_irqs_away(void)
+{
+	int status;
+	unsigned int irq, virq, cpu = smp_processor_id();
+
+	/* Reject any interrupt that was queued to us... */
+	ops->cppr_info(cpu, 0);
+	iosync();
+
+	/* remove ourselves from the global interrupt queue */
+	status = rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE,
+		(1UL << interrupt_server_size) - 1 - default_distrib_server, 0);
+	WARN_ON(status < 0);
+
+	/* Allow IPIs again... */
+	ops->cppr_info(cpu, DEFAULT_PRIORITY);
+	iosync();
+
+	for_each_irq(virq) {
+		irq_desc_t *desc;
+		int xics_status[2];
+		unsigned long flags;
+
+		/* We cant set affinity on ISA interrupts */
+		if (virq < irq_offset_value())
+			continue;
+
+		desc = get_irq_desc(virq);
+		irq = virt_irq_to_real(irq_offset_down(virq));
+
+		/* We need to get IPIs still. */
+		if (irq == XICS_IPI || irq == NO_IRQ)
+			continue;
+
+		/* We only need to migrate enabled IRQS */
+		if (desc == NULL || desc->handler == NULL
+		    || desc->action == NULL
+		    || desc->handler->set_affinity == NULL)
+			continue;
+
+		spin_lock_irqsave(&desc->lock, flags);
+
+		status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
+		if (status) {
+			printk(KERN_ERR "migrate_irqs_away: irq=%u "
+					"ibm,get-xive returns %d\n",
+					virq, status);
+			goto unlock;
+		}
+
+		/*
+		 * We only support delivery to all cpus or to one cpu.
+		 * The irq has to be migrated only in the single cpu
+		 * case.
+		 */
+		if (xics_status[0] != get_hard_smp_processor_id(cpu))
+			goto unlock;
+
+		printk(KERN_WARNING "IRQ %u affinity broken off cpu %u\n",
+		       virq, cpu);
+
+		/* Reset affinity to all cpus */
+		desc->handler->set_affinity(virq, CPU_MASK_ALL);
+		irq_affinity[virq] = CPU_MASK_ALL;
+unlock:
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+}
+#endif
diff --git a/arch/powerpc/platforms/pseries/xics.h b/arch/powerpc/platforms/pseries/xics.h
new file mode 100644
index 00000000000..e14c70868f1
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/xics.h
@@ -0,0 +1,34 @@
+/*
+ * arch/powerpc/platforms/pseries/xics.h
+ *
+ * Copyright 2000 IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _POWERPC_KERNEL_XICS_H
+#define _POWERPC_KERNEL_XICS_H
+
+#include <linux/cache.h>
+
+void xics_init_IRQ(void);
+int xics_get_irq(struct pt_regs *);
+void xics_setup_cpu(void);
+void xics_teardown_cpu(int secondary);
+void xics_cause_IPI(int cpu);
+void xics_request_IPIs(void);
+void xics_migrate_irqs_away(void);
+
+/* first argument is ignored for now*/
+void pSeriesLP_cppr_info(int n_cpu, u8 value);
+
+struct xics_ipi_struct {
+	volatile unsigned long value;
+} ____cacheline_aligned;
+
+extern struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned;
+
+#endif /* _POWERPC_KERNEL_XICS_H */