Linux-2.6.12-rc2v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ppc64/kernel
124 files changed, 50275 insertions, 0 deletions
diff --git a/arch/ppc64/kernel/HvCall.c b/arch/ppc64/kernel/HvCall.c
new file mode 100644
index 00000000000..b772e65b57a
--- /dev/null
+++ b/arch/ppc64/kernel/HvCall.c
@@ -0,0 +1,36 @@
+/*
+ * HvCall.c
+ * Copyright (C) 2001  Mike Corrigan IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <asm/page.h>
+#include <asm/abs_addr.h>
+#include <asm/iSeries/HvCall.h>
+#include <asm/iSeries/HvCallSc.h>
+#include <asm/iSeries/HvTypes.h>
+
+
+void HvCall_writeLogBuffer(const void *buffer, u64 len)
+{
+	struct HvLpBufferList hv_buf;
+	u64 left_this_page;
+	u64 cur = virt_to_abs(buffer);
+
+	while (len) {
+		hv_buf.addr = cur;
+		left_this_page = ((cur & PAGE_MASK) + PAGE_SIZE) - cur;
+		if (left_this_page > len)
+			left_this_page = len;
+		hv_buf.len = left_this_page;
+		len -= left_this_page;
+		HvCall2(HvCallBaseWriteLogBuffer,
+				virt_to_abs(&hv_buf),
+				left_this_page);
+		cur = (cur & PAGE_MASK) + PAGE_SIZE;
+	}
+}
diff --git a/arch/ppc64/kernel/HvLpConfig.c b/arch/ppc64/kernel/HvLpConfig.c
new file mode 100644
index 00000000000..cb1d6473203
--- /dev/null
+++ b/arch/ppc64/kernel/HvLpConfig.c
@@ -0,0 +1,27 @@
+/*
+ * HvLpConfig.c
+ * Copyright (C) 2001  Kyle A. Lucke, IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <asm/iSeries/HvLpConfig.h>
+
+HvLpIndex HvLpConfig_getLpIndex_outline(void)
+{
+	return HvLpConfig_getLpIndex();
+}
+EXPORT_SYMBOL(HvLpConfig_getLpIndex_outline);
diff --git a/arch/ppc64/kernel/HvLpEvent.c b/arch/ppc64/kernel/HvLpEvent.c
new file mode 100644
index 00000000000..9802beefa21
--- /dev/null
+++ b/arch/ppc64/kernel/HvLpEvent.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2001 Mike Corrigan IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <asm/iSeries/HvLpEvent.h>
+#include <asm/iSeries/HvCallEvent.h>
+#include <asm/iSeries/LparData.h>
+
+/* Array of LpEvent handler functions */
+LpEventHandler lpEventHandler[HvLpEvent_Type_NumTypes];
+unsigned lpEventHandlerPaths[HvLpEvent_Type_NumTypes];
+
+/* Register a handler for an LpEvent type */
+
+int HvLpEvent_registerHandler( HvLpEvent_Type eventType, LpEventHandler handler )
+{
+	int rc = 1;
+	if ( eventType < HvLpEvent_Type_NumTypes ) {
+		lpEventHandler[eventType] = handler;
+		rc = 0;
+	}
+	return rc;
+	
+}
+
+int HvLpEvent_unregisterHandler( HvLpEvent_Type eventType )
+{
+	int rc = 1;
+
+	might_sleep();
+
+	if ( eventType < HvLpEvent_Type_NumTypes ) {
+		if ( !lpEventHandlerPaths[eventType] ) {
+			lpEventHandler[eventType] = NULL;
+			rc = 0;
+
+			/* We now sleep until all other CPUs have scheduled. This ensures that
+			 * the deletion is seen by all other CPUs, and that the deleted handler
+			 * isn't still running on another CPU when we return. */
+			synchronize_kernel();
+		}
+	}
+	return rc;
+}
+EXPORT_SYMBOL(HvLpEvent_registerHandler);
+EXPORT_SYMBOL(HvLpEvent_unregisterHandler);
+
+/* (lpIndex is the partition index of the target partition.  
+ * needed only for VirtualIo, VirtualLan and SessionMgr.  Zero
+ * indicates to use our partition index - for the other types)
+ */
+int HvLpEvent_openPath( HvLpEvent_Type eventType, HvLpIndex lpIndex )
+{
+	int rc = 1;
+	if ( eventType < HvLpEvent_Type_NumTypes &&
+	     lpEventHandler[eventType] ) {
+		if ( lpIndex == 0 )
+			lpIndex = itLpNaca.xLpIndex;
+		HvCallEvent_openLpEventPath( lpIndex, eventType );
+		++lpEventHandlerPaths[eventType];
+		rc = 0;
+	}
+	return rc;
+}
+
+int HvLpEvent_closePath( HvLpEvent_Type eventType, HvLpIndex lpIndex )
+{
+	int rc = 1;
+	if ( eventType < HvLpEvent_Type_NumTypes &&
+	     lpEventHandler[eventType] &&
+	     lpEventHandlerPaths[eventType] ) {
+		if ( lpIndex == 0 )
+			lpIndex = itLpNaca.xLpIndex;
+		HvCallEvent_closeLpEventPath( lpIndex, eventType );
+		--lpEventHandlerPaths[eventType];
+		rc = 0;
+	}
+	return rc;
+}
+
diff --git a/arch/ppc64/kernel/ItLpQueue.c b/arch/ppc64/kernel/ItLpQueue.c
new file mode 100644
index 00000000000..c923a815760
--- /dev/null
+++ b/arch/ppc64/kernel/ItLpQueue.c
@@ -0,0 +1,167 @@
+/*
+ * ItLpQueue.c
+ * Copyright (C) 2001 Mike Corrigan  IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <asm/system.h>
+#include <asm/paca.h>
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/iSeries/HvLpEvent.h>
+#include <asm/iSeries/HvCallEvent.h>
+#include <asm/iSeries/LparData.h>
+
+static __inline__ int set_inUse( struct ItLpQueue * lpQueue )
+{
+	int t;
+	u32 * inUseP = &(lpQueue->xInUseWord);
+
+	__asm__ __volatile__("\n\
+1:	lwarx	%0,0,%2		\n\
+	cmpwi	0,%0,0		\n\
+	li	%0,0		\n\
+	bne-	2f		\n\
+	addi	%0,%0,1		\n\
+	stwcx.	%0,0,%2		\n\
+	bne-	1b		\n\
+2:	eieio"
+	: "=&r" (t), "=m" (lpQueue->xInUseWord)
+	: "r" (inUseP), "m" (lpQueue->xInUseWord)
+	: "cc");
+
+	return t;
+}
+
+static __inline__ void clear_inUse( struct ItLpQueue * lpQueue )
+{
+	lpQueue->xInUseWord = 0;
+}
+
+/* Array of LpEvent handler functions */
+extern LpEventHandler lpEventHandler[HvLpEvent_Type_NumTypes];
+unsigned long ItLpQueueInProcess = 0;
+
+struct HvLpEvent * ItLpQueue_getNextLpEvent( struct ItLpQueue * lpQueue )
+{
+	struct HvLpEvent * nextLpEvent = 
+		(struct HvLpEvent *)lpQueue->xSlicCurEventPtr;
+	if ( nextLpEvent->xFlags.xValid ) {
+		/* rmb() needed only for weakly consistent machines (regatta) */
+		rmb();
+		/* Set pointer to next potential event */
+		lpQueue->xSlicCurEventPtr += ((nextLpEvent->xSizeMinus1 +
+				      LpEventAlign ) /
+				      LpEventAlign ) *
+				      LpEventAlign;
+		/* Wrap to beginning if no room at end */
+		if (lpQueue->xSlicCurEventPtr > lpQueue->xSlicLastValidEventPtr)
+			lpQueue->xSlicCurEventPtr = lpQueue->xSlicEventStackPtr;
+	}
+	else 
+		nextLpEvent = NULL;
+
+	return nextLpEvent;
+}
+
+int ItLpQueue_isLpIntPending( struct ItLpQueue * lpQueue )
+{
+	int retval = 0;
+	struct HvLpEvent * nextLpEvent;
+	if ( lpQueue ) {
+		nextLpEvent = (struct HvLpEvent *)lpQueue->xSlicCurEventPtr;
+		retval = nextLpEvent->xFlags.xValid | lpQueue->xPlicOverflowIntPending;
+	}
+	return retval;
+}
+
+void ItLpQueue_clearValid( struct HvLpEvent * event )
+{
+	/* Clear the valid bit of the event
+	 * Also clear bits within this event that might
+	 * look like valid bits (on 64-byte boundaries)
+   	 */
+	unsigned extra = (( event->xSizeMinus1 + LpEventAlign ) /
+						 LpEventAlign ) - 1;
+	switch ( extra ) {
+	  case 3:
+	   ((struct HvLpEvent*)((char*)event+3*LpEventAlign))->xFlags.xValid=0;
+	  case 2:
+	   ((struct HvLpEvent*)((char*)event+2*LpEventAlign))->xFlags.xValid=0;
+	  case 1:
+	   ((struct HvLpEvent*)((char*)event+1*LpEventAlign))->xFlags.xValid=0;
+	  case 0:
+	   ;	
+	}
+	mb();
+	event->xFlags.xValid = 0;
+}
+
+unsigned ItLpQueue_process( struct ItLpQueue * lpQueue, struct pt_regs *regs )
+{
+	unsigned numIntsProcessed = 0;
+	struct HvLpEvent * nextLpEvent;
+
+	/* If we have recursed, just return */
+	if ( !set_inUse( lpQueue ) )
+		return 0;
+	
+	if (ItLpQueueInProcess == 0)
+		ItLpQueueInProcess = 1;
+	else
+		BUG();
+
+	for (;;) {
+		nextLpEvent = ItLpQueue_getNextLpEvent( lpQueue );
+		if ( nextLpEvent ) {
+			/* Count events to return to caller
+			 * and count processed events in lpQueue
+ 			 */
+			++numIntsProcessed;
+			lpQueue->xLpIntCount++;		
+			/* Call appropriate handler here, passing 
+			 * a pointer to the LpEvent.  The handler
+			 * must make a copy of the LpEvent if it
+			 * needs it in a bottom half. (perhaps for
+			 * an ACK)
+			 *	
+			 *  Handlers are responsible for ACK processing 
+			 *
+			 * The Hypervisor guarantees that LpEvents will
+			 * only be delivered with types that we have
+			 * registered for, so no type check is necessary
+			 * here!
+  			 */
+			if ( nextLpEvent->xType < HvLpEvent_Type_NumTypes )
+				lpQueue->xLpIntCountByType[nextLpEvent->xType]++;
+			if ( nextLpEvent->xType < HvLpEvent_Type_NumTypes &&
+			     lpEventHandler[nextLpEvent->xType] ) 
+				lpEventHandler[nextLpEvent->xType](nextLpEvent, regs);
+			else
+				printk(KERN_INFO "Unexpected Lp Event type=%d\n", nextLpEvent->xType );
+			
+			ItLpQueue_clearValid( nextLpEvent );
+		} else if ( lpQueue->xPlicOverflowIntPending )
+			/*
+			 * No more valid events. If overflow events are
+			 * pending process them
+			 */
+			HvCallEvent_getOverflowLpEvents( lpQueue->xIndex);
+		else
+			break;
+	}
+
+	ItLpQueueInProcess = 0;
+	mb();
+	clear_inUse( lpQueue );
+
+	get_paca()->lpevent_count += numIntsProcessed;
+
+	return numIntsProcessed;
+}
diff --git a/arch/ppc64/kernel/LparData.c b/arch/ppc64/kernel/LparData.c
new file mode 100644
index 00000000000..badc5a44361
--- /dev/null
+++ b/arch/ppc64/kernel/LparData.c
@@ -0,0 +1,250 @@
+/* 
+ * Copyright 2001 Mike Corrigan, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/naca.h>
+#include <asm/abs_addr.h>
+#include <asm/iSeries/ItLpNaca.h>
+#include <asm/lppaca.h>
+#include <asm/iSeries/ItLpRegSave.h>
+#include <asm/paca.h>
+#include <asm/iSeries/HvReleaseData.h>
+#include <asm/iSeries/LparMap.h>
+#include <asm/iSeries/ItVpdAreas.h>
+#include <asm/iSeries/ItIplParmsReal.h>
+#include <asm/iSeries/ItExtVpdPanel.h>
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/iSeries/IoHriProcessorVpd.h>
+#include <asm/iSeries/ItSpCommArea.h>
+
+/* The LpQueue is used to pass event data from the hypervisor to
+ * the partition.  This is where I/O interrupt events are communicated.
+ */
+
+/* May be filled in by the hypervisor so cannot end up in the BSS */
+struct ItLpQueue xItLpQueue __attribute__((__section__(".data")));
+
+
+/* The HvReleaseData is the root of the information shared between 
+ * the hypervisor and Linux.  
+ */
+
+struct HvReleaseData hvReleaseData = {
+	.xDesc = 0xc8a5d9c4,	/* "HvRD" ebcdic */
+	.xSize = sizeof(struct HvReleaseData),
+	.xVpdAreasPtrOffset = offsetof(struct naca_struct, xItVpdAreas),
+	.xSlicNacaAddr = &naca,		/* 64-bit Naca address */
+	.xMsNucDataOffset = 0x4800,	/* offset of LparMap within loadarea (see head.S) */
+	.xTagsMode = 1,			/* tags inactive       */
+	.xAddressSize = 0,		/* 64 bit              */
+	.xNoSharedProcs = 0,		/* shared processors   */
+	.xNoHMT = 0,			/* HMT allowed         */
+	.xRsvd2 = 6,			/* TEMP: This allows non-GA driver */
+	.xVrmIndex = 4,			/* We are v5r2m0               */
+	.xMinSupportedPlicVrmIndex = 3,		/* v5r1m0 */
+	.xMinCompatablePlicVrmIndex = 3,	/* v5r1m0 */
+	.xVrmName = { 0xd3, 0x89, 0x95, 0xa4,	/* "Linux 2.4.64" ebcdic */
+		0xa7, 0x40, 0xf2, 0x4b,
+		0xf4, 0x4b, 0xf6, 0xf4 },
+};
+
+extern void system_reset_iSeries(void);
+extern void machine_check_iSeries(void);
+extern void data_access_iSeries(void);
+extern void instruction_access_iSeries(void);
+extern void hardware_interrupt_iSeries(void);
+extern void alignment_iSeries(void);
+extern void program_check_iSeries(void);
+extern void fp_unavailable_iSeries(void);
+extern void decrementer_iSeries(void);
+extern void trap_0a_iSeries(void);
+extern void trap_0b_iSeries(void);
+extern void system_call_iSeries(void);
+extern void single_step_iSeries(void);
+extern void trap_0e_iSeries(void);
+extern void performance_monitor_iSeries(void);
+extern void data_access_slb_iSeries(void);
+extern void instruction_access_slb_iSeries(void);
+	
+struct ItLpNaca itLpNaca = {
+	.xDesc = 0xd397d581,		/* "LpNa" ebcdic */
+	.xSize = 0x0400,		/* size of ItLpNaca */
+	.xIntHdlrOffset = 0x0300,	/* offset to int array */
+	.xMaxIntHdlrEntries = 19,	/* # ents */
+	.xPrimaryLpIndex = 0,		/* Part # of primary */
+	.xServiceLpIndex = 0,		/* Part # of serv */
+	.xLpIndex = 0,			/* Part # of me */
+	.xMaxLpQueues = 0,		/* # of LP queues */
+	.xLpQueueOffset = 0x100,	/* offset of start of LP queues */
+	.xPirEnvironMode = 0,		/* Piranha stuff */
+	.xPirConsoleMode = 0,
+	.xPirDasdMode = 0,
+	.xLparInstalled = 0,
+	.xSysPartitioned = 0,
+	.xHwSyncedTBs = 0,
+	.xIntProcUtilHmt = 0,
+	.xSpVpdFormat = 0,
+	.xIntProcRatio = 0,
+	.xPlicVrmIndex = 0,		/* VRM index of PLIC */
+	.xMinSupportedSlicVrmInd = 0,	/* min supported SLIC */
+	.xMinCompatableSlicVrmInd = 0,	/* min compat SLIC */
+	.xLoadAreaAddr = 0,		/* 64-bit addr of load area */
+	.xLoadAreaChunks = 0,		/* chunks for load area */
+	.xPaseSysCallCRMask = 0,	/* PASE mask */
+	.xSlicSegmentTablePtr = 0,	/* seg table */
+	.xOldLpQueue = { 0 }, 		/* Old LP Queue */
+	.xInterruptHdlr = {
+		(u64)system_reset_iSeries,	/* 0x100 System Reset */
+		(u64)machine_check_iSeries,	/* 0x200 Machine Check */
+		(u64)data_access_iSeries,	/* 0x300 Data Access */
+		(u64)instruction_access_iSeries, /* 0x400 Instruction Access */
+		(u64)hardware_interrupt_iSeries, /* 0x500 External */
+		(u64)alignment_iSeries,		/* 0x600 Alignment */
+		(u64)program_check_iSeries,	/* 0x700 Program Check */
+		(u64)fp_unavailable_iSeries,	/* 0x800 FP Unavailable */
+		(u64)decrementer_iSeries,	/* 0x900 Decrementer */
+		(u64)trap_0a_iSeries,		/* 0xa00 Trap 0A */
+		(u64)trap_0b_iSeries,		/* 0xb00 Trap 0B */
+		(u64)system_call_iSeries,	/* 0xc00 System Call */
+		(u64)single_step_iSeries,	/* 0xd00 Single Step */
+		(u64)trap_0e_iSeries,		/* 0xe00 Trap 0E */
+		(u64)performance_monitor_iSeries,/* 0xf00 Performance Monitor */
+		0,				/* int 0x1000 */
+		0,				/* int 0x1010 */
+		0,				/* int 0x1020 CPU ctls */
+		(u64)hardware_interrupt_iSeries, /* SC Ret Hdlr */
+		(u64)data_access_slb_iSeries,	/* 0x380 D-SLB */
+		(u64)instruction_access_slb_iSeries /* 0x480 I-SLB */
+	}
+};
+EXPORT_SYMBOL(itLpNaca);
+
+/* May be filled in by the hypervisor so cannot end up in the BSS */
+struct ItIplParmsReal xItIplParmsReal __attribute__((__section__(".data"))); 
+
+/* May be filled in by the hypervisor so cannot end up in the BSS */
+struct ItExtVpdPanel xItExtVpdPanel __attribute__((__section__(".data")));
+EXPORT_SYMBOL(xItExtVpdPanel);
+
+#define maxPhysicalProcessors 32
+
+struct IoHriProcessorVpd xIoHriProcessorVpd[maxPhysicalProcessors] = {
+	{
+		.xInstCacheOperandSize = 32,
+		.xDataCacheOperandSize = 32,
+		.xProcFreq     = 50000000,
+		.xTimeBaseFreq = 50000000,
+		.xPVR = 0x3600
+	}
+};
+	
+/* Space for Main Store Vpd 27,200 bytes */
+/* May be filled in by the hypervisor so cannot end up in the BSS */
+u64    xMsVpd[3400] __attribute__((__section__(".data")));
+
+/* Space for Recovery Log Buffer */
+/* May be filled in by the hypervisor so cannot end up in the BSS */
+u64    xRecoveryLogBuffer[32] __attribute__((__section__(".data")));
+
+struct SpCommArea xSpCommArea = {
+	.xDesc = 0xE2D7C3C2,
+	.xFormat = 1,
+};
+
+/* The LparMap data is now located at offset 0x6000 in head.S
+ * It was put there so that the HvReleaseData could address it
+ * with a 32-bit offset as required by the iSeries hypervisor
+ *
+ * The Naca has a pointer to the ItVpdAreas.  The hypervisor finds
+ * the Naca via the HvReleaseData area.  The HvReleaseData has the
+ * offset into the Naca of the pointer to the ItVpdAreas.
+ */
+struct ItVpdAreas itVpdAreas = {
+	.xSlicDesc = 0xc9a3e5c1,		/* "ItVA" */
+	.xSlicSize = sizeof(struct ItVpdAreas),
+	.xSlicVpdEntries = ItVpdMaxEntries,	/* # VPD array entries */
+	.xSlicDmaEntries = ItDmaMaxEntries,	/* # DMA array entries */
+	.xSlicMaxLogicalProcs = NR_CPUS * 2,	/* Max logical procs */
+	.xSlicMaxPhysicalProcs = maxPhysicalProcessors,	/* Max physical procs */
+	.xSlicDmaToksOffset = offsetof(struct ItVpdAreas, xPlicDmaToks),
+	.xSlicVpdAdrsOffset = offsetof(struct ItVpdAreas, xSlicVpdAdrs),
+	.xSlicDmaLensOffset = offsetof(struct ItVpdAreas, xPlicDmaLens),
+	.xSlicVpdLensOffset = offsetof(struct ItVpdAreas, xSlicVpdLens),
+	.xSlicMaxSlotLabels = 0,		/* max slot labels */
+	.xSlicMaxLpQueues = 1,			/* max LP queues */
+	.xPlicDmaLens = { 0 },			/* DMA lengths */
+	.xPlicDmaToks = { 0 },			/* DMA tokens */
+	.xSlicVpdLens = {			/* VPD lengths */
+	        0,0,0,		        /*  0 - 2 */
+		sizeof(xItExtVpdPanel), /*       3 Extended VPD   */
+		sizeof(struct paca_struct),	/*       4 length of Paca  */
+		0,			/*       5 */
+		sizeof(struct ItIplParmsReal),/* 6 length of IPL parms */
+		26992,			/*	 7 length of MS VPD */
+		0,			/*       8 */
+		sizeof(struct ItLpNaca),/*       9 length of LP Naca */
+		0, 			/*	10 */
+		256,			/*	11 length of Recovery Log Buf */
+		sizeof(struct SpCommArea), /*   12 length of SP Comm Area */
+		0,0,0,			/* 13 - 15 */
+		sizeof(struct IoHriProcessorVpd),/* 16 length of Proc Vpd */
+		0,0,0,0,0,0,		/* 17 - 22  */
+		sizeof(struct ItLpQueue),/*     23 length of Lp Queue */
+		0,0			/* 24 - 25 */
+		},
+	.xSlicVpdAdrs = {			/* VPD addresses */
+		0,0,0,  		/*	 0 -  2 */
+		&xItExtVpdPanel,        /*       3 Extended VPD */
+		&paca[0],		/*       4 first Paca */
+		0,			/*       5 */
+		&xItIplParmsReal,	/*	 6 IPL parms */
+		&xMsVpd,		/*	 7 MS Vpd */
+		0,			/*       8 */
+		&itLpNaca,		/*       9 LpNaca */
+		0,			/*	10 */
+		&xRecoveryLogBuffer,	/*	11 Recovery Log Buffer */
+		&xSpCommArea,		/*	12 SP Comm Area */
+		0,0,0,			/* 13 - 15 */
+		&xIoHriProcessorVpd,	/*      16 Proc Vpd */
+		0,0,0,0,0,0,		/* 17 - 22 */
+		&xItLpQueue,		/*      23 Lp Queue */
+		0,0
+	}
+};
+
+struct msChunks msChunks;
+EXPORT_SYMBOL(msChunks);
+
+/* Depending on whether this is called from iSeries or pSeries setup
+ * code, the location of the msChunks struct may or may not have
+ * to be reloc'd, so we force the caller to do that for us by passing
+ * in a pointer to the structure.
+ */
+unsigned long
+msChunks_alloc(unsigned long mem, unsigned long num_chunks, unsigned long chunk_size)
+{
+	unsigned long offset = reloc_offset();
+	struct msChunks *_msChunks = PTRRELOC(&msChunks);
+
+	_msChunks->num_chunks  = num_chunks;
+	_msChunks->chunk_size  = chunk_size;
+	_msChunks->chunk_shift = __ilog2(chunk_size);
+	_msChunks->chunk_mask  = (1UL<<_msChunks->chunk_shift)-1;
+
+	mem = _ALIGN(mem, sizeof(msChunks_entry));
+	_msChunks->abs = (msChunks_entry *)(mem + offset);
+	mem += num_chunks * sizeof(msChunks_entry);
+
+	return mem;
+}
diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile
new file mode 100644
index 00000000000..96d90b0c511
--- /dev/null
+++ b/arch/ppc64/kernel/Makefile
@@ -0,0 +1,68 @@
+#
+# Makefile for the linux ppc64 kernel.
+#
+
+EXTRA_CFLAGS	+= -mno-minimal-toc
+extra-y		:= head.o vmlinux.lds
+
+obj-y               :=	setup.o entry.o traps.o irq.o idle.o dma.o \
+			time.o process.o signal.o syscalls.o misc.o ptrace.o \
+			align.o semaphore.o bitops.o pacaData.o \
+			udbg.o binfmt_elf32.o sys_ppc32.o ioctl32.o \
+			ptrace32.o signal32.o rtc.o init_task.o \
+			lmb.o cputable.o cpu_setup_power4.o idle_power4.o \
+			iommu.o sysfs.o vdso.o pmc.o
+obj-y += vdso32/ vdso64/
+
+obj-$(CONFIG_PPC_OF) +=	of_device.o
+
+pci-obj-$(CONFIG_PPC_ISERIES)	+= iSeries_pci.o iSeries_pci_reset.o
+pci-obj-$(CONFIG_PPC_MULTIPLATFORM)	+= pci_dn.o pci_direct_iommu.o
+
+obj-$(CONFIG_PCI)	+= pci.o pci_iommu.o iomap.o $(pci-obj-y)
+
+obj-$(CONFIG_PPC_ISERIES) += iSeries_irq.o \
+			     iSeries_VpdInfo.o XmPciLpEvent.o \
+			     HvCall.o HvLpConfig.o LparData.o \
+			     iSeries_setup.o ItLpQueue.o hvCall.o \
+			     mf.o HvLpEvent.o iSeries_proc.o iSeries_htab.o \
+			     iSeries_iommu.o
+
+obj-$(CONFIG_PPC_MULTIPLATFORM) += nvram.o i8259.o prom_init.o prom.o mpic.o
+
+obj-$(CONFIG_PPC_PSERIES) += pSeries_pci.o pSeries_lpar.o pSeries_hvCall.o \
+			     pSeries_nvram.o rtasd.o ras.o pSeries_reconfig.o \
+			     xics.o rtas.o pSeries_setup.o pSeries_iommu.o
+
+obj-$(CONFIG_EEH)		+= eeh.o
+obj-$(CONFIG_PROC_FS)		+= proc_ppc64.o
+obj-$(CONFIG_RTAS_FLASH)	+= rtas_flash.o
+obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_MODULES)		+= module.o ppc_ksyms.o
+obj-$(CONFIG_RTAS_PROC)		+= rtas-proc.o
+obj-$(CONFIG_SCANLOG)		+= scanlog.o
+obj-$(CONFIG_VIOPATH)		+= viopath.o
+obj-$(CONFIG_LPARCFG)		+= lparcfg.o
+obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
+obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
+obj-$(CONFIG_HVCS)		+= hvcserver.o
+obj-$(CONFIG_IBMVIO)		+= vio.o
+
+obj-$(CONFIG_PPC_PMAC)		+= pmac_setup.o pmac_feature.o pmac_pci.o \
+				   pmac_time.o pmac_nvram.o pmac_low_i2c.o
+
+obj-$(CONFIG_PPC_MAPLE)		+= maple_setup.o maple_pci.o maple_time.o
+
+obj-$(CONFIG_U3_DART)		+= u3_iommu.o
+
+ifdef CONFIG_SMP
+obj-$(CONFIG_PPC_PMAC)		+= pmac_smp.o smp-tbsync.o
+obj-$(CONFIG_PPC_ISERIES)	+= iSeries_smp.o
+obj-$(CONFIG_PPC_PSERIES)	+= pSeries_smp.o
+obj-$(CONFIG_PPC_MAPLE)		+= smp-tbsync.o
+endif
+
+obj-$(CONFIG_ALTIVEC)		+= vecemu.o vector.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+
+CFLAGS_ioctl32.o += -Ifs/
diff --git a/arch/ppc64/kernel/XmPciLpEvent.c b/arch/ppc64/kernel/XmPciLpEvent.c
new file mode 100644
index 00000000000..809c9bc6678
--- /dev/null
+++ b/arch/ppc64/kernel/XmPciLpEvent.c
@@ -0,0 +1,190 @@
+/*
+ * File XmPciLpEvent.h created by Wayne Holm on Mon Jan 15 2001.
+ *
+ * This module handles PCI interrupt events sent by the iSeries Hypervisor.
+*/
+
+#include <linux/config.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/ide.h>
+
+#include <asm/iSeries/HvTypes.h>
+#include <asm/iSeries/HvLpEvent.h>
+#include <asm/iSeries/HvCallPci.h>
+#include <asm/iSeries/XmPciLpEvent.h>
+#include <asm/ppcdebug.h>
+
+static long Pci_Interrupt_Count;
+static long Pci_Event_Count;
+
+enum XmPciLpEvent_Subtype {
+	XmPciLpEvent_BusCreated	   = 0,		// PHB has been created
+	XmPciLpEvent_BusError	   = 1,		// PHB has failed
+	XmPciLpEvent_BusFailed	   = 2,		// Msg to Secondary, Primary failed bus
+	XmPciLpEvent_NodeFailed	   = 4,		// Multi-adapter bridge has failed
+	XmPciLpEvent_NodeRecovered = 5,		// Multi-adapter bridge has recovered
+	XmPciLpEvent_BusRecovered  = 12,	// PHB has been recovered
+	XmPciLpEvent_UnQuiesceBus  = 18,	// Secondary bus unqiescing
+	XmPciLpEvent_BridgeError   = 21,	// Bridge Error
+	XmPciLpEvent_SlotInterrupt = 22		// Slot interrupt
+};
+
+struct XmPciLpEvent_BusInterrupt {
+	HvBusNumber	busNumber;
+	HvSubBusNumber	subBusNumber;
+};
+
+struct XmPciLpEvent_NodeInterrupt {
+	HvBusNumber	busNumber;
+	HvSubBusNumber	subBusNumber;
+	HvAgentId	deviceId;
+};
+
+struct XmPciLpEvent {
+	struct HvLpEvent hvLpEvent;
+
+	union {
+		u64 alignData;			// Align on an 8-byte boundary
+
+		struct {
+			u32		fisr;
+			HvBusNumber	busNumber;
+			HvSubBusNumber	subBusNumber;
+			HvAgentId	deviceId;
+		} slotInterrupt;
+
+		struct XmPciLpEvent_BusInterrupt busFailed;
+		struct XmPciLpEvent_BusInterrupt busRecovered;
+		struct XmPciLpEvent_BusInterrupt busCreated;
+
+		struct XmPciLpEvent_NodeInterrupt nodeFailed;
+		struct XmPciLpEvent_NodeInterrupt nodeRecovered;
+
+	} eventData;
+
+};
+
+static void intReceived(struct XmPciLpEvent *eventParm,
+		struct pt_regs *regsParm);
+
+static void XmPciLpEvent_handler(struct HvLpEvent *eventParm,
+		struct pt_regs *regsParm)
+{
+#ifdef CONFIG_PCI
+#if 0
+	PPCDBG(PPCDBG_BUSWALK, "XmPciLpEvent_handler, type 0x%x\n",
+			eventParm->xType);
+#endif
+	++Pci_Event_Count;
+
+	if (eventParm && (eventParm->xType == HvLpEvent_Type_PciIo)) {
+		switch (eventParm->xFlags.xFunction) {
+		case HvLpEvent_Function_Int:
+			intReceived((struct XmPciLpEvent *)eventParm, regsParm);
+			break;
+		case HvLpEvent_Function_Ack:
+			printk(KERN_ERR
+				"XmPciLpEvent.c: unexpected ack received\n");
+			break;
+		default:
+			printk(KERN_ERR
+				"XmPciLpEvent.c: unexpected event function %d\n",
+				(int)eventParm->xFlags.xFunction);
+			break;
+		}
+	} else if (eventParm)
+		printk(KERN_ERR
+			"XmPciLpEvent.c: Unrecognized PCI event type 0x%x\n",
+			(int)eventParm->xType);
+	else
+		printk(KERN_ERR "XmPciLpEvent.c: NULL event received\n");
+#endif
+}
+
+static void intReceived(struct XmPciLpEvent *eventParm,
+		struct pt_regs *regsParm)
+{
+	int irq;
+
+	++Pci_Interrupt_Count;
+#if 0
+	PPCDBG(PPCDBG_BUSWALK, "PCI: XmPciLpEvent.c: intReceived\n");
+#endif
+
+	switch (eventParm->hvLpEvent.xSubtype) {
+	case XmPciLpEvent_SlotInterrupt:
+		irq = eventParm->hvLpEvent.xCorrelationToken;
+		/* Dispatch the interrupt handlers for this irq */
+		ppc_irq_dispatch_handler(regsParm, irq);
+		HvCallPci_eoi(eventParm->eventData.slotInterrupt.busNumber,
+			eventParm->eventData.slotInterrupt.subBusNumber,
+			eventParm->eventData.slotInterrupt.deviceId);
+		break;
+		/* Ignore error recovery events for now */
+	case XmPciLpEvent_BusCreated:
+		printk(KERN_INFO "XmPciLpEvent.c: system bus %d created\n",
+			eventParm->eventData.busCreated.busNumber);
+		break;
+	case XmPciLpEvent_BusError:
+	case XmPciLpEvent_BusFailed:
+		printk(KERN_INFO "XmPciLpEvent.c: system bus %d failed\n",
+			eventParm->eventData.busFailed.busNumber);
+		break;
+	case XmPciLpEvent_BusRecovered:
+	case XmPciLpEvent_UnQuiesceBus:
+		printk(KERN_INFO "XmPciLpEvent.c: system bus %d recovered\n",
+			eventParm->eventData.busRecovered.busNumber);
+		break;
+	case XmPciLpEvent_NodeFailed:
+	case XmPciLpEvent_BridgeError:
+		printk(KERN_INFO
+			"XmPciLpEvent.c: multi-adapter bridge %d/%d/%d failed\n",
+			eventParm->eventData.nodeFailed.busNumber,
+			eventParm->eventData.nodeFailed.subBusNumber,
+			eventParm->eventData.nodeFailed.deviceId);
+		break;
+	case XmPciLpEvent_NodeRecovered:
+		printk(KERN_INFO
+			"XmPciLpEvent.c: multi-adapter bridge %d/%d/%d recovered\n",
+			eventParm->eventData.nodeRecovered.busNumber,
+			eventParm->eventData.nodeRecovered.subBusNumber,
+			eventParm->eventData.nodeRecovered.deviceId);
+		break;
+	default:
+		printk(KERN_ERR
+			"XmPciLpEvent.c: unrecognized event subtype 0x%x\n",
+			eventParm->hvLpEvent.xSubtype);
+		break;
+	}
+}
+
+
+/* This should be called sometime prior to buswalk (init_IRQ would be good) */
+int XmPciLpEvent_init()
+{
+	int xRc;
+
+	PPCDBG(PPCDBG_BUSWALK,
+			"XmPciLpEvent_init, Register Event type 0x%04X\n",
+			HvLpEvent_Type_PciIo);
+
+	xRc = HvLpEvent_registerHandler(HvLpEvent_Type_PciIo,
+			&XmPciLpEvent_handler);
+	if (xRc == 0) {
+		xRc = HvLpEvent_openPath(HvLpEvent_Type_PciIo, 0);
+		if (xRc != 0)
+			printk(KERN_ERR
+				"XmPciLpEvent.c: open event path failed with rc 0x%x\n",
+				xRc);
+	} else
+		printk(KERN_ERR
+			"XmPciLpEvent.c: register handler failed with rc 0x%x\n",
+			xRc);
+	return xRc;
+}
diff --git a/arch/ppc64/kernel/align.c b/arch/ppc64/kernel/align.c
new file mode 100644
index 00000000000..330e7ef8142
--- /dev/null
+++ b/arch/ppc64/kernel/align.c
@@ -0,0 +1,396 @@
+/* align.c - handle alignment exceptions for the Power PC.
+ *
+ * Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au>
+ * Copyright (c) 1998-1999 TiVo, Inc.
+ *   PowerPC 403GCX modifications.
+ * Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu>
+ *   PowerPC 403GCX/405GP modifications.
+ * Copyright (c) 2001-2002 PPC64 team, IBM Corp
+ *   64-bit and Power4 support
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/cache.h>
+#include <asm/cputable.h>
+
+struct aligninfo {
+	unsigned char len;
+	unsigned char flags;
+};
+
+#define IS_XFORM(inst)	(((inst) >> 26) == 31)
+#define IS_DSFORM(inst)	(((inst) >> 26) >= 56)
+
+#define INVALID	{ 0, 0 }
+
+#define LD	1	/* load */
+#define ST	2	/* store */
+#define	SE	4	/* sign-extend value */
+#define F	8	/* to/from fp regs */
+#define U	0x10	/* update index register */
+#define M	0x20	/* multiple load/store */
+#define SW	0x40	/* byte swap */
+
+#define DCBZ	0x5f	/* 8xx/82xx dcbz faults when cache not enabled */
+
+/*
+ * The PowerPC stores certain bits of the instruction that caused the
+ * alignment exception in the DSISR register.  This array maps those
+ * bits to information about the operand length and what the
+ * instruction would do.
+ */
+static struct aligninfo aligninfo[128] = {
+	{ 4, LD },		/* 00 0 0000: lwz / lwarx */
+	INVALID,		/* 00 0 0001 */
+	{ 4, ST },		/* 00 0 0010: stw */
+	INVALID,		/* 00 0 0011 */
+	{ 2, LD },		/* 00 0 0100: lhz */
+	{ 2, LD+SE },		/* 00 0 0101: lha */
+	{ 2, ST },		/* 00 0 0110: sth */
+	{ 4, LD+M },		/* 00 0 0111: lmw */
+	{ 4, LD+F },		/* 00 0 1000: lfs */
+	{ 8, LD+F },		/* 00 0 1001: lfd */
+	{ 4, ST+F },		/* 00 0 1010: stfs */
+	{ 8, ST+F },		/* 00 0 1011: stfd */
+	INVALID,		/* 00 0 1100 */
+	{ 8, LD },		/* 00 0 1101: ld */
+	INVALID,		/* 00 0 1110 */
+	{ 8, ST },		/* 00 0 1111: std */
+	{ 4, LD+U },		/* 00 1 0000: lwzu */
+	INVALID,		/* 00 1 0001 */
+	{ 4, ST+U },		/* 00 1 0010: stwu */
+	INVALID,		/* 00 1 0011 */
+	{ 2, LD+U },		/* 00 1 0100: lhzu */
+	{ 2, LD+SE+U },		/* 00 1 0101: lhau */
+	{ 2, ST+U },		/* 00 1 0110: sthu */
+	{ 4, ST+M },		/* 00 1 0111: stmw */
+	{ 4, LD+F+U },		/* 00 1 1000: lfsu */
+	{ 8, LD+F+U },		/* 00 1 1001: lfdu */
+	{ 4, ST+F+U },		/* 00 1 1010: stfsu */
+	{ 8, ST+F+U },		/* 00 1 1011: stfdu */
+	INVALID,		/* 00 1 1100 */
+	INVALID,		/* 00 1 1101 */
+	INVALID,		/* 00 1 1110 */
+	INVALID,		/* 00 1 1111 */
+	{ 8, LD },		/* 01 0 0000: ldx */
+	INVALID,		/* 01 0 0001 */
+	{ 8, ST },		/* 01 0 0010: stdx */
+	INVALID,		/* 01 0 0011 */
+	INVALID,		/* 01 0 0100 */
+	{ 4, LD+SE },		/* 01 0 0101: lwax */
+	INVALID,		/* 01 0 0110 */
+	INVALID,		/* 01 0 0111 */
+	{ 0, LD },		/* 01 0 1000: lswx */
+	{ 0, LD },		/* 01 0 1001: lswi */
+	{ 0, ST },		/* 01 0 1010: stswx */
+	{ 0, ST },		/* 01 0 1011: stswi */
+	INVALID,		/* 01 0 1100 */
+	{ 8, LD+U },		/* 01 0 1101: ldu */
+	INVALID,		/* 01 0 1110 */
+	{ 8, ST+U },		/* 01 0 1111: stdu */
+	{ 8, LD+U },		/* 01 1 0000: ldux */
+	INVALID,		/* 01 1 0001 */
+	{ 8, ST+U },		/* 01 1 0010: stdux */
+	INVALID,		/* 01 1 0011 */
+	INVALID,		/* 01 1 0100 */
+	{ 4, LD+SE+U },		/* 01 1 0101: lwaux */
+	INVALID,		/* 01 1 0110 */
+	INVALID,		/* 01 1 0111 */
+	INVALID,		/* 01 1 1000 */
+	INVALID,		/* 01 1 1001 */
+	INVALID,		/* 01 1 1010 */
+	INVALID,		/* 01 1 1011 */
+	INVALID,		/* 01 1 1100 */
+	INVALID,		/* 01 1 1101 */
+	INVALID,		/* 01 1 1110 */
+	INVALID,		/* 01 1 1111 */
+	INVALID,		/* 10 0 0000 */
+	INVALID,		/* 10 0 0001 */
+	{ 0, ST },		/* 10 0 0010: stwcx. */
+	INVALID,		/* 10 0 0011 */
+	INVALID,		/* 10 0 0100 */
+	INVALID,		/* 10 0 0101 */
+	INVALID,		/* 10 0 0110 */
+	INVALID,		/* 10 0 0111 */
+	{ 4, LD+SW },		/* 10 0 1000: lwbrx */
+	INVALID,		/* 10 0 1001 */
+	{ 4, ST+SW },		/* 10 0 1010: stwbrx */
+	INVALID,		/* 10 0 1011 */
+	{ 2, LD+SW },		/* 10 0 1100: lhbrx */
+	{ 4, LD+SE },		/* 10 0 1101  lwa */
+	{ 2, ST+SW },		/* 10 0 1110: sthbrx */
+	INVALID,		/* 10 0 1111 */
+	INVALID,		/* 10 1 0000 */
+	INVALID,		/* 10 1 0001 */
+	INVALID,		/* 10 1 0010 */
+	INVALID,		/* 10 1 0011 */
+	INVALID,		/* 10 1 0100 */
+	INVALID,		/* 10 1 0101 */
+	INVALID,		/* 10 1 0110 */
+	INVALID,		/* 10 1 0111 */
+	INVALID,		/* 10 1 1000 */
+	INVALID,		/* 10 1 1001 */
+	INVALID,		/* 10 1 1010 */
+	INVALID,		/* 10 1 1011 */
+	INVALID,		/* 10 1 1100 */
+	INVALID,		/* 10 1 1101 */
+	INVALID,		/* 10 1 1110 */
+	{ L1_CACHE_BYTES, ST },	/* 10 1 1111: dcbz */
+	{ 4, LD },		/* 11 0 0000: lwzx */
+	INVALID,		/* 11 0 0001 */
+	{ 4, ST },		/* 11 0 0010: stwx */
+	INVALID,		/* 11 0 0011 */
+	{ 2, LD },		/* 11 0 0100: lhzx */
+	{ 2, LD+SE },		/* 11 0 0101: lhax */
+	{ 2, ST },		/* 11 0 0110: sthx */
+	INVALID,		/* 11 0 0111 */
+	{ 4, LD+F },		/* 11 0 1000: lfsx */
+	{ 8, LD+F },		/* 11 0 1001: lfdx */
+	{ 4, ST+F },		/* 11 0 1010: stfsx */
+	{ 8, ST+F },		/* 11 0 1011: stfdx */
+	INVALID,		/* 11 0 1100 */
+	{ 8, LD+M },		/* 11 0 1101: lmd */
+	INVALID,		/* 11 0 1110 */
+	{ 8, ST+M },		/* 11 0 1111: stmd */
+	{ 4, LD+U },		/* 11 1 0000: lwzux */
+	INVALID,		/* 11 1 0001 */
+	{ 4, ST+U },		/* 11 1 0010: stwux */
+	INVALID,		/* 11 1 0011 */
+	{ 2, LD+U },		/* 11 1 0100: lhzux */
+	{ 2, LD+SE+U },		/* 11 1 0101: lhaux */
+	{ 2, ST+U },		/* 11 1 0110: sthux */
+	INVALID,		/* 11 1 0111 */
+	{ 4, LD+F+U },		/* 11 1 1000: lfsux */
+	{ 8, LD+F+U },		/* 11 1 1001: lfdux */
+	{ 4, ST+F+U },		/* 11 1 1010: stfsux */
+	{ 8, ST+F+U },		/* 11 1 1011: stfdux */
+	INVALID,		/* 11 1 1100 */
+	INVALID,		/* 11 1 1101 */
+	INVALID,		/* 11 1 1110 */
+	INVALID,		/* 11 1 1111 */
+};
+
+#define SWAP(a, b)	(t = (a), (a) = (b), (b) = t)
+
+static inline unsigned make_dsisr(unsigned instr)
+{
+	unsigned dsisr;
+	
+	/* create a DSISR value from the instruction */
+	dsisr = (instr & 0x03ff0000) >> 16;			/* bits  6:15 --> 22:31 */
+	
+	if ( IS_XFORM(instr) ) {
+		dsisr |= (instr & 0x00000006) << 14;		/* bits 29:30 --> 15:16 */
+		dsisr |= (instr & 0x00000040) << 8;		/* bit     25 -->    17 */
+		dsisr |= (instr & 0x00000780) << 3;		/* bits 21:24 --> 18:21 */
+	}
+	else {
+		dsisr |= (instr & 0x04000000) >> 12;		/* bit      5 -->    17 */
+		dsisr |= (instr & 0x78000000) >> 17;		/* bits  1: 4 --> 18:21 */
+		if ( IS_DSFORM(instr) ) {
+			dsisr |= (instr & 0x00000003) << 18;	/* bits 30:31 --> 12:13 */
+		}
+	}
+	
+	return dsisr;
+}
+
+int
+fix_alignment(struct pt_regs *regs)
+{
+	unsigned int instr, nb, flags;
+	int t;
+	unsigned long reg, areg;
+	unsigned long i;
+	int ret;
+	unsigned dsisr;
+	unsigned char __user *addr;
+	unsigned char __user *p;
+	unsigned long __user *lp;
+	union {
+		long ll;
+		double dd;
+		unsigned char v[8];
+		struct {
+			unsigned hi32;
+			int	 low32;
+		} x32;
+		struct {
+			unsigned char hi48[6];
+			short	      low16;
+		} x16;
+	} data;
+
+	/*
+	 * Return 1 on success
+	 * Return 0 if unable to handle the interrupt
+	 * Return -EFAULT if data address is bad
+	 */
+
+	dsisr = regs->dsisr;
+
+	if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) {
+	    unsigned int real_instr;
+	    if (__get_user(real_instr, (unsigned int __user *)regs->nip))
+		return 0;
+	    dsisr = make_dsisr(real_instr);
+	}
+
+	/* extract the operation and registers from the dsisr */
+	reg = (dsisr >> 5) & 0x1f;	/* source/dest register */
+	areg = dsisr & 0x1f;		/* register to update */
+	instr = (dsisr >> 10) & 0x7f;
+	instr |= (dsisr >> 13) & 0x60;
+
+	/* Lookup the operation in our table */
+	nb = aligninfo[instr].len;
+	flags = aligninfo[instr].flags;
+
+	/* DAR has the operand effective address */
+	addr = (unsigned char __user *)regs->dar;
+
+	/* A size of 0 indicates an instruction we don't support */
+	/* we also don't support the multiples (lmw, stmw, lmd, stmd) */
+	if ((nb == 0) || (flags & M))
+		return 0;		/* too hard or invalid instruction */
+
+	/*
+	 * Special handling for dcbz
+	 * dcbz may give an alignment exception for accesses to caching inhibited
+	 * storage
+	 */
+	if (instr == DCBZ)
+		addr = (unsigned char __user *) ((unsigned long)addr & -L1_CACHE_BYTES);
+
+	/* Verify the address of the operand */
+	if (user_mode(regs)) {
+		if (!access_ok((flags & ST? VERIFY_WRITE: VERIFY_READ), addr, nb))
+			return -EFAULT;	/* bad address */
+	}
+
+	/* Force the fprs into the save area so we can reference them */
+	if (flags & F) {
+		if (!user_mode(regs))
+			return 0;
+		flush_fp_to_thread(current);
+	}
+	
+	/* If we are loading, get the data from user space */
+	if (flags & LD) {
+		data.ll = 0;
+		ret = 0;
+		p = addr;
+		switch (nb) {
+		case 8:
+			ret |= __get_user(data.v[0], p++);
+			ret |= __get_user(data.v[1], p++);
+			ret |= __get_user(data.v[2], p++);
+			ret |= __get_user(data.v[3], p++);
+		case 4:
+			ret |= __get_user(data.v[4], p++);
+			ret |= __get_user(data.v[5], p++);
+		case 2:
+			ret |= __get_user(data.v[6], p++);
+			ret |= __get_user(data.v[7], p++);
+			if (ret)
+				return -EFAULT;
+		}
+	}
+	
+	/* If we are storing, get the data from the saved gpr or fpr */
+	if (flags & ST) {
+		if (flags & F) {
+			if (nb == 4) {
+				/* Doing stfs, have to convert to single */
+				preempt_disable();
+				enable_kernel_fp();
+				cvt_df(&current->thread.fpr[reg], (float *)&data.v[4], &current->thread.fpscr);
+				disable_kernel_fp();
+				preempt_enable();
+			}
+			else
+				data.dd = current->thread.fpr[reg];
+		}
+		else 
+			data.ll = regs->gpr[reg];
+	}
+	
+	/* Swap bytes as needed */
+	if (flags & SW) {
+		if (nb == 2)
+			SWAP(data.v[6], data.v[7]);
+		else {	/* nb must be 4 */
+			SWAP(data.v[4], data.v[7]);
+			SWAP(data.v[5], data.v[6]);
+		}
+	}
+	
+	/* Sign extend as needed */
+	if (flags & SE) {
+		if ( nb == 2 )
+			data.ll = data.x16.low16;
+		else	/* nb must be 4 */
+			data.ll = data.x32.low32;
+	}
+	
+	/* If we are loading, move the data to the gpr or fpr */
+	if (flags & LD) {
+		if (flags & F) {
+			if (nb == 4) {
+				/* Doing lfs, have to convert to double */
+				preempt_disable();
+				enable_kernel_fp();
+				cvt_fd((float *)&data.v[4], &current->thread.fpr[reg], &current->thread.fpscr);
+				disable_kernel_fp();
+				preempt_enable();
+			}
+			else
+				current->thread.fpr[reg] = data.dd;
+		}
+		else
+			regs->gpr[reg] = data.ll;
+	}
+	
+	/* If we are storing, copy the data to the user */
+	if (flags & ST) {
+		ret = 0;
+		p = addr;
+		switch (nb) {
+		case 128:	/* Special case - must be dcbz */
+			lp = (unsigned long __user *)p;
+			for (i = 0; i < L1_CACHE_BYTES / sizeof(long); ++i)
+				ret |= __put_user(0, lp++);
+			break;
+		case 8:
+			ret |= __put_user(data.v[0], p++);
+			ret |= __put_user(data.v[1], p++);
+			ret |= __put_user(data.v[2], p++);
+			ret |= __put_user(data.v[3], p++);
+		case 4:
+			ret |= __put_user(data.v[4], p++);
+			ret |= __put_user(data.v[5], p++);
+		case 2:
+			ret |= __put_user(data.v[6], p++);
+			ret |= __put_user(data.v[7], p++);
+		}
+		if (ret)
+			return -EFAULT;
+	}
+	
+	/* Update RA as needed */
+	if (flags & U) {
+		regs->gpr[areg] = regs->dar;
+	}
+
+	return 1;
+}
+
diff --git a/arch/ppc64/kernel/asm-offsets.c b/arch/ppc64/kernel/asm-offsets.c
new file mode 100644
index 00000000000..0094ac79a18
--- /dev/null
+++ b/arch/ppc64/kernel/asm-offsets.c
@@ -0,0 +1,193 @@
+/*
+ * This program is used to generate definitions needed by
+ * assembly language modules.
+ *
+ * We use the technique used in the OSF Mach kernel code:
+ * generate asm statements containing #defines,
+ * compile this file to assembler, and then extract the
+ * #defines from the assembly-language output.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/hardirq.h>
+#include <asm/io.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+
+#include <asm/paca.h>
+#include <asm/lppaca.h>
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/iSeries/HvLpEvent.h>
+#include <asm/rtas.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/systemcfg.h>
+#include <asm/compat.h>
+
+#define DEFINE(sym, val) \
+	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+int main(void)
+{
+	/* thread struct on stack */
+	DEFINE(THREAD_SHIFT, THREAD_SHIFT);
+	DEFINE(THREAD_SIZE, THREAD_SIZE);
+	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
+	DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+	DEFINE(TI_SC_NOERR, offsetof(struct thread_info, syscall_noerror));
+
+	/* task_struct->thread */
+	DEFINE(THREAD, offsetof(struct task_struct, thread));
+	DEFINE(PT_REGS, offsetof(struct thread_struct, regs));
+	DEFINE(THREAD_FPEXC_MODE, offsetof(struct thread_struct, fpexc_mode));
+	DEFINE(THREAD_FPR0, offsetof(struct thread_struct, fpr[0]));
+	DEFINE(THREAD_FPSCR, offsetof(struct thread_struct, fpscr));
+	DEFINE(KSP, offsetof(struct thread_struct, ksp));
+	DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid));
+
+#ifdef CONFIG_ALTIVEC
+	DEFINE(THREAD_VR0, offsetof(struct thread_struct, vr[0]));
+	DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave));
+	DEFINE(THREAD_VSCR, offsetof(struct thread_struct, vscr));
+	DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr));
+#endif /* CONFIG_ALTIVEC */
+	DEFINE(MM, offsetof(struct task_struct, mm));
+
+	DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size));
+	DEFINE(DCACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_dline_size));
+	DEFINE(DCACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, dlines_per_page));
+	DEFINE(ICACHEL1LINESIZE, offsetof(struct ppc64_caches, iline_size));
+	DEFINE(ICACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_iline_size));
+	DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
+	DEFINE(PLATFORM, offsetof(struct systemcfg, platform));
+
+	/* paca */
+        DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+        DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
+        DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
+        DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
+	DEFINE(PACACURRENT, offsetof(struct paca_struct, __current));
+        DEFINE(PACASAVEDMSR, offsetof(struct paca_struct, saved_msr));
+        DEFINE(PACASTABREAL, offsetof(struct paca_struct, stab_real));
+        DEFINE(PACASTABVIRT, offsetof(struct paca_struct, stab_addr));
+	DEFINE(PACASTABRR, offsetof(struct paca_struct, stab_rr));
+        DEFINE(PACAR1, offsetof(struct paca_struct, saved_r1));
+	DEFINE(PACATOC, offsetof(struct paca_struct, kernel_toc));
+	DEFINE(PACAPROCENABLED, offsetof(struct paca_struct, proc_enabled));
+	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
+	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
+	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
+#ifdef CONFIG_HUGETLB_PAGE
+	DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs));
+#endif /* CONFIG_HUGETLB_PAGE */
+	DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr));
+        DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
+        DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc));
+        DEFINE(PACA_EXSLB, offsetof(struct paca_struct, exslb));
+        DEFINE(PACA_EXDSI, offsetof(struct paca_struct, exdsi));
+        DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
+	DEFINE(PACALPPACA, offsetof(struct paca_struct, lppaca));
+	DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
+	DEFINE(LPPACASRR0, offsetof(struct lppaca, saved_srr0));
+	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
+	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
+	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+
+	/* RTAS */
+	DEFINE(RTASBASE, offsetof(struct rtas_t, base));
+	DEFINE(RTASENTRY, offsetof(struct rtas_t, entry));
+
+	/* Interrupt register frame */
+	DEFINE(STACK_FRAME_OVERHEAD, STACK_FRAME_OVERHEAD);
+
+	DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs));
+
+	/* 288 = # of volatile regs, int & fp, for leaf routines */
+	/* which do not stack a frame.  See the PPC64 ABI.       */
+	DEFINE(INT_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 288);
+	/* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. */
+	DEFINE(PROM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16);
+	DEFINE(RTAS_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16);
+	DEFINE(GPR0, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[0]));
+	DEFINE(GPR1, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[1]));
+	DEFINE(GPR2, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[2]));
+	DEFINE(GPR3, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[3]));
+	DEFINE(GPR4, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[4]));
+	DEFINE(GPR5, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[5]));
+	DEFINE(GPR6, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[6]));
+	DEFINE(GPR7, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[7]));
+	DEFINE(GPR8, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[8]));
+	DEFINE(GPR9, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[9]));
+	DEFINE(GPR10, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[10]));
+	DEFINE(GPR11, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[11]));
+	DEFINE(GPR12, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[12]));
+	DEFINE(GPR13, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[13]));
+	/*
+	 * Note: these symbols include _ because they overlap with special
+	 * register names
+	 */
+	DEFINE(_NIP, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, nip));
+	DEFINE(_MSR, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, msr));
+	DEFINE(_CTR, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, ctr));
+	DEFINE(_LINK, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, link));
+	DEFINE(_CCR, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, ccr));
+	DEFINE(_XER, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, xer));
+	DEFINE(_DAR, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, dar));
+	DEFINE(_DSISR, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, dsisr));
+	DEFINE(ORIG_GPR3, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, orig_gpr3));
+	DEFINE(RESULT, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, result));
+	DEFINE(_TRAP, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, trap));
+	DEFINE(SOFTE, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, softe));
+
+	/* These _only_ to be used with {PROM,RTAS}_FRAME_SIZE!!! */
+	DEFINE(_SRR0, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs));
+	DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8);
+
+	DEFINE(CLONE_VM, CLONE_VM);
+	DEFINE(CLONE_UNTRACED, CLONE_UNTRACED);
+
+	/* About the CPU features table */
+	DEFINE(CPU_SPEC_ENTRY_SIZE, sizeof(struct cpu_spec));
+	DEFINE(CPU_SPEC_PVR_MASK, offsetof(struct cpu_spec, pvr_mask));
+	DEFINE(CPU_SPEC_PVR_VALUE, offsetof(struct cpu_spec, pvr_value));
+	DEFINE(CPU_SPEC_FEATURES, offsetof(struct cpu_spec, cpu_features));
+	DEFINE(CPU_SPEC_SETUP, offsetof(struct cpu_spec, cpu_setup));
+
+	/* systemcfg offsets for use by vdso */
+	DEFINE(CFG_TB_ORIG_STAMP, offsetof(struct systemcfg, tb_orig_stamp));
+	DEFINE(CFG_TB_TICKS_PER_SEC, offsetof(struct systemcfg, tb_ticks_per_sec));
+	DEFINE(CFG_TB_TO_XS, offsetof(struct systemcfg, tb_to_xs));
+	DEFINE(CFG_STAMP_XSEC, offsetof(struct systemcfg, stamp_xsec));
+	DEFINE(CFG_TB_UPDATE_COUNT, offsetof(struct systemcfg, tb_update_count));
+	DEFINE(CFG_TZ_MINUTEWEST, offsetof(struct systemcfg, tz_minuteswest));
+	DEFINE(CFG_TZ_DSTTIME, offsetof(struct systemcfg, tz_dsttime));
+	DEFINE(CFG_SYSCALL_MAP32, offsetof(struct systemcfg, syscall_map_32));
+	DEFINE(CFG_SYSCALL_MAP64, offsetof(struct systemcfg, syscall_map_64));
+
+	/* timeval/timezone offsets for use by vdso */
+	DEFINE(TVAL64_TV_SEC, offsetof(struct timeval, tv_sec));
+	DEFINE(TVAL64_TV_USEC, offsetof(struct timeval, tv_usec));
+	DEFINE(TVAL32_TV_SEC, offsetof(struct compat_timeval, tv_sec));
+	DEFINE(TVAL32_TV_USEC, offsetof(struct compat_timeval, tv_usec));
+	DEFINE(TZONE_TZ_MINWEST, offsetof(struct timezone, tz_minuteswest));
+	DEFINE(TZONE_TZ_DSTTIME, offsetof(struct timezone, tz_dsttime));
+
+	return 0;
+}
diff --git a/arch/ppc64/kernel/binfmt_elf32.c b/arch/ppc64/kernel/binfmt_elf32.c
new file mode 100644
index 00000000000..fadc699a049
--- /dev/null
+++ b/arch/ppc64/kernel/binfmt_elf32.c
@@ -0,0 +1,78 @@
+/*
+ * binfmt_elf32.c: Support 32-bit PPC ELF binaries on Power3 and followons.
+ * based on the SPARC64 version.
+ * Copyright (C) 1995, 1996, 1997, 1998 David S. Miller	(davem@redhat.com)
+ * Copyright (C) 1995, 1996, 1997, 1998 Jakub Jelinek	(jj@ultra.linux.cz)
+ *
+ * Copyright (C) 2000,2001 Ken Aaker (kdaaker@rchland.vnet.ibm.com), IBM Corp
+ * Copyright (C) 2001 Anton Blanchard (anton@au.ibm.com), IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define ELF_ARCH		EM_PPC
+#define ELF_CLASS		ELFCLASS32
+#define ELF_DATA		ELFDATA2MSB;
+
+#include <asm/processor.h>
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/elfcore.h>
+#include <linux/compat.h>
+
+#define elf_prstatus elf_prstatus32
+struct elf_prstatus32
+{
+	struct elf_siginfo pr_info;	/* Info associated with signal */
+	short	pr_cursig;		/* Current signal */
+	unsigned int pr_sigpend;	/* Set of pending signals */
+	unsigned int pr_sighold;	/* Set of held signals */
+	pid_t	pr_pid;
+	pid_t	pr_ppid;
+	pid_t	pr_pgrp;
+	pid_t	pr_sid;
+	struct compat_timeval pr_utime;	/* User time */
+	struct compat_timeval pr_stime;	/* System time */
+	struct compat_timeval pr_cutime;	/* Cumulative user time */
+	struct compat_timeval pr_cstime;	/* Cumulative system time */
+	elf_gregset_t pr_reg;		/* General purpose registers. */
+	int pr_fpvalid;		/* True if math co-processor being used. */
+};
+
+#define elf_prpsinfo elf_prpsinfo32
+struct elf_prpsinfo32
+{
+	char	pr_state;	/* numeric process state */
+	char	pr_sname;	/* char for pr_state */
+	char	pr_zomb;	/* zombie */
+	char	pr_nice;	/* nice val */
+	unsigned int pr_flag;	/* flags */
+	u32	pr_uid;
+	u32	pr_gid;
+	pid_t	pr_pid, pr_ppid, pr_pgrp, pr_sid;
+	/* Lots missing */
+	char	pr_fname[16];	/* filename of executable */
+	char	pr_psargs[ELF_PRARGSZ];	/* initial part of arg list */
+};
+
+#include <linux/time.h>
+
+#undef cputime_to_timeval
+#define cputime_to_timeval cputime_to_compat_timeval
+static __inline__ void
+cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value)
+{
+	unsigned long jiffies = cputime_to_jiffies(cputime);
+	value->tv_usec = (jiffies % HZ) * (1000000L / HZ);
+	value->tv_sec = jiffies / HZ;
+}
+
+extern void start_thread32(struct pt_regs *, unsigned long, unsigned long);
+#undef start_thread
+#define start_thread start_thread32
+#define init_elf_binfmt init_elf32_binfmt
+
+#include "../../../fs/binfmt_elf.c"
diff --git a/arch/ppc64/kernel/bitops.c b/arch/ppc64/kernel/bitops.c
new file mode 100644
index 00000000000..ae329e8b4ac
--- /dev/null
+++ b/arch/ppc64/kernel/bitops.c
@@ -0,0 +1,147 @@
+/*
+ * These are too big to be inlined.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+				 unsigned long offset)
+{
+	const unsigned long *p = addr + (offset >> 6);
+	unsigned long result = offset & ~63UL;
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset &= 63UL;
+	if (offset) {
+		tmp = *(p++);
+		tmp |= ~0UL >> (64 - offset);
+		if (size < 64)
+			goto found_first;
+		if (~tmp)
+			goto found_middle;
+		size -= 64;
+		result += 64;
+	}
+	while (size & ~63UL) {
+		if (~(tmp = *(p++)))
+			goto found_middle;
+		result += 64;
+		size -= 64;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp |= ~0UL << size;
+	if (tmp == ~0UL)	/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + ffz(tmp);
+}
+
+EXPORT_SYMBOL(find_next_zero_bit);
+
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	const unsigned long *p = addr + (offset >> 6);
+	unsigned long result = offset & ~63UL;
+	unsigned long tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset &= 63UL;
+	if (offset) {
+		tmp = *(p++);
+		tmp &= (~0UL << offset);
+		if (size < 64)
+			goto found_first;
+		if (tmp)
+			goto found_middle;
+		size -= 64;
+		result += 64;
+	}
+	while (size & ~63UL) {
+		if ((tmp = *(p++)))
+			goto found_middle;
+		result += 64;
+		size -= 64;
+	}
+	if (!size)
+		return result;
+	tmp = *p;
+
+found_first:
+	tmp &= (~0UL >> (64 - size));
+	if (tmp == 0UL)		/* Are any bits set? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + __ffs(tmp);
+}
+
+EXPORT_SYMBOL(find_next_bit);
+
+static inline unsigned int ext2_ilog2(unsigned int x)
+{
+	int lz;
+
+	asm("cntlzw %0,%1": "=r"(lz):"r"(x));
+	return 31 - lz;
+}
+
+static inline unsigned int ext2_ffz(unsigned int x)
+{
+	u32 rc;
+	if ((x = ~x) == 0)
+		return 32;
+	rc = ext2_ilog2(x & -x);
+	return rc;
+}
+
+unsigned long find_next_zero_le_bit(const unsigned long *addr, unsigned long size,
+				    unsigned long offset)
+{
+	const unsigned int *p = ((const unsigned int *)addr) + (offset >> 5);
+	unsigned int result = offset & ~31;
+	unsigned int tmp;
+
+	if (offset >= size)
+		return size;
+	size -= result;
+	offset &= 31;
+	if (offset) {
+		tmp = cpu_to_le32p(p++);
+		tmp |= ~0U >> (32 - offset);	/* bug or feature ? */
+		if (size < 32)
+			goto found_first;
+		if (tmp != ~0)
+			goto found_middle;
+		size -= 32;
+		result += 32;
+	}
+	while (size >= 32) {
+		if ((tmp = cpu_to_le32p(p++)) != ~0)
+			goto found_middle;
+		result += 32;
+		size -= 32;
+	}
+	if (!size)
+		return result;
+	tmp = cpu_to_le32p(p);
+found_first:
+	tmp |= ~0 << size;
+	if (tmp == ~0)		/* Are any bits zero? */
+		return result + size;	/* Nope. */
+found_middle:
+	return result + ext2_ffz(tmp);
+}
+
+EXPORT_SYMBOL(find_next_zero_le_bit);
diff --git a/arch/ppc64/kernel/btext.c b/arch/ppc64/kernel/btext.c
new file mode 100644
index 00000000000..c53f079e9b7
--- /dev/null
+++ b/arch/ppc64/kernel/btext.c
@@ -0,0 +1,751 @@
+/*
+ * Procedures for drawing on the screen early on in the boot process.
+ *
+ * Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/version.h>
+
+#include <asm/sections.h>
+#include <asm/prom.h>
+#include <asm/btext.h>
+#include <asm/prom.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/lmb.h>
+#include <asm/processor.h>
+
+#undef NO_SCROLL
+
+#ifndef NO_SCROLL
+static void scrollscreen(void);
+#endif
+
+static void draw_byte(unsigned char c, long locX, long locY);
+static void draw_byte_32(unsigned char *bits, unsigned int *base, int rb);
+static void draw_byte_16(unsigned char *bits, unsigned int *base, int rb);
+static void draw_byte_8(unsigned char *bits, unsigned int *base, int rb);
+
+static int g_loc_X;
+static int g_loc_Y;
+static int g_max_loc_X;
+static int g_max_loc_Y;
+
+static int dispDeviceRowBytes;
+static int dispDeviceDepth;
+static int dispDeviceRect[4];
+static unsigned char *dispDeviceBase, *logicalDisplayBase;
+
+unsigned long disp_BAT[2] __initdata = {0, 0};
+
+#define cmapsz	(16*256)
+
+static unsigned char vga_font[cmapsz];
+
+int boot_text_mapped;
+int force_printk_to_btext = 0;
+
+
+/* Here's a small text engine to use during early boot
+ * or for debugging purposes
+ *
+ * todo:
+ *
+ *  - build some kind of vgacon with it to enable early printk
+ *  - move to a separate file
+ *  - add a few video driver hooks to keep in sync with display
+ *    changes.
+ */
+
+void map_boot_text(void)
+{
+	unsigned long base, offset, size;
+	unsigned char *vbase;
+
+	/* By default, we are no longer mapped */
+	boot_text_mapped = 0;
+	if (dispDeviceBase == 0)
+		return;
+	base = ((unsigned long) dispDeviceBase) & 0xFFFFF000UL;
+	offset = ((unsigned long) dispDeviceBase) - base;
+	size = dispDeviceRowBytes * dispDeviceRect[3] + offset
+		+ dispDeviceRect[0];
+	vbase = __ioremap(base, size, _PAGE_NO_CACHE);
+	if (vbase == 0)
+		return;
+	logicalDisplayBase = vbase + offset;
+	boot_text_mapped = 1;
+}
+
+int btext_initialize(struct device_node *np)
+{
+	unsigned int width, height, depth, pitch;
+	unsigned long address = 0;
+	u32 *prop;
+
+	prop = (u32 *)get_property(np, "width", NULL);
+	if (prop == NULL)
+		return -EINVAL;
+	width = *prop;
+	prop = (u32 *)get_property(np, "height", NULL);
+	if (prop == NULL)
+		return -EINVAL;
+	height = *prop;
+	prop = (u32 *)get_property(np, "depth", NULL);
+	if (prop == NULL)
+		return -EINVAL;
+	depth = *prop;
+	pitch = width * ((depth + 7) / 8);
+	prop = (u32 *)get_property(np, "linebytes", NULL);
+	if (prop)
+		pitch = *prop;
+	if (pitch == 1)
+		pitch = 0x1000;
+	prop = (u32 *)get_property(np, "address", NULL);
+	if (prop)
+		address = *prop;
+
+	/* FIXME: Add support for PCI reg properties */
+
+	if (address == 0)
+		return -EINVAL;
+
+	g_loc_X = 0;
+	g_loc_Y = 0;
+	g_max_loc_X = width / 8;
+	g_max_loc_Y = height / 16;
+	logicalDisplayBase = (unsigned char *)address;
+	dispDeviceBase = (unsigned char *)address;
+	dispDeviceRowBytes = pitch;
+	dispDeviceDepth = depth;
+	dispDeviceRect[0] = dispDeviceRect[1] = 0;
+	dispDeviceRect[2] = width;
+	dispDeviceRect[3] = height;
+
+	map_boot_text();
+
+	return 0;
+}
+
+
+/* Calc the base address of a given point (x,y) */
+static unsigned char * calc_base(int x, int y)
+{
+	unsigned char *base;
+
+	base = logicalDisplayBase;
+	if (base == 0)
+		base = dispDeviceBase;
+	base += (x + dispDeviceRect[0]) * (dispDeviceDepth >> 3);
+	base += (y + dispDeviceRect[1]) * dispDeviceRowBytes;
+	return base;
+}
+
+/* Adjust the display to a new resolution */
+void btext_update_display(unsigned long phys, int width, int height,
+			  int depth, int pitch)
+{
+	if (dispDeviceBase == 0)
+		return;
+
+	/* check it's the same frame buffer (within 256MB) */
+	if ((phys ^ (unsigned long)dispDeviceBase) & 0xf0000000)
+		return;
+
+	dispDeviceBase = (__u8 *) phys;
+	dispDeviceRect[0] = 0;
+	dispDeviceRect[1] = 0;
+	dispDeviceRect[2] = width;
+	dispDeviceRect[3] = height;
+	dispDeviceDepth = depth;
+	dispDeviceRowBytes = pitch;
+	if (boot_text_mapped) {
+		iounmap(logicalDisplayBase);
+		boot_text_mapped = 0;
+	}
+	map_boot_text();
+	g_loc_X = 0;
+	g_loc_Y = 0;
+	g_max_loc_X = width / 8;
+	g_max_loc_Y = height / 16;
+}
+
+void btext_clearscreen(void)
+{
+	unsigned long *base	= (unsigned long *)calc_base(0, 0);
+	unsigned long width 	= ((dispDeviceRect[2] - dispDeviceRect[0]) *
+					(dispDeviceDepth >> 3)) >> 3;
+	int i,j;
+
+	for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1]); i++)
+	{
+		unsigned long *ptr = base;
+		for(j=width; j; --j)
+			*(ptr++) = 0;
+		base += (dispDeviceRowBytes >> 3);
+	}
+}
+
+#ifndef NO_SCROLL
+static void scrollscreen(void)
+{
+	unsigned long *src     	= (unsigned long *)calc_base(0,16);
+	unsigned long *dst     	= (unsigned long *)calc_base(0,0);
+	unsigned long width    	= ((dispDeviceRect[2] - dispDeviceRect[0]) *
+				   (dispDeviceDepth >> 3)) >> 3;
+	int i,j;
+
+	for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1] - 16); i++)
+	{
+		unsigned long *src_ptr = src;
+		unsigned long *dst_ptr = dst;
+		for(j=width; j; --j)
+			*(dst_ptr++) = *(src_ptr++);
+		src += (dispDeviceRowBytes >> 3);
+		dst += (dispDeviceRowBytes >> 3);
+	}
+	for (i=0; i<16; i++)
+	{
+		unsigned long *dst_ptr = dst;
+		for(j=width; j; --j)
+			*(dst_ptr++) = 0;
+		dst += (dispDeviceRowBytes >> 3);
+	}
+}
+#endif /* ndef NO_SCROLL */
+
+void btext_drawchar(char c)
+{
+	int cline = 0;
+#ifdef NO_SCROLL
+	int x;
+#endif
+	if (!boot_text_mapped)
+		return;
+
+	switch (c) {
+	case '\b':
+		if (g_loc_X > 0)
+			--g_loc_X;
+		break;
+	case '\t':
+		g_loc_X = (g_loc_X & -8) + 8;
+		break;
+	case '\r':
+		g_loc_X = 0;
+		break;
+	case '\n':
+		g_loc_X = 0;
+		g_loc_Y++;
+		cline = 1;
+		break;
+	default:
+		draw_byte(c, g_loc_X++, g_loc_Y);
+	}
+	if (g_loc_X >= g_max_loc_X) {
+		g_loc_X = 0;
+		g_loc_Y++;
+		cline = 1;
+	}
+#ifndef NO_SCROLL
+	while (g_loc_Y >= g_max_loc_Y) {
+		scrollscreen();
+		g_loc_Y--;
+	}
+#else
+	/* wrap around from bottom to top of screen so we don't
+	   waste time scrolling each line.  -- paulus. */
+	if (g_loc_Y >= g_max_loc_Y)
+		g_loc_Y = 0;
+	if (cline) {
+		for (x = 0; x < g_max_loc_X; ++x)
+			draw_byte(' ', x, g_loc_Y);
+	}
+#endif
+}
+
+void btext_drawstring(const char *c)
+{
+	if (!boot_text_mapped)
+		return;
+	while (*c)
+		btext_drawchar(*c++);
+}
+
+void btext_drawhex(unsigned long v)
+{
+	char *hex_table = "0123456789abcdef";
+
+	if (!boot_text_mapped)
+		return;
+	btext_drawchar(hex_table[(v >> 60) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 56) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 52) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 48) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 44) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 40) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 36) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 32) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 28) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 24) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 20) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 16) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >> 12) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >>  8) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >>  4) & 0x0000000FUL]);
+	btext_drawchar(hex_table[(v >>  0) & 0x0000000FUL]);
+	btext_drawchar(' ');
+}
+
+static void draw_byte(unsigned char c, long locX, long locY)
+{
+	unsigned char *base	= calc_base(locX << 3, locY << 4);
+	unsigned char *font	= &vga_font[((unsigned int)c) * 16];
+	int rb			= dispDeviceRowBytes;
+
+	switch(dispDeviceDepth) {
+	case 24:
+	case 32:
+		draw_byte_32(font, (unsigned int *)base, rb);
+		break;
+	case 15:
+	case 16:
+		draw_byte_16(font, (unsigned int *)base, rb);
+		break;
+	case 8:
+		draw_byte_8(font, (unsigned int *)base, rb);
+		break;
+	}
+}
+
+static unsigned int expand_bits_8[16] = {
+	0x00000000,
+	0x000000ff,
+	0x0000ff00,
+	0x0000ffff,
+	0x00ff0000,
+	0x00ff00ff,
+	0x00ffff00,
+	0x00ffffff,
+	0xff000000,
+	0xff0000ff,
+	0xff00ff00,
+	0xff00ffff,
+	0xffff0000,
+	0xffff00ff,
+	0xffffff00,
+	0xffffffff
+};
+
+static unsigned int expand_bits_16[4] = {
+	0x00000000,
+	0x0000ffff,
+	0xffff0000,
+	0xffffffff
+};
+
+
+static void draw_byte_32(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0xFFFFFFFFUL;
+	int bg = 0x00000000UL;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (-(bits >> 7) & fg) ^ bg;
+		base[1] = (-((bits >> 6) & 1) & fg) ^ bg;
+		base[2] = (-((bits >> 5) & 1) & fg) ^ bg;
+		base[3] = (-((bits >> 4) & 1) & fg) ^ bg;
+		base[4] = (-((bits >> 3) & 1) & fg) ^ bg;
+		base[5] = (-((bits >> 2) & 1) & fg) ^ bg;
+		base[6] = (-((bits >> 1) & 1) & fg) ^ bg;
+		base[7] = (-(bits & 1) & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static void draw_byte_16(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0xFFFFFFFFUL;
+	int bg = 0x00000000UL;
+	unsigned int *eb = (int *)expand_bits_16;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (eb[bits >> 6] & fg) ^ bg;
+		base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg;
+		base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg;
+		base[3] = (eb[bits & 3] & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static void draw_byte_8(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0x0F0F0F0FUL;
+	int bg = 0x00000000UL;
+	unsigned int *eb = (int *)expand_bits_8;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (eb[bits >> 4] & fg) ^ bg;
+		base[1] = (eb[bits & 0xf] & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static unsigned char vga_font[cmapsz] = {
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd,
+0x99, 0x81, 0x81, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xff,
+0xdb, 0xff, 0xff, 0xc3, 0xe7, 0xff, 0xff, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x7c, 0xfe,
+0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18,
+0x3c, 0x3c, 0xe7, 0xe7, 0xe7, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x7e, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c,
+0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00,
+0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xc3, 0x99, 0xbd,
+0xbd, 0x99, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x1e, 0x0e,
+0x1a, 0x32, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x3c, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x30,
+0x30, 0x70, 0xf0, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x63,
+0x7f, 0x63, 0x63, 0x63, 0x63, 0x67, 0xe7, 0xe6, 0xc0, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x18, 0x18, 0xdb, 0x3c, 0xe7, 0x3c, 0xdb, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfe, 0xf8,
+0xf0, 0xe0, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x06, 0x0e,
+0x1e, 0x3e, 0xfe, 0x3e, 0x1e, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+0x66, 0x00, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xdb,
+0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x7c, 0xc6, 0x60, 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x38, 0x0c, 0xc6,
+0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0xfe, 0xfe, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c,
+0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0xc0,
+0xc0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x38, 0x7c, 0x7c, 0xfe, 0xfe, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0x7c, 0x7c,
+0x38, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x24, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6c,
+0x6c, 0xfe, 0x6c, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x18, 0x7c, 0xc6, 0xc2, 0xc0, 0x7c, 0x06, 0x06, 0x86, 0xc6, 0x7c,
+0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, 0xc6, 0x0c, 0x18,
+0x30, 0x60, 0xc6, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c,
+0x6c, 0x38, 0x76, 0xdc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x30, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30,
+0x30, 0x30, 0x18, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x18,
+0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e,
+0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x02, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xce, 0xde, 0xf6, 0xe6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x38, 0x78, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6,
+0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0x06, 0x06, 0x3c, 0x06, 0x06, 0x06, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x1c, 0x3c, 0x6c, 0xcc, 0xfe,
+0x0c, 0x0c, 0x0c, 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0,
+0xc0, 0xc0, 0xfc, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x60, 0xc0, 0xc0, 0xfc, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc6, 0x06, 0x06, 0x0c, 0x18,
+0x30, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6,
+0xc6, 0xc6, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x06, 0x06, 0x0c, 0x78,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00,
+0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x06,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00,
+0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60,
+0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x0c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xde, 0xde,
+0xde, 0xdc, 0xc0, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38,
+0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x66, 0x66, 0x66, 0x66, 0xfc,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0,
+0xc0, 0xc2, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x6c,
+0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x62, 0x66, 0xfe,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68,
+0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66,
+0xc2, 0xc0, 0xc0, 0xde, 0xc6, 0xc6, 0x66, 0x3a, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x0c,
+0x0c, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xe6, 0x66, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0x66, 0xe6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x60, 0x60, 0x60, 0x60, 0x60,
+0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xe7,
+0xff, 0xff, 0xdb, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0xc6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66,
+0x66, 0x66, 0x7c, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xd6, 0xde, 0x7c,
+0x0c, 0x0e, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x6c,
+0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6,
+0xc6, 0x60, 0x38, 0x0c, 0x06, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xff, 0xdb, 0x99, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3,
+0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x66,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18,
+0x3c, 0x66, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3,
+0xc3, 0x66, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xff, 0xc3, 0x86, 0x0c, 0x18, 0x30, 0x60, 0xc1, 0xc3, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x30, 0x30, 0x30, 0x30, 0x30,
+0x30, 0x30, 0x30, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+0xc0, 0xe0, 0x70, 0x38, 0x1c, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x3c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
+0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0x60,
+0x60, 0x78, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc0, 0xc0, 0xc0, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x0c, 0x0c, 0x3c, 0x6c, 0xcc,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xf0,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc,
+0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xcc, 0x78, 0x00, 0x00, 0x00, 0xe0, 0x60,
+0x60, 0x6c, 0x76, 0x66, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x06, 0x00, 0x0e, 0x06, 0x06,
+0x06, 0x06, 0x06, 0x06, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0xe0, 0x60,
+0x60, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe6, 0xff, 0xdb,
+0xdb, 0xdb, 0xdb, 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66,
+0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0x0c, 0x1e, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0x60, 0x60, 0xf0,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0x60,
+0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x30,
+0x30, 0xfc, 0x30, 0x30, 0x30, 0x30, 0x36, 0x1c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3,
+0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0x3c, 0x66, 0xc3,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xfe, 0xcc, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x0e, 0x18, 0x18, 0x18, 0x70, 0x18, 0x18, 0x18, 0x18, 0x0e,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x18,
+0x18, 0x18, 0x0e, 0x18, 0x18, 0x18, 0x18, 0x70, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6,
+0xc6, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66,
+0xc2, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x0c, 0x06, 0x7c, 0x00, 0x00,
+0x00, 0x00, 0xcc, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x00, 0x7c, 0xc6, 0xfe,
+0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c,
+0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xcc, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x00, 0x78, 0x0c, 0x7c,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x38,
+0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x60, 0x60, 0x66, 0x3c, 0x0c, 0x06,
+0x3c, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xfe,
+0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00,
+0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x38, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, 0x66,
+0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x60, 0x30, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6,
+0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x38, 0x00,
+0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x30, 0x60, 0x00, 0xfe, 0x66, 0x60, 0x7c, 0x60, 0x60, 0x66, 0xfe,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0x3b, 0x1b,
+0x7e, 0xd8, 0xdc, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3e, 0x6c,
+0xcc, 0xcc, 0xfe, 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18,
+0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x30, 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x00, 0xcc, 0xcc, 0xcc,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00,
+0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0x78, 0x00,
+0x00, 0xc6, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e,
+0xc3, 0xc0, 0xc0, 0xc0, 0xc3, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xe6, 0xfc,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0xff, 0x18,
+0xff, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66, 0x66,
+0x7c, 0x62, 0x66, 0x6f, 0x66, 0x66, 0x66, 0xf3, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x0e, 0x1b, 0x18, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18,
+0xd8, 0x70, 0x00, 0x00, 0x00, 0x18, 0x30, 0x60, 0x00, 0x78, 0x0c, 0x7c,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30,
+0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x18, 0x30, 0x60, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x30, 0x60, 0x00, 0xcc, 0xcc, 0xcc,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc,
+0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
+0x76, 0xdc, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x6c,
+0x38, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x30, 0x30, 0x00, 0x30, 0x30, 0x60, 0xc0, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0,
+0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xfe, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x60, 0xce, 0x9b, 0x06,
+0x0c, 0x1f, 0x00, 0x00, 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30,
+0x66, 0xce, 0x96, 0x3e, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18,
+0x00, 0x18, 0x18, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x6c, 0xd8, 0x6c, 0x36, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, 0x6c, 0x36,
+0x6c, 0xd8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x44, 0x11, 0x44,
+0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44,
+0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
+0x55, 0xaa, 0x55, 0xaa, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77,
+0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0xf8,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36,
+0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0xf6,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x37,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36,
+0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xf0, 0xf0, 0xf0,
+0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+0x0f, 0x0f, 0x0f, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x76, 0xdc, 0xd8, 0xd8, 0xd8, 0xdc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xd8, 0xcc, 0xc6, 0xc6, 0xc6, 0xcc,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc6, 0xc6, 0xc0, 0xc0, 0xc0,
+0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0xfe, 0xc6, 0x60, 0x30, 0x18, 0x30, 0x60, 0xc6, 0xfe,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8,
+0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x18, 0x3c, 0x66, 0x66,
+0x66, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38,
+0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x6c, 0x6c, 0x6c, 0xee,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x30, 0x18, 0x0c, 0x3e, 0x66,
+0x66, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x7e, 0xdb, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x03, 0x06, 0x7e, 0xdb, 0xdb, 0xf3, 0x7e, 0x60, 0xc0,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x30, 0x60, 0x60, 0x7c, 0x60,
+0x60, 0x60, 0x30, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c,
+0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18,
+0x18, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30,
+0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x00, 0x7e,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x1b, 0x1b, 0x1b, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x7e, 0x00, 0x18, 0x18, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x00,
+0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x6c,
+0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x0c, 0x0c,
+0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x6c, 0x3c, 0x1c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xd8, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0xd8, 0x30, 0x60, 0xc8, 0xf8, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00,
+};
diff --git a/arch/ppc64/kernel/cpu_setup_power4.S b/arch/ppc64/kernel/cpu_setup_power4.S
new file mode 100644
index 00000000000..3bd95182085
--- /dev/null
+++ b/arch/ppc64/kernel/cpu_setup_power4.S
@@ -0,0 +1,214 @@
+/*
+ * This file contains low level CPU setup functions.
+ *    Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/cache.h>
+
+_GLOBAL(__970_cpu_preinit)
+	/*
+	 * Do nothing if not running in HV mode
+	 */
+	mfmsr	r0
+	rldicl.	r0,r0,4,63
+	beqlr
+
+	/*
+	 * Deal only with PPC970 and PPC970FX.
+	 */
+	mfspr	r0,SPRN_PVR
+	srwi	r0,r0,16
+	cmpwi	cr0,r0,0x39
+	cmpwi	cr1,r0,0x3c
+	cror	4*cr0+eq,4*cr0+eq,4*cr1+eq
+	bnelr
+
+	/* Make sure HID4:rm_ci is off before MMU is turned off, that large
+	 * pages are enabled with HID4:61 and clear HID5:DCBZ_size and
+	 * HID5:DCBZ32_ill
+	 */
+	li	r0,0
+	mfspr	r3,SPRN_HID4
+	rldimi	r3,r0,40,23	/* clear bit 23 (rm_ci) */
+	rldimi	r3,r0,2,61	/* clear bit 61 (lg_pg_en) */
+	sync
+	mtspr	SPRN_HID4,r3
+	isync
+	sync
+	mfspr	r3,SPRN_HID5
+	rldimi	r3,r0,6,56	/* clear bits 56 & 57 (DCBZ*) */
+	sync
+	mtspr	SPRN_HID5,r3
+	isync
+	sync
+
+	/* Setup some basic HID1 features */
+	mfspr	r0,SPRN_HID1
+	li	r3,0x1200		/* enable i-fetch cacheability */
+	sldi	r3,r3,44		/* and prefetch */
+	or	r0,r0,r3
+	mtspr	SPRN_HID1,r0
+	mtspr	SPRN_HID1,r0
+	isync
+
+	/* Clear HIOR */
+	li	r0,0
+	sync
+	mtspr	SPRN_HIOR,0		/* Clear interrupt prefix */
+	isync
+	blr
+
+_GLOBAL(__setup_cpu_power4)
+	blr
+	
+_GLOBAL(__setup_cpu_ppc970)
+	mfspr	r0,SPRN_HID0
+	li	r11,5			/* clear DOZE and SLEEP */
+	rldimi	r0,r11,52,8		/* set NAP and DPM */
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	sync
+	isync
+	blr
+
+/* Definitions for the table use to save CPU states */
+#define CS_HID0		0
+#define CS_HID1		8
+#define	CS_HID4		16
+#define CS_HID5		24
+#define CS_SIZE		32
+
+	.data
+	.balign	L1_CACHE_BYTES,0
+cpu_state_storage:	
+	.space	CS_SIZE
+	.balign	L1_CACHE_BYTES,0
+	.text
+	
+/* Called in normal context to backup CPU 0 state. This
+ * does not include cache settings. This function is also
+ * called for machine sleep. This does not include the MMU
+ * setup, BATs, etc... but rather the "special" registers
+ * like HID0, HID1, HID4, etc...
+ */
+_GLOBAL(__save_cpu_setup)
+	/* Some CR fields are volatile, we back it up all */
+	mfcr	r7
+
+	/* Get storage ptr */
+	LOADADDR(r5,cpu_state_storage)
+
+	/* We only deal with 970 for now */
+	mfspr	r0,SPRN_PVR
+	srwi	r0,r0,16
+	cmpwi	cr0,r0,0x39
+	cmpwi	cr1,r0,0x3c
+	cror	4*cr0+eq,4*cr0+eq,4*cr1+eq
+	bne	1f
+
+	/* Save HID0,1,4 and 5 */
+	mfspr	r3,SPRN_HID0
+	std	r3,CS_HID0(r5)
+	mfspr	r3,SPRN_HID1
+	std	r3,CS_HID1(r5)
+	mfspr	r3,SPRN_HID4
+	std	r3,CS_HID4(r5)
+	mfspr	r3,SPRN_HID5
+	std	r3,CS_HID5(r5)
+	
+1:
+	mtcr	r7
+	blr
+
+/* Called with no MMU context (typically MSR:IR/DR off) to
+ * restore CPU state as backed up by the previous
+ * function. This does not include cache setting
+ */
+_GLOBAL(__restore_cpu_setup)
+	/* Get storage ptr (FIXME when using anton reloc as we
+	 * are running with translation disabled here
+	 */
+	LOADADDR(r5,cpu_state_storage)
+
+	/* We only deal with 970 for now */
+	mfspr	r0,SPRN_PVR
+	srwi	r0,r0,16
+	cmpwi	cr0,r0,0x39
+	cmpwi	cr1,r0,0x3c
+	cror	4*cr0+eq,4*cr0+eq,4*cr1+eq
+	bne	1f
+
+	/* Before accessing memory, we make sure rm_ci is clear */
+	li	r0,0
+	mfspr	r3,SPRN_HID4
+	rldimi	r3,r0,40,23	/* clear bit 23 (rm_ci) */
+	sync
+	mtspr	SPRN_HID4,r3
+	isync
+	sync
+
+	/* Clear interrupt prefix */
+	li	r0,0
+	sync
+	mtspr	SPRN_HIOR,0
+	isync
+
+	/* Restore HID0 */
+	ld	r3,CS_HID0(r5)
+	sync
+	isync
+	mtspr	SPRN_HID0,r3
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	sync
+	isync
+
+	/* Restore HID1 */
+	ld	r3,CS_HID1(r5)
+	sync
+	isync
+	mtspr	SPRN_HID1,r3
+	mtspr	SPRN_HID1,r3
+	sync
+	isync
+	
+	/* Restore HID4 */
+	ld	r3,CS_HID4(r5)
+	sync
+	isync
+	mtspr	SPRN_HID4,r3
+	sync
+	isync
+
+	/* Restore HID5 */
+	ld	r3,CS_HID5(r5)
+	sync
+	isync
+	mtspr	SPRN_HID5,r3
+	sync
+	isync
+1:
+	blr
+
diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c
new file mode 100644
index 00000000000..8644a864805
--- /dev/null
+++ b/arch/ppc64/kernel/cputable.c
@@ -0,0 +1,197 @@
+/*
+ *  arch/ppc64/kernel/cputable.c
+ *
+ *  Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  Modifications for ppc64:
+ *      Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ * 
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <asm/cputable.h>
+
+struct cpu_spec* cur_cpu_spec = NULL;
+EXPORT_SYMBOL(cur_cpu_spec);
+
+/* NOTE:
+ * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
+ * the responsibility of the appropriate CPU save/restore functions to
+ * eventually copy these settings over. Those save/restore aren't yet
+ * part of the cputable though. That has to be fixed for both ppc32
+ * and ppc64
+ */
+extern void __setup_cpu_power3(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_power4(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec);
+
+
+/* We only set the altivec features if the kernel was compiled with altivec
+ * support
+ */
+#ifdef CONFIG_ALTIVEC
+#define CPU_FTR_ALTIVEC_COMP	CPU_FTR_ALTIVEC
+#define PPC_FEATURE_HAS_ALTIVEC_COMP PPC_FEATURE_HAS_ALTIVEC
+#else
+#define CPU_FTR_ALTIVEC_COMP	0
+#define PPC_FEATURE_HAS_ALTIVEC_COMP    0
+#endif
+
+struct cpu_spec	cpu_specs[] = {
+    {	/* Power3 */
+	    0xffff0000, 0x00400000, "POWER3 (630)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_IABR | CPU_FTR_PMC8,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power3,
+	    COMMON_PPC64_FW
+    },
+    {	/* Power3+ */
+	    0xffff0000, 0x00410000, "POWER3 (630+)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_IABR | CPU_FTR_PMC8,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power3,
+	    COMMON_PPC64_FW
+    },
+    {	/* Northstar */
+	    0xffff0000, 0x00330000, "RS64-II (northstar)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power3,
+	    COMMON_PPC64_FW
+    },
+    {	/* Pulsar */
+	    0xffff0000, 0x00340000, "RS64-III (pulsar)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power3,
+	    COMMON_PPC64_FW
+    },
+    {	/* I-star */
+	    0xffff0000, 0x00360000, "RS64-III (icestar)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power3,
+	    COMMON_PPC64_FW
+    },
+    {	/* S-star */
+	    0xffff0000, 0x00370000, "RS64-IV (sstar)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power3,
+	    COMMON_PPC64_FW
+    },
+    {	/* Power4 */
+	    0xffff0000, 0x00350000, "POWER4 (gp)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power4,
+	    COMMON_PPC64_FW
+    },
+    {	/* Power4+ */
+	    0xffff0000, 0x00380000, "POWER4+ (gq)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power4,
+	    COMMON_PPC64_FW
+    },
+    {	/* PPC970 */
+	    0xffff0000, 0x00390000, "PPC970",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP |
+		    CPU_FTR_CAN_NAP | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64 | PPC_FEATURE_HAS_ALTIVEC_COMP,
+	    128, 128,
+	    __setup_cpu_ppc970,
+	    COMMON_PPC64_FW
+    },
+    {	/* PPC970FX */
+	    0xffff0000, 0x003c0000, "PPC970FX",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP |
+		    CPU_FTR_CAN_NAP | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
+	    COMMON_USER_PPC64 | PPC_FEATURE_HAS_ALTIVEC_COMP,
+	    128, 128,
+	    __setup_cpu_ppc970,
+	    COMMON_PPC64_FW
+    },
+    {	/* Power5 */
+	    0xffff0000, 0x003a0000, "POWER5 (gr)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_MMCRA | CPU_FTR_SMT |
+		    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE |
+		    CPU_FTR_MMCRA_SIHV,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power4,
+	    COMMON_PPC64_FW
+    },
+    {	/* Power5 */
+	    0xffff0000, 0x003b0000, "POWER5 (gs)",
+	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_MMCRA | CPU_FTR_SMT |
+		    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE |
+		    CPU_FTR_MMCRA_SIHV,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power4,
+	    COMMON_PPC64_FW
+    },
+    {	/* default match */
+	    0x00000000, 0x00000000, "POWER4 (compatible)",
+  	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
+		    CPU_FTR_PPCAS_ARCH_V2,
+	    COMMON_USER_PPC64,
+	    128, 128,
+	    __setup_cpu_power4,
+	    COMMON_PPC64_FW
+    }
+};
+
+firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = {
+    {FW_FEATURE_PFT,		"hcall-pft"},
+    {FW_FEATURE_TCE,		"hcall-tce"},
+    {FW_FEATURE_SPRG0,		"hcall-sprg0"},
+    {FW_FEATURE_DABR,		"hcall-dabr"},
+    {FW_FEATURE_COPY,		"hcall-copy"},
+    {FW_FEATURE_ASR,		"hcall-asr"},
+    {FW_FEATURE_DEBUG,		"hcall-debug"},
+    {FW_FEATURE_PERF,		"hcall-perf"},
+    {FW_FEATURE_DUMP,		"hcall-dump"},
+    {FW_FEATURE_INTERRUPT,	"hcall-interrupt"},
+    {FW_FEATURE_MIGRATE,	"hcall-migrate"},
+    {FW_FEATURE_PERFMON,	"hcall-perfmon"},
+    {FW_FEATURE_CRQ,    	"hcall-crq"},
+    {FW_FEATURE_VIO,	        "hcall-vio"},
+    {FW_FEATURE_RDMA,	        "hcall-rdma"},
+    {FW_FEATURE_LLAN,	        "hcall-lLAN"},
+    {FW_FEATURE_BULK,   	"hcall-bulk"},
+    {FW_FEATURE_XDABR,  	"hcall-xdabr"},
+    {FW_FEATURE_MULTITCE,	"hcall-multi-tce"},
+    {FW_FEATURE_SPLPAR,	        "hcall-splpar"},
+};
diff --git a/arch/ppc64/kernel/dma.c b/arch/ppc64/kernel/dma.c
new file mode 100644
index 00000000000..ce714c92713
--- /dev/null
+++ b/arch/ppc64/kernel/dma.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Implements the generic device dma API for ppc64. Handles
+ * the pci and vio busses
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+/* Include the busses we support */
+#include <linux/pci.h>
+#include <asm/vio.h>
+#include <asm/scatterlist.h>
+#include <asm/bug.h>
+
+static struct dma_mapping_ops *get_dma_ops(struct device *dev)
+{
+	if (dev->bus == &pci_bus_type)
+		return &pci_dma_ops;
+#ifdef CONFIG_IBMVIO
+	if (dev->bus == &vio_bus_type)
+		return &vio_dma_ops;
+#endif
+	return NULL;
+}
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->dma_supported(dev, mask);
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (dev->bus == &pci_bus_type)
+		return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
+#ifdef CONFIG_IBMVIO
+	if (dev->bus == &vio_bus_type)
+		return -EIO;
+#endif /* CONFIG_IBMVIO */
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+void *dma_alloc_coherent(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, unsigned int __nocast flag)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->alloc_coherent(dev, size, dma_handle, flag);
+	BUG();
+	return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t dma_handle)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->map_single(dev, cpu_addr, size, direction);
+	BUG();
+	return (dma_addr_t)0;
+}
+EXPORT_SYMBOL(dma_map_single);
+
+void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->unmap_single(dev, dma_addr, size, direction);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_unmap_single);
+
+dma_addr_t dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->map_single(dev,
+				(page_address(page) + offset), size, direction);
+	BUG();
+	return (dma_addr_t)0;
+}
+EXPORT_SYMBOL(dma_map_page);
+
+void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->unmap_single(dev, dma_address, size, direction);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_unmap_page);
+
+int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		return dma_ops->map_sg(dev, sg, nents, direction);
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(dma_map_sg);
+
+void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
+		enum dma_data_direction direction)
+{
+	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
+
+	if (dma_ops)
+		dma_ops->unmap_sg(dev, sg, nhwentries, direction);
+	else
+		BUG();
+}
+EXPORT_SYMBOL(dma_unmap_sg);
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
new file mode 100644
index 00000000000..d63d41f3eec
--- /dev/null
+++ b/arch/ppc64/kernel/eeh.c
@@ -0,0 +1,937 @@
+/*
+ * eeh.c
+ * Copyright (C) 2001 Dave Engebretsen & Todd Inglett IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <asm/eeh.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/atomic.h>
+#include <asm/systemcfg.h>
+#include "pci.h"
+
+#undef DEBUG
+
+/** Overview:
+ *  EEH, or "Extended Error Handling" is a PCI bridge technology for
+ *  dealing with PCI bus errors that can't be dealt with within the
+ *  usual PCI framework, except by check-stopping the CPU.  Systems
+ *  that are designed for high-availability/reliability cannot afford
+ *  to crash due to a "mere" PCI error, thus the need for EEH.
+ *  An EEH-capable bridge operates by converting a detected error
+ *  into a "slot freeze", taking the PCI adapter off-line, making
+ *  the slot behave, from the OS'es point of view, as if the slot
+ *  were "empty": all reads return 0xff's and all writes are silently
+ *  ignored.  EEH slot isolation events can be triggered by parity
+ *  errors on the address or data busses (e.g. during posted writes),
+ *  which in turn might be caused by dust, vibration, humidity,
+ *  radioactivity or plain-old failed hardware.
+ *
+ *  Note, however, that one of the leading causes of EEH slot
+ *  freeze events are buggy device drivers, buggy device microcode,
+ *  or buggy device hardware.  This is because any attempt by the
+ *  device to bus-master data to a memory address that is not
+ *  assigned to the device will trigger a slot freeze.   (The idea
+ *  is to prevent devices-gone-wild from corrupting system memory).
+ *  Buggy hardware/drivers will have a miserable time co-existing
+ *  with EEH.
+ *
+ *  Ideally, a PCI device driver, when suspecting that an isolation
+ *  event has occured (e.g. by reading 0xff's), will then ask EEH
+ *  whether this is the case, and then take appropriate steps to
+ *  reset the PCI slot, the PCI device, and then resume operations.
+ *  However, until that day,  the checking is done here, with the
+ *  eeh_check_failure() routine embedded in the MMIO macros.  If
+ *  the slot is found to be isolated, an "EEH Event" is synthesized
+ *  and sent out for processing.
+ */
+
+/** Bus Unit ID macros; get low and hi 32-bits of the 64-bit BUID */
+#define BUID_HI(buid) ((buid) >> 32)
+#define BUID_LO(buid) ((buid) & 0xffffffff)
+
+/* EEH event workqueue setup. */
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+LIST_HEAD(eeh_eventlist);
+static void eeh_event_handler(void *);
+DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
+
+static struct notifier_block *eeh_notifier_chain;
+
+/*
+ * If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event has occurred, we assume it
+ * is broken and panic.  This sets the threshold for how many read
+ * attempts we allow before panicking.
+ */
+#define EEH_MAX_FAILS	1000
+static atomic_t eeh_fail_count;
+
+/* RTAS tokens */
+static int ibm_set_eeh_option;
+static int ibm_set_slot_reset;
+static int ibm_read_slot_reset_state;
+static int ibm_read_slot_reset_state2;
+static int ibm_slot_error_detail;
+
+static int eeh_subsystem_enabled;
+
+/* Buffer for reporting slot-error-detail rtas calls */
+static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(slot_errbuf_lock);
+static int eeh_error_buf_size;
+
+/* System monitoring statistics */
+static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
+static DEFINE_PER_CPU(unsigned long, false_positives);
+static DEFINE_PER_CPU(unsigned long, ignored_failures);
+static DEFINE_PER_CPU(unsigned long, slot_resets);
+
+/**
+ * The pci address cache subsystem.  This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range
+{
+	struct rb_node rb_node;
+	unsigned long addr_lo;
+	unsigned long addr_hi;
+	struct pci_dev *pcidev;
+	unsigned int flags;
+};
+
+static struct pci_io_addr_cache
+{
+	struct rb_root rb_root;
+	spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr)
+{
+	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (addr < piar->addr_lo) {
+			n = n->rb_left;
+		} else {
+			if (addr > piar->addr_hi) {
+				n = n->rb_right;
+			} else {
+				pci_dev_get(piar->pcidev);
+				return piar->pcidev;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * pci_get_device_by_addr - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address.  Be sure to pci_dev_put the device
+ * when finished.  I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+static struct pci_dev *pci_get_device_by_addr(unsigned long addr)
+{
+	struct pci_dev *dev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	dev = __pci_get_device_by_addr(addr);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+	return dev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void pci_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+	struct rb_node *n;
+	int cnt = 0;
+
+	n = rb_first(&cache->rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+		printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s %s\n",
+		       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+		       piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev),
+		       pci_pretty_name(piar->pcidev));
+		cnt++;
+		n = rb_next(n);
+	}
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+		      unsigned long ahi, unsigned int flags)
+{
+	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pci_io_addr_range *piar;
+
+	/* Walk tree, find a place to insert into tree */
+	while (*p) {
+		parent = *p;
+		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+		if (alo < piar->addr_lo) {
+			p = &parent->rb_left;
+		} else if (ahi > piar->addr_hi) {
+			p = &parent->rb_right;
+		} else {
+			if (dev != piar->pcidev ||
+			    alo != piar->addr_lo || ahi != piar->addr_hi) {
+				printk(KERN_WARNING "PIAR: overlapping address range\n");
+			}
+			return piar;
+		}
+	}
+	piar = (struct pci_io_addr_range *)kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+	if (!piar)
+		return NULL;
+
+	piar->addr_lo = alo;
+	piar->addr_hi = ahi;
+	piar->pcidev = dev;
+	piar->flags = flags;
+
+	rb_link_node(&piar->rb_node, parent, p);
+	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+	return piar;
+}
+
+static void __pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+	struct device_node *dn;
+	int i;
+	int inserted = 0;
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn) {
+		printk(KERN_WARNING "PCI: no pci dn found for dev=%s %s\n",
+			pci_name(dev), pci_pretty_name(dev));
+		return;
+	}
+
+	/* Skip any devices for which EEH is not enabled. */
+	if (!(dn->eeh_mode & EEH_MODE_SUPPORTED) ||
+	    dn->eeh_mode & EEH_MODE_NOCHECK) {
+#ifdef DEBUG
+		printk(KERN_INFO "PCI: skip building address cache for=%s %s\n",
+		       pci_name(dev), pci_pretty_name(dev));
+#endif
+		return;
+	}
+
+	/* The cache holds a reference to the device... */
+	pci_dev_get(dev);
+
+	/* Walk resources on this device, poke them into the tree */
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+		unsigned long start = pci_resource_start(dev,i);
+		unsigned long end = pci_resource_end(dev,i);
+		unsigned int flags = pci_resource_flags(dev,i);
+
+		/* We are interested only bus addresses, not dma or other stuff */
+		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+			 continue;
+		pci_addr_cache_insert(dev, start, end, flags);
+		inserted = 1;
+	}
+
+	/* If there was nothing to add, the cache has no reference... */
+	if (!inserted)
+		pci_dev_put(dev);
+}
+
+/**
+ * pci_addr_cache_insert_device - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__pci_addr_cache_insert_device(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+	struct rb_node *n;
+	int removed = 0;
+
+restart:
+	n = rb_first(&pci_io_addr_cache_root.rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (piar->pcidev == dev) {
+			rb_erase(n, &pci_io_addr_cache_root.rb_root);
+			removed = 1;
+			kfree(piar);
+			goto restart;
+		}
+		n = rb_next(n);
+	}
+
+	/* The cache no longer holds its reference to this device... */
+	if (removed)
+		pci_dev_put(dev);
+}
+
+/**
+ * pci_addr_cache_remove_device - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__pci_addr_cache_remove_device(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * pci_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses.  This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scaned for devices (after all device resources are known).
+ */
+void __init pci_addr_cache_build(void)
+{
+	struct pci_dev *dev = NULL;
+
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		/* Ignore PCI bridges ( XXX why ??) */
+		if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) {
+			continue;
+		}
+		pci_addr_cache_insert_device(dev);
+	}
+
+#ifdef DEBUG
+	/* Verify tree built up above, echo back the list of addrs. */
+	pci_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
+
+/* --------------------------------------------------------------- */
+/* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */
+
+/**
+ * eeh_register_notifier - Register to find out about EEH events.
+ * @nb: notifier block to callback on events
+ */
+int eeh_register_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_register(&eeh_notifier_chain, nb);
+}
+
+/**
+ * eeh_unregister_notifier - Unregister to an EEH event notifier.
+ * @nb: notifier block to callback on events
+ */
+int eeh_unregister_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_unregister(&eeh_notifier_chain, nb);
+}
+
+/**
+ * read_slot_reset_state - Read the reset state of a device node's slot
+ * @dn: device node to read
+ * @rets: array to return results in
+ */
+static int read_slot_reset_state(struct device_node *dn, int rets[])
+{
+	int token, outputs;
+
+	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
+		token = ibm_read_slot_reset_state2;
+		outputs = 4;
+	} else {
+		token = ibm_read_slot_reset_state;
+		outputs = 3;
+	}
+
+	return rtas_call(token, 3, outputs, rets, dn->eeh_config_addr,
+			 BUID_HI(dn->phb->buid), BUID_LO(dn->phb->buid));
+}
+
+/**
+ * eeh_panic - call panic() for an eeh event that cannot be handled.
+ * The philosophy of this routine is that it is better to panic and
+ * halt the OS than it is to risk possible data corruption by
+ * oblivious device drivers that don't know better.
+ *
+ * @dev pci device that had an eeh event
+ * @reset_state current reset state of the device slot
+ */
+static void eeh_panic(struct pci_dev *dev, int reset_state)
+{
+	/*
+	 * XXX We should create a separate sysctl for this.
+	 *
+	 * Since the panic_on_oops sysctl is used to halt the system
+	 * in light of potential corruption, we can use it here.
+	 */
+	if (panic_on_oops)
+		panic("EEH: MMIO failure (%d) on device:%s %s\n", reset_state,
+		      pci_name(dev), pci_pretty_name(dev));
+	else {
+		__get_cpu_var(ignored_failures)++;
+		printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s %s\n",
+		       reset_state, pci_name(dev), pci_pretty_name(dev));
+	}
+}
+
+/**
+ * eeh_event_handler - dispatch EEH events.  The detection of a frozen
+ * slot can occur inside an interrupt, where it can be hard to do
+ * anything about it.  The goal of this routine is to pull these
+ * detection events out of the context of the interrupt handler, and
+ * re-dispatch them for processing at a later time in a normal context.
+ *
+ * @dummy - unused
+ */
+static void eeh_event_handler(void *dummy)
+{
+	unsigned long flags;
+	struct eeh_event	*event;
+
+	while (1) {
+		spin_lock_irqsave(&eeh_eventlist_lock, flags);
+		event = NULL;
+		if (!list_empty(&eeh_eventlist)) {
+			event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+			list_del(&event->list);
+		}
+		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+		if (event == NULL)
+			break;
+
+		printk(KERN_INFO "EEH: MMIO failure (%d), notifiying device "
+		       "%s %s\n", event->reset_state,
+		       pci_name(event->dev), pci_pretty_name(event->dev));
+
+		atomic_set(&eeh_fail_count, 0);
+		notifier_call_chain (&eeh_notifier_chain,
+				     EEH_NOTIFY_FREEZE, event);
+
+		__get_cpu_var(slot_resets)++;
+
+		pci_dev_put(event->dev);
+		kfree(event);
+	}
+}
+
+/**
+ * eeh_token_to_phys - convert EEH address token to phys address
+ * @token i/o token, should be address in the form 0xE....
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+	pte_t *ptep;
+	unsigned long pa;
+
+	ptep = find_linux_pte(ioremap_mm.pgd, token);
+	if (!ptep)
+		return token;
+	pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+	return pa | (token & (PAGE_SIZE-1));
+}
+
+/**
+ * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
+ * @dn device node
+ * @dev pci device, if known
+ *
+ * Check for an EEH failure for the given device node.  Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze.  This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a solt isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
+{
+	int ret;
+	int rets[3];
+	unsigned long flags;
+	int rc, reset_state;
+	struct eeh_event  *event;
+
+	__get_cpu_var(total_mmio_ffs)++;
+
+	if (!eeh_subsystem_enabled)
+		return 0;
+
+	if (!dn)
+		return 0;
+
+	/* Access to IO BARs might get this far and still not want checking. */
+	if (!(dn->eeh_mode & EEH_MODE_SUPPORTED) ||
+	    dn->eeh_mode & EEH_MODE_NOCHECK) {
+		return 0;
+	}
+
+	if (!dn->eeh_config_addr) {
+		return 0;
+	}
+
+	/*
+	 * If we already have a pending isolation event for this
+	 * slot, we know it's bad already, we don't need to check...
+	 */
+	if (dn->eeh_mode & EEH_MODE_ISOLATED) {
+		atomic_inc(&eeh_fail_count);
+		if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
+			/* re-read the slot reset state */
+			if (read_slot_reset_state(dn, rets) != 0)
+				rets[0] = -1;	/* reset state unknown */
+			eeh_panic(dev, rets[0]);
+		}
+		return 0;
+	}
+
+	/*
+	 * Now test for an EEH failure.  This is VERY expensive.
+	 * Note that the eeh_config_addr may be a parent device
+	 * in the case of a device behind a bridge, or it may be
+	 * function zero of a multi-function device.
+	 * In any case they must share a common PHB.
+	 */
+	ret = read_slot_reset_state(dn, rets);
+	if (!(ret == 0 && rets[1] == 1 && (rets[0] == 2 || rets[0] == 4))) {
+		__get_cpu_var(false_positives)++;
+		return 0;
+	}
+
+	/* prevent repeated reports of this failure */
+	dn->eeh_mode |= EEH_MODE_ISOLATED;
+
+	reset_state = rets[0];
+
+	spin_lock_irqsave(&slot_errbuf_lock, flags);
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	rc = rtas_call(ibm_slot_error_detail,
+	               8, 1, NULL, dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid), NULL, 0,
+	               virt_to_phys(slot_errbuf),
+	               eeh_error_buf_size,
+	               1 /* Temporary Error */);
+
+	if (rc == 0)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+
+	printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
+	       rets[0], dn->name, dn->full_name);
+	event = kmalloc(sizeof(*event), GFP_ATOMIC);
+	if (event == NULL) {
+		eeh_panic(dev, reset_state);
+		return 1;
+ 	}
+
+	event->dev = dev;
+	event->dn = dn;
+	event->reset_state = reset_state;
+
+	/* We may or may not be called in an interrupt context */
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_add(&event->list, &eeh_eventlist);
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+	/* Most EEH events are due to device driver bugs.  Having
+	 * a stack trace will help the device-driver authors figure
+	 * out what happened.  So print that out. */
+	dump_stack();
+	schedule_work(&eeh_event_wq);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(eeh_dn_check_failure);
+
+/**
+ * eeh_check_failure - check if all 1's data is due to EEH slot freeze
+ * @token i/o token, should be address in the form 0xA....
+ * @val value, should be all 1's (XXX why do we need this arg??)
+ *
+ * Check for an eeh failure at the given token address.
+ * Check for an EEH failure at the given token address.  Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event.  This routine
+ * will query firmware for the EEH status.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
+{
+	unsigned long addr;
+	struct pci_dev *dev;
+	struct device_node *dn;
+
+	/* Finding the phys addr + pci device; this is pretty quick. */
+	addr = eeh_token_to_phys((unsigned long __force) token);
+	dev = pci_get_device_by_addr(addr);
+	if (!dev)
+		return val;
+
+	dn = pci_device_to_OF_node(dev);
+	eeh_dn_check_failure (dn, dev);
+
+	pci_dev_put(dev);
+	return val;
+}
+
+EXPORT_SYMBOL(eeh_check_failure);
+
+struct eeh_early_enable_info {
+	unsigned int buid_hi;
+	unsigned int buid_lo;
+};
+
+/* Enable eeh for the given device node. */
+static void *early_enable_eeh(struct device_node *dn, void *data)
+{
+	struct eeh_early_enable_info *info = data;
+	int ret;
+	char *status = get_property(dn, "status", NULL);
+	u32 *class_code = (u32 *)get_property(dn, "class-code", NULL);
+	u32 *vendor_id = (u32 *)get_property(dn, "vendor-id", NULL);
+	u32 *device_id = (u32 *)get_property(dn, "device-id", NULL);
+	u32 *regs;
+	int enable;
+
+	dn->eeh_mode = 0;
+
+	if (status && strcmp(status, "ok") != 0)
+		return NULL;	/* ignore devices with bad status */
+
+	/* Ignore bad nodes. */
+	if (!class_code || !vendor_id || !device_id)
+		return NULL;
+
+	/* There is nothing to check on PCI to ISA bridges */
+	if (dn->type && !strcmp(dn->type, "isa")) {
+		dn->eeh_mode |= EEH_MODE_NOCHECK;
+		return NULL;
+	}
+
+	/*
+	 * Now decide if we are going to "Disable" EEH checking
+	 * for this device.  We still run with the EEH hardware active,
+	 * but we won't be checking for ff's.  This means a driver
+	 * could return bad data (very bad!), an interrupt handler could
+	 * hang waiting on status bits that won't change, etc.
+	 * But there are a few cases like display devices that make sense.
+	 */
+	enable = 1;	/* i.e. we will do checking */
+	if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY)
+		enable = 0;
+
+	if (!enable)
+		dn->eeh_mode |= EEH_MODE_NOCHECK;
+
+	/* Ok... see if this device supports EEH.  Some do, some don't,
+	 * and the only way to find out is to check each and every one. */
+	regs = (u32 *)get_property(dn, "reg", NULL);
+	if (regs) {
+		/* First register entry is addr (00BBSS00)  */
+		/* Try to enable eeh */
+		ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+				regs[0], info->buid_hi, info->buid_lo,
+				EEH_ENABLE);
+		if (ret == 0) {
+			eeh_subsystem_enabled = 1;
+			dn->eeh_mode |= EEH_MODE_SUPPORTED;
+			dn->eeh_config_addr = regs[0];
+#ifdef DEBUG
+			printk(KERN_DEBUG "EEH: %s: eeh enabled\n", dn->full_name);
+#endif
+		} else {
+
+			/* This device doesn't support EEH, but it may have an
+			 * EEH parent, in which case we mark it as supported. */
+			if (dn->parent && (dn->parent->eeh_mode & EEH_MODE_SUPPORTED)) {
+				/* Parent supports EEH. */
+				dn->eeh_mode |= EEH_MODE_SUPPORTED;
+				dn->eeh_config_addr = dn->parent->eeh_config_addr;
+				return NULL;
+			}
+		}
+	} else {
+		printk(KERN_WARNING "EEH: %s: unable to get reg property.\n",
+		       dn->full_name);
+	}
+
+	return NULL; 
+}
+
+/*
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check.  If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ */
+void __init eeh_init(void)
+{
+	struct device_node *phb, *np;
+	struct eeh_early_enable_info info;
+
+	np = of_find_node_by_path("/rtas");
+	if (np == NULL)
+		return;
+
+	ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
+	ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
+	ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
+	ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
+	ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
+
+	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)
+		return;
+
+	eeh_error_buf_size = rtas_token("rtas-error-log-max");
+	if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
+		eeh_error_buf_size = 1024;
+	}
+	if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
+		printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated "
+		      "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
+		eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
+	}
+
+	/* Enable EEH for all adapters.  Note that eeh requires buid's */
+	for (phb = of_find_node_by_name(NULL, "pci"); phb;
+	     phb = of_find_node_by_name(phb, "pci")) {
+		unsigned long buid;
+
+		buid = get_phb_buid(phb);
+		if (buid == 0)
+			continue;
+
+		info.buid_lo = BUID_LO(buid);
+		info.buid_hi = BUID_HI(buid);
+		traverse_pci_devices(phb, early_enable_eeh, &info);
+	}
+
+	if (eeh_subsystem_enabled)
+		printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n");
+	else
+		printk(KERN_WARNING "EEH: No capable adapters found\n");
+}
+
+/**
+ * eeh_add_device_early - enable EEH for the indicated device_node
+ * @dn: device node for which to set up EEH
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ * This routine must be called before any i/o is performed to the
+ * adapter (inluding any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ */
+void eeh_add_device_early(struct device_node *dn)
+{
+	struct pci_controller *phb;
+	struct eeh_early_enable_info info;
+
+	if (!dn)
+		return;
+	phb = dn->phb;
+	if (NULL == phb || 0 == phb->buid) {
+		printk(KERN_WARNING "EEH: Expected buid but found none\n");
+		return;
+	}
+
+	info.buid_hi = BUID_HI(phb->buid);
+	info.buid_lo = BUID_LO(phb->buid);
+	early_enable_eeh(dn, &info);
+}
+EXPORT_SYMBOL(eeh_add_device_early);
+
+/**
+ * eeh_add_device_late - perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * This routine must be used to complete EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ */
+void eeh_add_device_late(struct pci_dev *dev)
+{
+	if (!dev || !eeh_subsystem_enabled)
+		return;
+
+#ifdef DEBUG
+	printk(KERN_DEBUG "EEH: adding device %s %s\n", pci_name(dev),
+	       pci_pretty_name(dev));
+#endif
+
+	pci_addr_cache_insert_device (dev);
+}
+EXPORT_SYMBOL(eeh_add_device_late);
+
+/**
+ * eeh_remove_device - undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ *
+ * This routine should be when a device is removed from a running
+ * system (e.g. by hotplug or dlpar).
+ */
+void eeh_remove_device(struct pci_dev *dev)
+{
+	if (!dev || !eeh_subsystem_enabled)
+		return;
+
+	/* Unregister the device with the EEH/PCI address search system */
+#ifdef DEBUG
+	printk(KERN_DEBUG "EEH: remove device %s %s\n", pci_name(dev),
+	       pci_pretty_name(dev));
+#endif
+	pci_addr_cache_remove_device(dev);
+}
+EXPORT_SYMBOL(eeh_remove_device);
+
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+	unsigned int cpu;
+	unsigned long ffs = 0, positives = 0, failures = 0;
+	unsigned long resets = 0;
+
+	for_each_cpu(cpu) {
+		ffs += per_cpu(total_mmio_ffs, cpu);
+		positives += per_cpu(false_positives, cpu);
+		failures += per_cpu(ignored_failures, cpu);
+		resets += per_cpu(slot_resets, cpu);
+	}
+
+	if (0 == eeh_subsystem_enabled) {
+		seq_printf(m, "EEH Subsystem is globally disabled\n");
+		seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs);
+	} else {
+		seq_printf(m, "EEH Subsystem is enabled\n");
+		seq_printf(m, "eeh_total_mmio_ffs=%ld\n"
+			   "eeh_false_positives=%ld\n"
+			   "eeh_ignored_failures=%ld\n"
+			   "eeh_slot_resets=%ld\n"
+				"eeh_fail_count=%d\n",
+			   ffs, positives, failures, resets,
+				eeh_fail_count.counter);
+	}
+
+	return 0;
+}
+
+static int proc_eeh_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_eeh_show, NULL);
+}
+
+static struct file_operations proc_eeh_operations = {
+	.open      = proc_eeh_open,
+	.read      = seq_read,
+	.llseek    = seq_lseek,
+	.release   = single_release,
+};
+
+static int __init eeh_init_proc(void)
+{
+	struct proc_dir_entry *e;
+
+	if (systemcfg->platform & PLATFORM_PSERIES) {
+		e = create_proc_entry("ppc64/eeh", 0, NULL);
+		if (e)
+			e->proc_fops = &proc_eeh_operations;
+	}
+
+	return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/ppc64/kernel/entry.S b/arch/ppc64/kernel/entry.S
new file mode 100644
index 00000000000..d3604056e1a
--- /dev/null
+++ b/arch/ppc64/kernel/entry.S
@@ -0,0 +1,845 @@
+/*
+ *  arch/ppc64/kernel/entry.S
+ *
+ *  PowerPC version 
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *  Adapted for Power Macintosh by Paul Mackerras.
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *  MPC8xx modifications Copyright (C) 1997 Dan Malek (dmalek@jlc.net).
+ *
+ *  This file contains the system call entry code, context switch
+ *  code, and exception/interrupt return code for PowerPC.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <asm/unistd.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/cputable.h>
+
+#ifdef CONFIG_PPC_ISERIES
+#define DO_SOFT_DISABLE
+#endif
+
+/*
+ * System calls.
+ */
+	.section	".toc","aw"
+.SYS_CALL_TABLE:
+	.tc .sys_call_table[TC],.sys_call_table
+
+.SYS_CALL_TABLE32:
+	.tc .sys_call_table32[TC],.sys_call_table32
+
+/* This value is used to mark exception frames on the stack. */
+exception_marker:
+	.tc	ID_72656773_68657265[TC],0x7265677368657265
+
+	.section	".text"
+	.align 7
+
+#undef SHOW_SYSCALLS
+
+	.globl system_call_common
+system_call_common:
+	andi.	r10,r12,MSR_PR
+	mr	r10,r1
+	addi	r1,r1,-INT_FRAME_SIZE
+	beq-	1f
+	ld	r1,PACAKSAVE(r13)
+1:	std	r10,0(r1)
+	std	r11,_NIP(r1)
+	std	r12,_MSR(r1)
+	std	r0,GPR0(r1)
+	std	r10,GPR1(r1)
+	std	r2,GPR2(r1)
+	std	r3,GPR3(r1)
+	std	r4,GPR4(r1)
+	std	r5,GPR5(r1)
+	std	r6,GPR6(r1)
+	std	r7,GPR7(r1)
+	std	r8,GPR8(r1)
+	li	r11,0
+	std	r11,GPR9(r1)
+	std	r11,GPR10(r1)
+	std	r11,GPR11(r1)
+	std	r11,GPR12(r1)
+	std	r9,GPR13(r1)
+	crclr	so
+	mfcr	r9
+	mflr	r10
+	li	r11,0xc01
+	std	r9,_CCR(r1)
+	std	r10,_LINK(r1)
+	std	r11,_TRAP(r1)
+	mfxer	r9
+	mfctr	r10
+	std	r9,_XER(r1)
+	std	r10,_CTR(r1)
+	std	r3,ORIG_GPR3(r1)
+	ld	r2,PACATOC(r13)
+	addi	r9,r1,STACK_FRAME_OVERHEAD
+	ld	r11,exception_marker@toc(r2)
+	std	r11,-16(r9)		/* "regshere" marker */
+#ifdef CONFIG_PPC_ISERIES
+	/* Hack for handling interrupts when soft-enabling on iSeries */
+	cmpdi	cr1,r0,0x5555		/* syscall 0x5555 */
+	andi.	r10,r12,MSR_PR		/* from kernel */
+	crand	4*cr0+eq,4*cr1+eq,4*cr0+eq
+	beq	hardware_interrupt_entry
+	lbz	r10,PACAPROCENABLED(r13)
+	std	r10,SOFTE(r1)
+#endif
+	mfmsr	r11
+	ori	r11,r11,MSR_EE
+	mtmsrd	r11,1
+
+#ifdef SHOW_SYSCALLS
+	bl	.do_show_syscall
+	REST_GPR(0,r1)
+	REST_4GPRS(3,r1)
+	REST_2GPRS(7,r1)
+	addi	r9,r1,STACK_FRAME_OVERHEAD
+#endif
+	clrrdi	r11,r1,THREAD_SHIFT
+	li	r12,0
+	ld	r10,TI_FLAGS(r11)
+	stb	r12,TI_SC_NOERR(r11)
+	andi.	r11,r10,_TIF_SYSCALL_T_OR_A
+	bne-	syscall_dotrace
+syscall_dotrace_cont:
+	cmpldi	0,r0,NR_syscalls
+	bge-	syscall_enosys
+
+system_call:			/* label this so stack traces look sane */
+/*
+ * Need to vector to 32 Bit or default sys_call_table here,
+ * based on caller's run-mode / personality.
+ */
+	ld	r11,.SYS_CALL_TABLE@toc(2)
+	andi.	r10,r10,_TIF_32BIT
+	beq	15f
+	ld	r11,.SYS_CALL_TABLE32@toc(2)
+	clrldi	r3,r3,32
+	clrldi	r4,r4,32
+	clrldi	r5,r5,32
+	clrldi	r6,r6,32
+	clrldi	r7,r7,32
+	clrldi	r8,r8,32
+15:
+	slwi	r0,r0,3
+	ldx	r10,r11,r0	/* Fetch system call handler [ptr] */
+	mtctr   r10
+	bctrl			/* Call handler */
+
+syscall_exit:
+#ifdef SHOW_SYSCALLS
+	std	r3,GPR3(r1)
+	bl	.do_show_syscall_exit
+	ld	r3,GPR3(r1)
+#endif
+	std	r3,RESULT(r1)
+	ld	r5,_CCR(r1)
+	li	r10,-_LAST_ERRNO
+	cmpld	r3,r10
+	clrrdi	r12,r1,THREAD_SHIFT
+	bge-	syscall_error
+syscall_error_cont:
+
+	/* check for syscall tracing or audit */
+	ld	r9,TI_FLAGS(r12)
+	andi.	r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP)
+	bne-	syscall_exit_trace
+syscall_exit_trace_cont:
+
+	/* disable interrupts so current_thread_info()->flags can't change,
+	   and so that we don't get interrupted after loading SRR0/1. */
+	ld	r8,_MSR(r1)
+	andi.	r10,r8,MSR_RI
+	beq-	unrecov_restore
+	mfmsr	r10
+	rldicl	r10,r10,48,1
+	rotldi	r10,r10,16
+	mtmsrd	r10,1
+	ld	r9,TI_FLAGS(r12)
+	andi.	r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED)
+	bne-	syscall_exit_work
+	ld	r7,_NIP(r1)
+	stdcx.	r0,0,r1			/* to clear the reservation */
+	andi.	r6,r8,MSR_PR
+	ld	r4,_LINK(r1)
+	beq-	1f			/* only restore r13 if */
+	ld	r13,GPR13(r1)		/* returning to usermode */
+1:	ld	r2,GPR2(r1)
+	li	r12,MSR_RI
+	andc	r10,r10,r12
+	mtmsrd	r10,1			/* clear MSR.RI */
+	ld	r1,GPR1(r1)
+	mtlr	r4
+	mtcr	r5
+	mtspr	SRR0,r7
+	mtspr	SRR1,r8
+	rfid
+	b	.	/* prevent speculative execution */
+
+syscall_enosys:
+	li	r3,-ENOSYS
+	std	r3,RESULT(r1)
+	clrrdi	r12,r1,THREAD_SHIFT
+	ld	r5,_CCR(r1)
+
+syscall_error:
+	lbz	r11,TI_SC_NOERR(r12)
+	cmpwi	0,r11,0
+	bne-	syscall_error_cont
+	neg	r3,r3
+	oris	r5,r5,0x1000	/* Set SO bit in CR */
+	std	r5,_CCR(r1)
+	b	syscall_error_cont
+        
+/* Traced system call support */
+syscall_dotrace:
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_syscall_trace_enter
+	ld	r0,GPR0(r1)	/* Restore original registers */
+	ld	r3,GPR3(r1)
+	ld	r4,GPR4(r1)
+	ld	r5,GPR5(r1)
+	ld	r6,GPR6(r1)
+	ld	r7,GPR7(r1)
+	ld	r8,GPR8(r1)
+	addi	r9,r1,STACK_FRAME_OVERHEAD
+	clrrdi	r10,r1,THREAD_SHIFT
+	ld	r10,TI_FLAGS(r10)
+	b	syscall_dotrace_cont
+
+syscall_exit_trace:
+	std	r3,GPR3(r1)
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_syscall_trace_leave
+	REST_NVGPRS(r1)
+	ld	r3,GPR3(r1)
+	ld	r5,_CCR(r1)
+	clrrdi	r12,r1,THREAD_SHIFT
+	b	syscall_exit_trace_cont
+
+/* Stuff to do on exit from a system call. */
+syscall_exit_work:
+	std	r3,GPR3(r1)
+	std	r5,_CCR(r1)
+	b	.ret_from_except_lite
+
+/* Save non-volatile GPRs, if not already saved. */
+_GLOBAL(save_nvgprs)
+	ld	r11,_TRAP(r1)
+	andi.	r0,r11,1
+	beqlr-
+	SAVE_NVGPRS(r1)
+	clrrdi	r0,r11,1
+	std	r0,_TRAP(r1)
+	blr
+
+/*
+ * The sigsuspend and rt_sigsuspend system calls can call do_signal
+ * and thus put the process into the stopped state where we might
+ * want to examine its user state with ptrace.  Therefore we need
+ * to save all the nonvolatile registers (r14 - r31) before calling
+ * the C code.  Similarly, fork, vfork and clone need the full
+ * register state on the stack so that it can be copied to the child.
+ */
+_GLOBAL(ppc32_sigsuspend)
+	bl	.save_nvgprs
+	bl	.sys32_sigsuspend
+	b	70f
+
+_GLOBAL(ppc64_rt_sigsuspend)
+	bl	.save_nvgprs
+	bl	.sys_rt_sigsuspend
+	b	70f
+
+_GLOBAL(ppc32_rt_sigsuspend)
+	bl	.save_nvgprs
+	bl	.sys32_rt_sigsuspend
+	/* If sigsuspend() returns zero, we are going into a signal handler */
+70:	cmpdi	0,r3,0
+	beq	.ret_from_except
+	/* If it returned -EINTR, we need to return via syscall_exit to set
+	   the SO bit in cr0 and potentially stop for ptrace. */
+	b	syscall_exit
+
+_GLOBAL(ppc_fork)
+	bl	.save_nvgprs
+	bl	.sys_fork
+	b	syscall_exit
+
+_GLOBAL(ppc_vfork)
+	bl	.save_nvgprs
+	bl	.sys_vfork
+	b	syscall_exit
+
+_GLOBAL(ppc_clone)
+	bl	.save_nvgprs
+	bl	.sys_clone
+	b	syscall_exit
+
+_GLOBAL(ppc32_swapcontext)
+	bl	.save_nvgprs
+	bl	.sys32_swapcontext
+	b	80f
+	
+_GLOBAL(ppc64_swapcontext)
+	bl	.save_nvgprs
+	bl	.sys_swapcontext
+	b	80f
+
+_GLOBAL(ppc32_sigreturn)
+	bl	.sys32_sigreturn
+	b	80f
+
+_GLOBAL(ppc32_rt_sigreturn)
+	bl	.sys32_rt_sigreturn
+	b	80f
+
+_GLOBAL(ppc64_rt_sigreturn)
+	bl	.sys_rt_sigreturn
+
+80:	cmpdi	0,r3,0
+	blt	syscall_exit
+	clrrdi	r4,r1,THREAD_SHIFT
+	ld	r4,TI_FLAGS(r4)
+	andi.	r4,r4,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP)
+	beq+	81f
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_syscall_trace_leave
+81:	b	.ret_from_except
+
+_GLOBAL(ret_from_fork)
+	bl	.schedule_tail
+	REST_NVGPRS(r1)
+	li	r3,0
+	b	syscall_exit
+
+/*
+ * This routine switches between two different tasks.  The process
+ * state of one is saved on its kernel stack.  Then the state
+ * of the other is restored from its kernel stack.  The memory
+ * management hardware is updated to the second process's state.
+ * Finally, we can return to the second process, via ret_from_except.
+ * On entry, r3 points to the THREAD for the current task, r4
+ * points to the THREAD for the new task.
+ *
+ * Note: there are two ways to get to the "going out" portion
+ * of this code; either by coming in via the entry (_switch)
+ * or via "fork" which must set up an environment equivalent
+ * to the "_switch" path.  If you change this you'll have to change
+ * the fork code also.
+ *
+ * The code which creates the new task context is in 'copy_thread'
+ * in arch/ppc64/kernel/process.c
+ */
+	.align	7
+_GLOBAL(_switch)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-SWITCH_FRAME_SIZE(r1)
+	/* r3-r13 are caller saved -- Cort */
+	SAVE_8GPRS(14, r1)
+	SAVE_10GPRS(22, r1)
+	mflr	r20		/* Return to switch caller */
+	mfmsr	r22
+	li	r0, MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r0,r0,MSR_VEC@h	/* Disable altivec */
+	mfspr	r24,SPRN_VRSAVE	/* save vrsave register value */
+	std	r24,THREAD_VRSAVE(r3)
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif /* CONFIG_ALTIVEC */
+	and.	r0,r0,r22
+	beq+	1f
+	andc	r22,r22,r0
+	mtmsrd	r22
+	isync
+1:	std	r20,_NIP(r1)
+	mfcr	r23
+	std	r23,_CCR(r1)
+	std	r1,KSP(r3)	/* Set old stack pointer */
+
+#ifdef CONFIG_SMP
+	/* We need a sync somewhere here to make sure that if the
+	 * previous task gets rescheduled on another CPU, it sees all
+	 * stores it has performed on this one.
+	 */
+	sync
+#endif /* CONFIG_SMP */
+
+	addi	r6,r4,-THREAD	/* Convert THREAD to 'current' */
+	std	r6,PACACURRENT(r13)	/* Set new 'current' */
+
+	ld	r8,KSP(r4)	/* new stack pointer */
+BEGIN_FTR_SECTION
+	clrrdi	r6,r8,28	/* get its ESID */
+	clrrdi	r9,r1,28	/* get current sp ESID */
+	clrldi.	r0,r6,2		/* is new ESID c00000000? */
+	cmpd	cr1,r6,r9	/* or is new ESID the same as current ESID? */
+	cror	eq,4*cr1+eq,eq
+	beq	2f		/* if yes, don't slbie it */
+	oris	r0,r6,0x0800	/* set C (class) bit */
+
+	/* Bolt in the new stack SLB entry */
+	ld	r7,KSP_VSID(r4)	/* Get new stack's VSID */
+	oris	r6,r6,(SLB_ESID_V)@h
+	ori	r6,r6,(SLB_NUM_BOLTED-1)@l
+	slbie	r0
+	slbie	r0		/* Workaround POWER5 < DD2.1 issue */
+	slbmte	r7,r6
+	isync
+
+2:
+END_FTR_SECTION_IFSET(CPU_FTR_SLB)
+	clrrdi	r7,r8,THREAD_SHIFT	/* base of new stack */
+	/* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
+	   because we don't need to leave the 288-byte ABI gap at the
+	   top of the kernel stack. */
+	addi	r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE
+
+	mr	r1,r8		/* start using new stack pointer */
+	std	r7,PACAKSAVE(r13)
+
+	ld	r6,_CCR(r1)
+	mtcrf	0xFF,r6
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	ld	r0,THREAD_VRSAVE(r4)
+	mtspr	SPRN_VRSAVE,r0		/* if G4, restore VRSAVE reg */
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif /* CONFIG_ALTIVEC */
+
+	/* r3-r13 are destroyed -- Cort */
+	REST_8GPRS(14, r1)
+	REST_10GPRS(22, r1)
+
+#ifdef CONFIG_PPC_ISERIES
+	clrrdi	r7,r1,THREAD_SHIFT	/* get current_thread_info() */
+	ld	r7,TI_FLAGS(r7)		/* Get run light flag */
+	mfspr	r9,CTRLF
+	srdi	r7,r7,TIF_RUN_LIGHT
+	insrdi	r9,r7,1,63		/* Insert run light into CTRL */
+	mtspr	CTRLT,r9
+#endif
+
+	/* convert old thread to its task_struct for return value */
+	addi	r3,r3,-THREAD
+	ld	r7,_NIP(r1)	/* Return to _switch caller in new task */
+	mtlr	r7
+	addi	r1,r1,SWITCH_FRAME_SIZE
+	blr
+
+	.align	7
+_GLOBAL(ret_from_except)
+	ld	r11,_TRAP(r1)
+	andi.	r0,r11,1
+	bne	.ret_from_except_lite
+	REST_NVGPRS(r1)
+
+_GLOBAL(ret_from_except_lite)
+	/*
+	 * Disable interrupts so that current_thread_info()->flags
+	 * can't change between when we test it and when we return
+	 * from the interrupt.
+	 */
+	mfmsr	r10		/* Get current interrupt state */
+	rldicl	r9,r10,48,1	/* clear MSR_EE */
+	rotldi	r9,r9,16
+	mtmsrd	r9,1		/* Update machine state */
+
+#ifdef CONFIG_PREEMPT
+	clrrdi	r9,r1,THREAD_SHIFT	/* current_thread_info() */
+	li	r0,_TIF_NEED_RESCHED	/* bits to check */
+	ld	r3,_MSR(r1)
+	ld	r4,TI_FLAGS(r9)
+	/* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */
+	rlwimi	r0,r3,32+TIF_SIGPENDING-MSR_PR_LG,_TIF_SIGPENDING
+	and.	r0,r4,r0	/* check NEED_RESCHED and maybe SIGPENDING */
+	bne	do_work
+
+#else /* !CONFIG_PREEMPT */
+	ld	r3,_MSR(r1)	/* Returning to user mode? */
+	andi.	r3,r3,MSR_PR
+	beq	restore		/* if not, just restore regs and return */
+
+	/* Check current_thread_info()->flags */
+	clrrdi	r9,r1,THREAD_SHIFT
+	ld	r4,TI_FLAGS(r9)
+	andi.	r0,r4,_TIF_USER_WORK_MASK
+	bne	do_work
+#endif
+
+restore:
+#ifdef CONFIG_PPC_ISERIES
+	ld	r5,SOFTE(r1)
+	cmpdi	0,r5,0
+	beq	4f
+	/* Check for pending interrupts (iSeries) */
+	ld	r3,PACALPPACA+LPPACAANYINT(r13)
+	cmpdi	r3,0
+	beq+	4f			/* skip do_IRQ if no interrupts */
+
+	li	r3,0
+	stb	r3,PACAPROCENABLED(r13)	/* ensure we are soft-disabled */
+	ori	r10,r10,MSR_EE
+	mtmsrd	r10			/* hard-enable again */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_IRQ
+	b	.ret_from_except_lite		/* loop back and handle more */
+
+4:	stb	r5,PACAPROCENABLED(r13)
+#endif
+
+	ld	r3,_MSR(r1)
+	andi.	r0,r3,MSR_RI
+	beq-	unrecov_restore
+
+	andi.	r0,r3,MSR_PR
+
+	/*
+	 * r13 is our per cpu area, only restore it if we are returning to
+	 * userspace
+	 */
+	beq	1f
+	REST_GPR(13, r1)
+1:
+	ld	r3,_CTR(r1)
+	ld	r0,_LINK(r1)
+	mtctr	r3
+	mtlr	r0
+	ld	r3,_XER(r1)
+	mtspr	XER,r3
+
+	REST_8GPRS(5, r1)
+
+	stdcx.	r0,0,r1		/* to clear the reservation */
+
+	mfmsr	r0
+	li	r2, MSR_RI
+	andc	r0,r0,r2
+	mtmsrd	r0,1
+
+	ld	r0,_MSR(r1)
+	mtspr	SRR1,r0
+
+	ld	r2,_CCR(r1)
+	mtcrf	0xFF,r2
+	ld	r2,_NIP(r1)
+	mtspr	SRR0,r2
+
+	ld	r0,GPR0(r1)
+	ld	r2,GPR2(r1)
+	ld	r3,GPR3(r1)
+	ld	r4,GPR4(r1)
+	ld	r1,GPR1(r1)
+
+	rfid
+	b	.	/* prevent speculative execution */
+
+/* Note: this must change if we start using the TIF_NOTIFY_RESUME bit */
+do_work:
+#ifdef CONFIG_PREEMPT
+	andi.	r0,r3,MSR_PR	/* Returning to user mode? */
+	bne	user_work
+	/* Check that preempt_count() == 0 and interrupts are enabled */
+	lwz	r8,TI_PREEMPT(r9)
+	cmpwi	cr1,r8,0
+#ifdef CONFIG_PPC_ISERIES
+	ld	r0,SOFTE(r1)
+	cmpdi	r0,0
+#else
+	andi.	r0,r3,MSR_EE
+#endif
+	crandc	eq,cr1*4+eq,eq
+	bne	restore
+	/* here we are preempting the current task */
+1:
+#ifdef CONFIG_PPC_ISERIES
+	li	r0,1
+	stb	r0,PACAPROCENABLED(r13)
+#endif
+	ori	r10,r10,MSR_EE
+	mtmsrd	r10,1		/* reenable interrupts */
+	bl	.preempt_schedule
+	mfmsr	r10
+	clrrdi	r9,r1,THREAD_SHIFT
+	rldicl	r10,r10,48,1	/* disable interrupts again */
+	rotldi	r10,r10,16
+	mtmsrd	r10,1
+	ld	r4,TI_FLAGS(r9)
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	bne	1b
+	b	restore
+
+user_work:
+#endif
+	/* Enable interrupts */
+	ori	r10,r10,MSR_EE
+	mtmsrd	r10,1
+
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq	1f
+	bl	.schedule
+	b	.ret_from_except_lite
+
+1:	bl	.save_nvgprs
+	li	r3,0
+	addi	r4,r1,STACK_FRAME_OVERHEAD
+	bl	.do_signal
+	b	.ret_from_except
+
+unrecov_restore:
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	unrecov_restore
+
+#ifdef CONFIG_PPC_RTAS
+/*
+ * On CHRP, the Run-Time Abstraction Services (RTAS) have to be
+ * called with the MMU off.
+ *
+ * In addition, we need to be in 32b mode, at least for now.
+ * 
+ * Note: r3 is an input parameter to rtas, so don't trash it...
+ */
+_GLOBAL(enter_rtas)
+	mflr	r0
+	std	r0,16(r1)
+        stdu	r1,-RTAS_FRAME_SIZE(r1)	/* Save SP and create stack space. */
+
+	/* Because RTAS is running in 32b mode, it clobbers the high order half
+	 * of all registers that it saves.  We therefore save those registers
+	 * RTAS might touch to the stack.  (r0, r3-r13 are caller saved)
+   	 */
+	SAVE_GPR(2, r1)			/* Save the TOC */
+	SAVE_GPR(13, r1)		/* Save paca */
+	SAVE_8GPRS(14, r1)		/* Save the non-volatiles */
+	SAVE_10GPRS(22, r1)		/* ditto */
+
+	mfcr	r4
+	std	r4,_CCR(r1)
+	mfctr	r5
+	std	r5,_CTR(r1)
+	mfspr	r6,XER
+	std	r6,_XER(r1)
+	mfdar	r7
+	std	r7,_DAR(r1)
+	mfdsisr	r8
+	std	r8,_DSISR(r1)
+	mfsrr0	r9
+	std	r9,_SRR0(r1)
+	mfsrr1	r10
+	std	r10,_SRR1(r1)
+
+	/* There is no way it is acceptable to get here with interrupts enabled,
+	 * check it with the asm equivalent of WARN_ON
+	 */
+	mfmsr	r6
+	andi.	r0,r6,MSR_EE
+1:	tdnei	r0,0
+.section __bug_table,"a"
+	.llong	1b,__LINE__ + 0x1000000, 1f, 2f
+.previous
+.section .rodata,"a"
+1:	.asciz	__FILE__
+2:	.asciz "enter_rtas"
+.previous
+	
+	/* Unfortunately, the stack pointer and the MSR are also clobbered,
+	 * so they are saved in the PACA which allows us to restore
+	 * our original state after RTAS returns.
+         */
+	std	r1,PACAR1(r13)
+        std	r6,PACASAVEDMSR(r13)
+
+	/* Setup our real return addr */	
+	SET_REG_TO_LABEL(r4,.rtas_return_loc)
+	SET_REG_TO_CONST(r9,KERNELBASE)
+	sub	r4,r4,r9
+       	mtlr	r4
+
+	li	r0,0
+	ori	r0,r0,MSR_EE|MSR_SE|MSR_BE|MSR_RI
+	andc	r0,r6,r0
+	
+        li      r9,1
+        rldicr  r9,r9,MSR_SF_LG,(63-MSR_SF_LG)
+	ori	r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP
+	andc	r6,r0,r9
+	ori	r6,r6,MSR_RI
+	sync				/* disable interrupts so SRR0/1 */
+	mtmsrd	r0			/* don't get trashed */
+
+	SET_REG_TO_LABEL(r4,rtas)
+	ld	r5,RTASENTRY(r4)	/* get the rtas->entry value */
+	ld	r4,RTASBASE(r4)		/* get the rtas->base value */
+	
+	mtspr	SRR0,r5
+	mtspr	SRR1,r6
+	rfid
+	b	.	/* prevent speculative execution */
+
+_STATIC(rtas_return_loc)
+	/* relocation is off at this point */
+	mfspr	r4,SPRG3	        /* Get PACA */
+	SET_REG_TO_CONST(r5, KERNELBASE)
+        sub     r4,r4,r5                /* RELOC the PACA base pointer */
+
+	mfmsr   r6
+	li	r0,MSR_RI
+	andc	r6,r6,r0
+	sync	
+	mtmsrd  r6
+        
+        ld	r1,PACAR1(r4)           /* Restore our SP */
+	LOADADDR(r3,.rtas_restore_regs)
+        ld	r4,PACASAVEDMSR(r4)     /* Restore our MSR */
+
+	mtspr	SRR0,r3
+	mtspr	SRR1,r4
+	rfid
+	b	.	/* prevent speculative execution */
+
+_STATIC(rtas_restore_regs)
+	/* relocation is on at this point */
+	REST_GPR(2, r1)			/* Restore the TOC */
+	REST_GPR(13, r1)		/* Restore paca */
+	REST_8GPRS(14, r1)		/* Restore the non-volatiles */
+	REST_10GPRS(22, r1)		/* ditto */
+
+	mfspr	r13,SPRG3
+
+	ld	r4,_CCR(r1)
+	mtcr	r4
+	ld	r5,_CTR(r1)
+	mtctr	r5
+	ld	r6,_XER(r1)
+	mtspr	XER,r6
+	ld	r7,_DAR(r1)
+	mtdar	r7
+	ld	r8,_DSISR(r1)
+	mtdsisr	r8
+	ld	r9,_SRR0(r1)
+	mtsrr0	r9
+	ld	r10,_SRR1(r1)
+	mtsrr1	r10
+
+        addi	r1,r1,RTAS_FRAME_SIZE	/* Unstack our frame */
+	ld	r0,16(r1)		/* get return address */
+
+	mtlr    r0
+        blr				/* return to caller */
+
+#endif /* CONFIG_PPC_RTAS */
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+
+_GLOBAL(enter_prom)
+	mflr	r0
+	std	r0,16(r1)
+        stdu	r1,-PROM_FRAME_SIZE(r1)	/* Save SP and create stack space */
+
+	/* Because PROM is running in 32b mode, it clobbers the high order half
+	 * of all registers that it saves.  We therefore save those registers
+	 * PROM might touch to the stack.  (r0, r3-r13 are caller saved)
+   	 */
+	SAVE_8GPRS(2, r1)
+	SAVE_GPR(13, r1)
+	SAVE_8GPRS(14, r1)
+	SAVE_10GPRS(22, r1)
+	mfcr	r4
+	std	r4,_CCR(r1)
+	mfctr	r5
+	std	r5,_CTR(r1)
+	mfspr	r6,XER
+	std	r6,_XER(r1)
+	mfdar	r7
+	std	r7,_DAR(r1)
+	mfdsisr	r8
+	std	r8,_DSISR(r1)
+	mfsrr0	r9
+	std	r9,_SRR0(r1)
+	mfsrr1	r10
+	std	r10,_SRR1(r1)
+	mfmsr	r11
+	std	r11,_MSR(r1)
+
+	/* Get the PROM entrypoint */
+	ld	r0,GPR4(r1)
+	mtlr	r0
+
+	/* Switch MSR to 32 bits mode
+	 */
+        mfmsr   r11
+        li      r12,1
+        rldicr  r12,r12,MSR_SF_LG,(63-MSR_SF_LG)
+        andc    r11,r11,r12
+        li      r12,1
+        rldicr  r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG)
+        andc    r11,r11,r12
+        mtmsrd  r11
+        isync
+
+	/* Restore arguments & enter PROM here... */
+	ld	r3,GPR3(r1)
+	blrl
+
+	/* Just make sure that r1 top 32 bits didn't get
+	 * corrupt by OF
+	 */
+	rldicl	r1,r1,0,32
+
+	/* Restore the MSR (back to 64 bits) */
+	ld	r0,_MSR(r1)
+	mtmsrd	r0
+        isync
+
+	/* Restore other registers */
+	REST_GPR(2, r1)
+	REST_GPR(13, r1)
+	REST_8GPRS(14, r1)
+	REST_10GPRS(22, r1)
+	ld	r4,_CCR(r1)
+	mtcr	r4
+	ld	r5,_CTR(r1)
+	mtctr	r5
+	ld	r6,_XER(r1)
+	mtspr	XER,r6
+	ld	r7,_DAR(r1)
+	mtdar	r7
+	ld	r8,_DSISR(r1)
+	mtdsisr	r8
+	ld	r9,_SRR0(r1)
+	mtsrr0	r9
+	ld	r10,_SRR1(r1)
+	mtsrr1	r10
+	
+        addi	r1,r1,PROM_FRAME_SIZE
+	ld	r0,16(r1)
+	mtlr    r0
+        blr
+	
+#endif	/* CONFIG_PPC_MULTIPLATFORM */
diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S
new file mode 100644
index 00000000000..fe05f3fbf9d
--- /dev/null
+++ b/arch/ppc64/kernel/head.S
@@ -0,0 +1,2139 @@
+/*
+ *  arch/ppc64/kernel/head.S
+ *
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *  Adapted for Power Macintosh by Paul Mackerras.
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *
+ *  Adapted for 64bit PowerPC by Dave Engebretsen, Peter Bergner, and
+ *    Mike Corrigan {engebret|bergner|mikejc}@us.ibm.com
+ *
+ *  This file contains the low-level support and setup for the
+ *  PowerPC-64 platform, including trap and interrupt dispatch.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#define SECONDARY_PROCESSORS
+
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/naca.h>
+#include <asm/systemcfg.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/bug.h>
+#include <asm/cputable.h>
+#include <asm/setup.h>
+#include <asm/hvcall.h>
+
+#ifdef CONFIG_PPC_ISERIES
+#define DO_SOFT_DISABLE
+#endif
+
+/*
+ * hcall interface to pSeries LPAR
+ */
+#define H_SET_ASR	0x30
+
+/*
+ * We layout physical memory as follows:
+ * 0x0000 - 0x00ff : Secondary processor spin code
+ * 0x0100 - 0x2fff : pSeries Interrupt prologs
+ * 0x3000 - 0x3fff : Interrupt support
+ * 0x4000 - 0x4fff : NACA
+ * 0x6000	   : iSeries and common interrupt prologs
+ * 0x9000 - 0x9fff : Initial segment table
+ */
+
+/*
+ *   SPRG Usage
+ *
+ *   Register	Definition
+ *
+ *   SPRG0	reserved for hypervisor
+ *   SPRG1	temp - used to save gpr
+ *   SPRG2	temp - used to save gpr
+ *   SPRG3	virt addr of paca
+ */
+
+/*
+ * Entering into this code we make the following assumptions:
+ *  For pSeries:
+ *   1. The MMU is off & open firmware is running in real mode.
+ *   2. The kernel is entered at __start
+ *
+ *  For iSeries:
+ *   1. The MMU is on (as it always is for iSeries)
+ *   2. The kernel is entered at system_reset_iSeries
+ */
+
+	.text
+	.globl  _stext
+_stext:
+#ifdef CONFIG_PPC_MULTIPLATFORM
+_GLOBAL(__start)
+	/* NOP this out unconditionally */
+BEGIN_FTR_SECTION
+	b .__start_initialization_multiplatform
+END_FTR_SECTION(0, 1)
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+	/* Catch branch to 0 in real mode */
+	trap
+#ifdef CONFIG_PPC_ISERIES
+	/*
+	 * At offset 0x20, there is a pointer to iSeries LPAR data.
+	 * This is required by the hypervisor
+	 */
+	. = 0x20
+	.llong hvReleaseData-KERNELBASE
+
+	/*
+	 * At offset 0x28 and 0x30 are offsets to the msChunks
+	 * array (used by the iSeries LPAR debugger to do translation
+	 * between physical addresses and absolute addresses) and
+	 * to the pidhash table (also used by the debugger)
+	 */
+	.llong msChunks-KERNELBASE
+	.llong 0	/* pidhash-KERNELBASE SFRXXX */
+
+	/* Offset 0x38 - Pointer to start of embedded System.map */
+	.globl	embedded_sysmap_start
+embedded_sysmap_start:
+	.llong	0
+	/* Offset 0x40 - Pointer to end of embedded System.map */
+	.globl	embedded_sysmap_end
+embedded_sysmap_end:
+	.llong	0
+
+#else /* CONFIG_PPC_ISERIES */
+
+	/* Secondary processors spin on this value until it goes to 1. */
+	.globl  __secondary_hold_spinloop
+__secondary_hold_spinloop:
+	.llong	0x0
+
+	/* Secondary processors write this value with their cpu # */
+	/* after they enter the spin loop immediately below.	  */
+	.globl	__secondary_hold_acknowledge
+__secondary_hold_acknowledge:
+	.llong	0x0
+
+	. = 0x60
+/*
+ * The following code is used on pSeries to hold secondary processors
+ * in a spin loop after they have been freed from OpenFirmware, but
+ * before the bulk of the kernel has been relocated.  This code
+ * is relocated to physical address 0x60 before prom_init is run.
+ * All of it must fit below the first exception vector at 0x100.
+ */
+_GLOBAL(__secondary_hold)
+	mfmsr	r24
+	ori	r24,r24,MSR_RI
+	mtmsrd	r24			/* RI on */
+
+	/* Grab our linux cpu number */
+	mr	r24,r3
+
+	/* Tell the master cpu we're here */
+	/* Relocation is off & we are located at an address less */
+	/* than 0x100, so only need to grab low order offset.    */
+	std	r24,__secondary_hold_acknowledge@l(0)
+	sync
+
+	/* All secondary cpu's wait here until told to start. */
+100:	ld	r4,__secondary_hold_spinloop@l(0)
+	cmpdi	0,r4,1
+	bne	100b
+
+#ifdef CONFIG_HMT
+	b	.hmt_init
+#else
+#ifdef CONFIG_SMP
+	mr	r3,r24
+	b	.pSeries_secondary_smp_init
+#else
+	BUG_OPCODE
+#endif
+#endif
+#endif
+
+/* This value is used to mark exception frames on the stack. */
+	.section ".toc","aw"
+exception_marker:
+	.tc	ID_72656773_68657265[TC],0x7265677368657265
+	.text
+
+/*
+ * The following macros define the code that appears as
+ * the prologue to each of the exception handlers.  They
+ * are split into two parts to allow a single kernel binary
+ * to be used for pSeries and iSeries.
+ * LOL.  One day... - paulus
+ */
+
+/*
+ * We make as much of the exception code common between native
+ * exception handlers (including pSeries LPAR) and iSeries LPAR
+ * implementations as possible.
+ */
+
+/*
+ * This is the start of the interrupt handlers for pSeries
+ * This code runs with relocation off.
+ */
+#define EX_R9		0
+#define EX_R10		8
+#define EX_R11		16
+#define EX_R12		24
+#define EX_R13		32
+#define EX_SRR0		40
+#define EX_R3		40	/* SLB miss saves R3, but not SRR0 */
+#define EX_DAR		48
+#define EX_LR		48	/* SLB miss saves LR, but not DAR */
+#define EX_DSISR	56
+#define EX_CCR		60
+
+#define EXCEPTION_PROLOG_PSERIES(area, label)				\
+	mfspr	r13,SPRG3;		/* get paca address into r13 */	\
+	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
+	std	r10,area+EX_R10(r13);					\
+	std	r11,area+EX_R11(r13);					\
+	std	r12,area+EX_R12(r13);					\
+	mfspr	r9,SPRG1;						\
+	std	r9,area+EX_R13(r13);					\
+	mfcr	r9;							\
+	clrrdi	r12,r13,32;		/* get high part of &label */	\
+	mfmsr	r10;							\
+	mfspr	r11,SRR0;		/* save SRR0 */			\
+	ori	r12,r12,(label)@l;	/* virt addr of handler */	\
+	ori	r10,r10,MSR_IR|MSR_DR|MSR_RI;				\
+	mtspr	SRR0,r12;						\
+	mfspr	r12,SRR1;		/* and SRR1 */			\
+	mtspr	SRR1,r10;						\
+	rfid;								\
+	b	.	/* prevent speculative execution */
+
+/*
+ * This is the start of the interrupt handlers for iSeries
+ * This code runs with relocation on.
+ */
+#define EXCEPTION_PROLOG_ISERIES_1(area)				\
+	mfspr	r13,SPRG3;		/* get paca address into r13 */	\
+	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
+	std	r10,area+EX_R10(r13);					\
+	std	r11,area+EX_R11(r13);					\
+	std	r12,area+EX_R12(r13);					\
+	mfspr	r9,SPRG1;						\
+	std	r9,area+EX_R13(r13);					\
+	mfcr	r9
+
+#define EXCEPTION_PROLOG_ISERIES_2					\
+	mfmsr	r10;							\
+	ld	r11,PACALPPACA+LPPACASRR0(r13);				\
+	ld	r12,PACALPPACA+LPPACASRR1(r13);				\
+	ori	r10,r10,MSR_RI;						\
+	mtmsrd	r10,1
+
+/*
+ * The common exception prolog is used for all except a few exceptions
+ * such as a segment miss on a kernel address.  We have to be prepared
+ * to take another exception from the point where we first touch the
+ * kernel stack onwards.
+ *
+ * On entry r13 points to the paca, r9-r13 are saved in the paca,
+ * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
+ * SRR1, and relocation is on.
+ */
+#define EXCEPTION_PROLOG_COMMON(n, area)				   \
+	andi.	r10,r12,MSR_PR;		/* See if coming from user	*/ \
+	mr	r10,r1;			/* Save r1			*/ \
+	subi	r1,r1,INT_FRAME_SIZE;	/* alloc frame on kernel stack	*/ \
+	beq-	1f;							   \
+	ld	r1,PACAKSAVE(r13);	/* kernel stack to use		*/ \
+1:	cmpdi	cr1,r1,0;		/* check if r1 is in userspace	*/ \
+	bge-	cr1,bad_stack;		/* abort if it is		*/ \
+	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
+	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
+	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
+	std	r10,0(r1);		/* make stack chain pointer	*/ \
+	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
+	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
+	std	r2,GPR2(r1);		/* save r2 in stackframe	*/ \
+	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe	*/ \
+	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe	*/ \
+	ld	r9,area+EX_R9(r13);	/* move r9, r10 to stackframe	*/ \
+	ld	r10,area+EX_R10(r13);					   \
+	std	r9,GPR9(r1);						   \
+	std	r10,GPR10(r1);						   \
+	ld	r9,area+EX_R11(r13);	/* move r11 - r13 to stackframe	*/ \
+	ld	r10,area+EX_R12(r13);					   \
+	ld	r11,area+EX_R13(r13);					   \
+	std	r9,GPR11(r1);						   \
+	std	r10,GPR12(r1);						   \
+	std	r11,GPR13(r1);						   \
+	ld	r2,PACATOC(r13);	/* get kernel TOC into r2	*/ \
+	mflr	r9;			/* save LR in stackframe	*/ \
+	std	r9,_LINK(r1);						   \
+	mfctr	r10;			/* save CTR in stackframe	*/ \
+	std	r10,_CTR(r1);						   \
+	mfspr	r11,XER;		/* save XER in stackframe	*/ \
+	std	r11,_XER(r1);						   \
+	li	r9,(n)+1;						   \
+	std	r9,_TRAP(r1);		/* set trap number		*/ \
+	li	r10,0;							   \
+	ld	r11,exception_marker@toc(r2);				   \
+	std	r10,RESULT(r1);		/* clear regs->result		*/ \
+	std	r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame	*/
+
+/*
+ * Exception vectors.
+ */
+#define STD_EXCEPTION_PSERIES(n, label)			\
+	. = n;						\
+	.globl label##_pSeries;				\
+label##_pSeries:					\
+	HMT_MEDIUM;					\
+	mtspr	SPRG1,r13;		/* save r13 */	\
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common)
+
+#define STD_EXCEPTION_ISERIES(n, label, area)		\
+	.globl label##_iSeries;				\
+label##_iSeries:					\
+	HMT_MEDIUM;					\
+	mtspr	SPRG1,r13;		/* save r13 */	\
+	EXCEPTION_PROLOG_ISERIES_1(area);		\
+	EXCEPTION_PROLOG_ISERIES_2;			\
+	b	label##_common
+
+#define MASKABLE_EXCEPTION_ISERIES(n, label)				\
+	.globl label##_iSeries;						\
+label##_iSeries:							\
+	HMT_MEDIUM;							\
+	mtspr	SPRG1,r13;		/* save r13 */			\
+	EXCEPTION_PROLOG_ISERIES_1(PACA_EXGEN);				\
+	lbz	r10,PACAPROCENABLED(r13);				\
+	cmpwi	0,r10,0;						\
+	beq-	label##_iSeries_masked;					\
+	EXCEPTION_PROLOG_ISERIES_2;					\
+	b	label##_common;						\
+
+#ifdef DO_SOFT_DISABLE
+#define DISABLE_INTS				\
+	lbz	r10,PACAPROCENABLED(r13);	\
+	li	r11,0;				\
+	std	r10,SOFTE(r1);			\
+	mfmsr	r10;				\
+	stb	r11,PACAPROCENABLED(r13);	\
+	ori	r10,r10,MSR_EE;			\
+	mtmsrd	r10,1
+
+#define ENABLE_INTS				\
+	lbz	r10,PACAPROCENABLED(r13);	\
+	mfmsr	r11;				\
+	std	r10,SOFTE(r1);			\
+	ori	r11,r11,MSR_EE;			\
+	mtmsrd	r11,1
+
+#else	/* hard enable/disable interrupts */
+#define DISABLE_INTS
+
+#define ENABLE_INTS				\
+	ld	r12,_MSR(r1);			\
+	mfmsr	r11;				\
+	rlwimi	r11,r12,0,MSR_EE;		\
+	mtmsrd	r11,1
+
+#endif
+
+#define STD_EXCEPTION_COMMON(trap, label, hdlr)		\
+	.align	7;					\
+	.globl label##_common;				\
+label##_common:						\
+	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);	\
+	DISABLE_INTS;					\
+	bl	.save_nvgprs;				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
+	bl	hdlr;					\
+	b	.ret_from_except
+
+#define STD_EXCEPTION_COMMON_LITE(trap, label, hdlr)	\
+	.align	7;					\
+	.globl label##_common;				\
+label##_common:						\
+	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);	\
+	DISABLE_INTS;					\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
+	bl	hdlr;					\
+	b	.ret_from_except_lite
+
+/*
+ * Start of pSeries system interrupt routines
+ */
+	. = 0x100
+	.globl __start_interrupts
+__start_interrupts:
+
+	STD_EXCEPTION_PSERIES(0x100, system_reset)
+
+	. = 0x200
+_machine_check_pSeries:
+	HMT_MEDIUM
+	mtspr	SPRG1,r13		/* save r13 */
+	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
+
+	. = 0x300
+	.globl data_access_pSeries
+data_access_pSeries:
+	HMT_MEDIUM
+	mtspr	SPRG1,r13
+BEGIN_FTR_SECTION
+	mtspr	SPRG2,r12
+	mfspr	r13,DAR
+	mfspr	r12,DSISR
+	srdi	r13,r13,60
+	rlwimi	r13,r12,16,0x20
+	mfcr	r12
+	cmpwi	r13,0x2c
+	beq	.do_stab_bolted_pSeries
+	mtcrf	0x80,r12
+	mfspr	r12,SPRG2
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common)
+
+	. = 0x380
+	.globl data_access_slb_pSeries
+data_access_slb_pSeries:
+	HMT_MEDIUM
+	mtspr	SPRG1,r13
+	mfspr	r13,SPRG3		/* get paca address into r13 */
+	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r9,SPRG1
+	std	r9,PACA_EXSLB+EX_R13(r13)
+	mfcr	r9
+	mfspr	r12,SRR1		/* and SRR1 */
+	mfspr	r3,DAR
+	b	.do_slb_miss		/* Rel. branch works in real mode */
+
+	STD_EXCEPTION_PSERIES(0x400, instruction_access)
+
+	. = 0x480
+	.globl instruction_access_slb_pSeries
+instruction_access_slb_pSeries:
+	HMT_MEDIUM
+	mtspr	SPRG1,r13
+	mfspr	r13,SPRG3		/* get paca address into r13 */
+	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r9,SPRG1
+	std	r9,PACA_EXSLB+EX_R13(r13)
+	mfcr	r9
+	mfspr	r12,SRR1		/* and SRR1 */
+	mfspr	r3,SRR0			/* SRR0 is faulting address */
+	b	.do_slb_miss		/* Rel. branch works in real mode */
+
+	STD_EXCEPTION_PSERIES(0x500, hardware_interrupt)
+	STD_EXCEPTION_PSERIES(0x600, alignment)
+	STD_EXCEPTION_PSERIES(0x700, program_check)
+	STD_EXCEPTION_PSERIES(0x800, fp_unavailable)
+	STD_EXCEPTION_PSERIES(0x900, decrementer)
+	STD_EXCEPTION_PSERIES(0xa00, trap_0a)
+	STD_EXCEPTION_PSERIES(0xb00, trap_0b)
+
+	. = 0xc00
+	.globl	system_call_pSeries
+system_call_pSeries:
+	HMT_MEDIUM
+	mr	r9,r13
+	mfmsr	r10
+	mfspr	r13,SPRG3
+	mfspr	r11,SRR0
+	clrrdi	r12,r13,32
+	oris	r12,r12,system_call_common@h
+	ori	r12,r12,system_call_common@l
+	mtspr	SRR0,r12
+	ori	r10,r10,MSR_IR|MSR_DR|MSR_RI
+	mfspr	r12,SRR1
+	mtspr	SRR1,r10
+	rfid
+	b	.	/* prevent speculative execution */
+
+	STD_EXCEPTION_PSERIES(0xd00, single_step)
+	STD_EXCEPTION_PSERIES(0xe00, trap_0e)
+
+	/* We need to deal with the Altivec unavailable exception
+	 * here which is at 0xf20, thus in the middle of the
+	 * prolog code of the PerformanceMonitor one. A little
+	 * trickery is thus necessary
+	 */
+	. = 0xf00
+	b	performance_monitor_pSeries
+
+	STD_EXCEPTION_PSERIES(0xf20, altivec_unavailable)
+
+	STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint)
+	STD_EXCEPTION_PSERIES(0x1700, altivec_assist)
+
+	/* moved from 0xf00 */
+	STD_EXCEPTION_PSERIES(0x3000, performance_monitor)
+
+	. = 0x3100
+_GLOBAL(do_stab_bolted_pSeries)
+	mtcrf	0x80,r12
+	mfspr	r12,SPRG2
+	EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted)
+
+	
+	/* Space for the naca.  Architected to be located at real address
+	 * NACA_PHYS_ADDR.  Various tools rely on this location being fixed.
+	 * The first dword of the naca is required by iSeries LPAR to
+	 * point to itVpdAreas.  On pSeries native, this value is not used.
+	 */
+	. = NACA_PHYS_ADDR
+	.globl __end_interrupts
+__end_interrupts:
+#ifdef CONFIG_PPC_ISERIES
+	.globl naca
+naca:
+	.llong itVpdAreas
+
+	/*
+	 * The iSeries LPAR map is at this fixed address
+	 * so that the HvReleaseData structure can address
+	 * it with a 32-bit offset.
+	 *
+	 * The VSID values below are dependent on the
+	 * VSID generation algorithm.  See include/asm/mmu_context.h.
+	 */
+
+	. = 0x4800
+
+	.llong	2		/* # ESIDs to be mapped by hypervisor	 */
+	.llong	1		/* # memory ranges to be mapped by hypervisor */
+	.llong	STAB0_PAGE	/* Page # of segment table within load area	*/
+	.llong	0		/* Reserved */
+	.llong	0		/* Reserved */
+	.llong	0		/* Reserved */
+	.llong	0		/* Reserved */
+	.llong	0		/* Reserved */
+	.llong	(KERNELBASE>>SID_SHIFT)
+	.llong	0x408f92c94	/* KERNELBASE VSID */
+	/* We have to list the bolted VMALLOC segment here, too, so that it
+	 * will be restored on shared processor switch */
+	.llong	(VMALLOCBASE>>SID_SHIFT)
+	.llong	0xf09b89af5	/* VMALLOCBASE VSID */
+	.llong	8192		/* # pages to map (32 MB) */
+	.llong	0		/* Offset from start of loadarea to start of map */
+	.llong	0x408f92c940000	/* VPN of first page to map */
+
+	. = 0x6100
+
+/***  ISeries-LPAR interrupt handlers ***/
+
+	STD_EXCEPTION_ISERIES(0x200, machine_check, PACA_EXMC)
+
+	.globl data_access_iSeries
+data_access_iSeries:
+	mtspr	SPRG1,r13
+BEGIN_FTR_SECTION
+	mtspr	SPRG2,r12
+	mfspr	r13,DAR
+	mfspr	r12,DSISR
+	srdi	r13,r13,60
+	rlwimi	r13,r12,16,0x20
+	mfcr	r12
+	cmpwi	r13,0x2c
+	beq	.do_stab_bolted_iSeries
+	mtcrf	0x80,r12
+	mfspr	r12,SPRG2
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+	EXCEPTION_PROLOG_ISERIES_1(PACA_EXGEN)
+	EXCEPTION_PROLOG_ISERIES_2
+	b	data_access_common
+
+.do_stab_bolted_iSeries:
+	mtcrf	0x80,r12
+	mfspr	r12,SPRG2
+	EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB)
+	EXCEPTION_PROLOG_ISERIES_2
+	b	.do_stab_bolted
+
+	.globl	data_access_slb_iSeries
+data_access_slb_iSeries:
+	mtspr	SPRG1,r13		/* save r13 */
+	EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	ld	r12,PACALPPACA+LPPACASRR1(r13)
+	mfspr	r3,DAR
+	b	.do_slb_miss
+
+	STD_EXCEPTION_ISERIES(0x400, instruction_access, PACA_EXGEN)
+
+	.globl	instruction_access_slb_iSeries
+instruction_access_slb_iSeries:
+	mtspr	SPRG1,r13		/* save r13 */
+	EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	ld	r12,PACALPPACA+LPPACASRR1(r13)
+	ld	r3,PACALPPACA+LPPACASRR0(r13)
+	b	.do_slb_miss
+
+	MASKABLE_EXCEPTION_ISERIES(0x500, hardware_interrupt)
+	STD_EXCEPTION_ISERIES(0x600, alignment, PACA_EXGEN)
+	STD_EXCEPTION_ISERIES(0x700, program_check, PACA_EXGEN)
+	STD_EXCEPTION_ISERIES(0x800, fp_unavailable, PACA_EXGEN)
+	MASKABLE_EXCEPTION_ISERIES(0x900, decrementer)
+	STD_EXCEPTION_ISERIES(0xa00, trap_0a, PACA_EXGEN)
+	STD_EXCEPTION_ISERIES(0xb00, trap_0b, PACA_EXGEN)
+
+	.globl	system_call_iSeries
+system_call_iSeries:
+	mr	r9,r13
+	mfspr	r13,SPRG3
+	EXCEPTION_PROLOG_ISERIES_2
+	b	system_call_common
+
+	STD_EXCEPTION_ISERIES( 0xd00, single_step, PACA_EXGEN)
+	STD_EXCEPTION_ISERIES( 0xe00, trap_0e, PACA_EXGEN)
+	STD_EXCEPTION_ISERIES( 0xf00, performance_monitor, PACA_EXGEN)
+
+	.globl system_reset_iSeries
+system_reset_iSeries:
+	mfspr	r13,SPRG3		/* Get paca address */
+	mfmsr	r24
+	ori	r24,r24,MSR_RI
+	mtmsrd	r24			/* RI on */
+	lhz	r24,PACAPACAINDEX(r13)	/* Get processor # */
+	cmpwi	0,r24,0			/* Are we processor 0? */
+	beq	.__start_initialization_iSeries	/* Start up the first processor */
+	mfspr	r4,CTRLF
+	li	r5,RUNLATCH		/* Turn off the run light */
+	andc	r4,r4,r5
+	mtspr	CTRLT,r4
+
+1:
+	HMT_LOW
+#ifdef CONFIG_SMP
+	lbz	r23,PACAPROCSTART(r13)	/* Test if this processor
+					 * should start */
+	sync
+	LOADADDR(r3,current_set)
+	sldi	r28,r24,3		/* get current_set[cpu#] */
+	ldx	r3,r3,r28
+	addi	r1,r3,THREAD_SIZE
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	cmpwi	0,r23,0
+	beq	iSeries_secondary_smp_loop	/* Loop until told to go */
+#ifdef SECONDARY_PROCESSORS
+	bne	.__secondary_start		/* Loop until told to go */
+#endif
+iSeries_secondary_smp_loop:
+	/* Let the Hypervisor know we are alive */
+	/* 8002 is a call to HvCallCfg::getLps, a harmless Hypervisor function */
+	lis	r3,0x8002
+	rldicr	r3,r3,32,15		/* r0 = (r3 << 32) & 0xffff000000000000 */
+#else /* CONFIG_SMP */
+	/* Yield the processor.  This is required for non-SMP kernels
+		which are running on multi-threaded machines. */
+	lis	r3,0x8000
+	rldicr	r3,r3,32,15		/* r3 = (r3 << 32) & 0xffff000000000000 */
+	addi	r3,r3,18		/* r3 = 0x8000000000000012 which is "yield" */
+	li	r4,0			/* "yield timed" */
+	li	r5,-1			/* "yield forever" */
+#endif /* CONFIG_SMP */
+	li	r0,-1			/* r0=-1 indicates a Hypervisor call */
+	sc				/* Invoke the hypervisor via a system call */
+	mfspr	r13,SPRG3		/* Put r13 back ???? */
+	b	1b			/* If SMP not configured, secondaries
+					 * loop forever */
+
+	.globl decrementer_iSeries_masked
+decrementer_iSeries_masked:
+	li	r11,1
+	stb	r11,PACALPPACA+LPPACADECRINT(r13)
+	lwz	r12,PACADEFAULTDECR(r13)
+	mtspr	SPRN_DEC,r12
+	/* fall through */
+
+	.globl hardware_interrupt_iSeries_masked
+hardware_interrupt_iSeries_masked:
+	mtcrf	0x80,r9		/* Restore regs */
+	ld	r11,PACALPPACA+LPPACASRR0(r13)
+	ld	r12,PACALPPACA+LPPACASRR1(r13)
+	mtspr	SRR0,r11
+	mtspr	SRR1,r12
+	ld	r9,PACA_EXGEN+EX_R9(r13)
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	ld	r12,PACA_EXGEN+EX_R12(r13)
+	ld	r13,PACA_EXGEN+EX_R13(r13)
+	rfid
+	b	.	/* prevent speculative execution */
+#endif
+
+/*
+ * Data area reserved for FWNMI option.
+ */
+	.= 0x7000
+	.globl fwnmi_data_area
+fwnmi_data_area:
+
+/*
+ * Vectors for the FWNMI option.  Share common code.
+ */
+	. = 0x8000
+	.globl system_reset_fwnmi
+system_reset_fwnmi:
+	HMT_MEDIUM
+	mtspr	SPRG1,r13		/* save r13 */
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common)
+	.globl machine_check_fwnmi
+machine_check_fwnmi:
+	HMT_MEDIUM
+	mtspr	SPRG1,r13		/* save r13 */
+	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
+
+	/*
+	 * Space for the initial segment table
+	 * For LPAR, the hypervisor must fill in at least one entry
+	 * before we get control (with relocate on)
+	 */
+	. = STAB0_PHYS_ADDR
+	.globl __start_stab
+__start_stab:
+
+	. = (STAB0_PHYS_ADDR + PAGE_SIZE)
+	.globl __end_stab
+__end_stab:
+
+
+/*** Common interrupt handlers ***/
+
+	STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception)
+
+	/*
+	 * Machine check is different because we use a different
+	 * save area: PACA_EXMC instead of PACA_EXGEN.
+	 */
+	.align	7
+	.globl machine_check_common
+machine_check_common:
+	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
+	DISABLE_INTS
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.machine_check_exception
+	b	.ret_from_except
+
+	STD_EXCEPTION_COMMON_LITE(0x900, decrementer, .timer_interrupt)
+	STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception)
+	STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception)
+	STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception)
+	STD_EXCEPTION_COMMON(0xe00, trap_0e, .unknown_exception)
+	STD_EXCEPTION_COMMON(0xf00, performance_monitor, .performance_monitor_exception)
+	STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception)
+#ifdef CONFIG_ALTIVEC
+	STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception)
+#else
+	STD_EXCEPTION_COMMON(0x1700, altivec_assist, .unknown_exception)
+#endif
+
+/*
+ * Here we have detected that the kernel stack pointer is bad.
+ * R9 contains the saved CR, r13 points to the paca,
+ * r10 contains the (bad) kernel stack pointer,
+ * r11 and r12 contain the saved SRR0 and SRR1.
+ * We switch to using the paca guard page as an emergency stack,
+ * save the registers there, and call kernel_bad_stack(), which panics.
+ */
+bad_stack:
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,64+INT_FRAME_SIZE
+	std	r9,_CCR(r1)
+	std	r10,GPR1(r1)
+	std	r11,_NIP(r1)
+	std	r12,_MSR(r1)
+	mfspr	r11,DAR
+	mfspr	r12,DSISR
+	std	r11,_DAR(r1)
+	std	r12,_DSISR(r1)
+	mflr	r10
+	mfctr	r11
+	mfxer	r12
+	std	r10,_LINK(r1)
+	std	r11,_CTR(r1)
+	std	r12,_XER(r1)
+	SAVE_GPR(0,r1)
+	SAVE_GPR(2,r1)
+	SAVE_4GPRS(3,r1)
+	SAVE_2GPRS(7,r1)
+	SAVE_10GPRS(12,r1)
+	SAVE_10GPRS(22,r1)
+	addi	r11,r1,INT_FRAME_SIZE
+	std	r11,0(r1)
+	li	r12,0
+	std	r12,0(r11)
+	ld	r2,PACATOC(r13)
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.kernel_bad_stack
+	b	1b
+
+/*
+ * Return from an exception with minimal checks.
+ * The caller is assumed to have done EXCEPTION_PROLOG_COMMON.
+ * If interrupts have been enabled, or anything has been
+ * done that might have changed the scheduling status of
+ * any task or sent any task a signal, you should use
+ * ret_from_except or ret_from_except_lite instead of this.
+ */
+fast_exception_return:
+	ld	r12,_MSR(r1)
+	ld	r11,_NIP(r1)
+	andi.	r3,r12,MSR_RI		/* check if RI is set */
+	beq-	unrecov_fer
+	ld	r3,_CCR(r1)
+	ld	r4,_LINK(r1)
+	ld	r5,_CTR(r1)
+	ld	r6,_XER(r1)
+	mtcr	r3
+	mtlr	r4
+	mtctr	r5
+	mtxer	r6
+	REST_GPR(0, r1)
+	REST_8GPRS(2, r1)
+
+	mfmsr	r10
+	clrrdi	r10,r10,2		/* clear RI (LE is 0 already) */
+	mtmsrd	r10,1
+
+	mtspr	SRR1,r12
+	mtspr	SRR0,r11
+	REST_4GPRS(10, r1)
+	ld	r1,GPR1(r1)
+	rfid
+	b	.	/* prevent speculative execution */
+
+unrecov_fer:
+	bl	.save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
+/*
+ * Here r13 points to the paca, r9 contains the saved CR,
+ * SRR0 and SRR1 are saved in r11 and r12,
+ * r9 - r13 are saved in paca->exgen.
+ */
+	.align	7
+	.globl data_access_common
+data_access_common:
+	mfspr	r10,DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	li	r5,0x300
+	b	.do_hash_page	 	/* Try to handle as hpte fault */
+
+	.align	7
+	.globl instruction_access_common
+instruction_access_common:
+	EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
+	ld	r3,_NIP(r1)
+	andis.	r4,r12,0x5820
+	li	r5,0x400
+	b	.do_hash_page		/* Try to handle as hpte fault */
+
+	.align	7
+	.globl hardware_interrupt_common
+	.globl hardware_interrupt_entry
+hardware_interrupt_common:
+	EXCEPTION_PROLOG_COMMON(0x500, PACA_EXGEN)
+hardware_interrupt_entry:
+	DISABLE_INTS
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_IRQ
+	b	.ret_from_except_lite
+
+	.align	7
+	.globl alignment_common
+alignment_common:
+	mfspr	r10,DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN)
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ENABLE_INTS
+	bl	.alignment_exception
+	b	.ret_from_except
+
+	.align	7
+	.globl program_check_common
+program_check_common:
+	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ENABLE_INTS
+	bl	.program_check_exception
+	b	.ret_from_except
+
+	.align	7
+	.globl fp_unavailable_common
+fp_unavailable_common:
+	EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
+	bne	.load_up_fpu		/* if from user, just load it up */
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ENABLE_INTS
+	bl	.kernel_fp_unavailable_exception
+	BUG_OPCODE
+
+	.align	7
+	.globl altivec_unavailable_common
+altivec_unavailable_common:
+	EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN)
+#ifdef CONFIG_ALTIVEC
+	bne	.load_up_altivec	/* if from user, just load it up */
+#endif
+	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ENABLE_INTS
+	bl	.altivec_unavailable_exception
+	b	.ret_from_except
+
+/*
+ * Hash table stuff
+ */
+	.align	7
+_GLOBAL(do_hash_page)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+
+	andis.	r0,r4,0xa450		/* weird error? */
+	bne-	.handle_page_fault	/* if not, try to insert a HPTE */
+BEGIN_FTR_SECTION
+	andis.	r0,r4,0x0020		/* Is it a segment table fault? */
+	bne-	.do_ste_alloc		/* If so handle it */
+END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+
+	/*
+	 * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
+	 * accessing a userspace segment (even from the kernel). We assume
+	 * kernel addresses always have the high bit set.
+	 */
+	rlwinm	r4,r4,32-25+9,31-9,31-9	/* DSISR_STORE -> _PAGE_RW */
+	rotldi	r0,r3,15		/* Move high bit into MSR_PR posn */
+	orc	r0,r12,r0		/* MSR_PR | ~high_bit */
+	rlwimi	r4,r0,32-13,30,30	/* becomes _PAGE_USER access bit */
+	ori	r4,r4,1			/* add _PAGE_PRESENT */
+	rlwimi	r4,r5,22+2,31-2,31-2	/* Set _PAGE_EXEC if trap is 0x400 */
+
+	/*
+	 * On iSeries, we soft-disable interrupts here, then
+	 * hard-enable interrupts so that the hash_page code can spin on
+	 * the hash_table_lock without problems on a shared processor.
+	 */
+	DISABLE_INTS
+
+	/*
+	 * r3 contains the faulting address
+	 * r4 contains the required access permissions
+	 * r5 contains the trap number
+	 *
+	 * at return r3 = 0 for success
+	 */
+	bl	.hash_page		/* build HPTE if possible */
+	cmpdi	r3,0			/* see if hash_page succeeded */
+
+#ifdef DO_SOFT_DISABLE
+	/*
+	 * If we had interrupts soft-enabled at the point where the
+	 * DSI/ISI occurred, and an interrupt came in during hash_page,
+	 * handle it now.
+	 * We jump to ret_from_except_lite rather than fast_exception_return
+	 * because ret_from_except_lite will check for and handle pending
+	 * interrupts if necessary.
+	 */
+	beq	.ret_from_except_lite
+	/* For a hash failure, we don't bother re-enabling interrupts */
+	ble-	12f
+
+	/*
+	 * hash_page couldn't handle it, set soft interrupt enable back
+	 * to what it was before the trap.  Note that .local_irq_restore
+	 * handles any interrupts pending at this point.
+	 */
+	ld	r3,SOFTE(r1)
+	bl	.local_irq_restore
+	b	11f
+#else
+	beq	fast_exception_return   /* Return from exception on success */
+	ble-	12f			/* Failure return from hash_page */
+
+	/* fall through */
+#endif
+
+/* Here we have a page fault that hash_page can't handle. */
+_GLOBAL(handle_page_fault)
+	ENABLE_INTS
+11:	ld	r4,_DAR(r1)
+	ld	r5,_DSISR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_page_fault
+	cmpdi	r3,0
+	beq+	.ret_from_except_lite
+	bl	.save_nvgprs
+	mr	r5,r3
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	lwz	r4,_DAR(r1)
+	bl	.bad_page_fault
+	b	.ret_from_except
+
+/* We have a page fault that hash_page could handle but HV refused
+ * the PTE insertion
+ */
+12:	bl	.save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	lwz	r4,_DAR(r1)
+	bl	.low_hash_fault
+	b	.ret_from_except
+
+	/* here we have a segment miss */
+_GLOBAL(do_ste_alloc)
+	bl	.ste_allocate		/* try to insert stab entry */
+	cmpdi	r3,0
+	beq+	fast_exception_return
+	b	.handle_page_fault
+
+/*
+ * r13 points to the PACA, r9 contains the saved CR,
+ * r11 and r12 contain the saved SRR0 and SRR1.
+ * r9 - r13 are saved in paca->exslb.
+ * We assume we aren't going to take any exceptions during this procedure.
+ * We assume (DAR >> 60) == 0xc.
+ */
+	.align	7
+_GLOBAL(do_stab_bolted)
+	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
+	std	r11,PACA_EXSLB+EX_SRR0(r13)	/* save SRR0 in exc. frame */
+
+	/* Hash to the primary group */
+	ld	r10,PACASTABVIRT(r13)
+	mfspr	r11,DAR
+	srdi	r11,r11,28
+	rldimi	r10,r11,7,52	/* r10 = first ste of the group */
+
+	/* Calculate VSID */
+	/* This is a kernel address, so protovsid = ESID */
+	ASM_VSID_SCRAMBLE(r11, r9)
+	rldic	r9,r11,12,16	/* r9 = vsid << 12 */
+
+	/* Search the primary group for a free entry */
+1:	ld	r11,0(r10)	/* Test valid bit of the current ste	*/
+	andi.	r11,r11,0x80
+	beq	2f
+	addi	r10,r10,16
+	andi.	r11,r10,0x70
+	bne	1b
+
+	/* Stick for only searching the primary group for now.		*/
+	/* At least for now, we use a very simple random castout scheme */
+	/* Use the TB as a random number ;  OR in 1 to avoid entry 0	*/
+	mftb	r11
+	rldic	r11,r11,4,57	/* r11 = (r11 << 4) & 0x70 */
+	ori	r11,r11,0x10
+
+	/* r10 currently points to an ste one past the group of interest */
+	/* make it point to the randomly selected entry			*/
+	subi	r10,r10,128
+	or 	r10,r10,r11	/* r10 is the entry to invalidate	*/
+
+	isync			/* mark the entry invalid		*/
+	ld	r11,0(r10)
+	rldicl	r11,r11,56,1	/* clear the valid bit */
+	rotldi	r11,r11,8
+	std	r11,0(r10)
+	sync
+
+	clrrdi	r11,r11,28	/* Get the esid part of the ste		*/
+	slbie	r11
+
+2:	std	r9,8(r10)	/* Store the vsid part of the ste	*/
+	eieio
+
+	mfspr	r11,DAR		/* Get the new esid			*/
+	clrrdi	r11,r11,28	/* Permits a full 32b of ESID		*/
+	ori	r11,r11,0x90	/* Turn on valid and kp			*/
+	std	r11,0(r10)	/* Put new entry back into the stab	*/
+
+	sync
+
+	/* All done -- return from exception. */
+	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+	ld	r11,PACA_EXSLB+EX_SRR0(r13)	/* get saved SRR0 */
+
+	andi.	r10,r12,MSR_RI
+	beq-	unrecov_slb
+
+	mtcrf	0x80,r9			/* restore CR */
+
+	mfmsr	r10
+	clrrdi	r10,r10,2
+	mtmsrd	r10,1
+
+	mtspr	SRR0,r11
+	mtspr	SRR1,r12
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	rfid
+	b	.	/* prevent speculative execution */
+
+/*
+ * r13 points to the PACA, r9 contains the saved CR,
+ * r11 and r12 contain the saved SRR0 and SRR1.
+ * r3 has the faulting address
+ * r9 - r13 are saved in paca->exslb.
+ * r3 is saved in paca->slb_r3
+ * We assume we aren't going to take any exceptions during this procedure.
+ */
+_GLOBAL(do_slb_miss)
+	mflr	r10
+
+	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
+	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
+
+	bl	.slb_allocate			/* handle it */
+
+	/* All done -- return from exception. */
+
+	ld	r10,PACA_EXSLB+EX_LR(r13)
+	ld	r3,PACA_EXSLB+EX_R3(r13)
+	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+#ifdef CONFIG_PPC_ISERIES
+	ld	r11,PACALPPACA+LPPACASRR0(r13)	/* get SRR0 value */
+#endif /* CONFIG_PPC_ISERIES */
+
+	mtlr	r10
+
+	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
+	beq-	unrecov_slb
+
+.machine	push
+.machine	"power4"
+	mtcrf	0x80,r9
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+.machine	pop
+
+#ifdef CONFIG_PPC_ISERIES
+	mtspr	SRR0,r11
+	mtspr	SRR1,r12
+#endif /* CONFIG_PPC_ISERIES */
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	rfid
+	b	.	/* prevent speculative execution */
+
+unrecov_slb:
+	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
+	DISABLE_INTS
+	bl	.save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
+
+/*
+ * On pSeries, secondary processors spin in the following code.
+ * At entry, r3 = this processor's number (physical cpu id)
+ */
+_GLOBAL(pSeries_secondary_smp_init)
+	mr	r24,r3
+	
+	/* turn on 64-bit mode */
+	bl	.enable_64b_mode
+	isync
+
+	/* Copy some CPU settings from CPU 0 */
+	bl	.__restore_cpu_setup
+
+	/* Set up a paca value for this processor. Since we have the
+	 * physical cpu id in r3, we need to search the pacas to find
+	 * which logical id maps to our physical one.
+	 */
+	LOADADDR(r13, paca) 		/* Get base vaddr of paca array	 */
+	li	r5,0			/* logical cpu id                */
+1:	lhz	r6,PACAHWCPUID(r13)	/* Load HW procid from paca      */
+	cmpw	r6,r24			/* Compare to our id             */
+	beq	2f
+	addi	r13,r13,PACA_SIZE	/* Loop to next PACA on miss     */
+	addi	r5,r5,1
+	cmpwi	r5,NR_CPUS
+	blt	1b
+
+99:	HMT_LOW				/* Couldn't find our CPU id      */
+	b	99b
+
+2:	mtspr	SPRG3,r13		/* Save vaddr of paca in SPRG3	 */
+	/* From now on, r24 is expected to be logica cpuid */
+	mr	r24,r5
+3:	HMT_LOW
+	lbz	r23,PACAPROCSTART(r13)	/* Test if this processor should */
+					/* start.			 */
+	sync
+
+	/* Create a temp kernel stack for use before relocation is on.	*/
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	cmpwi	0,r23,0
+#ifdef CONFIG_SMP
+#ifdef SECONDARY_PROCESSORS
+	bne	.__secondary_start
+#endif
+#endif
+	b 	3b			/* Loop until told to go	 */
+
+#ifdef CONFIG_PPC_ISERIES
+_STATIC(__start_initialization_iSeries)
+	/* Clear out the BSS */
+	LOADADDR(r11,__bss_stop)
+	LOADADDR(r8,__bss_start)
+	sub	r11,r11,r8		/* bss size			*/
+	addi	r11,r11,7		/* round up to an even double word */
+	rldicl. r11,r11,61,3		/* shift right by 3		*/
+	beq	4f
+	addi	r8,r8,-8
+	li	r0,0
+	mtctr	r11			/* zero this many doublewords	*/
+3:	stdu	r0,8(r8)
+	bdnz	3b
+4:
+	LOADADDR(r1,init_thread_union)
+	addi	r1,r1,THREAD_SIZE
+	li	r0,0
+	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
+
+	LOADADDR(r3,cpu_specs)
+	LOADADDR(r4,cur_cpu_spec)
+	li	r5,0
+	bl	.identify_cpu
+
+	LOADADDR(r2,__toc_start)
+	addi	r2,r2,0x4000
+	addi	r2,r2,0x4000
+
+	bl	.iSeries_early_setup
+
+	/* relocation is on at this point */
+
+	b	.start_here_common
+#endif /* CONFIG_PPC_ISERIES */
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+
+_STATIC(__mmu_off)
+	mfmsr	r3
+	andi.	r0,r3,MSR_IR|MSR_DR
+	beqlr
+	andc	r3,r3,r0
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	sync
+	rfid
+	b	.	/* prevent speculative execution */
+
+
+/*
+ * Here is our main kernel entry point. We support currently 2 kind of entries
+ * depending on the value of r5.
+ *
+ *   r5 != NULL -> OF entry, we go to prom_init, "legacy" parameter content
+ *                 in r3...r7
+ *   
+ *   r5 == NULL -> kexec style entry. r3 is a physical pointer to the
+ *                 DT block, r4 is a physical pointer to the kernel itself
+ *
+ */
+_GLOBAL(__start_initialization_multiplatform)
+	/*
+	 * Are we booted from a PROM Of-type client-interface ?
+	 */
+	cmpldi	cr0,r5,0
+	bne	.__boot_from_prom		/* yes -> prom */
+
+	/* Save parameters */
+	mr	r31,r3
+	mr	r30,r4
+
+	/* Make sure we are running in 64 bits mode */
+	bl	.enable_64b_mode
+
+	/* Setup some critical 970 SPRs before switching MMU off */
+	bl	.__970_cpu_preinit
+
+	/* cpu # */
+	li	r24,0
+
+	/* Switch off MMU if not already */
+	LOADADDR(r4, .__after_prom_start - KERNELBASE)
+	add	r4,r4,r30
+	bl	.__mmu_off
+	b	.__after_prom_start
+
+_STATIC(__boot_from_prom)
+	/* Save parameters */
+	mr	r31,r3
+	mr	r30,r4
+	mr	r29,r5
+	mr	r28,r6
+	mr	r27,r7
+
+	/* Make sure we are running in 64 bits mode */
+	bl	.enable_64b_mode
+
+	/* put a relocation offset into r3 */
+	bl	.reloc_offset
+
+	LOADADDR(r2,__toc_start)
+	addi	r2,r2,0x4000
+	addi	r2,r2,0x4000
+
+	/* Relocate the TOC from a virt addr to a real addr */
+	sub	r2,r2,r3
+
+	/* Restore parameters */
+	mr	r3,r31
+	mr	r4,r30
+	mr	r5,r29
+	mr	r6,r28
+	mr	r7,r27
+
+	/* Do all of the interaction with OF client interface */
+	bl	.prom_init
+	/* We never return */
+	trap
+
+/*
+ * At this point, r3 contains the physical address we are running at,
+ * returned by prom_init()
+ */
+_STATIC(__after_prom_start)
+
+/*
+ * We need to run with __start at physical address 0.
+ * This will leave some code in the first 256B of
+ * real memory, which are reserved for software use.
+ * The remainder of the first page is loaded with the fixed
+ * interrupt vectors.  The next two pages are filled with
+ * unknown exception placeholders.
+ *
+ * Note: This process overwrites the OF exception vectors.
+ *	r26 == relocation offset
+ *	r27 == KERNELBASE
+ */
+	bl	.reloc_offset
+	mr	r26,r3
+	SET_REG_TO_CONST(r27,KERNELBASE)
+
+	li	r3,0			/* target addr */
+
+	// XXX FIXME: Use phys returned by OF (r30)
+	sub	r4,r27,r26 		/* source addr			 */
+					/* current address of _start	 */
+					/*   i.e. where we are running	 */
+					/*	the source addr		 */
+
+	LOADADDR(r5,copy_to_here)	/* # bytes of memory to copy	 */
+	sub	r5,r5,r27
+
+	li	r6,0x100		/* Start offset, the first 0x100 */
+					/* bytes were copied earlier.	 */
+
+	bl	.copy_and_flush		/* copy the first n bytes	 */
+					/* this includes the code being	 */
+					/* executed here.		 */
+
+	LOADADDR(r0, 4f)		/* Jump to the copy of this code */
+	mtctr	r0			/* that we just made/relocated	 */
+	bctr
+
+4:	LOADADDR(r5,klimit)
+	sub	r5,r5,r26
+	ld	r5,0(r5)		/* get the value of klimit */
+	sub	r5,r5,r27
+	bl	.copy_and_flush		/* copy the rest */
+	b	.start_here_multiplatform
+
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+/*
+ * Copy routine used to copy the kernel to start at physical address 0
+ * and flush and invalidate the caches as needed.
+ * r3 = dest addr, r4 = source addr, r5 = copy limit, r6 = start offset
+ * on exit, r3, r4, r5 are unchanged, r6 is updated to be >= r5.
+ *
+ * Note: this routine *only* clobbers r0, r6 and lr
+ */
+_GLOBAL(copy_and_flush)
+	addi	r5,r5,-8
+	addi	r6,r6,-8
+4:	li	r0,16			/* Use the least common		*/
+					/* denominator cache line	*/
+					/* size.  This results in	*/
+					/* extra cache line flushes	*/
+					/* but operation is correct.	*/
+					/* Can't get cache line size	*/
+					/* from NACA as it is being	*/
+					/* moved too.			*/
+
+	mtctr	r0			/* put # words/line in ctr	*/
+3:	addi	r6,r6,8			/* copy a cache line		*/
+	ldx	r0,r6,r4
+	stdx	r0,r6,r3
+	bdnz	3b
+	dcbst	r6,r3			/* write it to memory		*/
+	sync
+	icbi	r6,r3			/* flush the icache line	*/
+	cmpld	0,r6,r5
+	blt	4b
+	sync
+	addi	r5,r5,8
+	addi	r6,r6,8
+	blr
+
+.align 8
+copy_to_here:
+
+/*
+ * load_up_fpu(unused, unused, tsk)
+ * Disable FP for the task which had the FPU previously,
+ * and save its floating-point registers in its thread_struct.
+ * Enables the FPU for use in the kernel on return.
+ * On SMP we know the fpu is free, since we give it up every
+ * switch (ie, no lazy save of the FP registers).
+ * On entry: r13 == 'current' && last_task_used_math != 'current'
+ */
+_STATIC(load_up_fpu)
+	mfmsr	r5			/* grab the current MSR */
+	ori	r5,r5,MSR_FP
+	mtmsrd	r5			/* enable use of fpu now */
+	isync
+/*
+ * For SMP, we don't do lazy FPU switching because it just gets too
+ * horrendously complex, especially when a task switches from one CPU
+ * to another.  Instead we call giveup_fpu in switch_to.
+ *
+ */
+#ifndef CONFIG_SMP
+	ld	r3,last_task_used_math@got(r2)
+	ld	r4,0(r3)
+	cmpdi	0,r4,0
+	beq	1f
+	/* Save FP state to last_task_used_math's THREAD struct */
+	addi	r4,r4,THREAD
+	SAVE_32FPRS(0, r4)
+	mffs	fr0
+	stfd	fr0,THREAD_FPSCR(r4)
+	/* Disable FP for last_task_used_math */
+	ld	r5,PT_REGS(r4)
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	li	r6,MSR_FP|MSR_FE0|MSR_FE1
+	andc	r4,r4,r6
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#endif /* CONFIG_SMP */
+	/* enable use of FP after return */
+	ld	r4,PACACURRENT(r13)
+	addi	r5,r4,THREAD		/* Get THREAD */
+	ld	r4,THREAD_FPEXC_MODE(r5)
+	ori	r12,r12,MSR_FP
+	or	r12,r12,r4
+	std	r12,_MSR(r1)
+	lfd	fr0,THREAD_FPSCR(r5)
+	mtfsf	0xff,fr0
+	REST_32FPRS(0, r5)
+#ifndef CONFIG_SMP
+	/* Update last_task_used_math to 'current' */
+	subi	r4,r5,THREAD		/* Back to 'current' */
+	std	r4,0(r3)
+#endif /* CONFIG_SMP */
+	/* restore registers and return */
+	b	fast_exception_return
+
+/*
+ * disable_kernel_fp()
+ * Disable the FPU.
+ */
+_GLOBAL(disable_kernel_fp)
+	mfmsr	r3
+	rldicl	r0,r3,(63-MSR_FP_LG),1
+	rldicl	r3,r0,(MSR_FP_LG+1),0
+	mtmsrd	r3			/* disable use of fpu now */
+	isync
+	blr
+
+/*
+ * giveup_fpu(tsk)
+ * Disable FP for the task given as the argument,
+ * and save the floating-point registers in its thread_struct.
+ * Enables the FPU for use in the kernel on return.
+ */
+_GLOBAL(giveup_fpu)
+	mfmsr	r5
+	ori	r5,r5,MSR_FP
+	mtmsrd	r5			/* enable use of fpu now */
+	isync
+	cmpdi	0,r3,0
+	beqlr-				/* if no previous owner, done */
+	addi	r3,r3,THREAD		/* want THREAD of task */
+	ld	r5,PT_REGS(r3)
+	cmpdi	0,r5,0
+	SAVE_32FPRS(0, r3)
+	mffs	fr0
+	stfd	fr0,THREAD_FPSCR(r3)
+	beq	1f
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	li	r3,MSR_FP|MSR_FE0|MSR_FE1
+	andc	r4,r4,r3		/* disable FP for previous task */
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#ifndef CONFIG_SMP
+	li	r5,0
+	ld	r4,last_task_used_math@got(r2)
+	std	r5,0(r4)
+#endif /* CONFIG_SMP */
+	blr
+
+
+#ifdef CONFIG_ALTIVEC
+		
+/*
+ * load_up_altivec(unused, unused, tsk)
+ * Disable VMX for the task which had it previously,
+ * and save its vector registers in its thread_struct.
+ * Enables the VMX for use in the kernel on return.
+ * On SMP we know the VMX is free, since we give it up every
+ * switch (ie, no lazy save of the vector registers).
+ * On entry: r13 == 'current' && last_task_used_altivec != 'current'
+ */
+_STATIC(load_up_altivec)
+	mfmsr	r5			/* grab the current MSR */
+	oris	r5,r5,MSR_VEC@h
+	mtmsrd	r5			/* enable use of VMX now */
+	isync
+	
+/*
+ * For SMP, we don't do lazy VMX switching because it just gets too
+ * horrendously complex, especially when a task switches from one CPU
+ * to another.  Instead we call giveup_altvec in switch_to.
+ * VRSAVE isn't dealt with here, that is done in the normal context
+ * switch code. Note that we could rely on vrsave value to eventually
+ * avoid saving all of the VREGs here...
+ */
+#ifndef CONFIG_SMP
+	ld	r3,last_task_used_altivec@got(r2)
+	ld	r4,0(r3)
+	cmpdi	0,r4,0
+	beq	1f
+	/* Save VMX state to last_task_used_altivec's THREAD struct */
+	addi	r4,r4,THREAD
+	SAVE_32VRS(0,r5,r4)
+	mfvscr	vr0
+	li	r10,THREAD_VSCR
+	stvx	vr0,r10,r4
+	/* Disable VMX for last_task_used_altivec */
+	ld	r5,PT_REGS(r4)
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	lis	r6,MSR_VEC@h
+	andc	r4,r4,r6
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#endif /* CONFIG_SMP */
+	/* Hack: if we get an altivec unavailable trap with VRSAVE
+	 * set to all zeros, we assume this is a broken application
+	 * that fails to set it properly, and thus we switch it to
+	 * all 1's
+	 */
+	mfspr	r4,SPRN_VRSAVE
+	cmpdi	0,r4,0
+	bne+	1f
+	li	r4,-1
+	mtspr	SPRN_VRSAVE,r4
+1:
+	/* enable use of VMX after return */
+	ld	r4,PACACURRENT(r13)
+	addi	r5,r4,THREAD		/* Get THREAD */
+	oris	r12,r12,MSR_VEC@h
+	std	r12,_MSR(r1)
+	li	r4,1
+	li	r10,THREAD_VSCR
+	stw	r4,THREAD_USED_VR(r5)
+	lvx	vr0,r10,r5
+	mtvscr	vr0
+	REST_32VRS(0,r4,r5)
+#ifndef CONFIG_SMP
+	/* Update last_task_used_math to 'current' */
+	subi	r4,r5,THREAD		/* Back to 'current' */
+	std	r4,0(r3)
+#endif /* CONFIG_SMP */
+	/* restore registers and return */
+	b	fast_exception_return
+
+/*
+ * disable_kernel_altivec()
+ * Disable the VMX.
+ */
+_GLOBAL(disable_kernel_altivec)
+	mfmsr	r3
+	rldicl	r0,r3,(63-MSR_VEC_LG),1
+	rldicl	r3,r0,(MSR_VEC_LG+1),0
+	mtmsrd	r3			/* disable use of VMX now */
+	isync
+	blr
+
+/*
+ * giveup_altivec(tsk)
+ * Disable VMX for the task given as the argument,
+ * and save the vector registers in its thread_struct.
+ * Enables the VMX for use in the kernel on return.
+ */
+_GLOBAL(giveup_altivec)
+	mfmsr	r5
+	oris	r5,r5,MSR_VEC@h
+	mtmsrd	r5			/* enable use of VMX now */
+	isync
+	cmpdi	0,r3,0
+	beqlr-				/* if no previous owner, done */
+	addi	r3,r3,THREAD		/* want THREAD of task */
+	ld	r5,PT_REGS(r3)
+	cmpdi	0,r5,0
+	SAVE_32VRS(0,r4,r3)
+	mfvscr	vr0
+	li	r4,THREAD_VSCR
+	stvx	vr0,r4,r3
+	beq	1f
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	lis	r3,MSR_VEC@h
+	andc	r4,r4,r3		/* disable FP for previous task */
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#ifndef CONFIG_SMP
+	li	r5,0
+	ld	r4,last_task_used_altivec@got(r2)
+	std	r5,0(r4)
+#endif /* CONFIG_SMP */
+	blr
+
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_PPC_PMAC
+/*
+ * On PowerMac, secondary processors starts from the reset vector, which
+ * is temporarily turned into a call to one of the functions below.
+ */
+	.section ".text";
+	.align 2 ;
+
+	.globl	pmac_secondary_start_1	
+pmac_secondary_start_1:	
+	li	r24, 1
+	b	.pmac_secondary_start
+	
+	.globl pmac_secondary_start_2
+pmac_secondary_start_2:	
+	li	r24, 2
+	b	.pmac_secondary_start
+	
+	.globl pmac_secondary_start_3
+pmac_secondary_start_3:
+	li	r24, 3
+	b	.pmac_secondary_start
+	
+_GLOBAL(pmac_secondary_start)
+	/* turn on 64-bit mode */
+	bl	.enable_64b_mode
+	isync
+
+	/* Copy some CPU settings from CPU 0 */
+	bl	.__restore_cpu_setup
+
+	/* pSeries do that early though I don't think we really need it */
+	mfmsr	r3
+	ori	r3,r3,MSR_RI
+	mtmsrd	r3			/* RI on */
+
+	/* Set up a paca value for this processor. */
+	LOADADDR(r4, paca) 		 /* Get base vaddr of paca array	*/
+	mulli	r13,r24,PACA_SIZE	 /* Calculate vaddr of right paca */
+	add	r13,r13,r4		/* for this processor.		*/
+	mtspr	SPRG3,r13		 /* Save vaddr of paca in SPRG3	*/
+
+	/* Create a temp kernel stack for use before relocation is on.	*/
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	b	.__secondary_start
+
+#endif /* CONFIG_PPC_PMAC */
+
+/*
+ * This function is called after the master CPU has released the
+ * secondary processors.  The execution environment is relocation off.
+ * The paca for this processor has the following fields initialized at
+ * this point:
+ *   1. Processor number
+ *   2. Segment table pointer (virtual address)
+ * On entry the following are set:
+ *   r1	= stack pointer.  vaddr for iSeries, raddr (temp stack) for pSeries
+ *   r24   = cpu# (in Linux terms)
+ *   r13   = paca virtual address
+ *   SPRG3 = paca virtual address
+ */
+_GLOBAL(__secondary_start)
+
+	HMT_MEDIUM			/* Set thread priority to MEDIUM */
+
+	ld	r2,PACATOC(r13)
+	li	r6,0
+	stb	r6,PACAPROCENABLED(r13)
+
+#ifndef CONFIG_PPC_ISERIES
+	/* Initialize the page table pointer register. */
+	LOADADDR(r6,_SDR1)
+	ld	r6,0(r6)		/* get the value of _SDR1	 */
+	mtspr	SDR1,r6			/* set the htab location	 */
+#endif
+	/* Initialize the first segment table (or SLB) entry		 */
+	ld	r3,PACASTABVIRT(r13)	/* get addr of segment table	 */
+	bl	.stab_initialize
+
+	/* Initialize the kernel stack.  Just a repeat for iSeries.	 */
+	LOADADDR(r3,current_set)
+	sldi	r28,r24,3		/* get current_set[cpu#]	 */
+	ldx	r1,r3,r28
+	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
+	std	r1,PACAKSAVE(r13)
+
+	ld	r3,PACASTABREAL(r13)	/* get raddr of segment table	 */
+	ori	r4,r3,1			/* turn on valid bit		 */
+
+#ifdef CONFIG_PPC_ISERIES
+	li	r0,-1			/* hypervisor call */
+	li	r3,1
+	sldi	r3,r3,63		/* 0x8000000000000000 */
+	ori	r3,r3,4			/* 0x8000000000000004 */
+	sc				/* HvCall_setASR */
+#else
+	/* set the ASR */
+	ld	r3,systemcfg@got(r2)	/* r3 = ptr to systemcfg	 */
+	lwz	r3,PLATFORM(r3)		/* r3 = platform flags		 */
+	cmpldi 	r3,PLATFORM_PSERIES_LPAR
+	bne	98f
+	mfspr	r3,PVR
+	srwi	r3,r3,16
+	cmpwi	r3,0x37			/* SStar  */
+	beq	97f
+	cmpwi	r3,0x36			/* IStar  */
+	beq	97f
+	cmpwi	r3,0x34			/* Pulsar */
+	bne	98f
+97:	li	r3,H_SET_ASR		/* hcall = H_SET_ASR */
+	HVSC				/* Invoking hcall */
+	b	99f
+98:					/* !(rpa hypervisor) || !(star)  */
+	mtasr	r4			/* set the stab location	 */
+99:
+#endif
+	li	r7,0
+	mtlr	r7
+
+	/* enable MMU and jump to start_secondary */
+	LOADADDR(r3,.start_secondary_prolog)
+	SET_REG_TO_CONST(r4, MSR_KERNEL)
+#ifdef DO_SOFT_DISABLE
+	ori	r4,r4,MSR_EE
+#endif
+	mtspr	SRR0,r3
+	mtspr	SRR1,r4
+	rfid
+	b	.	/* prevent speculative execution */
+
+/* 
+ * Running with relocation on at this point.  All we want to do is
+ * zero the stack back-chain pointer before going into C code.
+ */
+_GLOBAL(start_secondary_prolog)
+	li	r3,0
+	std	r3,0(r1)		/* Zero the stack frame pointer	*/
+	bl	.start_secondary
+#endif
+
+/*
+ * This subroutine clobbers r11 and r12
+ */
+_GLOBAL(enable_64b_mode)
+	mfmsr	r11			/* grab the current MSR */
+	li	r12,1
+	rldicr	r12,r12,MSR_SF_LG,(63-MSR_SF_LG)
+	or	r11,r11,r12
+	li	r12,1
+	rldicr	r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG)
+	or	r11,r11,r12
+	mtmsrd	r11
+	isync
+	blr
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+/*
+ * This is where the main kernel code starts.
+ */
+_STATIC(start_here_multiplatform)
+	/* get a new offset, now that the kernel has moved. */
+	bl	.reloc_offset
+	mr	r26,r3
+
+	/* Clear out the BSS. It may have been done in prom_init,
+	 * already but that's irrelevant since prom_init will soon
+	 * be detached from the kernel completely. Besides, we need
+	 * to clear it now for kexec-style entry.
+	 */
+	LOADADDR(r11,__bss_stop)
+	LOADADDR(r8,__bss_start)
+	sub	r11,r11,r8		/* bss size			*/
+	addi	r11,r11,7		/* round up to an even double word */
+	rldicl. r11,r11,61,3		/* shift right by 3		*/
+	beq	4f
+	addi	r8,r8,-8
+	li	r0,0
+	mtctr	r11			/* zero this many doublewords	*/
+3:	stdu	r0,8(r8)
+	bdnz	3b
+4:
+
+	mfmsr	r6
+	ori	r6,r6,MSR_RI
+	mtmsrd	r6			/* RI on */
+
+#ifdef CONFIG_HMT
+	/* Start up the second thread on cpu 0 */
+	mfspr	r3,PVR
+	srwi	r3,r3,16
+	cmpwi	r3,0x34			/* Pulsar  */
+	beq	90f
+	cmpwi	r3,0x36			/* Icestar */
+	beq	90f
+	cmpwi	r3,0x37			/* SStar   */
+	beq	90f
+	b	91f			/* HMT not supported */
+90:	li	r3,0
+	bl	.hmt_start_secondary
+91:
+#endif
+
+	/* The following gets the stack and TOC set up with the regs */
+	/* pointing to the real addr of the kernel stack.  This is   */
+	/* all done to support the C function call below which sets  */
+	/* up the htab.  This is done because we have relocated the  */
+	/* kernel but are still running in real mode. */
+
+	LOADADDR(r3,init_thread_union)
+	sub	r3,r3,r26
+
+	/* set up a stack pointer (physical address) */
+	addi	r1,r3,THREAD_SIZE
+	li	r0,0
+	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
+
+	/* set up the TOC (physical address) */
+	LOADADDR(r2,__toc_start)
+	addi	r2,r2,0x4000
+	addi	r2,r2,0x4000
+	sub	r2,r2,r26
+
+	LOADADDR(r3,cpu_specs)
+	sub	r3,r3,r26
+	LOADADDR(r4,cur_cpu_spec)
+	sub	r4,r4,r26
+	mr	r5,r26
+	bl	.identify_cpu
+
+	/* Save some low level config HIDs of CPU0 to be copied to
+	 * other CPUs later on, or used for suspend/resume
+	 */
+	bl	.__save_cpu_setup
+	sync
+
+	/* Setup a valid physical PACA pointer in SPRG3 for early_setup
+	 * note that boot_cpuid can always be 0 nowadays since there is
+	 * nowhere it can be initialized differently before we reach this
+	 * code
+	 */
+	LOADADDR(r27, boot_cpuid)
+	sub	r27,r27,r26
+	lwz	r27,0(r27)
+
+	LOADADDR(r24, paca) 		/* Get base vaddr of paca array	 */
+	mulli	r13,r27,PACA_SIZE	/* Calculate vaddr of right paca */
+	add	r13,r13,r24		/* for this processor.		 */
+	sub	r13,r13,r26		/* convert to physical addr	 */
+	mtspr	SPRG3,r13		/* PPPBBB: Temp... -Peter */
+	
+	/* Do very early kernel initializations, including initial hash table,
+	 * stab and slb setup before we turn on relocation.	*/
+
+	/* Restore parameters passed from prom_init/kexec */
+	mr	r3,r31
+ 	bl	.early_setup
+
+	/* set the ASR */
+	ld	r3,PACASTABREAL(r13)
+	ori	r4,r3,1			/* turn on valid bit		 */
+	ld	r3,systemcfg@got(r2)	/* r3 = ptr to systemcfg */
+	lwz	r3,PLATFORM(r3)		/* r3 = platform flags */
+	cmpldi 	r3,PLATFORM_PSERIES_LPAR
+	bne	98f
+	mfspr	r3,PVR
+	srwi	r3,r3,16
+	cmpwi	r3,0x37			/* SStar */
+	beq	97f
+	cmpwi	r3,0x36			/* IStar  */
+	beq	97f
+	cmpwi	r3,0x34			/* Pulsar */
+	bne	98f
+97:	li	r3,H_SET_ASR		/* hcall = H_SET_ASR */
+	HVSC				/* Invoking hcall */
+	b	99f
+98:					/* !(rpa hypervisor) || !(star) */
+	mtasr	r4			/* set the stab location	*/
+99:
+	/* Set SDR1 (hash table pointer) */
+	ld	r3,systemcfg@got(r2)	/* r3 = ptr to systemcfg */
+	lwz	r3,PLATFORM(r3)		/* r3 = platform flags */
+	/* Test if bit 0 is set (LPAR bit) */
+	andi.	r3,r3,0x1
+	bne	98f
+	LOADADDR(r6,_SDR1)		/* Only if NOT LPAR */
+	sub	r6,r6,r26
+	ld	r6,0(r6)		/* get the value of _SDR1 */
+	mtspr	SDR1,r6			/* set the htab location  */
+98: 
+	LOADADDR(r3,.start_here_common)
+	SET_REG_TO_CONST(r4, MSR_KERNEL)
+	mtspr	SRR0,r3
+	mtspr	SRR1,r4
+	rfid
+	b	.	/* prevent speculative execution */
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+	
+	/* This is where all platforms converge execution */
+_STATIC(start_here_common)
+	/* relocation is on at this point */
+
+	/* The following code sets up the SP and TOC now that we are */
+	/* running with translation enabled. */
+
+	LOADADDR(r3,init_thread_union)
+
+	/* set up the stack */
+	addi	r1,r3,THREAD_SIZE
+	li	r0,0
+	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
+
+	/* Apply the CPUs-specific fixups (nop out sections not relevant
+	 * to this CPU
+	 */
+	li	r3,0
+	bl	.do_cpu_ftr_fixups
+
+	LOADADDR(r26, boot_cpuid)
+	lwz	r26,0(r26)
+
+	LOADADDR(r24, paca) 		/* Get base vaddr of paca array  */
+	mulli	r13,r26,PACA_SIZE	/* Calculate vaddr of right paca */
+	add	r13,r13,r24		/* for this processor.		 */
+	mtspr	SPRG3,r13
+
+	/* ptr to current */
+	LOADADDR(r4,init_task)
+	std	r4,PACACURRENT(r13)
+
+	/* Load the TOC */
+	ld	r2,PACATOC(r13)
+	std	r1,PACAKSAVE(r13)
+
+	bl	.setup_system
+
+	/* Load up the kernel context */
+5:
+#ifdef DO_SOFT_DISABLE
+	li	r5,0
+	stb	r5,PACAPROCENABLED(r13)	/* Soft Disabled */
+	mfmsr	r5
+	ori	r5,r5,MSR_EE		/* Hard Enabled */
+	mtmsrd	r5
+#endif
+
+	bl .start_kernel
+
+_GLOBAL(__setup_cpu_power3)
+	blr
+
+_GLOBAL(hmt_init)
+#ifdef CONFIG_HMT
+	LOADADDR(r5, hmt_thread_data)
+	mfspr	r7,PVR
+	srwi	r7,r7,16
+	cmpwi	r7,0x34			/* Pulsar  */
+	beq	90f
+	cmpwi	r7,0x36			/* Icestar */
+	beq	91f
+	cmpwi	r7,0x37			/* SStar   */
+	beq	91f
+	b	101f
+90:	mfspr	r6,PIR
+	andi.	r6,r6,0x1f
+	b	92f
+91:	mfspr	r6,PIR
+	andi.	r6,r6,0x3ff
+92:	sldi	r4,r24,3
+	stwx	r6,r5,r4
+	bl	.hmt_start_secondary
+	b	101f
+
+__hmt_secondary_hold:
+	LOADADDR(r5, hmt_thread_data)
+	clrldi	r5,r5,4
+	li	r7,0
+	mfspr	r6,PIR
+	mfspr	r8,PVR
+	srwi	r8,r8,16
+	cmpwi	r8,0x34
+	bne	93f
+	andi.	r6,r6,0x1f
+	b	103f
+93:	andi.	r6,r6,0x3f
+
+103:	lwzx	r8,r5,r7
+	cmpw	r8,r6
+	beq	104f
+	addi	r7,r7,8
+	b	103b
+
+104:	addi	r7,r7,4
+	lwzx	r9,r5,r7
+	mr	r24,r9
+101:
+#endif
+	mr	r3,r24
+	b	.pSeries_secondary_smp_init
+
+#ifdef CONFIG_HMT
+_GLOBAL(hmt_start_secondary)
+	LOADADDR(r4,__hmt_secondary_hold)
+	clrldi	r4,r4,4
+	mtspr	NIADORM, r4
+	mfspr	r4, MSRDORM
+	li	r5, -65
+	and	r4, r4, r5
+	mtspr	MSRDORM, r4
+	lis	r4,0xffef
+	ori	r4,r4,0x7403
+	mtspr	TSC, r4
+	li	r4,0x1f4
+	mtspr	TST, r4
+	mfspr	r4, HID0
+	ori	r4, r4, 0x1
+	mtspr	HID0, r4
+	mfspr	r4, CTRLF
+	oris	r4, r4, 0x40
+	mtspr	CTRLT, r4
+	blr
+#endif
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_PPC_ISERIES)
+_GLOBAL(smp_release_cpus)
+	/* All secondary cpus are spinning on a common
+	 * spinloop, release them all now so they can start
+	 * to spin on their individual paca spinloops.
+	 * For non SMP kernels, the secondary cpus never
+	 * get out of the common spinloop.
+	 */
+	li	r3,1
+	LOADADDR(r5,__secondary_hold_spinloop)
+	std	r3,0(r5)
+	sync
+	blr
+#endif /* CONFIG_SMP && !CONFIG_PPC_ISERIES */
+
+
+/*
+ * We put a few things here that have to be page-aligned.
+ * This stuff goes at the beginning of the data segment,
+ * which is page-aligned.
+ */
+	.data
+	.align	12
+	.globl	sdata
+sdata:
+	.globl	empty_zero_page
+empty_zero_page:
+	.space	4096
+
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	4096
+
+	.globl	ioremap_dir
+ioremap_dir:
+	.space	4096
+
+#ifdef CONFIG_SMP
+/* 1 page segment table per cpu (max 48, cpu0 allocated at STAB0_PHYS_ADDR) */
+	.globl	stab_array
+stab_array:
+	.space	4096 * 48
+#endif
+	
+/*
+ * This space gets a copy of optional info passed to us by the bootstrap
+ * Used to pass parameters into the kernel like root=/dev/sda1, etc.
+ */
+	.globl	cmd_line
+cmd_line:
+	.space	COMMAND_LINE_SIZE
diff --git a/arch/ppc64/kernel/hvCall.S b/arch/ppc64/kernel/hvCall.S
new file mode 100644
index 00000000000..4c699eab1b9
--- /dev/null
+++ b/arch/ppc64/kernel/hvCall.S
@@ -0,0 +1,98 @@
+/*
+ * arch/ppc64/kernel/hvCall.S
+ *
+ *
+ * This file contains the code to perform calls to the
+ * iSeries LPAR hypervisor
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/processor.h>
+
+	.text
+
+/* 
+ * Hypervisor call
+ * 
+ * Invoke the iSeries hypervisor via the System Call instruction
+ * Parameters are passed to this routine in registers r3 - r10
+ * 
+ * r3 contains the HV function to be called
+ * r4-r10 contain the operands to the hypervisor function
+ *
+ */
+
+_GLOBAL(HvCall)
+_GLOBAL(HvCall0)
+_GLOBAL(HvCall1)
+_GLOBAL(HvCall2)
+_GLOBAL(HvCall3)
+_GLOBAL(HvCall4)
+_GLOBAL(HvCall5)
+_GLOBAL(HvCall6)
+_GLOBAL(HvCall7)
+
+
+	mfcr	r0
+	std	r0,-8(r1)
+	stdu	r1,-(STACK_FRAME_OVERHEAD+16)(r1)
+	
+	/* r0 = 0xffffffffffffffff indicates a hypervisor call */
+	
+	li	r0,-1
+	
+	/* Invoke the hypervisor */
+
+	sc
+
+	ld	r1,0(r1)
+	ld	r0,-8(r1)
+	mtcrf	0xff,r0
+
+	/*  return to caller, return value in r3 */
+	
+	blr
+
+_GLOBAL(HvCall0Ret16)
+_GLOBAL(HvCall1Ret16)
+_GLOBAL(HvCall2Ret16)
+_GLOBAL(HvCall3Ret16)
+_GLOBAL(HvCall4Ret16)
+_GLOBAL(HvCall5Ret16)
+_GLOBAL(HvCall6Ret16)
+_GLOBAL(HvCall7Ret16)
+
+	mfcr	r0
+	std	r0,-8(r1)
+	std	r31,-16(r1)
+	stdu	r1,-(STACK_FRAME_OVERHEAD+32)(r1)
+
+	mr	r31,r4
+	li	r0,-1
+	mr	r4,r5
+	mr	r5,r6
+	mr	r6,r7
+	mr	r7,r8
+	mr	r8,r9
+	mr	r9,r10
+
+	sc
+
+	std	r3,0(r31)
+	std	r4,8(r31)
+
+	mr	r3,r5
+
+	ld	r1,0(r1)
+	ld	r0,-8(r1)
+	mtcrf	0xff,r0
+	ld	r31,-16(r1)
+	
+	blr
+
+
diff --git a/arch/ppc64/kernel/hvconsole.c b/arch/ppc64/kernel/hvconsole.c
new file mode 100644
index 00000000000..c72fb8ffe97
--- /dev/null
+++ b/arch/ppc64/kernel/hvconsole.c
@@ -0,0 +1,121 @@
+/*
+ * hvconsole.c
+ * Copyright (C) 2004 Hollis Blanchard, IBM Corporation
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Additional Author(s):
+ *  Ryan S. Arnold <rsa@us.ibm.com>
+ *
+ * LPAR console support.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/hvcall.h>
+#include <asm/hvconsole.h>
+#include <asm/prom.h>
+
+/**
+ * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper
+ * @vtermno: The vtermno or unit_address of the adapter from which to fetch the
+ *	data.
+ * @buf: The character buffer into which to put the character data fetched from
+ *	firmware.
+ * @count: not used?
+ */
+int hvc_get_chars(uint32_t vtermno, char *buf, int count)
+{
+	unsigned long got;
+
+	if (plpar_hcall(H_GET_TERM_CHAR, vtermno, 0, 0, 0, &got,
+		(unsigned long *)buf, (unsigned long *)buf+1) == H_Success) {
+		/*
+		 * Work around a HV bug where it gives us a null
+		 * after every \r.  -- paulus
+		 */
+		if (got > 0) {
+			int i;
+			for (i = 1; i < got; ++i) {
+				if (buf[i] == 0 && buf[i-1] == '\r') {
+					--got;
+					if (i < got)
+						memmove(&buf[i], &buf[i+1],
+							got - i);
+				}
+			}
+		}
+		return got;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(hvc_get_chars);
+
+/**
+ * hvc_put_chars: send characters to firmware for denoted vterm adapter
+ * @vtermno: The vtermno or unit_address of the adapter from which the data
+ *	originated.
+ * @buf: The character buffer that contains the character data to send to
+ *	firmware.
+ * @count: Send this number of characters.
+ */
+int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
+{
+	unsigned long *lbuf = (unsigned long *) buf;
+	long ret;
+
+	ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0],
+				 lbuf[1]);
+	if (ret == H_Success)
+		return count;
+	if (ret == H_Busy)
+		return 0;
+	return -EIO;
+}
+
+EXPORT_SYMBOL(hvc_put_chars);
+
+/*
+ * We hope/assume that the first vty found corresponds to the first console
+ * device.
+ */
+int hvc_find_vtys(void)
+{
+	struct device_node *vty;
+	int num_found = 0;
+
+	for (vty = of_find_node_by_name(NULL, "vty"); vty != NULL;
+			vty = of_find_node_by_name(vty, "vty")) {
+		uint32_t *vtermno;
+
+		/* We have statically defined space for only a certain number of
+		 * console adapters. */
+		if (num_found >= MAX_NR_HVC_CONSOLES)
+			break;
+
+		vtermno = (uint32_t *)get_property(vty, "reg", NULL);
+		if (!vtermno)
+			continue;
+
+		if (device_is_compatible(vty, "hvterm1")) {
+			hvc_instantiate(*vtermno, num_found);
+			++num_found;
+		}
+	}
+
+	return num_found;
+}
diff --git a/arch/ppc64/kernel/hvcserver.c b/arch/ppc64/kernel/hvcserver.c
new file mode 100644
index 00000000000..bde8f42da85
--- /dev/null
+++ b/arch/ppc64/kernel/hvcserver.c
@@ -0,0 +1,249 @@
+/*
+ * hvcserver.c
+ * Copyright (C) 2004 Ryan S Arnold, IBM Corporation
+ *
+ * PPC64 virtual I/O console server support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <asm/hvcall.h>
+#include <asm/hvcserver.h>
+#include <asm/io.h>
+
+#define HVCS_ARCH_VERSION "1.0.0"
+
+MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
+MODULE_DESCRIPTION("IBM hvcs ppc64 API");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HVCS_ARCH_VERSION);
+
+/*
+ * Convert arch specific return codes into relevant errnos.  The hvcs
+ * functions aren't performance sensitive, so this conversion isn't an
+ * issue.
+ */
+int hvcs_convert(long to_convert)
+{
+	switch (to_convert) {
+		case H_Success:
+			return 0;
+		case H_Parameter:
+			return -EINVAL;
+		case H_Hardware:
+			return -EIO;
+		case H_Busy:
+		case H_LongBusyOrder1msec:
+		case H_LongBusyOrder10msec:
+		case H_LongBusyOrder100msec:
+		case H_LongBusyOrder1sec:
+		case H_LongBusyOrder10sec:
+		case H_LongBusyOrder100sec:
+			return -EBUSY;
+		case H_Function: /* fall through */
+		default:
+			return -EPERM;
+	}
+}
+
+/**
+ * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info
+ * @head: list_head pointer for an allocated list of partner info structs to
+ *	free.
+ *
+ * This function is used to free the partner info list that was returned by
+ * calling hvcs_get_partner_info().
+ */
+int hvcs_free_partner_info(struct list_head *head)
+{
+	struct hvcs_partner_info *pi;
+	struct list_head *element;
+
+	if (!head)
+		return -EINVAL;
+
+	while (!list_empty(head)) {
+		element = head->next;
+		pi = list_entry(element, struct hvcs_partner_info, node);
+		list_del(element);
+		kfree(pi);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(hvcs_free_partner_info);
+
+/* Helper function for hvcs_get_partner_info */
+int hvcs_next_partner(uint32_t unit_address,
+		unsigned long last_p_partition_ID,
+		unsigned long last_p_unit_address, unsigned long *pi_buff)
+
+{
+	long retval;
+	retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address,
+			last_p_partition_ID,
+				last_p_unit_address, virt_to_phys(pi_buff));
+	return hvcs_convert(retval);
+}
+
+/**
+ * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
+ * @unit_address: The unit_address of the vty-server adapter for which this
+ *	function is fetching partner info.
+ * @head: An initialized list_head pointer to an empty list to use to return the
+ *	list of partner info fetched from the hypervisor to the caller.
+ * @pi_buff: A page sized buffer pre-allocated prior to calling this function
+ *	that is to be used to be used by firmware as an iterator to keep track
+ *	of the partner info retrieval.
+ *
+ * This function returns non-zero on success, or if there is no partner info.
+ *
+ * The pi_buff is pre-allocated prior to calling this function because this
+ * function may be called with a spin_lock held and kmalloc of a page is not
+ * recommended as GFP_ATOMIC.
+ *
+ * The first long of this buffer is used to store a partner unit address.  The
+ * second long is used to store a partner partition ID and starting at
+ * pi_buff[2] is the 79 character Converged Location Code (diff size than the
+ * unsigned longs, hence the casting mumbo jumbo you see later).
+ *
+ * Invocation of this function should always be followed by an invocation of
+ * hvcs_free_partner_info() using a pointer to the SAME list head instance
+ * that was passed as a parameter to this function.
+ */
+int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
+		unsigned long *pi_buff)
+{
+	/*
+	 * Dealt with as longs because of the hcall interface even though the
+	 * values are uint32_t.
+	 */
+	unsigned long	last_p_partition_ID;
+	unsigned long	last_p_unit_address;
+	struct hvcs_partner_info *next_partner_info = NULL;
+	int more = 1;
+	int retval;
+
+	memset(pi_buff, 0x00, PAGE_SIZE);
+	/* invalid parameters */
+	if (!head || !pi_buff)
+		return -EINVAL;
+
+	last_p_partition_ID = last_p_unit_address = ~0UL;
+	INIT_LIST_HEAD(head);
+
+	do {
+		retval = hvcs_next_partner(unit_address, last_p_partition_ID,
+				last_p_unit_address, pi_buff);
+		if (retval) {
+			/*
+			 * Don't indicate that we've failed if we have
+			 * any list elements.
+			 */
+			if (!list_empty(head))
+				return 0;
+			return retval;
+		}
+
+		last_p_partition_ID = pi_buff[0];
+		last_p_unit_address = pi_buff[1];
+
+		/* This indicates that there are no further partners */
+		if (last_p_partition_ID == ~0UL
+				&& last_p_unit_address == ~0UL)
+			break;
+
+		/* This is a very small struct and will be freed soon in
+		 * hvcs_free_partner_info(). */
+		next_partner_info = kmalloc(sizeof(struct hvcs_partner_info),
+				GFP_ATOMIC);
+
+		if (!next_partner_info) {
+			printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to"
+				" allocate partner info struct.\n");
+			hvcs_free_partner_info(head);
+			return -ENOMEM;
+		}
+
+		next_partner_info->unit_address
+			= (unsigned int)last_p_unit_address;
+		next_partner_info->partition_ID
+			= (unsigned int)last_p_partition_ID;
+
+		/* copy the Null-term char too */
+		strncpy(&next_partner_info->location_code[0],
+			(char *)&pi_buff[2],
+			strlen((char *)&pi_buff[2]) + 1);
+
+		list_add_tail(&(next_partner_info->node), head);
+		next_partner_info = NULL;
+
+	} while (more);
+
+	return 0;
+}
+EXPORT_SYMBOL(hvcs_get_partner_info);
+
+/**
+ * hvcs_register_connection - establish a connection between this vty-server and
+ *	a vty.
+ * @unit_address: The unit address of the vty-server adapter that is to be
+ *	establish a connection.
+ * @p_partition_ID: The partition ID of the vty adapter that is to be connected.
+ * @p_unit_address: The unit address of the vty adapter to which the vty-server
+ *	is to be connected.
+ *
+ * If this function is called once and -EINVAL is returned it may
+ * indicate that the partner info needs to be refreshed for the
+ * target unit address at which point the caller must invoke
+ * hvcs_get_partner_info() and then call this function again.  If,
+ * for a second time, -EINVAL is returned then it indicates that
+ * there is probably already a partner connection registered to a
+ * different vty-server adapter.  It is also possible that a second
+ * -EINVAL may indicate that one of the parms is not valid, for
+ * instance if the link was removed between the vty-server adapter
+ * and the vty adapter that you are trying to open.  Don't shoot the
+ * messenger.  Firmware implemented it this way.
+ */
+int hvcs_register_connection( uint32_t unit_address,
+		uint32_t p_partition_ID, uint32_t p_unit_address)
+{
+	long retval;
+	retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address,
+				p_partition_ID, p_unit_address);
+	return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_register_connection);
+
+/**
+ * hvcs_free_connection - free the connection between a vty-server and vty
+ * @unit_address: The unit address of the vty-server that is to have its
+ *	connection severed.
+ *
+ * This function is used to free the partner connection between a vty-server
+ * adapter and a vty adapter.
+ *
+ * If -EBUSY is returned continue to call this function until 0 is returned.
+ */
+int hvcs_free_connection(uint32_t unit_address)
+{
+	long retval;
+	retval = plpar_hcall_norets(H_FREE_VTERM, unit_address);
+	return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_free_connection);
diff --git a/arch/ppc64/kernel/i8259.c b/arch/ppc64/kernel/i8259.c
new file mode 100644
index 00000000000..74dcfd68fc7
--- /dev/null
+++ b/arch/ppc64/kernel/i8259.c
@@ -0,0 +1,177 @@
+/*
+ * c 2001 PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/signal.h>
+#include <linux/cache.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <asm/io.h>
+#include <asm/ppcdebug.h>
+#include "i8259.h"
+
+unsigned char cached_8259[2] = { 0xff, 0xff };
+#define cached_A1 (cached_8259[0])
+#define cached_21 (cached_8259[1])
+
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(i8259_lock);
+
+static int i8259_pic_irq_offset;
+static int i8259_present;
+
+int i8259_irq(int cpu)
+{
+	int irq;
+	
+	spin_lock/*_irqsave*/(&i8259_lock/*, flags*/);
+        /*
+         * Perform an interrupt acknowledge cycle on controller 1
+         */                                                             
+        outb(0x0C, 0x20);
+        irq = inb(0x20) & 7;                                   
+        if (irq == 2)                                                     
+        {                                                                   
+                /*                                     
+                 * Interrupt is cascaded so perform interrupt
+                 * acknowledge on controller 2
+                 */
+                outb(0x0C, 0xA0);                      
+                irq = (inb(0xA0) & 7) + 8;
+        }
+        else if (irq==7)                                
+        {
+                /*                               
+                 * This may be a spurious interrupt
+                 *                         
+                 * Read the interrupt status register. If the most
+                 * significant bit is not set then there is no valid
+		 * interrupt
+		 */
+		outb(0x0b, 0x20);
+		if(~inb(0x20)&0x80) {
+			spin_unlock/*_irqrestore*/(&i8259_lock/*, flags*/);
+			return -1;
+		}
+	}
+	spin_unlock/*_irqrestore*/(&i8259_lock/*, flags*/);
+	return irq;
+}
+
+static void i8259_mask_and_ack_irq(unsigned int irq_nr)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&i8259_lock, flags);
+        if ( irq_nr >= i8259_pic_irq_offset )
+                irq_nr -= i8259_pic_irq_offset;
+
+        if (irq_nr > 7) {                                                   
+                cached_A1 |= 1 << (irq_nr-8);                                   
+                inb(0xA1);      /* DUMMY */                                     
+                outb(cached_A1,0xA1);                                           
+                outb(0x20,0xA0);        /* Non-specific EOI */             
+                outb(0x20,0x20);        /* Non-specific EOI to cascade */
+        } else {                                                            
+                cached_21 |= 1 << irq_nr;                                   
+                inb(0x21);      /* DUMMY */                                 
+                outb(cached_21,0x21);
+                outb(0x20,0x20);        /* Non-specific EOI */                 
+        }                                                                
+	spin_unlock_irqrestore(&i8259_lock, flags);
+}
+
+static void i8259_set_irq_mask(int irq_nr)
+{
+        outb(cached_A1,0xA1);
+        outb(cached_21,0x21);
+}
+
+static void i8259_mask_irq(unsigned int irq_nr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259_lock, flags);
+        if ( irq_nr >= i8259_pic_irq_offset )
+                irq_nr -= i8259_pic_irq_offset;
+        if ( irq_nr < 8 )
+                cached_21 |= 1 << irq_nr;
+        else
+                cached_A1 |= 1 << (irq_nr-8);
+        i8259_set_irq_mask(irq_nr);
+	spin_unlock_irqrestore(&i8259_lock, flags);
+}
+
+static void i8259_unmask_irq(unsigned int irq_nr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259_lock, flags);
+        if ( irq_nr >= i8259_pic_irq_offset )
+                irq_nr -= i8259_pic_irq_offset;
+        if ( irq_nr < 8 )
+                cached_21 &= ~(1 << irq_nr);
+        else
+                cached_A1 &= ~(1 << (irq_nr-8));
+        i8259_set_irq_mask(irq_nr);
+	spin_unlock_irqrestore(&i8259_lock, flags);
+}
+
+static void i8259_end_irq(unsigned int irq)
+{
+	if (!(get_irq_desc(irq)->status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
+	    get_irq_desc(irq)->action)
+		i8259_unmask_irq(irq);
+}
+
+struct hw_interrupt_type i8259_pic = {
+	.typename = " i8259    ",
+	.enable = i8259_unmask_irq,
+	.disable = i8259_mask_irq,
+	.ack = i8259_mask_and_ack_irq,
+	.end = i8259_end_irq,
+};
+
+void __init i8259_init(int offset)
+{
+	unsigned long flags;
+	
+	spin_lock_irqsave(&i8259_lock, flags);
+	i8259_pic_irq_offset = offset;
+	i8259_present = 1;
+        /* init master interrupt controller */
+        outb(0x11, 0x20); /* Start init sequence */
+        outb(0x00, 0x21); /* Vector base */
+        outb(0x04, 0x21); /* edge tiggered, Cascade (slave) on IRQ2 */
+        outb(0x01, 0x21); /* Select 8086 mode */
+        outb(0xFF, 0x21); /* Mask all */
+        /* init slave interrupt controller */
+        outb(0x11, 0xA0); /* Start init sequence */
+        outb(0x08, 0xA1); /* Vector base */
+        outb(0x02, 0xA1); /* edge triggered, Cascade (slave) on IRQ2 */
+        outb(0x01, 0xA1); /* Select 8086 mode */
+        outb(0xFF, 0xA1); /* Mask all */
+        outb(cached_A1, 0xA1);
+        outb(cached_21, 0x21);
+	spin_unlock_irqrestore(&i8259_lock, flags);
+        
+}
+
+static int i8259_request_cascade(void)
+{
+	if (!i8259_present)
+		return -ENODEV;
+
+        request_irq( i8259_pic_irq_offset + 2, no_action, SA_INTERRUPT,
+                     "82c59 secondary cascade", NULL );
+
+	return 0;
+}
+
+arch_initcall(i8259_request_cascade);
diff --git a/arch/ppc64/kernel/i8259.h b/arch/ppc64/kernel/i8259.h
new file mode 100644
index 00000000000..f74764ba0bf
--- /dev/null
+++ b/arch/ppc64/kernel/i8259.h
@@ -0,0 +1,17 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _PPC_KERNEL_i8259_H
+#define _PPC_KERNEL_i8259_H
+
+extern struct hw_interrupt_type i8259_pic;
+
+extern void i8259_init(int offset);
+extern int i8259_irq(int);
+
+#endif /* _PPC_KERNEL_i8259_H */
diff --git a/arch/ppc64/kernel/iSeries_VpdInfo.c b/arch/ppc64/kernel/iSeries_VpdInfo.c
new file mode 100644
index 00000000000..a6f0ff2d023
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_VpdInfo.c
@@ -0,0 +1,277 @@
+/************************************************************************/
+/* File iSeries_vpdInfo.c created by Allan Trautman on Fri Feb  2 2001. */
+/************************************************************************/
+/* This code gets the card location of the hardware                     */
+/* Copyright (C) 20yy  <Allan H Trautman> <IBM Corp>                    */
+/*                                                                      */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or    */
+/* (at your option) any later version.                                  */
+/*                                                                      */
+/* This program is distributed in the hope that it will be useful,      */ 
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of       */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        */
+/* GNU General Public License for more details.                         */
+/*                                                                      */
+/* You should have received a copy of the GNU General Public License    */ 
+/* along with this program; if not, write to the:                       */
+/* Free Software Foundation, Inc.,                                      */ 
+/* 59 Temple Place, Suite 330,                                          */ 
+/* Boston, MA  02111-1307  USA                                          */
+/************************************************************************/
+/* Change Activity:                                                     */
+/*   Created, Feb 2, 2001                                               */
+/*   Ported to ppc64, August 20, 2001                                   */
+/* End Change Activity                                                  */
+/************************************************************************/
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <asm/types.h>
+#include <asm/resource.h>
+
+#include <asm/iSeries/HvCallPci.h>
+#include <asm/iSeries/HvTypes.h>
+#include <asm/iSeries/mf.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/iSeries/iSeries_pci.h>
+#include "pci.h"
+
+/*
+ * Size of Bus VPD data
+ */
+#define BUS_VPDSIZE      1024
+/*
+ * Bus Vpd Tags
+ */
+#define  VpdEndOfDataTag   0x78
+#define  VpdEndOfAreaTag   0x79
+#define  VpdIdStringTag    0x82
+#define  VpdVendorAreaTag  0x84
+/*
+ * Mfg Area Tags
+ */
+#define  VpdFruFlag       0x4647     // "FG"
+#define  VpdFruFrameId    0x4649     // "FI"
+#define  VpdSlotMapFormat 0x4D46     // "MF"
+#define  VpdAsmPartNumber 0x504E     // "PN"
+#define  VpdFruSerial     0x534E     // "SN"
+#define  VpdSlotMap       0x534D     // "SM"
+
+/*
+ * Structures of the areas
+ */
+struct MfgVpdAreaStruct {
+	u16 Tag;
+	u8  TagLength;
+	u8  AreaData1;
+	u8  AreaData2;
+};
+typedef struct MfgVpdAreaStruct MfgArea;
+#define MFG_ENTRY_SIZE   3
+
+struct SlotMapStruct {
+	u8   AgentId;
+	u8   SecondaryAgentId;
+	u8   PhbId;
+	char CardLocation[3];
+	char Parms[8];
+	char Reserved[2];
+}; 
+typedef struct SlotMapStruct SlotMap;
+#define SLOT_ENTRY_SIZE   16
+
+/*
+ * Formats the device information.
+ * - Pass in pci_dev* pointer to the device.
+ * - Pass in buffer to place the data.  Danger here is the buffer must
+ *   be as big as the client says it is.   Should be at least 128 bytes.
+ * Return will the length of the string data put in the buffer.
+ * Format:
+ * PCI: Bus  0, Device 26, Vendor 0x12AE  Frame  1, Card  C10  Ethernet
+ * controller
+ */
+int iSeries_Device_Information(struct pci_dev *PciDev, char *buffer,
+		int BufferSize)
+{
+	struct iSeries_Device_Node *DevNode =
+		(struct iSeries_Device_Node *)PciDev->sysdata;
+	int len;
+
+	if (DevNode == NULL)
+		return sprintf(buffer,
+				"PCI: iSeries_Device_Information DevNode is NULL");
+
+	if (BufferSize < 128)
+		return 0;
+
+	len = sprintf(buffer, "PCI: Bus%3d, Device%3d, Vendor %04X ",
+			ISERIES_BUS(DevNode), PCI_SLOT(PciDev->devfn),
+			PciDev->vendor);
+	len += sprintf(buffer + len, "Frame%3d, Card %4s  ",
+			DevNode->FrameId, DevNode->CardLocation);
+#ifdef CONFIG_PCI
+	if (pci_class_name(PciDev->class >> 8) == 0)
+		len += sprintf(buffer + len, "0x%04X  ",
+				(int)(PciDev->class >> 8));
+	else
+		len += sprintf(buffer + len, "%s",
+				pci_class_name(PciDev->class >> 8));
+#endif
+	return len;
+}
+
+/*
+ * Parse the Slot Area
+ */
+void iSeries_Parse_SlotArea(SlotMap *MapPtr, int MapLen,
+		struct iSeries_Device_Node *DevNode)
+{
+	int SlotMapLen = MapLen;
+	SlotMap *SlotMapPtr = MapPtr;
+
+	/*
+	 * Parse Slot label until we find the one requrested
+	 */
+	while (SlotMapLen > 0) {
+		if (SlotMapPtr->AgentId == DevNode->AgentId ) {
+			/*
+			 * If Phb wasn't found, grab the entry first one found.
+			 */
+			if (DevNode->PhbId == 0xff)
+				DevNode->PhbId = SlotMapPtr->PhbId; 
+			/* Found it, extract the data. */
+			if (SlotMapPtr->PhbId == DevNode->PhbId ) {
+	        		memcpy(&DevNode->CardLocation,
+						&SlotMapPtr->CardLocation, 3);
+				DevNode->CardLocation[3]  = 0;
+				break;
+			}
+		}
+		/* Point to the next Slot */
+		SlotMapPtr = (SlotMap *)((char *)SlotMapPtr + SLOT_ENTRY_SIZE);
+		SlotMapLen -= SLOT_ENTRY_SIZE;
+	}
+}
+
+/*
+ * Parse the Mfg Area
+ */
+static void iSeries_Parse_MfgArea(u8 *AreaData, int AreaLen,
+		struct iSeries_Device_Node *DevNode)
+{
+	MfgArea *MfgAreaPtr = (MfgArea *)AreaData;
+	int MfgAreaLen = AreaLen;
+	u16 SlotMapFmt = 0;
+
+	/* Parse Mfg Data */
+	while (MfgAreaLen > 0) {
+		int MfgTagLen = MfgAreaPtr->TagLength;
+		/* Frame ID         (FI 4649020310 ) */
+		if (MfgAreaPtr->Tag == VpdFruFrameId)		/* FI  */
+			DevNode->FrameId = MfgAreaPtr->AreaData1;
+		/* Slot Map Format  (MF 4D46020004 ) */
+		else if (MfgAreaPtr->Tag == VpdSlotMapFormat)	/* MF  */
+			SlotMapFmt = (MfgAreaPtr->AreaData1 * 256)
+				+ MfgAreaPtr->AreaData2;
+		/* Slot Map         (SM 534D90 */
+		else if (MfgAreaPtr->Tag == VpdSlotMap)	{	/* SM  */
+			SlotMap *SlotMapPtr;
+
+			if (SlotMapFmt == 0x1004)
+				SlotMapPtr = (SlotMap *)((char *)MfgAreaPtr
+						+ MFG_ENTRY_SIZE + 1);
+ 	    		else
+				SlotMapPtr = (SlotMap *)((char *)MfgAreaPtr
+						+ MFG_ENTRY_SIZE);
+	    		iSeries_Parse_SlotArea(SlotMapPtr, MfgTagLen, DevNode);
+		}
+		/*
+		 * Point to the next Mfg Area
+		 * Use defined size, sizeof give wrong answer
+		 */
+		MfgAreaPtr = (MfgArea *)((char *)MfgAreaPtr + MfgTagLen
+				+ MFG_ENTRY_SIZE);
+		MfgAreaLen -= (MfgTagLen + MFG_ENTRY_SIZE); 
+	}	
+}
+
+/*
+ * Look for "BUS".. Data is not Null terminated.
+ * PHBID of 0xFF indicates PHB was not found in VPD Data.
+ */
+static int iSeries_Parse_PhbId(u8 *AreaPtr, int AreaLength)
+{
+	u8 *PhbPtr = AreaPtr;
+	int DataLen = AreaLength;
+	char PhbId = 0xFF;                   
+
+	while (DataLen > 0) {
+		if ((*PhbPtr == 'B') && (*(PhbPtr + 1) == 'U')
+				&& (*(PhbPtr + 2) == 'S')) {
+			PhbPtr += 3;
+			while (*PhbPtr == ' ')
+				++PhbPtr;
+			PhbId = (*PhbPtr & 0x0F);
+			break;
+        	}
+		++PhbPtr;
+		--DataLen;
+	}
+	return PhbId;
+}
+
+/*
+ * Parse out the VPD Areas
+ */
+static void iSeries_Parse_Vpd(u8 *VpdData, int VpdDataLen,
+		struct iSeries_Device_Node *DevNode)
+{
+	u8 *TagPtr = VpdData;
+	int DataLen = VpdDataLen - 3;
+
+	while ((*TagPtr != VpdEndOfAreaTag) && (DataLen > 0)) {
+		int AreaLen = *(TagPtr + 1) + (*(TagPtr + 2) * 256);	
+		u8 *AreaData  = TagPtr + 3;
+
+		if (*TagPtr == VpdIdStringTag)
+			DevNode->PhbId = iSeries_Parse_PhbId(AreaData, AreaLen);
+		else if (*TagPtr == VpdVendorAreaTag)
+	    		iSeries_Parse_MfgArea(AreaData, AreaLen, DevNode);
+		/* Point to next Area. */
+		TagPtr  = AreaData + AreaLen;
+		DataLen -= AreaLen;
+	}
+}    
+
+void iSeries_Get_Location_Code(struct iSeries_Device_Node *DevNode)
+{
+	int BusVpdLen = 0;
+	u8 *BusVpdPtr = (u8 *)kmalloc(BUS_VPDSIZE, GFP_KERNEL);
+
+	if (BusVpdPtr == NULL) {
+		printk("PCI: Bus VPD Buffer allocation failure.\n");
+		return;
+	}
+	BusVpdLen = HvCallPci_getBusVpd(ISERIES_BUS(DevNode),
+					ISERIES_HV_ADDR(BusVpdPtr),
+					BUS_VPDSIZE);
+	if (BusVpdLen == 0) {
+		kfree(BusVpdPtr);
+		printk("PCI: Bus VPD Buffer zero length.\n");
+		return;
+	}
+	/* printk("PCI: BusVpdPtr: %p, %d\n",BusVpdPtr, BusVpdLen); */
+	/* Make sure this is what I think it is */
+	if (*BusVpdPtr != VpdIdStringTag) {	/* 0x82 */
+		printk("PCI: Bus VPD Buffer missing starting tag.\n");
+		kfree(BusVpdPtr);
+		return;
+	}
+	iSeries_Parse_Vpd(BusVpdPtr,BusVpdLen, DevNode);
+	sprintf(DevNode->Location, "Frame%3d, Card %-4s", DevNode->FrameId,
+			DevNode->CardLocation);
+	kfree(BusVpdPtr);
+}
diff --git a/arch/ppc64/kernel/iSeries_htab.c b/arch/ppc64/kernel/iSeries_htab.c
new file mode 100644
index 00000000000..aa9e8fdd1a4
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_htab.c
@@ -0,0 +1,242 @@
+/*
+ * iSeries hashtable management.
+ * 	Derived from pSeries_htab.c
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/iSeries/HvCallHpt.h>
+#include <asm/abs_addr.h>
+#include <linux/spinlock.h>
+
+static spinlock_t iSeries_hlocks[64] __cacheline_aligned_in_smp = { [0 ... 63] = SPIN_LOCK_UNLOCKED};
+
+/*
+ * Very primitive algorithm for picking up a lock
+ */
+static inline void iSeries_hlock(unsigned long slot)
+{
+	if (slot & 0x8)
+		slot = ~slot;
+	spin_lock(&iSeries_hlocks[(slot >> 4) & 0x3f]);
+}
+
+static inline void iSeries_hunlock(unsigned long slot)
+{
+	if (slot & 0x8)
+		slot = ~slot;
+	spin_unlock(&iSeries_hlocks[(slot >> 4) & 0x3f]);
+}
+
+static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
+			 unsigned long prpn, int secondary,
+			 unsigned long hpteflags, int bolted, int large)
+{
+	long slot;
+	HPTE lhpte;
+
+	/*
+	 * The hypervisor tries both primary and secondary.
+	 * If we are being called to insert in the secondary,
+	 * it means we have already tried both primary and secondary,
+	 * so we return failure immediately.
+	 */
+	if (secondary)
+		return -1;
+
+	iSeries_hlock(hpte_group);
+
+	slot = HvCallHpt_findValid(&lhpte, va >> PAGE_SHIFT);
+	BUG_ON(lhpte.dw0.dw0.v);
+
+	if (slot == -1)	{ /* No available entry found in either group */
+		iSeries_hunlock(hpte_group);
+		return -1;
+	}
+
+	if (slot < 0) {		/* MSB set means secondary group */
+		secondary = 1;
+		slot &= 0x7fffffffffffffff;
+	}
+
+	lhpte.dw1.dword1      = 0;
+	lhpte.dw1.dw1.rpn     = physRpn_to_absRpn(prpn);
+	lhpte.dw1.flags.flags = hpteflags;
+
+	lhpte.dw0.dword0      = 0;
+	lhpte.dw0.dw0.avpn    = va >> 23;
+	lhpte.dw0.dw0.h       = secondary;
+	lhpte.dw0.dw0.bolted  = bolted;
+	lhpte.dw0.dw0.v       = 1;
+
+	/* Now fill in the actual HPTE */
+	HvCallHpt_addValidate(slot, secondary, &lhpte);
+
+	iSeries_hunlock(hpte_group);
+
+	return (secondary << 3) | (slot & 7);
+}
+
+static unsigned long iSeries_hpte_getword0(unsigned long slot)
+{
+	unsigned long dword0;
+	HPTE hpte;
+
+	HvCallHpt_get(&hpte, slot);
+	dword0 = hpte.dw0.dword0;
+
+	return dword0;
+}
+
+static long iSeries_hpte_remove(unsigned long hpte_group)
+{
+	unsigned long slot_offset;
+	int i;
+	HPTE lhpte;
+
+	/* Pick a random slot to start at */
+	slot_offset = mftb() & 0x7;
+
+	iSeries_hlock(hpte_group);
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+		lhpte.dw0.dword0 = 
+			iSeries_hpte_getword0(hpte_group + slot_offset);
+
+		if (!lhpte.dw0.dw0.bolted) {
+			HvCallHpt_invalidateSetSwBitsGet(hpte_group + 
+							 slot_offset, 0, 0);
+			iSeries_hunlock(hpte_group);
+			return i;
+		}
+
+		slot_offset++;
+		slot_offset &= 0x7;
+	}
+
+	iSeries_hunlock(hpte_group);
+
+	return -1;
+}
+
+/*
+ * The HyperVisor expects the "flags" argument in this form:
+ * 	bits  0..59 : reserved
+ * 	bit      60 : N
+ * 	bits 61..63 : PP2,PP1,PP0
+ */
+static long iSeries_hpte_updatepp(unsigned long slot, unsigned long newpp,
+				  unsigned long va, int large, int local)
+{
+	HPTE hpte;
+	unsigned long avpn = va >> 23;
+
+	iSeries_hlock(slot);
+
+	HvCallHpt_get(&hpte, slot);
+	if ((hpte.dw0.dw0.avpn == avpn) && (hpte.dw0.dw0.v)) {
+		/*
+		 * Hypervisor expects bits as NPPP, which is
+		 * different from how they are mapped in our PP.
+		 */
+		HvCallHpt_setPp(slot, (newpp & 0x3) | ((newpp & 0x4) << 1));
+		iSeries_hunlock(slot);
+		return 0;
+	}
+	iSeries_hunlock(slot);
+
+	return -1;
+}
+
+/*
+ * Functions used to find the PTE for a particular virtual address. 
+ * Only used during boot when bolting pages.
+ *
+ * Input : vpn      : virtual page number
+ * Output: PTE index within the page table of the entry
+ *         -1 on failure
+ */
+static long iSeries_hpte_find(unsigned long vpn)
+{
+	HPTE hpte;
+	long slot;
+
+	/*
+	 * The HvCallHpt_findValid interface is as follows:
+	 * 0xffffffffffffffff : No entry found.
+	 * 0x00000000xxxxxxxx : Entry found in primary group, slot x
+	 * 0x80000000xxxxxxxx : Entry found in secondary group, slot x
+	 */
+	slot = HvCallHpt_findValid(&hpte, vpn); 
+	if (hpte.dw0.dw0.v) {
+		if (slot < 0) {
+			slot &= 0x7fffffffffffffff;
+			slot = -slot;
+		}
+	} else
+		slot = -1;
+	return slot;
+}
+
+/*
+ * Update the page protection bits. Intended to be used to create
+ * guard pages for kernel data structures on pages which are bolted
+ * in the HPT. Assumes pages being operated on will not be stolen.
+ * Does not work on large pages.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static void iSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
+{
+	unsigned long vsid,va,vpn;
+	long slot;
+
+	vsid = get_kernel_vsid(ea);
+	va = (vsid << 28) | (ea & 0x0fffffff);
+	vpn = va >> PAGE_SHIFT;
+	slot = iSeries_hpte_find(vpn); 
+	if (slot == -1)
+		panic("updateboltedpp: Could not find page to bolt\n");
+	HvCallHpt_setPp(slot, newpp);
+}
+
+static void iSeries_hpte_invalidate(unsigned long slot, unsigned long va,
+				    int large, int local)
+{
+	HPTE lhpte;
+	unsigned long avpn = va >> 23;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	iSeries_hlock(slot);
+
+	lhpte.dw0.dword0 = iSeries_hpte_getword0(slot);
+	
+	if ((lhpte.dw0.dw0.avpn == avpn) && lhpte.dw0.dw0.v)
+		HvCallHpt_invalidateSetSwBitsGet(slot, 0, 0);
+
+	iSeries_hunlock(slot);
+
+	local_irq_restore(flags);
+}
+
+void hpte_init_iSeries(void)
+{
+	ppc_md.hpte_invalidate	= iSeries_hpte_invalidate;
+	ppc_md.hpte_updatepp	= iSeries_hpte_updatepp;
+	ppc_md.hpte_updateboltedpp = iSeries_hpte_updateboltedpp;
+	ppc_md.hpte_insert	= iSeries_hpte_insert;
+	ppc_md.hpte_remove     	= iSeries_hpte_remove;
+
+	htab_finish_init();
+}
diff --git a/arch/ppc64/kernel/iSeries_iommu.c b/arch/ppc64/kernel/iSeries_iommu.c
new file mode 100644
index 00000000000..4e1a47c8a80
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_iommu.c
@@ -0,0 +1,175 @@
+/*
+ * arch/ppc64/kernel/iSeries_iommu.c
+ *
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ *
+ * Rewrite, cleanup:
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@austin.ibm.com>, IBM Corporation
+ *
+ * Dynamic DMA mapping support, iSeries-specific parts.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+#include <linux/list.h>
+
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/iSeries/HvCallXm.h>
+#include <asm/iSeries/iSeries_pci.h>
+
+extern struct list_head iSeries_Global_Device_List;
+
+
+static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	while (npages--) {
+		tce.te_word = 0;
+		tce.te_bits.tb_rpn = virt_to_abs(uaddr) >> PAGE_SHIFT;
+
+		if (tbl->it_type == TCE_VB) {
+			/* Virtual Bus */
+			tce.te_bits.tb_valid = 1;
+			tce.te_bits.tb_allio = 1;
+			if (direction != DMA_TO_DEVICE)
+				tce.te_bits.tb_rdwr = 1;
+		} else {
+			/* PCI Bus */
+			tce.te_bits.tb_rdwr = 1; /* Read allowed */
+			if (direction != DMA_TO_DEVICE)
+				tce.te_bits.tb_pciwr = 1;
+		}
+
+		rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index,
+				tce.te_word);
+		if (rc)
+			panic("PCI_DMA: HvCallXm_setTce failed, Rc: 0x%lx\n",
+					rc);
+		index++;
+		uaddr += PAGE_SIZE;
+	}
+}
+
+static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
+{
+	u64 rc;
+
+	while (npages--) {
+		rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index, 0);
+		if (rc)
+			panic("PCI_DMA: HvCallXm_setTce failed, Rc: 0x%lx\n",
+					rc);
+		index++;
+	}
+}
+
+
+/*
+ * This function compares the known tables to find an iommu_table
+ * that has already been built for hardware TCEs.
+ */
+static struct iommu_table *iommu_table_find(struct iommu_table * tbl)
+{
+	struct iSeries_Device_Node *dp;
+
+	list_for_each_entry(dp, &iSeries_Global_Device_List, Device_List) {
+		if ((dp->iommu_table != NULL) &&
+		    (dp->iommu_table->it_type == TCE_PCI) &&
+		    (dp->iommu_table->it_offset == tbl->it_offset) &&
+		    (dp->iommu_table->it_index == tbl->it_index) &&
+		    (dp->iommu_table->it_size == tbl->it_size))
+			return dp->iommu_table;
+	}
+	return NULL;
+}
+
+/*
+ * Call Hv with the architected data structure to get TCE table info.
+ * info. Put the returned data into the Linux representation of the
+ * TCE table data.
+ * The Hardware Tce table comes in three flavors.
+ * 1. TCE table shared between Buses.
+ * 2. TCE table per Bus.
+ * 3. TCE Table per IOA.
+ */
+static void iommu_table_getparms(struct iSeries_Device_Node* dn,
+				 struct iommu_table* tbl)
+{
+	struct iommu_table_cb *parms;
+
+	parms = kmalloc(sizeof(*parms), GFP_KERNEL);
+	if (parms == NULL)
+		panic("PCI_DMA: TCE Table Allocation failed.");
+
+	memset(parms, 0, sizeof(*parms));
+
+	parms->itc_busno = ISERIES_BUS(dn);
+	parms->itc_slotno = dn->LogicalSlot;
+	parms->itc_virtbus = 0;
+
+	HvCallXm_getTceTableParms(ISERIES_HV_ADDR(parms));
+
+	if (parms->itc_size == 0)
+		panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms);
+
+	/* itc_size is in pages worth of table, it_size is in # of entries */
+	tbl->it_size = (parms->itc_size * PAGE_SIZE) / sizeof(union tce_entry);
+	tbl->it_busno = parms->itc_busno;
+	tbl->it_offset = parms->itc_offset;
+	tbl->it_index = parms->itc_index;
+	tbl->it_blocksize = 1;
+	tbl->it_type = TCE_PCI;
+
+	kfree(parms);
+}
+
+
+void iommu_devnode_init_iSeries(struct iSeries_Device_Node *dn)
+{
+	struct iommu_table *tbl;
+
+	tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+	iommu_table_getparms(dn, tbl);
+
+	/* Look for existing tce table */
+	dn->iommu_table = iommu_table_find(tbl);
+	if (dn->iommu_table == NULL)
+		dn->iommu_table = iommu_init_table(tbl);
+	else
+		kfree(tbl);
+}
+
+static void iommu_dev_setup_iSeries(struct pci_dev *dev) { }
+static void iommu_bus_setup_iSeries(struct pci_bus *bus) { }
+
+void iommu_init_early_iSeries(void)
+{
+	ppc_md.tce_build = tce_build_iSeries;
+	ppc_md.tce_free  = tce_free_iSeries;
+
+	ppc_md.iommu_dev_setup = iommu_dev_setup_iSeries;
+	ppc_md.iommu_bus_setup = iommu_bus_setup_iSeries;
+
+	pci_iommu_init();
+}
diff --git a/arch/ppc64/kernel/iSeries_irq.c b/arch/ppc64/kernel/iSeries_irq.c
new file mode 100644
index 00000000000..f831d259dbb
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_irq.c
@@ -0,0 +1,209 @@
+/************************************************************************/
+/* This module supports the iSeries PCI bus interrupt handling          */
+/* Copyright (C) 20yy  <Robert L Holtorf> <IBM Corp>                    */
+/*                                                                      */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or    */
+/* (at your option) any later version.                                  */
+/*                                                                      */
+/* This program is distributed in the hope that it will be useful,      */ 
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of       */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        */
+/* GNU General Public License for more details.                         */
+/*                                                                      */
+/* You should have received a copy of the GNU General Public License    */ 
+/* along with this program; if not, write to the:                       */
+/* Free Software Foundation, Inc.,                                      */ 
+/* 59 Temple Place, Suite 330,                                          */ 
+/* Boston, MA  02111-1307  USA                                          */
+/************************************************************************/
+/* Change Activity:                                                     */
+/*   Created, December 13, 2000 by Wayne Holm                           */ 
+/* End Change Activity                                                  */
+/************************************************************************/
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/ide.h>
+
+#include <linux/irq.h>
+#include <linux/spinlock.h>
+#include <asm/ppcdebug.h>
+
+#include <asm/iSeries/HvCallPci.h>
+#include <asm/iSeries/HvCallXm.h>
+#include <asm/iSeries/iSeries_irq.h>
+#include <asm/iSeries/XmPciLpEvent.h>
+
+static unsigned int iSeries_startup_IRQ(unsigned int irq);
+static void iSeries_shutdown_IRQ(unsigned int irq);
+static void iSeries_enable_IRQ(unsigned int irq);
+static void iSeries_disable_IRQ(unsigned int irq);
+static void iSeries_end_IRQ(unsigned int irq);
+
+static hw_irq_controller iSeries_IRQ_handler = {
+	.typename = "iSeries irq controller",
+	.startup = iSeries_startup_IRQ,
+	.shutdown = iSeries_shutdown_IRQ,
+	.enable = iSeries_enable_IRQ,
+	.disable = iSeries_disable_IRQ,
+	.end = iSeries_end_IRQ
+};
+
+/* This maps virtual irq numbers to real irqs */
+unsigned int virt_irq_to_real_map[NR_IRQS];
+
+/* The next available virtual irq number */
+/* Note: the pcnet32 driver assumes irq numbers < 2 aren't valid. :( */
+static int next_virtual_irq = 2;
+
+/* This is called by init_IRQ.  set in ppc_md.init_IRQ by iSeries_setup.c */
+void __init iSeries_init_IRQ(void)
+{
+	/* Register PCI event handler and open an event path */
+	XmPciLpEvent_init();
+}
+
+/*
+ * This is called out of iSeries_scan_slot to allocate an IRQ for an EADS slot
+ * It calculates the irq value for the slot.
+ * Note that subBusNumber is always 0 (at the moment at least).
+ */
+int __init iSeries_allocate_IRQ(HvBusNumber busNumber,
+		HvSubBusNumber subBusNumber, HvAgentId deviceId)
+{
+	unsigned int realirq, virtirq;
+	u8 idsel = (deviceId >> 4);
+	u8 function = deviceId & 7;
+
+	virtirq = next_virtual_irq++;
+	realirq = ((busNumber - 1) << 6) + ((idsel - 1) << 3) + function;
+	virt_irq_to_real_map[virtirq] = realirq;
+
+	irq_desc[virtirq].handler = &iSeries_IRQ_handler;
+	return virtirq;
+}
+
+#define REAL_IRQ_TO_BUS(irq)	((((irq) >> 6) & 0xff) + 1)
+#define REAL_IRQ_TO_IDSEL(irq)	((((irq) >> 3) & 7) + 1)
+#define REAL_IRQ_TO_FUNC(irq)	((irq) & 7)
+
+/* This is called by iSeries_activate_IRQs */
+static unsigned int iSeries_startup_IRQ(unsigned int irq)
+{
+	u32 bus, deviceId, function, mask;
+	const u32 subBus = 0;
+	unsigned int rirq = virt_irq_to_real_map[irq];
+
+	bus = REAL_IRQ_TO_BUS(rirq);
+	function = REAL_IRQ_TO_FUNC(rirq);
+	deviceId = (REAL_IRQ_TO_IDSEL(rirq) << 4) + function;
+
+	/* Link the IRQ number to the bridge */
+	HvCallXm_connectBusUnit(bus, subBus, deviceId, irq);
+
+	/* Unmask bridge interrupts in the FISR */
+	mask = 0x01010000 << function;
+	HvCallPci_unmaskFisr(bus, subBus, deviceId, mask);
+	iSeries_enable_IRQ(irq);
+	return 0;
+}
+
+/*
+ * This is called out of iSeries_fixup to activate interrupt
+ * generation for usable slots
+ */
+void __init iSeries_activate_IRQs()
+{
+	int irq;
+	unsigned long flags;
+
+	for_each_irq (irq) {
+		irq_desc_t *desc = get_irq_desc(irq);
+
+		if (desc && desc->handler && desc->handler->startup) {
+			spin_lock_irqsave(&desc->lock, flags);
+			desc->handler->startup(irq);
+			spin_unlock_irqrestore(&desc->lock, flags);
+		}
+    	}
+}
+
+/*  this is not called anywhere currently */
+static void iSeries_shutdown_IRQ(unsigned int irq)
+{
+	u32 bus, deviceId, function, mask;
+	const u32 subBus = 0;
+	unsigned int rirq = virt_irq_to_real_map[irq];
+
+	/* irq should be locked by the caller */
+	bus = REAL_IRQ_TO_BUS(rirq);
+	function = REAL_IRQ_TO_FUNC(rirq);
+	deviceId = (REAL_IRQ_TO_IDSEL(rirq) << 4) + function;
+
+	/* Invalidate the IRQ number in the bridge */
+	HvCallXm_connectBusUnit(bus, subBus, deviceId, 0);
+
+	/* Mask bridge interrupts in the FISR */
+	mask = 0x01010000 << function;
+	HvCallPci_maskFisr(bus, subBus, deviceId, mask);
+}
+
+/*
+ * This will be called by device drivers (via disable_IRQ)
+ * to disable INTA in the bridge interrupt status register.
+ */
+static void iSeries_disable_IRQ(unsigned int irq)
+{
+	u32 bus, deviceId, function, mask;
+	const u32 subBus = 0;
+	unsigned int rirq = virt_irq_to_real_map[irq];
+
+	/* The IRQ has already been locked by the caller */
+	bus = REAL_IRQ_TO_BUS(rirq);
+	function = REAL_IRQ_TO_FUNC(rirq);
+	deviceId = (REAL_IRQ_TO_IDSEL(rirq) << 4) + function;
+
+	/* Mask secondary INTA   */
+	mask = 0x80000000;
+	HvCallPci_maskInterrupts(bus, subBus, deviceId, mask);
+	PPCDBG(PPCDBG_BUSWALK, "iSeries_disable_IRQ 0x%02X.%02X.%02X 0x%04X\n",
+	       bus, subBus, deviceId, irq);
+}
+
+/*
+ * This will be called by device drivers (via enable_IRQ)
+ * to enable INTA in the bridge interrupt status register.
+ */
+static void iSeries_enable_IRQ(unsigned int irq)
+{
+	u32 bus, deviceId, function, mask;
+	const u32 subBus = 0;
+	unsigned int rirq = virt_irq_to_real_map[irq];
+
+	/* The IRQ has already been locked by the caller */
+	bus = REAL_IRQ_TO_BUS(rirq);
+	function = REAL_IRQ_TO_FUNC(rirq);
+	deviceId = (REAL_IRQ_TO_IDSEL(rirq) << 4) + function;
+
+	/* Unmask secondary INTA */
+	mask = 0x80000000;
+	HvCallPci_unmaskInterrupts(bus, subBus, deviceId, mask);
+	PPCDBG(PPCDBG_BUSWALK, "iSeries_enable_IRQ 0x%02X.%02X.%02X 0x%04X\n",
+	       bus, subBus, deviceId, irq);
+}
+
+/*
+ * Need to define this so ppc_irq_dispatch_handler will NOT call
+ * enable_IRQ at the end of interrupt handling.  However, this does
+ * nothing because there is not enough information provided to do
+ * the EOI HvCall.  This is done by XmPciLpEvent.c
+ */
+static void iSeries_end_IRQ(unsigned int irq)
+{
+}
diff --git a/arch/ppc64/kernel/iSeries_pci.c b/arch/ppc64/kernel/iSeries_pci.c
new file mode 100644
index 00000000000..bd4c2554f1a
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_pci.c
@@ -0,0 +1,912 @@
+/*
+ * iSeries_pci.c
+ *
+ * Copyright (C) 2001 Allan Trautman, IBM Corporation
+ *
+ * iSeries specific routines for PCI.
+ * 
+ * Based on code from pci.c and iSeries_pci.c 32bit
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/list.h> 
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ide.h>
+#include <linux/pci.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppcdebug.h>
+#include <asm/iommu.h>
+
+#include <asm/iSeries/HvCallPci.h>
+#include <asm/iSeries/HvCallSm.h>
+#include <asm/iSeries/HvCallXm.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/iSeries/iSeries_irq.h>
+#include <asm/iSeries/iSeries_pci.h>
+#include <asm/iSeries/mf.h>
+
+#include "pci.h"
+
+extern unsigned long io_page_mask;
+
+/*
+ * Forward declares of prototypes. 
+ */
+static struct iSeries_Device_Node *find_Device_Node(int bus, int devfn);
+static void scan_PHB_slots(struct pci_controller *Phb);
+static void scan_EADS_bridge(HvBusNumber Bus, HvSubBusNumber SubBus, int IdSel);
+static int scan_bridge_slot(HvBusNumber Bus, struct HvCallPci_BridgeInfo *Info);
+
+LIST_HEAD(iSeries_Global_Device_List);
+
+static int DeviceCount;
+
+/* Counters and control flags. */
+static long Pci_Io_Read_Count;
+static long Pci_Io_Write_Count;
+#if 0
+static long Pci_Cfg_Read_Count;
+static long Pci_Cfg_Write_Count;
+#endif
+static long Pci_Error_Count;
+
+static int Pci_Retry_Max = 3;	/* Only retry 3 times  */	
+static int Pci_Error_Flag = 1;	/* Set Retry Error on. */
+
+static struct pci_ops iSeries_pci_ops;
+
+/*
+ * Table defines
+ * Each Entry size is 4 MB * 1024 Entries = 4GB I/O address space.
+ */
+#define IOMM_TABLE_MAX_ENTRIES	1024
+#define IOMM_TABLE_ENTRY_SIZE	0x0000000000400000UL
+#define BASE_IO_MEMORY		0xE000000000000000UL
+
+static unsigned long max_io_memory = 0xE000000000000000UL;
+static long current_iomm_table_entry;
+
+/*
+ * Lookup Tables.
+ */
+static struct iSeries_Device_Node **iomm_table;
+static u8 *iobar_table;
+
+/*
+ * Static and Global variables
+ */
+static char *pci_io_text = "iSeries PCI I/O";
+static DEFINE_SPINLOCK(iomm_table_lock);
+
+/*
+ * iomm_table_initialize
+ *
+ * Allocates and initalizes the Address Translation Table and Bar
+ * Tables to get them ready for use.  Must be called before any
+ * I/O space is handed out to the device BARs.
+ */
+static void iomm_table_initialize(void)
+{
+	spin_lock(&iomm_table_lock);
+	iomm_table = kmalloc(sizeof(*iomm_table) * IOMM_TABLE_MAX_ENTRIES,
+			GFP_KERNEL);
+	iobar_table = kmalloc(sizeof(*iobar_table) * IOMM_TABLE_MAX_ENTRIES,
+			GFP_KERNEL);
+	spin_unlock(&iomm_table_lock);
+	if ((iomm_table == NULL) || (iobar_table == NULL))
+		panic("PCI: I/O tables allocation failed.\n");
+}
+
+/*
+ * iomm_table_allocate_entry
+ *
+ * Adds pci_dev entry in address translation table
+ *
+ * - Allocates the number of entries required in table base on BAR
+ *   size.
+ * - Allocates starting at BASE_IO_MEMORY and increases.
+ * - The size is round up to be a multiple of entry size.
+ * - CurrentIndex is incremented to keep track of the last entry.
+ * - Builds the resource entry for allocated BARs.
+ */
+static void iomm_table_allocate_entry(struct pci_dev *dev, int bar_num)
+{
+	struct resource *bar_res = &dev->resource[bar_num];
+	long bar_size = pci_resource_len(dev, bar_num);
+
+	/*
+	 * No space to allocate, quick exit, skip Allocation.
+	 */
+	if (bar_size == 0)
+		return;
+	/*
+	 * Set Resource values.
+	 */
+	spin_lock(&iomm_table_lock);
+	bar_res->name = pci_io_text;
+	bar_res->start =
+		IOMM_TABLE_ENTRY_SIZE * current_iomm_table_entry;
+	bar_res->start += BASE_IO_MEMORY;
+	bar_res->end = bar_res->start + bar_size - 1;
+	/*
+	 * Allocate the number of table entries needed for BAR.
+	 */
+	while (bar_size > 0 ) {
+		iomm_table[current_iomm_table_entry] = dev->sysdata;
+		iobar_table[current_iomm_table_entry] = bar_num;
+		bar_size -= IOMM_TABLE_ENTRY_SIZE;
+		++current_iomm_table_entry;
+	}
+	max_io_memory = BASE_IO_MEMORY +
+		(IOMM_TABLE_ENTRY_SIZE * current_iomm_table_entry);
+	spin_unlock(&iomm_table_lock);
+}
+
+/*
+ * allocate_device_bars
+ *
+ * - Allocates ALL pci_dev BAR's and updates the resources with the
+ *   BAR value.  BARS with zero length will have the resources
+ *   The HvCallPci_getBarParms is used to get the size of the BAR
+ *   space.  It calls iomm_table_allocate_entry to allocate
+ *   each entry.
+ * - Loops through The Bar resources(0 - 5) including the ROM
+ *   is resource(6).
+ */
+static void allocate_device_bars(struct pci_dev *dev)
+{
+	struct resource *bar_res;
+	int bar_num;
+
+	for (bar_num = 0; bar_num <= PCI_ROM_RESOURCE; ++bar_num) {
+		bar_res = &dev->resource[bar_num];
+		iomm_table_allocate_entry(dev, bar_num);
+    	}
+}
+
+/*
+ * Log error information to system console.
+ * Filter out the device not there errors.
+ * PCI: EADs Connect Failed 0x18.58.10 Rc: 0x00xx
+ * PCI: Read Vendor Failed 0x18.58.10 Rc: 0x00xx
+ * PCI: Connect Bus Unit Failed 0x18.58.10 Rc: 0x00xx
+ */
+static void pci_Log_Error(char *Error_Text, int Bus, int SubBus,
+		int AgentId, int HvRc)
+{
+	if (HvRc == 0x0302)
+		return;
+	printk(KERN_ERR "PCI: %s Failed: 0x%02X.%02X.%02X Rc: 0x%04X",
+	       Error_Text, Bus, SubBus, AgentId, HvRc);
+}
+
+/*
+ * build_device_node(u16 Bus, int SubBus, u8 DevFn)
+ */
+static struct iSeries_Device_Node *build_device_node(HvBusNumber Bus,
+		HvSubBusNumber SubBus, int AgentId, int Function)
+{
+	struct iSeries_Device_Node *node;
+
+	PPCDBG(PPCDBG_BUSWALK,
+			"-build_device_node 0x%02X.%02X.%02X Function: %02X\n",
+			Bus, SubBus, AgentId, Function);
+
+	node = kmalloc(sizeof(struct iSeries_Device_Node), GFP_KERNEL);
+	if (node == NULL)
+		return NULL;
+
+	memset(node, 0, sizeof(struct iSeries_Device_Node));
+	list_add_tail(&node->Device_List, &iSeries_Global_Device_List);
+#if 0
+	node->DsaAddr = ((u64)Bus << 48) + ((u64)SubBus << 40) + ((u64)0x10 << 32);
+#endif
+	node->DsaAddr.DsaAddr = 0;
+	node->DsaAddr.Dsa.busNumber = Bus;
+	node->DsaAddr.Dsa.subBusNumber = SubBus;
+	node->DsaAddr.Dsa.deviceId = 0x10;
+	node->AgentId = AgentId;
+	node->DevFn = PCI_DEVFN(ISERIES_ENCODE_DEVICE(AgentId), Function);
+	node->IoRetry = 0;
+	iSeries_Get_Location_Code(node);
+	return node;
+}
+
+/*
+ * unsigned long __init find_and_init_phbs(void)
+ *
+ * Description:
+ *   This function checks for all possible system PCI host bridges that connect
+ *   PCI buses.  The system hypervisor is queried as to the guest partition
+ *   ownership status.  A pci_controller is built for any bus which is partially
+ *   owned or fully owned by this guest partition.
+ */
+unsigned long __init find_and_init_phbs(void)
+{
+	struct pci_controller *phb;
+	HvBusNumber bus;
+
+	PPCDBG(PPCDBG_BUSWALK, "find_and_init_phbs Entry\n");
+
+	/* Check all possible buses. */
+	for (bus = 0; bus < 256; bus++) {
+		int ret = HvCallXm_testBus(bus);
+		if (ret == 0) {
+			printk("bus %d appears to exist\n", bus);
+
+			phb = (struct pci_controller *)kmalloc(sizeof(struct pci_controller), GFP_KERNEL);
+			if (phb == NULL)
+				return -ENOMEM;
+       			pci_setup_pci_controller(phb);
+
+			phb->pci_mem_offset = phb->local_number = bus;
+			phb->first_busno = bus;
+			phb->last_busno = bus;
+			phb->ops = &iSeries_pci_ops;
+
+			PPCDBG(PPCDBG_BUSWALK, "PCI:Create iSeries pci_controller(%p), Bus: %04X\n",
+					phb, bus);
+
+			/* Find and connect the devices. */
+			scan_PHB_slots(phb);
+		}
+		/*
+		 * Check for Unexpected Return code, a clue that something
+		 * has gone wrong.
+		 */
+		else if (ret != 0x0301)
+			printk(KERN_ERR "Unexpected Return on Probe(0x%04X): 0x%04X",
+			       bus, ret);
+	}
+	return 0;
+}
+
+/*
+ * iSeries_pcibios_init
+ *  
+ * Chance to initialize and structures or variable before PCI Bus walk.
+ */
+void iSeries_pcibios_init(void)
+{
+	PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_init Entry.\n"); 
+	iomm_table_initialize();
+	find_and_init_phbs();
+	io_page_mask = -1;
+	PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_init Exit.\n"); 
+}
+
+/*
+ * iSeries_pci_final_fixup(void)  
+ */
+void __init iSeries_pci_final_fixup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iSeries_Device_Node *node;
+	char Buffer[256];
+    	int DeviceCount = 0;
+
+	PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_fixup Entry.\n"); 
+
+	/* Fix up at the device node and pci_dev relationship */
+	mf_display_src(0xC9000100);
+
+	printk("pcibios_final_fixup\n");
+	for_each_pci_dev(pdev) {
+		node = find_Device_Node(pdev->bus->number, pdev->devfn);
+		printk("pci dev %p (%x.%x), node %p\n", pdev,
+		       pdev->bus->number, pdev->devfn, node);
+
+		if (node != NULL) {
+			++DeviceCount;
+			pdev->sysdata = (void *)node;
+			node->PciDev = pdev;
+			PPCDBG(PPCDBG_BUSWALK,
+					"pdev 0x%p <==> DevNode 0x%p\n",
+					pdev, node);
+			allocate_device_bars(pdev);
+			iSeries_Device_Information(pdev, Buffer,
+					sizeof(Buffer));
+			printk("%d. %s\n", DeviceCount, Buffer);
+			iommu_devnode_init_iSeries(node);
+		} else
+			printk("PCI: Device Tree not found for 0x%016lX\n",
+					(unsigned long)pdev);
+		pdev->irq = node->Irq;
+	}
+	iSeries_activate_IRQs();
+	mf_display_src(0xC9000200);
+}
+
+void pcibios_fixup_bus(struct pci_bus *PciBus)
+{
+	PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_fixup_bus(0x%04X) Entry.\n",
+			PciBus->number); 
+}
+
+void pcibios_fixup_resources(struct pci_dev *pdev)
+{
+	PPCDBG(PPCDBG_BUSWALK, "fixup_resources pdev %p\n", pdev);
+}   
+
+/*
+ * Loop through each node function to find usable EADs bridges.  
+ */
+static void scan_PHB_slots(struct pci_controller *Phb)
+{
+	struct HvCallPci_DeviceInfo *DevInfo;
+	HvBusNumber bus = Phb->local_number;	/* System Bus */	
+	const HvSubBusNumber SubBus = 0;	/* EADs is always 0. */
+	int HvRc = 0;
+	int IdSel;	
+	const int MaxAgents = 8;
+
+	DevInfo = (struct HvCallPci_DeviceInfo*)
+		kmalloc(sizeof(struct HvCallPci_DeviceInfo), GFP_KERNEL);
+	if (DevInfo == NULL)
+		return;
+
+	/*
+	 * Probe for EADs Bridges      
+	 */
+	for (IdSel = 1; IdSel < MaxAgents; ++IdSel) {
+    		HvRc = HvCallPci_getDeviceInfo(bus, SubBus, IdSel,
+				ISERIES_HV_ADDR(DevInfo),
+				sizeof(struct HvCallPci_DeviceInfo));
+		if (HvRc == 0) {
+			if (DevInfo->deviceType == HvCallPci_NodeDevice)
+				scan_EADS_bridge(bus, SubBus, IdSel);
+			else
+				printk("PCI: Invalid System Configuration(0x%02X)"
+				       " for bus 0x%02x id 0x%02x.\n",
+				       DevInfo->deviceType, bus, IdSel);
+		}
+		else
+			pci_Log_Error("getDeviceInfo", bus, SubBus, IdSel, HvRc);
+	}
+	kfree(DevInfo);
+}
+
+static void scan_EADS_bridge(HvBusNumber bus, HvSubBusNumber SubBus,
+		int IdSel)
+{
+	struct HvCallPci_BridgeInfo *BridgeInfo;
+	HvAgentId AgentId;
+	int Function;
+	int HvRc;
+
+	BridgeInfo = (struct HvCallPci_BridgeInfo *)
+		kmalloc(sizeof(struct HvCallPci_BridgeInfo), GFP_KERNEL);
+	if (BridgeInfo == NULL)
+		return;
+
+	/* Note: hvSubBus and irq is always be 0 at this level! */
+	for (Function = 0; Function < 8; ++Function) {
+	  	AgentId = ISERIES_PCI_AGENTID(IdSel, Function);
+		HvRc = HvCallXm_connectBusUnit(bus, SubBus, AgentId, 0);
+ 		if (HvRc == 0) {
+			printk("found device at bus %d idsel %d func %d (AgentId %x)\n",
+			       bus, IdSel, Function, AgentId);
+  			/*  Connect EADs: 0x18.00.12 = 0x00 */
+			PPCDBG(PPCDBG_BUSWALK,
+					"PCI:Connect EADs: 0x%02X.%02X.%02X\n",
+					bus, SubBus, AgentId);
+	    		HvRc = HvCallPci_getBusUnitInfo(bus, SubBus, AgentId,
+					ISERIES_HV_ADDR(BridgeInfo),
+					sizeof(struct HvCallPci_BridgeInfo));
+	 		if (HvRc == 0) {
+				printk("bridge info: type %x subbus %x maxAgents %x maxsubbus %x logslot %x\n",
+					BridgeInfo->busUnitInfo.deviceType,
+					BridgeInfo->subBusNumber,
+					BridgeInfo->maxAgents,
+					BridgeInfo->maxSubBusNumber,
+					BridgeInfo->logicalSlotNumber);
+				PPCDBG(PPCDBG_BUSWALK,
+					"PCI: BridgeInfo, Type:0x%02X, SubBus:0x%02X, MaxAgents:0x%02X, MaxSubBus: 0x%02X, LSlot: 0x%02X\n",
+					BridgeInfo->busUnitInfo.deviceType,
+					BridgeInfo->subBusNumber,
+					BridgeInfo->maxAgents,
+					BridgeInfo->maxSubBusNumber,
+					BridgeInfo->logicalSlotNumber);
+
+				if (BridgeInfo->busUnitInfo.deviceType ==
+						HvCallPci_BridgeDevice)  {
+					/* Scan_Bridge_Slot...: 0x18.00.12 */
+					scan_bridge_slot(bus, BridgeInfo);
+				} else
+					printk("PCI: Invalid Bridge Configuration(0x%02X)",
+						BridgeInfo->busUnitInfo.deviceType);
+			}
+    		} else if (HvRc != 0x000B)
+			pci_Log_Error("EADs Connect",
+					bus, SubBus, AgentId, HvRc);
+	}
+	kfree(BridgeInfo);
+}
+
+/*
+ * This assumes that the node slot is always on the primary bus!
+ */
+static int scan_bridge_slot(HvBusNumber Bus,
+		struct HvCallPci_BridgeInfo *BridgeInfo)
+{
+	struct iSeries_Device_Node *node;
+	HvSubBusNumber SubBus = BridgeInfo->subBusNumber;
+	u16 VendorId = 0;
+	int HvRc = 0;
+	u8 Irq = 0;
+	int IdSel = ISERIES_GET_DEVICE_FROM_SUBBUS(SubBus);
+	int Function = ISERIES_GET_FUNCTION_FROM_SUBBUS(SubBus);
+	HvAgentId EADsIdSel = ISERIES_PCI_AGENTID(IdSel, Function);
+
+	/* iSeries_allocate_IRQ.: 0x18.00.12(0xA3) */
+  	Irq = iSeries_allocate_IRQ(Bus, 0, EADsIdSel);
+	PPCDBG(PPCDBG_BUSWALK,
+		"PCI:- allocate and assign IRQ 0x%02X.%02X.%02X = 0x%02X\n",
+		Bus, 0, EADsIdSel, Irq);
+
+	/*
+	 * Connect all functions of any device found.  
+	 */
+  	for (IdSel = 1; IdSel <= BridgeInfo->maxAgents; ++IdSel) {
+    		for (Function = 0; Function < 8; ++Function) {
+			HvAgentId AgentId = ISERIES_PCI_AGENTID(IdSel, Function);
+			HvRc = HvCallXm_connectBusUnit(Bus, SubBus,
+					AgentId, Irq);
+			if (HvRc != 0) {
+				pci_Log_Error("Connect Bus Unit",
+					      Bus, SubBus, AgentId, HvRc);
+				continue;
+			}
+
+			HvRc = HvCallPci_configLoad16(Bus, SubBus, AgentId,
+						      PCI_VENDOR_ID, &VendorId);
+			if (HvRc != 0) {
+				pci_Log_Error("Read Vendor",
+					      Bus, SubBus, AgentId, HvRc);
+				continue;
+			}
+			printk("read vendor ID: %x\n", VendorId);
+
+			/* FoundDevice: 0x18.28.10 = 0x12AE */
+			PPCDBG(PPCDBG_BUSWALK,
+			       "PCI:- FoundDevice: 0x%02X.%02X.%02X = 0x%04X, irq %d\n",
+			       Bus, SubBus, AgentId, VendorId, Irq);
+			HvRc = HvCallPci_configStore8(Bus, SubBus, AgentId,
+						      PCI_INTERRUPT_LINE, Irq);  
+			if (HvRc != 0)
+				pci_Log_Error("PciCfgStore Irq Failed!",
+					      Bus, SubBus, AgentId, HvRc);
+
+			++DeviceCount;
+			node = build_device_node(Bus, SubBus, EADsIdSel, Function);
+			node->Vendor = VendorId;
+			node->Irq = Irq;
+			node->LogicalSlot = BridgeInfo->logicalSlotNumber;
+
+		} /* for (Function = 0; Function < 8; ++Function) */
+	} /* for (IdSel = 1; IdSel <= MaxAgents; ++IdSel) */
+	return HvRc;
+}
+
+/*
+ * I/0 Memory copy MUST use mmio commands on iSeries
+ * To do; For performance, include the hv call directly
+ */
+void iSeries_memset_io(volatile void __iomem *dest, char c, size_t Count)
+{
+	u8 ByteValue = c;
+	long NumberOfBytes = Count;
+
+	while (NumberOfBytes > 0) {
+		iSeries_Write_Byte(ByteValue, dest++);
+		-- NumberOfBytes;
+	}
+}
+EXPORT_SYMBOL(iSeries_memset_io);
+
+void iSeries_memcpy_toio(volatile void __iomem *dest, void *source, size_t count)
+{
+	char *src = source;
+	long NumberOfBytes = count;
+
+	while (NumberOfBytes > 0) {
+		iSeries_Write_Byte(*src++, dest++);
+		-- NumberOfBytes;
+	}
+}
+EXPORT_SYMBOL(iSeries_memcpy_toio);
+
+void iSeries_memcpy_fromio(void *dest, const volatile void __iomem *src, size_t count)
+{
+	char *dst = dest;
+	long NumberOfBytes = count;
+
+	while (NumberOfBytes > 0) {
+		*dst++ = iSeries_Read_Byte(src++);
+		-- NumberOfBytes;
+	}
+}
+EXPORT_SYMBOL(iSeries_memcpy_fromio);
+
+/*
+ * Look down the chain to find the matching Device Device
+ */
+static struct iSeries_Device_Node *find_Device_Node(int bus, int devfn)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &iSeries_Global_Device_List) {
+		struct iSeries_Device_Node *node =
+			list_entry(pos, struct iSeries_Device_Node, Device_List);
+
+		if ((bus == ISERIES_BUS(node)) && (devfn == node->DevFn))
+			return node;
+	}
+	return NULL;
+}
+
+#if 0
+/*
+ * Returns the device node for the passed pci_dev
+ * Sanity Check Node PciDev to passed pci_dev
+ * If none is found, returns a NULL which the client must handle.
+ */
+static struct iSeries_Device_Node *get_Device_Node(struct pci_dev *pdev)
+{
+	struct iSeries_Device_Node *node;
+
+	node = pdev->sysdata;
+	if (node == NULL || node->PciDev != pdev)
+		node = find_Device_Node(pdev->bus->number, pdev->devfn);
+	return node;
+}
+#endif
+
+/*
+ * Config space read and write functions.
+ * For now at least, we look for the device node for the bus and devfn
+ * that we are asked to access.  It may be possible to translate the devfn
+ * to a subbus and deviceid more directly.
+ */
+static u64 hv_cfg_read_func[4]  = {
+	HvCallPciConfigLoad8, HvCallPciConfigLoad16,
+	HvCallPciConfigLoad32, HvCallPciConfigLoad32
+};
+
+static u64 hv_cfg_write_func[4] = {
+	HvCallPciConfigStore8, HvCallPciConfigStore16,
+	HvCallPciConfigStore32, HvCallPciConfigStore32
+};
+
+/*
+ * Read PCI config space
+ */
+static int iSeries_pci_read_config(struct pci_bus *bus, unsigned int devfn,
+		int offset, int size, u32 *val)
+{
+	struct iSeries_Device_Node *node = find_Device_Node(bus->number, devfn);
+	u64 fn;
+	struct HvCallPci_LoadReturn ret;
+
+	if (node == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (offset > 255) {
+		*val = ~0;
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+	}
+
+	fn = hv_cfg_read_func[(size - 1) & 3];
+	HvCall3Ret16(fn, &ret, node->DsaAddr.DsaAddr, offset, 0);
+
+	if (ret.rc != 0) {
+		*val = ~0;
+		return PCIBIOS_DEVICE_NOT_FOUND;	/* or something */
+	}
+
+	*val = ret.value;
+	return 0;
+}
+
+/*
+ * Write PCI config space
+ */
+
+static int iSeries_pci_write_config(struct pci_bus *bus, unsigned int devfn,
+		int offset, int size, u32 val)
+{
+	struct iSeries_Device_Node *node = find_Device_Node(bus->number, devfn);
+	u64 fn;
+	u64 ret;
+
+	if (node == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (offset > 255)
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+
+	fn = hv_cfg_write_func[(size - 1) & 3];
+	ret = HvCall4(fn, node->DsaAddr.DsaAddr, offset, val, 0);
+
+	if (ret != 0)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	return 0;
+}
+
+static struct pci_ops iSeries_pci_ops = {
+	.read = iSeries_pci_read_config,
+	.write = iSeries_pci_write_config
+};
+
+/*
+ * Check Return Code
+ * -> On Failure, print and log information.
+ *    Increment Retry Count, if exceeds max, panic partition.
+ * -> If in retry, print and log success 
+ *
+ * PCI: Device 23.90 ReadL I/O Error( 0): 0x1234
+ * PCI: Device 23.90 ReadL Retry( 1)
+ * PCI: Device 23.90 ReadL Retry Successful(1)
+ */
+static int CheckReturnCode(char *TextHdr, struct iSeries_Device_Node *DevNode,
+		u64 ret)
+{
+	if (ret != 0)  {
+		++Pci_Error_Count;
+		++DevNode->IoRetry;
+		printk("PCI: %s: Device 0x%04X:%02X  I/O Error(%2d): 0x%04X\n",
+				TextHdr, DevNode->DsaAddr.Dsa.busNumber, DevNode->DevFn,
+				DevNode->IoRetry, (int)ret);
+		/*
+		 * Bump the retry and check for retry count exceeded.
+		 * If, Exceeded, panic the system.
+		 */
+		if ((DevNode->IoRetry > Pci_Retry_Max) &&
+				(Pci_Error_Flag > 0)) {
+			mf_display_src(0xB6000103);
+			panic_timeout = 0; 
+			panic("PCI: Hardware I/O Error, SRC B6000103, "
+					"Automatic Reboot Disabled.\n");
+		}
+		return -1;	/* Retry Try */
+	}
+	/* If retry was in progress, log success and rest retry count */
+	if (DevNode->IoRetry > 0)
+		DevNode->IoRetry = 0;
+	return 0; 
+}
+
+/*
+ * Translate the I/O Address into a device node, bar, and bar offset.
+ * Note: Make sure the passed variable end up on the stack to avoid
+ * the exposure of being device global.
+ */
+static inline struct iSeries_Device_Node *xlate_iomm_address(
+		const volatile void __iomem *IoAddress,
+		u64 *dsaptr, u64 *BarOffsetPtr)
+{
+	unsigned long OrigIoAddr;
+	unsigned long BaseIoAddr;
+	unsigned long TableIndex;
+	struct iSeries_Device_Node *DevNode;
+
+	OrigIoAddr = (unsigned long __force)IoAddress;
+	if ((OrigIoAddr < BASE_IO_MEMORY) || (OrigIoAddr >= max_io_memory))
+		return NULL;
+	BaseIoAddr = OrigIoAddr - BASE_IO_MEMORY;
+	TableIndex = BaseIoAddr / IOMM_TABLE_ENTRY_SIZE;
+	DevNode = iomm_table[TableIndex];
+
+	if (DevNode != NULL) {
+		int barnum = iobar_table[TableIndex];
+		*dsaptr = DevNode->DsaAddr.DsaAddr | (barnum << 24);
+		*BarOffsetPtr = BaseIoAddr % IOMM_TABLE_ENTRY_SIZE;
+	} else
+		panic("PCI: Invalid PCI IoAddress detected!\n");
+	return DevNode;
+}
+
+/*
+ * Read MM I/O Instructions for the iSeries
+ * On MM I/O error, all ones are returned and iSeries_pci_IoError is cal
+ * else, data is returned in big Endian format.
+ *
+ * iSeries_Read_Byte = Read Byte  ( 8 bit)
+ * iSeries_Read_Word = Read Word  (16 bit)
+ * iSeries_Read_Long = Read Long  (32 bit)
+ */
+u8 iSeries_Read_Byte(const volatile void __iomem *IoAddress)
+{
+	u64 BarOffset;
+	u64 dsa;
+	struct HvCallPci_LoadReturn ret;
+	struct iSeries_Device_Node *DevNode =
+		xlate_iomm_address(IoAddress, &dsa, &BarOffset);
+
+	if (DevNode == NULL) {
+		static unsigned long last_jiffies;
+		static int num_printed;
+
+		if ((jiffies - last_jiffies) > 60 * HZ) {
+			last_jiffies = jiffies;
+			num_printed = 0;
+		}
+		if (num_printed++ < 10)
+			printk(KERN_ERR "iSeries_Read_Byte: invalid access at IO address %p\n", IoAddress);
+		return 0xff;
+	}
+	do {
+		++Pci_Io_Read_Count;
+		HvCall3Ret16(HvCallPciBarLoad8, &ret, dsa, BarOffset, 0);
+	} while (CheckReturnCode("RDB", DevNode, ret.rc) != 0);
+
+	return (u8)ret.value;
+}
+EXPORT_SYMBOL(iSeries_Read_Byte);
+
+u16 iSeries_Read_Word(const volatile void __iomem *IoAddress)
+{
+	u64 BarOffset;
+	u64 dsa;
+	struct HvCallPci_LoadReturn ret;
+	struct iSeries_Device_Node *DevNode =
+		xlate_iomm_address(IoAddress, &dsa, &BarOffset);
+
+	if (DevNode == NULL) {
+		static unsigned long last_jiffies;
+		static int num_printed;
+
+		if ((jiffies - last_jiffies) > 60 * HZ) {
+			last_jiffies = jiffies;
+			num_printed = 0;
+		}
+		if (num_printed++ < 10)
+			printk(KERN_ERR "iSeries_Read_Word: invalid access at IO address %p\n", IoAddress);
+		return 0xffff;
+	}
+	do {
+		++Pci_Io_Read_Count;
+		HvCall3Ret16(HvCallPciBarLoad16, &ret, dsa,
+				BarOffset, 0);
+	} while (CheckReturnCode("RDW", DevNode, ret.rc) != 0);
+
+	return swab16((u16)ret.value);
+}
+EXPORT_SYMBOL(iSeries_Read_Word);
+
+u32 iSeries_Read_Long(const volatile void __iomem *IoAddress)
+{
+	u64 BarOffset;
+	u64 dsa;
+	struct HvCallPci_LoadReturn ret;
+	struct iSeries_Device_Node *DevNode =
+		xlate_iomm_address(IoAddress, &dsa, &BarOffset);
+
+	if (DevNode == NULL) {
+		static unsigned long last_jiffies;
+		static int num_printed;
+
+		if ((jiffies - last_jiffies) > 60 * HZ) {
+			last_jiffies = jiffies;
+			num_printed = 0;
+		}
+		if (num_printed++ < 10)
+			printk(KERN_ERR "iSeries_Read_Long: invalid access at IO address %p\n", IoAddress);
+		return 0xffffffff;
+	}
+	do {
+		++Pci_Io_Read_Count;
+		HvCall3Ret16(HvCallPciBarLoad32, &ret, dsa,
+				BarOffset, 0);
+	} while (CheckReturnCode("RDL", DevNode, ret.rc) != 0);
+
+	return swab32((u32)ret.value);
+}
+EXPORT_SYMBOL(iSeries_Read_Long);
+
+/*
+ * Write MM I/O Instructions for the iSeries
+ *
+ * iSeries_Write_Byte = Write Byte (8 bit)
+ * iSeries_Write_Word = Write Word(16 bit)
+ * iSeries_Write_Long = Write Long(32 bit)
+ */
+void iSeries_Write_Byte(u8 data, volatile void __iomem *IoAddress)
+{
+	u64 BarOffset;
+	u64 dsa;
+	u64 rc;
+	struct iSeries_Device_Node *DevNode =
+		xlate_iomm_address(IoAddress, &dsa, &BarOffset);
+
+	if (DevNode == NULL) {
+		static unsigned long last_jiffies;
+		static int num_printed;
+
+		if ((jiffies - last_jiffies) > 60 * HZ) {
+			last_jiffies = jiffies;
+			num_printed = 0;
+		}
+		if (num_printed++ < 10)
+			printk(KERN_ERR "iSeries_Write_Byte: invalid access at IO address %p\n", IoAddress);
+		return;
+	}
+	do {
+		++Pci_Io_Write_Count;
+		rc = HvCall4(HvCallPciBarStore8, dsa, BarOffset, data, 0);
+	} while (CheckReturnCode("WWB", DevNode, rc) != 0);
+}
+EXPORT_SYMBOL(iSeries_Write_Byte);
+
+void iSeries_Write_Word(u16 data, volatile void __iomem *IoAddress)
+{
+	u64 BarOffset;
+	u64 dsa;
+	u64 rc;
+	struct iSeries_Device_Node *DevNode =
+		xlate_iomm_address(IoAddress, &dsa, &BarOffset);
+
+	if (DevNode == NULL) {
+		static unsigned long last_jiffies;
+		static int num_printed;
+
+		if ((jiffies - last_jiffies) > 60 * HZ) {
+			last_jiffies = jiffies;
+			num_printed = 0;
+		}
+		if (num_printed++ < 10)
+			printk(KERN_ERR "iSeries_Write_Word: invalid access at IO address %p\n", IoAddress);
+		return;
+	}
+	do {
+		++Pci_Io_Write_Count;
+		rc = HvCall4(HvCallPciBarStore16, dsa, BarOffset, swab16(data), 0);
+	} while (CheckReturnCode("WWW", DevNode, rc) != 0);
+}
+EXPORT_SYMBOL(iSeries_Write_Word);
+
+void iSeries_Write_Long(u32 data, volatile void __iomem *IoAddress)
+{
+	u64 BarOffset;
+	u64 dsa;
+	u64 rc;
+	struct iSeries_Device_Node *DevNode =
+		xlate_iomm_address(IoAddress, &dsa, &BarOffset);
+
+	if (DevNode == NULL) {
+		static unsigned long last_jiffies;
+		static int num_printed;
+
+		if ((jiffies - last_jiffies) > 60 * HZ) {
+			last_jiffies = jiffies;
+			num_printed = 0;
+		}
+		if (num_printed++ < 10)
+			printk(KERN_ERR "iSeries_Write_Long: invalid access at IO address %p\n", IoAddress);
+		return;
+	}
+	do {
+		++Pci_Io_Write_Count;
+		rc = HvCall4(HvCallPciBarStore32, dsa, BarOffset, swab32(data), 0);
+	} while (CheckReturnCode("WWL", DevNode, rc) != 0);
+}
+EXPORT_SYMBOL(iSeries_Write_Long);
diff --git a/arch/ppc64/kernel/iSeries_pci_reset.c b/arch/ppc64/kernel/iSeries_pci_reset.c
new file mode 100644
index 00000000000..0f785e4584f
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_pci_reset.c
@@ -0,0 +1,104 @@
+#define PCIFR(...)
+/************************************************************************/
+/* File iSeries_pci_reset.c created by Allan Trautman on Mar 21 2001.   */
+/************************************************************************/
+/* This code supports the pci interface on the IBM iSeries systems.     */
+/* Copyright (C) 20yy  <Allan H Trautman> <IBM Corp>                    */
+/*                                                                      */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or    */
+/* (at your option) any later version.                                  */
+/*                                                                      */
+/* This program is distributed in the hope that it will be useful,      */ 
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of       */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        */
+/* GNU General Public License for more details.                         */
+/*                                                                      */
+/* You should have received a copy of the GNU General Public License    */ 
+/* along with this program; if not, write to the:                       */
+/* Free Software Foundation, Inc.,                                      */ 
+/* 59 Temple Place, Suite 330,                                          */ 
+/* Boston, MA  02111-1307  USA                                          */
+/************************************************************************/
+/* Change Activity:                                                     */
+/*   Created, March 20, 2001                                            */
+/*   April 30, 2001, Added return codes on functions.                   */
+/*   September 10, 2001, Ported to ppc64.                               */
+/* End Change Activity                                                  */
+/************************************************************************/
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+
+#include <asm/io.h>
+#include <asm/iSeries/HvCallPci.h>
+#include <asm/iSeries/HvTypes.h>
+#include <asm/iSeries/mf.h>
+#include <asm/pci.h>
+
+#include <asm/iSeries/iSeries_pci.h>
+#include "pci.h"
+
+/*
+ * Interface to toggle the reset line
+ * Time is in .1 seconds, need for seconds.
+ */
+int iSeries_Device_ToggleReset(struct pci_dev *PciDev, int AssertTime,
+		int DelayTime)
+{
+	unsigned int AssertDelay, WaitDelay;
+	struct iSeries_Device_Node *DeviceNode =
+		(struct iSeries_Device_Node *)PciDev->sysdata;
+
+ 	if (DeviceNode == NULL) { 
+		printk("PCI: Pci Reset Failed, Device Node not found for pci_dev %p\n",
+				PciDev);
+		return -1;
+	}
+	/*
+	 * Set defaults, Assert is .5 second, Wait is 3 seconds.
+	 */
+	if (AssertTime == 0)
+		AssertDelay = 500;
+	else
+		AssertDelay = AssertTime * 100;
+
+	if (DelayTime == 0)
+		WaitDelay = 3000;
+	else
+		WaitDelay = DelayTime * 100;
+
+	/*
+	 * Assert reset
+	 */
+	DeviceNode->ReturnCode = HvCallPci_setSlotReset(ISERIES_BUS(DeviceNode),
+			0x00, DeviceNode->AgentId, 1);
+	if (DeviceNode->ReturnCode == 0) {
+		msleep(AssertDelay);			/* Sleep for the time */
+		DeviceNode->ReturnCode =
+			HvCallPci_setSlotReset(ISERIES_BUS(DeviceNode),
+					0x00, DeviceNode->AgentId, 0);
+
+		/*
+   		 * Wait for device to reset
+		 */
+		msleep(WaitDelay);
+	}
+	if (DeviceNode->ReturnCode == 0)
+		PCIFR("Slot 0x%04X.%02 Reset\n", ISERIES_BUS(DeviceNode),
+				DeviceNode->AgentId);
+	else {
+		printk("PCI: Slot 0x%04X.%02X Reset Failed, RCode: %04X\n",
+				ISERIES_BUS(DeviceNode), DeviceNode->AgentId,
+				DeviceNode->ReturnCode);
+		PCIFR("Slot 0x%04X.%02X Reset Failed, RCode: %04X\n",
+				ISERIES_BUS(DeviceNode), DeviceNode->AgentId,
+				DeviceNode->ReturnCode);
+	}
+	return DeviceNode->ReturnCode;
+}
+EXPORT_SYMBOL(iSeries_Device_ToggleReset);
diff --git a/arch/ppc64/kernel/iSeries_proc.c b/arch/ppc64/kernel/iSeries_proc.c
new file mode 100644
index 00000000000..0cc58ddf48d
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_proc.c
@@ -0,0 +1,162 @@
+/*
+ * iSeries_proc.c
+ * Copyright (C) 2001  Kyle A. Lucke IBM Corporation
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/param.h>		/* for HZ */
+#include <asm/paca.h>
+#include <asm/processor.h>
+#include <asm/time.h>
+#include <asm/lppaca.h>
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/iSeries/HvCallXm.h>
+#include <asm/iSeries/IoHriMainStore.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/iSeries/iSeries_proc.h>
+
+static int __init iseries_proc_create(void)
+{
+	struct proc_dir_entry *e = proc_mkdir("iSeries", 0);
+	if (!e)
+		return 1;
+
+	return 0;
+}
+core_initcall(iseries_proc_create);
+
+static char *event_types[9] = {
+	"Hypervisor\t\t",
+	"Machine Facilities\t",
+	"Session Manager\t",
+	"SPD I/O\t\t",
+	"Virtual Bus\t\t",
+	"PCI I/O\t\t",
+	"RIO I/O\t\t",
+	"Virtual Lan\t\t",
+	"Virtual I/O\t\t"
+};
+
+static int proc_lpevents_show(struct seq_file *m, void *v)
+{
+	unsigned int i;
+
+	seq_printf(m, "LpEventQueue 0\n");
+	seq_printf(m, "  events processed:\t%lu\n",
+		   (unsigned long)xItLpQueue.xLpIntCount);
+
+	for (i = 0; i < 9; ++i)
+		seq_printf(m, "    %s %10lu\n", event_types[i],
+			   (unsigned long)xItLpQueue.xLpIntCountByType[i]);
+
+	seq_printf(m, "\n  events processed by processor:\n");
+
+	for_each_online_cpu(i)
+		seq_printf(m, "    CPU%02d  %10u\n", i, paca[i].lpevent_count);
+
+	return 0;
+}
+
+static int proc_lpevents_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_lpevents_show, NULL);
+}
+
+static struct file_operations proc_lpevents_operations = {
+	.open		= proc_lpevents_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static unsigned long startTitan = 0;
+static unsigned long startTb = 0;
+
+static int proc_titantod_show(struct seq_file *m, void *v)
+{
+	unsigned long tb0, titan_tod;
+
+	tb0 = get_tb();
+	titan_tod = HvCallXm_loadTod();
+
+	seq_printf(m, "Titan\n" );
+	seq_printf(m, "  time base =          %016lx\n", tb0);
+	seq_printf(m, "  titan tod =          %016lx\n", titan_tod);
+	seq_printf(m, "  xProcFreq =          %016x\n",
+		   xIoHriProcessorVpd[0].xProcFreq);
+	seq_printf(m, "  xTimeBaseFreq =      %016x\n",
+		   xIoHriProcessorVpd[0].xTimeBaseFreq);
+	seq_printf(m, "  tb_ticks_per_jiffy = %lu\n", tb_ticks_per_jiffy);
+	seq_printf(m, "  tb_ticks_per_usec  = %lu\n", tb_ticks_per_usec);
+
+	if (!startTitan) {
+		startTitan = titan_tod;
+		startTb = tb0;
+	} else {
+		unsigned long titan_usec = (titan_tod - startTitan) >> 12;
+		unsigned long tb_ticks = (tb0 - startTb);
+		unsigned long titan_jiffies = titan_usec / (1000000/HZ);
+		unsigned long titan_jiff_usec = titan_jiffies * (1000000/HZ);
+		unsigned long titan_jiff_rem_usec = titan_usec - titan_jiff_usec;
+		unsigned long tb_jiffies = tb_ticks / tb_ticks_per_jiffy;
+		unsigned long tb_jiff_ticks = tb_jiffies * tb_ticks_per_jiffy;
+		unsigned long tb_jiff_rem_ticks = tb_ticks - tb_jiff_ticks;
+		unsigned long tb_jiff_rem_usec = tb_jiff_rem_ticks / tb_ticks_per_usec;
+		unsigned long new_tb_ticks_per_jiffy = (tb_ticks * (1000000/HZ))/titan_usec;
+
+		seq_printf(m, "  titan elapsed = %lu uSec\n", titan_usec);
+		seq_printf(m, "  tb elapsed    = %lu ticks\n", tb_ticks);
+		seq_printf(m, "  titan jiffies = %lu.%04lu \n", titan_jiffies,
+			   titan_jiff_rem_usec);
+		seq_printf(m, "  tb jiffies    = %lu.%04lu\n", tb_jiffies,
+			   tb_jiff_rem_usec);
+		seq_printf(m, "  new tb_ticks_per_jiffy = %lu\n",
+			   new_tb_ticks_per_jiffy);
+	}
+
+	return 0;
+}
+
+static int proc_titantod_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_titantod_show, NULL);
+}
+
+static struct file_operations proc_titantod_operations = {
+	.open		= proc_titantod_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init iseries_proc_init(void)
+{
+	struct proc_dir_entry *e;
+
+	e = create_proc_entry("iSeries/lpevents", S_IFREG|S_IRUGO, NULL);
+	if (e)
+		e->proc_fops = &proc_lpevents_operations;
+
+	e = create_proc_entry("iSeries/titanTod", S_IFREG|S_IRUGO, NULL);
+	if (e)
+		e->proc_fops = &proc_titantod_operations;
+
+	return 0;
+}
+__initcall(iseries_proc_init);
diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c
new file mode 100644
index 00000000000..da20120f226
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_setup.c
@@ -0,0 +1,877 @@
+/*
+ *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *    Copyright (c) 1999-2000 Grant Erickson <grant@lcse.umn.edu>
+ *
+ *    Module name: iSeries_setup.c
+ *
+ *    Description:
+ *      Architecture- / platform-specific boot-time initialization code for
+ *      the IBM iSeries LPAR.  Adapted from original code by Grant Erickson and
+ *      code by Gary Thomas, Cort Dougan <cort@fsmlabs.com>, and Dan Malek
+ *      <dan@net4x.com>.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/initrd.h>
+#include <linux/seq_file.h>
+#include <linux/kdev_t.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+
+#include <asm/processor.h>
+#include <asm/machdep.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/iommu.h>
+
+#include <asm/time.h>
+#include "iSeries_setup.h"
+#include <asm/naca.h>
+#include <asm/paca.h>
+#include <asm/cache.h>
+#include <asm/sections.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/iSeries/HvCallHpt.h>
+#include <asm/iSeries/HvLpConfig.h>
+#include <asm/iSeries/HvCallEvent.h>
+#include <asm/iSeries/HvCallSm.h>
+#include <asm/iSeries/HvCallXm.h>
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/iSeries/IoHriMainStore.h>
+#include <asm/iSeries/iSeries_proc.h>
+#include <asm/iSeries/mf.h>
+#include <asm/iSeries/HvLpEvent.h>
+#include <asm/iSeries/iSeries_irq.h>
+
+extern void hvlog(char *fmt, ...);
+
+#ifdef DEBUG
+#define DBG(fmt...) hvlog(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/* Function Prototypes */
+extern void ppcdbg_initialize(void);
+
+static void build_iSeries_Memory_Map(void);
+static void setup_iSeries_cache_sizes(void);
+static void iSeries_bolt_kernel(unsigned long saddr, unsigned long eaddr);
+extern void iSeries_pci_final_fixup(void);
+
+/* Global Variables */
+static unsigned long procFreqHz;
+static unsigned long procFreqMhz;
+static unsigned long procFreqMhzHundreths;
+
+static unsigned long tbFreqHz;
+static unsigned long tbFreqMhz;
+static unsigned long tbFreqMhzHundreths;
+
+int piranha_simulator;
+
+extern int rd_size;		/* Defined in drivers/block/rd.c */
+extern unsigned long klimit;
+extern unsigned long embedded_sysmap_start;
+extern unsigned long embedded_sysmap_end;
+
+extern unsigned long iSeries_recal_tb;
+extern unsigned long iSeries_recal_titan;
+
+static int mf_initialized;
+
+struct MemoryBlock {
+	unsigned long absStart;
+	unsigned long absEnd;
+	unsigned long logicalStart;
+	unsigned long logicalEnd;
+};
+
+/*
+ * Process the main store vpd to determine where the holes in memory are
+ * and return the number of physical blocks and fill in the array of
+ * block data.
+ */
+static unsigned long iSeries_process_Condor_mainstore_vpd(
+		struct MemoryBlock *mb_array, unsigned long max_entries)
+{
+	unsigned long holeFirstChunk, holeSizeChunks;
+	unsigned long numMemoryBlocks = 1;
+	struct IoHriMainStoreSegment4 *msVpd =
+		(struct IoHriMainStoreSegment4 *)xMsVpd;
+	unsigned long holeStart = msVpd->nonInterleavedBlocksStartAdr;
+	unsigned long holeEnd = msVpd->nonInterleavedBlocksEndAdr;
+	unsigned long holeSize = holeEnd - holeStart;
+
+	printk("Mainstore_VPD: Condor\n");
+	/*
+	 * Determine if absolute memory has any
+	 * holes so that we can interpret the
+	 * access map we get back from the hypervisor
+	 * correctly.
+	 */
+	mb_array[0].logicalStart = 0;
+	mb_array[0].logicalEnd = 0x100000000;
+	mb_array[0].absStart = 0;
+	mb_array[0].absEnd = 0x100000000;
+
+	if (holeSize) {
+		numMemoryBlocks = 2;
+		holeStart = holeStart & 0x000fffffffffffff;
+		holeStart = addr_to_chunk(holeStart);
+		holeFirstChunk = holeStart;
+		holeSize = addr_to_chunk(holeSize);
+		holeSizeChunks = holeSize;
+		printk( "Main store hole: start chunk = %0lx, size = %0lx chunks\n",
+				holeFirstChunk, holeSizeChunks );
+		mb_array[0].logicalEnd = holeFirstChunk;
+		mb_array[0].absEnd = holeFirstChunk;
+		mb_array[1].logicalStart = holeFirstChunk;
+		mb_array[1].logicalEnd = 0x100000000 - holeSizeChunks;
+		mb_array[1].absStart = holeFirstChunk + holeSizeChunks;
+		mb_array[1].absEnd = 0x100000000;
+	}
+	return numMemoryBlocks;
+}
+
+#define MaxSegmentAreas			32
+#define MaxSegmentAdrRangeBlocks	128
+#define MaxAreaRangeBlocks		4
+
+static unsigned long iSeries_process_Regatta_mainstore_vpd(
+		struct MemoryBlock *mb_array, unsigned long max_entries)
+{
+	struct IoHriMainStoreSegment5 *msVpdP =
+		(struct IoHriMainStoreSegment5 *)xMsVpd;
+	unsigned long numSegmentBlocks = 0;
+	u32 existsBits = msVpdP->msAreaExists;
+	unsigned long area_num;
+
+	printk("Mainstore_VPD: Regatta\n");
+
+	for (area_num = 0; area_num < MaxSegmentAreas; ++area_num ) {
+		unsigned long numAreaBlocks;
+		struct IoHriMainStoreArea4 *currentArea;
+
+		if (existsBits & 0x80000000) {
+			unsigned long block_num;
+
+			currentArea = &msVpdP->msAreaArray[area_num];
+			numAreaBlocks = currentArea->numAdrRangeBlocks;
+			printk("ms_vpd: processing area %2ld  blocks=%ld",
+					area_num, numAreaBlocks);
+			for (block_num = 0; block_num < numAreaBlocks;
+					++block_num ) {
+				/* Process an address range block */
+				struct MemoryBlock tempBlock;
+				unsigned long i;
+
+				tempBlock.absStart =
+					(unsigned long)currentArea->xAdrRangeBlock[block_num].blockStart;
+				tempBlock.absEnd =
+					(unsigned long)currentArea->xAdrRangeBlock[block_num].blockEnd;
+				tempBlock.logicalStart = 0;
+				tempBlock.logicalEnd   = 0;
+				printk("\n          block %ld absStart=%016lx absEnd=%016lx",
+						block_num, tempBlock.absStart,
+						tempBlock.absEnd);
+
+				for (i = 0; i < numSegmentBlocks; ++i) {
+					if (mb_array[i].absStart ==
+							tempBlock.absStart)
+						break;
+				}
+				if (i == numSegmentBlocks) {
+					if (numSegmentBlocks == max_entries)
+						panic("iSeries_process_mainstore_vpd: too many memory blocks");
+					mb_array[numSegmentBlocks] = tempBlock;
+					++numSegmentBlocks;
+				} else
+					printk(" (duplicate)");
+			}
+			printk("\n");
+		}
+		existsBits <<= 1;
+	}
+	/* Now sort the blocks found into ascending sequence */
+	if (numSegmentBlocks > 1) {
+		unsigned long m, n;
+
+		for (m = 0; m < numSegmentBlocks - 1; ++m) {
+			for (n = numSegmentBlocks - 1; m < n; --n) {
+				if (mb_array[n].absStart <
+						mb_array[n-1].absStart) {
+					struct MemoryBlock tempBlock;
+
+					tempBlock = mb_array[n];
+					mb_array[n] = mb_array[n-1];
+					mb_array[n-1] = tempBlock;
+				}
+			}
+		}
+	}
+	/*
+	 * Assign "logical" addresses to each block.  These
+	 * addresses correspond to the hypervisor "bitmap" space.
+	 * Convert all addresses into units of 256K chunks.
+	 */
+	{
+	unsigned long i, nextBitmapAddress;
+
+	printk("ms_vpd: %ld sorted memory blocks\n", numSegmentBlocks);
+	nextBitmapAddress = 0;
+	for (i = 0; i < numSegmentBlocks; ++i) {
+		unsigned long length = mb_array[i].absEnd -
+			mb_array[i].absStart;
+
+		mb_array[i].logicalStart = nextBitmapAddress;
+		mb_array[i].logicalEnd = nextBitmapAddress + length;
+		nextBitmapAddress += length;
+		printk("          Bitmap range: %016lx - %016lx\n"
+				"        Absolute range: %016lx - %016lx\n",
+				mb_array[i].logicalStart,
+				mb_array[i].logicalEnd,
+				mb_array[i].absStart, mb_array[i].absEnd);
+		mb_array[i].absStart = addr_to_chunk(mb_array[i].absStart &
+				0x000fffffffffffff);
+		mb_array[i].absEnd = addr_to_chunk(mb_array[i].absEnd &
+				0x000fffffffffffff);
+		mb_array[i].logicalStart =
+			addr_to_chunk(mb_array[i].logicalStart);
+		mb_array[i].logicalEnd = addr_to_chunk(mb_array[i].logicalEnd);
+	}
+	}
+
+	return numSegmentBlocks;
+}
+
+static unsigned long iSeries_process_mainstore_vpd(struct MemoryBlock *mb_array,
+		unsigned long max_entries)
+{
+	unsigned long i;
+	unsigned long mem_blocks = 0;
+
+	if (cpu_has_feature(CPU_FTR_SLB))
+		mem_blocks = iSeries_process_Regatta_mainstore_vpd(mb_array,
+				max_entries);
+	else
+		mem_blocks = iSeries_process_Condor_mainstore_vpd(mb_array,
+				max_entries);
+
+	printk("Mainstore_VPD: numMemoryBlocks = %ld \n", mem_blocks);
+	for (i = 0; i < mem_blocks; ++i) {
+		printk("Mainstore_VPD: block %3ld logical chunks %016lx - %016lx\n"
+		       "                             abs chunks %016lx - %016lx\n",
+			i, mb_array[i].logicalStart, mb_array[i].logicalEnd,
+			mb_array[i].absStart, mb_array[i].absEnd);
+	}
+	return mem_blocks;
+}
+
+static void __init iSeries_get_cmdline(void)
+{
+	char *p, *q;
+
+	/* copy the command line parameter from the primary VSP  */
+	HvCallEvent_dmaToSp(cmd_line, 2 * 64* 1024, 256,
+			HvLpDma_Direction_RemoteToLocal);
+
+	p = cmd_line;
+	q = cmd_line + 255;
+	while(p < q) {
+		if (!*p || *p == '\n')
+			break;
+		++p;
+	}
+	*p = 0;
+}
+
+static void __init iSeries_init_early(void)
+{
+	extern unsigned long memory_limit;
+
+	DBG(" -> iSeries_init_early()\n");
+
+	ppcdbg_initialize();
+
+#if defined(CONFIG_BLK_DEV_INITRD)
+	/*
+	 * If the init RAM disk has been configured and there is
+	 * a non-zero starting address for it, set it up
+	 */
+	if (naca.xRamDisk) {
+		initrd_start = (unsigned long)__va(naca.xRamDisk);
+		initrd_end = initrd_start + naca.xRamDiskSize * PAGE_SIZE;
+		initrd_below_start_ok = 1;	// ramdisk in kernel space
+		ROOT_DEV = Root_RAM0;
+		if (((rd_size * 1024) / PAGE_SIZE) < naca.xRamDiskSize)
+			rd_size = (naca.xRamDiskSize * PAGE_SIZE) / 1024;
+	} else
+#endif /* CONFIG_BLK_DEV_INITRD */
+	{
+	    /* ROOT_DEV = MKDEV(VIODASD_MAJOR, 1); */
+	}
+
+	iSeries_recal_tb = get_tb();
+	iSeries_recal_titan = HvCallXm_loadTod();
+
+	/*
+	 * Cache sizes must be initialized before hpte_init_iSeries is called
+	 * as the later need them for flush_icache_range()
+	 */
+	setup_iSeries_cache_sizes();
+
+	/*
+	 * Initialize the hash table management pointers
+	 */
+	hpte_init_iSeries();
+
+	/*
+	 * Initialize the DMA/TCE management
+	 */
+	iommu_init_early_iSeries();
+
+	/*
+	 * Initialize the table which translate Linux physical addresses to
+	 * AS/400 absolute addresses
+	 */
+	build_iSeries_Memory_Map();
+
+	iSeries_get_cmdline();
+
+	/* Save unparsed command line copy for /proc/cmdline */
+	strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE);
+
+	/* Parse early parameters, in particular mem=x */
+	parse_early_param();
+
+	if (memory_limit) {
+		if (memory_limit < systemcfg->physicalMemorySize)
+			systemcfg->physicalMemorySize = memory_limit;
+		else {
+			printk("Ignoring mem=%lu >= ram_top.\n", memory_limit);
+			memory_limit = 0;
+		}
+	}
+
+	/* Bolt kernel mappings for all of memory (or just a bit if we've got a limit) */
+	iSeries_bolt_kernel(0, systemcfg->physicalMemorySize);
+
+	lmb_init();
+	lmb_add(0, systemcfg->physicalMemorySize);
+	lmb_analyze();
+	lmb_reserve(0, __pa(klimit));
+
+	/* Initialize machine-dependency vectors */
+#ifdef CONFIG_SMP
+	smp_init_iSeries();
+#endif
+	if (itLpNaca.xPirEnvironMode == 0)
+		piranha_simulator = 1;
+
+	/* Associate Lp Event Queue 0 with processor 0 */
+	HvCallEvent_setLpEventQueueInterruptProc(0, 0);
+
+	mf_init();
+	mf_initialized = 1;
+	mb();
+
+	/* If we were passed an initrd, set the ROOT_DEV properly if the values
+	 * look sensible. If not, clear initrd reference.
+	 */
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (initrd_start >= KERNELBASE && initrd_end >= KERNELBASE &&
+	    initrd_end > initrd_start)
+		ROOT_DEV = Root_RAM0;
+	else
+		initrd_start = initrd_end = 0;
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+	DBG(" <- iSeries_init_early()\n");
+}
+
+/*
+ * The iSeries may have very large memories ( > 128 GB ) and a partition
+ * may get memory in "chunks" that may be anywhere in the 2**52 real
+ * address space.  The chunks are 256K in size.  To map this to the
+ * memory model Linux expects, the AS/400 specific code builds a
+ * translation table to translate what Linux thinks are "physical"
+ * addresses to the actual real addresses.  This allows us to make
+ * it appear to Linux that we have contiguous memory starting at
+ * physical address zero while in fact this could be far from the truth.
+ * To avoid confusion, I'll let the words physical and/or real address
+ * apply to the Linux addresses while I'll use "absolute address" to
+ * refer to the actual hardware real address.
+ *
+ * build_iSeries_Memory_Map gets information from the Hypervisor and
+ * looks at the Main Store VPD to determine the absolute addresses
+ * of the memory that has been assigned to our partition and builds
+ * a table used to translate Linux's physical addresses to these
+ * absolute addresses.  Absolute addresses are needed when
+ * communicating with the hypervisor (e.g. to build HPT entries)
+ */
+
+static void __init build_iSeries_Memory_Map(void)
+{
+	u32 loadAreaFirstChunk, loadAreaLastChunk, loadAreaSize;
+	u32 nextPhysChunk;
+	u32 hptFirstChunk, hptLastChunk, hptSizeChunks, hptSizePages;
+	u32 num_ptegs;
+	u32 totalChunks,moreChunks;
+	u32 currChunk, thisChunk, absChunk;
+	u32 currDword;
+	u32 chunkBit;
+	u64 map;
+	struct MemoryBlock mb[32];
+	unsigned long numMemoryBlocks, curBlock;
+
+	/* Chunk size on iSeries is 256K bytes */
+	totalChunks = (u32)HvLpConfig_getMsChunks();
+	klimit = msChunks_alloc(klimit, totalChunks, 1UL << 18);
+
+	/*
+	 * Get absolute address of our load area
+	 * and map it to physical address 0
+	 * This guarantees that the loadarea ends up at physical 0
+	 * otherwise, it might not be returned by PLIC as the first
+	 * chunks
+	 */
+
+	loadAreaFirstChunk = (u32)addr_to_chunk(itLpNaca.xLoadAreaAddr);
+	loadAreaSize =  itLpNaca.xLoadAreaChunks;
+
+	/*
+	 * Only add the pages already mapped here.
+	 * Otherwise we might add the hpt pages
+	 * The rest of the pages of the load area
+	 * aren't in the HPT yet and can still
+	 * be assigned an arbitrary physical address
+	 */
+	if ((loadAreaSize * 64) > HvPagesToMap)
+		loadAreaSize = HvPagesToMap / 64;
+
+	loadAreaLastChunk = loadAreaFirstChunk + loadAreaSize - 1;
+
+	/*
+	 * TODO Do we need to do something if the HPT is in the 64MB load area?
+	 * This would be required if the itLpNaca.xLoadAreaChunks includes
+	 * the HPT size
+	 */
+
+	printk("Mapping load area - physical addr = 0000000000000000\n"
+		"                    absolute addr = %016lx\n",
+		chunk_to_addr(loadAreaFirstChunk));
+	printk("Load area size %dK\n", loadAreaSize * 256);
+
+	for (nextPhysChunk = 0; nextPhysChunk < loadAreaSize; ++nextPhysChunk)
+		msChunks.abs[nextPhysChunk] =
+			loadAreaFirstChunk + nextPhysChunk;
+
+	/*
+	 * Get absolute address of our HPT and remember it so
+	 * we won't map it to any physical address
+	 */
+	hptFirstChunk = (u32)addr_to_chunk(HvCallHpt_getHptAddress());
+	hptSizePages = (u32)HvCallHpt_getHptPages();
+	hptSizeChunks = hptSizePages >> (msChunks.chunk_shift - PAGE_SHIFT);
+	hptLastChunk = hptFirstChunk + hptSizeChunks - 1;
+
+	printk("HPT absolute addr = %016lx, size = %dK\n",
+			chunk_to_addr(hptFirstChunk), hptSizeChunks * 256);
+
+	/* Fill in the hashed page table hash mask */
+	num_ptegs = hptSizePages *
+		(PAGE_SIZE / (sizeof(HPTE) * HPTES_PER_GROUP));
+	htab_hash_mask = num_ptegs - 1;
+
+	/*
+	 * The actual hashed page table is in the hypervisor,
+	 * we have no direct access
+	 */
+	htab_address = NULL;
+
+	/*
+	 * Determine if absolute memory has any
+	 * holes so that we can interpret the
+	 * access map we get back from the hypervisor
+	 * correctly.
+	 */
+	numMemoryBlocks = iSeries_process_mainstore_vpd(mb, 32);
+
+	/*
+	 * Process the main store access map from the hypervisor
+	 * to build up our physical -> absolute translation table
+	 */
+	curBlock = 0;
+	currChunk = 0;
+	currDword = 0;
+	moreChunks = totalChunks;
+
+	while (moreChunks) {
+		map = HvCallSm_get64BitsOfAccessMap(itLpNaca.xLpIndex,
+				currDword);
+		thisChunk = currChunk;
+		while (map) {
+			chunkBit = map >> 63;
+			map <<= 1;
+			if (chunkBit) {
+				--moreChunks;
+				while (thisChunk >= mb[curBlock].logicalEnd) {
+					++curBlock;
+					if (curBlock >= numMemoryBlocks)
+						panic("out of memory blocks");
+				}
+				if (thisChunk < mb[curBlock].logicalStart)
+					panic("memory block error");
+
+				absChunk = mb[curBlock].absStart +
+					(thisChunk - mb[curBlock].logicalStart);
+				if (((absChunk < hptFirstChunk) ||
+				     (absChunk > hptLastChunk)) &&
+				    ((absChunk < loadAreaFirstChunk) ||
+				     (absChunk > loadAreaLastChunk))) {
+					msChunks.abs[nextPhysChunk] = absChunk;
+					++nextPhysChunk;
+				}
+			}
+			++thisChunk;
+		}
+		++currDword;
+		currChunk += 64;
+	}
+
+	/*
+	 * main store size (in chunks) is
+	 *   totalChunks - hptSizeChunks
+	 * which should be equal to
+	 *   nextPhysChunk
+	 */
+	systemcfg->physicalMemorySize = chunk_to_addr(nextPhysChunk);
+}
+
+/*
+ * Set up the variables that describe the cache line sizes
+ * for this machine.
+ */
+static void __init setup_iSeries_cache_sizes(void)
+{
+	unsigned int i, n;
+	unsigned int procIx = get_paca()->lppaca.dyn_hv_phys_proc_index;
+
+	systemcfg->icache_size =
+	ppc64_caches.isize = xIoHriProcessorVpd[procIx].xInstCacheSize * 1024;
+	systemcfg->icache_line_size =
+	ppc64_caches.iline_size =
+		xIoHriProcessorVpd[procIx].xInstCacheOperandSize;
+	systemcfg->dcache_size =
+	ppc64_caches.dsize =
+		xIoHriProcessorVpd[procIx].xDataL1CacheSizeKB * 1024;
+	systemcfg->dcache_line_size =
+	ppc64_caches.dline_size =
+		xIoHriProcessorVpd[procIx].xDataCacheOperandSize;
+	ppc64_caches.ilines_per_page = PAGE_SIZE / ppc64_caches.iline_size;
+	ppc64_caches.dlines_per_page = PAGE_SIZE / ppc64_caches.dline_size;
+
+	i = ppc64_caches.iline_size;
+	n = 0;
+	while ((i = (i / 2)))
+		++n;
+	ppc64_caches.log_iline_size = n;
+
+	i = ppc64_caches.dline_size;
+	n = 0;
+	while ((i = (i / 2)))
+		++n;
+	ppc64_caches.log_dline_size = n;
+
+	printk("D-cache line size = %d\n",
+			(unsigned int)ppc64_caches.dline_size);
+	printk("I-cache line size = %d\n",
+			(unsigned int)ppc64_caches.iline_size);
+}
+
+/*
+ * Create a pte. Used during initialization only.
+ */
+static void iSeries_make_pte(unsigned long va, unsigned long pa,
+			     int mode)
+{
+	HPTE local_hpte, rhpte;
+	unsigned long hash, vpn;
+	long slot;
+
+	vpn = va >> PAGE_SHIFT;
+	hash = hpt_hash(vpn, 0);
+
+	local_hpte.dw1.dword1 = pa | mode;
+	local_hpte.dw0.dword0 = 0;
+	local_hpte.dw0.dw0.avpn = va >> 23;
+	local_hpte.dw0.dw0.bolted = 1;		/* bolted */
+	local_hpte.dw0.dw0.v = 1;
+
+	slot = HvCallHpt_findValid(&rhpte, vpn);
+	if (slot < 0) {
+		/* Must find space in primary group */
+		panic("hash_page: hpte already exists\n");
+	}
+	HvCallHpt_addValidate(slot, 0, (HPTE *)&local_hpte );
+}
+
+/*
+ * Bolt the kernel addr space into the HPT
+ */
+static void __init iSeries_bolt_kernel(unsigned long saddr, unsigned long eaddr)
+{
+	unsigned long pa;
+	unsigned long mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;
+	HPTE hpte;
+
+	for (pa = saddr; pa < eaddr ;pa += PAGE_SIZE) {
+		unsigned long ea = (unsigned long)__va(pa);
+		unsigned long vsid = get_kernel_vsid(ea);
+		unsigned long va = (vsid << 28) | (pa & 0xfffffff);
+		unsigned long vpn = va >> PAGE_SHIFT;
+		unsigned long slot = HvCallHpt_findValid(&hpte, vpn);
+
+		/* Make non-kernel text non-executable */
+		if (!in_kernel_text(ea))
+			mode_rw |= HW_NO_EXEC;
+
+		if (hpte.dw0.dw0.v) {
+			/* HPTE exists, so just bolt it */
+			HvCallHpt_setSwBits(slot, 0x10, 0);
+			/* And make sure the pp bits are correct */
+			HvCallHpt_setPp(slot, PP_RWXX);
+		} else
+			/* No HPTE exists, so create a new bolted one */
+			iSeries_make_pte(va, phys_to_abs(pa), mode_rw);
+	}
+}
+
+extern unsigned long ppc_proc_freq;
+extern unsigned long ppc_tb_freq;
+
+/*
+ * Document me.
+ */
+static void __init iSeries_setup_arch(void)
+{
+	void *eventStack;
+	unsigned procIx = get_paca()->lppaca.dyn_hv_phys_proc_index;
+
+	/* Add an eye catcher and the systemcfg layout version number */
+	strcpy(systemcfg->eye_catcher, "SYSTEMCFG:PPC64");
+	systemcfg->version.major = SYSTEMCFG_MAJOR;
+	systemcfg->version.minor = SYSTEMCFG_MINOR;
+
+	/* Setup the Lp Event Queue */
+
+	/* Allocate a page for the Event Stack
+	 * The hypervisor wants the absolute real address, so
+	 * we subtract out the KERNELBASE and add in the
+	 * absolute real address of the kernel load area
+	 */
+	eventStack = alloc_bootmem_pages(LpEventStackSize);
+	memset(eventStack, 0, LpEventStackSize);
+
+	/* Invoke the hypervisor to initialize the event stack */
+	HvCallEvent_setLpEventStack(0, eventStack, LpEventStackSize);
+
+	/* Initialize fields in our Lp Event Queue */
+	xItLpQueue.xSlicEventStackPtr = (char *)eventStack;
+	xItLpQueue.xSlicCurEventPtr = (char *)eventStack;
+	xItLpQueue.xSlicLastValidEventPtr = (char *)eventStack +
+					(LpEventStackSize - LpEventMaxSize);
+	xItLpQueue.xIndex = 0;
+
+	/* Compute processor frequency */
+	procFreqHz = ((1UL << 34) * 1000000) /
+			xIoHriProcessorVpd[procIx].xProcFreq;
+	procFreqMhz = procFreqHz / 1000000;
+	procFreqMhzHundreths = (procFreqHz / 10000) - (procFreqMhz * 100);
+	ppc_proc_freq = procFreqHz;
+
+	/* Compute time base frequency */
+	tbFreqHz = ((1UL << 32) * 1000000) /
+		xIoHriProcessorVpd[procIx].xTimeBaseFreq;
+	tbFreqMhz = tbFreqHz / 1000000;
+	tbFreqMhzHundreths = (tbFreqHz / 10000) - (tbFreqMhz * 100);
+	ppc_tb_freq = tbFreqHz;
+
+	printk("Max  logical processors = %d\n",
+			itVpdAreas.xSlicMaxLogicalProcs);
+	printk("Max physical processors = %d\n",
+			itVpdAreas.xSlicMaxPhysicalProcs);
+	printk("Processor frequency = %lu.%02lu\n", procFreqMhz,
+			procFreqMhzHundreths);
+	printk("Time base frequency = %lu.%02lu\n", tbFreqMhz,
+			tbFreqMhzHundreths);
+	systemcfg->processor = xIoHriProcessorVpd[procIx].xPVR;
+	printk("Processor version = %x\n", systemcfg->processor);
+}
+
+static void iSeries_get_cpuinfo(struct seq_file *m)
+{
+	seq_printf(m, "machine\t\t: 64-bit iSeries Logical Partition\n");
+}
+
+/*
+ * Document me.
+ * and Implement me.
+ */
+static int iSeries_get_irq(struct pt_regs *regs)
+{
+	/* -2 means ignore this interrupt */
+	return -2;
+}
+
+/*
+ * Document me.
+ */
+static void iSeries_restart(char *cmd)
+{
+	mf_reboot();
+}
+
+/*
+ * Document me.
+ */
+static void iSeries_power_off(void)
+{
+	mf_power_off();
+}
+
+/*
+ * Document me.
+ */
+static void iSeries_halt(void)
+{
+	mf_power_off();
+}
+
+extern void setup_default_decr(void);
+
+/*
+ * void __init iSeries_calibrate_decr()
+ *
+ * Description:
+ *   This routine retrieves the internal processor frequency from the VPD,
+ *   and sets up the kernel timer decrementer based on that value.
+ *
+ */
+static void __init iSeries_calibrate_decr(void)
+{
+	unsigned long	cyclesPerUsec;
+	struct div_result divres;
+
+	/* Compute decrementer (and TB) frequency in cycles/sec */
+	cyclesPerUsec = ppc_tb_freq / 1000000;
+
+	/*
+	 * Set the amount to refresh the decrementer by.  This
+	 * is the number of decrementer ticks it takes for
+	 * 1/HZ seconds.
+	 */
+	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
+
+#if 0
+	/* TEST CODE FOR ADJTIME */
+	tb_ticks_per_jiffy += tb_ticks_per_jiffy / 5000;
+	/* END OF TEST CODE */
+#endif
+
+	/*
+	 * tb_ticks_per_sec = freq; would give better accuracy
+	 * but tb_ticks_per_sec = tb_ticks_per_jiffy*HZ; assures
+	 * that jiffies (and xtime) will match the time returned
+	 * by do_gettimeofday.
+	 */
+	tb_ticks_per_sec = tb_ticks_per_jiffy * HZ;
+	tb_ticks_per_usec = cyclesPerUsec;
+	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
+	div128_by_32(1024 * 1024, 0, tb_ticks_per_sec, &divres);
+	tb_to_xs = divres.result_low;
+	setup_default_decr();
+}
+
+static void __init iSeries_progress(char * st, unsigned short code)
+{
+	printk("Progress: [%04x] - %s\n", (unsigned)code, st);
+	if (!piranha_simulator && mf_initialized) {
+		if (code != 0xffff)
+			mf_display_progress(code);
+		else
+			mf_clear_src();
+	}
+}
+
+static void __init iSeries_fixup_klimit(void)
+{
+	/*
+	 * Change klimit to take into account any ram disk
+	 * that may be included
+	 */
+	if (naca.xRamDisk)
+		klimit = KERNELBASE + (u64)naca.xRamDisk +
+			(naca.xRamDiskSize * PAGE_SIZE);
+	else {
+		/*
+		 * No ram disk was included - check and see if there
+		 * was an embedded system map.  Change klimit to take
+		 * into account any embedded system map
+		 */
+		if (embedded_sysmap_end)
+			klimit = KERNELBASE + ((embedded_sysmap_end + 4095) &
+					0xfffffffffffff000);
+	}
+}
+
+static int __init iSeries_src_init(void)
+{
+        /* clear the progress line */
+        ppc_md.progress(" ", 0xffff);
+        return 0;
+}
+
+late_initcall(iSeries_src_init);
+
+void __init iSeries_early_setup(void)
+{
+	iSeries_fixup_klimit();
+
+	ppc_md.setup_arch = iSeries_setup_arch;
+	ppc_md.get_cpuinfo = iSeries_get_cpuinfo;
+	ppc_md.init_IRQ = iSeries_init_IRQ;
+	ppc_md.get_irq = iSeries_get_irq;
+	ppc_md.init_early = iSeries_init_early,
+
+	ppc_md.pcibios_fixup  = iSeries_pci_final_fixup;
+
+	ppc_md.restart = iSeries_restart;
+	ppc_md.power_off = iSeries_power_off;
+	ppc_md.halt = iSeries_halt;
+
+	ppc_md.get_boot_time = iSeries_get_boot_time;
+	ppc_md.set_rtc_time = iSeries_set_rtc_time;
+	ppc_md.get_rtc_time = iSeries_get_rtc_time;
+	ppc_md.calibrate_decr = iSeries_calibrate_decr;
+	ppc_md.progress = iSeries_progress;
+}
+
diff --git a/arch/ppc64/kernel/iSeries_setup.h b/arch/ppc64/kernel/iSeries_setup.h
new file mode 100644
index 00000000000..c6eb29a245a
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_setup.h
@@ -0,0 +1,26 @@
+/*
+ *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *    Copyright (c) 1999-2000 Grant Erickson <grant@lcse.umn.edu>
+ *
+ *    Module name: as400_setup.h
+ *
+ *    Description:
+ *      Architecture- / platform-specific boot-time initialization code for
+ *      the IBM AS/400 LPAR. Adapted from original code by Grant Erickson and
+ *      code by Gary Thomas, Cort Dougan <cort@cs.nmt.edu>, and Dan Malek
+ *      <dan@netx4.com>.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef	__ISERIES_SETUP_H__
+#define	__ISERIES_SETUP_H__
+
+extern void iSeries_get_boot_time(struct rtc_time *tm);
+extern int iSeries_set_rtc_time(struct rtc_time *tm);
+extern void iSeries_get_rtc_time(struct rtc_time *tm);
+
+#endif /* __ISERIES_SETUP_H__ */
diff --git a/arch/ppc64/kernel/iSeries_smp.c b/arch/ppc64/kernel/iSeries_smp.c
new file mode 100644
index 00000000000..ba1f084d546
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_smp.c
@@ -0,0 +1,151 @@
+/*
+ * SMP support for iSeries machines.
+ *
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ * Plus various changes from other IBM teams...
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+
+#include <asm/ptrace.h>
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/iSeries/HvCall.h>
+#include <asm/iSeries/HvCallCfg.h>
+#include <asm/time.h>
+#include <asm/ppcdebug.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/system.h>
+
+static unsigned long iSeries_smp_message[NR_CPUS];
+
+void iSeries_smp_message_recv( struct pt_regs * regs )
+{
+	int cpu = smp_processor_id();
+	int msg;
+
+	if ( num_online_cpus() < 2 )
+		return;
+
+	for ( msg = 0; msg < 4; ++msg )
+		if ( test_and_clear_bit( msg, &iSeries_smp_message[cpu] ) )
+			smp_message_recv( msg, regs );
+}
+
+static inline void smp_iSeries_do_message(int cpu, int msg)
+{
+	set_bit(msg, &iSeries_smp_message[cpu]);
+	HvCall_sendIPI(&(paca[cpu]));
+}
+
+static void smp_iSeries_message_pass(int target, int msg)
+{
+	int i;
+
+	if (target < NR_CPUS)
+		smp_iSeries_do_message(target, msg);
+	else {
+		for_each_online_cpu(i) {
+			if (target == MSG_ALL_BUT_SELF
+			    && i == smp_processor_id())
+				continue;
+			smp_iSeries_do_message(i, msg);
+		}
+	}
+}
+
+static int smp_iSeries_numProcs(void)
+{
+	unsigned np, i;
+
+	np = 0;
+        for (i=0; i < NR_CPUS; ++i) {
+                if (paca[i].lppaca.dyn_proc_status < 2) {
+			cpu_set(i, cpu_possible_map);
+			cpu_set(i, cpu_present_map);
+			cpu_set(i, cpu_sibling_map[i]);
+                        ++np;
+                }
+        }
+	return np;
+}
+
+static int smp_iSeries_probe(void)
+{
+	unsigned i;
+	unsigned np = 0;
+
+	for (i=0; i < NR_CPUS; ++i) {
+		if (paca[i].lppaca.dyn_proc_status < 2) {
+			/*paca[i].active = 1;*/
+			++np;
+		}
+	}
+
+	return np;
+}
+
+static void smp_iSeries_kick_cpu(int nr)
+{
+	BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+	/* Verify that our partition has a processor nr */
+	if (paca[nr].lppaca.dyn_proc_status >= 2)
+		return;
+
+	/* The processor is currently spinning, waiting
+	 * for the cpu_start field to become non-zero
+	 * After we set cpu_start, the processor will
+	 * continue on to secondary_start in iSeries_head.S
+	 */
+	paca[nr].cpu_start = 1;
+}
+
+static void __devinit smp_iSeries_setup_cpu(int nr)
+{
+}
+
+static struct smp_ops_t iSeries_smp_ops = {
+	.message_pass = smp_iSeries_message_pass,
+	.probe        = smp_iSeries_probe,
+	.kick_cpu     = smp_iSeries_kick_cpu,
+	.setup_cpu    = smp_iSeries_setup_cpu,
+};
+
+/* This is called very early. */
+void __init smp_init_iSeries(void)
+{
+	smp_ops = &iSeries_smp_ops;
+	systemcfg->processorCount	= smp_iSeries_numProcs();
+}
+
diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c
new file mode 100644
index 00000000000..6abc621d3ba
--- /dev/null
+++ b/arch/ppc64/kernel/idle.c
@@ -0,0 +1,380 @@
+/*
+ * Idle daemon for PowerPC.  Idle daemon will handle any action
+ * that needs to be taken when the system becomes idle.
+ *
+ * Originally Written by Cort Dougan (cort@cs.nmt.edu)
+ *
+ * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * Additional shared processor, SMT, and firmware support
+ *    Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/smp.h>
+
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/cputable.h>
+#include <asm/time.h>
+#include <asm/iSeries/HvCall.h>
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/systemcfg.h>
+
+extern void power4_idle(void);
+
+static int (*idle_loop)(void);
+
+#ifdef CONFIG_PPC_ISERIES
+static unsigned long maxYieldTime = 0;
+static unsigned long minYieldTime = 0xffffffffffffffffUL;
+
+static void yield_shared_processor(void)
+{
+	unsigned long tb;
+	unsigned long yieldTime;
+
+	HvCall_setEnabledInterrupts(HvCall_MaskIPI |
+				    HvCall_MaskLpEvent |
+				    HvCall_MaskLpProd |
+				    HvCall_MaskTimeout);
+
+	tb = get_tb();
+	/* Compute future tb value when yield should expire */
+	HvCall_yieldProcessor(HvCall_YieldTimed, tb+tb_ticks_per_jiffy);
+
+	yieldTime = get_tb() - tb;
+	if (yieldTime > maxYieldTime)
+		maxYieldTime = yieldTime;
+
+	if (yieldTime < minYieldTime)
+		minYieldTime = yieldTime;
+	
+	/*
+	 * The decrementer stops during the yield.  Force a fake decrementer
+	 * here and let the timer_interrupt code sort out the actual time.
+	 */
+	get_paca()->lppaca.int_dword.fields.decr_int = 1;
+	process_iSeries_events();
+}
+
+static int iSeries_idle(void)
+{
+	struct paca_struct *lpaca;
+	long oldval;
+	unsigned long CTRL;
+
+	/* ensure iSeries run light will be out when idle */
+	clear_thread_flag(TIF_RUN_LIGHT);
+	CTRL = mfspr(CTRLF);
+	CTRL &= ~RUNLATCH;
+	mtspr(CTRLT, CTRL);
+
+	lpaca = get_paca();
+
+	while (1) {
+		if (lpaca->lppaca.shared_proc) {
+			if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
+				process_iSeries_events();
+			if (!need_resched())
+				yield_shared_processor();
+		} else {
+			oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
+
+			if (!oldval) {
+				set_thread_flag(TIF_POLLING_NRFLAG);
+
+				while (!need_resched()) {
+					HMT_medium();
+					if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
+						process_iSeries_events();
+					HMT_low();
+				}
+
+				HMT_medium();
+				clear_thread_flag(TIF_POLLING_NRFLAG);
+			} else {
+				set_need_resched();
+			}
+		}
+
+		schedule();
+	}
+
+	return 0;
+}
+
+#else
+
+static int default_idle(void)
+{
+	long oldval;
+	unsigned int cpu = smp_processor_id();
+
+	while (1) {
+		oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
+
+		if (!oldval) {
+			set_thread_flag(TIF_POLLING_NRFLAG);
+
+			while (!need_resched() && !cpu_is_offline(cpu)) {
+				barrier();
+				/*
+				 * Go into low thread priority and possibly
+				 * low power mode.
+				 */
+				HMT_low();
+				HMT_very_low();
+			}
+
+			HMT_medium();
+			clear_thread_flag(TIF_POLLING_NRFLAG);
+		} else {
+			set_need_resched();
+		}
+
+		schedule();
+		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+			cpu_die();
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_PPC_PSERIES
+
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
+
+int dedicated_idle(void)
+{
+	long oldval;
+	struct paca_struct *lpaca = get_paca(), *ppaca;
+	unsigned long start_snooze;
+	unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
+	unsigned int cpu = smp_processor_id();
+
+	ppaca = &paca[cpu ^ 1];
+
+	while (1) {
+		/*
+		 * Indicate to the HV that we are idle. Now would be
+		 * a good time to find other work to dispatch.
+		 */
+		lpaca->lppaca.idle = 1;
+
+		oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
+		if (!oldval) {
+			set_thread_flag(TIF_POLLING_NRFLAG);
+			start_snooze = __get_tb() +
+				*smt_snooze_delay * tb_ticks_per_usec;
+			while (!need_resched() && !cpu_is_offline(cpu)) {
+				/*
+				 * Go into low thread priority and possibly
+				 * low power mode.
+				 */
+				HMT_low();
+				HMT_very_low();
+
+				if (*smt_snooze_delay == 0 ||
+				    __get_tb() < start_snooze)
+					continue;
+
+				HMT_medium();
+
+				if (!(ppaca->lppaca.idle)) {
+					local_irq_disable();
+
+					/*
+					 * We are about to sleep the thread
+					 * and so wont be polling any
+					 * more.
+					 */
+					clear_thread_flag(TIF_POLLING_NRFLAG);
+
+					/*
+					 * SMT dynamic mode. Cede will result
+					 * in this thread going dormant, if the
+					 * partner thread is still doing work.
+					 * Thread wakes up if partner goes idle,
+					 * an interrupt is presented, or a prod
+					 * occurs.  Returning from the cede
+					 * enables external interrupts.
+					 */
+					if (!need_resched())
+						cede_processor();
+					else
+						local_irq_enable();
+				} else {
+					/*
+					 * Give the HV an opportunity at the
+					 * processor, since we are not doing
+					 * any work.
+					 */
+					poll_pending();
+				}
+			}
+
+			clear_thread_flag(TIF_POLLING_NRFLAG);
+		} else {
+			set_need_resched();
+		}
+
+		HMT_medium();
+		lpaca->lppaca.idle = 0;
+		schedule();
+		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+			cpu_die();
+	}
+	return 0;
+}
+
+static int shared_idle(void)
+{
+	struct paca_struct *lpaca = get_paca();
+	unsigned int cpu = smp_processor_id();
+
+	while (1) {
+		/*
+		 * Indicate to the HV that we are idle. Now would be
+		 * a good time to find other work to dispatch.
+		 */
+		lpaca->lppaca.idle = 1;
+
+		while (!need_resched() && !cpu_is_offline(cpu)) {
+			local_irq_disable();
+
+			/*
+			 * Yield the processor to the hypervisor.  We return if
+			 * an external interrupt occurs (which are driven prior
+			 * to returning here) or if a prod occurs from another 
+			 * processor. When returning here, external interrupts
+			 * are enabled.
+			 *
+			 * Check need_resched() again with interrupts disabled
+			 * to avoid a race.
+			 */
+			if (!need_resched())
+				cede_processor();
+			else
+				local_irq_enable();
+		}
+
+		HMT_medium();
+		lpaca->lppaca.idle = 0;
+		schedule();
+		if (cpu_is_offline(smp_processor_id()) &&
+		    system_state == SYSTEM_RUNNING)
+			cpu_die();
+	}
+
+	return 0;
+}
+
+#endif /* CONFIG_PPC_PSERIES */
+
+static int native_idle(void)
+{
+	while(1) {
+		/* check CPU type here */
+		if (!need_resched())
+			power4_idle();
+		if (need_resched())
+			schedule();
+
+		if (cpu_is_offline(_smp_processor_id()) &&
+		    system_state == SYSTEM_RUNNING)
+			cpu_die();
+	}
+	return 0;
+}
+
+#endif /* CONFIG_PPC_ISERIES */
+
+void cpu_idle(void)
+{
+	idle_loop();
+}
+
+int powersave_nap;
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the sysctl to set/clear powersave_nap.
+ */
+static ctl_table powersave_nap_ctl_table[]={
+	{
+		.ctl_name	= KERN_PPC_POWERSAVE_NAP,
+		.procname	= "powersave-nap",
+		.data		= &powersave_nap,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ 0, },
+};
+static ctl_table powersave_nap_sysctl_root[] = {
+	{ 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, },
+ 	{ 0,},
+};
+
+static int __init
+register_powersave_nap_sysctl(void)
+{
+	register_sysctl_table(powersave_nap_sysctl_root, 0);
+
+	return 0;
+}
+__initcall(register_powersave_nap_sysctl);
+#endif
+
+int idle_setup(void)
+{
+	/*
+	 * Move that junk to each platform specific file, eventually define
+	 * a pSeries_idle for shared processor stuff
+	 */
+#ifdef CONFIG_PPC_ISERIES
+	idle_loop = iSeries_idle;
+	return 1;
+#else
+	idle_loop = default_idle;
+#endif
+#ifdef CONFIG_PPC_PSERIES
+	if (systemcfg->platform & PLATFORM_PSERIES) {
+		if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+			if (get_paca()->lppaca.shared_proc) {
+				printk(KERN_INFO "Using shared processor idle loop\n");
+				idle_loop = shared_idle;
+			} else {
+				printk(KERN_INFO "Using dedicated idle loop\n");
+				idle_loop = dedicated_idle;
+			}
+		} else {
+			printk(KERN_INFO "Using default idle loop\n");
+			idle_loop = default_idle;
+		}
+	}
+#endif /* CONFIG_PPC_PSERIES */
+#ifndef CONFIG_PPC_ISERIES
+	if (systemcfg->platform == PLATFORM_POWERMAC ||
+	    systemcfg->platform == PLATFORM_MAPLE) {
+		printk(KERN_INFO "Using native/NAP idle loop\n");
+		idle_loop = native_idle;
+	}
+#endif /* CONFIG_PPC_ISERIES */
+
+	return 1;
+}
diff --git a/arch/ppc64/kernel/idle_power4.S b/arch/ppc64/kernel/idle_power4.S
new file mode 100644
index 00000000000..97e4a265504
--- /dev/null
+++ b/arch/ppc64/kernel/idle_power4.S
@@ -0,0 +1,79 @@
+/*
+ *  This file contains the power_save function for 6xx & 7xxx CPUs
+ *  rewritten in assembler
+ *
+ *  Warning ! This code assumes that if your machine has a 750fx
+ *  it will have PLL 1 set to low speed mode (used during NAP/DOZE).
+ *  if this is not the case some additional changes will have to
+ *  be done to check a runtime var (a bit like powersave-nap)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+
+#undef DEBUG
+
+	.text
+
+/*
+ * Here is the power_save_6xx function. This could eventually be
+ * split into several functions & changing the function pointer
+ * depending on the various features.
+ */
+_GLOBAL(power4_idle)
+BEGIN_FTR_SECTION
+	blr
+END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
+	/* We must dynamically check for the NAP feature as it
+	 * can be cleared by CPU init after the fixups are done
+	 */
+	LOADBASE(r3,cur_cpu_spec)
+	ld	r4,cur_cpu_spec@l(r3)
+	ld	r4,CPU_SPEC_FEATURES(r4)
+	andi.	r0,r4,CPU_FTR_CAN_NAP
+	beqlr
+	/* Now check if user or arch enabled NAP mode */
+	LOADBASE(r3,powersave_nap)
+	lwz	r4,powersave_nap@l(r3)
+	cmpwi	0,r4,0
+	beqlr
+
+	/* Clear MSR:EE */
+	mfmsr	r7
+	li	r4,0
+	ori	r4,r4,MSR_EE
+	andc	r0,r7,r4
+	mtmsrd	r0
+
+	/* Check current_thread_info()->flags */
+	clrrdi	r4,r1,THREAD_SHIFT
+	ld	r4,TI_FLAGS(r4)
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq	1f
+	mtmsrd	r7	/* out of line this ? */
+	blr
+1:	
+	/* Go to NAP now */	
+BEGIN_FTR_SECTION
+	DSSALL
+	sync
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	oris	r7,r7,MSR_POW@h
+	sync
+	isync
+	mtmsrd	r7
+	isync
+	sync
+	blr
+	
diff --git a/arch/ppc64/kernel/init_task.c b/arch/ppc64/kernel/init_task.c
new file mode 100644
index 00000000000..941043ae040
--- /dev/null
+++ b/arch/ppc64/kernel/init_task.c
@@ -0,0 +1,36 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+#include <linux/mqueue.h>
+#include <asm/uaccess.h>
+
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+struct mm_struct init_mm = INIT_MM(init_mm);
+
+EXPORT_SYMBOL(init_mm);
+
+/*
+ * Initial thread structure.
+ *
+ * We need to make sure that this is 16384-byte aligned due to the
+ * way process stacks are handled. This is done by having a special
+ * "init_task" linker map entry..
+ */
+union thread_union init_thread_union 
+	__attribute__((__section__(".data.init_task"))) =
+		{ INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+
+EXPORT_SYMBOL(init_task);
diff --git a/arch/ppc64/kernel/ioctl32.c b/arch/ppc64/kernel/ioctl32.c
new file mode 100644
index 00000000000..a8005db23ec
--- /dev/null
+++ b/arch/ppc64/kernel/ioctl32.c
@@ -0,0 +1,51 @@
+/* 
+ * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
+ * 
+ * Based on sparc64 ioctl32.c by:
+ *
+ * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
+ * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
+ *
+ * ppc64 changes:
+ *
+ * Copyright (C) 2000  Ken Aaker (kdaaker@rchland.vnet.ibm.com)
+ * Copyright (C) 2001  Anton Blanchard (antonb@au.ibm.com)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * ioctls.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define INCLUDES
+#include "compat_ioctl.c"
+#include <linux/syscalls.h>
+
+#define CODE
+#include "compat_ioctl.c"
+
+#define HANDLE_IOCTL(cmd,handler) { cmd, (ioctl_trans_handler_t)handler, NULL },
+#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
+
+#define IOCTL_TABLE_START \
+	struct ioctl_trans ioctl_start[] = {
+#define IOCTL_TABLE_END \
+	};
+
+IOCTL_TABLE_START
+#include <linux/compat_ioctl.h>
+#define DECLARES
+#include "compat_ioctl.c"
+COMPATIBLE_IOCTL(TIOCSTART)
+COMPATIBLE_IOCTL(TIOCSTOP)
+COMPATIBLE_IOCTL(TIOCSLTC)
+/* Little p (/dev/rtc, /dev/envctrl, etc.) */
+COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */
+COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
+
+IOCTL_TABLE_END
+
+int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/arch/ppc64/kernel/iomap.c b/arch/ppc64/kernel/iomap.c
new file mode 100644
index 00000000000..153cc8b0f13
--- /dev/null
+++ b/arch/ppc64/kernel/iomap.c
@@ -0,0 +1,126 @@
+/*
+ * arch/ppc64/kernel/iomap.c
+ *
+ * ppc64 "iomap" interface implementation.
+ *
+ * (C) Copyright 2004 Linus Torvalds
+ */
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
+#include <asm/io.h>
+
+/*
+ * Here comes the ppc64 implementation of the IOMAP 
+ * interfaces.
+ */
+unsigned int fastcall ioread8(void __iomem *addr)
+{
+	return readb(addr);
+}
+unsigned int fastcall ioread16(void __iomem *addr)
+{
+	return readw(addr);
+}
+unsigned int fastcall ioread32(void __iomem *addr)
+{
+	return readl(addr);
+}
+EXPORT_SYMBOL(ioread8);
+EXPORT_SYMBOL(ioread16);
+EXPORT_SYMBOL(ioread32);
+
+void fastcall iowrite8(u8 val, void __iomem *addr)
+{
+	writeb(val, addr);
+}
+void fastcall iowrite16(u16 val, void __iomem *addr)
+{
+	writew(val, addr);
+}
+void fastcall iowrite32(u32 val, void __iomem *addr)
+{
+	writel(val, addr);
+}
+EXPORT_SYMBOL(iowrite8);
+EXPORT_SYMBOL(iowrite16);
+EXPORT_SYMBOL(iowrite32);
+
+/*
+ * These are the "repeat read/write" functions. Note the
+ * non-CPU byte order. We do things in "IO byteorder"
+ * here.
+ *
+ * FIXME! We could make these do EEH handling if we really
+ * wanted. Not clear if we do.
+ */
+void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	_insb((u8 __force *) addr, dst, count);
+}
+void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	_insw_ns((u16 __force *) addr, dst, count);
+}
+void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	_insl_ns((u32 __force *) addr, dst, count);
+}
+EXPORT_SYMBOL(ioread8_rep);
+EXPORT_SYMBOL(ioread16_rep);
+EXPORT_SYMBOL(ioread32_rep);
+
+void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	_outsb((u8 __force *) addr, src, count);
+}
+void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	_outsw_ns((u16 __force *) addr, src, count);
+}
+void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	_outsl_ns((u32 __force *) addr, src, count);
+}
+EXPORT_SYMBOL(iowrite8_rep);
+EXPORT_SYMBOL(iowrite16_rep);
+EXPORT_SYMBOL(iowrite32_rep);
+
+void __iomem *ioport_map(unsigned long port, unsigned int len)
+{
+	if (!_IO_IS_VALID(port))
+		return NULL;
+	return (void __iomem *) (port+pci_io_base);
+}
+
+void ioport_unmap(void __iomem *addr)
+{
+	/* Nothing to do */
+}
+EXPORT_SYMBOL(ioport_map);
+EXPORT_SYMBOL(ioport_unmap);
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
+{
+	unsigned long start = pci_resource_start(dev, bar);
+	unsigned long len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (!len)
+		return NULL;
+	if (max && len > max)
+		len = max;
+	if (flags & IORESOURCE_IO)
+		return ioport_map(start, len);
+	if (flags & IORESOURCE_MEM)
+		return ioremap(start, len);
+	/* What? */
+	return NULL;
+}
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	/* Nothing to do */
+}
+EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iounmap);
diff --git a/arch/ppc64/kernel/iommu.c b/arch/ppc64/kernel/iommu.c
new file mode 100644
index 00000000000..344164681d2
--- /dev/null
+++ b/arch/ppc64/kernel/iommu.c
@@ -0,0 +1,567 @@
+/*
+ * arch/ppc64/kernel/iommu.c
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ * 
+ * Rewrite, cleanup, new allocation schemes, virtual merging: 
+ * Copyright (C) 2004 Olof Johansson, IBM Corporation
+ *               and  Ben. Herrenschmidt, IBM Corporation
+ *
+ * Dynamic DMA mapping support, bus-independent parts.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+
+#define DBG(...)
+
+#ifdef CONFIG_IOMMU_VMERGE
+static int novmerge = 0;
+#else
+static int novmerge = 1;
+#endif
+
+static int __init setup_iommu(char *str)
+{
+	if (!strcmp(str, "novmerge"))
+		novmerge = 1;
+	else if (!strcmp(str, "vmerge"))
+		novmerge = 0;
+	return 1;
+}
+
+__setup("iommu=", setup_iommu);
+
+static unsigned long iommu_range_alloc(struct iommu_table *tbl,
+                                       unsigned long npages,
+                                       unsigned long *handle,
+                                       unsigned int align_order)
+{ 
+	unsigned long n, end, i, start;
+	unsigned long limit;
+	int largealloc = npages > 15;
+	int pass = 0;
+	unsigned long align_mask;
+
+	align_mask = 0xffffffffffffffffl >> (64 - align_order);
+
+	/* This allocator was derived from x86_64's bit string search */
+
+	/* Sanity check */
+	if (unlikely(npages) == 0) {
+		if (printk_ratelimit())
+			WARN_ON(1);
+		return DMA_ERROR_CODE;
+	}
+
+	if (handle && *handle)
+		start = *handle;
+	else
+		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+
+	/* Use only half of the table for small allocs (15 pages or less) */
+	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+
+	if (largealloc && start < tbl->it_halfpoint)
+		start = tbl->it_halfpoint;
+
+	/* The case below can happen if we have a small segment appended
+	 * to a large, or when the previous alloc was at the very end of
+	 * the available space. If so, go back to the initial start.
+	 */
+	if (start >= limit)
+		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+	
+ again:
+
+	n = find_next_zero_bit(tbl->it_map, limit, start);
+
+	/* Align allocation */
+	n = (n + align_mask) & ~align_mask;
+
+	end = n + npages;
+
+	if (unlikely(end >= limit)) {
+		if (likely(pass < 2)) {
+			/* First failure, just rescan the half of the table.
+			 * Second failure, rescan the other half of the table.
+			 */
+			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
+			limit = pass ? tbl->it_size : limit;
+			pass++;
+			goto again;
+		} else {
+			/* Third failure, give up */
+			return DMA_ERROR_CODE;
+		}
+	}
+
+	for (i = n; i < end; i++)
+		if (test_bit(i, tbl->it_map)) {
+			start = i+1;
+			goto again;
+		}
+
+	for (i = n; i < end; i++)
+		__set_bit(i, tbl->it_map);
+
+	/* Bump the hint to a new block for small allocs. */
+	if (largealloc) {
+		/* Don't bump to new block to avoid fragmentation */
+		tbl->it_largehint = end;
+	} else {
+		/* Overflow will be taken care of at the next allocation */
+		tbl->it_hint = (end + tbl->it_blocksize - 1) &
+		                ~(tbl->it_blocksize - 1);
+	}
+
+	/* Update handle for SG allocations */
+	if (handle)
+		*handle = end;
+
+	return n;
+}
+
+static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
+		       unsigned int npages, enum dma_data_direction direction,
+		       unsigned int align_order)
+{
+	unsigned long entry, flags;
+	dma_addr_t ret = DMA_ERROR_CODE;
+	
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	entry = iommu_range_alloc(tbl, npages, NULL, align_order);
+
+	if (unlikely(entry == DMA_ERROR_CODE)) {
+		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+		return DMA_ERROR_CODE;
+	}
+
+	entry += tbl->it_offset;	/* Offset into real TCE table */
+	ret = entry << PAGE_SHIFT;	/* Set the return dma address */
+
+	/* Put the TCEs in the HW table */
+	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK,
+			 direction);
+
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return ret;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
+			 unsigned int npages)
+{
+	unsigned long entry, free_entry;
+	unsigned long i;
+
+	entry = dma_addr >> PAGE_SHIFT;
+	free_entry = entry - tbl->it_offset;
+
+	if (((free_entry + npages) > tbl->it_size) ||
+	    (entry < tbl->it_offset)) {
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "iommu_free: invalid entry\n");
+			printk(KERN_INFO "\tentry     = 0x%lx\n", entry); 
+			printk(KERN_INFO "\tdma_addr  = 0x%lx\n", (u64)dma_addr);
+			printk(KERN_INFO "\tTable     = 0x%lx\n", (u64)tbl);
+			printk(KERN_INFO "\tbus#      = 0x%lx\n", (u64)tbl->it_busno);
+			printk(KERN_INFO "\tsize      = 0x%lx\n", (u64)tbl->it_size);
+			printk(KERN_INFO "\tstartOff  = 0x%lx\n", (u64)tbl->it_offset);
+			printk(KERN_INFO "\tindex     = 0x%lx\n", (u64)tbl->it_index);
+			WARN_ON(1);
+		}
+		return;
+	}
+
+	ppc_md.tce_free(tbl, entry, npages);
+	
+	for (i = 0; i < npages; i++)
+		__clear_bit(free_entry+i, tbl->it_map);
+}
+
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+		unsigned int npages)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	__iommu_free(tbl, dma_addr, npages);
+
+	/* Make sure TLB cache is flushed if the HW needs it. We do
+	 * not do an mb() here on purpose, it is not needed on any of
+	 * the current platforms.
+	 */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+}
+
+int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
+		struct scatterlist *sglist, int nelems,
+		enum dma_data_direction direction)
+{
+	dma_addr_t dma_next = 0, dma_addr;
+	unsigned long flags;
+	struct scatterlist *s, *outs, *segstart;
+	int outcount;
+	unsigned long handle;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if ((nelems == 0) || !tbl)
+		return 0;
+
+	outs = s = segstart = &sglist[0];
+	outcount = 1;
+	handle = 0;
+
+	/* Init first segment length for backout at failure */
+	outs->dma_length = 0;
+
+	DBG("mapping %d elements:\n", nelems);
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	for (s = outs; nelems; nelems--, s++) {
+		unsigned long vaddr, npages, entry, slen;
+
+		slen = s->length;
+		/* Sanity check */
+		if (slen == 0) {
+			dma_next = 0;
+			continue;
+		}
+		/* Allocate iommu entries for that segment */
+		vaddr = (unsigned long)page_address(s->page) + s->offset;
+		npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK);
+		npages >>= PAGE_SHIFT;
+		entry = iommu_range_alloc(tbl, npages, &handle, 0);
+
+		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
+
+		/* Handle failure */
+		if (unlikely(entry == DMA_ERROR_CODE)) {
+			if (printk_ratelimit())
+				printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx"
+				       " npages %lx\n", tbl, vaddr, npages);
+			goto failure;
+		}
+
+		/* Convert entry to a dma_addr_t */
+		entry += tbl->it_offset;
+		dma_addr = entry << PAGE_SHIFT;
+		dma_addr |= s->offset;
+
+		DBG("  - %lx pages, entry: %lx, dma_addr: %lx\n",
+			    npages, entry, dma_addr);
+
+		/* Insert into HW table */
+		ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction);
+
+		/* If we are in an open segment, try merging */
+		if (segstart != s) {
+			DBG("  - trying merge...\n");
+			/* We cannot merge if:
+			 * - allocated dma_addr isn't contiguous to previous allocation
+			 */
+			if (novmerge || (dma_addr != dma_next)) {
+				/* Can't merge: create a new segment */
+				segstart = s;
+				outcount++; outs++;
+				DBG("    can't merge, new segment.\n");
+			} else {
+				outs->dma_length += s->length;
+				DBG("    merged, new len: %lx\n", outs->dma_length);
+			}
+		}
+
+		if (segstart == s) {
+			/* This is a new segment, fill entries */
+			DBG("  - filling new segment.\n");
+			outs->dma_address = dma_addr;
+			outs->dma_length = slen;
+		}
+
+		/* Calculate next page pointer for contiguous check */
+		dma_next = dma_addr + slen;
+
+		DBG("  - dma next is: %lx\n", dma_next);
+	}
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	DBG("mapped %d elements:\n", outcount);
+
+	/* For the sake of iommu_free_sg, we clear out the length in the
+	 * next entry of the sglist if we didn't fill the list completely
+	 */
+	if (outcount < nelems) {
+		outs++;
+		outs->dma_address = DMA_ERROR_CODE;
+		outs->dma_length = 0;
+	}
+	return outcount;
+
+ failure:
+	for (s = &sglist[0]; s <= outs; s++) {
+		if (s->dma_length != 0) {
+			unsigned long vaddr, npages;
+
+			vaddr = s->dma_address & PAGE_MASK;
+			npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr)
+				>> PAGE_SHIFT;
+			__iommu_free(tbl, vaddr, npages);
+		}
+	}
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	return 0;
+}
+
+
+void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	unsigned long flags;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if (!tbl)
+		return;
+
+	spin_lock_irqsave(&(tbl->it_lock), flags);
+
+	while (nelems--) {
+		unsigned int npages;
+		dma_addr_t dma_handle = sglist->dma_address;
+
+		if (sglist->dma_length == 0)
+			break;
+		npages = (PAGE_ALIGN(dma_handle + sglist->dma_length)
+			  - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT;
+		__iommu_free(tbl, dma_handle, npages);
+		sglist++;
+	}
+
+	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
+	 * do not do an mb() here, the affected platforms do not need it
+	 * when freeing.
+	 */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+}
+
+/*
+ * Build a iommu_table structure.  This contains a bit map which
+ * is used to manage allocation of the tce space.
+ */
+struct iommu_table *iommu_init_table(struct iommu_table *tbl)
+{
+	unsigned long sz;
+	static int welcomed = 0;
+
+	/* Set aside 1/4 of the table for large allocations. */
+	tbl->it_halfpoint = tbl->it_size * 3 / 4;
+
+	/* number of bytes needed for the bitmap */
+	sz = (tbl->it_size + 7) >> 3;
+
+	tbl->it_map = (unsigned long *)__get_free_pages(GFP_ATOMIC, get_order(sz));
+	if (!tbl->it_map)
+		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
+
+	memset(tbl->it_map, 0, sz);
+
+	tbl->it_hint = 0;
+	tbl->it_largehint = tbl->it_halfpoint;
+	spin_lock_init(&tbl->it_lock);
+
+	if (!welcomed) {
+		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
+		       novmerge ? "disabled" : "enabled");
+		welcomed = 1;
+	}
+
+	return tbl;
+}
+
+void iommu_free_table(struct device_node *dn)
+{
+	struct iommu_table *tbl = dn->iommu_table;
+	unsigned long bitmap_sz, i;
+	unsigned int order;
+
+	if (!tbl || !tbl->it_map) {
+		printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__,
+				dn->full_name);
+		return;
+	}
+
+	/* verify that table contains no entries */
+	/* it_size is in entries, and we're examining 64 at a time */
+	for (i = 0; i < (tbl->it_size/64); i++) {
+		if (tbl->it_map[i] != 0) {
+			printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
+				__FUNCTION__, dn->full_name);
+			break;
+		}
+	}
+
+	/* calculate bitmap size in bytes */
+	bitmap_sz = (tbl->it_size + 7) / 8;
+
+	/* free bitmap */
+	order = get_order(bitmap_sz);
+	free_pages((unsigned long) tbl->it_map, order);
+
+	/* free table */
+	kfree(tbl);
+}
+
+/* Creates TCEs for a user provided buffer.  The user buffer must be
+ * contiguous real kernel storage (not vmalloc).  The address of the buffer
+ * passed here is the kernel (virtual) address of the buffer.  The buffer
+ * need not be page aligned, the dma_addr_t returned will point to the same
+ * byte within the page as vaddr.
+ */
+dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
+		size_t size, enum dma_data_direction direction)
+{
+	dma_addr_t dma_handle = DMA_ERROR_CODE;
+	unsigned long uaddr;
+	unsigned int npages;
+
+	BUG_ON(direction == DMA_NONE);
+
+	uaddr = (unsigned long)vaddr;
+	npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
+	npages >>= PAGE_SHIFT;
+
+	if (tbl) {
+		dma_handle = iommu_alloc(tbl, vaddr, npages, direction, 0);
+		if (dma_handle == DMA_ERROR_CODE) {
+			if (printk_ratelimit())  {
+				printk(KERN_INFO "iommu_alloc failed, "
+						"tbl %p vaddr %p npages %d\n",
+						tbl, vaddr, npages);
+			}
+		} else
+			dma_handle |= (uaddr & ~PAGE_MASK);
+	}
+
+	return dma_handle;
+}
+
+void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction direction)
+{
+	BUG_ON(direction == DMA_NONE);
+
+	if (tbl)
+		iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
+					(dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/* Allocates a contiguous real buffer and creates mappings over it.
+ * Returns the virtual address of the buffer and sets dma_handle
+ * to the dma address (mapping) of the first page.
+ */
+void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
+		dma_addr_t *dma_handle, unsigned int __nocast flag)
+{
+	void *ret = NULL;
+	dma_addr_t mapping;
+	unsigned int npages, order;
+
+	size = PAGE_ALIGN(size);
+	npages = size >> PAGE_SHIFT;
+	order = get_order(size);
+
+ 	/*
+	 * Client asked for way too much space.  This is checked later
+	 * anyway.  It is easier to debug here for the drivers than in
+	 * the tce tables.
+	 */
+	if (order >= IOMAP_MAX_ORDER) {
+		printk("iommu_alloc_consistent size too large: 0x%lx\n", size);
+		return NULL;
+	}
+
+	if (!tbl)
+		return NULL;
+
+	/* Alloc enough pages (and possibly more) */
+	ret = (void *)__get_free_pages(flag, order);
+	if (!ret)
+		return NULL;
+	memset(ret, 0, size);
+
+	/* Set up tces to cover the allocated range */
+	mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL, order);
+	if (mapping == DMA_ERROR_CODE) {
+		free_pages((unsigned long)ret, order);
+		ret = NULL;
+	} else
+		*dma_handle = mapping;
+	return ret;
+}
+
+void iommu_free_coherent(struct iommu_table *tbl, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	unsigned int npages;
+
+	if (tbl) {
+		size = PAGE_ALIGN(size);
+		npages = size >> PAGE_SHIFT;
+		iommu_free(tbl, dma_handle, npages);
+		free_pages((unsigned long)vaddr, get_order(size));
+	}
+}
diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c
new file mode 100644
index 00000000000..4fd7f203c1e
--- /dev/null
+++ b/arch/ppc64/kernel/irq.c
@@ -0,0 +1,519 @@
+/*
+ *  arch/ppc/kernel/irq.c
+ *
+ *  Derived from arch/i386/kernel/irq.c
+ *    Copyright (C) 1992 Linus Torvalds
+ *  Adapted from arch/i386 by Gary Thomas
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Updated and modified by Cort Dougan (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Cort Dougan
+ *  Adapted for Power Macintosh by Paul Mackerras
+ *    Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au)
+ *  Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This file contains the code used by various IRQ handling routines:
+ * asking for different IRQ's should be done through these routines
+ * instead of just grabbing them. Thus setups with different IRQ numbers
+ * shouldn't result in any weird surprises, and installing new handlers
+ * should be easier.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/threads.h>
+#include <linux/kernel_stat.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/random.h>
+#include <linux/kallsyms.h>
+#include <linux/profile.h>
+#include <linux/bitops.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/cache.h>
+#include <asm/prom.h>
+#include <asm/ptrace.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/machdep.h>
+#include <asm/paca.h>
+
+#ifdef CONFIG_SMP
+extern void iSeries_smp_message_recv( struct pt_regs * );
+#endif
+
+extern irq_desc_t irq_desc[NR_IRQS];
+EXPORT_SYMBOL(irq_desc);
+
+int distribute_irqs = 1;
+int __irq_offset_value;
+int ppc_spurious_interrupts;
+unsigned long lpevent_count;
+u64 ppc64_interrupt_controller;
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+	int i = *(loff_t *) v, j;
+	struct irqaction * action;
+	irq_desc_t *desc;
+	unsigned long flags;
+
+	if (i == 0) {
+		seq_printf(p, "           ");
+		for (j=0; j<NR_CPUS; j++) {
+			if (cpu_online(j))
+				seq_printf(p, "CPU%d       ",j);
+		}
+		seq_putc(p, '\n');
+	}
+
+	if (i < NR_IRQS) {
+		desc = get_irq_desc(i);
+		spin_lock_irqsave(&desc->lock, flags);
+		action = desc->action;
+		if (!action || !action->handler)
+			goto skip;
+		seq_printf(p, "%3d: ", i);
+#ifdef CONFIG_SMP
+		for (j = 0; j < NR_CPUS; j++) {
+			if (cpu_online(j))
+				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		}
+#else
+		seq_printf(p, "%10u ", kstat_irqs(i));
+#endif /* CONFIG_SMP */
+		if (desc->handler)
+			seq_printf(p, " %s ", desc->handler->typename );
+		else
+			seq_printf(p, "  None      ");
+		seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? "Level " : "Edge  ");
+		seq_printf(p, "    %s",action->name);
+		for (action=action->next; action; action = action->next)
+			seq_printf(p, ", %s", action->name);
+		seq_putc(p, '\n');
+skip:
+		spin_unlock_irqrestore(&desc->lock, flags);
+	} else if (i == NR_IRQS)
+		seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)
+{
+	unsigned int irq;
+	static int warned;
+
+	for_each_irq(irq) {
+		cpumask_t mask;
+
+		if (irq_desc[irq].status & IRQ_PER_CPU)
+			continue;
+
+		cpus_and(mask, irq_affinity[irq], map);
+		if (any_online_cpu(mask) == NR_CPUS) {
+			printk("Breaking affinity for irq %i\n", irq);
+			mask = map;
+		}
+		if (irq_desc[irq].handler->set_affinity)
+			irq_desc[irq].handler->set_affinity(irq, mask);
+		else if (irq_desc[irq].action && !(warned++))
+			printk("Cannot set affinity for irq %i\n", irq);
+	}
+
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+}
+#endif
+
+extern int noirqdebug;
+
+/*
+ * Eventually, this should take an array of interrupts and an array size
+ * so it can dispatch multiple interrupts.
+ */
+void ppc_irq_dispatch_handler(struct pt_regs *regs, int irq)
+{
+	int status;
+	struct irqaction *action;
+	int cpu = smp_processor_id();
+	irq_desc_t *desc = get_irq_desc(irq);
+	irqreturn_t action_ret;
+#ifdef CONFIG_IRQSTACKS
+	struct thread_info *curtp, *irqtp;
+#endif
+
+	kstat_cpu(cpu).irqs[irq]++;
+
+	if (desc->status & IRQ_PER_CPU) {
+		/* no locking required for CPU-local interrupts: */
+		ack_irq(irq);
+		action_ret = handle_IRQ_event(irq, regs, desc->action);
+		desc->handler->end(irq);
+		return;
+	}
+
+	spin_lock(&desc->lock);
+	ack_irq(irq);	
+	/*
+	   REPLAY is when Linux resends an IRQ that was dropped earlier
+	   WAITING is used by probe to mark irqs that are being tested
+	   */
+	status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
+	status |= IRQ_PENDING; /* we _want_ to handle it */
+
+	/*
+	 * If the IRQ is disabled for whatever reason, we cannot
+	 * use the action we have.
+	 */
+	action = NULL;
+	if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
+		action = desc->action;
+		if (!action || !action->handler) {
+			ppc_spurious_interrupts++;
+			printk(KERN_DEBUG "Unhandled interrupt %x, disabled\n", irq);
+			/* We can't call disable_irq here, it would deadlock */
+			if (!desc->depth)
+				desc->depth = 1;
+			desc->status |= IRQ_DISABLED;
+			/* This is not a real spurrious interrupt, we
+			 * have to eoi it, so we jump to out
+			 */
+			mask_irq(irq);
+			goto out;
+		}
+		status &= ~IRQ_PENDING; /* we commit to handling */
+		status |= IRQ_INPROGRESS; /* we are handling it */
+	}
+	desc->status = status;
+
+	/*
+	 * If there is no IRQ handler or it was disabled, exit early.
+	   Since we set PENDING, if another processor is handling
+	   a different instance of this same irq, the other processor
+	   will take care of it.
+	 */
+	if (unlikely(!action))
+		goto out;
+
+	/*
+	 * Edge triggered interrupts need to remember
+	 * pending events.
+	 * This applies to any hw interrupts that allow a second
+	 * instance of the same irq to arrive while we are in do_IRQ
+	 * or in the handler. But the code here only handles the _second_
+	 * instance of the irq, not the third or fourth. So it is mostly
+	 * useful for irq hardware that does not mask cleanly in an
+	 * SMP environment.
+	 */
+	for (;;) {
+		spin_unlock(&desc->lock);
+
+#ifdef CONFIG_IRQSTACKS
+		/* Switch to the irq stack to handle this */
+		curtp = current_thread_info();
+		irqtp = hardirq_ctx[smp_processor_id()];
+		if (curtp != irqtp) {
+			irqtp->task = curtp->task;
+			irqtp->flags = 0;
+			action_ret = call_handle_IRQ_event(irq, regs, action, irqtp);
+			irqtp->task = NULL;
+			if (irqtp->flags)
+				set_bits(irqtp->flags, &curtp->flags);
+		} else
+#endif
+			action_ret = handle_IRQ_event(irq, regs, action);
+
+		spin_lock(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+		if (likely(!(desc->status & IRQ_PENDING)))
+			break;
+		desc->status &= ~IRQ_PENDING;
+	}
+out:
+	desc->status &= ~IRQ_INPROGRESS;
+	/*
+	 * The ->end() handler has to deal with interrupts which got
+	 * disabled while the handler was running.
+	 */
+	if (desc->handler) {
+		if (desc->handler->end)
+			desc->handler->end(irq);
+		else if (desc->handler->enable)
+			desc->handler->enable(irq);
+	}
+	spin_unlock(&desc->lock);
+}
+
+#ifdef CONFIG_PPC_ISERIES
+void do_IRQ(struct pt_regs *regs)
+{
+	struct paca_struct *lpaca;
+	struct ItLpQueue *lpq;
+
+	irq_enter();
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	/* Debugging check for stack overflow: is there less than 2KB free? */
+	{
+		long sp;
+
+		sp = __get_SP() & (THREAD_SIZE-1);
+
+		if (unlikely(sp < (sizeof(struct thread_info) + 2048))) {
+			printk("do_IRQ: stack overflow: %ld\n",
+				sp - sizeof(struct thread_info));
+			dump_stack();
+		}
+	}
+#endif
+
+	lpaca = get_paca();
+#ifdef CONFIG_SMP
+	if (lpaca->lppaca.int_dword.fields.ipi_cnt) {
+		lpaca->lppaca.int_dword.fields.ipi_cnt = 0;
+		iSeries_smp_message_recv(regs);
+	}
+#endif /* CONFIG_SMP */
+	lpq = lpaca->lpqueue_ptr;
+	if (lpq && ItLpQueue_isLpIntPending(lpq))
+		lpevent_count += ItLpQueue_process(lpq, regs);
+
+	irq_exit();
+
+	if (lpaca->lppaca.int_dword.fields.decr_int) {
+		lpaca->lppaca.int_dword.fields.decr_int = 0;
+		/* Signal a fake decrementer interrupt */
+		timer_interrupt(regs);
+	}
+}
+
+#else	/* CONFIG_PPC_ISERIES */
+
+void do_IRQ(struct pt_regs *regs)
+{
+	int irq;
+
+	irq_enter();
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	/* Debugging check for stack overflow: is there less than 2KB free? */
+	{
+		long sp;
+
+		sp = __get_SP() & (THREAD_SIZE-1);
+
+		if (unlikely(sp < (sizeof(struct thread_info) + 2048))) {
+			printk("do_IRQ: stack overflow: %ld\n",
+				sp - sizeof(struct thread_info));
+			dump_stack();
+		}
+	}
+#endif
+
+	irq = ppc_md.get_irq(regs);
+
+	if (irq >= 0)
+		ppc_irq_dispatch_handler(regs, irq);
+	else
+		/* That's not SMP safe ... but who cares ? */
+		ppc_spurious_interrupts++;
+
+	irq_exit();
+}
+#endif	/* CONFIG_PPC_ISERIES */
+
+void __init init_IRQ(void)
+{
+	static int once = 0;
+
+	if (once)
+		return;
+
+	once++;
+
+	ppc_md.init_IRQ();
+	irq_ctx_init();
+}
+
+#ifndef CONFIG_PPC_ISERIES
+/*
+ * Virtual IRQ mapping code, used on systems with XICS interrupt controllers.
+ */
+
+#define UNDEFINED_IRQ 0xffffffff
+unsigned int virt_irq_to_real_map[NR_IRQS];
+
+/*
+ * Don't use virtual irqs 0, 1, 2 for devices.
+ * The pcnet32 driver considers interrupt numbers < 2 to be invalid,
+ * and 2 is the XICS IPI interrupt.
+ * We limit virtual irqs to 17 less than NR_IRQS so that when we
+ * offset them by 16 (to reserve the first 16 for ISA interrupts)
+ * we don't end up with an interrupt number >= NR_IRQS.
+ */
+#define MIN_VIRT_IRQ	3
+#define MAX_VIRT_IRQ	(NR_IRQS - NUM_ISA_INTERRUPTS - 1)
+#define NR_VIRT_IRQS	(MAX_VIRT_IRQ - MIN_VIRT_IRQ + 1)
+
+void
+virt_irq_init(void)
+{
+	int i;
+	for (i = 0; i < NR_IRQS; i++)
+		virt_irq_to_real_map[i] = UNDEFINED_IRQ;
+}
+
+/* Create a mapping for a real_irq if it doesn't already exist.
+ * Return the virtual irq as a convenience.
+ */
+int virt_irq_create_mapping(unsigned int real_irq)
+{
+	unsigned int virq, first_virq;
+	static int warned;
+
+	if (ppc64_interrupt_controller == IC_OPEN_PIC)
+		return real_irq;	/* no mapping for openpic (for now) */
+
+	/* don't map interrupts < MIN_VIRT_IRQ */
+	if (real_irq < MIN_VIRT_IRQ) {
+		virt_irq_to_real_map[real_irq] = real_irq;
+		return real_irq;
+	}
+
+	/* map to a number between MIN_VIRT_IRQ and MAX_VIRT_IRQ */
+	virq = real_irq;
+	if (virq > MAX_VIRT_IRQ)
+		virq = (virq % NR_VIRT_IRQS) + MIN_VIRT_IRQ;
+
+	/* search for this number or a free slot */
+	first_virq = virq;
+	while (virt_irq_to_real_map[virq] != UNDEFINED_IRQ) {
+		if (virt_irq_to_real_map[virq] == real_irq)
+			return virq;
+		if (++virq > MAX_VIRT_IRQ)
+			virq = MIN_VIRT_IRQ;
+		if (virq == first_virq)
+			goto nospace;	/* oops, no free slots */
+	}
+
+	virt_irq_to_real_map[virq] = real_irq;
+	return virq;
+
+ nospace:
+	if (!warned) {
+		printk(KERN_CRIT "Interrupt table is full\n");
+		printk(KERN_CRIT "Increase NR_IRQS (currently %d) "
+		       "in your kernel sources and rebuild.\n", NR_IRQS);
+		warned = 1;
+	}
+	return NO_IRQ;
+}
+
+/*
+ * In most cases will get a hit on the very first slot checked in the
+ * virt_irq_to_real_map.  Only when there are a large number of
+ * IRQs will this be expensive.
+ */
+unsigned int real_irq_to_virt_slowpath(unsigned int real_irq)
+{
+	unsigned int virq;
+	unsigned int first_virq;
+
+	virq = real_irq;
+
+	if (virq > MAX_VIRT_IRQ)
+		virq = (virq % NR_VIRT_IRQS) + MIN_VIRT_IRQ;
+
+	first_virq = virq;
+
+	do {
+		if (virt_irq_to_real_map[virq] == real_irq)
+			return virq;
+
+		virq++;
+
+		if (virq >= MAX_VIRT_IRQ)
+			virq = 0;
+
+	} while (first_virq != virq);
+
+	return NO_IRQ;
+
+}
+
+#endif /* CONFIG_PPC_ISERIES */
+
+#ifdef CONFIG_IRQSTACKS
+struct thread_info *softirq_ctx[NR_CPUS];
+struct thread_info *hardirq_ctx[NR_CPUS];
+
+void irq_ctx_init(void)
+{
+	struct thread_info *tp;
+	int i;
+
+	for_each_cpu(i) {
+		memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
+		tp = softirq_ctx[i];
+		tp->cpu = i;
+		tp->preempt_count = SOFTIRQ_OFFSET;
+
+		memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
+		tp = hardirq_ctx[i];
+		tp->cpu = i;
+		tp->preempt_count = HARDIRQ_OFFSET;
+	}
+}
+
+void do_softirq(void)
+{
+	unsigned long flags;
+	struct thread_info *curtp, *irqtp;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	if (local_softirq_pending()) {
+		curtp = current_thread_info();
+		irqtp = softirq_ctx[smp_processor_id()];
+		irqtp->task = curtp->task;
+		call_do_softirq(irqtp);
+		irqtp->task = NULL;
+	}
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(do_softirq);
+
+#endif /* CONFIG_IRQSTACKS */
+
+static int __init setup_noirqdistrib(char *str)
+{
+	distribute_irqs = 0;
+	return 1;
+}
+
+__setup("noirqdistrib", setup_noirqdistrib);
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
new file mode 100644
index 00000000000..103daaf7357
--- /dev/null
+++ b/arch/ppc64/kernel/kprobes.c
@@ -0,0 +1,290 @@
+/*
+ *  Kernel Probes (KProbes)
+ *  arch/ppc64/kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *		Probes initial implementation ( includes contributions from
+ *		Rusty Russell).
+ * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *		interface to access function arguments.
+ * 2004-Nov	Ananth N Mavinakayanahalli <ananth@in.ibm.com> kprobes port
+ *		for PPC64
+ */
+
+#include <linux/config.h>
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/spinlock.h>
+#include <linux/preempt.h>
+#include <asm/kdebug.h>
+#include <asm/sstep.h>
+
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE	0x00000001
+#define KPROBE_HIT_SS		0x00000002
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_saved_msr;
+static struct pt_regs jprobe_saved_regs;
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+	kprobe_opcode_t insn = *p->addr;
+
+	if (IS_MTMSRD(insn) || IS_RFID(insn))
+		/* cannot put bp on RFID/MTMSRD */
+		return 1;
+	return 0;
+}
+
+void arch_copy_kprobe(struct kprobe *p)
+{
+	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	*p->addr = p->opcode;
+	regs->nip = (unsigned long)p->addr;
+}
+
+static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	regs->msr |= MSR_SE;
+	/*single step inline if it a breakpoint instruction*/
+	if (p->opcode == BREAKPOINT_INSTRUCTION)
+		regs->nip = (unsigned long)p->addr;
+	else
+		regs->nip = (unsigned long)&p->ainsn.insn;
+}
+
+static inline int kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;
+	unsigned int *addr = (unsigned int *)regs->nip;
+
+	/* Check we're not actually recursing */
+	if (kprobe_running()) {
+		/* We *are* holding lock here, so this is safe.
+		   Disarm the probe we just hit, and ignore it. */
+		p = get_kprobe(addr);
+		if (p) {
+			if (kprobe_status == KPROBE_HIT_SS) {
+				regs->msr &= ~MSR_SE;
+				regs->msr |= kprobe_saved_msr;
+				unlock_kprobes();
+				goto no_kprobe;
+			}
+			disarm_kprobe(p, regs);
+			ret = 1;
+		} else {
+			p = current_kprobe;
+			if (p->break_handler && p->break_handler(p, regs)) {
+				goto ss_probe;
+			}
+		}
+		/* If it's not ours, can't be delete race, (we hold lock). */
+		goto no_kprobe;
+	}
+
+	lock_kprobes();
+	p = get_kprobe(addr);
+	if (!p) {
+		unlock_kprobes();
+		if (*addr != BREAKPOINT_INSTRUCTION) {
+			/*
+			 * PowerPC has multiple variants of the "trap"
+			 * instruction. If the current instruction is a
+			 * trap variant, it could belong to someone else
+			 */
+			kprobe_opcode_t cur_insn = *addr;
+			if (IS_TW(cur_insn) || IS_TD(cur_insn) ||
+					IS_TWI(cur_insn) || IS_TDI(cur_insn))
+		       		goto no_kprobe;
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+		}
+		/* Not one of ours: let kernel handle it */
+		goto no_kprobe;
+	}
+
+	kprobe_status = KPROBE_HIT_ACTIVE;
+	current_kprobe = p;
+	kprobe_saved_msr = regs->msr;
+	if (p->pre_handler && p->pre_handler(p, regs))
+		/* handler has already set things up, so skip ss setup */
+		return 1;
+
+ss_probe:
+	prepare_singlestep(p, regs);
+	kprobe_status = KPROBE_HIT_SS;
+	/*
+	 * This preempt_disable() matches the preempt_enable_no_resched()
+	 * in post_kprobe_handler().
+	 */
+	preempt_disable();
+	return 1;
+
+no_kprobe:
+	return ret;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction whose first byte has been replaced by the "breakpoint"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is p->ainsn.insn.
+ */
+static void resume_execution(struct kprobe *p, struct pt_regs *regs)
+{
+	int ret;
+
+	regs->nip = (unsigned long)p->addr;
+	ret = emulate_step(regs, p->ainsn.insn[0]);
+	if (ret == 0)
+		regs->nip = (unsigned long)p->addr + 4;
+
+	regs->msr &= ~MSR_SE;
+}
+
+static inline int post_kprobe_handler(struct pt_regs *regs)
+{
+	if (!kprobe_running())
+		return 0;
+
+	if (current_kprobe->post_handler)
+		current_kprobe->post_handler(current_kprobe, regs, 0);
+
+	resume_execution(current_kprobe, regs);
+	regs->msr |= kprobe_saved_msr;
+
+	unlock_kprobes();
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, msr
+	 * will have SE set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (regs->msr & MSR_SE)
+		return 0;
+
+	return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	if (current_kprobe->fault_handler
+	    && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+		return 1;
+
+	if (kprobe_status & KPROBE_HIT_SS) {
+		resume_execution(current_kprobe, regs);
+		regs->msr |= kprobe_saved_msr;
+
+		unlock_kprobes();
+		preempt_enable_no_resched();
+	}
+	return 0;
+}
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+			     void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	/*
+	 * Interrupts are not disabled here.  We need to disable
+	 * preemption, because kprobe_running() uses smp_processor_id().
+	 */
+	preempt_disable();
+	switch (val) {
+	case DIE_IABR_MATCH:
+	case DIE_DABR_MATCH:
+	case DIE_BPT:
+		if (kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_SSTEP:
+		if (post_kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_GPF:
+	case DIE_PAGE_FAULT:
+		if (kprobe_running() &&
+		    kprobe_fault_handler(args->regs, args->trapnr))
+			ret = NOTIFY_STOP;
+		break;
+	default:
+		break;
+	}
+	preempt_enable();
+	return ret;
+}
+
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+	memcpy(&jprobe_saved_regs, regs, sizeof(struct pt_regs));
+
+	/* setup return addr to the jprobe handler routine */
+	regs->nip = (unsigned long)(((func_descr_t *)jp->entry)->entry);
+	regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc);
+
+	return 1;
+}
+
+void jprobe_return(void)
+{
+	asm volatile("trap" ::: "memory");
+}
+
+void jprobe_return_end(void)
+{
+};
+
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	/*
+	 * FIXME - we should ideally be validating that we got here 'cos
+	 * of the "trap" in jprobe_return() above, before restoring the
+	 * saved regs...
+	 */
+	memcpy(regs, &jprobe_saved_regs, sizeof(struct pt_regs));
+	return 1;
+}
diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c
new file mode 100644
index 00000000000..d6c6bd03d2a
--- /dev/null
+++ b/arch/ppc64/kernel/lmb.c
@@ -0,0 +1,372 @@
+/*
+ * Procedures for interfacing to Open Firmware.
+ *
+ * Peter Bergner, IBM Corp.	June 2001.
+ * Copyright (C) 2001 Peter Bergner.
+ * 
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <asm/types.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/lmb.h>
+#include <asm/abs_addr.h>
+
+struct lmb lmb;
+
+#undef DEBUG
+
+void lmb_dump_all(void)
+{
+#ifdef DEBUG
+	unsigned long i;
+	struct lmb *_lmb  = &lmb;
+
+	udbg_printf("lmb_dump_all:\n");
+	udbg_printf("    memory.cnt		  = 0x%lx\n",
+		    _lmb->memory.cnt);
+	udbg_printf("    memory.size		  = 0x%lx\n",
+		    _lmb->memory.size);
+	for (i=0; i < _lmb->memory.cnt ;i++) {
+		udbg_printf("    memory.region[0x%x].base       = 0x%lx\n",
+			    i, _lmb->memory.region[i].base);
+		udbg_printf("		      .physbase = 0x%lx\n",
+			    _lmb->memory.region[i].physbase);
+		udbg_printf("		      .size     = 0x%lx\n",
+			    _lmb->memory.region[i].size);
+	}
+
+	udbg_printf("\n    reserved.cnt	  = 0x%lx\n",
+		    _lmb->reserved.cnt);
+	udbg_printf("    reserved.size	  = 0x%lx\n",
+		    _lmb->reserved.size);
+	for (i=0; i < _lmb->reserved.cnt ;i++) {
+		udbg_printf("    reserved.region[0x%x].base       = 0x%lx\n",
+			    i, _lmb->reserved.region[i].base);
+		udbg_printf("		      .physbase = 0x%lx\n",
+			    _lmb->reserved.region[i].physbase);
+		udbg_printf("		      .size     = 0x%lx\n",
+			    _lmb->reserved.region[i].size);
+	}
+#endif /* DEBUG */
+}
+
+static unsigned long __init
+lmb_addrs_overlap(unsigned long base1, unsigned long size1,
+                  unsigned long base2, unsigned long size2)
+{
+	return ((base1 < (base2+size2)) && (base2 < (base1+size1)));
+}
+
+static long __init
+lmb_addrs_adjacent(unsigned long base1, unsigned long size1,
+		   unsigned long base2, unsigned long size2)
+{
+	if (base2 == base1 + size1)
+		return 1;
+	else if (base1 == base2 + size2)
+		return -1;
+
+	return 0;
+}
+
+static long __init
+lmb_regions_adjacent(struct lmb_region *rgn, unsigned long r1, unsigned long r2)
+{
+	unsigned long base1 = rgn->region[r1].base;
+	unsigned long size1 = rgn->region[r1].size;
+	unsigned long base2 = rgn->region[r2].base;
+	unsigned long size2 = rgn->region[r2].size;
+
+	return lmb_addrs_adjacent(base1, size1, base2, size2);
+}
+
+/* Assumption: base addr of region 1 < base addr of region 2 */
+static void __init
+lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2)
+{
+	unsigned long i;
+
+	rgn->region[r1].size += rgn->region[r2].size;
+	for (i=r2; i < rgn->cnt-1; i++) {
+		rgn->region[i].base = rgn->region[i+1].base;
+		rgn->region[i].physbase = rgn->region[i+1].physbase;
+		rgn->region[i].size = rgn->region[i+1].size;
+	}
+	rgn->cnt--;
+}
+
+/* This routine called with relocation disabled. */
+void __init
+lmb_init(void)
+{
+	struct lmb *_lmb = &lmb;
+
+	/* Create a dummy zero size LMB which will get coalesced away later.
+	 * This simplifies the lmb_add() code below...
+	 */
+	_lmb->memory.region[0].base = 0;
+	_lmb->memory.region[0].size = 0;
+	_lmb->memory.cnt = 1;
+
+	/* Ditto. */
+	_lmb->reserved.region[0].base = 0;
+	_lmb->reserved.region[0].size = 0;
+	_lmb->reserved.cnt = 1;
+}
+
+/* This routine called with relocation disabled. */
+void __init
+lmb_analyze(void)
+{
+	unsigned long i;
+	unsigned long mem_size = 0;
+	unsigned long size_mask = 0;
+	struct lmb *_lmb = &lmb;
+#ifdef CONFIG_MSCHUNKS
+	unsigned long physbase = 0;
+#endif
+
+	for (i=0; i < _lmb->memory.cnt; i++) {
+		unsigned long lmb_size;
+
+		lmb_size = _lmb->memory.region[i].size;
+
+#ifdef CONFIG_MSCHUNKS
+		_lmb->memory.region[i].physbase = physbase;
+		physbase += lmb_size;
+#else
+		_lmb->memory.region[i].physbase = _lmb->memory.region[i].base;
+#endif
+		mem_size += lmb_size;
+		size_mask |= lmb_size;
+	}
+
+	_lmb->memory.size = mem_size;
+}
+
+/* This routine called with relocation disabled. */
+static long __init
+lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size)
+{
+	unsigned long i, coalesced = 0;
+	long adjacent;
+
+	/* First try and coalesce this LMB with another. */
+	for (i=0; i < rgn->cnt; i++) {
+		unsigned long rgnbase = rgn->region[i].base;
+		unsigned long rgnsize = rgn->region[i].size;
+
+		adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize);
+		if ( adjacent > 0 ) {
+			rgn->region[i].base -= size;
+			rgn->region[i].physbase -= size;
+			rgn->region[i].size += size;
+			coalesced++;
+			break;
+		}
+		else if ( adjacent < 0 ) {
+			rgn->region[i].size += size;
+			coalesced++;
+			break;
+		}
+	}
+
+	if ((i < rgn->cnt-1) && lmb_regions_adjacent(rgn, i, i+1) ) {
+		lmb_coalesce_regions(rgn, i, i+1);
+		coalesced++;
+	}
+
+	if ( coalesced ) {
+		return coalesced;
+	} else if ( rgn->cnt >= MAX_LMB_REGIONS ) {
+		return -1;
+	}
+
+	/* Couldn't coalesce the LMB, so add it to the sorted table. */
+	for (i=rgn->cnt-1; i >= 0; i--) {
+		if (base < rgn->region[i].base) {
+			rgn->region[i+1].base = rgn->region[i].base;
+			rgn->region[i+1].physbase = rgn->region[i].physbase;
+			rgn->region[i+1].size = rgn->region[i].size;
+		}  else {
+			rgn->region[i+1].base = base;
+			rgn->region[i+1].physbase = lmb_abs_to_phys(base);
+			rgn->region[i+1].size = size;
+			break;
+		}
+	}
+	rgn->cnt++;
+
+	return 0;
+}
+
+/* This routine called with relocation disabled. */
+long __init
+lmb_add(unsigned long base, unsigned long size)
+{
+	struct lmb *_lmb = &lmb;
+	struct lmb_region *_rgn = &(_lmb->memory);
+
+	/* On pSeries LPAR systems, the first LMB is our RMO region. */
+	if ( base == 0 )
+		_lmb->rmo_size = size;
+
+	return lmb_add_region(_rgn, base, size);
+
+}
+
+long __init
+lmb_reserve(unsigned long base, unsigned long size)
+{
+	struct lmb *_lmb = &lmb;
+	struct lmb_region *_rgn = &(_lmb->reserved);
+
+	return lmb_add_region(_rgn, base, size);
+}
+
+long __init
+lmb_overlaps_region(struct lmb_region *rgn, unsigned long base, unsigned long size)
+{
+	unsigned long i;
+
+	for (i=0; i < rgn->cnt; i++) {
+		unsigned long rgnbase = rgn->region[i].base;
+		unsigned long rgnsize = rgn->region[i].size;
+		if ( lmb_addrs_overlap(base,size,rgnbase,rgnsize) ) {
+			break;
+		}
+	}
+
+	return (i < rgn->cnt) ? i : -1;
+}
+
+unsigned long __init
+lmb_alloc(unsigned long size, unsigned long align)
+{
+	return lmb_alloc_base(size, align, LMB_ALLOC_ANYWHERE);
+}
+
+unsigned long __init
+lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr)
+{
+	long i, j;
+	unsigned long base = 0;
+	struct lmb *_lmb = &lmb;
+	struct lmb_region *_mem = &(_lmb->memory);
+	struct lmb_region *_rsv = &(_lmb->reserved);
+
+	for (i=_mem->cnt-1; i >= 0; i--) {
+		unsigned long lmbbase = _mem->region[i].base;
+		unsigned long lmbsize = _mem->region[i].size;
+
+		if ( max_addr == LMB_ALLOC_ANYWHERE )
+			base = _ALIGN_DOWN(lmbbase+lmbsize-size, align);
+		else if ( lmbbase < max_addr )
+			base = _ALIGN_DOWN(min(lmbbase+lmbsize,max_addr)-size, align);
+		else
+			continue;
+
+		while ( (lmbbase <= base) &&
+			((j = lmb_overlaps_region(_rsv,base,size)) >= 0) ) {
+			base = _ALIGN_DOWN(_rsv->region[j].base-size, align);
+		}
+
+		if ( (base != 0) && (lmbbase <= base) )
+			break;
+	}
+
+	if ( i < 0 )
+		return 0;
+
+	lmb_add_region(_rsv, base, size);
+
+	return base;
+}
+
+unsigned long __init
+lmb_phys_mem_size(void)
+{
+	struct lmb *_lmb = &lmb;
+#ifdef CONFIG_MSCHUNKS
+	return _lmb->memory.size;
+#else
+	struct lmb_region *_mem = &(_lmb->memory);
+	unsigned long total = 0;
+	int i;
+
+	/* add all physical memory to the bootmem map */
+	for (i=0; i < _mem->cnt; i++)
+		total += _mem->region[i].size;
+	return total;
+#endif /* CONFIG_MSCHUNKS */
+}
+
+unsigned long __init
+lmb_end_of_DRAM(void)
+{
+	struct lmb *_lmb = &lmb;
+	struct lmb_region *_mem = &(_lmb->memory);
+	int idx = _mem->cnt - 1;
+
+#ifdef CONFIG_MSCHUNKS
+	return (_mem->region[idx].physbase + _mem->region[idx].size);
+#else
+	return (_mem->region[idx].base + _mem->region[idx].size);
+#endif /* CONFIG_MSCHUNKS */
+
+	return 0;
+}
+
+unsigned long __init
+lmb_abs_to_phys(unsigned long aa)
+{
+	unsigned long i, pa = aa;
+	struct lmb *_lmb = &lmb;
+	struct lmb_region *_mem = &(_lmb->memory);
+
+	for (i=0; i < _mem->cnt; i++) {
+		unsigned long lmbbase = _mem->region[i].base;
+		unsigned long lmbsize = _mem->region[i].size;
+		if ( lmb_addrs_overlap(aa,1,lmbbase,lmbsize) ) {
+			pa = _mem->region[i].physbase + (aa - lmbbase);
+			break;
+		}
+	}
+
+	return pa;
+}
+
+/*
+ * Truncate the lmb list to memory_limit if it's set
+ * You must call lmb_analyze() after this.
+ */
+void __init lmb_enforce_memory_limit(void)
+{
+	extern unsigned long memory_limit;
+	unsigned long i, limit;
+	struct lmb_region *mem = &(lmb.memory);
+
+	if (! memory_limit)
+		return;
+
+	limit = memory_limit;
+	for (i = 0; i < mem->cnt; i++) {
+		if (limit > mem->region[i].size) {
+			limit -= mem->region[i].size;
+			continue;
+		}
+
+		mem->region[i].size = limit;
+		mem->cnt = i + 1;
+		break;
+	}
+}
diff --git a/arch/ppc64/kernel/lparcfg.c b/arch/ppc64/kernel/lparcfg.c
new file mode 100644
index 00000000000..a8fd32df848
--- /dev/null
+++ b/arch/ppc64/kernel/lparcfg.c
@@ -0,0 +1,611 @@
+/*
+ * PowerPC64 LPAR Configuration Information Driver
+ *
+ * Dave Engebretsen engebret@us.ibm.com
+ *    Copyright (c) 2003 Dave Engebretsen
+ * Will Schmidt willschm@us.ibm.com
+ *    SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation.
+ *    seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation.
+ * Nathan Lynch nathanl@austin.ibm.com
+ *    Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * This driver creates a proc file at /proc/ppc64/lparcfg which contains
+ * keyword - value pairs that specify the configuration of the partition.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include <asm/iSeries/HvLpConfig.h>
+#include <asm/lppaca.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/hvcall.h>
+#include <asm/cputable.h>
+#include <asm/rtas.h>
+#include <asm/system.h>
+#include <asm/time.h>
+
+#define MODULE_VERS "1.6"
+#define MODULE_NAME "lparcfg"
+
+/* #define LPARCFG_DEBUG */
+
+/* find a better place for this function... */
+void log_plpar_hcall_return(unsigned long rc, char *tag)
+{
+	if (rc == 0)		/* success, return */
+		return;
+/* check for null tag ? */
+	if (rc == H_Hardware)
+		printk(KERN_INFO
+		       "plpar-hcall (%s) failed with hardware fault\n", tag);
+	else if (rc == H_Function)
+		printk(KERN_INFO
+		       "plpar-hcall (%s) failed; function not allowed\n", tag);
+	else if (rc == H_Authority)
+		printk(KERN_INFO
+		       "plpar-hcall (%s) failed; not authorized to this function\n",
+		       tag);
+	else if (rc == H_Parameter)
+		printk(KERN_INFO "plpar-hcall (%s) failed; Bad parameter(s)\n",
+		       tag);
+	else
+		printk(KERN_INFO
+		       "plpar-hcall (%s) failed with unexpected rc(0x%lx)\n",
+		       tag, rc);
+
+}
+
+static struct proc_dir_entry *proc_ppc64_lparcfg;
+#define LPARCFG_BUFF_SIZE 4096
+
+#ifdef CONFIG_PPC_ISERIES
+
+/*
+ * For iSeries legacy systems, the PPA purr function is available from the
+ * emulated_time_base field in the paca.
+ */
+static unsigned long get_purr(void)
+{
+	unsigned long sum_purr = 0;
+	int cpu;
+	struct paca_struct *lpaca;
+
+	for_each_cpu(cpu) {
+		lpaca = paca + cpu;
+		sum_purr += lpaca->lppaca.emulated_time_base;
+
+#ifdef PURR_DEBUG
+		printk(KERN_INFO "get_purr for cpu (%d) has value (%ld) \n",
+			cpu, lpaca->lppaca.emulated_time_base);
+#endif
+	}
+	return sum_purr;
+}
+
+#define lparcfg_write NULL
+
+/* 
+ * Methods used to fetch LPAR data when running on an iSeries platform.
+ */
+static int lparcfg_data(struct seq_file *m, void *v)
+{
+	unsigned long pool_id, lp_index;
+	int shared, entitled_capacity, max_entitled_capacity;
+	int processors, max_processors;
+	struct paca_struct *lpaca = get_paca();
+	unsigned long purr = get_purr();
+
+	seq_printf(m, "%s %s \n", MODULE_NAME, MODULE_VERS);
+
+	shared = (int)(lpaca->lppaca_ptr->shared_proc);
+	seq_printf(m, "serial_number=%c%c%c%c%c%c%c\n",
+		   e2a(xItExtVpdPanel.mfgID[2]),
+		   e2a(xItExtVpdPanel.mfgID[3]),
+		   e2a(xItExtVpdPanel.systemSerial[1]),
+		   e2a(xItExtVpdPanel.systemSerial[2]),
+		   e2a(xItExtVpdPanel.systemSerial[3]),
+		   e2a(xItExtVpdPanel.systemSerial[4]),
+		   e2a(xItExtVpdPanel.systemSerial[5]));
+
+	seq_printf(m, "system_type=%c%c%c%c\n",
+		   e2a(xItExtVpdPanel.machineType[0]),
+		   e2a(xItExtVpdPanel.machineType[1]),
+		   e2a(xItExtVpdPanel.machineType[2]),
+		   e2a(xItExtVpdPanel.machineType[3]));
+
+	lp_index = HvLpConfig_getLpIndex();
+	seq_printf(m, "partition_id=%d\n", (int)lp_index);
+
+	seq_printf(m, "system_active_processors=%d\n",
+		   (int)HvLpConfig_getSystemPhysicalProcessors());
+
+	seq_printf(m, "system_potential_processors=%d\n",
+		   (int)HvLpConfig_getSystemPhysicalProcessors());
+
+	processors = (int)HvLpConfig_getPhysicalProcessors();
+	seq_printf(m, "partition_active_processors=%d\n", processors);
+
+	max_processors = (int)HvLpConfig_getMaxPhysicalProcessors();
+	seq_printf(m, "partition_potential_processors=%d\n", max_processors);
+
+	if (shared) {
+		entitled_capacity = HvLpConfig_getSharedProcUnits();
+		max_entitled_capacity = HvLpConfig_getMaxSharedProcUnits();
+	} else {
+		entitled_capacity = processors * 100;
+		max_entitled_capacity = max_processors * 100;
+	}
+	seq_printf(m, "partition_entitled_capacity=%d\n", entitled_capacity);
+
+	seq_printf(m, "partition_max_entitled_capacity=%d\n",
+		   max_entitled_capacity);
+
+	if (shared) {
+		pool_id = HvLpConfig_getSharedPoolIndex();
+		seq_printf(m, "pool=%d\n", (int)pool_id);
+		seq_printf(m, "pool_capacity=%d\n",
+			   (int)(HvLpConfig_getNumProcsInSharedPool(pool_id) *
+				 100));
+		seq_printf(m, "purr=%ld\n", purr);
+	}
+
+	seq_printf(m, "shared_processor_mode=%d\n", shared);
+
+	return 0;
+}
+#endif				/* CONFIG_PPC_ISERIES */
+
+#ifdef CONFIG_PPC_PSERIES
+/* 
+ * Methods used to fetch LPAR data when running on a pSeries platform.
+ */
+
+/*
+ * H_GET_PPP hcall returns info in 4 parms.
+ *  entitled_capacity,unallocated_capacity,
+ *  aggregation, resource_capability).
+ *
+ *  R4 = Entitled Processor Capacity Percentage. 
+ *  R5 = Unallocated Processor Capacity Percentage.
+ *  R6 (AABBCCDDEEFFGGHH).
+ *      XXXX - reserved (0)
+ *          XXXX - reserved (0)
+ *              XXXX - Group Number
+ *                  XXXX - Pool Number.
+ *  R7 (IIJJKKLLMMNNOOPP).
+ *      XX - reserved. (0)
+ *        XX - bit 0-6 reserved (0).   bit 7 is Capped indicator.
+ *          XX - variable processor Capacity Weight
+ *            XX - Unallocated Variable Processor Capacity Weight.
+ *              XXXX - Active processors in Physical Processor Pool.
+ *                  XXXX  - Processors active on platform. 
+ */
+static unsigned int h_get_ppp(unsigned long *entitled,
+			      unsigned long *unallocated,
+			      unsigned long *aggregation,
+			      unsigned long *resource)
+{
+	unsigned long rc;
+	rc = plpar_hcall_4out(H_GET_PPP, 0, 0, 0, 0, entitled, unallocated,
+			      aggregation, resource);
+
+	log_plpar_hcall_return(rc, "H_GET_PPP");
+
+	return rc;
+}
+
+static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
+{
+	unsigned long rc;
+	unsigned long dummy;
+	rc = plpar_hcall(H_PIC, 0, 0, 0, 0, pool_idle_time, num_procs, &dummy);
+
+	log_plpar_hcall_return(rc, "H_PIC");
+}
+
+static unsigned long get_purr(void);
+
+/* Track sum of all purrs across all processors. This is used to further */
+/* calculate usage values by different applications                       */
+
+static unsigned long get_purr(void)
+{
+	unsigned long sum_purr = 0;
+	int cpu;
+	struct cpu_usage *cu;
+
+	for_each_cpu(cpu) {
+		cu = &per_cpu(cpu_usage_array, cpu);
+		sum_purr += cu->current_tb;
+	}
+	return sum_purr;
+}
+
+#define SPLPAR_CHARACTERISTICS_TOKEN 20
+#define SPLPAR_MAXLENGTH 1026*(sizeof(char))
+
+/*
+ * parse_system_parameter_string()
+ * Retrieve the potential_processors, max_entitled_capacity and friends
+ * through the get-system-parameter rtas call.  Replace keyword strings as
+ * necessary.
+ */
+static void parse_system_parameter_string(struct seq_file *m)
+{
+	int call_status;
+
+	char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
+	if (!local_buffer) {
+		printk(KERN_ERR "%s %s kmalloc failure at line %d \n",
+		       __FILE__, __FUNCTION__, __LINE__);
+		return;
+	}
+
+	spin_lock(&rtas_data_buf_lock);
+	memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH);
+	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				NULL,
+				SPLPAR_CHARACTERISTICS_TOKEN,
+				__pa(rtas_data_buf));
+	memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH);
+	spin_unlock(&rtas_data_buf_lock);
+
+	if (call_status != 0) {
+		printk(KERN_INFO
+		       "%s %s Error calling get-system-parameter (0x%x)\n",
+		       __FILE__, __FUNCTION__, call_status);
+	} else {
+		int splpar_strlen;
+		int idx, w_idx;
+		char *workbuffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
+		if (!workbuffer) {
+			printk(KERN_ERR "%s %s kmalloc failure at line %d \n",
+			       __FILE__, __FUNCTION__, __LINE__);
+			return;
+		}
+#ifdef LPARCFG_DEBUG
+		printk(KERN_INFO "success calling get-system-parameter \n");
+#endif
+		splpar_strlen = local_buffer[0] * 16 + local_buffer[1];
+		local_buffer += 2;	/* step over strlen value */
+
+		memset(workbuffer, 0, SPLPAR_MAXLENGTH);
+		w_idx = 0;
+		idx = 0;
+		while ((*local_buffer) && (idx < splpar_strlen)) {
+			workbuffer[w_idx++] = local_buffer[idx++];
+			if ((local_buffer[idx] == ',')
+			    || (local_buffer[idx] == '\0')) {
+				workbuffer[w_idx] = '\0';
+				if (w_idx) {
+					/* avoid the empty string */
+					seq_printf(m, "%s\n", workbuffer);
+				}
+				memset(workbuffer, 0, SPLPAR_MAXLENGTH);
+				idx++;	/* skip the comma */
+				w_idx = 0;
+			} else if (local_buffer[idx] == '=') {
+				/* code here to replace workbuffer contents
+				   with different keyword strings */
+				if (0 == strcmp(workbuffer, "MaxEntCap")) {
+					strcpy(workbuffer,
+					       "partition_max_entitled_capacity");
+					w_idx = strlen(workbuffer);
+				}
+				if (0 == strcmp(workbuffer, "MaxPlatProcs")) {
+					strcpy(workbuffer,
+					       "system_potential_processors");
+					w_idx = strlen(workbuffer);
+				}
+			}
+		}
+		kfree(workbuffer);
+		local_buffer -= 2;	/* back up over strlen value */
+	}
+	kfree(local_buffer);
+}
+
+static int lparcfg_count_active_processors(void);
+
+/* Return the number of processors in the system.
+ * This function reads through the device tree and counts
+ * the virtual processors, this does not include threads.
+ */
+static int lparcfg_count_active_processors(void)
+{
+	struct device_node *cpus_dn = NULL;
+	int count = 0;
+
+	while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) {
+#ifdef LPARCFG_DEBUG
+		printk(KERN_ERR "cpus_dn %p \n", cpus_dn);
+#endif
+		count++;
+	}
+	return count;
+}
+
+static int lparcfg_data(struct seq_file *m, void *v)
+{
+	int partition_potential_processors;
+	int partition_active_processors;
+	struct device_node *rootdn;
+	const char *model = "";
+	const char *system_id = "";
+	unsigned int *lp_index_ptr, lp_index = 0;
+	struct device_node *rtas_node;
+	int *lrdrp;
+
+	rootdn = find_path_device("/");
+	if (rootdn) {
+		model = get_property(rootdn, "model", NULL);
+		system_id = get_property(rootdn, "system-id", NULL);
+		lp_index_ptr = (unsigned int *)
+		    get_property(rootdn, "ibm,partition-no", NULL);
+		if (lp_index_ptr)
+			lp_index = *lp_index_ptr;
+	}
+
+	seq_printf(m, "%s %s \n", MODULE_NAME, MODULE_VERS);
+
+	seq_printf(m, "serial_number=%s\n", system_id);
+
+	seq_printf(m, "system_type=%s\n", model);
+
+	seq_printf(m, "partition_id=%d\n", (int)lp_index);
+
+	rtas_node = find_path_device("/rtas");
+	lrdrp = (int *)get_property(rtas_node, "ibm,lrdr-capacity", NULL);
+
+	if (lrdrp == NULL) {
+		partition_potential_processors = systemcfg->processorCount;
+	} else {
+		partition_potential_processors = *(lrdrp + 4);
+	}
+
+	partition_active_processors = lparcfg_count_active_processors();
+
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+		unsigned long h_entitled, h_unallocated;
+		unsigned long h_aggregation, h_resource;
+		unsigned long pool_idle_time, pool_procs;
+		unsigned long purr;
+
+		h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
+			  &h_resource);
+
+		seq_printf(m, "R4=0x%lx\n", h_entitled);
+		seq_printf(m, "R5=0x%lx\n", h_unallocated);
+		seq_printf(m, "R6=0x%lx\n", h_aggregation);
+		seq_printf(m, "R7=0x%lx\n", h_resource);
+
+		purr = get_purr();
+
+		/* this call handles the ibm,get-system-parameter contents */
+		parse_system_parameter_string(m);
+
+		seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled);
+
+		seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
+
+		seq_printf(m, "system_active_processors=%ld\n",
+			   (h_resource >> 0 * 8) & 0xffff);
+
+		/* pool related entries are apropriate for shared configs */
+		if (paca[0].lppaca.shared_proc) {
+
+			h_pic(&pool_idle_time, &pool_procs);
+
+			seq_printf(m, "pool=%ld\n",
+				   (h_aggregation >> 0 * 8) & 0xffff);
+
+			/* report pool_capacity in percentage */
+			seq_printf(m, "pool_capacity=%ld\n",
+				   ((h_resource >> 2 * 8) & 0xffff) * 100);
+
+			seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
+
+			seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
+		}
+
+		seq_printf(m, "unallocated_capacity_weight=%ld\n",
+			   (h_resource >> 4 * 8) & 0xFF);
+
+		seq_printf(m, "capacity_weight=%ld\n",
+			   (h_resource >> 5 * 8) & 0xFF);
+
+		seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
+
+		seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
+
+		seq_printf(m, "purr=%ld\n", purr);
+
+	} else {		/* non SPLPAR case */
+
+		seq_printf(m, "system_active_processors=%d\n",
+			   partition_potential_processors);
+
+		seq_printf(m, "system_potential_processors=%d\n",
+			   partition_potential_processors);
+
+		seq_printf(m, "partition_max_entitled_capacity=%d\n",
+			   partition_potential_processors * 100);
+
+		seq_printf(m, "partition_entitled_capacity=%d\n",
+			   partition_active_processors * 100);
+	}
+
+	seq_printf(m, "partition_active_processors=%d\n",
+		   partition_active_processors);
+
+	seq_printf(m, "partition_potential_processors=%d\n",
+		   partition_potential_processors);
+
+	seq_printf(m, "shared_processor_mode=%d\n", paca[0].lppaca.shared_proc);
+
+	return 0;
+}
+
+/*
+ * Interface for changing system parameters (variable capacity weight
+ * and entitled capacity).  Format of input is "param_name=value";
+ * anything after value is ignored.  Valid parameters at this time are
+ * "partition_entitled_capacity" and "capacity_weight".  We use
+ * H_SET_PPP to alter parameters.
+ *
+ * This function should be invoked only on systems with
+ * FW_FEATURE_SPLPAR.
+ */
+static ssize_t lparcfg_write(struct file *file, const char __user * buf,
+			     size_t count, loff_t * off)
+{
+	char *kbuf;
+	char *tmp;
+	u64 new_entitled, *new_entitled_ptr = &new_entitled;
+	u8 new_weight, *new_weight_ptr = &new_weight;
+
+	unsigned long current_entitled;	/* parameters for h_get_ppp */
+	unsigned long dummy;
+	unsigned long resource;
+	u8 current_weight;
+
+	ssize_t retval = -ENOMEM;
+
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		goto out;
+
+	retval = -EFAULT;
+	if (copy_from_user(kbuf, buf, count))
+		goto out;
+
+	retval = -EINVAL;
+	kbuf[count - 1] = '\0';
+	tmp = strchr(kbuf, '=');
+	if (!tmp)
+		goto out;
+
+	*tmp++ = '\0';
+
+	if (!strcmp(kbuf, "partition_entitled_capacity")) {
+		char *endp;
+		*new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			goto out;
+		new_weight_ptr = &current_weight;
+	} else if (!strcmp(kbuf, "capacity_weight")) {
+		char *endp;
+		*new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
+		if (endp == tmp)
+			goto out;
+		new_entitled_ptr = &current_entitled;
+	} else
+		goto out;
+
+	/* Get our current parameters */
+	retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
+	if (retval) {
+		retval = -EIO;
+		goto out;
+	}
+
+	current_weight = (resource >> 5 * 8) & 0xFF;
+
+	pr_debug("%s: current_entitled = %lu, current_weight = %lu\n",
+		 __FUNCTION__, current_entitled, current_weight);
+
+	pr_debug("%s: new_entitled = %lu, new_weight = %lu\n",
+		 __FUNCTION__, *new_entitled_ptr, *new_weight_ptr);
+
+	retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr,
+				    *new_weight_ptr);
+
+	if (retval == H_Success || retval == H_Constrained) {
+		retval = count;
+	} else if (retval == H_Busy) {
+		retval = -EBUSY;
+	} else if (retval == H_Hardware) {
+		retval = -EIO;
+	} else if (retval == H_Parameter) {
+		retval = -EINVAL;
+	} else {
+		printk(KERN_WARNING "%s: received unknown hv return code %ld",
+		       __FUNCTION__, retval);
+		retval = -EIO;
+	}
+
+      out:
+	kfree(kbuf);
+	return retval;
+}
+
+#endif				/* CONFIG_PPC_PSERIES */
+
+static int lparcfg_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lparcfg_data, NULL);
+}
+
+struct file_operations lparcfg_fops = {
+      .owner	= THIS_MODULE,
+      .read	= seq_read,
+      .open	= lparcfg_open,
+      .release	= single_release,
+};
+
+int __init lparcfg_init(void)
+{
+	struct proc_dir_entry *ent;
+	mode_t mode = S_IRUSR;
+
+	/* Allow writing if we have FW_FEATURE_SPLPAR */
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+		lparcfg_fops.write = lparcfg_write;
+		mode |= S_IWUSR;
+	}
+
+	ent = create_proc_entry("ppc64/lparcfg", mode, NULL);
+	if (ent) {
+		ent->proc_fops = &lparcfg_fops;
+		ent->data = kmalloc(LPARCFG_BUFF_SIZE, GFP_KERNEL);
+		if (!ent->data) {
+			printk(KERN_ERR
+			       "Failed to allocate buffer for lparcfg\n");
+			remove_proc_entry("lparcfg", ent->parent);
+			return -ENOMEM;
+		}
+	} else {
+		printk(KERN_ERR "Failed to create ppc64/lparcfg\n");
+		return -EIO;
+	}
+
+	proc_ppc64_lparcfg = ent;
+	return 0;
+}
+
+void __exit lparcfg_cleanup(void)
+{
+	if (proc_ppc64_lparcfg) {
+		if (proc_ppc64_lparcfg->data) {
+			kfree(proc_ppc64_lparcfg->data);
+		}
+		remove_proc_entry("lparcfg", proc_ppc64_lparcfg->parent);
+	}
+}
+
+module_init(lparcfg_init);
+module_exit(lparcfg_cleanup);
+MODULE_DESCRIPTION("Interface for LPAR configuration data");
+MODULE_AUTHOR("Dave Engebretsen");
+MODULE_LICENSE("GPL");
diff --git a/arch/ppc64/kernel/maple_pci.c b/arch/ppc64/kernel/maple_pci.c
new file mode 100644
index 00000000000..53993999b26
--- /dev/null
+++ b/arch/ppc64/kernel/maple_pci.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org),
+ *		      IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/iommu.h>
+
+#include "pci.h"
+
+#ifdef DEBUG
+#define DBG(x...) printk(x)
+#else
+#define DBG(x...)
+#endif
+
+static struct pci_controller *u3_agp, *u3_ht;
+
+static int __init fixup_one_level_bus_range(struct device_node *node, int higher)
+{
+	for (; node != 0;node = node->sibling) {
+		int * bus_range;
+		unsigned int *class_code;
+		int len;
+
+		/* For PCI<->PCI bridges or CardBus bridges, we go down */
+		class_code = (unsigned int *) get_property(node, "class-code", NULL);
+		if (!class_code || ((*class_code >> 8) != PCI_CLASS_BRIDGE_PCI &&
+			(*class_code >> 8) != PCI_CLASS_BRIDGE_CARDBUS))
+			continue;
+		bus_range = (int *) get_property(node, "bus-range", &len);
+		if (bus_range != NULL && len > 2 * sizeof(int)) {
+			if (bus_range[1] > higher)
+				higher = bus_range[1];
+		}
+		higher = fixup_one_level_bus_range(node->child, higher);
+	}
+	return higher;
+}
+
+/* This routine fixes the "bus-range" property of all bridges in the
+ * system since they tend to have their "last" member wrong on macs
+ *
+ * Note that the bus numbers manipulated here are OF bus numbers, they
+ * are not Linux bus numbers.
+ */
+static void __init fixup_bus_range(struct device_node *bridge)
+{
+	int * bus_range;
+	int len;
+
+	/* Lookup the "bus-range" property for the hose */
+	bus_range = (int *) get_property(bridge, "bus-range", &len);
+	if (bus_range == NULL || len < 2 * sizeof(int)) {
+		printk(KERN_WARNING "Can't get bus-range for %s\n",
+			       bridge->full_name);
+		return;
+	}
+	bus_range[1] = fixup_one_level_bus_range(bridge->child, bus_range[1]);
+}
+
+
+#define U3_AGP_CFA0(devfn, off)	\
+	((1 << (unsigned long)PCI_SLOT(dev_fn)) \
+	| (((unsigned long)PCI_FUNC(dev_fn)) << 8) \
+	| (((unsigned long)(off)) & 0xFCUL))
+
+#define U3_AGP_CFA1(bus, devfn, off)	\
+	((((unsigned long)(bus)) << 16) \
+	|(((unsigned long)(devfn)) << 8) \
+	|(((unsigned long)(off)) & 0xFCUL) \
+	|1UL)
+
+static unsigned long u3_agp_cfg_access(struct pci_controller* hose,
+				       u8 bus, u8 dev_fn, u8 offset)
+{
+	unsigned int caddr;
+
+	if (bus == hose->first_busno) {
+		if (dev_fn < (11 << 3))
+			return 0;
+		caddr = U3_AGP_CFA0(dev_fn, offset);
+	} else
+		caddr = U3_AGP_CFA1(bus, dev_fn, offset);
+
+	/* Uninorth will return garbage if we don't read back the value ! */
+	do {
+		out_le32(hose->cfg_addr, caddr);
+	} while (in_le32(hose->cfg_addr) != caddr);
+
+	offset &= 0x07;
+	return ((unsigned long)hose->cfg_data) + offset;
+}
+
+static int u3_agp_read_config(struct pci_bus *bus, unsigned int devfn,
+			      int offset, int len, u32 *val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = u3_agp_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		*val = in_8((u8 *)addr);
+		break;
+	case 2:
+		*val = in_le16((u16 *)addr);
+		break;
+	default:
+		*val = in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int u3_agp_write_config(struct pci_bus *bus, unsigned int devfn,
+			       int offset, int len, u32 val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = u3_agp_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		out_8((u8 *)addr, val);
+		(void) in_8((u8 *)addr);
+		break;
+	case 2:
+		out_le16((u16 *)addr, val);
+		(void) in_le16((u16 *)addr);
+		break;
+	default:
+		out_le32((u32 *)addr, val);
+		(void) in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops u3_agp_pci_ops =
+{
+	u3_agp_read_config,
+	u3_agp_write_config
+};
+
+
+#define U3_HT_CFA0(devfn, off)		\
+		((((unsigned long)devfn) << 8) | offset)
+#define U3_HT_CFA1(bus, devfn, off)	\
+		(U3_HT_CFA0(devfn, off) \
+		+ (((unsigned long)bus) << 16) \
+		+ 0x01000000UL)
+
+static unsigned long u3_ht_cfg_access(struct pci_controller* hose,
+				      u8 bus, u8 devfn, u8 offset)
+{
+	if (bus == hose->first_busno) {
+		if (PCI_SLOT(devfn) == 0)
+			return 0;
+		return ((unsigned long)hose->cfg_data) + U3_HT_CFA0(devfn, offset);
+	} else
+		return ((unsigned long)hose->cfg_data) + U3_HT_CFA1(bus, devfn, offset);
+}
+
+static int u3_ht_read_config(struct pci_bus *bus, unsigned int devfn,
+			     int offset, int len, u32 *val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = u3_ht_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		*val = in_8((u8 *)addr);
+		break;
+	case 2:
+		*val = in_le16((u16 *)addr);
+		break;
+	default:
+		*val = in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int u3_ht_write_config(struct pci_bus *bus, unsigned int devfn,
+			      int offset, int len, u32 val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = u3_ht_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		out_8((u8 *)addr, val);
+		(void) in_8((u8 *)addr);
+		break;
+	case 2:
+		out_le16((u16 *)addr, val);
+		(void) in_le16((u16 *)addr);
+		break;
+	default:
+		out_le32((u32 *)addr, val);
+		(void) in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops u3_ht_pci_ops =
+{
+	u3_ht_read_config,
+	u3_ht_write_config
+};
+
+static void __init setup_u3_agp(struct pci_controller* hose)
+{
+	/* On G5, we move AGP up to high bus number so we don't need
+	 * to reassign bus numbers for HT. If we ever have P2P bridges
+	 * on AGP, we'll have to move pci_assign_all_busses to the
+	 * pci_controller structure so we enable it for AGP and not for
+	 * HT childs.
+	 * We hard code the address because of the different size of
+	 * the reg address cell, we shall fix that by killing struct
+	 * reg_property and using some accessor functions instead
+	 */
+       	hose->first_busno = 0xf0;
+	hose->last_busno = 0xff;
+	hose->ops = &u3_agp_pci_ops;
+	hose->cfg_addr = ioremap(0xf0000000 + 0x800000, 0x1000);
+	hose->cfg_data = ioremap(0xf0000000 + 0xc00000, 0x1000);
+
+	u3_agp = hose;
+}
+
+static void __init setup_u3_ht(struct pci_controller* hose)
+{
+	hose->ops = &u3_ht_pci_ops;
+
+	/* We hard code the address because of the different size of
+	 * the reg address cell, we shall fix that by killing struct
+	 * reg_property and using some accessor functions instead
+	 */
+	hose->cfg_data = (volatile unsigned char *)ioremap(0xf2000000, 0x02000000);
+
+	hose->first_busno = 0;
+	hose->last_busno = 0xef;
+
+	u3_ht = hose;
+}
+
+static int __init add_bridge(struct device_node *dev)
+{
+	int len;
+	struct pci_controller *hose;
+	char* disp_name;
+	int *bus_range;
+	int primary = 1;
+ 	struct property *of_prop;
+
+	DBG("Adding PCI host bridge %s\n", dev->full_name);
+
+       	bus_range = (int *) get_property(dev, "bus-range", &len);
+       	if (bus_range == NULL || len < 2 * sizeof(int)) {
+       		printk(KERN_WARNING "Can't get bus-range for %s, assume bus 0\n",
+       			       dev->full_name);
+       	}
+
+	hose = alloc_bootmem(sizeof(struct pci_controller));
+	if (hose == NULL)
+		return -ENOMEM;
+       	pci_setup_pci_controller(hose);
+
+       	hose->arch_data = dev;
+       	hose->first_busno = bus_range ? bus_range[0] : 0;
+       	hose->last_busno = bus_range ? bus_range[1] : 0xff;
+
+	of_prop = alloc_bootmem(sizeof(struct property) +
+				sizeof(hose->global_number));
+	if (of_prop) {
+		memset(of_prop, 0, sizeof(struct property));
+		of_prop->name = "linux,pci-domain";
+		of_prop->length = sizeof(hose->global_number);
+		of_prop->value = (unsigned char *)&of_prop[1];
+		memcpy(of_prop->value, &hose->global_number, sizeof(hose->global_number));
+		prom_add_property(dev, of_prop);
+	}
+
+	disp_name = NULL;
+       	if (device_is_compatible(dev, "u3-agp")) {
+       		setup_u3_agp(hose);
+       		disp_name = "U3-AGP";
+       		primary = 0;
+       	} else if (device_is_compatible(dev, "u3-ht")) {
+       		setup_u3_ht(hose);
+       		disp_name = "U3-HT";
+       		primary = 1;
+       	}
+       	printk(KERN_INFO "Found %s PCI host bridge. Firmware bus number: %d->%d\n",
+       		disp_name, hose->first_busno, hose->last_busno);
+
+       	/* Interpret the "ranges" property */
+       	/* This also maps the I/O region and sets isa_io/mem_base */
+       	pci_process_bridge_OF_ranges(hose, dev);
+	pci_setup_phb_io(hose, primary);
+
+       	/* Fixup "bus-range" OF property */
+       	fixup_bus_range(dev);
+
+	return 0;
+}
+
+
+void __init maple_pcibios_fixup(void)
+{
+	struct pci_dev *dev = NULL;
+
+	DBG(" -> maple_pcibios_fixup\n");
+
+	for_each_pci_dev(dev)
+		pci_read_irq_line(dev);
+
+	/* Do the mapping of the IO space */
+	phbs_remap_io();
+
+	DBG(" <- maple_pcibios_fixup\n");
+}
+
+static void __init maple_fixup_phb_resources(void)
+{
+	struct pci_controller *hose, *tmp;
+	
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		unsigned long offset = (unsigned long)hose->io_base_virt - pci_io_base;
+		hose->io_resource.start += offset;
+		hose->io_resource.end += offset;
+		printk(KERN_INFO "PCI Host %d, io start: %lx; io end: %lx\n",
+		       hose->global_number,
+		       hose->io_resource.start, hose->io_resource.end);
+	}
+}
+
+void __init maple_pci_init(void)
+{
+	struct device_node *np, *root;
+	struct device_node *ht = NULL;
+
+	/* Probe root PCI hosts, that is on U3 the AGP host and the
+	 * HyperTransport host. That one is actually "kept" around
+	 * and actually added last as it's resource management relies
+	 * on the AGP resources to have been setup first
+	 */
+	root = of_find_node_by_path("/");
+	if (root == NULL) {
+		printk(KERN_CRIT "maple_find_bridges: can't find root of device tree\n");
+		return;
+	}
+	for (np = NULL; (np = of_get_next_child(root, np)) != NULL;) {
+		if (np->name == NULL)
+			continue;
+		if (strcmp(np->name, "pci") == 0) {
+			if (add_bridge(np) == 0)
+				of_node_get(np);
+		}
+		if (strcmp(np->name, "ht") == 0) {
+			of_node_get(np);
+			ht = np;
+		}
+	}
+	of_node_put(root);
+
+	/* Now setup the HyperTransport host if we found any
+	 */
+	if (ht && add_bridge(ht) != 0)
+		of_node_put(ht);
+
+	/* Fixup the IO resources on our host bridges as the common code
+	 * does it only for childs of the host bridges
+	 */
+	maple_fixup_phb_resources();
+
+	/* Setup the linkage between OF nodes and PHBs */ 
+	pci_devs_phb_init();
+
+	/* Fixup the PCI<->OF mapping for U3 AGP due to bus renumbering. We
+	 * assume there is no P2P bridge on the AGP bus, which should be a
+	 * safe assumptions hopefully.
+	 */
+	if (u3_agp) {
+		struct device_node *np = u3_agp->arch_data;
+		np->busno = 0xf0;
+		for (np = np->child; np; np = np->sibling)
+			np->busno = 0xf0;
+	}
+
+	/* Tell pci.c to use the common resource allocation mecanism */
+	pci_probe_only = 0;
+	
+	/* Allow all IO */
+	io_page_mask = -1;
+}
+
+int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel)
+{
+	struct device_node *np;
+	int irq = channel ? 15 : 14;
+
+	if (pdev->vendor != PCI_VENDOR_ID_AMD ||
+	    pdev->device != PCI_DEVICE_ID_AMD_8111_IDE)
+		return irq;
+
+	np = pci_device_to_OF_node(pdev);
+	if (np == NULL)
+		return irq;
+	if (np->n_intrs < 2)
+		return irq;
+	return np->intrs[channel & 0x1].line;
+}
+
+/* XXX: To remove once all firmwares are ok */
+static void fixup_maple_ide(struct pci_dev* dev)
+{
+#if 0 /* Enable this to enable IDE port 0 */
+	{
+		u8 v;
+
+		pci_read_config_byte(dev, 0x40, &v);
+		v |= 2;
+		pci_write_config_byte(dev, 0x40, v);
+	}
+#endif
+#if 0 /* fix bus master base */
+	pci_write_config_dword(dev, 0x20, 0xcc01);
+	printk("old ide resource: %lx -> %lx \n",
+	       dev->resource[4].start, dev->resource[4].end);
+	dev->resource[4].start = 0xcc00;
+	dev->resource[4].end = 0xcc10;
+#endif
+#if 1 /* Enable this to fixup IDE sense/polarity of irqs in IO-APICs */
+	{
+		struct pci_dev *apicdev;
+		u32 v;
+
+		apicdev = pci_get_slot (dev->bus, PCI_DEVFN(5,0));
+		if (apicdev == NULL)
+			printk("IDE Fixup IRQ: Can't find IO-APIC !\n");
+		else {
+			pci_write_config_byte(apicdev, 0xf2, 0x10 + 2*14);
+			pci_read_config_dword(apicdev, 0xf4, &v);
+			v &= ~0x00000022;
+			pci_write_config_dword(apicdev, 0xf4, v);
+			pci_write_config_byte(apicdev, 0xf2, 0x10 + 2*15);
+			pci_read_config_dword(apicdev, 0xf4, &v);
+			v &= ~0x00000022;
+			pci_write_config_dword(apicdev, 0xf4, v);
+			pci_dev_put(apicdev);
+		}
+	}
+#endif
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8111_IDE,
+			 fixup_maple_ide);
diff --git a/arch/ppc64/kernel/maple_setup.c b/arch/ppc64/kernel/maple_setup.c
new file mode 100644
index 00000000000..1db6ea0f336
--- /dev/null
+++ b/arch/ppc64/kernel/maple_setup.c
@@ -0,0 +1,240 @@
+/*
+ *  arch/ppc64/kernel/maple_setup.c
+ *
+ *  (c) Copyright 2004 Benjamin Herrenschmidt (benh@kernel.crashing.org),
+ *                     IBM Corp. 
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define DEBUG
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/tty.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/major.h>
+#include <linux/initrd.h>
+#include <linux/vt_kern.h>
+#include <linux/console.h>
+#include <linux/ide.h>
+#include <linux/pci.h>
+#include <linux/adb.h>
+#include <linux/cuda.h>
+#include <linux/pmu.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+#include <linux/serial.h>
+#include <linux/smp.h>
+
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/prom.h>
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/bitops.h>
+#include <asm/io.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/dma.h>
+#include <asm/cputable.h>
+#include <asm/time.h>
+#include <asm/of_device.h>
+#include <asm/lmb.h>
+
+#include "mpic.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+extern int maple_set_rtc_time(struct rtc_time *tm);
+extern void maple_get_rtc_time(struct rtc_time *tm);
+extern void maple_get_boot_time(struct rtc_time *tm);
+extern void maple_calibrate_decr(void);
+extern void maple_pci_init(void);
+extern void maple_pcibios_fixup(void);
+extern int maple_pci_get_legacy_ide_irq(struct pci_dev *dev, int channel);
+extern void generic_find_legacy_serial_ports(u64 *physport,
+		unsigned int *default_speed);
+
+
+static void maple_restart(char *cmd)
+{
+}
+
+static void maple_power_off(void)
+{
+}
+
+static void maple_halt(void)
+{
+}
+
+#ifdef CONFIG_SMP
+struct smp_ops_t maple_smp_ops = {
+	.probe		= smp_mpic_probe,
+	.message_pass	= smp_mpic_message_pass,
+	.kick_cpu	= smp_generic_kick_cpu,
+	.setup_cpu	= smp_mpic_setup_cpu,
+	.give_timebase	= smp_generic_give_timebase,
+	.take_timebase	= smp_generic_take_timebase,
+};
+#endif /* CONFIG_SMP */
+
+void __init maple_setup_arch(void)
+{
+	/* init to some ~sane value until calibrate_delay() runs */
+	loops_per_jiffy = 50000000;
+
+	/* Setup SMP callback */
+#ifdef CONFIG_SMP
+	smp_ops = &maple_smp_ops;
+#endif
+	/* Lookup PCI hosts */
+       	maple_pci_init();
+
+#ifdef CONFIG_DUMMY_CONSOLE
+	conswitchp = &dummy_con;
+#endif
+}
+
+/* 
+ * Early initialization.
+ */
+static void __init maple_init_early(void)
+{
+	unsigned int default_speed;
+	u64 physport;
+
+	DBG(" -> maple_init_early\n");
+
+	/* Initialize hash table, from now on, we can take hash faults
+	 * and call ioremap
+	 */
+	hpte_init_native();
+
+	/* Find the serial port */
+	generic_find_legacy_serial_ports(&physport, &default_speed);
+
+	DBG("phys port addr: %lx\n", (long)physport);
+
+	if (physport) {
+		void *comport;
+		/* Map the uart for udbg. */
+		comport = (void *)__ioremap(physport, 16, _PAGE_NO_CACHE);
+		udbg_init_uart(comport, default_speed);
+
+		ppc_md.udbg_putc = udbg_putc;
+		ppc_md.udbg_getc = udbg_getc;
+		ppc_md.udbg_getc_poll = udbg_getc_poll;
+		DBG("Hello World !\n");
+	}
+
+	/* Setup interrupt mapping options */
+	ppc64_interrupt_controller = IC_OPEN_PIC;
+
+	iommu_init_early_u3();
+
+	DBG(" <- maple_init_early\n");
+}
+
+
+static __init void maple_init_IRQ(void)
+{
+	struct device_node *root;
+	unsigned int *opprop;
+	unsigned long opic_addr;
+	struct mpic *mpic;
+	unsigned char senses[128];
+	int n;
+
+	DBG(" -> maple_init_IRQ\n");
+
+	/* XXX: Non standard, replace that with a proper openpic/mpic node
+	 * in the device-tree. Find the Open PIC if present */
+	root = of_find_node_by_path("/");
+	opprop = (unsigned int *) get_property(root,
+				"platform-open-pic", NULL);
+	if (opprop == 0)
+		panic("OpenPIC not found !\n");
+
+	n = prom_n_addr_cells(root);
+	for (opic_addr = 0; n > 0; --n)
+		opic_addr = (opic_addr << 32) + *opprop++;
+	of_node_put(root);
+
+	/* Obtain sense values from device-tree */
+	prom_get_irq_senses(senses, 0, 128);
+
+	mpic = mpic_alloc(opic_addr,
+			  MPIC_PRIMARY | MPIC_BIG_ENDIAN |
+			  MPIC_BROKEN_U3 | MPIC_WANTS_RESET,
+			  0, 0, 128, 128, senses, 128, "U3-MPIC");
+	BUG_ON(mpic == NULL);
+	mpic_init(mpic);
+
+	DBG(" <- maple_init_IRQ\n");
+}
+
+static void __init maple_progress(char *s, unsigned short hex)
+{
+	printk("*** %04x : %s\n", hex, s ? s : "");
+}
+
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+static int __init maple_probe(int platform)
+{
+	if (platform != PLATFORM_MAPLE)
+		return 0;
+	/*
+	 * On U3, the DART (iommu) must be allocated now since it
+	 * has an impact on htab_initialize (due to the large page it
+	 * occupies having to be broken up so the DART itself is not
+	 * part of the cacheable linar mapping
+	 */
+	alloc_u3_dart_table();
+
+	return 1;
+}
+
+struct machdep_calls __initdata maple_md = {
+	.probe			= maple_probe,
+	.setup_arch		= maple_setup_arch,
+	.init_early		= maple_init_early,
+	.init_IRQ		= maple_init_IRQ,
+	.get_irq		= mpic_get_irq,
+	.pcibios_fixup		= maple_pcibios_fixup,
+	.pci_get_legacy_ide_irq	= maple_pci_get_legacy_ide_irq,
+	.restart		= maple_restart,
+	.power_off		= maple_power_off,
+	.halt			= maple_halt,
+       	.get_boot_time		= maple_get_boot_time,
+       	.set_rtc_time		= maple_set_rtc_time,
+       	.get_rtc_time		= maple_get_rtc_time,
+      	.calibrate_decr		= maple_calibrate_decr,
+	.progress		= maple_progress,
+};
diff --git a/arch/ppc64/kernel/maple_time.c b/arch/ppc64/kernel/maple_time.c
new file mode 100644
index 00000000000..07ce7895b43
--- /dev/null
+++ b/arch/ppc64/kernel/maple_time.c
@@ -0,0 +1,226 @@
+/*
+ *  arch/ppc64/kernel/maple_time.c
+ *
+ *  (c) Copyright 2004 Benjamin Herrenschmidt (benh@kernel.crashing.org),
+ *                     IBM Corp. 
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bcd.h>
+
+#include <asm/sections.h>
+#include <asm/prom.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/time.h>
+
+#ifdef DEBUG
+#define DBG(x...) printk(x)
+#else
+#define DBG(x...)
+#endif
+
+extern void setup_default_decr(void);
+extern void GregorianDay(struct rtc_time * tm);
+
+extern unsigned long ppc_tb_freq;
+extern unsigned long ppc_proc_freq;
+static int maple_rtc_addr;
+
+static int maple_clock_read(int addr)
+{
+	outb_p(addr, maple_rtc_addr);
+	return inb_p(maple_rtc_addr+1);
+}
+
+static void maple_clock_write(unsigned long val, int addr)
+{
+	outb_p(addr, maple_rtc_addr);
+	outb_p(val, maple_rtc_addr+1);
+}
+
+void maple_get_rtc_time(struct rtc_time *tm)
+{
+	int uip, i;
+
+	/* The Linux interpretation of the CMOS clock register contents:
+	 * When the Update-In-Progress (UIP) flag goes from 1 to 0, the
+	 * RTC registers show the second which has precisely just started.
+	 * Let's hope other operating systems interpret the RTC the same way.
+	 */
+
+	/* Since the UIP flag is set for about 2.2 ms and the clock
+	 * is typically written with a precision of 1 jiffy, trying
+	 * to obtain a precision better than a few milliseconds is
+	 * an illusion. Only consistency is interesting, this also
+	 * allows to use the routine for /dev/rtc without a potential
+	 * 1 second kernel busy loop triggered by any reader of /dev/rtc.
+	 */
+
+	for (i = 0; i<1000000; i++) {
+		uip = maple_clock_read(RTC_FREQ_SELECT);
+		tm->tm_sec = maple_clock_read(RTC_SECONDS);
+		tm->tm_min = maple_clock_read(RTC_MINUTES);
+		tm->tm_hour = maple_clock_read(RTC_HOURS);
+		tm->tm_mday = maple_clock_read(RTC_DAY_OF_MONTH);
+		tm->tm_mon = maple_clock_read(RTC_MONTH);
+		tm->tm_year = maple_clock_read(RTC_YEAR);
+		uip |= maple_clock_read(RTC_FREQ_SELECT);
+		if ((uip & RTC_UIP)==0)
+			break;
+	}
+
+	if (!(maple_clock_read(RTC_CONTROL) & RTC_DM_BINARY)
+	    || RTC_ALWAYS_BCD) {
+		BCD_TO_BIN(tm->tm_sec);
+		BCD_TO_BIN(tm->tm_min);
+		BCD_TO_BIN(tm->tm_hour);
+		BCD_TO_BIN(tm->tm_mday);
+		BCD_TO_BIN(tm->tm_mon);
+		BCD_TO_BIN(tm->tm_year);
+	  }
+	if ((tm->tm_year + 1900) < 1970)
+		tm->tm_year += 100;
+
+	GregorianDay(tm);
+}
+
+int maple_set_rtc_time(struct rtc_time *tm)
+{
+	unsigned char save_control, save_freq_select;
+	int sec, min, hour, mon, mday, year;
+
+	spin_lock(&rtc_lock);
+
+	save_control = maple_clock_read(RTC_CONTROL); /* tell the clock it's being set */
+
+	maple_clock_write((save_control|RTC_SET), RTC_CONTROL);
+
+	save_freq_select = maple_clock_read(RTC_FREQ_SELECT); /* stop and reset prescaler */
+
+	maple_clock_write((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
+
+	sec = tm->tm_sec;
+	min = tm->tm_min;
+	hour = tm->tm_hour;
+	mon = tm->tm_mon;
+	mday = tm->tm_mday;
+	year = tm->tm_year;
+
+	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
+		BIN_TO_BCD(sec);
+		BIN_TO_BCD(min);
+		BIN_TO_BCD(hour);
+		BIN_TO_BCD(mon);
+		BIN_TO_BCD(mday);
+		BIN_TO_BCD(year);
+	}
+	maple_clock_write(sec, RTC_SECONDS);
+	maple_clock_write(min, RTC_MINUTES);
+	maple_clock_write(hour, RTC_HOURS);
+	maple_clock_write(mon, RTC_MONTH);
+	maple_clock_write(mday, RTC_DAY_OF_MONTH);
+	maple_clock_write(year, RTC_YEAR);
+
+	/* The following flags have to be released exactly in this order,
+	 * otherwise the DS12887 (popular MC146818A clone with integrated
+	 * battery and quartz) will not reset the oscillator and will not
+	 * update precisely 500 ms later. You won't find this mentioned in
+	 * the Dallas Semiconductor data sheets, but who believes data
+	 * sheets anyway ...                           -- Markus Kuhn
+	 */
+	maple_clock_write(save_control, RTC_CONTROL);
+	maple_clock_write(save_freq_select, RTC_FREQ_SELECT);
+
+	spin_unlock(&rtc_lock);
+
+	return 0;
+}
+
+void __init maple_get_boot_time(struct rtc_time *tm)
+{
+	struct device_node *rtcs;
+
+	rtcs = find_compatible_devices("rtc", "pnpPNP,b00");
+	if (rtcs && rtcs->addrs) {
+		maple_rtc_addr = rtcs->addrs[0].address;
+		printk(KERN_INFO "Maple: Found RTC at 0x%x\n", maple_rtc_addr);
+	} else {
+		maple_rtc_addr = RTC_PORT(0); /* legacy address */
+		printk(KERN_INFO "Maple: No device node for RTC, assuming "
+		       "legacy address (0x%x)\n", maple_rtc_addr);
+	}
+	
+	maple_get_rtc_time(tm);
+}
+
+/* XXX FIXME: Some sane defaults: 125 MHz timebase, 1GHz processor */
+#define DEFAULT_TB_FREQ		125000000UL
+#define DEFAULT_PROC_FREQ	(DEFAULT_TB_FREQ * 8)
+
+void __init maple_calibrate_decr(void)
+{
+	struct device_node *cpu;
+	struct div_result divres;
+	unsigned int *fp = NULL;
+
+	/*
+	 * The cpu node should have a timebase-frequency property
+	 * to tell us the rate at which the decrementer counts.
+	 */
+	cpu = of_find_node_by_type(NULL, "cpu");
+
+	ppc_tb_freq = DEFAULT_TB_FREQ;
+	if (cpu != 0)
+		fp = (unsigned int *)get_property(cpu, "timebase-frequency", NULL);
+	if (fp != NULL)
+		ppc_tb_freq = *fp;
+	else
+		printk(KERN_ERR "WARNING: Estimating decrementer frequency (not found)\n");
+	fp = NULL;
+	ppc_proc_freq = DEFAULT_PROC_FREQ;
+	if (cpu != 0)
+		fp = (unsigned int *)get_property(cpu, "clock-frequency", NULL);
+	if (fp != NULL)
+		ppc_proc_freq = *fp;
+	else
+		printk(KERN_ERR "WARNING: Estimating processor frequency (not found)\n");
+
+	of_node_put(cpu);
+
+	printk(KERN_INFO "time_init: decrementer frequency = %lu.%.6lu MHz\n",
+	       ppc_tb_freq/1000000, ppc_tb_freq%1000000);
+	printk(KERN_INFO "time_init: processor frequency   = %lu.%.6lu MHz\n",
+	       ppc_proc_freq/1000000, ppc_proc_freq%1000000);
+
+	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
+	tb_ticks_per_sec = tb_ticks_per_jiffy * HZ;
+	tb_ticks_per_usec = ppc_tb_freq / 1000000;
+	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
+	div128_by_32(1024*1024, 0, tb_ticks_per_sec, &divres);
+	tb_to_xs = divres.result_low;
+
+	setup_default_decr();
+}
diff --git a/arch/ppc64/kernel/mf.c b/arch/ppc64/kernel/mf.c
new file mode 100644
index 00000000000..1bd52ece497
--- /dev/null
+++ b/arch/ppc64/kernel/mf.c
@@ -0,0 +1,1239 @@
+/*
+  * mf.c
+  * Copyright (C) 2001 Troy D. Armstrong  IBM Corporation
+  * Copyright (C) 2004 Stephen Rothwell  IBM Corporation
+  *
+  * This modules exists as an interface between a Linux secondary partition
+  * running on an iSeries and the primary partition's Virtual Service
+  * Processor (VSP) object.  The VSP has final authority over powering on/off
+  * all partitions in the iSeries.  It also provides miscellaneous low-level
+  * machine facility type operations.
+  *
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program; if not, write to the Free Software
+  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+  */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/bcd.h>
+
+#include <asm/time.h>
+#include <asm/uaccess.h>
+#include <asm/iSeries/vio.h>
+#include <asm/iSeries/mf.h>
+#include <asm/iSeries/HvLpConfig.h>
+#include <asm/iSeries/ItSpCommArea.h>
+
+/*
+ * This is the structure layout for the Machine Facilites LPAR event
+ * flows.
+ */
+struct vsp_cmd_data {
+	u64 token;
+	u16 cmd;
+	HvLpIndex lp_index;
+	u8 result_code;
+	u32 reserved;
+	union {
+		u64 state;	/* GetStateOut */
+		u64 ipl_type;	/* GetIplTypeOut, Function02SelectIplTypeIn */
+		u64 ipl_mode;	/* GetIplModeOut, Function02SelectIplModeIn */
+		u64 page[4];	/* GetSrcHistoryIn */
+		u64 flag;	/* GetAutoIplWhenPrimaryIplsOut,
+				   SetAutoIplWhenPrimaryIplsIn,
+				   WhiteButtonPowerOffIn,
+				   Function08FastPowerOffIn,
+				   IsSpcnRackPowerIncompleteOut */
+		struct {
+			u64 token;
+			u64 address_type;
+			u64 side;
+			u32 length;
+			u32 offset;
+		} kern;		/* SetKernelImageIn, GetKernelImageIn,
+				   SetKernelCmdLineIn, GetKernelCmdLineIn */
+		u32 length_out;	/* GetKernelImageOut, GetKernelCmdLineOut */
+		u8 reserved[80];
+	} sub_data;
+};
+
+struct vsp_rsp_data {
+	struct completion com;
+	struct vsp_cmd_data *response;
+};
+
+struct alloc_data {
+	u16 size;
+	u16 type;
+	u32 count;
+	u16 reserved1;
+	u8 reserved2;
+	HvLpIndex target_lp;
+};
+
+struct ce_msg_data;
+
+typedef void (*ce_msg_comp_hdlr)(void *token, struct ce_msg_data *vsp_cmd_rsp);
+
+struct ce_msg_comp_data {
+	ce_msg_comp_hdlr handler;
+	void *token;
+};
+
+struct ce_msg_data {
+	u8 ce_msg[12];
+	char reserved[4];
+	struct ce_msg_comp_data *completion;
+};
+
+struct io_mf_lp_event {
+	struct HvLpEvent hp_lp_event;
+	u16 subtype_result_code;
+	u16 reserved1;
+	u32 reserved2;
+	union {
+		struct alloc_data alloc;
+		struct ce_msg_data ce_msg;
+		struct vsp_cmd_data vsp_cmd;
+	} data;
+};
+
+#define subtype_data(a, b, c, d)	\
+		(((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+
+/*
+ * All outgoing event traffic is kept on a FIFO queue.  The first
+ * pointer points to the one that is outstanding, and all new
+ * requests get stuck on the end.  Also, we keep a certain number of
+ * preallocated pending events so that we can operate very early in
+ * the boot up sequence (before kmalloc is ready).
+ */
+struct pending_event {
+	struct pending_event *next;
+	struct io_mf_lp_event event;
+	MFCompleteHandler hdlr;
+	char dma_data[72];
+	unsigned dma_data_length;
+	unsigned remote_address;
+};
+static spinlock_t pending_event_spinlock;
+static struct pending_event *pending_event_head;
+static struct pending_event *pending_event_tail;
+static struct pending_event *pending_event_avail;
+static struct pending_event pending_event_prealloc[16];
+
+/*
+ * Put a pending event onto the available queue, so it can get reused.
+ * Attention! You must have the pending_event_spinlock before calling!
+ */
+static void free_pending_event(struct pending_event *ev)
+{
+	if (ev != NULL) {
+		ev->next = pending_event_avail;
+		pending_event_avail = ev;
+	}
+}
+
+/*
+ * Enqueue the outbound event onto the stack.  If the queue was
+ * empty to begin with, we must also issue it via the Hypervisor
+ * interface.  There is a section of code below that will touch
+ * the first stack pointer without the protection of the pending_event_spinlock.
+ * This is OK, because we know that nobody else will be modifying
+ * the first pointer when we do this.
+ */
+static int signal_event(struct pending_event *ev)
+{
+	int rc = 0;
+	unsigned long flags;
+	int go = 1;
+	struct pending_event *ev1;
+	HvLpEvent_Rc hv_rc;
+
+	/* enqueue the event */
+	if (ev != NULL) {
+		ev->next = NULL;
+		spin_lock_irqsave(&pending_event_spinlock, flags);
+		if (pending_event_head == NULL)
+			pending_event_head = ev;
+		else {
+			go = 0;
+			pending_event_tail->next = ev;
+		}
+		pending_event_tail = ev;
+		spin_unlock_irqrestore(&pending_event_spinlock, flags);
+	}
+
+	/* send the event */
+	while (go) {
+		go = 0;
+
+		/* any DMA data to send beforehand? */
+		if (pending_event_head->dma_data_length > 0)
+			HvCallEvent_dmaToSp(pending_event_head->dma_data,
+					pending_event_head->remote_address,
+					pending_event_head->dma_data_length,
+					HvLpDma_Direction_LocalToRemote);
+
+		hv_rc = HvCallEvent_signalLpEvent(
+				&pending_event_head->event.hp_lp_event);
+		if (hv_rc != HvLpEvent_Rc_Good) {
+			printk(KERN_ERR "mf.c: HvCallEvent_signalLpEvent() "
+					"failed with %d\n", (int)hv_rc);
+
+			spin_lock_irqsave(&pending_event_spinlock, flags);
+			ev1 = pending_event_head;
+			pending_event_head = pending_event_head->next;
+			if (pending_event_head != NULL)
+				go = 1;
+			spin_unlock_irqrestore(&pending_event_spinlock, flags);
+
+			if (ev1 == ev)
+				rc = -EIO;
+			else if (ev1->hdlr != NULL)
+				(*ev1->hdlr)((void *)ev1->event.hp_lp_event.xCorrelationToken, -EIO);
+
+			spin_lock_irqsave(&pending_event_spinlock, flags);
+			free_pending_event(ev1);
+			spin_unlock_irqrestore(&pending_event_spinlock, flags);
+		}
+	}
+
+	return rc;
+}
+
+/*
+ * Allocate a new pending_event structure, and initialize it.
+ */
+static struct pending_event *new_pending_event(void)
+{
+	struct pending_event *ev = NULL;
+	HvLpIndex primary_lp = HvLpConfig_getPrimaryLpIndex();
+	unsigned long flags;
+	struct HvLpEvent *hev;
+
+	spin_lock_irqsave(&pending_event_spinlock, flags);
+	if (pending_event_avail != NULL) {
+		ev = pending_event_avail;
+		pending_event_avail = pending_event_avail->next;
+	}
+	spin_unlock_irqrestore(&pending_event_spinlock, flags);
+	if (ev == NULL) {
+		ev = kmalloc(sizeof(struct pending_event), GFP_ATOMIC);
+		if (ev == NULL) {
+			printk(KERN_ERR "mf.c: unable to kmalloc %ld bytes\n",
+					sizeof(struct pending_event));
+			return NULL;
+		}
+	}
+	memset(ev, 0, sizeof(struct pending_event));
+	hev = &ev->event.hp_lp_event;
+	hev->xFlags.xValid = 1;
+	hev->xFlags.xAckType = HvLpEvent_AckType_ImmediateAck;
+	hev->xFlags.xAckInd = HvLpEvent_AckInd_DoAck;
+	hev->xFlags.xFunction = HvLpEvent_Function_Int;
+	hev->xType = HvLpEvent_Type_MachineFac;
+	hev->xSourceLp = HvLpConfig_getLpIndex();
+	hev->xTargetLp = primary_lp;
+	hev->xSizeMinus1 = sizeof(ev->event) - 1;
+	hev->xRc = HvLpEvent_Rc_Good;
+	hev->xSourceInstanceId = HvCallEvent_getSourceLpInstanceId(primary_lp,
+			HvLpEvent_Type_MachineFac);
+	hev->xTargetInstanceId = HvCallEvent_getTargetLpInstanceId(primary_lp,
+			HvLpEvent_Type_MachineFac);
+
+	return ev;
+}
+
+static int signal_vsp_instruction(struct vsp_cmd_data *vsp_cmd)
+{
+	struct pending_event *ev = new_pending_event();
+	int rc;
+	struct vsp_rsp_data response;
+
+	if (ev == NULL)
+		return -ENOMEM;
+
+	init_completion(&response.com);
+	response.response = vsp_cmd;
+	ev->event.hp_lp_event.xSubtype = 6;
+	ev->event.hp_lp_event.x.xSubtypeData =
+		subtype_data('M', 'F',  'V',  'I');
+	ev->event.data.vsp_cmd.token = (u64)&response;
+	ev->event.data.vsp_cmd.cmd = vsp_cmd->cmd;
+	ev->event.data.vsp_cmd.lp_index = HvLpConfig_getLpIndex();
+	ev->event.data.vsp_cmd.result_code = 0xFF;
+	ev->event.data.vsp_cmd.reserved = 0;
+	memcpy(&(ev->event.data.vsp_cmd.sub_data),
+			&(vsp_cmd->sub_data), sizeof(vsp_cmd->sub_data));
+	mb();
+
+	rc = signal_event(ev);
+	if (rc == 0)
+		wait_for_completion(&response.com);
+	return rc;
+}
+
+
+/*
+ * Send a 12-byte CE message to the primary partition VSP object
+ */
+static int signal_ce_msg(char *ce_msg, struct ce_msg_comp_data *completion)
+{
+	struct pending_event *ev = new_pending_event();
+
+	if (ev == NULL)
+		return -ENOMEM;
+
+	ev->event.hp_lp_event.xSubtype = 0;
+	ev->event.hp_lp_event.x.xSubtypeData =
+		subtype_data('M',  'F',  'C',  'E');
+	memcpy(ev->event.data.ce_msg.ce_msg, ce_msg, 12);
+	ev->event.data.ce_msg.completion = completion;
+	return signal_event(ev);
+}
+
+/*
+ * Send a 12-byte CE message (with no data) to the primary partition VSP object
+ */
+static int signal_ce_msg_simple(u8 ce_op, struct ce_msg_comp_data *completion)
+{
+	u8 ce_msg[12];
+
+	memset(ce_msg, 0, sizeof(ce_msg));
+	ce_msg[3] = ce_op;
+	return signal_ce_msg(ce_msg, completion);
+}
+
+/*
+ * Send a 12-byte CE message and DMA data to the primary partition VSP object
+ */
+static int dma_and_signal_ce_msg(char *ce_msg,
+		struct ce_msg_comp_data *completion, void *dma_data,
+		unsigned dma_data_length, unsigned remote_address)
+{
+	struct pending_event *ev = new_pending_event();
+
+	if (ev == NULL)
+		return -ENOMEM;
+
+	ev->event.hp_lp_event.xSubtype = 0;
+	ev->event.hp_lp_event.x.xSubtypeData =
+		subtype_data('M', 'F', 'C', 'E');
+	memcpy(ev->event.data.ce_msg.ce_msg, ce_msg, 12);
+	ev->event.data.ce_msg.completion = completion;
+	memcpy(ev->dma_data, dma_data, dma_data_length);
+	ev->dma_data_length = dma_data_length;
+	ev->remote_address = remote_address;
+	return signal_event(ev);
+}
+
+/*
+ * Initiate a nice (hopefully) shutdown of Linux.  We simply are
+ * going to try and send the init process a SIGINT signal.  If
+ * this fails (why?), we'll simply force it off in a not-so-nice
+ * manner.
+ */
+static int shutdown(void)
+{
+	int rc = kill_proc(1, SIGINT, 1);
+
+	if (rc) {
+		printk(KERN_ALERT "mf.c: SIGINT to init failed (%d), "
+				"hard shutdown commencing\n", rc);
+		mf_power_off();
+	} else
+		printk(KERN_INFO "mf.c: init has been successfully notified "
+				"to proceed with shutdown\n");
+	return rc;
+}
+
+/*
+ * The primary partition VSP object is sending us a new
+ * event flow.  Handle it...
+ */
+static void handle_int(struct io_mf_lp_event *event)
+{
+	struct ce_msg_data *ce_msg_data;
+	struct ce_msg_data *pce_msg_data;
+	unsigned long flags;
+	struct pending_event *pev;
+
+	/* ack the interrupt */
+	event->hp_lp_event.xRc = HvLpEvent_Rc_Good;
+	HvCallEvent_ackLpEvent(&event->hp_lp_event);
+
+	/* process interrupt */
+	switch (event->hp_lp_event.xSubtype) {
+	case 0:	/* CE message */
+		ce_msg_data = &event->data.ce_msg;
+		switch (ce_msg_data->ce_msg[3]) {
+		case 0x5B:	/* power control notification */
+			if ((ce_msg_data->ce_msg[5] & 0x20) != 0) {
+				printk(KERN_INFO "mf.c: Commencing partition shutdown\n");
+				if (shutdown() == 0)
+					signal_ce_msg_simple(0xDB, NULL);
+			}
+			break;
+		case 0xC0:	/* get time */
+			spin_lock_irqsave(&pending_event_spinlock, flags);
+			pev = pending_event_head;
+			if (pev != NULL)
+				pending_event_head = pending_event_head->next;
+			spin_unlock_irqrestore(&pending_event_spinlock, flags);
+			if (pev == NULL)
+				break;
+			pce_msg_data = &pev->event.data.ce_msg;
+			if (pce_msg_data->ce_msg[3] != 0x40)
+				break;
+			if (pce_msg_data->completion != NULL) {
+				ce_msg_comp_hdlr handler =
+					pce_msg_data->completion->handler;
+				void *token = pce_msg_data->completion->token;
+
+				if (handler != NULL)
+					(*handler)(token, ce_msg_data);
+			}
+			spin_lock_irqsave(&pending_event_spinlock, flags);
+			free_pending_event(pev);
+			spin_unlock_irqrestore(&pending_event_spinlock, flags);
+			/* send next waiting event */
+			if (pending_event_head != NULL)
+				signal_event(NULL);
+			break;
+		}
+		break;
+	case 1:	/* IT sys shutdown */
+		printk(KERN_INFO "mf.c: Commencing system shutdown\n");
+		shutdown();
+		break;
+	}
+}
+
+/*
+ * The primary partition VSP object is acknowledging the receipt
+ * of a flow we sent to them.  If there are other flows queued
+ * up, we must send another one now...
+ */
+static void handle_ack(struct io_mf_lp_event *event)
+{
+	unsigned long flags;
+	struct pending_event *two = NULL;
+	unsigned long free_it = 0;
+	struct ce_msg_data *ce_msg_data;
+	struct ce_msg_data *pce_msg_data;
+	struct vsp_rsp_data *rsp;
+
+	/* handle current event */
+	if (pending_event_head == NULL) {
+		printk(KERN_ERR "mf.c: stack empty for receiving ack\n");
+		return;
+	}
+
+	switch (event->hp_lp_event.xSubtype) {
+	case 0:     /* CE msg */
+		ce_msg_data = &event->data.ce_msg;
+		if (ce_msg_data->ce_msg[3] != 0x40) {
+			free_it = 1;
+			break;
+		}
+		if (ce_msg_data->ce_msg[2] == 0)
+			break;
+		free_it = 1;
+		pce_msg_data = &pending_event_head->event.data.ce_msg;
+		if (pce_msg_data->completion != NULL) {
+			ce_msg_comp_hdlr handler =
+				pce_msg_data->completion->handler;
+			void *token = pce_msg_data->completion->token;
+
+			if (handler != NULL)
+				(*handler)(token, ce_msg_data);
+		}
+		break;
+	case 4:	/* allocate */
+	case 5:	/* deallocate */
+		if (pending_event_head->hdlr != NULL)
+			(*pending_event_head->hdlr)((void *)event->hp_lp_event.xCorrelationToken, event->data.alloc.count);
+		free_it = 1;
+		break;
+	case 6:
+		free_it = 1;
+		rsp = (struct vsp_rsp_data *)event->data.vsp_cmd.token;
+		if (rsp == NULL) {
+			printk(KERN_ERR "mf.c: no rsp\n");
+			break;
+		}
+		if (rsp->response != NULL)
+			memcpy(rsp->response, &event->data.vsp_cmd,
+					sizeof(event->data.vsp_cmd));
+		complete(&rsp->com);
+		break;
+	}
+
+	/* remove from queue */
+	spin_lock_irqsave(&pending_event_spinlock, flags);
+	if ((pending_event_head != NULL) && (free_it == 1)) {
+		struct pending_event *oldHead = pending_event_head;
+
+		pending_event_head = pending_event_head->next;
+		two = pending_event_head;
+		free_pending_event(oldHead);
+	}
+	spin_unlock_irqrestore(&pending_event_spinlock, flags);
+
+	/* send next waiting event */
+	if (two != NULL)
+		signal_event(NULL);
+}
+
+/*
+ * This is the generic event handler we are registering with
+ * the Hypervisor.  Ensure the flows are for us, and then
+ * parse it enough to know if it is an interrupt or an
+ * acknowledge.
+ */
+static void hv_handler(struct HvLpEvent *event, struct pt_regs *regs)
+{
+	if ((event != NULL) && (event->xType == HvLpEvent_Type_MachineFac)) {
+		switch(event->xFlags.xFunction) {
+		case HvLpEvent_Function_Ack:
+			handle_ack((struct io_mf_lp_event *)event);
+			break;
+		case HvLpEvent_Function_Int:
+			handle_int((struct io_mf_lp_event *)event);
+			break;
+		default:
+			printk(KERN_ERR "mf.c: non ack/int event received\n");
+			break;
+		}
+	} else
+		printk(KERN_ERR "mf.c: alien event received\n");
+}
+
+/*
+ * Global kernel interface to allocate and seed events into the
+ * Hypervisor.
+ */
+void mf_allocate_lp_events(HvLpIndex target_lp, HvLpEvent_Type type,
+		unsigned size, unsigned count, MFCompleteHandler hdlr,
+		void *user_token)
+{
+	struct pending_event *ev = new_pending_event();
+	int rc;
+
+	if (ev == NULL) {
+		rc = -ENOMEM;
+	} else {
+		ev->event.hp_lp_event.xSubtype = 4;
+		ev->event.hp_lp_event.xCorrelationToken = (u64)user_token;
+		ev->event.hp_lp_event.x.xSubtypeData =
+			subtype_data('M', 'F', 'M', 'A');
+		ev->event.data.alloc.target_lp = target_lp;
+		ev->event.data.alloc.type = type;
+		ev->event.data.alloc.size = size;
+		ev->event.data.alloc.count = count;
+		ev->hdlr = hdlr;
+		rc = signal_event(ev);
+	}
+	if ((rc != 0) && (hdlr != NULL))
+		(*hdlr)(user_token, rc);
+}
+EXPORT_SYMBOL(mf_allocate_lp_events);
+
+/*
+ * Global kernel interface to unseed and deallocate events already in
+ * Hypervisor.
+ */
+void mf_deallocate_lp_events(HvLpIndex target_lp, HvLpEvent_Type type,
+		unsigned count, MFCompleteHandler hdlr, void *user_token)
+{
+	struct pending_event *ev = new_pending_event();
+	int rc;
+
+	if (ev == NULL)
+		rc = -ENOMEM;
+	else {
+		ev->event.hp_lp_event.xSubtype = 5;
+		ev->event.hp_lp_event.xCorrelationToken = (u64)user_token;
+		ev->event.hp_lp_event.x.xSubtypeData =
+			subtype_data('M', 'F', 'M', 'D');
+		ev->event.data.alloc.target_lp = target_lp;
+		ev->event.data.alloc.type = type;
+		ev->event.data.alloc.count = count;
+		ev->hdlr = hdlr;
+		rc = signal_event(ev);
+	}
+	if ((rc != 0) && (hdlr != NULL))
+		(*hdlr)(user_token, rc);
+}
+EXPORT_SYMBOL(mf_deallocate_lp_events);
+
+/*
+ * Global kernel interface to tell the VSP object in the primary
+ * partition to power this partition off.
+ */
+void mf_power_off(void)
+{
+	printk(KERN_INFO "mf.c: Down it goes...\n");
+	signal_ce_msg_simple(0x4d, NULL);
+	for (;;)
+		;
+}
+
+/*
+ * Global kernel interface to tell the VSP object in the primary
+ * partition to reboot this partition.
+ */
+void mf_reboot(void)
+{
+	printk(KERN_INFO "mf.c: Preparing to bounce...\n");
+	signal_ce_msg_simple(0x4e, NULL);
+	for (;;)
+		;
+}
+
+/*
+ * Display a single word SRC onto the VSP control panel.
+ */
+void mf_display_src(u32 word)
+{
+	u8 ce[12];
+
+	memset(ce, 0, sizeof(ce));
+	ce[3] = 0x4a;
+	ce[7] = 0x01;
+	ce[8] = word >> 24;
+	ce[9] = word >> 16;
+	ce[10] = word >> 8;
+	ce[11] = word;
+	signal_ce_msg(ce, NULL);
+}
+
+/*
+ * Display a single word SRC of the form "PROGXXXX" on the VSP control panel.
+ */
+void mf_display_progress(u16 value)
+{
+	u8 ce[12];
+	u8 src[72];
+
+	memcpy(ce, "\x00\x00\x04\x4A\x00\x00\x00\x48\x00\x00\x00\x00", 12);
+	memcpy(src, "\x01\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+		"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+		"\x00\x00\x00\x00PROGxxxx                        ",
+		72);
+	src[6] = value >> 8;
+	src[7] = value & 255;
+	src[44] = "0123456789ABCDEF"[(value >> 12) & 15];
+	src[45] = "0123456789ABCDEF"[(value >> 8) & 15];
+	src[46] = "0123456789ABCDEF"[(value >> 4) & 15];
+	src[47] = "0123456789ABCDEF"[value & 15];
+	dma_and_signal_ce_msg(ce, NULL, src, sizeof(src), 9 * 64 * 1024);
+}
+
+/*
+ * Clear the VSP control panel.  Used to "erase" an SRC that was
+ * previously displayed.
+ */
+void mf_clear_src(void)
+{
+	signal_ce_msg_simple(0x4b, NULL);
+}
+
+/*
+ * Initialization code here.
+ */
+void mf_init(void)
+{
+	int i;
+
+	/* initialize */
+	spin_lock_init(&pending_event_spinlock);
+	for (i = 0;
+	     i < sizeof(pending_event_prealloc) / sizeof(*pending_event_prealloc);
+	     ++i)
+		free_pending_event(&pending_event_prealloc[i]);
+	HvLpEvent_registerHandler(HvLpEvent_Type_MachineFac, &hv_handler);
+
+	/* virtual continue ack */
+	signal_ce_msg_simple(0x57, NULL);
+
+	/* initialization complete */
+	printk(KERN_NOTICE "mf.c: iSeries Linux LPAR Machine Facilities "
+			"initialized\n");
+}
+
+struct rtc_time_data {
+	struct completion com;
+	struct ce_msg_data ce_msg;
+	int rc;
+};
+
+static void get_rtc_time_complete(void *token, struct ce_msg_data *ce_msg)
+{
+	struct rtc_time_data *rtc = token;
+
+	memcpy(&rtc->ce_msg, ce_msg, sizeof(rtc->ce_msg));
+	rtc->rc = 0;
+	complete(&rtc->com);
+}
+
+int mf_get_rtc(struct rtc_time *tm)
+{
+	struct ce_msg_comp_data ce_complete;
+	struct rtc_time_data rtc_data;
+	int rc;
+
+	memset(&ce_complete, 0, sizeof(ce_complete));
+	memset(&rtc_data, 0, sizeof(rtc_data));
+	init_completion(&rtc_data.com);
+	ce_complete.handler = &get_rtc_time_complete;
+	ce_complete.token = &rtc_data;
+	rc = signal_ce_msg_simple(0x40, &ce_complete);
+	if (rc)
+		return rc;
+	wait_for_completion(&rtc_data.com);
+	tm->tm_wday = 0;
+	tm->tm_yday = 0;
+	tm->tm_isdst = 0;
+	if (rtc_data.rc) {
+		tm->tm_sec = 0;
+		tm->tm_min = 0;
+		tm->tm_hour = 0;
+		tm->tm_mday = 15;
+		tm->tm_mon = 5;
+		tm->tm_year = 52;
+		return rtc_data.rc;
+	}
+
+	if ((rtc_data.ce_msg.ce_msg[2] == 0xa9) ||
+	    (rtc_data.ce_msg.ce_msg[2] == 0xaf)) {
+		/* TOD clock is not set */
+		tm->tm_sec = 1;
+		tm->tm_min = 1;
+		tm->tm_hour = 1;
+		tm->tm_mday = 10;
+		tm->tm_mon = 8;
+		tm->tm_year = 71;
+		mf_set_rtc(tm);
+	}
+	{
+		u8 *ce_msg = rtc_data.ce_msg.ce_msg;
+		u8 year = ce_msg[5];
+		u8 sec = ce_msg[6];
+		u8 min = ce_msg[7];
+		u8 hour = ce_msg[8];
+		u8 day = ce_msg[10];
+		u8 mon = ce_msg[11];
+
+		BCD_TO_BIN(sec);
+		BCD_TO_BIN(min);
+		BCD_TO_BIN(hour);
+		BCD_TO_BIN(day);
+		BCD_TO_BIN(mon);
+		BCD_TO_BIN(year);
+
+		if (year <= 69)
+			year += 100;
+
+		tm->tm_sec = sec;
+		tm->tm_min = min;
+		tm->tm_hour = hour;
+		tm->tm_mday = day;
+		tm->tm_mon = mon;
+		tm->tm_year = year;
+	}
+
+	return 0;
+}
+
+int mf_set_rtc(struct rtc_time *tm)
+{
+	char ce_time[12];
+	u8 day, mon, hour, min, sec, y1, y2;
+	unsigned year;
+
+	year = 1900 + tm->tm_year;
+	y1 = year / 100;
+	y2 = year % 100;
+
+	sec = tm->tm_sec;
+	min = tm->tm_min;
+	hour = tm->tm_hour;
+	day = tm->tm_mday;
+	mon = tm->tm_mon + 1;
+
+	BIN_TO_BCD(sec);
+	BIN_TO_BCD(min);
+	BIN_TO_BCD(hour);
+	BIN_TO_BCD(mon);
+	BIN_TO_BCD(day);
+	BIN_TO_BCD(y1);
+	BIN_TO_BCD(y2);
+
+	memset(ce_time, 0, sizeof(ce_time));
+	ce_time[3] = 0x41;
+	ce_time[4] = y1;
+	ce_time[5] = y2;
+	ce_time[6] = sec;
+	ce_time[7] = min;
+	ce_time[8] = hour;
+	ce_time[10] = day;
+	ce_time[11] = mon;
+
+	return signal_ce_msg(ce_time, NULL);
+}
+
+#ifdef CONFIG_PROC_FS
+
+static int proc_mf_dump_cmdline(char *page, char **start, off_t off,
+		int count, int *eof, void *data)
+{
+	int len;
+	char *p;
+	struct vsp_cmd_data vsp_cmd;
+	int rc;
+	dma_addr_t dma_addr;
+
+	/* The HV appears to return no more than 256 bytes of command line */
+	if (off >= 256)
+		return 0;
+	if ((off + count) > 256)
+		count = 256 - off;
+
+	dma_addr = dma_map_single(iSeries_vio_dev, page, off + count,
+			DMA_FROM_DEVICE);
+	if (dma_mapping_error(dma_addr))
+		return -ENOMEM;
+	memset(page, 0, off + count);
+	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
+	vsp_cmd.cmd = 33;
+	vsp_cmd.sub_data.kern.token = dma_addr;
+	vsp_cmd.sub_data.kern.address_type = HvLpDma_AddressType_TceIndex;
+	vsp_cmd.sub_data.kern.side = (u64)data;
+	vsp_cmd.sub_data.kern.length = off + count;
+	mb();
+	rc = signal_vsp_instruction(&vsp_cmd);
+	dma_unmap_single(iSeries_vio_dev, dma_addr, off + count,
+			DMA_FROM_DEVICE);
+	if (rc)
+		return rc;
+	if (vsp_cmd.result_code != 0)
+		return -ENOMEM;
+	p = page;
+	len = 0;
+	while (len < (off + count)) {
+		if ((*p == '\0') || (*p == '\n')) {
+			if (*p == '\0')
+				*p = '\n';
+			p++;
+			len++;
+			*eof = 1;
+			break;
+		}
+		p++;
+		len++;
+	}
+
+	if (len < off) {
+		*eof = 1;
+		len = 0;
+	}
+	return len;
+}
+
+#if 0
+static int mf_getVmlinuxChunk(char *buffer, int *size, int offset, u64 side)
+{
+	struct vsp_cmd_data vsp_cmd;
+	int rc;
+	int len = *size;
+	dma_addr_t dma_addr;
+
+	dma_addr = dma_map_single(iSeries_vio_dev, buffer, len,
+			DMA_FROM_DEVICE);
+	memset(buffer, 0, len);
+	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
+	vsp_cmd.cmd = 32;
+	vsp_cmd.sub_data.kern.token = dma_addr;
+	vsp_cmd.sub_data.kern.address_type = HvLpDma_AddressType_TceIndex;
+	vsp_cmd.sub_data.kern.side = side;
+	vsp_cmd.sub_data.kern.offset = offset;
+	vsp_cmd.sub_data.kern.length = len;
+	mb();
+	rc = signal_vsp_instruction(&vsp_cmd);
+	if (rc == 0) {
+		if (vsp_cmd.result_code == 0)
+			*size = vsp_cmd.sub_data.length_out;
+		else
+			rc = -ENOMEM;
+	}
+
+	dma_unmap_single(iSeries_vio_dev, dma_addr, len, DMA_FROM_DEVICE);
+
+	return rc;
+}
+
+static int proc_mf_dump_vmlinux(char *page, char **start, off_t off,
+		int count, int *eof, void *data)
+{
+	int sizeToGet = count;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (mf_getVmlinuxChunk(page, &sizeToGet, off, (u64)data) == 0) {
+		if (sizeToGet != 0) {
+			*start = page + off;
+			return sizeToGet;
+		}
+		*eof = 1;
+		return 0;
+	}
+	*eof = 1;
+	return 0;
+}
+#endif
+
+static int proc_mf_dump_side(char *page, char **start, off_t off,
+		int count, int *eof, void *data)
+{
+	int len;
+	char mf_current_side = ' ';
+	struct vsp_cmd_data vsp_cmd;
+
+	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
+	vsp_cmd.cmd = 2;
+	vsp_cmd.sub_data.ipl_type = 0;
+	mb();
+
+	if (signal_vsp_instruction(&vsp_cmd) == 0) {
+		if (vsp_cmd.result_code == 0) {
+			switch (vsp_cmd.sub_data.ipl_type) {
+			case 0:	mf_current_side = 'A';
+				break;
+			case 1:	mf_current_side = 'B';
+				break;
+			case 2:	mf_current_side = 'C';
+				break;
+			default:	mf_current_side = 'D';
+				break;
+			}
+		}
+	}
+
+	len = sprintf(page, "%c\n", mf_current_side);
+
+	if (len <= (off + count))
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len > count)
+		len = count;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+
+static int proc_mf_change_side(struct file *file, const char __user *buffer,
+		unsigned long count, void *data)
+{
+	char side;
+	u64 newSide;
+	struct vsp_cmd_data vsp_cmd;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (count == 0)
+		return 0;
+
+	if (get_user(side, buffer))
+		return -EFAULT;
+
+	switch (side) {
+	case 'A':	newSide = 0;
+			break;
+	case 'B':	newSide = 1;
+			break;
+	case 'C':	newSide = 2;
+			break;
+	case 'D':	newSide = 3;
+			break;
+	default:
+		printk(KERN_ERR "mf_proc.c: proc_mf_change_side: invalid side\n");
+		return -EINVAL;
+	}
+
+	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
+	vsp_cmd.sub_data.ipl_type = newSide;
+	vsp_cmd.cmd = 10;
+
+	(void)signal_vsp_instruction(&vsp_cmd);
+
+	return count;
+}
+
+#if 0
+static void mf_getSrcHistory(char *buffer, int size)
+{
+	struct IplTypeReturnStuff return_stuff;
+	struct pending_event *ev = new_pending_event();
+	int rc = 0;
+	char *pages[4];
+
+	pages[0] = kmalloc(4096, GFP_ATOMIC);
+	pages[1] = kmalloc(4096, GFP_ATOMIC);
+	pages[2] = kmalloc(4096, GFP_ATOMIC);
+	pages[3] = kmalloc(4096, GFP_ATOMIC);
+	if ((ev == NULL) || (pages[0] == NULL) || (pages[1] == NULL)
+			 || (pages[2] == NULL) || (pages[3] == NULL))
+		return -ENOMEM;
+
+	return_stuff.xType = 0;
+	return_stuff.xRc = 0;
+	return_stuff.xDone = 0;
+	ev->event.hp_lp_event.xSubtype = 6;
+	ev->event.hp_lp_event.x.xSubtypeData =
+		subtype_data('M', 'F', 'V', 'I');
+	ev->event.data.vsp_cmd.xEvent = &return_stuff;
+	ev->event.data.vsp_cmd.cmd = 4;
+	ev->event.data.vsp_cmd.lp_index = HvLpConfig_getLpIndex();
+	ev->event.data.vsp_cmd.result_code = 0xFF;
+	ev->event.data.vsp_cmd.reserved = 0;
+	ev->event.data.vsp_cmd.sub_data.page[0] = ISERIES_HV_ADDR(pages[0]);
+	ev->event.data.vsp_cmd.sub_data.page[1] = ISERIES_HV_ADDR(pages[1]);
+	ev->event.data.vsp_cmd.sub_data.page[2] = ISERIES_HV_ADDR(pages[2]);
+	ev->event.data.vsp_cmd.sub_data.page[3] = ISERIES_HV_ADDR(pages[3]);
+	mb();
+	if (signal_event(ev) != 0)
+		return;
+
+ 	while (return_stuff.xDone != 1)
+ 		udelay(10);
+ 	if (return_stuff.xRc == 0)
+ 		memcpy(buffer, pages[0], size);
+	kfree(pages[0]);
+	kfree(pages[1]);
+	kfree(pages[2]);
+	kfree(pages[3]);
+}
+#endif
+
+static int proc_mf_dump_src(char *page, char **start, off_t off,
+		int count, int *eof, void *data)
+{
+#if 0
+	int len;
+
+	mf_getSrcHistory(page, count);
+	len = count;
+	len -= off;
+	if (len < count) {
+		*eof = 1;
+		if (len <= 0)
+			return 0;
+	} else
+		len = count;
+	*start = page + off;
+	return len;
+#else
+	return 0;
+#endif
+}
+
+static int proc_mf_change_src(struct file *file, const char __user *buffer,
+		unsigned long count, void *data)
+{
+	char stkbuf[10];
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if ((count < 4) && (count != 1)) {
+		printk(KERN_ERR "mf_proc: invalid src\n");
+		return -EINVAL;
+	}
+
+	if (count > (sizeof(stkbuf) - 1))
+		count = sizeof(stkbuf) - 1;
+	if (copy_from_user(stkbuf, buffer, count))
+		return -EFAULT;
+
+	if ((count == 1) && (*stkbuf == '\0'))
+		mf_clear_src();
+	else
+		mf_display_src(*(u32 *)stkbuf);
+
+	return count;
+}
+
+static int proc_mf_change_cmdline(struct file *file, const char __user *buffer,
+		unsigned long count, void *data)
+{
+	struct vsp_cmd_data vsp_cmd;
+	dma_addr_t dma_addr;
+	char *page;
+	int ret = -EACCES;
+
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	dma_addr = 0;
+	page = dma_alloc_coherent(iSeries_vio_dev, count, &dma_addr,
+			GFP_ATOMIC);
+	ret = -ENOMEM;
+	if (page == NULL)
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(page, buffer, count))
+		goto out_free;
+
+	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
+	vsp_cmd.cmd = 31;
+	vsp_cmd.sub_data.kern.token = dma_addr;
+	vsp_cmd.sub_data.kern.address_type = HvLpDma_AddressType_TceIndex;
+	vsp_cmd.sub_data.kern.side = (u64)data;
+	vsp_cmd.sub_data.kern.length = count;
+	mb();
+	(void)signal_vsp_instruction(&vsp_cmd);
+	ret = count;
+
+out_free:
+	dma_free_coherent(iSeries_vio_dev, count, page, dma_addr);
+out:
+	return ret;
+}
+
+static ssize_t proc_mf_change_vmlinux(struct file *file,
+				      const char __user *buf,
+				      size_t count, loff_t *ppos)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	ssize_t rc;
+	dma_addr_t dma_addr;
+	char *page;
+	struct vsp_cmd_data vsp_cmd;
+
+	rc = -EACCES;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	dma_addr = 0;
+	page = dma_alloc_coherent(iSeries_vio_dev, count, &dma_addr,
+			GFP_ATOMIC);
+	rc = -ENOMEM;
+	if (page == NULL) {
+		printk(KERN_ERR "mf.c: couldn't allocate memory to set vmlinux chunk\n");
+		goto out;
+	}
+	rc = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto out_free;
+
+	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
+	vsp_cmd.cmd = 30;
+	vsp_cmd.sub_data.kern.token = dma_addr;
+	vsp_cmd.sub_data.kern.address_type = HvLpDma_AddressType_TceIndex;
+	vsp_cmd.sub_data.kern.side = (u64)dp->data;
+	vsp_cmd.sub_data.kern.offset = *ppos;
+	vsp_cmd.sub_data.kern.length = count;
+	mb();
+	rc = signal_vsp_instruction(&vsp_cmd);
+	if (rc)
+		goto out_free;
+	rc = -ENOMEM;
+	if (vsp_cmd.result_code != 0)
+		goto out_free;
+
+	*ppos += count;
+	rc = count;
+out_free:
+	dma_free_coherent(iSeries_vio_dev, count, page, dma_addr);
+out:
+	return rc;
+}
+
+static struct file_operations proc_vmlinux_operations = {
+	.write		= proc_mf_change_vmlinux,
+};
+
+static int __init mf_proc_init(void)
+{
+	struct proc_dir_entry *mf_proc_root;
+	struct proc_dir_entry *ent;
+	struct proc_dir_entry *mf;
+	char name[2];
+	int i;
+
+	mf_proc_root = proc_mkdir("iSeries/mf", NULL);
+	if (!mf_proc_root)
+		return 1;
+
+	name[1] = '\0';
+	for (i = 0; i < 4; i++) {
+		name[0] = 'A' + i;
+		mf = proc_mkdir(name, mf_proc_root);
+		if (!mf)
+			return 1;
+
+		ent = create_proc_entry("cmdline", S_IFREG|S_IRUSR|S_IWUSR, mf);
+		if (!ent)
+			return 1;
+		ent->nlink = 1;
+		ent->data = (void *)(long)i;
+		ent->read_proc = proc_mf_dump_cmdline;
+		ent->write_proc = proc_mf_change_cmdline;
+
+		if (i == 3)	/* no vmlinux entry for 'D' */
+			continue;
+
+		ent = create_proc_entry("vmlinux", S_IFREG|S_IWUSR, mf);
+		if (!ent)
+			return 1;
+		ent->nlink = 1;
+		ent->data = (void *)(long)i;
+		ent->proc_fops = &proc_vmlinux_operations;
+	}
+
+	ent = create_proc_entry("side", S_IFREG|S_IRUSR|S_IWUSR, mf_proc_root);
+	if (!ent)
+		return 1;
+	ent->nlink = 1;
+	ent->data = (void *)0;
+	ent->read_proc = proc_mf_dump_side;
+	ent->write_proc = proc_mf_change_side;
+
+	ent = create_proc_entry("src", S_IFREG|S_IRUSR|S_IWUSR, mf_proc_root);
+	if (!ent)
+		return 1;
+	ent->nlink = 1;
+	ent->data = (void *)0;
+	ent->read_proc = proc_mf_dump_src;
+	ent->write_proc = proc_mf_change_src;
+
+	return 0;
+}
+
+__initcall(mf_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S
new file mode 100644
index 00000000000..90b41f48d21
--- /dev/null
+++ b/arch/ppc64/kernel/misc.S
@@ -0,0 +1,1234 @@
+/*
+ *  arch/ppc/kernel/misc.S
+ *
+ *  
+ *
+ * This file contains miscellaneous low-level functions.
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com)
+ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) 
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/sys.h>
+#include <asm/unistd.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cache.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/cputable.h>
+
+	.text
+
+/*
+ * Returns (address we're running at) - (address we were linked at)
+ * for use before the text and data are mapped to KERNELBASE.
+ */
+
+_GLOBAL(reloc_offset)
+	mflr	r0
+	bl	1f
+1:	mflr	r3
+	LOADADDR(r4,1b)
+	sub	r3,r4,r3
+	mtlr	r0
+	blr
+
+_GLOBAL(get_msr)
+	mfmsr	r3
+	blr
+
+_GLOBAL(get_dar)
+	mfdar	r3
+	blr
+
+_GLOBAL(get_srr0)
+	mfsrr0  r3
+	blr
+
+_GLOBAL(get_srr1)
+	mfsrr1  r3
+	blr
+	
+_GLOBAL(get_sp)
+	mr	r3,r1
+	blr
+		
+#ifdef CONFIG_PPC_ISERIES
+/* unsigned long local_save_flags(void) */
+_GLOBAL(local_get_flags)
+	lbz	r3,PACAPROCENABLED(r13)
+	blr
+
+/* unsigned long local_irq_disable(void) */
+_GLOBAL(local_irq_disable)
+	lbz	r3,PACAPROCENABLED(r13)
+	li	r4,0
+	stb	r4,PACAPROCENABLED(r13)
+	blr			/* Done */
+
+/* void local_irq_restore(unsigned long flags) */	
+_GLOBAL(local_irq_restore)
+	lbz	r5,PACAPROCENABLED(r13)
+	 /* Check if things are setup the way we want _already_. */
+	cmpw	0,r3,r5
+	beqlr
+	/* are we enabling interrupts? */
+	cmpdi	0,r3,0
+	stb	r3,PACAPROCENABLED(r13)
+	beqlr
+	/* Check pending interrupts */
+	/*   A decrementer, IPI or PMC interrupt may have occurred
+	 *   while we were in the hypervisor (which enables) */
+	ld	r4,PACALPPACA+LPPACAANYINT(r13)
+	cmpdi	r4,0
+	beqlr
+
+	/*
+	 * Handle pending interrupts in interrupt context
+	 */
+	li	r0,0x5555
+	sc
+	blr
+#endif /* CONFIG_PPC_ISERIES */
+
+#ifdef CONFIG_IRQSTACKS
+_GLOBAL(call_do_softirq)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,THREAD_SIZE-112(r3)
+	mr	r1,r3
+	bl	.__do_softirq
+	ld	r1,0(r1)
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
+_GLOBAL(call_handle_IRQ_event)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,THREAD_SIZE-112(r6)
+	mr	r1,r6
+	bl	.handle_IRQ_event
+	ld	r1,0(r1)
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+#endif /* CONFIG_IRQSTACKS */
+
+	/*
+ * To be called by C code which needs to do some operations with MMU
+ * disabled. Note that interrupts have to be disabled by the caller
+ * prior to calling us. The code called _MUST_ be in the RMO of course
+ * and part of the linear mapping as we don't attempt to translate the
+ * stack pointer at all. The function is called with the stack switched
+ * to this CPU emergency stack
+ *
+ * prototype is void *call_with_mmu_off(void *func, void *data);
+ *
+ * the called function is expected to be of the form
+ *
+ * void *called(void *data); 
+ */
+_GLOBAL(call_with_mmu_off)
+	mflr	r0			/* get link, save it on stackframe */
+	std	r0,16(r1)
+	mr	r1,r5			/* save old stack ptr */
+	ld	r1,PACAEMERGSP(r13)	/* get emerg. stack */
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+	std	r0,16(r1)		/* save link on emerg. stack */
+	std	r5,0(r1)		/* save old stack ptr in backchain */
+	ld	r3,0(r3)		/* get to real function ptr (assume same TOC) */
+	bl	2f			/* we need LR to return, continue at label 2 */
+
+	ld	r0,16(r1)		/* we return here from the call, get LR and */
+	ld	r1,0(r1)		/* .. old stack ptr */
+	mtspr	SPRN_SRR0,r0		/* and get back to virtual mode with these */
+	mfmsr	r4
+	ori	r4,r4,MSR_IR|MSR_DR
+	mtspr	SPRN_SRR1,r4
+	rfid
+
+2:	mtspr	SPRN_SRR0,r3		/* coming from above, enter real mode */
+	mr	r3,r4			/* get parameter */
+	mfmsr	r0
+	ori	r0,r0,MSR_IR|MSR_DR
+	xori	r0,r0,MSR_IR|MSR_DR
+	mtspr	SPRN_SRR1,r0
+	rfid
+
+
+	.section	".toc","aw"
+PPC64_CACHES:
+	.tc		ppc64_caches[TC],ppc64_caches
+	.section	".text"
+
+/*
+ * Write any modified data cache blocks out to memory
+ * and invalidate the corresponding instruction cache blocks.
+ *
+ * flush_icache_range(unsigned long start, unsigned long stop)
+ *
+ *   flush all bytes from start through stop-1 inclusive
+ */
+
+_GLOBAL(__flush_icache_range)
+
+/*
+ * Flush the data cache to memory 
+ * 
+ * Different systems have different cache line sizes
+ * and in some cases i-cache and d-cache line sizes differ from
+ * each other.
+ */
+ 	ld	r10,PPC64_CACHES@toc(r2)
+	lwz	r7,DCACHEL1LINESIZE(r10)/* Get cache line size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of cache line size */
+	srw.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+1:	dcbst	0,r6
+	add	r6,r6,r7
+	bdnz	1b
+	sync
+
+/* Now invalidate the instruction cache */
+	
+	lwz	r7,ICACHEL1LINESIZE(r10)	/* Get Icache line size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5
+	lwz	r9,ICACHEL1LOGLINESIZE(r10)	/* Get log-2 of Icache line size */
+	srw.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+2:	icbi	0,r6
+	add	r6,r6,r7
+	bdnz	2b
+	isync
+	blr
+	
+/*
+ * Like above, but only do the D-cache.
+ *
+ * flush_dcache_range(unsigned long start, unsigned long stop)
+ *
+ *    flush all bytes from start to stop-1 inclusive
+ */
+_GLOBAL(flush_dcache_range)
+
+/*
+ * Flush the data cache to memory 
+ * 
+ * Different systems have different cache line sizes
+ */
+ 	ld	r10,PPC64_CACHES@toc(r2)
+	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
+	srw.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+0:	dcbst	0,r6
+	add	r6,r6,r7
+	bdnz	0b
+	sync
+	blr
+
+/*
+ * Like above, but works on non-mapped physical addresses.
+ * Use only for non-LPAR setups ! It also assumes real mode
+ * is cacheable. Used for flushing out the DART before using
+ * it as uncacheable memory 
+ *
+ * flush_dcache_phys_range(unsigned long start, unsigned long stop)
+ *
+ *    flush all bytes from start to stop-1 inclusive
+ */
+_GLOBAL(flush_dcache_phys_range)
+ 	ld	r10,PPC64_CACHES@toc(r2)
+	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
+	srw.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	mfmsr	r5			/* Disable MMU Data Relocation */
+	ori	r0,r5,MSR_DR
+	xori	r0,r0,MSR_DR
+	sync
+	mtmsr	r0
+	sync
+	isync
+	mtctr	r8
+0:	dcbst	0,r6
+	add	r6,r6,r7
+	bdnz	0b
+	sync
+	isync
+	mtmsr	r5			/* Re-enable MMU Data Relocation */
+	sync
+	isync
+	blr
+
+_GLOBAL(flush_inval_dcache_range)
+ 	ld	r10,PPC64_CACHES@toc(r2)
+	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,DCACHEL1LOGLINESIZE(r10)/* Get log-2 of dcache line size */
+	srw.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	sync
+	isync
+	mtctr	r8
+0:	dcbf	0,r6
+	add	r6,r6,r7
+	bdnz	0b
+	sync
+	isync
+	blr
+
+
+/*
+ * Flush a particular page from the data cache to RAM.
+ * Note: this is necessary because the instruction cache does *not*
+ * snoop from the data cache.
+ *
+ *	void __flush_dcache_icache(void *page)
+ */
+_GLOBAL(__flush_dcache_icache)
+/*
+ * Flush the data cache to memory 
+ * 
+ * Different systems have different cache line sizes
+ */
+
+/* Flush the dcache */
+ 	ld	r7,PPC64_CACHES@toc(r2)
+	clrrdi	r3,r3,12           	    /* Page align */
+	lwz	r4,DCACHEL1LINESPERPAGE(r7)	/* Get # dcache lines per page */
+	lwz	r5,DCACHEL1LINESIZE(r7)		/* Get dcache line size */
+	mr	r6,r3
+	mtctr	r4
+0:	dcbst	0,r6
+	add	r6,r6,r5
+	bdnz	0b
+	sync
+
+/* Now invalidate the icache */	
+
+	lwz	r4,ICACHEL1LINESPERPAGE(r7)	/* Get # icache lines per page */
+	lwz	r5,ICACHEL1LINESIZE(r7)		/* Get icache line size */
+	mtctr	r4
+1:	icbi	0,r3
+	add	r3,r3,r5
+	bdnz	1b
+	isync
+	blr
+	
+/*
+ * I/O string operations
+ *
+ * insb(port, buf, len)
+ * outsb(port, buf, len)
+ * insw(port, buf, len)
+ * outsw(port, buf, len)
+ * insl(port, buf, len)
+ * outsl(port, buf, len)
+ * insw_ns(port, buf, len)
+ * outsw_ns(port, buf, len)
+ * insl_ns(port, buf, len)
+ * outsl_ns(port, buf, len)
+ *
+ * The *_ns versions don't do byte-swapping.
+ */
+_GLOBAL(_insb)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,1
+	blelr-
+00:	lbz	r5,0(r3)
+	eieio
+	stbu	r5,1(r4)
+	bdnz	00b
+	twi	0,r5,0
+	isync
+	blr
+
+_GLOBAL(_outsb)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,1
+	blelr-
+00:	lbzu	r5,1(r4)
+	stb	r5,0(r3)
+	bdnz	00b
+	sync
+	blr	
+
+_GLOBAL(_insw)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,2
+	blelr-
+00:	lhbrx	r5,0,r3
+	eieio
+	sthu	r5,2(r4)
+	bdnz	00b
+	twi	0,r5,0
+	isync
+	blr
+
+_GLOBAL(_outsw)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,2
+	blelr-
+00:	lhzu	r5,2(r4)
+	sthbrx	r5,0,r3	
+	bdnz	00b
+	sync
+	blr	
+
+_GLOBAL(_insl)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,4
+	blelr-
+00:	lwbrx	r5,0,r3
+	eieio
+	stwu	r5,4(r4)
+	bdnz	00b
+	twi	0,r5,0
+	isync
+	blr
+
+_GLOBAL(_outsl)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,4
+	blelr-
+00:	lwzu	r5,4(r4)
+	stwbrx	r5,0,r3
+	bdnz	00b
+	sync
+	blr	
+
+/* _GLOBAL(ide_insw) now in drivers/ide/ide-iops.c */
+_GLOBAL(_insw_ns)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,2
+	blelr-
+00:	lhz	r5,0(r3)
+	eieio
+	sthu	r5,2(r4)
+	bdnz	00b
+	twi	0,r5,0
+	isync
+	blr
+
+/* _GLOBAL(ide_outsw) now in drivers/ide/ide-iops.c */
+_GLOBAL(_outsw_ns)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,2
+	blelr-
+00:	lhzu	r5,2(r4)
+	sth	r5,0(r3)
+	bdnz	00b
+	sync
+	blr	
+
+_GLOBAL(_insl_ns)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,4
+	blelr-
+00:	lwz	r5,0(r3)
+	eieio
+	stwu	r5,4(r4)
+	bdnz	00b
+	twi	0,r5,0
+	isync
+	blr
+
+_GLOBAL(_outsl_ns)
+	cmpwi	0,r5,0
+	mtctr	r5
+	subi	r4,r4,4
+	blelr-
+00:	lwzu	r5,4(r4)
+	stw	r5,0(r3)
+	bdnz	00b
+	sync
+	blr	
+
+
+_GLOBAL(cvt_fd)
+	lfd	0,0(r5)		/* load up fpscr value */
+	mtfsf	0xff,0
+	lfs	0,0(r3)
+	stfd	0,0(r4)
+	mffs	0		/* save new fpscr value */
+	stfd	0,0(r5)
+	blr
+
+_GLOBAL(cvt_df)
+	lfd	0,0(r5)		/* load up fpscr value */
+	mtfsf	0xff,0
+	lfd	0,0(r3)
+	stfs	0,0(r4)
+	mffs	0		/* save new fpscr value */
+	stfd	0,0(r5)
+	blr
+
+/*
+ * identify_cpu and calls setup_cpu
+ * In:	r3 = base of the cpu_specs array
+ *	r4 = address of cur_cpu_spec
+ *	r5 = relocation offset
+ */
+_GLOBAL(identify_cpu)
+	mfpvr	r7
+1:
+	lwz	r8,CPU_SPEC_PVR_MASK(r3)
+	and	r8,r8,r7
+	lwz	r9,CPU_SPEC_PVR_VALUE(r3)
+	cmplw	0,r9,r8
+	beq	1f
+	addi	r3,r3,CPU_SPEC_ENTRY_SIZE
+	b	1b
+1:
+	add	r0,r3,r5
+	std	r0,0(r4)
+	ld	r4,CPU_SPEC_SETUP(r3)
+	sub	r4,r4,r5
+	ld	r4,0(r4)
+	sub	r4,r4,r5
+	mtctr	r4
+	/* Calling convention for cpu setup is r3=offset, r4=cur_cpu_spec */
+	mr	r4,r3
+	mr	r3,r5
+	bctr
+
+/*
+ * do_cpu_ftr_fixups - goes through the list of CPU feature fixups
+ * and writes nop's over sections of code that don't apply for this cpu.
+ * r3 = data offset (not changed)
+ */
+_GLOBAL(do_cpu_ftr_fixups)
+	/* Get CPU 0 features */
+	LOADADDR(r6,cur_cpu_spec)
+	sub	r6,r6,r3
+	ld	r4,0(r6)
+	sub	r4,r4,r3
+	ld	r4,CPU_SPEC_FEATURES(r4)
+	/* Get the fixup table */
+	LOADADDR(r6,__start___ftr_fixup)
+	sub	r6,r6,r3
+	LOADADDR(r7,__stop___ftr_fixup)
+	sub	r7,r7,r3
+	/* Do the fixup */
+1:	cmpld	r6,r7
+	bgelr
+	addi	r6,r6,32
+	ld	r8,-32(r6)	/* mask */
+	and	r8,r8,r4
+	ld	r9,-24(r6)	/* value */
+	cmpld	r8,r9
+	beq	1b
+	ld	r8,-16(r6)	/* section begin */
+	ld	r9,-8(r6)	/* section end */
+	subf.	r9,r8,r9
+	beq	1b
+	/* write nops over the section of code */
+	/* todo: if large section, add a branch at the start of it */
+	srwi	r9,r9,2
+	mtctr	r9
+	sub	r8,r8,r3
+	lis	r0,0x60000000@h	/* nop */
+3:	stw	r0,0(r8)
+	andi.	r10,r4,CPU_FTR_SPLIT_ID_CACHE@l
+	beq	2f
+	dcbst	0,r8		/* suboptimal, but simpler */
+	sync
+	icbi	0,r8
+2:	addi	r8,r8,4
+	bdnz	3b
+	sync			/* additional sync needed on g4 */
+	isync
+	b	1b
+
+#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
+/*
+ * Do an IO access in real mode
+ */
+_GLOBAL(real_readb)
+	mfmsr	r7
+	ori	r0,r7,MSR_DR
+	xori	r0,r0,MSR_DR
+	sync
+	mtmsrd	r0
+	sync
+	isync
+	mfspr	r6,SPRN_HID4
+	rldicl	r5,r6,32,0
+	ori	r5,r5,0x100
+	rldicl	r5,r5,32,0
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	lbz	r3,0(r3)
+	sync
+	mtspr	SPRN_HID4,r6
+	isync
+	slbia
+	isync
+	mtmsrd	r7
+	sync
+	isync
+	blr
+
+	/*
+ * Do an IO access in real mode
+ */
+_GLOBAL(real_writeb)
+	mfmsr	r7
+	ori	r0,r7,MSR_DR
+	xori	r0,r0,MSR_DR
+	sync
+	mtmsrd	r0
+	sync
+	isync
+	mfspr	r6,SPRN_HID4
+	rldicl	r5,r6,32,0
+	ori	r5,r5,0x100
+	rldicl	r5,r5,32,0
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	stb	r3,0(r4)
+	sync
+	mtspr	SPRN_HID4,r6
+	isync
+	slbia
+	isync
+	mtmsrd	r7
+	sync
+	isync
+	blr
+#endif /* defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) */
+
+/*
+ * Create a kernel thread
+ *   kernel_thread(fn, arg, flags)
+ */
+_GLOBAL(kernel_thread)
+	std	r29,-24(r1)
+	std	r30,-16(r1)
+	stdu	r1,-STACK_FRAME_OVERHEAD(r1)
+	mr	r29,r3
+	mr	r30,r4
+	ori	r3,r5,CLONE_VM	/* flags */
+	oris	r3,r3,(CLONE_UNTRACED>>16)
+	li	r4,0		/* new sp (unused) */
+	li	r0,__NR_clone
+	sc
+	cmpdi	0,r3,0		/* parent or child? */
+	bne	1f		/* return if parent */
+	li	r0,0
+	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
+	ld	r2,8(r29)
+	ld	r29,0(r29)
+	mtlr	r29              /* fn addr in lr */
+	mr	r3,r30	        /* load arg and call fn */
+	blrl
+	li	r0,__NR_exit	/* exit after child exits */
+        li	r3,0
+	sc
+1:	addi	r1,r1,STACK_FRAME_OVERHEAD	
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	blr
+
+/* Why isn't this a) automatic, b) written in 'C'? */	
+	.balign 8
+_GLOBAL(sys_call_table32)
+	.llong .sys_restart_syscall	/* 0 */
+	.llong .sys_exit
+	.llong .ppc_fork
+	.llong .sys_read
+	.llong .sys_write
+	.llong .sys32_open		/* 5 */
+	.llong .sys_close
+	.llong .sys32_waitpid
+	.llong .sys32_creat
+	.llong .sys_link
+	.llong .sys_unlink      	/* 10 */
+	.llong .sys32_execve
+	.llong .sys_chdir
+	.llong .compat_sys_time
+	.llong .sys_mknod
+	.llong .sys_chmod		/* 15 */
+	.llong .sys_lchown
+	.llong .sys_ni_syscall		/* old break syscall */
+	.llong .sys_ni_syscall		/* old stat syscall */
+	.llong .ppc32_lseek
+	.llong .sys_getpid              /* 20 */
+	.llong .compat_sys_mount
+	.llong .sys_oldumount
+	.llong .sys_setuid
+	.llong .sys_getuid
+	.llong .compat_sys_stime	/* 25 */
+	.llong .sys32_ptrace
+	.llong .sys_alarm
+	.llong .sys_ni_syscall		/* old fstat syscall */
+	.llong .sys32_pause
+	.llong .compat_sys_utime		/* 30 */
+	.llong .sys_ni_syscall		/* old stty syscall */
+	.llong .sys_ni_syscall		/* old gtty syscall */
+	.llong .sys32_access
+	.llong .sys32_nice
+	.llong .sys_ni_syscall		/* 35 - old ftime syscall */
+	.llong .sys_sync
+	.llong .sys32_kill
+	.llong .sys_rename
+	.llong .sys32_mkdir
+	.llong .sys_rmdir		/* 40 */
+	.llong .sys_dup
+	.llong .sys_pipe
+	.llong .compat_sys_times
+	.llong .sys_ni_syscall		/* old prof syscall */
+	.llong .sys_brk			/* 45 */
+	.llong .sys_setgid
+	.llong .sys_getgid
+	.llong .sys_signal
+	.llong .sys_geteuid
+	.llong .sys_getegid		/* 50 */
+	.llong .sys_acct
+	.llong .sys_umount
+	.llong .sys_ni_syscall		/* old lock syscall */
+	.llong .compat_sys_ioctl
+	.llong .compat_sys_fcntl		/* 55 */
+	.llong .sys_ni_syscall		/* old mpx syscall */
+	.llong .sys32_setpgid
+	.llong .sys_ni_syscall		/* old ulimit syscall */
+	.llong .sys32_olduname
+	.llong .sys32_umask		/* 60 */
+	.llong .sys_chroot
+	.llong .sys_ustat
+	.llong .sys_dup2
+	.llong .sys_getppid
+	.llong .sys_getpgrp	        /* 65 */
+	.llong .sys_setsid
+	.llong .sys32_sigaction
+	.llong .sys_sgetmask
+	.llong .sys32_ssetmask
+	.llong .sys_setreuid	        /* 70 */
+	.llong .sys_setregid
+	.llong .ppc32_sigsuspend
+	.llong .compat_sys_sigpending
+	.llong .sys32_sethostname
+	.llong .compat_sys_setrlimit	        /* 75 */
+	.llong .compat_sys_old_getrlimit
+	.llong .compat_sys_getrusage
+	.llong .sys32_gettimeofday
+	.llong .sys32_settimeofday
+	.llong .sys32_getgroups	        /* 80 */
+	.llong .sys32_setgroups
+	.llong .sys_ni_syscall		/* old select syscall */
+	.llong .sys_symlink
+	.llong .sys_ni_syscall		/* old lstat syscall */
+	.llong .sys32_readlink	        /* 85 */
+	.llong .sys_uselib
+	.llong .sys_swapon
+	.llong .sys_reboot
+	.llong .old32_readdir
+	.llong .sys_mmap		/* 90 */
+	.llong .sys_munmap
+	.llong .sys_truncate
+	.llong .sys_ftruncate
+	.llong .sys_fchmod
+	.llong .sys_fchown              /* 95 */
+	.llong .sys32_getpriority
+	.llong .sys32_setpriority
+	.llong .sys_ni_syscall		/* old profil syscall */
+	.llong .compat_sys_statfs
+	.llong .compat_sys_fstatfs		/* 100 */
+	.llong .sys_ni_syscall		/* old ioperm syscall */
+	.llong .compat_sys_socketcall
+	.llong .sys32_syslog
+	.llong .compat_sys_setitimer
+	.llong .compat_sys_getitimer		/* 105 */
+	.llong .compat_sys_newstat
+	.llong .compat_sys_newlstat
+	.llong .compat_sys_newfstat
+	.llong .sys_uname
+	.llong .sys_ni_syscall		/* 110 old iopl syscall */
+	.llong .sys_vhangup
+	.llong .sys_ni_syscall		/* old idle syscall */
+	.llong .sys_ni_syscall		/* old vm86 syscall */
+	.llong .compat_sys_wait4
+	.llong .sys_swapoff		/* 115 */
+	.llong .sys32_sysinfo
+	.llong .sys32_ipc
+	.llong .sys_fsync
+	.llong .ppc32_sigreturn
+	.llong .ppc_clone		/* 120 */
+	.llong .sys32_setdomainname
+	.llong .ppc64_newuname
+	.llong .sys_ni_syscall		/* old modify_ldt syscall */
+	.llong .sys32_adjtimex
+	.llong .sys_mprotect		/* 125 */
+	.llong .compat_sys_sigprocmask
+	.llong .sys_ni_syscall		/* old create_module syscall */
+	.llong .sys_init_module
+	.llong .sys_delete_module
+	.llong .sys_ni_syscall		/* 130 old get_kernel_syms syscall */
+	.llong .sys_quotactl
+	.llong .sys32_getpgid
+	.llong .sys_fchdir
+	.llong .sys_bdflush
+	.llong .sys32_sysfs		/* 135 */
+	.llong .ppc64_personality
+	.llong .sys_ni_syscall	        /* for afs_syscall */
+	.llong .sys_setfsuid
+	.llong .sys_setfsgid
+	.llong .sys_llseek	        /* 140 */
+        .llong .sys32_getdents
+	.llong .ppc32_select
+	.llong .sys_flock
+	.llong .sys_msync
+	.llong .compat_sys_readv	/* 145 */
+	.llong .compat_sys_writev
+	.llong .sys32_getsid
+	.llong .sys_fdatasync
+	.llong .sys32_sysctl
+	.llong .sys_mlock		/* 150 */
+	.llong .sys_munlock
+	.llong .sys_mlockall
+	.llong .sys_munlockall
+	.llong .sys32_sched_setparam
+	.llong .sys32_sched_getparam	/* 155 */
+	.llong .sys32_sched_setscheduler
+	.llong .sys32_sched_getscheduler
+	.llong .sys_sched_yield
+	.llong .sys32_sched_get_priority_max
+	.llong .sys32_sched_get_priority_min  /* 160 */
+	.llong .sys32_sched_rr_get_interval
+	.llong .compat_sys_nanosleep
+	.llong .sys_mremap
+	.llong .sys_setresuid
+	.llong .sys_getresuid	        /* 165 */
+	.llong .sys_ni_syscall		/* old query_module syscall */
+	.llong .sys_poll
+	.llong .compat_sys_nfsservctl
+	.llong .sys_setresgid
+	.llong .sys_getresgid	        /* 170 */
+	.llong .sys32_prctl
+	.llong .ppc32_rt_sigreturn
+	.llong .sys32_rt_sigaction
+	.llong .sys32_rt_sigprocmask
+	.llong .sys32_rt_sigpending     /* 175 */
+	.llong .compat_sys_rt_sigtimedwait
+	.llong .sys32_rt_sigqueueinfo
+	.llong .ppc32_rt_sigsuspend
+	.llong .sys32_pread64
+	.llong .sys32_pwrite64	        /* 180 */
+	.llong .sys_chown
+	.llong .sys_getcwd
+	.llong .sys_capget
+	.llong .sys_capset
+	.llong .sys32_sigaltstack	/* 185 */
+	.llong .sys32_sendfile
+	.llong .sys_ni_syscall		/* reserved for streams1 */
+	.llong .sys_ni_syscall		/* reserved for streams2 */
+	.llong .ppc_vfork
+	.llong .compat_sys_getrlimit	        /* 190 */
+	.llong .sys32_readahead
+	.llong .sys32_mmap2
+	.llong .sys32_truncate64
+	.llong .sys32_ftruncate64
+	.llong .sys_stat64      	/* 195 */
+	.llong .sys_lstat64
+	.llong .sys_fstat64
+	.llong .sys32_pciconfig_read
+	.llong .sys32_pciconfig_write
+	.llong .sys32_pciconfig_iobase	/* 200 - pciconfig_iobase */
+	.llong .sys_ni_syscall		/* reserved for MacOnLinux */
+	.llong .sys_getdents64
+	.llong .sys_pivot_root
+	.llong .compat_sys_fcntl64
+	.llong .sys_madvise		/* 205 */
+	.llong .sys_mincore
+	.llong .sys_gettid
+	.llong .sys_tkill
+	.llong .sys_setxattr
+	.llong .sys_lsetxattr		/* 210 */
+	.llong .sys_fsetxattr
+	.llong .sys_getxattr
+	.llong .sys_lgetxattr
+	.llong .sys_fgetxattr
+	.llong .sys_listxattr		/* 215 */
+	.llong .sys_llistxattr
+	.llong .sys_flistxattr
+	.llong .sys_removexattr
+	.llong .sys_lremovexattr
+	.llong .sys_fremovexattr	/* 220 */
+	.llong .compat_sys_futex
+	.llong .compat_sys_sched_setaffinity
+	.llong .compat_sys_sched_getaffinity
+	.llong .sys_ni_syscall
+	.llong .sys_ni_syscall		/* 225 - reserved for tux */
+	.llong .sys32_sendfile64
+	.llong .compat_sys_io_setup
+	.llong .sys_io_destroy
+	.llong .compat_sys_io_getevents
+	.llong .compat_sys_io_submit
+	.llong .sys_io_cancel
+	.llong .sys_set_tid_address
+	.llong .ppc32_fadvise64
+	.llong .sys_exit_group
+	.llong .ppc32_lookup_dcookie	/* 235 */
+	.llong .sys_epoll_create
+	.llong .sys_epoll_ctl
+	.llong .sys_epoll_wait
+	.llong .sys_remap_file_pages
+	.llong .ppc32_timer_create	/* 240 */
+	.llong .compat_sys_timer_settime
+	.llong .compat_sys_timer_gettime
+	.llong .sys_timer_getoverrun
+	.llong .sys_timer_delete
+	.llong .compat_sys_clock_settime	/* 245 */
+	.llong .compat_sys_clock_gettime
+	.llong .compat_sys_clock_getres
+	.llong .compat_sys_clock_nanosleep
+	.llong .ppc32_swapcontext
+	.llong .sys32_tgkill		/* 250 */
+	.llong .sys32_utimes
+	.llong .compat_sys_statfs64
+	.llong .compat_sys_fstatfs64
+	.llong .ppc32_fadvise64_64	/* 32bit only fadvise64_64 */
+	.llong .ppc_rtas		/* 255 */
+	.llong .sys_ni_syscall		/* 256 reserved for sys_debug_setcontext */
+	.llong .sys_ni_syscall		/* 257 reserved for vserver */
+	.llong .sys_ni_syscall		/* 258 reserved for new sys_remap_file_pages */
+	.llong .compat_sys_mbind
+	.llong .compat_sys_get_mempolicy	/* 260 */
+	.llong .compat_sys_set_mempolicy
+	.llong .compat_sys_mq_open
+	.llong .sys_mq_unlink
+	.llong .compat_sys_mq_timedsend
+	.llong .compat_sys_mq_timedreceive /* 265 */
+	.llong .compat_sys_mq_notify
+	.llong .compat_sys_mq_getsetattr
+	.llong .sys_ni_syscall		/* 268 reserved for sys_kexec_load */
+	.llong .sys32_add_key
+	.llong .sys32_request_key
+	.llong .compat_sys_keyctl
+	.llong .compat_sys_waitid
+
+	.balign 8
+_GLOBAL(sys_call_table)
+	.llong .sys_restart_syscall	/* 0 */
+	.llong .sys_exit
+	.llong .ppc_fork
+	.llong .sys_read
+	.llong .sys_write
+	.llong .sys_open		/* 5 */
+	.llong .sys_close
+	.llong .sys_waitpid
+	.llong .sys_creat
+	.llong .sys_link
+	.llong .sys_unlink		/* 10 */
+	.llong .sys_execve
+	.llong .sys_chdir
+	.llong .sys64_time
+	.llong .sys_mknod
+	.llong .sys_chmod		/* 15 */
+	.llong .sys_lchown
+	.llong .sys_ni_syscall		/* old break syscall */
+	.llong .sys_ni_syscall		/* old stat syscall */
+	.llong .sys_lseek
+	.llong .sys_getpid		/* 20 */
+	.llong .sys_mount
+	.llong .sys_ni_syscall		/* old umount syscall */
+	.llong .sys_setuid
+	.llong .sys_getuid
+	.llong .sys_stime		/* 25 */
+	.llong .sys_ptrace
+	.llong .sys_alarm
+	.llong .sys_ni_syscall		/* old fstat syscall */
+	.llong .sys_pause
+	.llong .sys_utime		/* 30 */
+	.llong .sys_ni_syscall		/* old stty syscall */
+	.llong .sys_ni_syscall		/* old gtty syscall */
+	.llong .sys_access
+	.llong .sys_nice
+	.llong .sys_ni_syscall		/* 35 - old ftime syscall */
+	.llong .sys_sync
+	.llong .sys_kill
+	.llong .sys_rename
+	.llong .sys_mkdir
+	.llong .sys_rmdir		/* 40 */
+	.llong .sys_dup
+	.llong .sys_pipe
+	.llong .sys_times
+	.llong .sys_ni_syscall		/* old prof syscall */
+	.llong .sys_brk			/* 45 */
+	.llong .sys_setgid
+	.llong .sys_getgid
+	.llong .sys_signal
+	.llong .sys_geteuid
+	.llong .sys_getegid		/* 50 */
+	.llong .sys_acct
+	.llong .sys_umount
+	.llong .sys_ni_syscall		/* old lock syscall */
+	.llong .sys_ioctl
+	.llong .sys_fcntl		/* 55 */
+	.llong .sys_ni_syscall		/* old mpx syscall */
+	.llong .sys_setpgid
+	.llong .sys_ni_syscall		/* old ulimit syscall */
+	.llong .sys_ni_syscall		/* old uname syscall */
+	.llong .sys_umask		/* 60 */
+	.llong .sys_chroot
+	.llong .sys_ustat
+	.llong .sys_dup2
+	.llong .sys_getppid
+	.llong .sys_getpgrp		/* 65 */
+	.llong .sys_setsid
+	.llong .sys_ni_syscall
+	.llong .sys_sgetmask
+	.llong .sys_ssetmask
+	.llong .sys_setreuid		/* 70 */
+	.llong .sys_setregid
+	.llong .sys_ni_syscall
+	.llong .sys_ni_syscall
+	.llong .sys_sethostname
+	.llong .sys_setrlimit		/* 75 */
+	.llong .sys_ni_syscall		/* old getrlimit syscall */
+	.llong .sys_getrusage
+	.llong .sys_gettimeofday
+	.llong .sys_settimeofday
+	.llong .sys_getgroups		/* 80 */
+	.llong .sys_setgroups
+	.llong .sys_ni_syscall		/* old select syscall */
+	.llong .sys_symlink
+	.llong .sys_ni_syscall		/* old lstat syscall */
+	.llong .sys_readlink		/* 85 */
+	.llong .sys_uselib
+	.llong .sys_swapon
+	.llong .sys_reboot
+	.llong .sys_ni_syscall		/* old readdir syscall */
+	.llong .sys_mmap		/* 90 */
+	.llong .sys_munmap
+	.llong .sys_truncate
+	.llong .sys_ftruncate
+	.llong .sys_fchmod
+	.llong .sys_fchown		/* 95 */
+	.llong .sys_getpriority
+	.llong .sys_setpriority
+	.llong .sys_ni_syscall		/* old profil syscall holder */
+	.llong .sys_statfs
+	.llong .sys_fstatfs		/* 100 */
+	.llong .sys_ni_syscall		/* old ioperm syscall */
+	.llong .sys_socketcall
+	.llong .sys_syslog
+	.llong .sys_setitimer
+	.llong .sys_getitimer		/* 105 */
+	.llong .sys_newstat
+	.llong .sys_newlstat
+	.llong .sys_newfstat
+	.llong .sys_ni_syscall		/* old uname syscall */
+	.llong .sys_ni_syscall		/* 110 old iopl syscall */
+	.llong .sys_vhangup
+	.llong .sys_ni_syscall		/* old idle syscall */
+	.llong .sys_ni_syscall		/* old vm86 syscall */
+	.llong .sys_wait4
+	.llong .sys_swapoff		/* 115 */
+	.llong .sys_sysinfo
+	.llong .sys_ipc
+	.llong .sys_fsync
+	.llong .sys_ni_syscall
+	.llong .ppc_clone		/* 120 */
+	.llong .sys_setdomainname
+	.llong .ppc64_newuname
+	.llong .sys_ni_syscall		/* old modify_ldt syscall */
+	.llong .sys_adjtimex
+	.llong .sys_mprotect		/* 125 */
+	.llong .sys_ni_syscall
+	.llong .sys_ni_syscall		/* old create_module syscall */
+	.llong .sys_init_module
+	.llong .sys_delete_module
+	.llong .sys_ni_syscall		/* 130 old get_kernel_syms syscall */
+	.llong .sys_quotactl
+	.llong .sys_getpgid
+	.llong .sys_fchdir
+	.llong .sys_bdflush
+	.llong .sys_sysfs		/* 135 */
+	.llong .ppc64_personality
+	.llong .sys_ni_syscall	        /* for afs_syscall */
+	.llong .sys_setfsuid
+	.llong .sys_setfsgid
+	.llong .sys_llseek	        /* 140 */
+        .llong .sys_getdents
+	.llong .sys_select
+	.llong .sys_flock
+	.llong .sys_msync
+	.llong .sys_readv		/* 145 */
+	.llong .sys_writev
+	.llong .sys_getsid
+	.llong .sys_fdatasync
+	.llong .sys_sysctl
+	.llong .sys_mlock		/* 150 */
+	.llong .sys_munlock
+	.llong .sys_mlockall
+	.llong .sys_munlockall
+	.llong .sys_sched_setparam
+	.llong .sys_sched_getparam	/* 155 */
+	.llong .sys_sched_setscheduler
+	.llong .sys_sched_getscheduler
+	.llong .sys_sched_yield
+	.llong .sys_sched_get_priority_max
+	.llong .sys_sched_get_priority_min  /* 160 */
+	.llong .sys_sched_rr_get_interval
+	.llong .sys_nanosleep
+	.llong .sys_mremap
+	.llong .sys_setresuid
+	.llong .sys_getresuid	        /* 165 */
+	.llong .sys_ni_syscall		/* old query_module syscall */
+	.llong .sys_poll
+	.llong .sys_nfsservctl
+	.llong .sys_setresgid
+	.llong .sys_getresgid	        /* 170 */
+	.llong .sys_prctl
+	.llong .ppc64_rt_sigreturn
+	.llong .sys_rt_sigaction
+	.llong .sys_rt_sigprocmask	
+	.llong .sys_rt_sigpending	/* 175 */
+	.llong .sys_rt_sigtimedwait
+	.llong .sys_rt_sigqueueinfo
+	.llong .ppc64_rt_sigsuspend
+	.llong .sys_pread64
+	.llong .sys_pwrite64	        /* 180 */
+	.llong .sys_chown
+	.llong .sys_getcwd
+	.llong .sys_capget
+	.llong .sys_capset
+	.llong .sys_sigaltstack	        /* 185 */
+	.llong .sys_sendfile64
+	.llong .sys_ni_syscall		/* reserved for streams1 */
+	.llong .sys_ni_syscall		/* reserved for streams2 */
+	.llong .ppc_vfork
+	.llong .sys_getrlimit	        /* 190 */
+	.llong .sys_readahead
+	.llong .sys_ni_syscall		/* 32bit only mmap2 */
+	.llong .sys_ni_syscall		/* 32bit only truncate64 */
+	.llong .sys_ni_syscall		/* 32bit only ftruncate64 */
+	.llong .sys_ni_syscall		/* 195 - 32bit only stat64 */
+	.llong .sys_ni_syscall		/* 32bit only lstat64 */
+	.llong .sys_ni_syscall		/* 32bit only fstat64 */
+	.llong .sys_ni_syscall		/* 32bit only pciconfig_read */
+	.llong .sys_ni_syscall		/* 32bit only pciconfig_write */
+	.llong .sys_ni_syscall		/* 32bit only pciconfig_iobase */
+	.llong .sys_ni_syscall		/* reserved for MacOnLinux */
+	.llong .sys_getdents64
+	.llong .sys_pivot_root
+	.llong .sys_ni_syscall		/* 32bit only fcntl64 */
+	.llong .sys_madvise		/* 205 */
+	.llong .sys_mincore
+	.llong .sys_gettid
+	.llong .sys_tkill
+	.llong .sys_setxattr
+	.llong .sys_lsetxattr		/* 210 */
+	.llong .sys_fsetxattr
+	.llong .sys_getxattr
+	.llong .sys_lgetxattr
+	.llong .sys_fgetxattr
+	.llong .sys_listxattr		/* 215 */
+	.llong .sys_llistxattr
+	.llong .sys_flistxattr
+	.llong .sys_removexattr
+	.llong .sys_lremovexattr
+	.llong .sys_fremovexattr	/* 220 */
+	.llong .sys_futex
+	.llong .sys_sched_setaffinity
+	.llong .sys_sched_getaffinity
+	.llong .sys_ni_syscall
+	.llong .sys_ni_syscall		/* 225 - reserved for tux */
+	.llong .sys_ni_syscall		/* 32bit only sendfile64 */
+	.llong .sys_io_setup
+	.llong .sys_io_destroy
+	.llong .sys_io_getevents
+	.llong .sys_io_submit		/* 230 */
+	.llong .sys_io_cancel
+	.llong .sys_set_tid_address
+	.llong .sys_fadvise64
+	.llong .sys_exit_group
+	.llong .sys_lookup_dcookie	/* 235 */
+	.llong .sys_epoll_create
+	.llong .sys_epoll_ctl
+	.llong .sys_epoll_wait
+	.llong .sys_remap_file_pages
+	.llong .sys_timer_create	/* 240 */
+	.llong .sys_timer_settime
+	.llong .sys_timer_gettime
+	.llong .sys_timer_getoverrun
+	.llong .sys_timer_delete
+	.llong .sys_clock_settime	/* 245 */
+	.llong .sys_clock_gettime
+	.llong .sys_clock_getres
+	.llong .sys_clock_nanosleep
+	.llong .ppc64_swapcontext
+	.llong .sys_tgkill		/* 250 */
+	.llong .sys_utimes
+	.llong .sys_statfs64
+	.llong .sys_fstatfs64
+	.llong .sys_ni_syscall		/* 32bit only fadvise64_64 */
+	.llong .ppc_rtas		/* 255 */
+	.llong .sys_ni_syscall		/* 256 reserved for sys_debug_setcontext */
+	.llong .sys_ni_syscall		/* 257 reserved for vserver */
+	.llong .sys_ni_syscall		/* 258 reserved for new sys_remap_file_pages */
+	.llong .sys_mbind
+	.llong .sys_get_mempolicy	/* 260 */
+	.llong .sys_set_mempolicy
+	.llong .sys_mq_open
+	.llong .sys_mq_unlink
+	.llong .sys_mq_timedsend
+	.llong .sys_mq_timedreceive	/* 265 */
+	.llong .sys_mq_notify
+	.llong .sys_mq_getsetattr
+	.llong .sys_ni_syscall		/* 268 reserved for sys_kexec_load */
+	.llong .sys_add_key
+	.llong .sys_request_key		/* 270 */
+	.llong .sys_keyctl
+	.llong .sys_waitid
diff --git a/arch/ppc64/kernel/module.c b/arch/ppc64/kernel/module.c
new file mode 100644
index 00000000000..c683bf88e69
--- /dev/null
+++ b/arch/ppc64/kernel/module.c
@@ -0,0 +1,442 @@
+/*  Kernel module help for PPC64.
+    Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/module.h>
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <asm/module.h>
+#include <asm/uaccess.h>
+
+/* FIXME: We don't do .init separately.  To do this, we'd need to have
+   a separate r2 value in the init and core section, and stub between
+   them, too.
+
+   Using a magic allocator which places modules within 32MB solves
+   this, and makes other things simpler.  Anton?
+   --RR.  */
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , ...)
+#endif
+
+/* There's actually a third entry here, but it's unused */
+struct ppc64_opd_entry
+{
+	unsigned long funcaddr;
+	unsigned long r2;
+};
+
+/* Like PPC32, we need little trampolines to do > 24-bit jumps (into
+   the kernel itself).  But on PPC64, these need to be used for every
+   jump, actually, to reset r2 (TOC+0x8000). */
+struct ppc64_stub_entry
+{
+	/* 28 byte jump instruction sequence (7 instructions) */
+	unsigned char jump[28];
+	unsigned char unused[4];
+	/* Data for the above code */
+	struct ppc64_opd_entry opd;
+};
+
+/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external)
+   function which may be more than 24-bits away.  We could simply
+   patch the new r2 value and function pointer into the stub, but it's
+   significantly shorter to put these values at the end of the stub
+   code, and patch the stub address (32-bits relative to the TOC ptr,
+   r2) into the stub. */
+static struct ppc64_stub_entry ppc64_stub =
+{ .jump = {
+	0x3d, 0x82, 0x00, 0x00, /* addis   r12,r2, <high> */
+	0x39, 0x8c, 0x00, 0x00, /* addi    r12,r12, <low> */
+	/* Save current r2 value in magic place on the stack. */
+	0xf8, 0x41, 0x00, 0x28, /* std     r2,40(r1) */
+	0xe9, 0x6c, 0x00, 0x20, /* ld      r11,32(r12) */
+	0xe8, 0x4c, 0x00, 0x28, /* ld      r2,40(r12) */
+	0x7d, 0x69, 0x03, 0xa6, /* mtctr   r11 */
+	0x4e, 0x80, 0x04, 0x20  /* bctr */
+} };
+
+/* Count how many different 24-bit relocations (different symbol,
+   different addend) */
+static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)
+{
+	unsigned int i, j, ret = 0;
+
+	/* FIXME: Only count external ones --RR */
+	/* Sure, this is order(n^2), but it's usually short, and not
+           time critical */
+	for (i = 0; i < num; i++) {
+		/* Only count 24-bit relocs, others don't need stubs */
+		if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24)
+			continue;
+		for (j = 0; j < i; j++) {
+			/* If this addend appeared before, it's
+                           already been counted */
+			if (rela[i].r_info == rela[j].r_info
+			    && rela[i].r_addend == rela[j].r_addend)
+				break;
+		}
+		if (j == i) ret++;
+	}
+	return ret;
+}
+
+void *module_alloc(unsigned long size)
+{
+	if (size == 0)
+		return NULL;
+
+	return vmalloc_exec(size);
+}
+
+/* Free memory returned from module_alloc */
+void module_free(struct module *mod, void *module_region)
+{
+	vfree(module_region);
+	/* FIXME: If module_region == mod->init_region, trim exception
+           table entries. */
+}
+
+/* Get size of potential trampolines required. */
+static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
+				    const Elf64_Shdr *sechdrs)
+{
+	/* One extra reloc so it's always 0-funcaddr terminated */
+	unsigned long relocs = 1;
+	unsigned i;
+
+	/* Every relocated section... */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (sechdrs[i].sh_type == SHT_RELA) {
+			DEBUGP("Found relocations in section %u\n", i);
+			DEBUGP("Ptr: %p.  Number: %lu\n",
+			       (void *)sechdrs[i].sh_addr,
+			       sechdrs[i].sh_size / sizeof(Elf64_Rela));
+			relocs += count_relocs((void *)sechdrs[i].sh_addr,
+					       sechdrs[i].sh_size
+					       / sizeof(Elf64_Rela));
+		}
+	}
+
+	DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
+	return relocs * sizeof(struct ppc64_stub_entry);
+}
+
+static void dedotify_versions(struct modversion_info *vers,
+			      unsigned long size)
+{
+	struct modversion_info *end;
+
+	for (end = (void *)vers + size; vers < end; vers++)
+		if (vers->name[0] == '.')
+			memmove(vers->name, vers->name+1, strlen(vers->name));
+}
+
+/* Undefined symbols which refer to .funcname, hack to funcname */
+static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
+{
+	unsigned int i;
+
+	for (i = 1; i < numsyms; i++) {
+		if (syms[i].st_shndx == SHN_UNDEF) {
+			char *name = strtab + syms[i].st_name;
+			if (name[0] == '.')
+				memmove(name, name+1, strlen(name));
+		}
+	}
+}
+
+int module_frob_arch_sections(Elf64_Ehdr *hdr,
+			      Elf64_Shdr *sechdrs,
+			      char *secstrings,
+			      struct module *me)
+{
+	unsigned int i;
+
+	/* Find .toc and .stubs sections, symtab and strtab */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		char *p;
+		if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0)
+			me->arch.stubs_section = i;
+		else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0)
+			me->arch.toc_section = i;
+		else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0)
+			dedotify_versions((void *)hdr + sechdrs[i].sh_offset,
+					  sechdrs[i].sh_size);
+
+		/* We don't handle .init for the moment: rename to _init */
+		while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init")))
+			p[0] = '_';
+
+		if (sechdrs[i].sh_type == SHT_SYMTAB)
+			dedotify((void *)hdr + sechdrs[i].sh_offset,
+				 sechdrs[i].sh_size / sizeof(Elf64_Sym),
+				 (void *)hdr
+				 + sechdrs[sechdrs[i].sh_link].sh_offset);
+	}
+	if (!me->arch.stubs_section || !me->arch.toc_section) {
+		printk("%s: doesn't contain .toc or .stubs.\n", me->name);
+		return -ENOEXEC;
+	}
+
+	/* Override the stubs size */
+	sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs);
+	return 0;
+}
+
+int apply_relocate(Elf64_Shdr *sechdrs,
+		   const char *strtab,
+		   unsigned int symindex,
+		   unsigned int relsec,
+		   struct module *me)
+{
+	printk(KERN_ERR "%s: Non-ADD RELOCATION unsupported\n", me->name);
+	return -ENOEXEC;
+}
+
+/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this
+   gives the value maximum span in an instruction which uses a signed
+   offset) */
+static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me)
+{
+	return sechdrs[me->arch.toc_section].sh_addr + 0x8000;
+}
+
+/* Both low and high 16 bits are added as SIGNED additions, so if low
+   16 bits has high bit set, high 16 bits must be adjusted.  These
+   macros do that (stolen from binutils). */
+#define PPC_LO(v) ((v) & 0xffff)
+#define PPC_HI(v) (((v) >> 16) & 0xffff)
+#define PPC_HA(v) PPC_HI ((v) + 0x8000)
+
+/* Patch stub to reference function and correct r2 value. */
+static inline int create_stub(Elf64_Shdr *sechdrs,
+			      struct ppc64_stub_entry *entry,
+			      struct ppc64_opd_entry *opd,
+			      struct module *me)
+{
+	Elf64_Half *loc1, *loc2;
+	long reladdr;
+
+	*entry = ppc64_stub;
+
+	loc1 = (Elf64_Half *)&entry->jump[2];
+	loc2 = (Elf64_Half *)&entry->jump[6];
+
+	/* Stub uses address relative to r2. */
+	reladdr = (unsigned long)entry - my_r2(sechdrs, me);
+	if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) {
+		printk("%s: Address %p of stub out of range of %p.\n",
+		       me->name, (void *)reladdr, (void *)my_r2);
+		return 0;
+	}
+	DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr);
+
+	*loc1 = PPC_HA(reladdr);
+	*loc2 = PPC_LO(reladdr);
+	entry->opd.funcaddr = opd->funcaddr;
+	entry->opd.r2 = opd->r2;
+	return 1;
+}
+
+/* Create stub to jump to function described in this OPD: we need the
+   stub to set up the TOC ptr (r2) for the function. */
+static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
+				   unsigned long opdaddr,
+				   struct module *me)
+{
+	struct ppc64_stub_entry *stubs;
+	struct ppc64_opd_entry *opd = (void *)opdaddr;
+	unsigned int i, num_stubs;
+
+	num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs);
+
+	/* Find this stub, or if that fails, the next avail. entry */
+	stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
+	for (i = 0; stubs[i].opd.funcaddr; i++) {
+		BUG_ON(i >= num_stubs);
+
+		if (stubs[i].opd.funcaddr == opd->funcaddr)
+			return (unsigned long)&stubs[i];
+	}
+
+	if (!create_stub(sechdrs, &stubs[i], opd, me))
+		return 0;
+
+	return (unsigned long)&stubs[i];
+}
+
+/* We expect a noop next: if it is, replace it with instruction to
+   restore r2. */
+static int restore_r2(u32 *instruction, struct module *me)
+{
+	if (*instruction != 0x60000000) {
+		printk("%s: Expect noop after relocate, got %08x\n",
+		       me->name, *instruction);
+		return 0;
+	}
+	*instruction = 0xe8410028;	/* ld r2,40(r1) */
+	return 1;
+}
+
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+		       const char *strtab,
+		       unsigned int symindex,
+		       unsigned int relsec,
+		       struct module *me)
+{
+	unsigned int i;
+	Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr;
+	Elf64_Sym *sym;
+	unsigned long *location;
+	unsigned long value;
+
+	DEBUGP("Applying ADD relocate section %u to %u\n", relsec,
+	       sechdrs[relsec].sh_info);
+	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) {
+		/* This is where to make the change */
+		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+			+ rela[i].r_offset;
+		/* This is the symbol it is referring to */
+		sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+			+ ELF64_R_SYM(rela[i].r_info);
+
+		DEBUGP("RELOC at %p: %li-type as %s (%lu) + %li\n",
+		       location, (long)ELF64_R_TYPE(rela[i].r_info),
+		       strtab + sym->st_name, (unsigned long)sym->st_value,
+		       (long)rela[i].r_addend);
+
+		/* `Everything is relative'. */
+		value = sym->st_value + rela[i].r_addend;
+
+		switch (ELF64_R_TYPE(rela[i].r_info)) {
+		case R_PPC64_ADDR32:
+			/* Simply set it */
+			*(u32 *)location = value;
+			break;
+			
+		case R_PPC64_ADDR64:
+			/* Simply set it */
+			*(unsigned long *)location = value;
+			break;
+
+		case R_PPC64_TOC:
+			*(unsigned long *)location = my_r2(sechdrs, me);
+			break;
+
+		case R_PPC64_TOC16_DS:
+			/* Subtact TOC pointer */
+			value -= my_r2(sechdrs, me);
+			if ((value & 3) != 0 || value + 0x8000 > 0xffff) {
+				printk("%s: bad TOC16_DS relocation (%lu)\n",
+				       me->name, value);
+				return -ENOEXEC;
+			}
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xfffc)
+				| (value & 0xfffc);
+			break;
+
+		case R_PPC_REL24:
+			/* FIXME: Handle weak symbols here --RR */
+			if (sym->st_shndx == SHN_UNDEF) {
+				/* External: go via stub */
+				value = stub_for_addr(sechdrs, value, me);
+				if (!value)
+					return -ENOENT;
+				if (!restore_r2((u32 *)location + 1, me))
+					return -ENOEXEC;
+			}
+
+			/* Convert value to relative */
+			value -= (unsigned long)location;
+			if (value + 0x2000000 > 0x3ffffff || (value & 3) != 0){
+				printk("%s: REL24 %li out of range!\n",
+				       me->name, (long int)value);
+				return -ENOEXEC;
+			}
+
+			/* Only replace bits 2 through 26 */
+			*(uint32_t *)location 
+				= (*(uint32_t *)location & ~0x03fffffc)
+				| (value & 0x03fffffc);
+			break;
+
+		default:
+			printk("%s: Unknown ADD relocation: %lu\n",
+			       me->name,
+			       (unsigned long)ELF64_R_TYPE(rela[i].r_info));
+			return -ENOEXEC;
+		}
+	}
+
+	return 0;
+}
+
+LIST_HEAD(module_bug_list);
+
+int module_finalize(const Elf_Ehdr *hdr,
+		const Elf_Shdr *sechdrs, struct module *me)
+{
+	char *secstrings;
+	unsigned int i;
+
+	me->arch.bug_table = NULL;
+	me->arch.num_bugs = 0;
+
+	/* Find the __bug_table section, if present */
+	secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (strcmp(secstrings+sechdrs[i].sh_name, "__bug_table"))
+			continue;
+		me->arch.bug_table = (void *) sechdrs[i].sh_addr;
+		me->arch.num_bugs = sechdrs[i].sh_size / sizeof(struct bug_entry);
+		break;
+	}
+
+	/*
+	 * Strictly speaking this should have a spinlock to protect against
+	 * traversals, but since we only traverse on BUG()s, a spinlock
+	 * could potentially lead to deadlock and thus be counter-productive.
+	 */
+	list_add(&me->arch.bug_list, &module_bug_list);
+
+	return 0;
+}
+
+void module_arch_cleanup(struct module *mod)
+{
+	list_del(&mod->arch.bug_list);
+}
+
+struct bug_entry *module_find_bug(unsigned long bugaddr)
+{
+	struct mod_arch_specific *mod;
+	unsigned int i;
+	struct bug_entry *bug;
+
+	list_for_each_entry(mod, &module_bug_list, bug_list) {
+		bug = mod->bug_table;
+		for (i = 0; i < mod->num_bugs; ++i, ++bug)
+			if (bugaddr == bug->bug_addr)
+				return bug;
+	}
+	return NULL;
+}
diff --git a/arch/ppc64/kernel/mpic.c b/arch/ppc64/kernel/mpic.c
new file mode 100644
index 00000000000..593ea5b82af
--- /dev/null
+++ b/arch/ppc64/kernel/mpic.c
@@ -0,0 +1,859 @@
+/*
+ *  arch/ppc64/kernel/mpic.c
+ *
+ *  Driver for interrupt controllers following the OpenPIC standard, the
+ *  common implementation beeing IBM's MPIC. This driver also can deal
+ *  with various broken implementations of this HW.
+ *
+ *  Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of this archive
+ *  for more details.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+
+#include <asm/ptrace.h>
+#include <asm/signal.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/machdep.h>
+
+#include "mpic.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) printk(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+static struct mpic *mpics;
+static struct mpic *mpic_primary;
+static DEFINE_SPINLOCK(mpic_lock);
+
+
+/*
+ * Register accessor functions
+ */
+
+
+static inline u32 _mpic_read(unsigned int be, volatile u32 __iomem *base,
+			    unsigned int reg)
+{
+	if (be)
+		return in_be32(base + (reg >> 2));
+	else
+		return in_le32(base + (reg >> 2));
+}
+
+static inline void _mpic_write(unsigned int be, volatile u32 __iomem *base,
+			      unsigned int reg, u32 value)
+{
+	if (be)
+		out_be32(base + (reg >> 2), value);
+	else
+		out_le32(base + (reg >> 2), value);
+}
+
+static inline u32 _mpic_ipi_read(struct mpic *mpic, unsigned int ipi)
+{
+	unsigned int be = (mpic->flags & MPIC_BIG_ENDIAN) != 0;
+	unsigned int offset = MPIC_GREG_IPI_VECTOR_PRI_0 + (ipi * 0x10);
+
+	if (mpic->flags & MPIC_BROKEN_IPI)
+		be = !be;
+	return _mpic_read(be, mpic->gregs, offset);
+}
+
+static inline void _mpic_ipi_write(struct mpic *mpic, unsigned int ipi, u32 value)
+{
+	unsigned int offset = MPIC_GREG_IPI_VECTOR_PRI_0 + (ipi * 0x10);
+
+	_mpic_write(mpic->flags & MPIC_BIG_ENDIAN, mpic->gregs, offset, value);
+}
+
+static inline u32 _mpic_cpu_read(struct mpic *mpic, unsigned int reg)
+{
+	unsigned int cpu = 0;
+
+	if (mpic->flags & MPIC_PRIMARY)
+		cpu = hard_smp_processor_id();
+
+	return _mpic_read(mpic->flags & MPIC_BIG_ENDIAN, mpic->cpuregs[cpu], reg);
+}
+
+static inline void _mpic_cpu_write(struct mpic *mpic, unsigned int reg, u32 value)
+{
+	unsigned int cpu = 0;
+
+	if (mpic->flags & MPIC_PRIMARY)
+		cpu = hard_smp_processor_id();
+
+	_mpic_write(mpic->flags & MPIC_BIG_ENDIAN, mpic->cpuregs[cpu], reg, value);
+}
+
+static inline u32 _mpic_irq_read(struct mpic *mpic, unsigned int src_no, unsigned int reg)
+{
+	unsigned int	isu = src_no >> mpic->isu_shift;
+	unsigned int	idx = src_no & mpic->isu_mask;
+
+	return _mpic_read(mpic->flags & MPIC_BIG_ENDIAN, mpic->isus[isu],
+			  reg + (idx * MPIC_IRQ_STRIDE));
+}
+
+static inline void _mpic_irq_write(struct mpic *mpic, unsigned int src_no,
+				   unsigned int reg, u32 value)
+{
+	unsigned int	isu = src_no >> mpic->isu_shift;
+	unsigned int	idx = src_no & mpic->isu_mask;
+
+	_mpic_write(mpic->flags & MPIC_BIG_ENDIAN, mpic->isus[isu],
+		    reg + (idx * MPIC_IRQ_STRIDE), value);
+}
+
+#define mpic_read(b,r)		_mpic_read(mpic->flags & MPIC_BIG_ENDIAN,(b),(r))
+#define mpic_write(b,r,v)	_mpic_write(mpic->flags & MPIC_BIG_ENDIAN,(b),(r),(v))
+#define mpic_ipi_read(i)	_mpic_ipi_read(mpic,(i))
+#define mpic_ipi_write(i,v)	_mpic_ipi_write(mpic,(i),(v))
+#define mpic_cpu_read(i)	_mpic_cpu_read(mpic,(i))
+#define mpic_cpu_write(i,v)	_mpic_cpu_write(mpic,(i),(v))
+#define mpic_irq_read(s,r)	_mpic_irq_read(mpic,(s),(r))
+#define mpic_irq_write(s,r,v)	_mpic_irq_write(mpic,(s),(r),(v))
+
+
+/*
+ * Low level utility functions
+ */
+
+
+
+/* Check if we have one of those nice broken MPICs with a flipped endian on
+ * reads from IPI registers
+ */
+static void __init mpic_test_broken_ipi(struct mpic *mpic)
+{
+	u32 r;
+
+	mpic_write(mpic->gregs, MPIC_GREG_IPI_VECTOR_PRI_0, MPIC_VECPRI_MASK);
+	r = mpic_read(mpic->gregs, MPIC_GREG_IPI_VECTOR_PRI_0);
+
+	if (r == le32_to_cpu(MPIC_VECPRI_MASK)) {
+		printk(KERN_INFO "mpic: Detected reversed IPI registers\n");
+		mpic->flags |= MPIC_BROKEN_IPI;
+	}
+}
+
+#ifdef CONFIG_MPIC_BROKEN_U3
+
+/* Test if an interrupt is sourced from HyperTransport (used on broken U3s)
+ * to force the edge setting on the MPIC and do the ack workaround.
+ */
+static inline int mpic_is_ht_interrupt(struct mpic *mpic, unsigned int source_no)
+{
+	if (source_no >= 128 || !mpic->fixups)
+		return 0;
+	return mpic->fixups[source_no].base != NULL;
+}
+
+static inline void mpic_apic_end_irq(struct mpic *mpic, unsigned int source_no)
+{
+	struct mpic_irq_fixup *fixup = &mpic->fixups[source_no];
+	u32 tmp;
+
+	spin_lock(&mpic->fixup_lock);
+	writeb(0x11 + 2 * fixup->irq, fixup->base);
+	tmp = readl(fixup->base + 2);
+	writel(tmp | 0x80000000ul, fixup->base + 2);
+	/* config writes shouldn't be posted but let's be safe ... */
+	(void)readl(fixup->base + 2);
+	spin_unlock(&mpic->fixup_lock);
+}
+
+
+static void __init mpic_amd8111_read_irq(struct mpic *mpic, u8 __iomem *devbase)
+{
+	int i, irq;
+	u32 tmp;
+
+	printk(KERN_INFO "mpic:    - Workarounds on AMD 8111 @ %p\n", devbase);
+
+	for (i=0; i < 24; i++) {
+		writeb(0x10 + 2*i, devbase + 0xf2);
+		tmp = readl(devbase + 0xf4);
+		if ((tmp & 0x1) || !(tmp & 0x20))
+			continue;
+		irq = (tmp >> 16) & 0xff;
+		mpic->fixups[irq].irq = i;
+		mpic->fixups[irq].base = devbase + 0xf2;
+	}
+}
+ 
+static void __init mpic_amd8131_read_irq(struct mpic *mpic, u8 __iomem *devbase)
+{
+	int i, irq;
+	u32 tmp;
+
+	printk(KERN_INFO "mpic:    - Workarounds on AMD 8131 @ %p\n", devbase);
+
+	for (i=0; i < 4; i++) {
+		writeb(0x10 + 2*i, devbase + 0xba);
+		tmp = readl(devbase + 0xbc);
+		if ((tmp & 0x1) || !(tmp & 0x20))
+			continue;
+		irq = (tmp >> 16) & 0xff;
+		mpic->fixups[irq].irq = i;
+		mpic->fixups[irq].base = devbase + 0xba;
+	}
+}
+ 
+static void __init mpic_scan_ioapics(struct mpic *mpic)
+{
+	unsigned int devfn;
+	u8 __iomem *cfgspace;
+
+	printk(KERN_INFO "mpic: Setting up IO-APICs workarounds for U3\n");
+
+	/* Allocate fixups array */
+	mpic->fixups = alloc_bootmem(128 * sizeof(struct mpic_irq_fixup));
+	BUG_ON(mpic->fixups == NULL);
+	memset(mpic->fixups, 0, 128 * sizeof(struct mpic_irq_fixup));
+
+	/* Init spinlock */
+	spin_lock_init(&mpic->fixup_lock);
+
+	/* Map u3 config space. We assume all IO-APICs are on the primary bus
+	 * and slot will never be above "0xf" so we only need to map 32k
+	 */
+	cfgspace = (unsigned char __iomem *)ioremap(0xf2000000, 0x8000);
+	BUG_ON(cfgspace == NULL);
+
+	/* Now we scan all slots. We do a very quick scan, we read the header type,
+	 * vendor ID and device ID only, that's plenty enough
+	 */
+	for (devfn = 0; devfn < PCI_DEVFN(0x10,0); devfn ++) {
+		u8 __iomem *devbase = cfgspace + (devfn << 8);
+		u8 hdr_type = readb(devbase + PCI_HEADER_TYPE);
+		u32 l = readl(devbase + PCI_VENDOR_ID);
+		u16 vendor_id, device_id;
+		int multifunc = 0;
+
+		DBG("devfn %x, l: %x\n", devfn, l);
+
+		/* If no device, skip */
+		if (l == 0xffffffff || l == 0x00000000 ||
+		    l == 0x0000ffff || l == 0xffff0000)
+			goto next;
+
+		/* Check if it's a multifunction device (only really used
+		 * to function 0 though
+		 */
+		multifunc = !!(hdr_type & 0x80);
+		vendor_id = l & 0xffff;
+		device_id = (l >> 16) & 0xffff;
+
+		/* If a known device, go to fixup setup code */
+		if (vendor_id == PCI_VENDOR_ID_AMD && device_id == 0x7460)
+			mpic_amd8111_read_irq(mpic, devbase);
+		if (vendor_id == PCI_VENDOR_ID_AMD && device_id == 0x7450)
+			mpic_amd8131_read_irq(mpic, devbase);
+	next:
+		/* next device, if function 0 */
+		if ((PCI_FUNC(devfn) == 0) && !multifunc)
+			devfn += 7;
+	}
+}
+
+#endif /* CONFIG_MPIC_BROKEN_U3 */
+
+
+/* Find an mpic associated with a given linux interrupt */
+static struct mpic *mpic_find(unsigned int irq, unsigned int *is_ipi)
+{
+	struct mpic *mpic = mpics;
+
+	while(mpic) {
+		/* search IPIs first since they may override the main interrupts */
+		if (irq >= mpic->ipi_offset && irq < (mpic->ipi_offset + 4)) {
+			if (is_ipi)
+				*is_ipi = 1;
+			return mpic;
+		}
+		if (irq >= mpic->irq_offset &&
+		    irq < (mpic->irq_offset + mpic->irq_count)) {
+			if (is_ipi)
+				*is_ipi = 0;
+			return mpic;
+		}
+		mpic = mpic -> next;
+	}
+	return NULL;
+}
+
+/* Convert a cpu mask from logical to physical cpu numbers. */
+static inline u32 mpic_physmask(u32 cpumask)
+{
+	int i;
+	u32 mask = 0;
+
+	for (i = 0; i < NR_CPUS; ++i, cpumask >>= 1)
+		mask |= (cpumask & 1) << get_hard_smp_processor_id(i);
+	return mask;
+}
+
+#ifdef CONFIG_SMP
+/* Get the mpic structure from the IPI number */
+static inline struct mpic * mpic_from_ipi(unsigned int ipi)
+{
+	return container_of(irq_desc[ipi].handler, struct mpic, hc_ipi);
+}
+#endif
+
+/* Get the mpic structure from the irq number */
+static inline struct mpic * mpic_from_irq(unsigned int irq)
+{
+	return container_of(irq_desc[irq].handler, struct mpic, hc_irq);
+}
+
+/* Send an EOI */
+static inline void mpic_eoi(struct mpic *mpic)
+{
+	mpic_cpu_write(MPIC_CPU_EOI, 0);
+	(void)mpic_cpu_read(MPIC_CPU_WHOAMI);
+}
+
+#ifdef CONFIG_SMP
+static irqreturn_t mpic_ipi_action(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct mpic *mpic = dev_id;
+
+	smp_message_recv(irq - mpic->ipi_offset, regs);
+	return IRQ_HANDLED;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Linux descriptor level callbacks
+ */
+
+
+static void mpic_enable_irq(unsigned int irq)
+{
+	unsigned int loops = 100000;
+	struct mpic *mpic = mpic_from_irq(irq);
+	unsigned int src = irq - mpic->irq_offset;
+
+	DBG("%s: enable_irq: %d (src %d)\n", mpic->name, irq, src);
+
+	mpic_irq_write(src, MPIC_IRQ_VECTOR_PRI,
+		       mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) & ~MPIC_VECPRI_MASK);
+
+	/* make sure mask gets to controller before we return to user */
+	do {
+		if (!loops--) {
+			printk(KERN_ERR "mpic_enable_irq timeout\n");
+			break;
+		}
+	} while(mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) & MPIC_VECPRI_MASK);	
+}
+
+static void mpic_disable_irq(unsigned int irq)
+{
+	unsigned int loops = 100000;
+	struct mpic *mpic = mpic_from_irq(irq);
+	unsigned int src = irq - mpic->irq_offset;
+
+	DBG("%s: disable_irq: %d (src %d)\n", mpic->name, irq, src);
+
+	mpic_irq_write(src, MPIC_IRQ_VECTOR_PRI,
+		       mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) | MPIC_VECPRI_MASK);
+
+	/* make sure mask gets to controller before we return to user */
+	do {
+		if (!loops--) {
+			printk(KERN_ERR "mpic_enable_irq timeout\n");
+			break;
+		}
+	} while(!(mpic_irq_read(src, MPIC_IRQ_VECTOR_PRI) & MPIC_VECPRI_MASK));
+}
+
+static void mpic_end_irq(unsigned int irq)
+{
+	struct mpic *mpic = mpic_from_irq(irq);
+
+	DBG("%s: end_irq: %d\n", mpic->name, irq);
+
+	/* We always EOI on end_irq() even for edge interrupts since that
+	 * should only lower the priority, the MPIC should have properly
+	 * latched another edge interrupt coming in anyway
+	 */
+
+#ifdef CONFIG_MPIC_BROKEN_U3
+	if (mpic->flags & MPIC_BROKEN_U3) {
+		unsigned int src = irq - mpic->irq_offset;
+		if (mpic_is_ht_interrupt(mpic, src))
+			mpic_apic_end_irq(mpic, src);
+	}
+#endif /* CONFIG_MPIC_BROKEN_U3 */
+
+	mpic_eoi(mpic);
+}
+
+#ifdef CONFIG_SMP
+
+static void mpic_enable_ipi(unsigned int irq)
+{
+	struct mpic *mpic = mpic_from_ipi(irq);
+	unsigned int src = irq - mpic->ipi_offset;
+
+	DBG("%s: enable_ipi: %d (ipi %d)\n", mpic->name, irq, src);
+	mpic_ipi_write(src, mpic_ipi_read(src) & ~MPIC_VECPRI_MASK);
+}
+
+static void mpic_disable_ipi(unsigned int irq)
+{
+	/* NEVER disable an IPI... that's just plain wrong! */
+}
+
+static void mpic_end_ipi(unsigned int irq)
+{
+	struct mpic *mpic = mpic_from_ipi(irq);
+
+	/*
+	 * IPIs are marked IRQ_PER_CPU. This has the side effect of
+	 * preventing the IRQ_PENDING/IRQ_INPROGRESS logic from
+	 * applying to them. We EOI them late to avoid re-entering.
+	 * We mark IPI's with SA_INTERRUPT as they must run with
+	 * irqs disabled.
+	 */
+	mpic_eoi(mpic);
+}
+
+#endif /* CONFIG_SMP */
+
+static void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
+{
+	struct mpic *mpic = mpic_from_irq(irq);
+
+	cpumask_t tmp;
+
+	cpus_and(tmp, cpumask, cpu_online_map);
+
+	mpic_irq_write(irq - mpic->irq_offset, MPIC_IRQ_DESTINATION,
+		       mpic_physmask(cpus_addr(tmp)[0]));	
+}
+
+
+/*
+ * Exported functions
+ */
+
+
+struct mpic * __init mpic_alloc(unsigned long phys_addr,
+				unsigned int flags,
+				unsigned int isu_size,
+				unsigned int irq_offset,
+				unsigned int irq_count,
+				unsigned int ipi_offset,
+				unsigned char *senses,
+				unsigned int senses_count,
+				const char *name)
+{
+	struct mpic	*mpic;
+	u32		reg;
+	const char	*vers;
+	int		i;
+
+	mpic = alloc_bootmem(sizeof(struct mpic));
+	if (mpic == NULL)
+		return NULL;
+	
+	memset(mpic, 0, sizeof(struct mpic));
+	mpic->name = name;
+
+	mpic->hc_irq.typename = name;
+	mpic->hc_irq.enable = mpic_enable_irq;
+	mpic->hc_irq.disable = mpic_disable_irq;
+	mpic->hc_irq.end = mpic_end_irq;
+	if (flags & MPIC_PRIMARY)
+		mpic->hc_irq.set_affinity = mpic_set_affinity;
+#ifdef CONFIG_SMP
+	mpic->hc_ipi.typename = name;
+	mpic->hc_ipi.enable = mpic_enable_ipi;
+	mpic->hc_ipi.disable = mpic_disable_ipi;
+	mpic->hc_ipi.end = mpic_end_ipi;
+#endif /* CONFIG_SMP */
+
+	mpic->flags = flags;
+	mpic->isu_size = isu_size;
+	mpic->irq_offset = irq_offset;
+	mpic->irq_count = irq_count;
+	mpic->ipi_offset = ipi_offset;
+	mpic->num_sources = 0; /* so far */
+	mpic->senses = senses;
+	mpic->senses_count = senses_count;
+
+	/* Map the global registers */
+	mpic->gregs = ioremap(phys_addr + MPIC_GREG_BASE, 0x1000);
+	mpic->tmregs = mpic->gregs + (MPIC_TIMER_BASE >> 2);
+	BUG_ON(mpic->gregs == NULL);
+
+	/* Reset */
+	if (flags & MPIC_WANTS_RESET) {
+		mpic_write(mpic->gregs, MPIC_GREG_GLOBAL_CONF_0,
+			   mpic_read(mpic->gregs, MPIC_GREG_GLOBAL_CONF_0)
+			   | MPIC_GREG_GCONF_RESET);
+		while( mpic_read(mpic->gregs, MPIC_GREG_GLOBAL_CONF_0)
+		       & MPIC_GREG_GCONF_RESET)
+			mb();
+	}
+
+	/* Read feature register, calculate num CPUs and, for non-ISU
+	 * MPICs, num sources as well. On ISU MPICs, sources are counted
+	 * as ISUs are added
+	 */
+	reg = mpic_read(mpic->gregs, MPIC_GREG_FEATURE_0);
+	mpic->num_cpus = ((reg & MPIC_GREG_FEATURE_LAST_CPU_MASK)
+			  >> MPIC_GREG_FEATURE_LAST_CPU_SHIFT) + 1;
+	if (isu_size == 0)
+		mpic->num_sources = ((reg & MPIC_GREG_FEATURE_LAST_SRC_MASK)
+				     >> MPIC_GREG_FEATURE_LAST_SRC_SHIFT) + 1;
+
+	/* Map the per-CPU registers */
+	for (i = 0; i < mpic->num_cpus; i++) {
+		mpic->cpuregs[i] = ioremap(phys_addr + MPIC_CPU_BASE +
+					   i * MPIC_CPU_STRIDE, 0x1000);
+		BUG_ON(mpic->cpuregs[i] == NULL);
+	}
+
+	/* Initialize main ISU if none provided */
+	if (mpic->isu_size == 0) {
+		mpic->isu_size = mpic->num_sources;
+		mpic->isus[0] = ioremap(phys_addr + MPIC_IRQ_BASE,
+					MPIC_IRQ_STRIDE * mpic->isu_size);
+		BUG_ON(mpic->isus[0] == NULL);
+	}
+	mpic->isu_shift = 1 + __ilog2(mpic->isu_size - 1);
+	mpic->isu_mask = (1 << mpic->isu_shift) - 1;
+
+	/* Display version */
+	switch (reg & MPIC_GREG_FEATURE_VERSION_MASK) {
+	case 1:
+		vers = "1.0";
+		break;
+	case 2:
+		vers = "1.2";
+		break;
+	case 3:
+		vers = "1.3";
+		break;
+	default:
+		vers = "<unknown>";
+		break;
+	}
+	printk(KERN_INFO "mpic: Setting up MPIC \"%s\" version %s at %lx, max %d CPUs\n",
+	       name, vers, phys_addr, mpic->num_cpus);
+	printk(KERN_INFO "mpic: ISU size: %d, shift: %d, mask: %x\n", mpic->isu_size,
+	       mpic->isu_shift, mpic->isu_mask);
+
+	mpic->next = mpics;
+	mpics = mpic;
+
+	if (flags & MPIC_PRIMARY)
+		mpic_primary = mpic;
+
+	return mpic;
+}
+
+void __init mpic_assign_isu(struct mpic *mpic, unsigned int isu_num,
+			    unsigned long phys_addr)
+{
+	unsigned int isu_first = isu_num * mpic->isu_size;
+
+	BUG_ON(isu_num >= MPIC_MAX_ISU);
+
+	mpic->isus[isu_num] = ioremap(phys_addr, MPIC_IRQ_STRIDE * mpic->isu_size);
+	if ((isu_first + mpic->isu_size) > mpic->num_sources)
+		mpic->num_sources = isu_first + mpic->isu_size;
+}
+
+void __init mpic_setup_cascade(unsigned int irq, mpic_cascade_t handler,
+			       void *data)
+{
+	struct mpic *mpic = mpic_find(irq, NULL);
+	unsigned long flags;
+
+	/* Synchronization here is a bit dodgy, so don't try to replace cascade
+	 * interrupts on the fly too often ... but normally it's set up at boot.
+	 */
+	spin_lock_irqsave(&mpic_lock, flags);
+	if (mpic->cascade)	       
+		mpic_disable_irq(mpic->cascade_vec + mpic->irq_offset);
+	mpic->cascade = NULL;
+	wmb();
+	mpic->cascade_vec = irq - mpic->irq_offset;
+	mpic->cascade_data = data;
+	wmb();
+	mpic->cascade = handler;
+	mpic_enable_irq(irq);
+	spin_unlock_irqrestore(&mpic_lock, flags);
+}
+
+void __init mpic_init(struct mpic *mpic)
+{
+	int i;
+
+	BUG_ON(mpic->num_sources == 0);
+
+	printk(KERN_INFO "mpic: Initializing for %d sources\n", mpic->num_sources);
+
+	/* Set current processor priority to max */
+	mpic_cpu_write(MPIC_CPU_CURRENT_TASK_PRI, 0xf);
+
+	/* Initialize timers: just disable them all */
+	for (i = 0; i < 4; i++) {
+		mpic_write(mpic->tmregs,
+			   i * MPIC_TIMER_STRIDE + MPIC_TIMER_DESTINATION, 0);
+		mpic_write(mpic->tmregs,
+			   i * MPIC_TIMER_STRIDE + MPIC_TIMER_VECTOR_PRI,
+			   MPIC_VECPRI_MASK |
+			   (MPIC_VEC_TIMER_0 + i));
+	}
+
+	/* Initialize IPIs to our reserved vectors and mark them disabled for now */
+	mpic_test_broken_ipi(mpic);
+	for (i = 0; i < 4; i++) {
+		mpic_ipi_write(i,
+			       MPIC_VECPRI_MASK |
+			       (10 << MPIC_VECPRI_PRIORITY_SHIFT) |
+			       (MPIC_VEC_IPI_0 + i));
+#ifdef CONFIG_SMP
+		if (!(mpic->flags & MPIC_PRIMARY))
+			continue;
+		irq_desc[mpic->ipi_offset+i].status |= IRQ_PER_CPU;
+		irq_desc[mpic->ipi_offset+i].handler = &mpic->hc_ipi;
+		
+#endif /* CONFIG_SMP */
+	}
+
+	/* Initialize interrupt sources */
+	if (mpic->irq_count == 0)
+		mpic->irq_count = mpic->num_sources;
+
+#ifdef CONFIG_MPIC_BROKEN_U3
+	/* Do the ioapic fixups on U3 broken mpic */
+	DBG("MPIC flags: %x\n", mpic->flags);
+	if ((mpic->flags & MPIC_BROKEN_U3) && (mpic->flags & MPIC_PRIMARY))
+		mpic_scan_ioapics(mpic);
+#endif /* CONFIG_MPIC_BROKEN_U3 */
+
+	for (i = 0; i < mpic->num_sources; i++) {
+		/* start with vector = source number, and masked */
+		u32 vecpri = MPIC_VECPRI_MASK | i | (8 << MPIC_VECPRI_PRIORITY_SHIFT);
+		int level = 0;
+		
+		/* if it's an IPI, we skip it */
+		if ((mpic->irq_offset + i) >= (mpic->ipi_offset + i) &&
+		    (mpic->irq_offset + i) <  (mpic->ipi_offset + i + 4))
+			continue;
+
+		/* do senses munging */
+		if (mpic->senses && i < mpic->senses_count) {
+			if (mpic->senses[i] & IRQ_SENSE_LEVEL)
+				vecpri |= MPIC_VECPRI_SENSE_LEVEL;
+			if (mpic->senses[i] & IRQ_POLARITY_POSITIVE)
+				vecpri |= MPIC_VECPRI_POLARITY_POSITIVE;
+		} else
+			vecpri |= MPIC_VECPRI_SENSE_LEVEL;
+
+		/* remember if it was a level interrupts */
+		level = (vecpri & MPIC_VECPRI_SENSE_LEVEL);
+
+		/* deal with broken U3 */
+		if (mpic->flags & MPIC_BROKEN_U3) {
+#ifdef CONFIG_MPIC_BROKEN_U3
+			if (mpic_is_ht_interrupt(mpic, i)) {
+				vecpri &= ~(MPIC_VECPRI_SENSE_MASK |
+					    MPIC_VECPRI_POLARITY_MASK);
+				vecpri |= MPIC_VECPRI_POLARITY_POSITIVE;
+			}
+#else
+			printk(KERN_ERR "mpic: BROKEN_U3 set, but CONFIG doesn't match\n");
+#endif
+		}
+
+		DBG("setup source %d, vecpri: %08x, level: %d\n", i, vecpri,
+		    (level != 0));
+
+		/* init hw */
+		mpic_irq_write(i, MPIC_IRQ_VECTOR_PRI, vecpri);
+		mpic_irq_write(i, MPIC_IRQ_DESTINATION,
+			       1 << get_hard_smp_processor_id(boot_cpuid));
+
+		/* init linux descriptors */
+		if (i < mpic->irq_count) {
+			irq_desc[mpic->irq_offset+i].status = level ? IRQ_LEVEL : 0;
+			irq_desc[mpic->irq_offset+i].handler = &mpic->hc_irq;
+		}
+	}
+	
+	/* Init spurrious vector */
+	mpic_write(mpic->gregs, MPIC_GREG_SPURIOUS, MPIC_VEC_SPURRIOUS);
+
+	/* Disable 8259 passthrough */
+	mpic_write(mpic->gregs, MPIC_GREG_GLOBAL_CONF_0,
+		   mpic_read(mpic->gregs, MPIC_GREG_GLOBAL_CONF_0)
+		   | MPIC_GREG_GCONF_8259_PTHROU_DIS);
+
+	/* Set current processor priority to 0 */
+	mpic_cpu_write(MPIC_CPU_CURRENT_TASK_PRI, 0);
+}
+
+
+
+void mpic_irq_set_priority(unsigned int irq, unsigned int pri)
+{
+	int is_ipi;
+	struct mpic *mpic = mpic_find(irq, &is_ipi);
+	unsigned long flags;
+	u32 reg;
+
+	spin_lock_irqsave(&mpic_lock, flags);
+	if (is_ipi) {
+		reg = mpic_ipi_read(irq - mpic->ipi_offset) & MPIC_VECPRI_PRIORITY_MASK;
+		mpic_ipi_write(irq - mpic->ipi_offset,
+			       reg | (pri << MPIC_VECPRI_PRIORITY_SHIFT));
+	} else {
+		reg = mpic_irq_read(irq - mpic->irq_offset, MPIC_IRQ_VECTOR_PRI)
+			& MPIC_VECPRI_PRIORITY_MASK;
+		mpic_irq_write(irq - mpic->irq_offset, MPIC_IRQ_VECTOR_PRI,
+			       reg | (pri << MPIC_VECPRI_PRIORITY_SHIFT));
+	}
+	spin_unlock_irqrestore(&mpic_lock, flags);
+}
+
+unsigned int mpic_irq_get_priority(unsigned int irq)
+{
+	int is_ipi;
+	struct mpic *mpic = mpic_find(irq, &is_ipi);
+	unsigned long flags;
+	u32 reg;
+
+	spin_lock_irqsave(&mpic_lock, flags);
+	if (is_ipi)
+		reg = mpic_ipi_read(irq - mpic->ipi_offset);
+	else
+		reg = mpic_irq_read(irq - mpic->irq_offset, MPIC_IRQ_VECTOR_PRI);
+	spin_unlock_irqrestore(&mpic_lock, flags);
+	return (reg & MPIC_VECPRI_PRIORITY_MASK) >> MPIC_VECPRI_PRIORITY_SHIFT;
+}
+
+void mpic_setup_this_cpu(void)
+{
+#ifdef CONFIG_SMP
+	struct mpic *mpic = mpic_primary;
+	unsigned long flags;
+	u32 msk = 1 << hard_smp_processor_id();
+	unsigned int i;
+
+	BUG_ON(mpic == NULL);
+
+	DBG("%s: setup_this_cpu(%d)\n", mpic->name, hard_smp_processor_id());
+
+	spin_lock_irqsave(&mpic_lock, flags);
+
+ 	/* let the mpic know we want intrs. default affinity is 0xffffffff
+	 * until changed via /proc. That's how it's done on x86. If we want
+	 * it differently, then we should make sure we also change the default
+	 * values of irq_affinity in irq.c.
+ 	 */
+	if (distribute_irqs) {
+	 	for (i = 0; i < mpic->num_sources ; i++)
+			mpic_irq_write(i, MPIC_IRQ_DESTINATION,
+				mpic_irq_read(i, MPIC_IRQ_DESTINATION) | msk);
+	}
+
+	/* Set current processor priority to 0 */
+	mpic_cpu_write(MPIC_CPU_CURRENT_TASK_PRI, 0);
+
+	spin_unlock_irqrestore(&mpic_lock, flags);
+#endif /* CONFIG_SMP */
+}
+
+void mpic_send_ipi(unsigned int ipi_no, unsigned int cpu_mask)
+{
+	struct mpic *mpic = mpic_primary;
+
+	BUG_ON(mpic == NULL);
+
+	DBG("%s: send_ipi(ipi_no: %d)\n", mpic->name, ipi_no);
+
+	mpic_cpu_write(MPIC_CPU_IPI_DISPATCH_0 + ipi_no * 0x10,
+		       mpic_physmask(cpu_mask & cpus_addr(cpu_online_map)[0]));
+}
+
+int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs)
+{
+	u32 irq;
+
+	irq = mpic_cpu_read(MPIC_CPU_INTACK) & MPIC_VECPRI_VECTOR_MASK;
+	DBG("%s: get_one_irq(): %d\n", mpic->name, irq);
+
+	if (mpic->cascade && irq == mpic->cascade_vec) {
+		DBG("%s: cascading ...\n", mpic->name);
+		irq = mpic->cascade(regs, mpic->cascade_data);
+		mpic_eoi(mpic);
+		return irq;
+	}
+	if (unlikely(irq == MPIC_VEC_SPURRIOUS))
+		return -1;
+	if (irq < MPIC_VEC_IPI_0) 
+		return irq + mpic->irq_offset;
+       	DBG("%s: ipi %d !\n", mpic->name, irq - MPIC_VEC_IPI_0);
+	return irq - MPIC_VEC_IPI_0 + mpic->ipi_offset;
+}
+
+int mpic_get_irq(struct pt_regs *regs)
+{
+	struct mpic *mpic = mpic_primary;
+
+	BUG_ON(mpic == NULL);
+
+	return mpic_get_one_irq(mpic, regs);
+}
+
+
+#ifdef CONFIG_SMP
+void mpic_request_ipis(void)
+{
+	struct mpic *mpic = mpic_primary;
+
+	BUG_ON(mpic == NULL);
+	
+	printk("requesting IPIs ... \n");
+
+	/* IPIs are marked SA_INTERRUPT as they must run with irqs disabled */
+	request_irq(mpic->ipi_offset+0, mpic_ipi_action, SA_INTERRUPT,
+		    "IPI0 (call function)", mpic);
+	request_irq(mpic->ipi_offset+1, mpic_ipi_action, SA_INTERRUPT,
+		   "IPI1 (reschedule)", mpic);
+	request_irq(mpic->ipi_offset+2, mpic_ipi_action, SA_INTERRUPT,
+		   "IPI2 (unused)", mpic);
+	request_irq(mpic->ipi_offset+3, mpic_ipi_action, SA_INTERRUPT,
+		   "IPI3 (debugger break)", mpic);
+
+	printk("IPIs requested... \n");
+}
+#endif /* CONFIG_SMP */
diff --git a/arch/ppc64/kernel/mpic.h b/arch/ppc64/kernel/mpic.h
new file mode 100644
index 00000000000..571b3c99e06
--- /dev/null
+++ b/arch/ppc64/kernel/mpic.h
@@ -0,0 +1,267 @@
+#include <linux/irq.h>
+
+/*
+ * Global registers
+ */
+
+#define MPIC_GREG_BASE			0x01000
+
+#define MPIC_GREG_FEATURE_0		0x00000
+#define		MPIC_GREG_FEATURE_LAST_SRC_MASK		0x07ff0000
+#define		MPIC_GREG_FEATURE_LAST_SRC_SHIFT	16
+#define		MPIC_GREG_FEATURE_LAST_CPU_MASK		0x00001f00
+#define		MPIC_GREG_FEATURE_LAST_CPU_SHIFT	8
+#define		MPIC_GREG_FEATURE_VERSION_MASK		0xff
+#define MPIC_GREG_FEATURE_1		0x00010
+#define MPIC_GREG_GLOBAL_CONF_0		0x00020
+#define		MPIC_GREG_GCONF_RESET			0x80000000
+#define		MPIC_GREG_GCONF_8259_PTHROU_DIS		0x20000000
+#define		MPIC_GREG_GCONF_BASE_MASK		0x000fffff
+#define MPIC_GREG_GLOBAL_CONF_1		0x00030
+#define MPIC_GREG_VENDOR_0		0x00040
+#define MPIC_GREG_VENDOR_1		0x00050
+#define MPIC_GREG_VENDOR_2		0x00060
+#define MPIC_GREG_VENDOR_3		0x00070
+#define MPIC_GREG_VENDOR_ID		0x00080
+#define 	MPIC_GREG_VENDOR_ID_STEPPING_MASK	0x00ff0000
+#define 	MPIC_GREG_VENDOR_ID_STEPPING_SHIFT	16
+#define 	MPIC_GREG_VENDOR_ID_DEVICE_ID_MASK	0x0000ff00
+#define 	MPIC_GREG_VENDOR_ID_DEVICE_ID_SHIFT	8
+#define 	MPIC_GREG_VENDOR_ID_VENDOR_ID_MASK	0x000000ff
+#define MPIC_GREG_PROCESSOR_INIT	0x00090
+#define MPIC_GREG_IPI_VECTOR_PRI_0	0x000a0
+#define MPIC_GREG_IPI_VECTOR_PRI_1	0x000b0
+#define MPIC_GREG_IPI_VECTOR_PRI_2	0x000c0
+#define MPIC_GREG_IPI_VECTOR_PRI_3	0x000d0
+#define MPIC_GREG_SPURIOUS		0x000e0
+#define MPIC_GREG_TIMER_FREQ		0x000f0
+
+/*
+ *
+ * Timer registers
+ */
+#define MPIC_TIMER_BASE			0x01100
+#define MPIC_TIMER_STRIDE		0x40
+
+#define MPIC_TIMER_CURRENT_CNT		0x00000
+#define MPIC_TIMER_BASE_CNT		0x00010
+#define MPIC_TIMER_VECTOR_PRI		0x00020
+#define MPIC_TIMER_DESTINATION		0x00030
+
+/*
+ * Per-Processor registers
+ */
+
+#define MPIC_CPU_THISBASE		0x00000
+#define MPIC_CPU_BASE			0x20000
+#define MPIC_CPU_STRIDE			0x01000
+
+#define MPIC_CPU_IPI_DISPATCH_0		0x00040
+#define MPIC_CPU_IPI_DISPATCH_1		0x00050
+#define MPIC_CPU_IPI_DISPATCH_2		0x00060
+#define MPIC_CPU_IPI_DISPATCH_3		0x00070
+#define MPIC_CPU_CURRENT_TASK_PRI	0x00080
+#define 	MPIC_CPU_TASKPRI_MASK			0x0000000f
+#define MPIC_CPU_WHOAMI			0x00090
+#define 	MPIC_CPU_WHOAMI_MASK			0x0000001f
+#define MPIC_CPU_INTACK			0x000a0
+#define MPIC_CPU_EOI			0x000b0
+
+/*
+ * Per-source registers
+ */
+
+#define MPIC_IRQ_BASE			0x10000
+#define MPIC_IRQ_STRIDE			0x00020
+#define MPIC_IRQ_VECTOR_PRI		0x00000
+#define 	MPIC_VECPRI_MASK			0x80000000
+#define 	MPIC_VECPRI_ACTIVITY			0x40000000	/* Read Only */
+#define 	MPIC_VECPRI_PRIORITY_MASK		0x000f0000
+#define 	MPIC_VECPRI_PRIORITY_SHIFT		16
+#define 	MPIC_VECPRI_VECTOR_MASK			0x000007ff
+#define 	MPIC_VECPRI_POLARITY_POSITIVE		0x00800000
+#define 	MPIC_VECPRI_POLARITY_NEGATIVE		0x00000000
+#define 	MPIC_VECPRI_POLARITY_MASK		0x00800000
+#define 	MPIC_VECPRI_SENSE_LEVEL			0x00400000
+#define 	MPIC_VECPRI_SENSE_EDGE			0x00000000
+#define 	MPIC_VECPRI_SENSE_MASK			0x00400000
+#define MPIC_IRQ_DESTINATION		0x00010
+
+#define MPIC_MAX_IRQ_SOURCES	2048
+#define MPIC_MAX_CPUS		32
+#define MPIC_MAX_ISU		32
+
+/*
+ * Special vector numbers (internal use only)
+ */
+#define MPIC_VEC_SPURRIOUS	255
+#define MPIC_VEC_IPI_3		254
+#define MPIC_VEC_IPI_2		253
+#define MPIC_VEC_IPI_1		252
+#define MPIC_VEC_IPI_0		251
+
+/* unused */
+#define MPIC_VEC_TIMER_3	250
+#define MPIC_VEC_TIMER_2	249
+#define MPIC_VEC_TIMER_1	248
+#define MPIC_VEC_TIMER_0	247
+
+/* Type definition of the cascade handler */
+typedef int (*mpic_cascade_t)(struct pt_regs *regs, void *data);
+
+#ifdef CONFIG_MPIC_BROKEN_U3
+/* Fixup table entry */
+struct mpic_irq_fixup
+{
+	u8 __iomem	*base;
+	unsigned int   irq;
+};
+#endif /* CONFIG_MPIC_BROKEN_U3 */
+
+
+/* The instance data of a given MPIC */
+struct mpic
+{
+	/* The "linux" controller struct */
+	hw_irq_controller	hc_irq;
+#ifdef CONFIG_SMP
+	hw_irq_controller	hc_ipi;
+#endif
+	const char		*name;
+	/* Flags */
+	unsigned int		flags;
+	/* How many irq sources in a given ISU */
+	unsigned int		isu_size;
+	unsigned int		isu_shift;
+	unsigned int		isu_mask;
+	/* Offset of irq vector numbers */
+	unsigned int		irq_offset;	
+	unsigned int		irq_count;
+	/* Offset of ipi vector numbers */
+	unsigned int		ipi_offset;
+	/* Number of sources */
+	unsigned int		num_sources;
+	/* Number of CPUs */
+	unsigned int		num_cpus;
+	/* cascade handler */
+	mpic_cascade_t		cascade;
+	void			*cascade_data;
+	unsigned int		cascade_vec;
+	/* senses array */
+	unsigned char		*senses;
+	unsigned int		senses_count;
+
+#ifdef CONFIG_MPIC_BROKEN_U3
+	/* The fixup table */
+	struct mpic_irq_fixup	*fixups;
+	spinlock_t		fixup_lock;
+#endif
+
+	/* The various ioremap'ed bases */
+	volatile u32 __iomem	*gregs;
+	volatile u32 __iomem	*tmregs;
+	volatile u32 __iomem	*cpuregs[MPIC_MAX_CPUS];
+	volatile u32 __iomem	*isus[MPIC_MAX_ISU];
+
+	/* link */
+	struct mpic		*next;
+};
+
+/* This is the primary controller, only that one has IPIs and
+ * has afinity control. A non-primary MPIC always uses CPU0
+ * registers only
+ */
+#define MPIC_PRIMARY			0x00000001
+/* Set this for a big-endian MPIC */
+#define MPIC_BIG_ENDIAN			0x00000002
+/* Broken U3 MPIC */
+#define MPIC_BROKEN_U3			0x00000004
+/* Broken IPI registers (autodetected) */
+#define MPIC_BROKEN_IPI			0x00000008
+/* MPIC wants a reset */
+#define MPIC_WANTS_RESET		0x00000010
+
+/* Allocate the controller structure and setup the linux irq descs
+ * for the range if interrupts passed in. No HW initialization is
+ * actually performed.
+ * 
+ * @phys_addr:	physial base address of the MPIC
+ * @flags:	flags, see constants above
+ * @isu_size:	number of interrupts in an ISU. Use 0 to use a
+ *              standard ISU-less setup (aka powermac)
+ * @irq_offset: first irq number to assign to this mpic
+ * @irq_count:  number of irqs to use with this mpic IRQ sources. Pass 0
+ *	        to match the number of sources
+ * @ipi_offset: first irq number to assign to this mpic IPI sources,
+ *		used only on primary mpic
+ * @senses:	array of sense values
+ * @senses_num: number of entries in the array
+ *
+ * Note about the sense array. If none is passed, all interrupts are
+ * setup to be level negative unless MPIC_BROKEN_U3 is set in which
+ * case they are edge positive (and the array is ignored anyway).
+ * The values in the array start at the first source of the MPIC,
+ * that is senses[0] correspond to linux irq "irq_offset".
+ */
+extern struct mpic *mpic_alloc(unsigned long phys_addr,
+			       unsigned int flags,
+			       unsigned int isu_size,
+			       unsigned int irq_offset,
+			       unsigned int irq_count,
+			       unsigned int ipi_offset,
+			       unsigned char *senses,
+			       unsigned int senses_num,
+			       const char *name);
+
+/* Assign ISUs, to call before mpic_init()
+ *
+ * @mpic:	controller structure as returned by mpic_alloc()
+ * @isu_num:	ISU number
+ * @phys_addr:	physical address of the ISU
+ */
+extern void mpic_assign_isu(struct mpic *mpic, unsigned int isu_num,
+			    unsigned long phys_addr);
+
+/* Initialize the controller. After this has been called, none of the above
+ * should be called again for this mpic
+ */
+extern void mpic_init(struct mpic *mpic);
+
+/* Setup a cascade. Currently, only one cascade is supported this
+ * way, though you can always do a normal request_irq() and add
+ * other cascades this way. You should call this _after_ having
+ * added all the ISUs
+ *
+ * @irq_no:	"linux" irq number of the cascade (that is offset'ed vector)
+ * @handler:	cascade handler function
+ */
+extern void mpic_setup_cascade(unsigned int irq_no, mpic_cascade_t hanlder,
+			       void *data);
+
+/*
+ * All of the following functions must only be used after the
+ * ISUs have been assigned and the controller fully initialized
+ * with mpic_init()
+ */
+
+
+/* Change/Read the priority of an interrupt. Default is 8 for irqs and
+ * 10 for IPIs. You can call this on both IPIs and IRQ numbers, but the
+ * IPI number is then the offset'ed (linux irq number mapped to the IPI)
+ */
+extern void mpic_irq_set_priority(unsigned int irq, unsigned int pri);
+extern unsigned int mpic_irq_get_priority(unsigned int irq);
+
+/* Setup a non-boot CPU */
+extern void mpic_setup_this_cpu(void);
+
+/* Request IPIs on primary mpic */
+extern void mpic_request_ipis(void);
+
+/* Send an IPI (non offseted number 0..3) */
+extern void mpic_send_ipi(unsigned int ipi_no, unsigned int cpu_mask);
+
+/* Fetch interrupt from a given mpic */
+extern int mpic_get_one_irq(struct mpic *mpic, struct pt_regs *regs);
+/* This one gets to the primary mpic */
+extern int mpic_get_irq(struct pt_regs *regs);
diff --git a/arch/ppc64/kernel/nvram.c b/arch/ppc64/kernel/nvram.c
new file mode 100644
index 00000000000..b9069c2d193
--- /dev/null
+++ b/arch/ppc64/kernel/nvram.c
@@ -0,0 +1,746 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ *
+ * TODO: Split the /dev/nvram part (that one can use
+ *       drivers/char/generic_nvram.c) from the arch & partition
+ *       parsing code.
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/fcntl.h>
+#include <linux/nvram.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/systemcfg.h>
+
+#undef DEBUG_NVRAM
+
+static int nvram_scan_partitions(void);
+static int nvram_setup_partition(void);
+static int nvram_create_os_partition(void);
+static int nvram_remove_os_partition(void);
+
+static struct nvram_partition * nvram_part;
+static long nvram_error_log_index = -1;
+static long nvram_error_log_size = 0;
+
+int no_logging = 1; 	/* Until we initialize everything,
+			 * make sure we don't try logging
+			 * anything */
+
+extern volatile int error_log_cnt;
+
+struct err_log_info {
+	int error_type;
+	unsigned int seq_num;
+};
+
+static loff_t dev_nvram_llseek(struct file *file, loff_t offset, int origin)
+{
+	int size;
+
+	if (ppc_md.nvram_size == NULL)
+		return -ENODEV;
+	size = ppc_md.nvram_size();
+
+	switch (origin) {
+	case 1:
+		offset += file->f_pos;
+		break;
+	case 2:
+		offset += size;
+		break;
+	}
+	if (offset < 0)
+		return -EINVAL;
+	file->f_pos = offset;
+	return file->f_pos;
+}
+
+
+static ssize_t dev_nvram_read(struct file *file, char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	ssize_t len;
+	char *tmp_buffer;
+	int size;
+
+	if (ppc_md.nvram_size == NULL)
+		return -ENODEV;
+	size = ppc_md.nvram_size();
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+	if (*ppos >= size)
+		return 0;
+	if (count > size) 
+		count = size;
+
+	tmp_buffer = (char *) kmalloc(count, GFP_KERNEL);
+	if (!tmp_buffer) {
+		printk(KERN_ERR "dev_read_nvram: kmalloc failed\n");
+		return -ENOMEM;
+	}
+
+	len = ppc_md.nvram_read(tmp_buffer, count, ppos);
+	if ((long)len <= 0) {
+		kfree(tmp_buffer);
+		return len;
+	}
+
+	if (copy_to_user(buf, tmp_buffer, len)) {
+		kfree(tmp_buffer);
+		return -EFAULT;
+	}
+
+	kfree(tmp_buffer);
+	return len;
+
+}
+
+static ssize_t dev_nvram_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	ssize_t len;
+	char * tmp_buffer;
+	int size;
+
+	if (ppc_md.nvram_size == NULL)
+		return -ENODEV;
+	size = ppc_md.nvram_size();
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+	if (*ppos >= size)
+		return 0;
+	if (count > size)
+		count = size;
+
+	tmp_buffer = (char *) kmalloc(count, GFP_KERNEL);
+	if (!tmp_buffer) {
+		printk(KERN_ERR "dev_nvram_write: kmalloc failed\n");
+		return -ENOMEM;
+	}
+	
+	if (copy_from_user(tmp_buffer, buf, count)) {
+		kfree(tmp_buffer);
+		return -EFAULT;
+	}
+
+	len = ppc_md.nvram_write(tmp_buffer, count, ppos);
+	if ((long)len <= 0) {
+		kfree(tmp_buffer);
+		return len;
+	}
+
+	kfree(tmp_buffer);
+	return len;
+}
+
+static int dev_nvram_ioctl(struct inode *inode, struct file *file,
+	unsigned int cmd, unsigned long arg)
+{
+	switch(cmd) {
+#ifdef CONFIG_PPC_PMAC
+	case OBSOLETE_PMAC_NVRAM_GET_OFFSET:
+		printk(KERN_WARNING "nvram: Using obsolete PMAC_NVRAM_GET_OFFSET ioctl\n");
+	case IOC_NVRAM_GET_OFFSET: {
+		int part, offset;
+
+		if (systemcfg->platform != PLATFORM_POWERMAC)
+			return -EINVAL;
+		if (copy_from_user(&part, (void __user*)arg, sizeof(part)) != 0)
+			return -EFAULT;
+		if (part < pmac_nvram_OF || part > pmac_nvram_NR)
+			return -EINVAL;
+		offset = pmac_get_partition(part);
+		if (offset < 0)
+			return offset;
+		if (copy_to_user((void __user*)arg, &offset, sizeof(offset)) != 0)
+			return -EFAULT;
+		return 0;
+	}
+#endif /* CONFIG_PPC_PMAC */
+	}
+	return -EINVAL;
+}
+
+struct file_operations nvram_fops = {
+	.owner =	THIS_MODULE,
+	.llseek =	dev_nvram_llseek,
+	.read =		dev_nvram_read,
+	.write =	dev_nvram_write,
+	.ioctl =	dev_nvram_ioctl,
+};
+
+static struct miscdevice nvram_dev = {
+	NVRAM_MINOR,
+	"nvram",
+	&nvram_fops
+};
+
+
+#ifdef DEBUG_NVRAM
+static void nvram_print_partitions(char * label)
+{
+	struct list_head * p;
+	struct nvram_partition * tmp_part;
+	
+	printk(KERN_WARNING "--------%s---------\n", label);
+	printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n");
+	list_for_each(p, &nvram_part->partition) {
+		tmp_part = list_entry(p, struct nvram_partition, partition);
+		printk(KERN_WARNING "%d    \t%02x\t%02x\t%d\t%s\n",
+		       tmp_part->index, tmp_part->header.signature,
+		       tmp_part->header.checksum, tmp_part->header.length,
+		       tmp_part->header.name);
+	}
+}
+#endif
+
+
+static int nvram_write_header(struct nvram_partition * part)
+{
+	loff_t tmp_index;
+	int rc;
+	
+	tmp_index = part->index;
+	rc = ppc_md.nvram_write((char *)&part->header, NVRAM_HEADER_LEN, &tmp_index); 
+
+	return rc;
+}
+
+
+static unsigned char nvram_checksum(struct nvram_header *p)
+{
+	unsigned int c_sum, c_sum2;
+	unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */
+	c_sum = p->signature + p->length + sp[0] + sp[1] + sp[2] + sp[3] + sp[4] + sp[5];
+
+	/* The sum may have spilled into the 3rd byte.  Fold it back. */
+	c_sum = ((c_sum & 0xffff) + (c_sum >> 16)) & 0xffff;
+	/* The sum cannot exceed 2 bytes.  Fold it into a checksum */
+	c_sum2 = (c_sum >> 8) + (c_sum << 8);
+	c_sum = ((c_sum + c_sum2) >> 8) & 0xff;
+	return c_sum;
+}
+
+
+/*
+ * Find an nvram partition, sig can be 0 for any
+ * partition or name can be NULL for any name, else
+ * tries to match both
+ */
+struct nvram_partition *nvram_find_partition(int sig, const char *name)
+{
+	struct nvram_partition * part;
+	struct list_head * p;
+
+	list_for_each(p, &nvram_part->partition) {
+		part = list_entry(p, struct nvram_partition, partition);
+
+		if (sig && part->header.signature != sig)
+			continue;
+		if (name && 0 != strncmp(name, part->header.name, 12))
+			continue;
+		return part; 
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(nvram_find_partition);
+
+
+static int nvram_remove_os_partition(void)
+{
+	struct list_head *i;
+	struct list_head *j;
+	struct nvram_partition * part;
+	struct nvram_partition * cur_part;
+	int rc;
+
+	list_for_each(i, &nvram_part->partition) {
+		part = list_entry(i, struct nvram_partition, partition);
+		if (part->header.signature != NVRAM_SIG_OS)
+			continue;
+		
+		/* Make os partition a free partition */
+		part->header.signature = NVRAM_SIG_FREE;
+		sprintf(part->header.name, "wwwwwwwwwwww");
+		part->header.checksum = nvram_checksum(&part->header);
+
+		/* Merge contiguous free partitions backwards */
+		list_for_each_prev(j, &part->partition) {
+			cur_part = list_entry(j, struct nvram_partition, partition);
+			if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) {
+				break;
+			}
+			
+			part->header.length += cur_part->header.length;
+			part->header.checksum = nvram_checksum(&part->header);
+			part->index = cur_part->index;
+
+			list_del(&cur_part->partition);
+			kfree(cur_part);
+			j = &part->partition; /* fixup our loop */
+		}
+		
+		/* Merge contiguous free partitions forwards */
+		list_for_each(j, &part->partition) {
+			cur_part = list_entry(j, struct nvram_partition, partition);
+			if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) {
+				break;
+			}
+
+			part->header.length += cur_part->header.length;
+			part->header.checksum = nvram_checksum(&part->header);
+
+			list_del(&cur_part->partition);
+			kfree(cur_part);
+			j = &part->partition; /* fixup our loop */
+		}
+		
+		rc = nvram_write_header(part);
+		if (rc <= 0) {
+			printk(KERN_ERR "nvram_remove_os_partition: nvram_write failed (%d)\n", rc);
+			return rc;
+		}
+
+	}
+	
+	return 0;
+}
+
+/* nvram_create_os_partition
+ *
+ * Create a OS linux partition to buffer error logs.
+ * Will create a partition starting at the first free
+ * space found if space has enough room.
+ */
+static int nvram_create_os_partition(void)
+{
+	struct list_head * p;
+	struct nvram_partition * part;
+	struct nvram_partition * new_part = NULL;
+	struct nvram_partition * free_part = NULL;
+	int seq_init[2] = { 0, 0 };
+	loff_t tmp_index;
+	long size = 0;
+	int rc;
+	
+	/* Find a free partition that will give us the maximum needed size 
+	   If can't find one that will give us the minimum size needed */
+	list_for_each(p, &nvram_part->partition) {
+		part = list_entry(p, struct nvram_partition, partition);
+		if (part->header.signature != NVRAM_SIG_FREE)
+			continue;
+
+		if (part->header.length >= NVRAM_MAX_REQ) {
+			size = NVRAM_MAX_REQ;
+			free_part = part;
+			break;
+		}
+		if (!size && part->header.length >= NVRAM_MIN_REQ) {
+			size = NVRAM_MIN_REQ;
+			free_part = part;
+		}
+	}
+	if (!size) {
+		return -ENOSPC;
+	}
+	
+	/* Create our OS partition */
+	new_part = (struct nvram_partition *)
+		kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
+	if (!new_part) {
+		printk(KERN_ERR "nvram_create_os_partition: kmalloc failed\n");
+		return -ENOMEM;
+	}
+
+	new_part->index = free_part->index;
+	new_part->header.signature = NVRAM_SIG_OS;
+	new_part->header.length = size;
+	sprintf(new_part->header.name, "ppc64,linux");
+	new_part->header.checksum = nvram_checksum(&new_part->header);
+
+	rc = nvram_write_header(new_part);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_create_os_partition: nvram_write_header \
+				failed (%d)\n", rc);
+		return rc;
+	}
+
+	/* make sure and initialize to zero the sequence number and the error
+	   type logged */
+	tmp_index = new_part->index + NVRAM_HEADER_LEN;
+	rc = ppc_md.nvram_write((char *)&seq_init, sizeof(seq_init), &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_create_os_partition: nvram_write failed (%d)\n", rc);
+		return rc;
+	}
+	
+	nvram_error_log_index = new_part->index + NVRAM_HEADER_LEN;
+	nvram_error_log_size = ((part->header.length - 1) *
+				NVRAM_BLOCK_LEN) - sizeof(struct err_log_info);
+	
+	list_add_tail(&new_part->partition, &free_part->partition);
+
+	if (free_part->header.length <= size) {
+		list_del(&free_part->partition);
+		kfree(free_part);
+		return 0;
+	} 
+
+	/* Adjust the partition we stole the space from */
+	free_part->index += size * NVRAM_BLOCK_LEN;
+	free_part->header.length -= size;
+	free_part->header.checksum = nvram_checksum(&free_part->header);
+	
+	rc = nvram_write_header(free_part);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_create_os_partition: nvram_write_header "
+		       "failed (%d)\n", rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+
+/* nvram_setup_partition
+ *
+ * This will setup the partition we need for buffering the
+ * error logs and cleanup partitions if needed.
+ *
+ * The general strategy is the following:
+ * 1.) If there is ppc64,linux partition large enough then use it.
+ * 2.) If there is not a ppc64,linux partition large enough, search
+ * for a free partition that is large enough.
+ * 3.) If there is not a free partition large enough remove 
+ * _all_ OS partitions and consolidate the space.
+ * 4.) Will first try getting a chunk that will satisfy the maximum
+ * error log size (NVRAM_MAX_REQ).
+ * 5.) If the max chunk cannot be allocated then try finding a chunk
+ * that will satisfy the minum needed (NVRAM_MIN_REQ).
+ */
+static int nvram_setup_partition(void)
+{
+	struct list_head * p;
+	struct nvram_partition * part;
+	int rc;
+
+	/* For now, we don't do any of this on pmac, until I
+	 * have figured out if it's worth killing some unused stuffs
+	 * in our nvram, as Apple defined partitions use pretty much
+	 * all of the space
+	 */
+	if (systemcfg->platform == PLATFORM_POWERMAC)
+		return -ENOSPC;
+
+	/* see if we have an OS partition that meets our needs.
+	   will try getting the max we need.  If not we'll delete
+	   partitions and try again. */
+	list_for_each(p, &nvram_part->partition) {
+		part = list_entry(p, struct nvram_partition, partition);
+		if (part->header.signature != NVRAM_SIG_OS)
+			continue;
+
+		if (strcmp(part->header.name, "ppc64,linux"))
+			continue;
+
+		if (part->header.length >= NVRAM_MIN_REQ) {
+			/* found our partition */
+			nvram_error_log_index = part->index + NVRAM_HEADER_LEN;
+			nvram_error_log_size = ((part->header.length - 1) *
+						NVRAM_BLOCK_LEN) - sizeof(struct err_log_info);
+			return 0;
+		}
+	}
+	
+	/* try creating a partition with the free space we have */
+	rc = nvram_create_os_partition();
+	if (!rc) {
+		return 0;
+	}
+		
+	/* need to free up some space */
+	rc = nvram_remove_os_partition();
+	if (rc) {
+		return rc;
+	}
+	
+	/* create a partition in this new space */
+	rc = nvram_create_os_partition();
+	if (rc) {
+		printk(KERN_ERR "nvram_create_os_partition: Could not find a "
+		       "NVRAM partition large enough\n");
+		return rc;
+	}
+	
+	return 0;
+}
+
+
+static int nvram_scan_partitions(void)
+{
+	loff_t cur_index = 0;
+	struct nvram_header phead;
+	struct nvram_partition * tmp_part;
+	unsigned char c_sum;
+	char * header;
+	int total_size;
+	int err;
+
+	if (ppc_md.nvram_size == NULL)
+		return -ENODEV;
+	total_size = ppc_md.nvram_size();
+	
+	header = (char *) kmalloc(NVRAM_HEADER_LEN, GFP_KERNEL);
+	if (!header) {
+		printk(KERN_ERR "nvram_scan_partitions: Failed kmalloc\n");
+		return -ENOMEM;
+	}
+
+	while (cur_index < total_size) {
+
+		err = ppc_md.nvram_read(header, NVRAM_HEADER_LEN, &cur_index);
+		if (err != NVRAM_HEADER_LEN) {
+			printk(KERN_ERR "nvram_scan_partitions: Error parsing "
+			       "nvram partitions\n");
+			goto out;
+		}
+
+		cur_index -= NVRAM_HEADER_LEN; /* nvram_read will advance us */
+
+		memcpy(&phead, header, NVRAM_HEADER_LEN);
+
+		err = 0;
+		c_sum = nvram_checksum(&phead);
+		if (c_sum != phead.checksum) {
+			printk(KERN_WARNING "WARNING: nvram partition checksum"
+			       " was %02x, should be %02x!\n",
+			       phead.checksum, c_sum);
+			printk(KERN_WARNING "Terminating nvram partition scan\n");
+			goto out;
+		}
+		if (!phead.length) {
+			printk(KERN_WARNING "WARNING: nvram corruption "
+			       "detected: 0-length partition\n");
+			goto out;
+		}
+		tmp_part = (struct nvram_partition *)
+			kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
+		err = -ENOMEM;
+		if (!tmp_part) {
+			printk(KERN_ERR "nvram_scan_partitions: kmalloc failed\n");
+			goto out;
+		}
+		
+		memcpy(&tmp_part->header, &phead, NVRAM_HEADER_LEN);
+		tmp_part->index = cur_index;
+		list_add_tail(&tmp_part->partition, &nvram_part->partition);
+		
+		cur_index += phead.length * NVRAM_BLOCK_LEN;
+	}
+	err = 0;
+
+ out:
+	kfree(header);
+	return err;
+}
+
+static int __init nvram_init(void)
+{
+	int error;
+	int rc;
+	
+	if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0)
+		return  -ENODEV;
+
+  	rc = misc_register(&nvram_dev);
+	if (rc != 0) {
+		printk(KERN_ERR "nvram_init: failed to register device\n");
+		return rc;
+	}
+  	
+  	/* initialize our anchor for the nvram partition list */
+  	nvram_part = (struct nvram_partition *) kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
+  	if (!nvram_part) {
+  		printk(KERN_ERR "nvram_init: Failed kmalloc\n");
+  		return -ENOMEM;
+  	}
+  	INIT_LIST_HEAD(&nvram_part->partition);
+  
+  	/* Get all the NVRAM partitions */
+  	error = nvram_scan_partitions();
+  	if (error) {
+  		printk(KERN_ERR "nvram_init: Failed nvram_scan_partitions\n");
+  		return error;
+  	}
+  		
+  	if(nvram_setup_partition()) 
+  		printk(KERN_WARNING "nvram_init: Could not find nvram partition"
+  		       " for nvram buffered error logging.\n");
+  
+#ifdef DEBUG_NVRAM
+	nvram_print_partitions("NVRAM Partitions");
+#endif
+
+  	return rc;
+}
+
+void __exit nvram_cleanup(void)
+{
+        misc_deregister( &nvram_dev );
+}
+
+
+#ifdef CONFIG_PPC_PSERIES
+
+/* nvram_write_error_log
+ *
+ * We need to buffer the error logs into nvram to ensure that we have
+ * the failure information to decode.  If we have a severe error there
+ * is no way to guarantee that the OS or the machine is in a state to
+ * get back to user land and write the error to disk.  For example if
+ * the SCSI device driver causes a Machine Check by writing to a bad
+ * IO address, there is no way of guaranteeing that the device driver
+ * is in any state that is would also be able to write the error data
+ * captured to disk, thus we buffer it in NVRAM for analysis on the
+ * next boot.
+ *
+ * In NVRAM the partition containing the error log buffer will looks like:
+ * Header (in bytes):
+ * +-----------+----------+--------+------------+------------------+
+ * | signature | checksum | length | name       | data             |
+ * |0          |1         |2      3|4         15|16        length-1|
+ * +-----------+----------+--------+------------+------------------+
+ *
+ * The 'data' section would look like (in bytes):
+ * +--------------+------------+-----------------------------------+
+ * | event_logged | sequence # | error log                         |
+ * |0            3|4          7|8            nvram_error_log_size-1|
+ * +--------------+------------+-----------------------------------+
+ *
+ * event_logged: 0 if event has not been logged to syslog, 1 if it has
+ * sequence #: The unique sequence # for each event. (until it wraps)
+ * error log: The error log from event_scan
+ */
+int nvram_write_error_log(char * buff, int length, unsigned int err_type)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+	
+	if (no_logging) {
+		return -EPERM;
+	}
+
+	if (nvram_error_log_index == -1) {
+		return -ESPIPE;
+	}
+
+	if (length > nvram_error_log_size) {
+		length = nvram_error_log_size;
+	}
+
+	info.error_type = err_type;
+	info.seq_num = error_log_cnt;
+
+	tmp_index = nvram_error_log_index;
+
+	rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
+		return rc;
+	}
+
+	rc = ppc_md.nvram_write(buff, length, &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
+		return rc;
+	}
+	
+	return 0;
+}
+
+/* nvram_read_error_log
+ *
+ * Reads nvram for error log for at most 'length'
+ */
+int nvram_read_error_log(char * buff, int length, unsigned int * err_type)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+	
+	if (nvram_error_log_index == -1)
+		return -1;
+
+	if (length > nvram_error_log_size)
+		length = nvram_error_log_size;
+
+	tmp_index = nvram_error_log_index;
+
+	rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+		return rc;
+	}
+
+	rc = ppc_md.nvram_read(buff, length, &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+		return rc;
+	}
+
+	error_log_cnt = info.seq_num;
+	*err_type = info.error_type;
+
+	return 0;
+}
+
+/* This doesn't actually zero anything, but it sets the event_logged
+ * word to tell that this event is safely in syslog.
+ */
+int nvram_clear_error_log(void)
+{
+	loff_t tmp_index;
+	int clear_word = ERR_FLAG_ALREADY_LOGGED;
+	int rc;
+
+	tmp_index = nvram_error_log_index;
+	
+	rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
+	if (rc <= 0) {
+		printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+#endif /* CONFIG_PPC_PSERIES */
+
+module_init(nvram_init);
+module_exit(nvram_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/ppc64/kernel/of_device.c b/arch/ppc64/kernel/of_device.c
new file mode 100644
index 00000000000..b27a75f1b98
--- /dev/null
+++ b/arch/ppc64/kernel/of_device.c
@@ -0,0 +1,272 @@
+#include <linux/config.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/errno.h>
+#include <asm/of_device.h>
+
+/**
+ * of_match_device - Tell if an of_device structure has a matching
+ * of_match structure
+ * @ids: array of of device match structures to search in
+ * @dev: the of device structure to match against
+ *
+ * Used by a driver to check whether an of_device present in the
+ * system is in its list of supported devices.
+ */
+const struct of_match * of_match_device(const struct of_match *matches,
+					const struct of_device *dev)
+{
+	if (!dev->node)
+		return NULL;
+	while (matches->name || matches->type || matches->compatible) {
+		int match = 1;
+		if (matches->name && matches->name != OF_ANY_MATCH)
+			match &= dev->node->name
+				&& !strcmp(matches->name, dev->node->name);
+		if (matches->type && matches->type != OF_ANY_MATCH)
+			match &= dev->node->type
+				&& !strcmp(matches->type, dev->node->type);
+		if (matches->compatible && matches->compatible != OF_ANY_MATCH)
+			match &= device_is_compatible(dev->node,
+				matches->compatible);
+		if (match)
+			return matches;
+		matches++;
+	}
+	return NULL;
+}
+
+static int of_platform_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct of_device * of_dev = to_of_device(dev);
+	struct of_platform_driver * of_drv = to_of_platform_driver(drv);
+	const struct of_match * matches = of_drv->match_table;
+
+	if (!matches)
+		return 0;
+
+	return of_match_device(matches, of_dev) != NULL;
+}
+
+struct of_device *of_dev_get(struct of_device *dev)
+{
+	struct device *tmp;
+
+	if (!dev)
+		return NULL;
+	tmp = get_device(&dev->dev);
+	if (tmp)
+		return to_of_device(tmp);
+	else
+		return NULL;
+}
+
+void of_dev_put(struct of_device *dev)
+{
+	if (dev)
+		put_device(&dev->dev);
+}
+
+
+static int of_device_probe(struct device *dev)
+{
+	int error = -ENODEV;
+	struct of_platform_driver *drv;
+	struct of_device *of_dev;
+	const struct of_match *match;
+
+	drv = to_of_platform_driver(dev->driver);
+	of_dev = to_of_device(dev);
+
+	if (!drv->probe)
+		return error;
+
+	of_dev_get(of_dev);
+
+	match = of_match_device(drv->match_table, of_dev);
+	if (match)
+		error = drv->probe(of_dev, match);
+	if (error)
+		of_dev_put(of_dev);
+
+	return error;
+}
+
+static int of_device_remove(struct device *dev)
+{
+	struct of_device * of_dev = to_of_device(dev);
+	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
+
+	if (dev->driver && drv->remove)
+		drv->remove(of_dev);
+	return 0;
+}
+
+static int of_device_suspend(struct device *dev, u32 state)
+{
+	struct of_device * of_dev = to_of_device(dev);
+	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
+	int error = 0;
+
+	if (dev->driver && drv->suspend)
+		error = drv->suspend(of_dev, state);
+	return error;
+}
+
+static int of_device_resume(struct device * dev)
+{
+	struct of_device * of_dev = to_of_device(dev);
+	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
+	int error = 0;
+
+	if (dev->driver && drv->resume)
+		error = drv->resume(of_dev);
+	return error;
+}
+
+struct bus_type of_platform_bus_type = {
+       .name	= "of_platform",
+       .match	= of_platform_bus_match,
+       .suspend	= of_device_suspend,
+       .resume	= of_device_resume,
+};
+
+static int __init of_bus_driver_init(void)
+{
+	return bus_register(&of_platform_bus_type);
+}
+
+postcore_initcall(of_bus_driver_init);
+
+int of_register_driver(struct of_platform_driver *drv)
+{
+	int count = 0;
+
+	/* initialize common driver fields */
+	drv->driver.name = drv->name;
+	drv->driver.bus = &of_platform_bus_type;
+	drv->driver.probe = of_device_probe;
+	drv->driver.remove = of_device_remove;
+
+	/* register with core */
+	count = driver_register(&drv->driver);
+	return count ? count : 1;
+}
+
+void of_unregister_driver(struct of_platform_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+
+
+static ssize_t dev_show_devspec(struct device *dev, char *buf)
+{
+	struct of_device *ofdev;
+
+	ofdev = to_of_device(dev);
+	return sprintf(buf, "%s", ofdev->node->full_name);
+}
+
+static DEVICE_ATTR(devspec, S_IRUGO, dev_show_devspec, NULL);
+
+/**
+ * of_release_dev - free an of device structure when all users of it are finished.
+ * @dev: device that's been disconnected
+ *
+ * Will be called only by the device core when all users of this of device are
+ * done.
+ */
+void of_release_dev(struct device *dev)
+{
+	struct of_device *ofdev;
+
+        ofdev = to_of_device(dev);
+	kfree(ofdev);
+}
+
+int of_device_register(struct of_device *ofdev)
+{
+	int rc;
+	struct of_device **odprop;
+
+	BUG_ON(ofdev->node == NULL);
+
+	odprop = (struct of_device **)get_property(ofdev->node, "linux,device", NULL);
+	if (!odprop) {
+		struct property *new_prop;
+	
+		new_prop = kmalloc(sizeof(struct property) + sizeof(struct of_device *),
+			GFP_KERNEL);
+		if (new_prop == NULL)
+			return -ENOMEM;
+		new_prop->name = "linux,device";
+		new_prop->length = sizeof(sizeof(struct of_device *));
+		new_prop->value = (unsigned char *)&new_prop[1];
+		odprop = (struct of_device **)new_prop->value;
+		*odprop = NULL;
+		prom_add_property(ofdev->node, new_prop);
+	}
+	*odprop = ofdev;
+
+	rc = device_register(&ofdev->dev);
+	if (rc)
+		return rc;
+
+	device_create_file(&ofdev->dev, &dev_attr_devspec);
+
+	return 0;
+}
+
+void of_device_unregister(struct of_device *ofdev)
+{
+	struct of_device **odprop;
+
+	device_remove_file(&ofdev->dev, &dev_attr_devspec);
+
+	odprop = (struct of_device **)get_property(ofdev->node, "linux,device", NULL);
+	if (odprop)
+		*odprop = NULL;
+
+	device_unregister(&ofdev->dev);
+}
+
+struct of_device* of_platform_device_create(struct device_node *np, const char *bus_id)
+{
+	struct of_device *dev;
+	u32 *reg;
+
+	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+	memset(dev, 0, sizeof(*dev));
+
+	dev->node = np;
+	dev->dma_mask = 0xffffffffUL;
+	dev->dev.dma_mask = &dev->dma_mask;
+	dev->dev.parent = NULL;
+	dev->dev.bus = &of_platform_bus_type;
+	dev->dev.release = of_release_dev;
+
+	reg = (u32 *)get_property(np, "reg", NULL);
+	strlcpy(dev->dev.bus_id, bus_id, BUS_ID_SIZE);
+
+	if (of_device_register(dev) != 0) {
+		kfree(dev);
+		return NULL;
+	}
+
+	return dev;
+}
+
+EXPORT_SYMBOL(of_match_device);
+EXPORT_SYMBOL(of_platform_bus_type);
+EXPORT_SYMBOL(of_register_driver);
+EXPORT_SYMBOL(of_unregister_driver);
+EXPORT_SYMBOL(of_device_register);
+EXPORT_SYMBOL(of_device_unregister);
+EXPORT_SYMBOL(of_dev_get);
+EXPORT_SYMBOL(of_dev_put);
+EXPORT_SYMBOL(of_platform_device_create);
+EXPORT_SYMBOL(of_release_dev);
diff --git a/arch/ppc64/kernel/pSeries_hvCall.S b/arch/ppc64/kernel/pSeries_hvCall.S
new file mode 100644
index 00000000000..0715d303801
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_hvCall.S
@@ -0,0 +1,123 @@
+/*
+ * arch/ppc64/kernel/pSeries_hvCall.S
+ *
+ * This file contains the generic code to perform a call to the
+ * pSeries LPAR hypervisor.
+ * NOTE: this file will go away when we move to inline this work.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/hvcall.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+	
+#define STK_PARM(i)     (48 + ((i)-3)*8)
+
+	.text
+
+/* long plpar_hcall(unsigned long opcode,		R3
+			unsigned long arg1,		R4
+			unsigned long arg2,		R5
+			unsigned long arg3,		R6
+			unsigned long arg4,		R7
+			unsigned long *out1,		R8
+			unsigned long *out2,		R9
+			unsigned long *out3);		R10
+ */
+_GLOBAL(plpar_hcall)
+	mfcr	r0
+
+	std	r8,STK_PARM(r8)(r1)	/* Save out ptrs */
+	std	r9,STK_PARM(r9)(r1)
+	std	r10,STK_PARM(r10)(r1)
+
+	stw	r0,8(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+
+	ld	r8,STK_PARM(r8)(r1)	/* Fetch r4-r6 ret args */
+	ld	r9,STK_PARM(r9)(r1)
+	ld	r10,STK_PARM(r10)(r1)
+	std	r4,0(r8)
+	std	r5,0(r9)
+	std	r6,0(r10)
+
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
+
+
+/* Simple interface with no output values (other than status) */
+_GLOBAL(plpar_hcall_norets)
+	mfcr	r0
+	stw	r0,8(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
+
+
+/* long plpar_hcall_8arg_2ret(unsigned long opcode,	R3
+			unsigned long arg1,		R4
+			unsigned long arg2,		R5
+			unsigned long arg3,		R6
+			unsigned long arg4,		R7
+			unsigned long arg5,		R8
+			unsigned long arg6,		R9
+			unsigned long arg7,		R10
+			unsigned long arg8,		112(R1)
+			unsigned long *out1);		120(R1)
+ */
+_GLOBAL(plpar_hcall_8arg_2ret)
+	mfcr	r0
+	ld	r11,STK_PARM(r11)(r1)	/* put arg8 in R11 */
+	stw	r0,8(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+	ld	r10,STK_PARM(r12)(r1)	/* Fetch r4 ret arg */
+	std	r4,0(r10)
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
+
+
+/* long plpar_hcall_4out(unsigned long opcode,		R3
+		 	unsigned long arg1,		R4
+		 	unsigned long arg2,		R5
+		 	unsigned long arg3,		R6
+		 	unsigned long arg4,		R7
+		 	unsigned long *out1,		R8
+		 	unsigned long *out2,		R9
+		 	unsigned long *out3,		R10
+		 	unsigned long *out4);		112(R1)
+ */
+_GLOBAL(plpar_hcall_4out)
+	mfcr	r0
+	stw	r0,8(r1)
+
+	std	r8,STK_PARM(r8)(r1)	/* Save out ptrs */
+	std	r9,STK_PARM(r9)(r1)
+	std	r10,STK_PARM(r10)(r1)
+
+	HVSC				/* invoke the hypervisor */
+
+	lwz	r0,8(r1)
+
+	ld	r8,STK_PARM(r8)(r1)	/* Fetch r4-r7 ret args */
+	ld	r9,STK_PARM(r9)(r1)
+	ld	r10,STK_PARM(r10)(r1)
+	ld	r11,STK_PARM(r11)(r1)
+	std	r4,0(r8)
+	std	r5,0(r9)
+	std	r6,0(r10)
+	std	r7,0(r11)
+
+	mtcrf	0xff,r0
+	blr				/* return r3 = status */
diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c
new file mode 100644
index 00000000000..69130522a87
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_iommu.c
@@ -0,0 +1,570 @@
+/*
+ * arch/ppc64/kernel/pSeries_iommu.c
+ *
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ *
+ * Rewrite, cleanup: 
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@austin.ibm.com>, IBM Corporation
+ *
+ * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
+ *
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/ppcdebug.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/systemcfg.h>
+#include "pci.h"
+
+#define DBG(fmt...)
+
+extern int is_python(struct device_node *);
+
+static void tce_build_pSeries(struct iommu_table *tbl, long index, 
+			      long npages, unsigned long uaddr, 
+			      enum dma_data_direction direction)
+{
+	union tce_entry t;
+	union tce_entry *tp;
+
+	t.te_word = 0;
+	t.te_rdwr = 1; // Read allowed 
+
+	if (direction != DMA_TO_DEVICE)
+		t.te_pciwr = 1;
+
+	tp = ((union tce_entry *)tbl->it_base) + index;
+
+	while (npages--) {
+		/* can't move this out since we might cross LMB boundary */
+		t.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT;
+	
+		tp->te_word = t.te_word;
+
+		uaddr += PAGE_SIZE;
+		tp++;
+	}
+}
+
+
+static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
+{
+	union tce_entry t;
+	union tce_entry *tp;
+
+	t.te_word = 0;
+	tp  = ((union tce_entry *)tbl->it_base) + index;
+		
+	while (npages--) {
+		tp->te_word = t.te_word;
+		
+		tp++;
+	}
+}
+
+
+static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+				long npages, unsigned long uaddr,
+				enum dma_data_direction direction)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	tce.te_word = 0;
+	tce.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT;
+	tce.te_rdwr = 1;
+	if (direction != DMA_TO_DEVICE)
+		tce.te_pciwr = 1;
+
+	while (npages--) {
+		rc = plpar_tce_put((u64)tbl->it_index, 
+				   (u64)tcenum << 12, 
+				   tce.te_word );
+		
+		if (rc && printk_ratelimit()) {
+			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+			printk("\ttcenum  = 0x%lx\n", (u64)tcenum);
+			printk("\ttce val = 0x%lx\n", tce.te_word );
+			show_stack(current, (unsigned long *)__get_SP());
+		}
+			
+		tcenum++;
+		tce.te_rpn++;
+	}
+}
+
+static DEFINE_PER_CPU(void *, tce_page) = NULL;
+
+static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+				     long npages, unsigned long uaddr,
+				     enum dma_data_direction direction)
+{
+	u64 rc;
+	union tce_entry tce, *tcep;
+	long l, limit;
+
+	if (npages == 1)
+		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+					   direction);
+
+	tcep = __get_cpu_var(tce_page);
+
+	/* This is safe to do since interrupts are off when we're called
+	 * from iommu_alloc{,_sg}()
+	 */
+	if (!tcep) {
+		tcep = (void *)__get_free_page(GFP_ATOMIC);
+		/* If allocation fails, fall back to the loop implementation */
+		if (!tcep)
+			return tce_build_pSeriesLP(tbl, tcenum, npages,
+						   uaddr, direction);
+		__get_cpu_var(tce_page) = tcep;
+	}
+
+	tce.te_word = 0;
+	tce.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT;
+	tce.te_rdwr = 1;
+	if (direction != DMA_TO_DEVICE)
+		tce.te_pciwr = 1;
+
+	/* We can map max one pageful of TCEs at a time */
+	do {
+		/*
+		 * Set up the page with TCE data, looping through and setting
+		 * the values.
+		 */
+		limit = min_t(long, npages, PAGE_SIZE/sizeof(union tce_entry));
+
+		for (l = 0; l < limit; l++) {
+			tcep[l] = tce;
+			tce.te_rpn++;
+		}
+
+		rc = plpar_tce_put_indirect((u64)tbl->it_index,
+					    (u64)tcenum << 12,
+					    (u64)virt_to_abs(tcep),
+					    limit);
+
+		npages -= limit;
+		tcenum += limit;
+	} while (npages > 0 && !rc);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+		printk("\tnpages  = 0x%lx\n", (u64)npages);
+		printk("\ttce[0] val = 0x%lx\n", tcep[0].te_word);
+		show_stack(current, (unsigned long *)__get_SP());
+	}
+}
+
+static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	tce.te_word = 0;
+
+	while (npages--) {
+		rc = plpar_tce_put((u64)tbl->it_index,
+				   (u64)tcenum << 12,
+				   tce.te_word);
+
+		if (rc && printk_ratelimit()) {
+			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+			printk("\ttcenum  = 0x%lx\n", (u64)tcenum);
+			printk("\ttce val = 0x%lx\n", tce.te_word );
+			show_stack(current, (unsigned long *)__get_SP());
+		}
+
+		tcenum++;
+	}
+}
+
+
+static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	tce.te_word = 0;
+
+	rc = plpar_tce_stuff((u64)tbl->it_index,
+			   (u64)tcenum << 12,
+			   tce.te_word,
+			   npages);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
+		printk("\trc      = %ld\n", rc);
+		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+		printk("\tnpages  = 0x%lx\n", (u64)npages);
+		printk("\ttce val = 0x%lx\n", tce.te_word );
+		show_stack(current, (unsigned long *)__get_SP());
+	}
+}
+
+static void iommu_table_setparms(struct pci_controller *phb,
+				 struct device_node *dn,
+				 struct iommu_table *tbl) 
+{
+	struct device_node *node;
+	unsigned long *basep;
+	unsigned int *sizep;
+
+	node = (struct device_node *)phb->arch_data;
+
+	basep = (unsigned long *)get_property(node, "linux,tce-base", NULL);
+	sizep = (unsigned int *)get_property(node, "linux,tce-size", NULL);
+	if (basep == NULL || sizep == NULL) {
+		printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "
+				"missing tce entries !\n", dn->full_name);
+		return;
+	}
+
+	tbl->it_base = (unsigned long)__va(*basep);
+	memset((void *)tbl->it_base, 0, *sizep);
+
+	tbl->it_busno = phb->bus->number;
+	
+	/* Units of tce entries */
+	tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
+	
+	/* Test if we are going over 2GB of DMA space */
+	if (phb->dma_window_base_cur + phb->dma_window_size > (1L << 31))
+		panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); 
+	
+	phb->dma_window_base_cur += phb->dma_window_size;
+
+	/* Set the tce table size - measured in entries */
+	tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
+
+	tbl->it_index = 0;
+	tbl->it_blocksize = 16;
+	tbl->it_type = TCE_PCI;
+}
+
+/*
+ * iommu_table_setparms_lpar
+ *
+ * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
+ *
+ * ToDo: properly interpret the ibm,dma-window property.  The definition is:
+ *	logical-bus-number	(1 word)
+ *	phys-address		(#address-cells words)
+ *	size			(#cell-size words)
+ *
+ * Currently we hard code these sizes (more or less).
+ */
+static void iommu_table_setparms_lpar(struct pci_controller *phb,
+				      struct device_node *dn,
+				      struct iommu_table *tbl,
+				      unsigned int *dma_window)
+{
+	tbl->it_busno  = dn->bussubno;
+
+	/* TODO: Parse field size properties properly. */
+	tbl->it_size   = (((unsigned long)dma_window[4] << 32) |
+			   (unsigned long)dma_window[5]) >> PAGE_SHIFT;
+	tbl->it_offset = (((unsigned long)dma_window[2] << 32) |
+			   (unsigned long)dma_window[3]) >> PAGE_SHIFT;
+	tbl->it_base   = 0;
+	tbl->it_index  = dma_window[0];
+	tbl->it_blocksize  = 16;
+	tbl->it_type = TCE_PCI;
+}
+
+static void iommu_bus_setup_pSeries(struct pci_bus *bus)
+{
+	struct device_node *dn, *pdn;
+	struct iommu_table *tbl;
+
+	DBG("iommu_bus_setup_pSeries, bus %p, bus->self %p\n", bus, bus->self);
+
+	/* For each (root) bus, we carve up the available DMA space in 256MB
+	 * pieces. Since each piece is used by one (sub) bus/device, that would
+	 * give a maximum of 7 devices per PHB. In most cases, this is plenty.
+	 *
+	 * The exception is on Python PHBs (pre-POWER4). Here we don't have EADS
+	 * bridges below the PHB to allocate the sectioned tables to, so instead
+	 * we allocate a 1GB table at the PHB level.
+	 */
+
+	dn = pci_bus_to_OF_node(bus);
+
+	if (!bus->self) {
+		/* Root bus */
+		if (is_python(dn)) {
+			unsigned int *iohole;
+
+			DBG("Python root bus %s\n", bus->name);
+
+			iohole = (unsigned int *)get_property(dn, "io-hole", 0);
+
+			if (iohole) {
+				/* On first bus we need to leave room for the
+				 * ISA address space. Just skip the first 256MB
+				 * alltogether. This leaves 768MB for the window.
+				 */
+				DBG("PHB has io-hole, reserving 256MB\n");
+				dn->phb->dma_window_size = 3 << 28;
+				dn->phb->dma_window_base_cur = 1 << 28;
+			} else {
+				/* 1GB window by default */
+				dn->phb->dma_window_size = 1 << 30;
+				dn->phb->dma_window_base_cur = 0;
+			}
+
+			tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+			iommu_table_setparms(dn->phb, dn, tbl);
+			dn->iommu_table = iommu_init_table(tbl);
+		} else {
+			/* Do a 128MB table at root. This is used for the IDE
+			 * controller on some SMP-mode POWER4 machines. It
+			 * doesn't hurt to allocate it on other machines
+			 * -- it'll just be unused since new tables are
+			 * allocated on the EADS level.
+			 *
+			 * Allocate at offset 128MB to avoid having to deal
+			 * with ISA holes; 128MB table for IDE is plenty.
+			 */
+			dn->phb->dma_window_size = 1 << 27;
+			dn->phb->dma_window_base_cur = 1 << 27;
+
+			tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+			iommu_table_setparms(dn->phb, dn, tbl);
+			dn->iommu_table = iommu_init_table(tbl);
+
+			/* All child buses have 256MB tables */
+			dn->phb->dma_window_size = 1 << 28;
+		}
+	} else {
+		pdn = pci_bus_to_OF_node(bus->parent);
+
+		if (!bus->parent->self && !is_python(pdn)) {
+			struct iommu_table *tbl;
+			/* First child and not python means this is the EADS
+			 * level. Allocate new table for this slot with 256MB
+			 * window.
+			 */
+
+			tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+			iommu_table_setparms(dn->phb, dn, tbl);
+
+			dn->iommu_table = iommu_init_table(tbl);
+		} else {
+			/* Lower than first child or under python, use parent table */
+			dn->iommu_table = pdn->iommu_table;
+		}
+	}
+}
+
+
+static void iommu_bus_setup_pSeriesLP(struct pci_bus *bus)
+{
+	struct iommu_table *tbl;
+	struct device_node *dn, *pdn;
+	unsigned int *dma_window = NULL;
+
+	DBG("iommu_bus_setup_pSeriesLP, bus %p, bus->self %p\n", bus, bus->self);
+
+	dn = pci_bus_to_OF_node(bus);
+
+	/* Find nearest ibm,dma-window, walking up the device tree */
+	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
+		dma_window = (unsigned int *)get_property(pdn, "ibm,dma-window", NULL);
+		if (dma_window != NULL)
+			break;
+	}
+
+	if (dma_window == NULL) {
+		DBG("iommu_bus_setup_pSeriesLP: bus %s seems to have no ibm,dma-window property\n", dn->full_name);
+		return;
+	}
+
+	if (!pdn->iommu_table) {
+		/* Bussubno hasn't been copied yet.
+		 * Do it now because iommu_table_setparms_lpar needs it.
+		 */
+		pdn->bussubno = bus->number;
+
+		tbl = (struct iommu_table *)kmalloc(sizeof(struct iommu_table),
+						    GFP_KERNEL);
+	
+		iommu_table_setparms_lpar(pdn->phb, pdn, tbl, dma_window);
+
+		pdn->iommu_table = iommu_init_table(tbl);
+	}
+
+	if (pdn != dn)
+		dn->iommu_table = pdn->iommu_table;
+}
+
+
+static void iommu_dev_setup_pSeries(struct pci_dev *dev)
+{
+	struct device_node *dn, *mydn;
+
+	DBG("iommu_dev_setup_pSeries, dev %p (%s)\n", dev, dev->pretty_name);
+	/* Now copy the iommu_table ptr from the bus device down to the
+	 * pci device_node.  This means get_iommu_table() won't need to search
+	 * up the device tree to find it.
+	 */
+	mydn = dn = pci_device_to_OF_node(dev);
+
+	while (dn && dn->iommu_table == NULL)
+		dn = dn->parent;
+
+	if (dn) {
+		mydn->iommu_table = dn->iommu_table;
+	} else {
+		DBG("iommu_dev_setup_pSeries, dev %p (%s) has no iommu table\n", dev, dev->pretty_name);
+	}
+}
+
+static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	int err = NOTIFY_OK;
+	struct device_node *np = node;
+
+	switch (action) {
+	case PSERIES_RECONFIG_REMOVE:
+		if (np->iommu_table &&
+		    get_property(np, "ibm,dma-window", NULL))
+			iommu_free_table(np);
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block iommu_reconfig_nb = {
+	.notifier_call = iommu_reconfig_notifier,
+};
+
+static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev)
+{
+	struct device_node *pdn, *dn;
+	struct iommu_table *tbl;
+	int *dma_window = NULL;
+
+	DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, dev->pretty_name);
+
+	/* dev setup for LPAR is a little tricky, since the device tree might
+	 * contain the dma-window properties per-device and not neccesarily
+	 * for the bus. So we need to search upwards in the tree until we
+	 * either hit a dma-window property, OR find a parent with a table
+	 * already allocated.
+	 */
+	dn = pci_device_to_OF_node(dev);
+
+	for (pdn = dn; pdn && !pdn->iommu_table; pdn = pdn->parent) {
+		dma_window = (unsigned int *)get_property(pdn, "ibm,dma-window", NULL);
+		if (dma_window)
+			break;
+	}
+
+	/* Check for parent == NULL so we don't try to setup the empty EADS
+	 * slots on POWER4 machines.
+	 */
+	if (dma_window == NULL || pdn->parent == NULL) {
+		/* Fall back to regular (non-LPAR) dev setup */
+		DBG("No dma window for device, falling back to regular setup\n");
+		iommu_dev_setup_pSeries(dev);
+		return;
+	} else {
+		DBG("Found DMA window, allocating table\n");
+	}
+
+	if (!pdn->iommu_table) {
+		/* iommu_table_setparms_lpar needs bussubno. */
+		pdn->bussubno = pdn->phb->bus->number;
+
+		tbl = (struct iommu_table *)kmalloc(sizeof(struct iommu_table),
+						    GFP_KERNEL);
+
+		iommu_table_setparms_lpar(pdn->phb, pdn, tbl, dma_window);
+
+		pdn->iommu_table = iommu_init_table(tbl);
+	}
+
+	if (pdn != dn)
+		dn->iommu_table = pdn->iommu_table;
+}
+
+static void iommu_bus_setup_null(struct pci_bus *b) { }
+static void iommu_dev_setup_null(struct pci_dev *d) { }
+
+/* These are called very early. */
+void iommu_init_early_pSeries(void)
+{
+	if (of_chosen && get_property(of_chosen, "linux,iommu-off", NULL)) {
+		/* Direct I/O, IOMMU off */
+		ppc_md.iommu_dev_setup = iommu_dev_setup_null;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_null;
+		pci_direct_iommu_init();
+
+		return;
+	}
+
+	if (systemcfg->platform & PLATFORM_LPAR) {
+		if (cur_cpu_spec->firmware_features & FW_FEATURE_MULTITCE) {
+			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
+			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
+		} else {
+			ppc_md.tce_build = tce_build_pSeriesLP;
+			ppc_md.tce_free	 = tce_free_pSeriesLP;
+		}
+		ppc_md.iommu_bus_setup = iommu_bus_setup_pSeriesLP;
+		ppc_md.iommu_dev_setup = iommu_dev_setup_pSeriesLP;
+	} else {
+		ppc_md.tce_build = tce_build_pSeries;
+		ppc_md.tce_free  = tce_free_pSeries;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_pSeries;
+		ppc_md.iommu_dev_setup = iommu_dev_setup_pSeries;
+	}
+
+
+	pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
+
+	pci_iommu_init();
+}
+
diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c
new file mode 100644
index 00000000000..6534812db43
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_lpar.c
@@ -0,0 +1,531 @@
+/*
+ * pSeries_lpar.c
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * pSeries LPAR support.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#define DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/mmu_context.h>
+#include <asm/ppcdebug.h>
+#include <asm/iommu.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/prom.h>
+#include <asm/abs_addr.h>
+#include <asm/cputable.h>
+#include <asm/plpar_wrappers.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/* in pSeries_hvCall.S */
+EXPORT_SYMBOL(plpar_hcall);
+EXPORT_SYMBOL(plpar_hcall_4out);
+EXPORT_SYMBOL(plpar_hcall_norets);
+EXPORT_SYMBOL(plpar_hcall_8arg_2ret);
+
+extern void fw_feature_init(void);
+extern void pSeries_find_serial_port(void);
+
+
+int vtermno;	/* virtual terminal# for udbg  */
+
+#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
+static void udbg_hvsi_putc(unsigned char c)
+{
+	/* packet's seqno isn't used anyways */
+	uint8_t packet[] __ALIGNED__ = { 0xff, 5, 0, 0, c };
+	int rc;
+
+	if (c == '\n')
+		udbg_hvsi_putc('\r');
+
+	do {
+		rc = plpar_put_term_char(vtermno, sizeof(packet), packet);
+	} while (rc == H_Busy);
+}
+
+static long hvsi_udbg_buf_len;
+static uint8_t hvsi_udbg_buf[256];
+
+static int udbg_hvsi_getc_poll(void)
+{
+	unsigned char ch;
+	int rc, i;
+
+	if (hvsi_udbg_buf_len == 0) {
+		rc = plpar_get_term_char(vtermno, &hvsi_udbg_buf_len, hvsi_udbg_buf);
+		if (rc != H_Success || hvsi_udbg_buf[0] != 0xff) {
+			/* bad read or non-data packet */
+			hvsi_udbg_buf_len = 0;
+		} else {
+			/* remove the packet header */
+			for (i = 4; i < hvsi_udbg_buf_len; i++)
+				hvsi_udbg_buf[i-4] = hvsi_udbg_buf[i];
+			hvsi_udbg_buf_len -= 4;
+		}
+	}
+
+	if (hvsi_udbg_buf_len <= 0 || hvsi_udbg_buf_len > 256) {
+		/* no data ready */
+		hvsi_udbg_buf_len = 0;
+		return -1;
+	}
+
+	ch = hvsi_udbg_buf[0];
+	/* shift remaining data down */
+	for (i = 1; i < hvsi_udbg_buf_len; i++) {
+		hvsi_udbg_buf[i-1] = hvsi_udbg_buf[i];
+	}
+	hvsi_udbg_buf_len--;
+
+	return ch;
+}
+
+static unsigned char udbg_hvsi_getc(void)
+{
+	int ch;
+	for (;;) {
+		ch = udbg_hvsi_getc_poll();
+		if (ch == -1) {
+			/* This shouldn't be needed...but... */
+			volatile unsigned long delay;
+			for (delay=0; delay < 2000000; delay++)
+				;
+		} else {
+			return ch;
+		}
+	}
+}
+
+static void udbg_putcLP(unsigned char c)
+{
+	char buf[16];
+	unsigned long rc;
+
+	if (c == '\n')
+		udbg_putcLP('\r');
+
+	buf[0] = c;
+	do {
+		rc = plpar_put_term_char(vtermno, 1, buf);
+	} while(rc == H_Busy);
+}
+
+/* Buffered chars getc */
+static long inbuflen;
+static long inbuf[2];	/* must be 2 longs */
+
+static int udbg_getc_pollLP(void)
+{
+	/* The interface is tricky because it may return up to 16 chars.
+	 * We save them statically for future calls to udbg_getc().
+	 */
+	char ch, *buf = (char *)inbuf;
+	int i;
+	long rc;
+	if (inbuflen == 0) {
+		/* get some more chars. */
+		inbuflen = 0;
+		rc = plpar_get_term_char(vtermno, &inbuflen, buf);
+		if (rc != H_Success)
+			inbuflen = 0;	/* otherwise inbuflen is garbage */
+	}
+	if (inbuflen <= 0 || inbuflen > 16) {
+		/* Catch error case as well as other oddities (corruption) */
+		inbuflen = 0;
+		return -1;
+	}
+	ch = buf[0];
+	for (i = 1; i < inbuflen; i++)	/* shuffle them down. */
+		buf[i-1] = buf[i];
+	inbuflen--;
+	return ch;
+}
+
+static unsigned char udbg_getcLP(void)
+{
+	int ch;
+	for (;;) {
+		ch = udbg_getc_pollLP();
+		if (ch == -1) {
+			/* This shouldn't be needed...but... */
+			volatile unsigned long delay;
+			for (delay=0; delay < 2000000; delay++)
+				;
+		} else {
+			return ch;
+		}
+	}
+}
+
+/* call this from early_init() for a working debug console on
+ * vterm capable LPAR machines
+ */
+void udbg_init_debug_lpar(void)
+{
+	vtermno = 0;
+	ppc_md.udbg_putc = udbg_putcLP;
+	ppc_md.udbg_getc = udbg_getcLP;
+	ppc_md.udbg_getc_poll = udbg_getc_pollLP;
+}
+
+/* returns 0 if couldn't find or use /chosen/stdout as console */
+int find_udbg_vterm(void)
+{
+	struct device_node *stdout_node;
+	u32 *termno;
+	char *name;
+	int found = 0;
+
+	/* find the boot console from /chosen/stdout */
+	if (!of_chosen)
+		return 0;
+	name = (char *)get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name == NULL)
+		return 0;
+	stdout_node = of_find_node_by_path(name);
+	if (!stdout_node)
+		return 0;
+
+	/* now we have the stdout node; figure out what type of device it is. */
+	name = (char *)get_property(stdout_node, "name", NULL);
+	if (!name) {
+		printk(KERN_WARNING "stdout node missing 'name' property!\n");
+		goto out;
+	}
+
+	if (strncmp(name, "vty", 3) == 0) {
+		if (device_is_compatible(stdout_node, "hvterm1")) {
+			termno = (u32 *)get_property(stdout_node, "reg", NULL);
+			if (termno) {
+				vtermno = termno[0];
+				ppc_md.udbg_putc = udbg_putcLP;
+				ppc_md.udbg_getc = udbg_getcLP;
+				ppc_md.udbg_getc_poll = udbg_getc_pollLP;
+				found = 1;
+			}
+		} else if (device_is_compatible(stdout_node, "hvterm-protocol")) {
+			termno = (u32 *)get_property(stdout_node, "reg", NULL);
+			if (termno) {
+				vtermno = termno[0];
+				ppc_md.udbg_putc = udbg_hvsi_putc;
+				ppc_md.udbg_getc = udbg_hvsi_getc;
+				ppc_md.udbg_getc_poll = udbg_hvsi_getc_poll;
+				found = 1;
+			}
+		}
+	} else if (strncmp(name, "serial", 6)) {
+		/* XXX fix ISA serial console */
+		printk(KERN_WARNING "serial stdout on LPAR ('%s')! "
+				"can't print udbg messages\n",
+		       stdout_node->full_name);
+	} else {
+		printk(KERN_WARNING "don't know how to print to stdout '%s'\n",
+		       stdout_node->full_name);
+	}
+
+out:
+	of_node_put(stdout_node);
+	return found;
+}
+
+void vpa_init(int cpu)
+{
+	int hwcpu = get_hard_smp_processor_id(cpu);
+	unsigned long vpa = (unsigned long)&(paca[cpu].lppaca);
+	long ret;
+	unsigned long flags;
+
+	/* Register the Virtual Processor Area (VPA) */
+	flags = 1UL << (63 - 18);
+	ret = register_vpa(flags, hwcpu, __pa(vpa));
+
+	if (ret)
+		printk(KERN_ERR "WARNING: vpa_init: VPA registration for "
+				"cpu %d (hw %d) of area %lx returns %ld\n",
+				cpu, hwcpu, __pa(vpa), ret);
+}
+
+long pSeries_lpar_hpte_insert(unsigned long hpte_group,
+			      unsigned long va, unsigned long prpn,
+			      int secondary, unsigned long hpteflags,
+			      int bolted, int large)
+{
+	unsigned long arpn = physRpn_to_absRpn(prpn);
+	unsigned long lpar_rc;
+	unsigned long flags;
+	unsigned long slot;
+	HPTE lhpte;
+	unsigned long dummy0, dummy1;
+
+	/* Fill in the local HPTE with absolute rpn, avpn and flags */
+	lhpte.dw1.dword1      = 0;
+	lhpte.dw1.dw1.rpn     = arpn;
+	lhpte.dw1.flags.flags = hpteflags;
+
+	lhpte.dw0.dword0      = 0;
+	lhpte.dw0.dw0.avpn    = va >> 23;
+	lhpte.dw0.dw0.h       = secondary;
+	lhpte.dw0.dw0.bolted  = bolted;
+	lhpte.dw0.dw0.v       = 1;
+
+	if (large) {
+		lhpte.dw0.dw0.l = 1;
+		lhpte.dw0.dw0.avpn &= ~0x1UL;
+	}
+
+	/* Now fill in the actual HPTE */
+	/* Set CEC cookie to 0         */
+	/* Zero page = 0               */
+	/* I-cache Invalidate = 0      */
+	/* I-cache synchronize = 0     */
+	/* Exact = 0                   */
+	flags = 0;
+
+	/* XXX why is this here? - Anton */
+	if (hpteflags & (_PAGE_GUARDED|_PAGE_NO_CACHE))
+		lhpte.dw1.flags.flags &= ~_PAGE_COHERENT;
+
+	lpar_rc = plpar_hcall(H_ENTER, flags, hpte_group, lhpte.dw0.dword0,
+			      lhpte.dw1.dword1, &slot, &dummy0, &dummy1);
+
+	if (unlikely(lpar_rc == H_PTEG_Full))
+		return -1;
+
+	/*
+	 * Since we try and ioremap PHBs we don't own, the pte insert
+	 * will fail. However we must catch the failure in hash_page
+	 * or we will loop forever, so return -2 in this case.
+	 */
+	if (unlikely(lpar_rc != H_Success))
+		return -2;
+
+	/* Because of iSeries, we have to pass down the secondary
+	 * bucket bit here as well
+	 */
+	return (slot & 7) | (secondary << 3);
+}
+
+static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
+
+static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
+{
+	unsigned long slot_offset;
+	unsigned long lpar_rc;
+	int i;
+	unsigned long dummy1, dummy2;
+
+	/* pick a random slot to start at */
+	slot_offset = mftb() & 0x7;
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+
+		/* don't remove a bolted entry */
+		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
+					   (0x1UL << 4), &dummy1, &dummy2);
+
+		if (lpar_rc == H_Success)
+			return i;
+
+		BUG_ON(lpar_rc != H_Not_Found);
+
+		slot_offset++;
+		slot_offset &= 0x7;
+	}
+
+	return -1;
+}
+
+static void pSeries_lpar_hptab_clear(void)
+{
+	unsigned long size_bytes = 1UL << ppc64_pft_size;
+	unsigned long hpte_count = size_bytes >> 4;
+	unsigned long dummy1, dummy2;
+	int i;
+
+	/* TODO: Use bulk call */
+	for (i = 0; i < hpte_count; i++)
+		plpar_pte_remove(0, i, 0, &dummy1, &dummy2);
+}
+
+/*
+ * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
+ * the low 3 bits of flags happen to line up.  So no transform is needed.
+ * We can probably optimize here and assume the high bits of newpp are
+ * already zero.  For now I am paranoid.
+ */
+static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp,
+				       unsigned long va, int large, int local)
+{
+	unsigned long lpar_rc;
+	unsigned long flags = (newpp & 7) | H_AVPN;
+	unsigned long avpn = va >> 23;
+
+	if (large)
+		avpn &= ~0x1UL;
+
+	lpar_rc = plpar_pte_protect(flags, slot, (avpn << 7));
+
+	if (lpar_rc == H_Not_Found)
+		return -1;
+
+	BUG_ON(lpar_rc != H_Success);
+
+	return 0;
+}
+
+static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
+{
+	unsigned long dword0;
+	unsigned long lpar_rc;
+	unsigned long dummy_word1;
+	unsigned long flags;
+
+	/* Read 1 pte at a time                        */
+	/* Do not need RPN to logical page translation */
+	/* No cross CEC PFT access                     */
+	flags = 0;
+
+	lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1);
+
+	BUG_ON(lpar_rc != H_Success);
+
+	return dword0;
+}
+
+static long pSeries_lpar_hpte_find(unsigned long vpn)
+{
+	unsigned long hash;
+	unsigned long i, j;
+	long slot;
+	union {
+		unsigned long dword0;
+		Hpte_dword0 dw0;
+	} hpte_dw0;
+	Hpte_dword0 dw0;
+
+	hash = hpt_hash(vpn, 0);
+
+	for (j = 0; j < 2; j++) {
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		for (i = 0; i < HPTES_PER_GROUP; i++) {
+			hpte_dw0.dword0 = pSeries_lpar_hpte_getword0(slot);
+			dw0 = hpte_dw0.dw0;
+
+			if ((dw0.avpn == (vpn >> 11)) && dw0.v &&
+			    (dw0.h == j)) {
+				/* HPTE matches */
+				if (j)
+					slot = -slot;
+				return slot;
+			}
+			++slot;
+		}
+		hash = ~hash;
+	}
+
+	return -1;
+} 
+
+static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
+					     unsigned long ea)
+{
+	unsigned long lpar_rc;
+	unsigned long vsid, va, vpn, flags;
+	long slot;
+
+	vsid = get_kernel_vsid(ea);
+	va = (vsid << 28) | (ea & 0x0fffffff);
+	vpn = va >> PAGE_SHIFT;
+
+	slot = pSeries_lpar_hpte_find(vpn);
+	BUG_ON(slot == -1);
+
+	flags = newpp & 7;
+	lpar_rc = plpar_pte_protect(flags, slot, 0);
+
+	BUG_ON(lpar_rc != H_Success);
+}
+
+static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
+					 int large, int local)
+{
+	unsigned long avpn = va >> 23;
+	unsigned long lpar_rc;
+	unsigned long dummy1, dummy2;
+
+	if (large)
+		avpn &= ~0x1UL;
+
+	lpar_rc = plpar_pte_remove(H_AVPN, slot, (avpn << 7), &dummy1,
+				   &dummy2);
+
+	if (lpar_rc == H_Not_Found)
+		return;
+
+	BUG_ON(lpar_rc != H_Success);
+}
+
+/*
+ * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
+ * lock.
+ */
+void pSeries_lpar_flush_hash_range(unsigned long context, unsigned long number,
+				   int local)
+{
+	int i;
+	unsigned long flags = 0;
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+	if (lock_tlbie)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+	for (i = 0; i < number; i++)
+		flush_hash_page(context, batch->addr[i], batch->pte[i], local);
+
+	if (lock_tlbie)
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+void hpte_init_lpar(void)
+{
+	ppc_md.hpte_invalidate	= pSeries_lpar_hpte_invalidate;
+	ppc_md.hpte_updatepp	= pSeries_lpar_hpte_updatepp;
+	ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
+	ppc_md.hpte_insert	= pSeries_lpar_hpte_insert;
+	ppc_md.hpte_remove	= pSeries_lpar_hpte_remove;
+	ppc_md.flush_hash_range	= pSeries_lpar_flush_hash_range;
+	ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
+
+	htab_finish_init();
+}
diff --git a/arch/ppc64/kernel/pSeries_nvram.c b/arch/ppc64/kernel/pSeries_nvram.c
new file mode 100644
index 00000000000..18abfb1f4e2
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_nvram.c
@@ -0,0 +1,148 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ */
+
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+static int nvram_fetch, nvram_store;
+static char nvram_buf[NVRW_CNT];	/* assume this is in the first 4GB */
+static DEFINE_SPINLOCK(nvram_lock);
+
+
+static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
+{
+	unsigned int i;
+	unsigned long len;
+	int done;
+	unsigned long flags;
+	char *p = buf;
+
+
+	if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	i = *index;
+	if (i + count > nvram_size)
+		count = nvram_size - i;
+
+	spin_lock_irqsave(&nvram_lock, flags);
+
+	for (; count != 0; count -= len) {
+		len = count;
+		if (len > NVRW_CNT)
+			len = NVRW_CNT;
+		
+		if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf),
+			       len) != 0) || len != done) {
+			spin_unlock_irqrestore(&nvram_lock, flags);
+			return -EIO;
+		}
+		
+		memcpy(p, nvram_buf, len);
+
+		p += len;
+		i += len;
+	}
+
+	spin_unlock_irqrestore(&nvram_lock, flags);
+	
+	*index = i;
+	return p - buf;
+}
+
+static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
+{
+	unsigned int i;
+	unsigned long len;
+	int done;
+	unsigned long flags;
+	const char *p = buf;
+
+	if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	i = *index;
+	if (i + count > nvram_size)
+		count = nvram_size - i;
+
+	spin_lock_irqsave(&nvram_lock, flags);
+
+	for (; count != 0; count -= len) {
+		len = count;
+		if (len > NVRW_CNT)
+			len = NVRW_CNT;
+
+		memcpy(nvram_buf, p, len);
+
+		if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf),
+			       len) != 0) || len != done) {
+			spin_unlock_irqrestore(&nvram_lock, flags);
+			return -EIO;
+		}
+		
+		p += len;
+		i += len;
+	}
+	spin_unlock_irqrestore(&nvram_lock, flags);
+	
+	*index = i;
+	return p - buf;
+}
+
+static ssize_t pSeries_nvram_get_size(void)
+{
+	return nvram_size ? nvram_size : -ENODEV;
+}
+
+int __init pSeries_nvram_init(void)
+{
+	struct device_node *nvram;
+	unsigned int *nbytes_p, proplen;
+
+	nvram = of_find_node_by_type(NULL, "nvram");
+	if (nvram == NULL)
+		return -ENODEV;
+
+	nbytes_p = (unsigned int *)get_property(nvram, "#bytes", &proplen);
+	if (nbytes_p == NULL || proplen != sizeof(unsigned int))
+		return -EIO;
+
+	nvram_size = *nbytes_p;
+
+	nvram_fetch = rtas_token("nvram-fetch");
+	nvram_store = rtas_token("nvram-store");
+	printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
+	of_node_put(nvram);
+
+	ppc_md.nvram_read	= pSeries_nvram_read;
+	ppc_md.nvram_write	= pSeries_nvram_write;
+	ppc_md.nvram_size	= pSeries_nvram_get_size;
+
+	return 0;
+}
diff --git a/arch/ppc64/kernel/pSeries_pci.c b/arch/ppc64/kernel/pSeries_pci.c
new file mode 100644
index 00000000000..0b1cca28140
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_pci.c
@@ -0,0 +1,602 @@
+/*
+ * pSeries_pci.c
+ *
+ * Copyright (C) 2001 Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * pSeries specific routines for PCI.
+ * 
+ * Based on code from pci.c and chrp_pci.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *    
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/rtas.h>
+
+#include "mpic.h"
+#include "pci.h"
+
+/* RTAS tokens */
+static int read_pci_config;
+static int write_pci_config;
+static int ibm_read_pci_config;
+static int ibm_write_pci_config;
+
+static int s7a_workaround;
+
+extern struct mpic *pSeries_mpic;
+
+static int config_access_valid(struct device_node *dn, int where)
+{
+	if (where < 256)
+		return 1;
+	if (where < 4096 && dn->pci_ext_config_space)
+		return 1;
+
+	return 0;
+}
+
+static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val)
+{
+	int returnval = -1;
+	unsigned long buid, addr;
+	int ret;
+
+	if (!dn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (!config_access_valid(dn, where))
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+
+	addr = ((where & 0xf00) << 20) | (dn->busno << 16) |
+		(dn->devfn << 8) | (where & 0xff);
+	buid = dn->phb->buid;
+	if (buid) {
+		ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval,
+				addr, buid >> 32, buid & 0xffffffff, size);
+	} else {
+		ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size);
+	}
+	*val = returnval;
+
+	if (ret)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (returnval == EEH_IO_ERROR_VALUE(size)
+	    && eeh_dn_check_failure (dn, NULL))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int rtas_pci_read_config(struct pci_bus *bus,
+				unsigned int devfn,
+				int where, int size, u32 *val)
+{
+	struct device_node *busdn, *dn;
+
+	if (bus->self)
+		busdn = pci_device_to_OF_node(bus->self);
+	else
+		busdn = bus->sysdata;	/* must be a phb */
+
+	/* Search only direct children of the bus */
+	for (dn = busdn->child; dn; dn = dn->sibling)
+		if (dn->devfn == devfn)
+			return rtas_read_config(dn, where, size, val);
+	return PCIBIOS_DEVICE_NOT_FOUND;
+}
+
+static int rtas_write_config(struct device_node *dn, int where, int size, u32 val)
+{
+	unsigned long buid, addr;
+	int ret;
+
+	if (!dn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (!config_access_valid(dn, where))
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+
+	addr = ((where & 0xf00) << 20) | (dn->busno << 16) |
+		(dn->devfn << 8) | (where & 0xff);
+	buid = dn->phb->buid;
+	if (buid) {
+		ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr, buid >> 32, buid & 0xffffffff, size, (ulong) val);
+	} else {
+		ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val);
+	}
+
+	if (ret)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int rtas_pci_write_config(struct pci_bus *bus,
+				 unsigned int devfn,
+				 int where, int size, u32 val)
+{
+	struct device_node *busdn, *dn;
+
+	if (bus->self)
+		busdn = pci_device_to_OF_node(bus->self);
+	else
+		busdn = bus->sysdata;	/* must be a phb */
+
+	/* Search only direct children of the bus */
+	for (dn = busdn->child; dn; dn = dn->sibling)
+		if (dn->devfn == devfn)
+			return rtas_write_config(dn, where, size, val);
+	return PCIBIOS_DEVICE_NOT_FOUND;
+}
+
+struct pci_ops rtas_pci_ops = {
+	rtas_pci_read_config,
+	rtas_pci_write_config
+};
+
+int is_python(struct device_node *dev)
+{
+	char *model = (char *)get_property(dev, "model", NULL);
+
+	if (model && strstr(model, "Python"))
+		return 1;
+
+	return 0;
+}
+
+static int get_phb_reg_prop(struct device_node *dev,
+			    unsigned int addr_size_words,
+			    struct reg_property64 *reg)
+{
+	unsigned int *ui_ptr = NULL, len;
+
+	/* Found a PHB, now figure out where his registers are mapped. */
+	ui_ptr = (unsigned int *)get_property(dev, "reg", &len);
+	if (ui_ptr == NULL)
+		return 1;
+
+	if (addr_size_words == 1) {
+		reg->address = ((struct reg_property32 *)ui_ptr)->address;
+		reg->size    = ((struct reg_property32 *)ui_ptr)->size;
+	} else {
+		*reg = *((struct reg_property64 *)ui_ptr);
+	}
+
+	return 0;
+}
+
+static void python_countermeasures(struct device_node *dev,
+				   unsigned int addr_size_words)
+{
+	struct reg_property64 reg_struct;
+	void __iomem *chip_regs;
+	volatile u32 val;
+
+	if (get_phb_reg_prop(dev, addr_size_words, &reg_struct))
+		return;
+
+	/* Python's register file is 1 MB in size. */
+	chip_regs = ioremap(reg_struct.address & ~(0xfffffUL), 0x100000);
+
+	/* 
+	 * Firmware doesn't always clear this bit which is critical
+	 * for good performance - Anton
+	 */
+
+#define PRG_CL_RESET_VALID 0x00010000
+
+	val = in_be32(chip_regs + 0xf6030);
+	if (val & PRG_CL_RESET_VALID) {
+		printk(KERN_INFO "Python workaround: ");
+		val &= ~PRG_CL_RESET_VALID;
+		out_be32(chip_regs + 0xf6030, val);
+		/*
+		 * We must read it back for changes to
+		 * take effect
+		 */
+		val = in_be32(chip_regs + 0xf6030);
+		printk("reg0: %x\n", val);
+	}
+
+	iounmap(chip_regs);
+}
+
+void __init init_pci_config_tokens (void)
+{
+	read_pci_config = rtas_token("read-pci-config");
+	write_pci_config = rtas_token("write-pci-config");
+	ibm_read_pci_config = rtas_token("ibm,read-pci-config");
+	ibm_write_pci_config = rtas_token("ibm,write-pci-config");
+}
+
+unsigned long __devinit get_phb_buid (struct device_node *phb)
+{
+	int addr_cells;
+	unsigned int *buid_vals;
+	unsigned int len;
+	unsigned long buid;
+
+	if (ibm_read_pci_config == -1) return 0;
+
+	/* PHB's will always be children of the root node,
+	 * or so it is promised by the current firmware. */
+	if (phb->parent == NULL)
+		return 0;
+	if (phb->parent->parent)
+		return 0;
+
+	buid_vals = (unsigned int *) get_property(phb, "reg", &len);
+	if (buid_vals == NULL)
+		return 0;
+
+	addr_cells = prom_n_addr_cells(phb);
+	if (addr_cells == 1) {
+		buid = (unsigned long) buid_vals[0];
+	} else {
+		buid = (((unsigned long)buid_vals[0]) << 32UL) |
+			(((unsigned long)buid_vals[1]) & 0xffffffff);
+	}
+	return buid;
+}
+
+static int phb_set_bus_ranges(struct device_node *dev,
+			      struct pci_controller *phb)
+{
+	int *bus_range;
+	unsigned int len;
+
+	bus_range = (int *) get_property(dev, "bus-range", &len);
+	if (bus_range == NULL || len < 2 * sizeof(int)) {
+		return 1;
+ 	}
+ 
+	phb->first_busno =  bus_range[0];
+	phb->last_busno  =  bus_range[1];
+
+	return 0;
+}
+
+static int __devinit setup_phb(struct device_node *dev,
+			       struct pci_controller *phb,
+			       unsigned int addr_size_words)
+{
+	pci_setup_pci_controller(phb);
+
+	if (is_python(dev))
+		python_countermeasures(dev, addr_size_words);
+
+	if (phb_set_bus_ranges(dev, phb))
+		return 1;
+
+	phb->arch_data = dev;
+	phb->ops = &rtas_pci_ops;
+	phb->buid = get_phb_buid(dev);
+
+	return 0;
+}
+
+static void __devinit add_linux_pci_domain(struct device_node *dev,
+					   struct pci_controller *phb,
+					   struct property *of_prop)
+{
+	memset(of_prop, 0, sizeof(struct property));
+	of_prop->name = "linux,pci-domain";
+	of_prop->length = sizeof(phb->global_number);
+	of_prop->value = (unsigned char *)&of_prop[1];
+	memcpy(of_prop->value, &phb->global_number, sizeof(phb->global_number));
+	prom_add_property(dev, of_prop);
+}
+
+static struct pci_controller * __init alloc_phb(struct device_node *dev,
+						unsigned int addr_size_words)
+{
+	struct pci_controller *phb;
+	struct property *of_prop;
+
+	phb = alloc_bootmem(sizeof(struct pci_controller));
+	if (phb == NULL)
+		return NULL;
+
+	of_prop = alloc_bootmem(sizeof(struct property) +
+				sizeof(phb->global_number));
+	if (!of_prop)
+		return NULL;
+
+	if (setup_phb(dev, phb, addr_size_words))
+		return NULL;
+
+	add_linux_pci_domain(dev, phb, of_prop);
+
+	return phb;
+}
+
+static struct pci_controller * __devinit alloc_phb_dynamic(struct device_node *dev, unsigned int addr_size_words)
+{
+	struct pci_controller *phb;
+
+	phb = (struct pci_controller *)kmalloc(sizeof(struct pci_controller),
+					       GFP_KERNEL);
+	if (phb == NULL)
+		return NULL;
+
+	if (setup_phb(dev, phb, addr_size_words))
+		return NULL;
+
+	phb->is_dynamic = 1;
+
+	/* TODO: linux,pci-domain? */
+
+ 	return phb;
+}
+
+unsigned long __init find_and_init_phbs(void)
+{
+	struct device_node *node;
+	struct pci_controller *phb;
+	unsigned int root_size_cells = 0;
+	unsigned int index;
+	unsigned int *opprop = NULL;
+	struct device_node *root = of_find_node_by_path("/");
+
+	if (ppc64_interrupt_controller == IC_OPEN_PIC) {
+		opprop = (unsigned int *)get_property(root,
+				"platform-open-pic", NULL);
+	}
+
+	root_size_cells = prom_n_size_cells(root);
+
+	index = 0;
+
+	for (node = of_get_next_child(root, NULL);
+	     node != NULL;
+	     node = of_get_next_child(root, node)) {
+		if (node->type == NULL || strcmp(node->type, "pci") != 0)
+			continue;
+
+		phb = alloc_phb(node, root_size_cells);
+		if (!phb)
+			continue;
+
+		pci_process_bridge_OF_ranges(phb, node);
+		pci_setup_phb_io(phb, index == 0);
+
+		if (ppc64_interrupt_controller == IC_OPEN_PIC && pSeries_mpic) {
+			int addr = root_size_cells * (index + 2) - 1;
+			mpic_assign_isu(pSeries_mpic, index, opprop[addr]);
+		}
+
+		index++;
+	}
+
+	of_node_put(root);
+	pci_devs_phb_init();
+
+	/*
+	 * pci_probe_only and pci_assign_all_buses can be set via properties
+	 * in chosen.
+	 */
+	if (of_chosen) {
+		int *prop;
+
+		prop = (int *)get_property(of_chosen, "linux,pci-probe-only",
+					   NULL);
+		if (prop)
+			pci_probe_only = *prop;
+
+		prop = (int *)get_property(of_chosen,
+					   "linux,pci-assign-all-buses", NULL);
+		if (prop)
+			pci_assign_all_buses = *prop;
+	}
+
+	return 0;
+}
+
+struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
+{
+	struct device_node *root = of_find_node_by_path("/");
+	unsigned int root_size_cells = 0;
+	struct pci_controller *phb;
+	struct pci_bus *bus;
+	int primary;
+
+	root_size_cells = prom_n_size_cells(root);
+
+	primary = list_empty(&hose_list);
+	phb = alloc_phb_dynamic(dn, root_size_cells);
+	if (!phb)
+		return NULL;
+
+	pci_process_bridge_OF_ranges(phb, dn);
+
+	pci_setup_phb_io_dynamic(phb, primary);
+	of_node_put(root);
+
+	pci_devs_phb_init_dynamic(phb);
+	phb->last_busno = 0xff;
+	bus = pci_scan_bus(phb->first_busno, phb->ops, phb->arch_data);
+	phb->bus = bus;
+	phb->last_busno = bus->subordinate;
+
+	return phb;
+}
+EXPORT_SYMBOL(init_phb_dynamic);
+
+#if 0
+void pcibios_name_device(struct pci_dev *dev)
+{
+	struct device_node *dn;
+
+	/*
+	 * Add IBM loc code (slot) as a prefix to the device names for service
+	 */
+	dn = pci_device_to_OF_node(dev);
+	if (dn) {
+		char *loc_code = get_property(dn, "ibm,loc-code", 0);
+		if (loc_code) {
+			int loc_len = strlen(loc_code);
+			if (loc_len < sizeof(dev->dev.name)) {
+				memmove(dev->dev.name+loc_len+1, dev->dev.name,
+					sizeof(dev->dev.name)-loc_len-1);
+				memcpy(dev->dev.name, loc_code, loc_len);
+				dev->dev.name[loc_len] = ' ';
+				dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
+			}
+		}
+	}
+}   
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
+#endif
+
+static void check_s7a(void)
+{
+	struct device_node *root;
+	char *model;
+
+	root = of_find_node_by_path("/");
+	if (root) {
+		model = get_property(root, "model", NULL);
+		if (model && !strcmp(model, "IBM,7013-S7A"))
+			s7a_workaround = 1;
+		of_node_put(root);
+	}
+}
+
+/* RPA-specific bits for removing PHBs */
+int pcibios_remove_root_bus(struct pci_controller *phb)
+{
+	struct pci_bus *b = phb->bus;
+	struct resource *res;
+	int rc, i;
+
+	res = b->resource[0];
+	if (!res->flags) {
+		printk(KERN_ERR "%s: no IO resource for PHB %s\n", __FUNCTION__,
+				b->name);
+		return 1;
+	}
+
+	rc = unmap_bus_range(b);
+	if (rc) {
+		printk(KERN_ERR "%s: failed to unmap IO on bus %s\n",
+			__FUNCTION__, b->name);
+		return 1;
+	}
+
+	if (release_resource(res)) {
+		printk(KERN_ERR "%s: failed to release IO on bus %s\n",
+				__FUNCTION__, b->name);
+		return 1;
+	}
+
+	for (i = 1; i < 3; ++i) {
+		res = b->resource[i];
+		if (!res->flags && i == 0) {
+			printk(KERN_ERR "%s: no MEM resource for PHB %s\n",
+				__FUNCTION__, b->name);
+			return 1;
+		}
+		if (res->flags && release_resource(res)) {
+			printk(KERN_ERR
+			       "%s: failed to release IO %d on bus %s\n",
+				__FUNCTION__, i, b->name);
+			return 1;
+		}
+	}
+
+	list_del(&phb->list_node);
+	if (phb->is_dynamic)
+		kfree(phb);
+
+	return 0;
+}
+EXPORT_SYMBOL(pcibios_remove_root_bus);
+
+static void __init pSeries_request_regions(void)
+{
+	if (!isa_io_base)
+		return;
+
+	request_region(0x20,0x20,"pic1");
+	request_region(0xa0,0x20,"pic2");
+	request_region(0x00,0x20,"dma1");
+	request_region(0x40,0x20,"timer");
+	request_region(0x80,0x10,"dma page reg");
+	request_region(0xc0,0x20,"dma2");
+}
+
+void __init pSeries_final_fixup(void)
+{
+	struct pci_dev *dev = NULL;
+
+	check_s7a();
+
+	for_each_pci_dev(dev) {
+		pci_read_irq_line(dev);
+		if (s7a_workaround) {
+			if (dev->irq > 16) {
+				dev->irq -= 3;
+				pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
+			}
+		}
+	}
+
+	phbs_remap_io();
+	pSeries_request_regions();
+
+	pci_addr_cache_build();
+}
+
+/*
+ * Assume the winbond 82c105 is the IDE controller on a
+ * p610.  We should probably be more careful in case
+ * someone tries to plug in a similar adapter.
+ */
+static void fixup_winbond_82c105(struct pci_dev* dev)
+{
+	int i;
+	unsigned int reg;
+
+	if (!(systemcfg->platform & PLATFORM_PSERIES))
+		return;
+
+	printk("Using INTC for W82c105 IDE controller.\n");
+	pci_read_config_dword(dev, 0x40, &reg);
+	/* Enable LEGIRQ to use INTC instead of ISA interrupts */
+	pci_write_config_dword(dev, 0x40, reg | (1<<11));
+
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
+		/* zap the 2nd function of the winbond chip */
+		if (dev->resource[i].flags & IORESOURCE_IO
+		    && dev->bus->number == 0 && dev->devfn == 0x81)
+			dev->resource[i].flags &= ~IORESOURCE_IO;
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
+			 fixup_winbond_82c105);
diff --git a/arch/ppc64/kernel/pSeries_reconfig.c b/arch/ppc64/kernel/pSeries_reconfig.c
new file mode 100644
index 00000000000..cb5443f2e49
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_reconfig.c
@@ -0,0 +1,434 @@
+/*
+ * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms).
+ *
+ * Copyright (C) 2005 Nathan Lynch
+ * Copyright (C) 2005 IBM Corporation
+ *
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License version
+ *	2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+
+#include <asm/prom.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/uaccess.h>
+
+
+
+/*
+ * Routines for "runtime" addition and removal of device tree nodes.
+ */
+#ifdef CONFIG_PROC_DEVICETREE
+/*
+ * Add a node to /proc/device-tree.
+ */
+static void add_node_proc_entries(struct device_node *np)
+{
+	struct proc_dir_entry *ent;
+
+	ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde);
+	if (ent)
+		proc_device_tree_add_node(np, ent);
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+	struct property *pp = np->properties;
+	struct device_node *parent = np->parent;
+
+	while (pp) {
+		remove_proc_entry(pp->name, np->pde);
+		pp = pp->next;
+	}
+
+	/* Assuming that symlinks have the same parent directory as
+	 * np->pde.
+	 */
+	if (np->name_link)
+		remove_proc_entry(np->name_link->name, parent->pde);
+	if (np->addr_link)
+		remove_proc_entry(np->addr_link->name, parent->pde);
+	if (np->pde)
+		remove_proc_entry(np->pde->name, parent->pde);
+}
+#else /* !CONFIG_PROC_DEVICETREE */
+static void add_node_proc_entries(struct device_node *np)
+{
+	return;
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+	return;
+}
+#endif /* CONFIG_PROC_DEVICETREE */
+
+/**
+ *	derive_parent - basically like dirname(1)
+ *	@path:  the full_name of a node to be added to the tree
+ *
+ *	Returns the node which should be the parent of the node
+ *	described by path.  E.g., for path = "/foo/bar", returns
+ *	the node with full_name = "/foo".
+ */
+static struct device_node *derive_parent(const char *path)
+{
+	struct device_node *parent = NULL;
+	char *parent_path = "/";
+	size_t parent_path_len = strrchr(path, '/') - path + 1;
+
+	/* reject if path is "/" */
+	if (!strcmp(path, "/"))
+		return ERR_PTR(-EINVAL);
+
+	if (strrchr(path, '/') != path) {
+		parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+		if (!parent_path)
+			return ERR_PTR(-ENOMEM);
+		strlcpy(parent_path, path, parent_path_len);
+	}
+	parent = of_find_node_by_path(parent_path);
+	if (!parent)
+		return ERR_PTR(-EINVAL);
+	if (strcmp(parent_path, "/"))
+		kfree(parent_path);
+	return parent;
+}
+
+static struct notifier_block *pSeries_reconfig_chain;
+
+int pSeries_reconfig_notifier_register(struct notifier_block *nb)
+{
+	return notifier_chain_register(&pSeries_reconfig_chain, nb);
+}
+
+void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
+{
+	notifier_chain_unregister(&pSeries_reconfig_chain, nb);
+}
+
+static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
+{
+	struct device_node *np;
+	int err = -ENOMEM;
+
+	np = kcalloc(1, sizeof(*np), GFP_KERNEL);
+	if (!np)
+		goto out_err;
+
+	np->full_name = kmalloc(strlen(path) + 1, GFP_KERNEL);
+	if (!np->full_name)
+		goto out_err;
+
+	strcpy(np->full_name, path);
+
+	np->properties = proplist;
+	OF_MARK_DYNAMIC(np);
+	kref_init(&np->kref);
+
+	np->parent = derive_parent(path);
+	if (IS_ERR(np->parent)) {
+		err = PTR_ERR(np->parent);
+		goto out_err;
+	}
+
+	err = notifier_call_chain(&pSeries_reconfig_chain,
+				  PSERIES_RECONFIG_ADD, np);
+	if (err == NOTIFY_BAD) {
+		printk(KERN_ERR "Failed to add device node %s\n", path);
+		err = -ENOMEM; /* For now, safe to assume kmalloc failure */
+		goto out_err;
+	}
+
+	of_attach_node(np);
+
+	add_node_proc_entries(np);
+
+	of_node_put(np->parent);
+
+	return 0;
+
+out_err:
+	if (np) {
+		of_node_put(np->parent);
+		kfree(np->full_name);
+		kfree(np);
+	}
+	return err;
+}
+
+static int pSeries_reconfig_remove_node(struct device_node *np)
+{
+	struct device_node *parent, *child;
+
+	parent = of_get_parent(np);
+	if (!parent)
+		return -EINVAL;
+
+	if ((child = of_get_next_child(np, NULL))) {
+		of_node_put(child);
+		return -EBUSY;
+	}
+
+	remove_node_proc_entries(np);
+
+	notifier_call_chain(&pSeries_reconfig_chain,
+			    PSERIES_RECONFIG_REMOVE, np);
+	of_detach_node(np);
+
+	of_node_put(parent);
+	of_node_put(np); /* Must decrement the refcount */
+	return 0;
+}
+
+/*
+ * /proc/ppc64/ofdt - yucky binary interface for adding and removing
+ * OF device nodes.  Should be deprecated as soon as we get an
+ * in-kernel wrapper for the RTAS ibm,configure-connector call.
+ */
+
+static void release_prop_list(const struct property *prop)
+{
+	struct property *next;
+	for (; prop; prop = next) {
+		next = prop->next;
+		kfree(prop->name);
+		kfree(prop->value);
+		kfree(prop);
+	}
+
+}
+
+/**
+ * parse_next_property - process the next property from raw input buffer
+ * @buf: input buffer, must be nul-terminated
+ * @end: end of the input buffer + 1, for validation
+ * @name: return value; set to property name in buf
+ * @length: return value; set to length of value
+ * @value: return value; set to the property value in buf
+ *
+ * Note that the caller must make copies of the name and value returned,
+ * this function does no allocation or copying of the data.  Return value
+ * is set to the next name in buf, or NULL on error.
+ */
+static char * parse_next_property(char *buf, char *end, char **name, int *length,
+				  unsigned char **value)
+{
+	char *tmp;
+
+	*name = buf;
+
+	tmp = strchr(buf, ' ');
+	if (!tmp) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	*tmp = '\0';
+
+	if (++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the length */
+	*length = -1;
+	*length = simple_strtoul(tmp, &tmp, 10);
+	if (*length == -1) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	if (*tmp != ' ' || ++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the value */
+	*value = tmp;
+	tmp += *length;
+	if (tmp > end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	else if (tmp < end && *tmp != ' ' && *tmp != '\0') {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	tmp++;
+
+	/* and now we should be on the next name, or the end */
+	return tmp;
+}
+
+static struct property *new_property(const char *name, const int length,
+				     const unsigned char *value, struct property *last)
+{
+	struct property *new = kmalloc(sizeof(*new), GFP_KERNEL);
+
+	if (!new)
+		return NULL;
+	memset(new, 0, sizeof(*new));
+
+	if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL)))
+		goto cleanup;
+	if (!(new->value = kmalloc(length + 1, GFP_KERNEL)))
+		goto cleanup;
+
+	strcpy(new->name, name);
+	memcpy(new->value, value, length);
+	*(((char *)new->value) + length) = 0;
+	new->length = length;
+	new->next = last;
+	return new;
+
+cleanup:
+	if (new->name)
+		kfree(new->name);
+	if (new->value)
+		kfree(new->value);
+	kfree(new);
+	return NULL;
+}
+
+static int do_add_node(char *buf, size_t bufsize)
+{
+	char *path, *end, *name;
+	struct device_node *np;
+	struct property *prop = NULL;
+	unsigned char* value;
+	int length, rv = 0;
+
+	end = buf + bufsize;
+	path = buf;
+	buf = strchr(buf, ' ');
+	if (!buf)
+		return -EINVAL;
+	*buf = '\0';
+	buf++;
+
+	if ((np = of_find_node_by_path(path))) {
+		of_node_put(np);
+		return -EINVAL;
+	}
+
+	/* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
+	while (buf < end &&
+	       (buf = parse_next_property(buf, end, &name, &length, &value))) {
+		struct property *last = prop;
+
+		prop = new_property(name, length, value, last);
+		if (!prop) {
+			rv = -ENOMEM;
+			prop = last;
+			goto out;
+		}
+	}
+	if (!buf) {
+		rv = -EINVAL;
+		goto out;
+	}
+
+	rv = pSeries_reconfig_add_node(path, prop);
+
+out:
+	if (rv)
+		release_prop_list(prop);
+	return rv;
+}
+
+static int do_remove_node(char *buf)
+{
+	struct device_node *node;
+	int rv = -ENODEV;
+
+	if ((node = of_find_node_by_path(buf)))
+		rv = pSeries_reconfig_remove_node(node);
+
+	of_node_put(node);
+	return rv;
+}
+
+/**
+ * ofdt_write - perform operations on the Open Firmware device tree
+ *
+ * @file: not used
+ * @buf: command and arguments
+ * @count: size of the command buffer
+ * @off: not used
+ *
+ * Operations supported at this time are addition and removal of
+ * whole nodes along with their properties.  Operations on individual
+ * properties are not implemented (yet).
+ */
+static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
+			  loff_t *off)
+{
+	int rv = 0;
+	char *kbuf;
+	char *tmp;
+
+	if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) {
+		rv = -ENOMEM;
+		goto out;
+	}
+	if (copy_from_user(kbuf, buf, count)) {
+		rv = -EFAULT;
+		goto out;
+	}
+
+	kbuf[count] = '\0';
+
+	tmp = strchr(kbuf, ' ');
+	if (!tmp) {
+		rv = -EINVAL;
+		goto out;
+	}
+	*tmp = '\0';
+	tmp++;
+
+	if (!strcmp(kbuf, "add_node"))
+		rv = do_add_node(tmp, count - (tmp - kbuf));
+	else if (!strcmp(kbuf, "remove_node"))
+		rv = do_remove_node(tmp);
+	else
+		rv = -EINVAL;
+out:
+	kfree(kbuf);
+	return rv ? rv : count;
+}
+
+static struct file_operations ofdt_fops = {
+	.write = ofdt_write
+};
+
+/* create /proc/ppc64/ofdt write-only by root */
+static int proc_ppc64_create_ofdt(void)
+{
+	struct proc_dir_entry *ent;
+
+	if (!(systemcfg->platform & PLATFORM_PSERIES))
+		return 0;
+
+	ent = create_proc_entry("ppc64/ofdt", S_IWUSR, NULL);
+	if (ent) {
+		ent->nlink = 1;
+		ent->data = NULL;
+		ent->size = 0;
+		ent->proc_fops = &ofdt_fops;
+	}
+
+	return 0;
+}
+__initcall(proc_ppc64_create_ofdt);
diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c
new file mode 100644
index 00000000000..06536de5125
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_setup.c
@@ -0,0 +1,612 @@
+/*
+ *  linux/arch/ppc/kernel/setup.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Adapted from 'alpha' version by Gary Thomas
+ *  Modified by Cort Dougan (cort@cs.nmt.edu)
+ *  Modified by PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * bootup setup stuff..
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/tty.h>
+#include <linux/major.h>
+#include <linux/interrupt.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/version.h>
+#include <linux/adb.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+
+#include <asm/mmu.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+#include <asm/nvram.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/xics.h>
+#include <asm/cputable.h>
+
+#include "i8259.h"
+#include "mpic.h"
+#include "pci.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+extern void pSeries_final_fixup(void);
+
+extern void pSeries_get_boot_time(struct rtc_time *rtc_time);
+extern void pSeries_get_rtc_time(struct rtc_time *rtc_time);
+extern int  pSeries_set_rtc_time(struct rtc_time *rtc_time);
+extern void find_udbg_vterm(void);
+extern void system_reset_fwnmi(void);	/* from head.S */
+extern void machine_check_fwnmi(void);	/* from head.S */
+extern void generic_find_legacy_serial_ports(u64 *physport,
+		unsigned int *default_speed);
+
+int fwnmi_active;  /* TRUE if an FWNMI handler is present */
+
+extern unsigned long ppc_proc_freq;
+extern unsigned long ppc_tb_freq;
+
+extern void pSeries_system_reset_exception(struct pt_regs *regs);
+extern int pSeries_machine_check_exception(struct pt_regs *regs);
+
+static volatile void __iomem * chrp_int_ack_special;
+struct mpic *pSeries_mpic;
+
+void pSeries_get_cpuinfo(struct seq_file *m)
+{
+	struct device_node *root;
+	const char *model = "";
+
+	root = of_find_node_by_path("/");
+	if (root)
+		model = get_property(root, "model", NULL);
+	seq_printf(m, "machine\t\t: CHRP %s\n", model);
+	of_node_put(root);
+}
+
+/* Initialize firmware assisted non-maskable interrupts if
+ * the firmware supports this feature.
+ *
+ */
+static void __init fwnmi_init(void)
+{
+	int ret;
+	int ibm_nmi_register = rtas_token("ibm,nmi-register");
+	if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
+		return;
+	ret = rtas_call(ibm_nmi_register, 2, 1, NULL,
+			__pa((unsigned long)system_reset_fwnmi),
+			__pa((unsigned long)machine_check_fwnmi));
+	if (ret == 0)
+		fwnmi_active = 1;
+}
+
+static int pSeries_irq_cascade(struct pt_regs *regs, void *data)
+{
+	if (chrp_int_ack_special)
+		return readb(chrp_int_ack_special);
+	else
+		return i8259_irq(smp_processor_id());
+}
+
+static void __init pSeries_init_mpic(void)
+{
+        unsigned int *addrp;
+	struct device_node *np;
+        int i;
+
+	/* All ISUs are setup, complete initialization */
+	mpic_init(pSeries_mpic);
+
+	/* Check what kind of cascade ACK we have */
+        if (!(np = of_find_node_by_name(NULL, "pci"))
+            || !(addrp = (unsigned int *)
+                 get_property(np, "8259-interrupt-acknowledge", NULL)))
+                printk(KERN_ERR "Cannot find pci to get ack address\n");
+        else
+		chrp_int_ack_special = ioremap(addrp[prom_n_addr_cells(np)-1], 1);
+	of_node_put(np);
+
+	/* Setup the legacy interrupts & controller */
+        for (i = 0; i < NUM_ISA_INTERRUPTS; i++)
+                irq_desc[i].handler = &i8259_pic;
+	i8259_init(0);
+
+	/* Hook cascade to mpic */
+	mpic_setup_cascade(NUM_ISA_INTERRUPTS, pSeries_irq_cascade, NULL);
+}
+
+static void __init pSeries_setup_mpic(void)
+{
+	unsigned int *opprop;
+	unsigned long openpic_addr = 0;
+        unsigned char senses[NR_IRQS - NUM_ISA_INTERRUPTS];
+        struct device_node *root;
+	int irq_count;
+
+	/* Find the Open PIC if present */
+	root = of_find_node_by_path("/");
+	opprop = (unsigned int *) get_property(root, "platform-open-pic", NULL);
+	if (opprop != 0) {
+		int n = prom_n_addr_cells(root);
+
+		for (openpic_addr = 0; n > 0; --n)
+			openpic_addr = (openpic_addr << 32) + *opprop++;
+		printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
+	}
+	of_node_put(root);
+
+	BUG_ON(openpic_addr == 0);
+
+	/* Get the sense values from OF */
+	prom_get_irq_senses(senses, NUM_ISA_INTERRUPTS, NR_IRQS);
+	
+	/* Setup the openpic driver */
+	irq_count = NR_IRQS - NUM_ISA_INTERRUPTS - 4; /* leave room for IPIs */
+	pSeries_mpic = mpic_alloc(openpic_addr, MPIC_PRIMARY,
+				  16, 16, irq_count, /* isu size, irq offset, irq count */ 
+				  NR_IRQS - 4, /* ipi offset */
+				  senses, irq_count, /* sense & sense size */
+				  " MPIC     ");
+}
+
+static void __init pSeries_setup_arch(void)
+{
+	/* Fixup ppc_md depending on the type of interrupt controller */
+	if (ppc64_interrupt_controller == IC_OPEN_PIC) {
+		ppc_md.init_IRQ       = pSeries_init_mpic; 
+		ppc_md.get_irq        = mpic_get_irq;
+		/* Allocate the mpic now, so that find_and_init_phbs() can
+		 * fill the ISUs */
+		pSeries_setup_mpic();
+	} else {
+		ppc_md.init_IRQ       = xics_init_IRQ;
+		ppc_md.get_irq        = xics_get_irq;
+	}
+
+#ifdef CONFIG_SMP
+	smp_init_pSeries();
+#endif
+	/* openpic global configuration register (64-bit format). */
+	/* openpic Interrupt Source Unit pointer (64-bit format). */
+	/* python0 facility area (mmio) (64-bit format) REAL address. */
+
+	/* init to some ~sane value until calibrate_delay() runs */
+	loops_per_jiffy = 50000000;
+
+	if (ROOT_DEV == 0) {
+		printk("No ramdisk, default root is /dev/sda2\n");
+		ROOT_DEV = Root_SDA2;
+	}
+
+	fwnmi_init();
+
+	/* Find and initialize PCI host bridges */
+	init_pci_config_tokens();
+	eeh_init();
+	find_and_init_phbs();
+
+#ifdef CONFIG_DUMMY_CONSOLE
+	conswitchp = &dummy_con;
+#endif
+
+	pSeries_nvram_init();
+
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR)
+		vpa_init(boot_cpuid);
+}
+
+static int __init pSeries_init_panel(void)
+{
+	/* Manually leave the kernel version on the panel. */
+	ppc_md.progress("Linux ppc64\n", 0);
+	ppc_md.progress(UTS_RELEASE, 0);
+
+	return 0;
+}
+arch_initcall(pSeries_init_panel);
+
+
+/* Build up the firmware_features bitmask field
+ * using contents of device-tree/ibm,hypertas-functions.
+ * Ultimately this functionality may be moved into prom.c prom_init().
+ */
+void __init fw_feature_init(void)
+{
+	struct device_node * dn;
+	char * hypertas;
+	unsigned int len;
+
+	DBG(" -> fw_feature_init()\n");
+
+	cur_cpu_spec->firmware_features = 0;
+	dn = of_find_node_by_path("/rtas");
+	if (dn == NULL) {
+		printk(KERN_ERR "WARNING ! Cannot find RTAS in device-tree !\n");
+		goto no_rtas;
+	}
+
+	hypertas = get_property(dn, "ibm,hypertas-functions", &len);
+	if (hypertas) {
+		while (len > 0){
+			int i, hypertas_len;
+			/* check value against table of strings */
+			for(i=0; i < FIRMWARE_MAX_FEATURES ;i++) {
+				if ((firmware_features_table[i].name) &&
+				    (strcmp(firmware_features_table[i].name,hypertas))==0) {
+					/* we have a match */
+					cur_cpu_spec->firmware_features |= 
+						(firmware_features_table[i].val);
+					break;
+				} 
+			}
+			hypertas_len = strlen(hypertas);
+			len -= hypertas_len +1;
+			hypertas+= hypertas_len +1;
+		}
+	}
+
+	of_node_put(dn);
+ no_rtas:
+	printk(KERN_INFO "firmware_features = 0x%lx\n", 
+	       cur_cpu_spec->firmware_features);
+
+	DBG(" <- fw_feature_init()\n");
+}
+
+
+static  void __init pSeries_discover_pic(void)
+{
+	struct device_node *np;
+	char *typep;
+
+	/*
+	 * Setup interrupt mapping options that are needed for finish_device_tree
+	 * to properly parse the OF interrupt tree & do the virtual irq mapping
+	 */
+	__irq_offset_value = NUM_ISA_INTERRUPTS;
+	ppc64_interrupt_controller = IC_INVALID;
+	for (np = NULL; (np = of_find_node_by_name(np, "interrupt-controller"));) {
+		typep = (char *)get_property(np, "compatible", NULL);
+		if (strstr(typep, "open-pic"))
+			ppc64_interrupt_controller = IC_OPEN_PIC;
+		else if (strstr(typep, "ppc-xicp"))
+			ppc64_interrupt_controller = IC_PPC_XIC;
+		else
+			printk("pSeries_discover_pic: failed to recognize"
+			       " interrupt-controller\n");
+		break;
+	}
+}
+
+static void pSeries_mach_cpu_die(void)
+{
+	local_irq_disable();
+	idle_task_exit();
+	/* Some hardware requires clearing the CPPR, while other hardware does not
+	 * it is safe either way
+	 */
+	pSeriesLP_cppr_info(0, 0);
+	rtas_stop_self();
+	/* Should never get here... */
+	BUG();
+	for(;;);
+}
+
+
+/*
+ * Early initialization.  Relocation is on but do not reference unbolted pages
+ */
+static void __init pSeries_init_early(void)
+{
+	void *comport;
+	int iommu_off = 0;
+	unsigned int default_speed;
+	u64 physport;
+
+	DBG(" -> pSeries_init_early()\n");
+
+	fw_feature_init();
+	
+	if (systemcfg->platform & PLATFORM_LPAR)
+		hpte_init_lpar();
+	else {
+		hpte_init_native();
+		iommu_off = (of_chosen &&
+			     get_property(of_chosen, "linux,iommu-off", NULL));
+	}
+
+	generic_find_legacy_serial_ports(&physport, &default_speed);
+
+	if (systemcfg->platform & PLATFORM_LPAR)
+		find_udbg_vterm();
+	else if (physport) {
+		/* Map the uart for udbg. */
+		comport = (void *)__ioremap(physport, 16, _PAGE_NO_CACHE);
+		udbg_init_uart(comport, default_speed);
+
+		ppc_md.udbg_putc = udbg_putc;
+		ppc_md.udbg_getc = udbg_getc;
+		ppc_md.udbg_getc_poll = udbg_getc_poll;
+		DBG("Hello World !\n");
+	}
+
+
+	iommu_init_early_pSeries();
+
+	pSeries_discover_pic();
+
+	DBG(" <- pSeries_init_early()\n");
+}
+
+
+static void pSeries_progress(char *s, unsigned short hex)
+{
+	struct device_node *root;
+	int width, *p;
+	char *os;
+	static int display_character, set_indicator;
+	static int max_width;
+	static DEFINE_SPINLOCK(progress_lock);
+	static int pending_newline = 0;  /* did last write end with unprinted newline? */
+
+	if (!rtas.base)
+		return;
+
+	if (max_width == 0) {
+		if ((root = find_path_device("/rtas")) &&
+		     (p = (unsigned int *)get_property(root,
+						       "ibm,display-line-length",
+						       NULL)))
+			max_width = *p;
+		else
+			max_width = 0x10;
+		display_character = rtas_token("display-character");
+		set_indicator = rtas_token("set-indicator");
+	}
+
+	if (display_character == RTAS_UNKNOWN_SERVICE) {
+		/* use hex display if available */
+		if (set_indicator != RTAS_UNKNOWN_SERVICE)
+			rtas_call(set_indicator, 3, 1, NULL, 6, 0, hex);
+		return;
+	}
+
+	spin_lock(&progress_lock);
+
+	/*
+	 * Last write ended with newline, but we didn't print it since
+	 * it would just clear the bottom line of output. Print it now
+	 * instead.
+	 *
+	 * If no newline is pending, print a CR to start output at the
+	 * beginning of the line.
+	 */
+	if (pending_newline) {
+		rtas_call(display_character, 1, 1, NULL, '\r');
+		rtas_call(display_character, 1, 1, NULL, '\n');
+		pending_newline = 0;
+	} else {
+		rtas_call(display_character, 1, 1, NULL, '\r');
+	}
+ 
+	width = max_width;
+	os = s;
+	while (*os) {
+		if (*os == '\n' || *os == '\r') {
+			/* Blank to end of line. */
+			while (width-- > 0)
+				rtas_call(display_character, 1, 1, NULL, ' ');
+ 
+			/* If newline is the last character, save it
+			 * until next call to avoid bumping up the
+			 * display output.
+			 */
+			if (*os == '\n' && !os[1]) {
+				pending_newline = 1;
+				spin_unlock(&progress_lock);
+				return;
+			}
+ 
+			/* RTAS wants CR-LF, not just LF */
+ 
+			if (*os == '\n') {
+				rtas_call(display_character, 1, 1, NULL, '\r');
+				rtas_call(display_character, 1, 1, NULL, '\n');
+			} else {
+				/* CR might be used to re-draw a line, so we'll
+				 * leave it alone and not add LF.
+				 */
+				rtas_call(display_character, 1, 1, NULL, *os);
+			}
+ 
+			width = max_width;
+		} else {
+			width--;
+			rtas_call(display_character, 1, 1, NULL, *os);
+		}
+ 
+		os++;
+ 
+		/* if we overwrite the screen length */
+		if (width <= 0)
+			while ((*os != 0) && (*os != '\n') && (*os != '\r'))
+				os++;
+	}
+ 
+	/* Blank to end of line. */
+	while (width-- > 0)
+		rtas_call(display_character, 1, 1, NULL, ' ');
+
+	spin_unlock(&progress_lock);
+}
+
+extern void setup_default_decr(void);
+
+/* Some sane defaults: 125 MHz timebase, 1GHz processor */
+#define DEFAULT_TB_FREQ		125000000UL
+#define DEFAULT_PROC_FREQ	(DEFAULT_TB_FREQ * 8)
+
+static void __init pSeries_calibrate_decr(void)
+{
+	struct device_node *cpu;
+	struct div_result divres;
+	unsigned int *fp;
+	int node_found;
+
+	/*
+	 * The cpu node should have a timebase-frequency property
+	 * to tell us the rate at which the decrementer counts.
+	 */
+	cpu = of_find_node_by_type(NULL, "cpu");
+
+	ppc_tb_freq = DEFAULT_TB_FREQ;		/* hardcoded default */
+	node_found = 0;
+	if (cpu != 0) {
+		fp = (unsigned int *)get_property(cpu, "timebase-frequency",
+						  NULL);
+		if (fp != 0) {
+			node_found = 1;
+			ppc_tb_freq = *fp;
+		}
+	}
+	if (!node_found)
+		printk(KERN_ERR "WARNING: Estimating decrementer frequency "
+				"(not found)\n");
+
+	ppc_proc_freq = DEFAULT_PROC_FREQ;
+	node_found = 0;
+	if (cpu != 0) {
+		fp = (unsigned int *)get_property(cpu, "clock-frequency",
+						  NULL);
+		if (fp != 0) {
+			node_found = 1;
+			ppc_proc_freq = *fp;
+		}
+	}
+	if (!node_found)
+		printk(KERN_ERR "WARNING: Estimating processor frequency "
+				"(not found)\n");
+
+	of_node_put(cpu);
+
+	printk(KERN_INFO "time_init: decrementer frequency = %lu.%.6lu MHz\n",
+	       ppc_tb_freq/1000000, ppc_tb_freq%1000000);
+	printk(KERN_INFO "time_init: processor frequency   = %lu.%.6lu MHz\n",
+	       ppc_proc_freq/1000000, ppc_proc_freq%1000000);
+
+	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
+	tb_ticks_per_sec = tb_ticks_per_jiffy * HZ;
+	tb_ticks_per_usec = ppc_tb_freq / 1000000;
+	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
+	div128_by_32(1024*1024, 0, tb_ticks_per_sec, &divres);
+	tb_to_xs = divres.result_low;
+
+	setup_default_decr();
+}
+
+static int pSeries_check_legacy_ioport(unsigned int baseport)
+{
+	struct device_node *np;
+
+#define I8042_DATA_REG	0x60
+#define FDC_BASE	0x3f0
+
+
+	switch(baseport) {
+	case I8042_DATA_REG:
+		np = of_find_node_by_type(NULL, "8042");
+		if (np == NULL)
+			return -ENODEV;
+		of_node_put(np);
+		break;
+	case FDC_BASE:
+		np = of_find_node_by_type(NULL, "fdc");
+		if (np == NULL)
+			return -ENODEV;
+		of_node_put(np);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+extern struct machdep_calls pSeries_md;
+
+static int __init pSeries_probe(int platform)
+{
+	if (platform != PLATFORM_PSERIES &&
+	    platform != PLATFORM_PSERIES_LPAR)
+		return 0;
+
+	/* if we have some ppc_md fixups for LPAR to do, do
+	 * it here ...
+	 */
+
+	return 1;
+}
+
+struct machdep_calls __initdata pSeries_md = {
+	.probe			= pSeries_probe,
+	.setup_arch		= pSeries_setup_arch,
+	.init_early		= pSeries_init_early,
+	.get_cpuinfo		= pSeries_get_cpuinfo,
+	.log_error		= pSeries_log_error,
+	.pcibios_fixup		= pSeries_final_fixup,
+	.restart		= rtas_restart,
+	.power_off		= rtas_power_off,
+	.halt			= rtas_halt,
+	.panic			= rtas_os_term,
+	.cpu_die		= pSeries_mach_cpu_die,
+	.get_boot_time		= pSeries_get_boot_time,
+	.get_rtc_time		= pSeries_get_rtc_time,
+	.set_rtc_time		= pSeries_set_rtc_time,
+	.calibrate_decr		= pSeries_calibrate_decr,
+	.progress		= pSeries_progress,
+	.check_legacy_ioport	= pSeries_check_legacy_ioport,
+	.system_reset_exception = pSeries_system_reset_exception,
+	.machine_check_exception = pSeries_machine_check_exception,
+};
diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c
new file mode 100644
index 00000000000..c60d8cb2b84
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_smp.c
@@ -0,0 +1,451 @@
+/*
+ * SMP support for pSeries machines.
+ *
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ * Plus various changes from other IBM teams...
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+
+#include <asm/ptrace.h>
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/time.h>
+#include <asm/machdep.h>
+#include <asm/xics.h>
+#include <asm/cputable.h>
+#include <asm/system.h>
+#include <asm/rtas.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/pSeries_reconfig.h>
+
+#include "mpic.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * The primary thread of each non-boot processor is recorded here before
+ * smp init.
+ */
+static cpumask_t of_spin_map;
+
+extern void pSeries_secondary_smp_init(unsigned long);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Get state of physical CPU.
+ * Return codes:
+ *	0	- The processor is in the RTAS stopped state
+ *	1	- stop-self is in progress
+ *	2	- The processor is not in the RTAS stopped state
+ *	-1	- Hardware Error
+ *	-2	- Hardware Busy, Try again later.
+ */
+static int query_cpu_stopped(unsigned int pcpu)
+{
+	int cpu_status;
+	int status, qcss_tok;
+
+	qcss_tok = rtas_token("query-cpu-stopped-state");
+	if (qcss_tok == RTAS_UNKNOWN_SERVICE)
+		return -1;
+	status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
+	if (status != 0) {
+		printk(KERN_ERR
+		       "RTAS query-cpu-stopped-state failed: %i\n", status);
+		return status;
+	}
+
+	return cpu_status;
+}
+
+int pSeries_cpu_disable(void)
+{
+	systemcfg->processorCount--;
+
+	/*fix boot_cpuid here*/
+	if (smp_processor_id() == boot_cpuid)
+		boot_cpuid = any_online_cpu(cpu_online_map);
+
+	/* FIXME: abstract this to not be platform specific later on */
+	xics_migrate_irqs_away();
+	return 0;
+}
+
+void pSeries_cpu_die(unsigned int cpu)
+{
+	int tries;
+	int cpu_status;
+	unsigned int pcpu = get_hard_smp_processor_id(cpu);
+
+	for (tries = 0; tries < 25; tries++) {
+		cpu_status = query_cpu_stopped(pcpu);
+		if (cpu_status == 0 || cpu_status == -1)
+			break;
+		msleep(200);
+	}
+	if (cpu_status != 0) {
+		printk("Querying DEAD? cpu %i (%i) shows %i\n",
+		       cpu, pcpu, cpu_status);
+	}
+
+	/* Isolation and deallocation are definatly done by
+	 * drslot_chrp_cpu.  If they were not they would be
+	 * done here.  Change isolate state to Isolate and
+	 * change allocation-state to Unusable.
+	 */
+	paca[cpu].cpu_start = 0;
+}
+
+/*
+ * Update cpu_present_map and paca(s) for a new cpu node.  The wrinkle
+ * here is that a cpu device node may represent up to two logical cpus
+ * in the SMT case.  We must honor the assumption in other code that
+ * the logical ids for sibling SMT threads x and y are adjacent, such
+ * that x^1 == y and y^1 == x.
+ */
+static int pSeries_add_processor(struct device_node *np)
+{
+	unsigned int cpu;
+	cpumask_t candidate_map, tmp = CPU_MASK_NONE;
+	int err = -ENOSPC, len, nthreads, i;
+	u32 *intserv;
+
+	intserv = (u32 *)get_property(np, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return 0;
+
+	nthreads = len / sizeof(u32);
+	for (i = 0; i < nthreads; i++)
+		cpu_set(i, tmp);
+
+	lock_cpu_hotplug();
+
+	BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));
+
+	/* Get a bitmap of unoccupied slots. */
+	cpus_xor(candidate_map, cpu_possible_map, cpu_present_map);
+	if (cpus_empty(candidate_map)) {
+		/* If we get here, it most likely means that NR_CPUS is
+		 * less than the partition's max processors setting.
+		 */
+		printk(KERN_ERR "Cannot add cpu %s; this system configuration"
+		       " supports %d logical cpus.\n", np->full_name,
+		       cpus_weight(cpu_possible_map));
+		goto out_unlock;
+	}
+
+	while (!cpus_empty(tmp))
+		if (cpus_subset(tmp, candidate_map))
+			/* Found a range where we can insert the new cpu(s) */
+			break;
+		else
+			cpus_shift_left(tmp, tmp, nthreads);
+
+	if (cpus_empty(tmp)) {
+		printk(KERN_ERR "Unable to find space in cpu_present_map for"
+		       " processor %s with %d thread(s)\n", np->name,
+		       nthreads);
+		goto out_unlock;
+	}
+
+	for_each_cpu_mask(cpu, tmp) {
+		BUG_ON(cpu_isset(cpu, cpu_present_map));
+		cpu_set(cpu, cpu_present_map);
+		set_hard_smp_processor_id(cpu, *intserv++);
+	}
+	err = 0;
+out_unlock:
+	unlock_cpu_hotplug();
+	return err;
+}
+
+/*
+ * Update the present map for a cpu node which is going away, and set
+ * the hard id in the paca(s) to -1 to be consistent with boot time
+ * convention for non-present cpus.
+ */
+static void pSeries_remove_processor(struct device_node *np)
+{
+	unsigned int cpu;
+	int len, nthreads, i;
+	u32 *intserv;
+
+	intserv = (u32 *)get_property(np, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		return;
+
+	nthreads = len / sizeof(u32);
+
+	lock_cpu_hotplug();
+	for (i = 0; i < nthreads; i++) {
+		for_each_present_cpu(cpu) {
+			if (get_hard_smp_processor_id(cpu) != intserv[i])
+				continue;
+			BUG_ON(cpu_online(cpu));
+			cpu_clear(cpu, cpu_present_map);
+			set_hard_smp_processor_id(cpu, -1);
+			break;
+		}
+		if (cpu == NR_CPUS)
+			printk(KERN_WARNING "Could not find cpu to remove "
+			       "with physical id 0x%x\n", intserv[i]);
+	}
+	unlock_cpu_hotplug();
+}
+
+static int pSeries_smp_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	int err = NOTIFY_OK;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		if (pSeries_add_processor(node))
+			err = NOTIFY_BAD;
+		break;
+	case PSERIES_RECONFIG_REMOVE:
+		pSeries_remove_processor(node);
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block pSeries_smp_nb = {
+	.notifier_call = pSeries_smp_notifier,
+};
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/**
+ * smp_startup_cpu() - start the given cpu
+ *
+ * At boot time, there is nothing to do for primary threads which were
+ * started from Open Firmware.  For anything else, call RTAS with the
+ * appropriate start location.
+ *
+ * Returns:
+ *	0	- failure
+ *	1	- success
+ */
+static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+{
+	int status;
+	unsigned long start_here = __pa((u32)*((unsigned long *)
+					       pSeries_secondary_smp_init));
+	unsigned int pcpu;
+
+	if (cpu_isset(lcpu, of_spin_map))
+		/* Already started by OF and sitting in spin loop */
+		return 1;
+
+	pcpu = get_hard_smp_processor_id(lcpu);
+
+	/* Fixup atomic count: it exited inside IRQ handler. */
+	paca[lcpu].__current->thread_info->preempt_count	= 0;
+
+	status = rtas_call(rtas_token("start-cpu"), 3, 1, NULL,
+			   pcpu, start_here, lcpu);
+	if (status != 0) {
+		printk(KERN_ERR "start-cpu failed: %i\n", status);
+		return 0;
+	}
+	return 1;
+}
+
+static inline void smp_xics_do_message(int cpu, int msg)
+{
+	set_bit(msg, &xics_ipi_message[cpu].value);
+	mb();
+	xics_cause_IPI(cpu);
+}
+
+static void smp_xics_message_pass(int target, int msg)
+{
+	unsigned int i;
+
+	if (target < NR_CPUS) {
+		smp_xics_do_message(target, msg);
+	} else {
+		for_each_online_cpu(i) {
+			if (target == MSG_ALL_BUT_SELF
+			    && i == smp_processor_id())
+				continue;
+			smp_xics_do_message(i, msg);
+		}
+	}
+}
+
+static int __init smp_xics_probe(void)
+{
+	xics_request_IPIs();
+
+	return cpus_weight(cpu_possible_map);
+}
+
+static void __devinit smp_xics_setup_cpu(int cpu)
+{
+	if (cpu != boot_cpuid)
+		xics_setup_cpu();
+
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR)
+		vpa_init(cpu);
+
+	cpu_clear(cpu, of_spin_map);
+
+	/*
+	 * Put the calling processor into the GIQ.  This is really only
+	 * necessary from a secondary thread as the OF start-cpu interface
+	 * performs this function for us on primary threads.
+	 */
+	rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE,
+		(1UL << interrupt_server_size) - 1 - default_distrib_server, 1);
+}
+
+static DEFINE_SPINLOCK(timebase_lock);
+static unsigned long timebase = 0;
+
+static void __devinit pSeries_give_timebase(void)
+{
+	spin_lock(&timebase_lock);
+	rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL);
+	timebase = get_tb();
+	spin_unlock(&timebase_lock);
+
+	while (timebase)
+		barrier();
+	rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL);
+}
+
+static void __devinit pSeries_take_timebase(void)
+{
+	while (!timebase)
+		barrier();
+	spin_lock(&timebase_lock);
+	set_tb(timebase >> 32, timebase & 0xffffffff);
+	timebase = 0;
+	spin_unlock(&timebase_lock);
+}
+
+static void __devinit smp_pSeries_kick_cpu(int nr)
+{
+	BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+	if (!smp_startup_cpu(nr))
+		return;
+
+	/*
+	 * The processor is currently spinning, waiting for the
+	 * cpu_start field to become non-zero After we set cpu_start,
+	 * the processor will continue on to secondary_start
+	 */
+	paca[nr].cpu_start = 1;
+}
+
+static int smp_pSeries_cpu_bootable(unsigned int nr)
+{
+	/* Special case - we inhibit secondary thread startup
+	 * during boot if the user requests it.  Odd-numbered
+	 * cpus are assumed to be secondary threads.
+	 */
+	if (system_state < SYSTEM_RUNNING &&
+	    cur_cpu_spec->cpu_features & CPU_FTR_SMT &&
+	    !smt_enabled_at_boot && nr % 2 != 0)
+		return 0;
+
+	return 1;
+}
+
+static struct smp_ops_t pSeries_mpic_smp_ops = {
+	.message_pass	= smp_mpic_message_pass,
+	.probe		= smp_mpic_probe,
+	.kick_cpu	= smp_pSeries_kick_cpu,
+	.setup_cpu	= smp_mpic_setup_cpu,
+};
+
+static struct smp_ops_t pSeries_xics_smp_ops = {
+	.message_pass	= smp_xics_message_pass,
+	.probe		= smp_xics_probe,
+	.kick_cpu	= smp_pSeries_kick_cpu,
+	.setup_cpu	= smp_xics_setup_cpu,
+	.cpu_bootable	= smp_pSeries_cpu_bootable,
+};
+
+/* This is called very early */
+void __init smp_init_pSeries(void)
+{
+	int i;
+
+	DBG(" -> smp_init_pSeries()\n");
+
+	if (ppc64_interrupt_controller == IC_OPEN_PIC)
+		smp_ops = &pSeries_mpic_smp_ops;
+	else
+		smp_ops = &pSeries_xics_smp_ops;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	smp_ops->cpu_disable = pSeries_cpu_disable;
+	smp_ops->cpu_die = pSeries_cpu_die;
+
+	/* Processors can be added/removed only on LPAR */
+	if (systemcfg->platform == PLATFORM_PSERIES_LPAR)
+		pSeries_reconfig_notifier_register(&pSeries_smp_nb);
+#endif
+
+	/* Mark threads which are still spinning in hold loops. */
+	if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
+		for_each_present_cpu(i) {
+			if (i % 2 == 0)
+				/*
+				 * Even-numbered logical cpus correspond to
+				 * primary threads.
+				 */
+				cpu_set(i, of_spin_map);
+		}
+	else
+		of_spin_map = cpu_present_map;
+
+	cpu_clear(boot_cpuid, of_spin_map);
+
+	/* Non-lpar has additional take/give timebase */
+	if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+		smp_ops->give_timebase = pSeries_give_timebase;
+		smp_ops->take_timebase = pSeries_take_timebase;
+	}
+
+	DBG(" <- smp_init_pSeries()\n");
+}
+
diff --git a/arch/ppc64/kernel/pacaData.c b/arch/ppc64/kernel/pacaData.c
new file mode 100644
index 00000000000..a3e0975c26c
--- /dev/null
+++ b/arch/ppc64/kernel/pacaData.c
@@ -0,0 +1,224 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/module.h>
+
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/page.h>
+
+#include <asm/lppaca.h>
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/paca.h>
+
+static union {
+	struct systemcfg	data;
+	u8			page[PAGE_SIZE];
+} systemcfg_store __page_aligned;
+struct systemcfg *systemcfg = &systemcfg_store.data;
+EXPORT_SYMBOL(systemcfg);
+
+
+/* This symbol is provided by the linker - let it fill in the paca
+ * field correctly */
+extern unsigned long __toc_start;
+
+/* The Paca is an array with one entry per processor.  Each contains an 
+ * lppaca, which contains the information shared between the
+ * hypervisor and Linux.  Each also contains an ItLpRegSave area which
+ * is used by the hypervisor to save registers.
+ * On systems with hardware multi-threading, there are two threads
+ * per processor.  The Paca array must contain an entry for each thread.
+ * The VPD Areas will give a max logical processors = 2 * max physical
+ * processors.  The processor VPD array needs one entry per physical
+ * processor (not thread).
+ */
+#ifdef CONFIG_PPC_ISERIES
+#define EXTRA_INITS(number, lpq)					    \
+	.lppaca_ptr = &paca[number].lppaca,				    \
+	.lpqueue_ptr = (lpq),		/* &xItLpQueue, */		    \
+	.reg_save_ptr = &paca[number].reg_save,				    \
+	.reg_save = {							    \
+		.xDesc = 0xd397d9e2,	/* "LpRS" */			    \
+		.xSize = sizeof(struct ItLpRegSave)			    \
+	},
+#else
+#define EXTRA_INITS(number, lpq)
+#endif
+
+#define PACAINITDATA(number,start,lpq,asrr,asrv)			    \
+{									    \
+	.lock_token = 0x8000,						    \
+	.paca_index = (number),		/* Paca Index */		    \
+	.default_decr = 0x00ff0000,	/* Initial Decr */		    \
+	.kernel_toc = (unsigned long)(&__toc_start) + 0x8000UL,		    \
+	.stab_real = (asrr), 		/* Real pointer to segment table */ \
+	.stab_addr = (asrv),		/* Virt pointer to segment table */ \
+	.cpu_start = (start),		/* Processor start */		    \
+	.hw_cpu_id = 0xffff,						    \
+	.lppaca = {							    \
+		.desc = 0xd397d781,	/* "LpPa" */			    \
+		.size = sizeof(struct lppaca),				    \
+		.dyn_proc_status = 2,					    \
+		.decr_val = 0x00ff0000,					    \
+		.fpregs_in_use = 1,					    \
+		.end_of_quantum = 0xfffffffffffffffful,			    \
+		.slb_count = 64,					    \
+	},								    \
+	EXTRA_INITS((number), (lpq))					    \
+}
+
+struct paca_struct paca[] = {
+#ifdef CONFIG_PPC_ISERIES
+	PACAINITDATA( 0, 1, &xItLpQueue, 0, STAB0_VIRT_ADDR),
+#else
+	PACAINITDATA( 0, 1, NULL, STAB0_PHYS_ADDR, STAB0_VIRT_ADDR),
+#endif
+#if NR_CPUS > 1
+	PACAINITDATA( 1, 0, NULL, 0, 0),
+	PACAINITDATA( 2, 0, NULL, 0, 0),
+	PACAINITDATA( 3, 0, NULL, 0, 0),
+#if NR_CPUS > 4
+	PACAINITDATA( 4, 0, NULL, 0, 0),
+	PACAINITDATA( 5, 0, NULL, 0, 0),
+	PACAINITDATA( 6, 0, NULL, 0, 0),
+	PACAINITDATA( 7, 0, NULL, 0, 0),
+#if NR_CPUS > 8
+	PACAINITDATA( 8, 0, NULL, 0, 0),
+	PACAINITDATA( 9, 0, NULL, 0, 0),
+	PACAINITDATA(10, 0, NULL, 0, 0),
+	PACAINITDATA(11, 0, NULL, 0, 0),
+	PACAINITDATA(12, 0, NULL, 0, 0),
+	PACAINITDATA(13, 0, NULL, 0, 0),
+	PACAINITDATA(14, 0, NULL, 0, 0),
+	PACAINITDATA(15, 0, NULL, 0, 0),
+	PACAINITDATA(16, 0, NULL, 0, 0),
+	PACAINITDATA(17, 0, NULL, 0, 0),
+	PACAINITDATA(18, 0, NULL, 0, 0),
+	PACAINITDATA(19, 0, NULL, 0, 0),
+	PACAINITDATA(20, 0, NULL, 0, 0),
+	PACAINITDATA(21, 0, NULL, 0, 0),
+	PACAINITDATA(22, 0, NULL, 0, 0),
+	PACAINITDATA(23, 0, NULL, 0, 0),
+	PACAINITDATA(24, 0, NULL, 0, 0),
+	PACAINITDATA(25, 0, NULL, 0, 0),
+	PACAINITDATA(26, 0, NULL, 0, 0),
+	PACAINITDATA(27, 0, NULL, 0, 0),
+	PACAINITDATA(28, 0, NULL, 0, 0),
+	PACAINITDATA(29, 0, NULL, 0, 0),
+	PACAINITDATA(30, 0, NULL, 0, 0),
+	PACAINITDATA(31, 0, NULL, 0, 0),
+#if NR_CPUS > 32
+	PACAINITDATA(32, 0, NULL, 0, 0),
+	PACAINITDATA(33, 0, NULL, 0, 0),
+	PACAINITDATA(34, 0, NULL, 0, 0),
+	PACAINITDATA(35, 0, NULL, 0, 0),
+	PACAINITDATA(36, 0, NULL, 0, 0),
+	PACAINITDATA(37, 0, NULL, 0, 0),
+	PACAINITDATA(38, 0, NULL, 0, 0),
+	PACAINITDATA(39, 0, NULL, 0, 0),
+	PACAINITDATA(40, 0, NULL, 0, 0),
+	PACAINITDATA(41, 0, NULL, 0, 0),
+	PACAINITDATA(42, 0, NULL, 0, 0),
+	PACAINITDATA(43, 0, NULL, 0, 0),
+	PACAINITDATA(44, 0, NULL, 0, 0),
+	PACAINITDATA(45, 0, NULL, 0, 0),
+	PACAINITDATA(46, 0, NULL, 0, 0),
+	PACAINITDATA(47, 0, NULL, 0, 0),
+	PACAINITDATA(48, 0, NULL, 0, 0),
+	PACAINITDATA(49, 0, NULL, 0, 0),
+	PACAINITDATA(50, 0, NULL, 0, 0),
+	PACAINITDATA(51, 0, NULL, 0, 0),
+	PACAINITDATA(52, 0, NULL, 0, 0),
+	PACAINITDATA(53, 0, NULL, 0, 0),
+	PACAINITDATA(54, 0, NULL, 0, 0),
+	PACAINITDATA(55, 0, NULL, 0, 0),
+	PACAINITDATA(56, 0, NULL, 0, 0),
+	PACAINITDATA(57, 0, NULL, 0, 0),
+	PACAINITDATA(58, 0, NULL, 0, 0),
+	PACAINITDATA(59, 0, NULL, 0, 0),
+	PACAINITDATA(60, 0, NULL, 0, 0),
+	PACAINITDATA(61, 0, NULL, 0, 0),
+	PACAINITDATA(62, 0, NULL, 0, 0),
+	PACAINITDATA(63, 0, NULL, 0, 0),
+#if NR_CPUS > 64
+	PACAINITDATA(64, 0, NULL, 0, 0),
+	PACAINITDATA(65, 0, NULL, 0, 0),
+	PACAINITDATA(66, 0, NULL, 0, 0),
+	PACAINITDATA(67, 0, NULL, 0, 0),
+	PACAINITDATA(68, 0, NULL, 0, 0),
+	PACAINITDATA(69, 0, NULL, 0, 0),
+	PACAINITDATA(70, 0, NULL, 0, 0),
+	PACAINITDATA(71, 0, NULL, 0, 0),
+	PACAINITDATA(72, 0, NULL, 0, 0),
+	PACAINITDATA(73, 0, NULL, 0, 0),
+	PACAINITDATA(74, 0, NULL, 0, 0),
+	PACAINITDATA(75, 0, NULL, 0, 0),
+	PACAINITDATA(76, 0, NULL, 0, 0),
+	PACAINITDATA(77, 0, NULL, 0, 0),
+	PACAINITDATA(78, 0, NULL, 0, 0),
+	PACAINITDATA(79, 0, NULL, 0, 0),
+	PACAINITDATA(80, 0, NULL, 0, 0),
+	PACAINITDATA(81, 0, NULL, 0, 0),
+	PACAINITDATA(82, 0, NULL, 0, 0),
+	PACAINITDATA(83, 0, NULL, 0, 0),
+	PACAINITDATA(84, 0, NULL, 0, 0),
+	PACAINITDATA(85, 0, NULL, 0, 0),
+	PACAINITDATA(86, 0, NULL, 0, 0),
+	PACAINITDATA(87, 0, NULL, 0, 0),
+	PACAINITDATA(88, 0, NULL, 0, 0),
+	PACAINITDATA(89, 0, NULL, 0, 0),
+	PACAINITDATA(90, 0, NULL, 0, 0),
+	PACAINITDATA(91, 0, NULL, 0, 0),
+	PACAINITDATA(92, 0, NULL, 0, 0),
+	PACAINITDATA(93, 0, NULL, 0, 0),
+	PACAINITDATA(94, 0, NULL, 0, 0),
+	PACAINITDATA(95, 0, NULL, 0, 0),
+	PACAINITDATA(96, 0, NULL, 0, 0),
+	PACAINITDATA(97, 0, NULL, 0, 0),
+	PACAINITDATA(98, 0, NULL, 0, 0),
+	PACAINITDATA(99, 0, NULL, 0, 0),
+	PACAINITDATA(100, 0, NULL, 0, 0),
+	PACAINITDATA(101, 0, NULL, 0, 0),
+	PACAINITDATA(102, 0, NULL, 0, 0),
+	PACAINITDATA(103, 0, NULL, 0, 0),
+	PACAINITDATA(104, 0, NULL, 0, 0),
+	PACAINITDATA(105, 0, NULL, 0, 0),
+	PACAINITDATA(106, 0, NULL, 0, 0),
+	PACAINITDATA(107, 0, NULL, 0, 0),
+	PACAINITDATA(108, 0, NULL, 0, 0),
+	PACAINITDATA(109, 0, NULL, 0, 0),
+	PACAINITDATA(110, 0, NULL, 0, 0),
+	PACAINITDATA(111, 0, NULL, 0, 0),
+	PACAINITDATA(112, 0, NULL, 0, 0),
+	PACAINITDATA(113, 0, NULL, 0, 0),
+	PACAINITDATA(114, 0, NULL, 0, 0),
+	PACAINITDATA(115, 0, NULL, 0, 0),
+	PACAINITDATA(116, 0, NULL, 0, 0),
+	PACAINITDATA(117, 0, NULL, 0, 0),
+	PACAINITDATA(118, 0, NULL, 0, 0),
+	PACAINITDATA(119, 0, NULL, 0, 0),
+	PACAINITDATA(120, 0, NULL, 0, 0),
+	PACAINITDATA(121, 0, NULL, 0, 0),
+	PACAINITDATA(122, 0, NULL, 0, 0),
+	PACAINITDATA(123, 0, NULL, 0, 0),
+	PACAINITDATA(124, 0, NULL, 0, 0),
+	PACAINITDATA(125, 0, NULL, 0, 0),
+	PACAINITDATA(126, 0, NULL, 0, 0),
+	PACAINITDATA(127, 0, NULL, 0, 0),
+#endif
+#endif
+#endif
+#endif
+#endif
+};
+EXPORT_SYMBOL(paca);
diff --git a/arch/ppc64/kernel/pci.c b/arch/ppc64/kernel/pci.c
new file mode 100644
index 00000000000..fdd8f7869a6
--- /dev/null
+++ b/arch/ppc64/kernel/pci.c
@@ -0,0 +1,942 @@
+/*
+ * Port for PPC64 David Engebretsen, IBM Corp.
+ * Contains common pci routines for ppc64 platform, pSeries and iSeries brands.
+ * 
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *   Rework, based on alpha PCI code.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/byteorder.h>
+#include <asm/irq.h>
+#include <asm/machdep.h>
+#include <asm/udbg.h>
+
+#include "pci.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+unsigned long pci_probe_only = 1;
+unsigned long pci_assign_all_buses = 0;
+
+/*
+ * legal IO pages under MAX_ISA_PORT.  This is to ensure we don't touch
+ * devices we don't have access to.
+ */
+unsigned long io_page_mask;
+
+EXPORT_SYMBOL(io_page_mask);
+
+
+unsigned int pcibios_assign_all_busses(void)
+{
+	return pci_assign_all_buses;
+}
+
+/* pci_io_base -- the base address from which io bars are offsets.
+ * This is the lowest I/O base address (so bar values are always positive),
+ * and it *must* be the start of ISA space if an ISA bus exists because
+ * ISA drivers use hard coded offsets.  If no ISA bus exists a dummy
+ * page is mapped and isa_io_limit prevents access to it.
+ */
+unsigned long isa_io_base;	/* NULL if no ISA bus */
+EXPORT_SYMBOL(isa_io_base);
+unsigned long pci_io_base;
+EXPORT_SYMBOL(pci_io_base);
+
+void iSeries_pcibios_init(void);
+
+LIST_HEAD(hose_list);
+
+struct dma_mapping_ops pci_dma_ops;
+EXPORT_SYMBOL(pci_dma_ops);
+
+int global_phb_number;		/* Global phb counter */
+
+/* Cached ISA bridge dev. */
+struct pci_dev *ppc64_isabridge_dev = NULL;
+
+static void fixup_broken_pcnet32(struct pci_dev* dev)
+{
+	if ((dev->class>>8 == PCI_CLASS_NETWORK_ETHERNET)) {
+		dev->vendor = PCI_VENDOR_ID_AMD;
+		pci_write_config_word(dev, PCI_VENDOR_ID, PCI_VENDOR_ID_AMD);
+		pci_name_device(dev);
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TRIDENT, PCI_ANY_ID, fixup_broken_pcnet32);
+
+void  pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			      struct resource *res)
+{
+	unsigned long offset = 0;
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+	if (!hose)
+		return;
+
+	if (res->flags & IORESOURCE_IO)
+	        offset = (unsigned long)hose->io_base_virt - pci_io_base;
+
+	if (res->flags & IORESOURCE_MEM)
+		offset = hose->pci_mem_offset;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+#ifdef CONFIG_HOTPLUG
+EXPORT_SYMBOL(pcibios_resource_to_bus);
+#endif
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ *
+ * Why? Because some silly external IO cards only decode
+ * the low 10 bits of the IO address. The 0x00-0xff region
+ * is reserved for motherboard devices that decode all 16
+ * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
+ * but we want to try to avoid allocating at 0x2900-0x2bff
+ * which might have be mirrored at 0x0100-0x03ff..
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    unsigned long size, unsigned long align)
+{
+	struct pci_dev *dev = data;
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	unsigned long start = res->start;
+	unsigned long alignto;
+
+	if (res->flags & IORESOURCE_IO) {
+	        unsigned long offset = (unsigned long)hose->io_base_virt -
+					pci_io_base;
+		/* Make sure we start at our min on all hoses */
+		if (start - offset < PCIBIOS_MIN_IO)
+			start = PCIBIOS_MIN_IO + offset;
+
+		/*
+		 * Put everything into 0x00-0xff region modulo 0x400
+		 */
+		if (start & 0x300)
+			start = (start + 0x3ff) & ~0x3ff;
+
+	} else if (res->flags & IORESOURCE_MEM) {
+		/* Make sure we start at our min on all hoses */
+		if (start - hose->pci_mem_offset < PCIBIOS_MIN_MEM)
+			start = PCIBIOS_MIN_MEM + hose->pci_mem_offset;
+
+		/* Align to multiple of size of minimum base.  */
+		alignto = max(0x1000UL, align);
+		start = ALIGN(start, alignto);
+	}
+
+	res->start = start;
+}
+
+static DEFINE_SPINLOCK(hose_spinlock);
+
+/*
+ * pci_controller(phb) initialized common variables.
+ */
+void __devinit pci_setup_pci_controller(struct pci_controller *hose)
+{
+	memset(hose, 0, sizeof(struct pci_controller));
+
+	spin_lock(&hose_spinlock);
+	hose->global_number = global_phb_number++;
+	list_add_tail(&hose->list_node, &hose_list);
+	spin_unlock(&hose_spinlock);
+}
+
+static void __init pcibios_claim_one_bus(struct pci_bus *b)
+{
+	struct pci_dev *dev;
+	struct pci_bus *child_bus;
+
+	list_for_each_entry(dev, &b->devices, bus_list) {
+		int i;
+
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			struct resource *r = &dev->resource[i];
+
+			if (r->parent || !r->start || !r->flags)
+				continue;
+			pci_claim_resource(dev, i);
+		}
+	}
+
+	list_for_each_entry(child_bus, &b->children, node)
+		pcibios_claim_one_bus(child_bus);
+}
+
+#ifndef CONFIG_PPC_ISERIES
+static void __init pcibios_claim_of_setup(void)
+{
+	struct pci_bus *b;
+
+	list_for_each_entry(b, &pci_root_buses, node)
+		pcibios_claim_one_bus(b);
+}
+#endif
+
+static int __init pcibios_init(void)
+{
+	struct pci_controller *hose, *tmp;
+	struct pci_bus *bus;
+
+	/* For now, override phys_mem_access_prot. If we need it,
+	 * later, we may move that initialization to each ppc_md
+	 */
+	ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot;
+
+#ifdef CONFIG_PPC_ISERIES
+	iSeries_pcibios_init(); 
+#endif
+
+	printk("PCI: Probing PCI hardware\n");
+
+	/* Scan all of the recorded PCI controllers.  */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		hose->last_busno = 0xff;
+		bus = pci_scan_bus(hose->first_busno, hose->ops,
+				   hose->arch_data);
+		hose->bus = bus;
+		hose->last_busno = bus->subordinate;
+	}
+
+#ifndef CONFIG_PPC_ISERIES
+	if (pci_probe_only)
+		pcibios_claim_of_setup();
+	else
+		/* FIXME: `else' will be removed when
+		   pci_assign_unassigned_resources() is able to work
+		   correctly with [partially] allocated PCI tree. */
+		pci_assign_unassigned_resources();
+#endif /* !CONFIG_PPC_ISERIES */
+
+	/* Call machine dependent final fixup */
+	if (ppc_md.pcibios_fixup)
+		ppc_md.pcibios_fixup();
+
+	/* Cache the location of the ISA bridge (if we have one) */
+	ppc64_isabridge_dev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
+	if (ppc64_isabridge_dev != NULL)
+		printk("ISA bridge at %s\n", pci_name(ppc64_isabridge_dev));
+
+	printk("PCI: Probing PCI hardware done\n");
+
+	return 0;
+}
+
+subsys_initcall(pcibios_init);
+
+char __init *pcibios_setup(char *str)
+{
+	return str;
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, oldcmd;
+	int i;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	oldcmd = cmd;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		struct resource *res = &dev->resource[i];
+
+		/* Only set up the requested stuff */
+		if (!(mask & (1<<i)))
+			continue;
+
+		if (res->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (res->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+
+	if (cmd != oldcmd) {
+		printk(KERN_DEBUG "PCI: Enabling device: (%s), cmd %x\n",
+		       pci_name(dev), cmd);
+                /* Enable the appropriate bits in the PCI command register.  */
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ * Return the domain number for this bus.
+ */
+int pci_domain_nr(struct pci_bus *bus)
+{
+#ifdef CONFIG_PPC_ISERIES
+	return 0;
+#else
+	struct pci_controller *hose = pci_bus_to_host(bus);
+
+	return hose->global_number;
+#endif
+}
+
+EXPORT_SYMBOL(pci_domain_nr);
+
+/* Decide whether to display the domain number in /proc */
+int pci_proc_domain(struct pci_bus *bus)
+{
+#ifdef CONFIG_PPC_ISERIES
+	return 0;
+#else
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	return hose->buid;
+#endif
+}
+
+/*
+ * Platform support for /proc/bus/pci/X/Y mmap()s,
+ * modelled on the sparc64 implementation by Dave Miller.
+ *  -- paulus.
+ */
+
+/*
+ * Adjust vm_pgoff of VMA such that it is the physical page offset
+ * corresponding to the 32-bit pci bus offset for DEV requested by the user.
+ *
+ * Basically, the user finds the base address for his device which he wishes
+ * to mmap.  They read the 32-bit value from the config space base register,
+ * add whatever PAGE_SIZE multiple offset they wish, and feed this into the
+ * offset parameter of mmap on /proc/bus/pci/XXX for that device.
+ *
+ * Returns negative error code on failure, zero on success.
+ */
+static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
+					       unsigned long *offset,
+					       enum pci_mmap_state mmap_state)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	unsigned long io_offset = 0;
+	int i, res_bit;
+
+	if (hose == 0)
+		return NULL;		/* should never happen */
+
+	/* If memory, add on the PCI bridge address offset */
+	if (mmap_state == pci_mmap_mem) {
+		*offset += hose->pci_mem_offset;
+		res_bit = IORESOURCE_MEM;
+	} else {
+		io_offset = (unsigned long)hose->io_base_virt;
+		*offset += io_offset;
+		res_bit = IORESOURCE_IO;
+	}
+
+	/*
+	 * Check that the offset requested corresponds to one of the
+	 * resources of the device.
+	 */
+	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+		struct resource *rp = &dev->resource[i];
+		int flags = rp->flags;
+
+		/* treat ROM as memory (should be already) */
+		if (i == PCI_ROM_RESOURCE)
+			flags |= IORESOURCE_MEM;
+
+		/* Active and same type? */
+		if ((flags & res_bit) == 0)
+			continue;
+
+		/* In the range of this resource? */
+		if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end)
+			continue;
+
+		/* found it! construct the final physical address */
+		if (mmap_state == pci_mmap_io)
+			*offset += hose->io_base_phys - io_offset;
+		return rp;
+	}
+
+	return NULL;
+}
+
+/*
+ * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
+ * device mapping.
+ */
+static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
+				      pgprot_t protection,
+				      enum pci_mmap_state mmap_state,
+				      int write_combine)
+{
+	unsigned long prot = pgprot_val(protection);
+
+	/* Write combine is always 0 on non-memory space mappings. On
+	 * memory space, if the user didn't pass 1, we check for a
+	 * "prefetchable" resource. This is a bit hackish, but we use
+	 * this to workaround the inability of /sysfs to provide a write
+	 * combine bit
+	 */
+	if (mmap_state != pci_mmap_mem)
+		write_combine = 0;
+	else if (write_combine == 0) {
+		if (rp->flags & IORESOURCE_PREFETCH)
+			write_combine = 1;
+	}
+
+	/* XXX would be nice to have a way to ask for write-through */
+	prot |= _PAGE_NO_CACHE;
+	if (write_combine)
+		prot &= ~_PAGE_GUARDED;
+	else
+		prot |= _PAGE_GUARDED;
+
+	printk("PCI map for %s:%lx, prot: %lx\n", pci_name(dev), rp->start,
+	       prot);
+
+	return __pgprot(prot);
+}
+
+/*
+ * This one is used by /dev/mem and fbdev who have no clue about the
+ * PCI device, it tries to find the PCI device first and calls the
+ * above routine
+ */
+pgprot_t pci_phys_mem_access_prot(struct file *file,
+				  unsigned long offset,
+				  unsigned long size,
+				  pgprot_t protection)
+{
+	struct pci_dev *pdev = NULL;
+	struct resource *found = NULL;
+	unsigned long prot = pgprot_val(protection);
+	int i;
+
+	if (page_is_ram(offset >> PAGE_SHIFT))
+		return prot;
+
+	prot |= _PAGE_NO_CACHE | _PAGE_GUARDED;
+
+	for_each_pci_dev(pdev) {
+		for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+			struct resource *rp = &pdev->resource[i];
+			int flags = rp->flags;
+
+			/* Active and same type? */
+			if ((flags & IORESOURCE_MEM) == 0)
+				continue;
+			/* In the range of this resource? */
+			if (offset < (rp->start & PAGE_MASK) ||
+			    offset > rp->end)
+				continue;
+			found = rp;
+			break;
+		}
+		if (found)
+			break;
+	}
+	if (found) {
+		if (found->flags & IORESOURCE_PREFETCH)
+			prot &= ~_PAGE_GUARDED;
+		pci_dev_put(pdev);
+	}
+
+	DBG("non-PCI map for %lx, prot: %lx\n", offset, prot);
+
+	return __pgprot(prot);
+}
+
+
+/*
+ * Perform the actual remap of the pages for a PCI device mapping, as
+ * appropriate for this architecture.  The region in the process to map
+ * is described by vm_start and vm_end members of VMA, the base physical
+ * address is found in vm_pgoff.
+ * The pci device structure is provided so that architectures may make mapping
+ * decisions on a per-device or per-bus basis.
+ *
+ * Returns a negative error code on failure, zero on success.
+ */
+int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+			enum pci_mmap_state mmap_state,
+			int write_combine)
+{
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	struct resource *rp;
+	int ret;
+
+	rp = __pci_mmap_make_offset(dev, &offset, mmap_state);
+	if (rp == NULL)
+		return -EINVAL;
+
+	vma->vm_pgoff = offset >> PAGE_SHIFT;
+	vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO;
+	vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
+						  vma->vm_page_prot,
+						  mmap_state, write_combine);
+
+	ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start, vma->vm_page_prot);
+
+	return ret;
+}
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+static ssize_t pci_show_devspec(struct device *dev, char *buf)
+{
+	struct pci_dev *pdev;
+	struct device_node *np;
+
+	pdev = to_pci_dev (dev);
+	np = pci_device_to_OF_node(pdev);
+	if (np == NULL || np->full_name == NULL)
+		return 0;
+	return sprintf(buf, "%s", np->full_name);
+}
+static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL);
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+void pcibios_add_platform_entries(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PPC_MULTIPLATFORM
+	device_create_file(&pdev->dev, &dev_attr_devspec);
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+}
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+
+#define ISA_SPACE_MASK 0x1
+#define ISA_SPACE_IO 0x1
+
+static void __devinit pci_process_ISA_OF_ranges(struct device_node *isa_node,
+				      unsigned long phb_io_base_phys,
+				      void __iomem * phb_io_base_virt)
+{
+	struct isa_range *range;
+	unsigned long pci_addr;
+	unsigned int isa_addr;
+	unsigned int size;
+	int rlen = 0;
+
+	range = (struct isa_range *) get_property(isa_node, "ranges", &rlen);
+	if (range == NULL || (rlen < sizeof(struct isa_range))) {
+		printk(KERN_ERR "no ISA ranges or unexpected isa range size,"
+		       "mapping 64k\n");
+		__ioremap_explicit(phb_io_base_phys, (unsigned long)phb_io_base_virt, 
+				   0x10000, _PAGE_NO_CACHE);
+		return;	
+	}
+	
+	/* From "ISA Binding to 1275"
+	 * The ranges property is laid out as an array of elements,
+	 * each of which comprises:
+	 *   cells 0 - 1:	an ISA address
+	 *   cells 2 - 4:	a PCI address 
+	 *			(size depending on dev->n_addr_cells)
+	 *   cell 5:		the size of the range
+	 */
+	if ((range->isa_addr.a_hi && ISA_SPACE_MASK) == ISA_SPACE_IO) {
+		isa_addr = range->isa_addr.a_lo;
+		pci_addr = (unsigned long) range->pci_addr.a_mid << 32 | 
+			range->pci_addr.a_lo;
+
+		/* Assume these are both zero */
+		if ((pci_addr != 0) || (isa_addr != 0)) {
+			printk(KERN_ERR "unexpected isa to pci mapping: %s\n",
+					__FUNCTION__);
+			return;
+		}
+		
+		size = PAGE_ALIGN(range->size);
+
+		__ioremap_explicit(phb_io_base_phys, 
+				   (unsigned long) phb_io_base_virt, 
+				   size, _PAGE_NO_CACHE);
+	}
+}
+
+void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose,
+					    struct device_node *dev)
+{
+	unsigned int *ranges;
+	unsigned long size;
+	int rlen = 0;
+	int memno = 0;
+	struct resource *res;
+	int np, na = prom_n_addr_cells(dev);
+	unsigned long pci_addr, cpu_phys_addr;
+
+	np = na + 5;
+
+	/* From "PCI Binding to 1275"
+	 * The ranges property is laid out as an array of elements,
+	 * each of which comprises:
+	 *   cells 0 - 2:	a PCI address
+	 *   cells 3 or 3+4:	a CPU physical address
+	 *			(size depending on dev->n_addr_cells)
+	 *   cells 4+5 or 5+6:	the size of the range
+	 */
+	rlen = 0;
+	hose->io_base_phys = 0;
+	ranges = (unsigned int *) get_property(dev, "ranges", &rlen);
+	while ((rlen -= np * sizeof(unsigned int)) >= 0) {
+		res = NULL;
+		pci_addr = (unsigned long)ranges[1] << 32 | ranges[2];
+
+		cpu_phys_addr = ranges[3];
+		if (na == 2)
+			cpu_phys_addr = cpu_phys_addr << 32 | ranges[4];
+
+		size = (unsigned long)ranges[na+3] << 32 | ranges[na+4];
+		if (size == 0)
+			continue;
+		switch ((ranges[0] >> 24) & 0x3) {
+		case 1:		/* I/O space */
+			hose->io_base_phys = cpu_phys_addr;
+			hose->pci_io_size = size;
+
+			res = &hose->io_resource;
+			res->flags = IORESOURCE_IO;
+			res->start = pci_addr;
+			DBG("phb%d: IO 0x%lx -> 0x%lx\n", hose->global_number,
+				    res->start, res->start + size - 1);
+			break;
+		case 2:		/* memory space */
+			memno = 0;
+			while (memno < 3 && hose->mem_resources[memno].flags)
+				++memno;
+
+			if (memno == 0)
+				hose->pci_mem_offset = cpu_phys_addr - pci_addr;
+			if (memno < 3) {
+				res = &hose->mem_resources[memno];
+				res->flags = IORESOURCE_MEM;
+				res->start = cpu_phys_addr;
+				DBG("phb%d: MEM 0x%lx -> 0x%lx\n", hose->global_number,
+					    res->start, res->start + size - 1);
+			}
+			break;
+		}
+		if (res != NULL) {
+			res->name = dev->full_name;
+			res->end = res->start + size - 1;
+			res->parent = NULL;
+			res->sibling = NULL;
+			res->child = NULL;
+		}
+		ranges += np;
+	}
+}
+
+void __init pci_setup_phb_io(struct pci_controller *hose, int primary)
+{
+	unsigned long size = hose->pci_io_size;
+	unsigned long io_virt_offset;
+	struct resource *res;
+	struct device_node *isa_dn;
+
+	hose->io_base_virt = reserve_phb_iospace(size);
+	DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n",
+		hose->global_number, hose->io_base_phys,
+		(unsigned long) hose->io_base_virt);
+
+	if (primary) {
+		pci_io_base = (unsigned long)hose->io_base_virt;
+		isa_dn = of_find_node_by_type(NULL, "isa");
+		if (isa_dn) {
+			isa_io_base = pci_io_base;
+			pci_process_ISA_OF_ranges(isa_dn, hose->io_base_phys,
+						hose->io_base_virt);
+			of_node_put(isa_dn);
+			/* Allow all IO */
+			io_page_mask = -1;
+		}
+	}
+
+	io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base;
+	res = &hose->io_resource;
+	res->start += io_virt_offset;
+	res->end += io_virt_offset;
+}
+
+void __devinit pci_setup_phb_io_dynamic(struct pci_controller *hose,
+					int primary)
+{
+	unsigned long size = hose->pci_io_size;
+	unsigned long io_virt_offset;
+	struct resource *res;
+
+	hose->io_base_virt = __ioremap(hose->io_base_phys, size,
+					_PAGE_NO_CACHE);
+	DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n",
+		hose->global_number, hose->io_base_phys,
+		(unsigned long) hose->io_base_virt);
+
+	if (primary)
+		pci_io_base = (unsigned long)hose->io_base_virt;
+
+	io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base;
+	res = &hose->io_resource;
+	res->start += io_virt_offset;
+	res->end += io_virt_offset;
+}
+
+
+static int get_bus_io_range(struct pci_bus *bus, unsigned long *start_phys,
+				unsigned long *start_virt, unsigned long *size)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pci_bus_region region;
+	struct resource *res;
+
+	if (bus->self) {
+		res = bus->resource[0];
+		pcibios_resource_to_bus(bus->self, &region, res);
+		*start_phys = hose->io_base_phys + region.start;
+		*start_virt = (unsigned long) hose->io_base_virt + 
+				region.start;
+		if (region.end > region.start) 
+			*size = region.end - region.start + 1;
+		else {
+			printk("%s(): unexpected region 0x%lx->0x%lx\n", 
+					__FUNCTION__, region.start, region.end);
+			return 1;
+		}
+		
+	} else {
+		/* Root Bus */
+		res = &hose->io_resource;
+		*start_phys = hose->io_base_phys;
+		*start_virt = (unsigned long) hose->io_base_virt;
+		if (res->end > res->start)
+			*size = res->end - res->start + 1;
+		else {
+			printk("%s(): unexpected region 0x%lx->0x%lx\n", 
+					__FUNCTION__, res->start, res->end);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int unmap_bus_range(struct pci_bus *bus)
+{
+	unsigned long start_phys;
+	unsigned long start_virt;
+	unsigned long size;
+
+	if (!bus) {
+		printk(KERN_ERR "%s() expected bus\n", __FUNCTION__);
+		return 1;
+	}
+	
+	if (get_bus_io_range(bus, &start_phys, &start_virt, &size))
+		return 1;
+	if (iounmap_explicit((void __iomem *) start_virt, size))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(unmap_bus_range);
+
+int remap_bus_range(struct pci_bus *bus)
+{
+	unsigned long start_phys;
+	unsigned long start_virt;
+	unsigned long size;
+
+	if (!bus) {
+		printk(KERN_ERR "%s() expected bus\n", __FUNCTION__);
+		return 1;
+	}
+	
+	
+	if (get_bus_io_range(bus, &start_phys, &start_virt, &size))
+		return 1;
+	printk("mapping IO %lx -> %lx, size: %lx\n", start_phys, start_virt, size);
+	if (__ioremap_explicit(start_phys, start_virt, size, _PAGE_NO_CACHE))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(remap_bus_range);
+
+void phbs_remap_io(void)
+{
+	struct pci_controller *hose, *tmp;
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+		remap_bus_range(hose->bus);
+}
+
+/*
+ * ppc64 can have multifunction devices that do not respond to function 0.
+ * In this case we must scan all functions.
+ */
+int pcibios_scan_all_fns(struct pci_bus *bus, int devfn)
+{
+       struct device_node *busdn, *dn;
+
+       if (bus->self)
+               busdn = pci_device_to_OF_node(bus->self);
+       else
+               busdn = bus->sysdata;   /* must be a phb */
+
+       if (busdn == NULL)
+	       return 0;
+
+       /*
+        * Check to see if there is any of the 8 functions are in the
+        * device tree.  If they are then we need to scan all the
+        * functions of this slot.
+        */
+       for (dn = busdn->child; dn; dn = dn->sibling)
+               if ((dn->devfn >> 3) == (devfn >> 3))
+                       return 1;
+
+       return 0;
+}
+
+
+void __devinit pcibios_fixup_device_resources(struct pci_dev *dev,
+					   struct pci_bus *bus)
+{
+	/* Update device resources.  */
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	int i;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		if (dev->resource[i].flags & IORESOURCE_IO) {
+			unsigned long offset = (unsigned long)hose->io_base_virt
+				- pci_io_base;
+                        unsigned long start, end, mask;
+
+                        start = dev->resource[i].start += offset;
+                        end = dev->resource[i].end += offset;
+
+                        /* Need to allow IO access to pages that are in the
+                           ISA range */
+                        if (start < MAX_ISA_PORT) {
+                                if (end > MAX_ISA_PORT)
+                                        end = MAX_ISA_PORT;
+
+                                start >>= PAGE_SHIFT;
+                                end >>= PAGE_SHIFT;
+
+                                /* get the range of pages for the map */
+                                mask = ((1 << (end+1))-1) ^ ((1 << start)-1);
+                                io_page_mask |= mask;
+                        }
+		}
+                else if (dev->resource[i].flags & IORESOURCE_MEM) {
+			dev->resource[i].start += hose->pci_mem_offset;
+			dev->resource[i].end += hose->pci_mem_offset;
+		}
+        }
+}
+EXPORT_SYMBOL(pcibios_fixup_device_resources);
+
+void __devinit pcibios_fixup_bus(struct pci_bus *bus)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pci_dev *dev = bus->self;
+	struct resource *res;
+	int i;
+
+	if (!dev) {
+		/* Root bus. */
+
+		hose->bus = bus;
+		bus->resource[0] = res = &hose->io_resource;
+
+		if (res->flags && request_resource(&ioport_resource, res))
+			printk(KERN_ERR "Failed to request IO on "
+					"PCI domain %d\n", pci_domain_nr(bus));
+
+		for (i = 0; i < 3; ++i) {
+			res = &hose->mem_resources[i];
+			bus->resource[i+1] = res;
+			if (res->flags && request_resource(&iomem_resource, res))
+				printk(KERN_ERR "Failed to request MEM on "
+						"PCI domain %d\n",
+						pci_domain_nr(bus));
+		}
+	} else if (pci_probe_only &&
+		   (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		/* This is a subordinate bridge */
+
+		pci_read_bridge_bases(bus);
+		pcibios_fixup_device_resources(dev, bus);
+	}
+
+	ppc_md.iommu_bus_setup(bus);
+
+	list_for_each_entry(dev, &bus->devices, bus_list)
+		ppc_md.iommu_dev_setup(dev);
+
+	if (!pci_probe_only)
+		return;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI)
+			pcibios_fixup_device_resources(dev, bus);
+	}
+}
+EXPORT_SYMBOL(pcibios_fixup_bus);
+
+/*
+ * Reads the interrupt pin to determine if interrupt is use by card.
+ * If the interrupt is used, then gets the interrupt line from the 
+ * openfirmware and sets it in the pci_dev and pci_config line.
+ */
+int pci_read_irq_line(struct pci_dev *pci_dev)
+{
+	u8 intpin;
+	struct device_node *node;
+
+    	pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &intpin);
+	if (intpin == 0)
+		return 0;
+
+	node = pci_device_to_OF_node(pci_dev);
+	if (node == NULL)
+		return -1;
+
+	if (node->n_intrs == 0)
+		return -1;
+
+	pci_dev->irq = node->intrs[0].line;
+
+	pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, pci_dev->irq);
+
+	return 0;
+}
+EXPORT_SYMBOL(pci_read_irq_line);
+
+#endif /* CONFIG_PPC_MULTIPLATFORM */
diff --git a/arch/ppc64/kernel/pci.h b/arch/ppc64/kernel/pci.h
new file mode 100644
index 00000000000..0fd7d849aa7
--- /dev/null
+++ b/arch/ppc64/kernel/pci.h
@@ -0,0 +1,51 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#ifndef __PPC_KERNEL_PCI_H__
+#define __PPC_KERNEL_PCI_H__
+
+#include <linux/pci.h>
+#include <asm/pci-bridge.h>
+
+extern unsigned long isa_io_base;
+
+extern void pci_setup_pci_controller(struct pci_controller *hose);
+extern void pci_setup_phb_io(struct pci_controller *hose, int primary);
+extern void pci_setup_phb_io_dynamic(struct pci_controller *hose, int primary);
+
+
+extern struct list_head hose_list;
+extern int global_phb_number;
+
+extern unsigned long find_and_init_phbs(void);
+
+extern struct pci_dev *ppc64_isabridge_dev;	/* may be NULL if no ISA bus */
+
+/* PCI device_node operations */
+struct device_node;
+typedef void *(*traverse_func)(struct device_node *me, void *data);
+void *traverse_pci_devices(struct device_node *start, traverse_func pre,
+		void *data);
+
+void pci_devs_phb_init(void);
+void pci_devs_phb_init_dynamic(struct pci_controller *phb);
+struct device_node *fetch_dev_dn(struct pci_dev *dev);
+
+/* PCI address cache management routines */
+void pci_addr_cache_insert_device(struct pci_dev *dev);
+void pci_addr_cache_remove_device(struct pci_dev *dev);
+
+/* From pSeries_pci.h */
+void init_pci_config_tokens (void);
+unsigned long get_phb_buid (struct device_node *);
+
+extern unsigned long pci_probe_only;
+extern unsigned long pci_assign_all_buses;
+extern int pci_read_irq_line(struct pci_dev *pci_dev);
+
+#endif /* __PPC_KERNEL_PCI_H__ */
diff --git a/arch/ppc64/kernel/pci_direct_iommu.c b/arch/ppc64/kernel/pci_direct_iommu.c
new file mode 100644
index 00000000000..b8f7f58824f
--- /dev/null
+++ b/arch/ppc64/kernel/pci_direct_iommu.c
@@ -0,0 +1,95 @@
+/*
+ * Support for DMA from PCI devices to main memory on
+ * machines without an iommu or with directly addressable
+ * RAM (typically a pmac with 2Gb of RAM or less)
+ *
+ * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+#include <asm/abs_addr.h>
+
+#include "pci.h"
+
+static void *pci_direct_alloc_coherent(struct device *hwdev, size_t size,
+				   dma_addr_t *dma_handle, unsigned int __nocast flag)
+{
+	void *ret;
+
+	ret = (void *)__get_free_pages(flag, get_order(size));
+	if (ret != NULL) {
+		memset(ret, 0, size);
+		*dma_handle = virt_to_abs(ret);
+	}
+	return ret;
+}
+
+static void pci_direct_free_coherent(struct device *hwdev, size_t size,
+				 void *vaddr, dma_addr_t dma_handle)
+{
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static dma_addr_t pci_direct_map_single(struct device *hwdev, void *ptr,
+		size_t size, enum dma_data_direction direction)
+{
+	return virt_to_abs(ptr);
+}
+
+static void pci_direct_unmap_single(struct device *hwdev, dma_addr_t dma_addr,
+		size_t size, enum dma_data_direction direction)
+{
+}
+
+static int pci_direct_map_sg(struct device *hwdev, struct scatterlist *sg,
+		int nents, enum dma_data_direction direction)
+{
+	int i;
+
+	for (i = 0; i < nents; i++, sg++) {
+		sg->dma_address = page_to_phys(sg->page) + sg->offset;
+		sg->dma_length = sg->length;
+	}
+
+	return nents;
+}
+
+static void pci_direct_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+		int nents, enum dma_data_direction direction)
+{
+}
+
+static int pci_direct_dma_supported(struct device *dev, u64 mask)
+{
+	return mask < 0x100000000ull;
+}
+
+void __init pci_direct_iommu_init(void)
+{
+	pci_dma_ops.alloc_coherent = pci_direct_alloc_coherent;
+	pci_dma_ops.free_coherent = pci_direct_free_coherent;
+	pci_dma_ops.map_single = pci_direct_map_single;
+	pci_dma_ops.unmap_single = pci_direct_unmap_single;
+	pci_dma_ops.map_sg = pci_direct_map_sg;
+	pci_dma_ops.unmap_sg = pci_direct_unmap_sg;
+	pci_dma_ops.dma_supported = pci_direct_dma_supported;
+}
diff --git a/arch/ppc64/kernel/pci_dn.c b/arch/ppc64/kernel/pci_dn.c
new file mode 100644
index 00000000000..ec345462afc
--- /dev/null
+++ b/arch/ppc64/kernel/pci_dn.c
@@ -0,0 +1,198 @@
+/*
+ * pci_dn.c
+ *
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * PCI manipulation via device_nodes.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *    
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/pSeries_reconfig.h>
+
+#include "pci.h"
+
+/*
+ * Traverse_func that inits the PCI fields of the device node.
+ * NOTE: this *must* be done before read/write config to the device.
+ */
+static void * __devinit update_dn_pci_info(struct device_node *dn, void *data)
+{
+	struct pci_controller *phb = data;
+	int *type = (int *)get_property(dn, "ibm,pci-config-space-type", NULL);
+	u32 *regs;
+
+	dn->phb = phb;
+	regs = (u32 *)get_property(dn, "reg", NULL);
+	if (regs) {
+		/* First register entry is addr (00BBSS00)  */
+		dn->busno = (regs[0] >> 16) & 0xff;
+		dn->devfn = (regs[0] >> 8) & 0xff;
+	}
+
+	dn->pci_ext_config_space = (type && *type == 1);
+	return NULL;
+}
+
+/*
+ * Traverse a device tree stopping each PCI device in the tree.
+ * This is done depth first.  As each node is processed, a "pre"
+ * function is called and the children are processed recursively.
+ *
+ * The "pre" func returns a value.  If non-zero is returned from
+ * the "pre" func, the traversal stops and this value is returned.
+ * This return value is useful when using traverse as a method of
+ * finding a device.
+ *
+ * NOTE: we do not run the func for devices that do not appear to
+ * be PCI except for the start node which we assume (this is good
+ * because the start node is often a phb which may be missing PCI
+ * properties).
+ * We use the class-code as an indicator. If we run into
+ * one of these nodes we also assume its siblings are non-pci for
+ * performance.
+ */
+void *traverse_pci_devices(struct device_node *start, traverse_func pre,
+		void *data)
+{
+	struct device_node *dn, *nextdn;
+	void *ret;
+
+	/* We started with a phb, iterate all childs */
+	for (dn = start->child; dn; dn = nextdn) {
+		u32 *classp, class;
+
+		nextdn = NULL;
+		classp = (u32 *)get_property(dn, "class-code", NULL);
+		class = classp ? *classp : 0;
+
+		if (pre && ((ret = pre(dn, data)) != NULL))
+			return ret;
+
+		/* If we are a PCI bridge, go down */
+		if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI ||
+				  (class >> 8) == PCI_CLASS_BRIDGE_CARDBUS))
+			/* Depth first...do children */
+			nextdn = dn->child;
+		else if (dn->sibling)
+			/* ok, try next sibling instead. */
+			nextdn = dn->sibling;
+		if (!nextdn) {
+			/* Walk up to next valid sibling. */
+			do {
+				dn = dn->parent;
+				if (dn == start)
+					return NULL;
+			} while (dn->sibling == NULL);
+			nextdn = dn->sibling;
+		}
+	}
+	return NULL;
+}
+
+void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb)
+{
+	struct device_node * dn = (struct device_node *) phb->arch_data;
+
+	/* PHB nodes themselves must not match */
+	dn->devfn = dn->busno = -1;
+	dn->phb = phb;
+
+	/* Update dn->phb ptrs for new phb and children devices */
+	traverse_pci_devices(dn, update_dn_pci_info, phb);
+}
+
+/*
+ * Traversal func that looks for a <busno,devfcn> value.
+ * If found, the device_node is returned (thus terminating the traversal).
+ */
+static void *is_devfn_node(struct device_node *dn, void *data)
+{
+	int busno = ((unsigned long)data >> 8) & 0xff;
+	int devfn = ((unsigned long)data) & 0xff;
+
+	return ((devfn == dn->devfn) && (busno == dn->busno)) ? dn : NULL;
+}
+
+/*
+ * This is the "slow" path for looking up a device_node from a
+ * pci_dev.  It will hunt for the device under its parent's
+ * phb and then update sysdata for a future fastpath.
+ *
+ * It may also do fixups on the actual device since this happens
+ * on the first read/write.
+ *
+ * Note that it also must deal with devices that don't exist.
+ * In this case it may probe for real hardware ("just in case")
+ * and add a device_node to the device tree if necessary.
+ *
+ */
+struct device_node *fetch_dev_dn(struct pci_dev *dev)
+{
+	struct device_node *orig_dn = dev->sysdata;
+	struct pci_controller *phb = orig_dn->phb; /* assume same phb as orig_dn */
+	struct device_node *phb_dn;
+	struct device_node *dn;
+	unsigned long searchval = (dev->bus->number << 8) | dev->devfn;
+
+	phb_dn = phb->arch_data;
+	dn = traverse_pci_devices(phb_dn, is_devfn_node, (void *)searchval);
+	if (dn)
+		dev->sysdata = dn;
+	return dn;
+}
+EXPORT_SYMBOL(fetch_dev_dn);
+
+static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	struct device_node *np = node;
+	int err = NOTIFY_OK;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		update_dn_pci_info(np, np->parent->phb);
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block pci_dn_reconfig_nb = {
+	.notifier_call = pci_dn_reconfig_notifier,
+};
+
+/*
+ * Actually initialize the phbs.
+ * The buswalk on this phb has not happened yet.
+ */
+void __init pci_devs_phb_init(void)
+{
+	struct pci_controller *phb, *tmp;
+
+	/* This must be done first so the device nodes have valid pci info! */
+	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+		pci_devs_phb_init_dynamic(phb);
+
+	pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb);
+}
diff --git a/arch/ppc64/kernel/pci_iommu.c b/arch/ppc64/kernel/pci_iommu.c
new file mode 100644
index 00000000000..ef0a62b916b
--- /dev/null
+++ b/arch/ppc64/kernel/pci_iommu.c
@@ -0,0 +1,139 @@
+/*
+ * arch/ppc64/kernel/pci_iommu.c
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ * 
+ * Rewrite, cleanup, new allocation schemes: 
+ * Copyright (C) 2004 Olof Johansson, IBM Corporation
+ *
+ * Dynamic DMA mapping support, platform-independent parts.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include "pci.h"
+
+#ifdef CONFIG_PPC_ISERIES
+#include <asm/iSeries/iSeries_pci.h>
+#endif /* CONFIG_PPC_ISERIES */
+
+/*
+ * We can use ->sysdata directly and avoid the extra work in
+ * pci_device_to_OF_node since ->sysdata will have been initialised
+ * in the iommu init code for all devices.
+ */
+#define PCI_GET_DN(dev) ((struct device_node *)((dev)->sysdata))
+
+static inline struct iommu_table *devnode_table(struct device *dev)
+{
+	struct pci_dev *pdev;
+
+	if (!dev) {
+		pdev = ppc64_isabridge_dev;
+		if (!pdev)
+			return NULL;
+	} else
+		pdev = to_pci_dev(dev);
+
+#ifdef CONFIG_PPC_ISERIES
+	return ISERIES_DEVNODE(pdev)->iommu_table;
+#endif /* CONFIG_PPC_ISERIES */
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+	return PCI_GET_DN(pdev)->iommu_table;
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+}
+
+
+/* Allocates a contiguous real buffer and creates mappings over it.
+ * Returns the virtual address of the buffer and sets dma_handle
+ * to the dma address (mapping) of the first page.
+ */
+static void *pci_iommu_alloc_coherent(struct device *hwdev, size_t size,
+			   dma_addr_t *dma_handle, unsigned int __nocast flag)
+{
+	return iommu_alloc_coherent(devnode_table(hwdev), size, dma_handle,
+			flag);
+}
+
+static void pci_iommu_free_coherent(struct device *hwdev, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	iommu_free_coherent(devnode_table(hwdev), size, vaddr, dma_handle);
+}
+
+/* Creates TCEs for a user provided buffer.  The user buffer must be 
+ * contiguous real kernel storage (not vmalloc).  The address of the buffer
+ * passed here is the kernel (virtual) address of the buffer.  The buffer
+ * need not be page aligned, the dma_addr_t returned will point to the same
+ * byte within the page as vaddr.
+ */
+static dma_addr_t pci_iommu_map_single(struct device *hwdev, void *vaddr,
+		size_t size, enum dma_data_direction direction)
+{
+	return iommu_map_single(devnode_table(hwdev), vaddr, size, direction);
+}
+
+
+static void pci_iommu_unmap_single(struct device *hwdev, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction direction)
+{
+	iommu_unmap_single(devnode_table(hwdev), dma_handle, size, direction);
+}
+
+
+static int pci_iommu_map_sg(struct device *pdev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	return iommu_map_sg(pdev, devnode_table(pdev), sglist,
+			nelems, direction);
+}
+
+static void pci_iommu_unmap_sg(struct device *pdev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	iommu_unmap_sg(devnode_table(pdev), sglist, nelems, direction);
+}
+
+/* We support DMA to/from any memory page via the iommu */
+static int pci_iommu_dma_supported(struct device *dev, u64 mask)
+{
+	return 1;
+}
+
+void pci_iommu_init(void)
+{
+	pci_dma_ops.alloc_coherent = pci_iommu_alloc_coherent;
+	pci_dma_ops.free_coherent = pci_iommu_free_coherent;
+	pci_dma_ops.map_single = pci_iommu_map_single;
+	pci_dma_ops.unmap_single = pci_iommu_unmap_single;
+	pci_dma_ops.map_sg = pci_iommu_map_sg;
+	pci_dma_ops.unmap_sg = pci_iommu_unmap_sg;
+	pci_dma_ops.dma_supported = pci_iommu_dma_supported;
+}
diff --git a/arch/ppc64/kernel/pmac.h b/arch/ppc64/kernel/pmac.h
new file mode 100644
index 00000000000..40e1c5030f7
--- /dev/null
+++ b/arch/ppc64/kernel/pmac.h
@@ -0,0 +1,31 @@
+#ifndef __PMAC_H__
+#define __PMAC_H__
+
+#include <linux/pci.h>
+#include <linux/ide.h>
+
+/*
+ * Declaration for the various functions exported by the
+ * pmac_* files. Mostly for use by pmac_setup
+ */
+
+extern void pmac_get_boot_time(struct rtc_time *tm);
+extern void pmac_get_rtc_time(struct rtc_time *tm);
+extern int  pmac_set_rtc_time(struct rtc_time *tm);
+extern void pmac_read_rtc_time(void);
+extern void pmac_calibrate_decr(void);
+
+extern void pmac_pcibios_fixup(void);
+extern void pmac_pci_init(void);
+extern void pmac_setup_pci_dma(void);
+extern void pmac_check_ht_link(void);
+
+extern void pmac_setup_smp(void);
+
+extern unsigned long pmac_ide_get_base(int index);
+extern void pmac_ide_init_hwif_ports(hw_regs_t *hw,
+	unsigned long data_port, unsigned long ctrl_port, int *irq);
+
+extern void pmac_nvram_init(void);
+
+#endif /* __PMAC_H__ */
diff --git a/arch/ppc64/kernel/pmac_feature.c b/arch/ppc64/kernel/pmac_feature.c
new file mode 100644
index 00000000000..7f1062d222c
--- /dev/null
+++ b/arch/ppc64/kernel/pmac_feature.c
@@ -0,0 +1,676 @@
+/*
+ *  arch/ppc/platforms/pmac_feature.c
+ *
+ *  Copyright (C) 1996-2001 Paul Mackerras (paulus@cs.anu.edu.au)
+ *                          Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ *  TODO:
+ *
+ *   - Replace mdelay with some schedule loop if possible
+ *   - Shorten some obfuscated delays on some routines (like modem
+ *     power)
+ *   - Refcount some clocks (see darwin)
+ *   - Split split split...
+ *
+ */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <asm/sections.h>
+#include <asm/errno.h>
+#include <asm/keylargo.h>
+#include <asm/uninorth.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+#include <asm/dbdma.h>
+#include <asm/pci-bridge.h>
+#include <asm/pmac_low_i2c.h>
+
+#undef DEBUG_FEATURE
+
+#ifdef DEBUG_FEATURE
+#define DBG(fmt...) printk(KERN_DEBUG fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * We use a single global lock to protect accesses. Each driver has
+ * to take care of its own locking
+ */
+static DEFINE_SPINLOCK(feature_lock  __pmacdata);
+
+#define LOCK(flags)	spin_lock_irqsave(&feature_lock, flags);
+#define UNLOCK(flags)	spin_unlock_irqrestore(&feature_lock, flags);
+
+
+/*
+ * Instance of some macio stuffs
+ */
+struct macio_chip macio_chips[MAX_MACIO_CHIPS]  __pmacdata;
+
+struct macio_chip* __pmac
+macio_find(struct device_node* child, int type)
+{
+	while(child) {
+		int	i;
+
+		for (i=0; i < MAX_MACIO_CHIPS && macio_chips[i].of_node; i++)
+			if (child == macio_chips[i].of_node &&
+			    (!type || macio_chips[i].type == type))
+				return &macio_chips[i];
+		child = child->parent;
+	}
+	return NULL;
+}
+
+static const char* macio_names[] __pmacdata =
+{
+	"Unknown",
+	"Grand Central",
+	"OHare",
+	"OHareII",
+	"Heathrow",
+	"Gatwick",
+	"Paddington",
+	"Keylargo",
+	"Pangea",
+	"Intrepid",
+	"K2"
+};
+
+
+
+/*
+ * Uninorth reg. access. Note that Uni-N regs are big endian
+ */
+
+#define UN_REG(r)	(uninorth_base + ((r) >> 2))
+#define UN_IN(r)	(in_be32(UN_REG(r)))
+#define UN_OUT(r,v)	(out_be32(UN_REG(r), (v)))
+#define UN_BIS(r,v)	(UN_OUT((r), UN_IN(r) | (v)))
+#define UN_BIC(r,v)	(UN_OUT((r), UN_IN(r) & ~(v)))
+
+static struct device_node* uninorth_node __pmacdata;
+static u32* uninorth_base __pmacdata;
+static u32 uninorth_rev __pmacdata;
+static void *u3_ht;
+
+extern struct device_node *k2_skiplist[2];
+
+/*
+ * For each motherboard family, we have a table of functions pointers
+ * that handle the various features.
+ */
+
+typedef long (*feature_call)(struct device_node* node, long param, long value);
+
+struct feature_table_entry {
+	unsigned int	selector;
+	feature_call	function;
+};
+
+struct pmac_mb_def
+{
+	const char*			model_string;
+	const char*			model_name;
+	int				model_id;
+	struct feature_table_entry* 	features;
+	unsigned long			board_flags;
+};
+static struct pmac_mb_def pmac_mb __pmacdata;
+
+/*
+ * Here are the chip specific feature functions
+ */
+
+
+static long __pmac g5_read_gpio(struct device_node* node, long param, long value)
+{
+	struct macio_chip* macio = &macio_chips[0];
+
+	return MACIO_IN8(param);
+}
+
+
+static long __pmac g5_write_gpio(struct device_node* node, long param, long value)
+{
+	struct macio_chip* macio = &macio_chips[0];
+
+	MACIO_OUT8(param, (u8)(value & 0xff));
+	return 0;
+}
+
+static long __pmac g5_gmac_enable(struct device_node* node, long param, long value)
+{
+	struct macio_chip* macio = &macio_chips[0];
+	unsigned long flags;
+
+	if (node == NULL)
+		return -ENODEV;
+
+	LOCK(flags);
+	if (value) {
+		MACIO_BIS(KEYLARGO_FCR1, K2_FCR1_GMAC_CLK_ENABLE);
+		mb();
+		k2_skiplist[0] = NULL;
+	} else {
+		k2_skiplist[0] = node;
+		mb();
+		MACIO_BIC(KEYLARGO_FCR1, K2_FCR1_GMAC_CLK_ENABLE);
+	}
+	
+	UNLOCK(flags);
+	mdelay(1);
+
+	return 0;
+}
+
+static long __pmac g5_fw_enable(struct device_node* node, long param, long value)
+{
+	struct macio_chip* macio = &macio_chips[0];
+	unsigned long flags;
+
+	if (node == NULL)
+		return -ENODEV;
+
+	LOCK(flags);
+	if (value) {
+		MACIO_BIS(KEYLARGO_FCR1, K2_FCR1_FW_CLK_ENABLE);
+		mb();
+		k2_skiplist[1] = NULL;
+	} else {
+		k2_skiplist[1] = node;
+		mb();
+		MACIO_BIC(KEYLARGO_FCR1, K2_FCR1_FW_CLK_ENABLE);
+	}
+	
+	UNLOCK(flags);
+	mdelay(1);
+
+	return 0;
+}
+
+static long __pmac g5_mpic_enable(struct device_node* node, long param, long value)
+{
+	unsigned long flags;
+
+	if (node->parent == NULL || strcmp(node->parent->name, "u3"))
+		return 0;
+
+	LOCK(flags);
+	UN_BIS(U3_TOGGLE_REG, U3_MPIC_RESET | U3_MPIC_OUTPUT_ENABLE);
+	UNLOCK(flags);
+
+	return 0;
+}
+
+static long __pmac g5_eth_phy_reset(struct device_node* node, long param, long value)
+{
+	struct macio_chip* macio = &macio_chips[0];
+	struct device_node *phy;
+	int need_reset;
+
+	/*
+	 * We must not reset the combo PHYs, only the BCM5221 found in
+	 * the iMac G5.
+	 */
+	phy = of_get_next_child(node, NULL);
+	if (!phy)
+		return -ENODEV;
+	need_reset = device_is_compatible(phy, "B5221");
+	of_node_put(phy);
+	if (!need_reset)
+		return 0;
+
+	/* PHY reset is GPIO 29, not in device-tree unfortunately */
+	MACIO_OUT8(K2_GPIO_EXTINT_0 + 29,
+		   KEYLARGO_GPIO_OUTPUT_ENABLE | KEYLARGO_GPIO_OUTOUT_DATA);
+	/* Thankfully, this is now always called at a time when we can
+	 * schedule by sungem.
+	 */
+	msleep(10);
+	MACIO_OUT8(K2_GPIO_EXTINT_0 + 29, 0);
+
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+static long __pmac g5_reset_cpu(struct device_node* node, long param, long value)
+{
+	unsigned int reset_io = 0;
+	unsigned long flags;
+	struct macio_chip* macio;
+	struct device_node* np;
+
+	macio = &macio_chips[0];
+	if (macio->type != macio_keylargo2)
+		return -ENODEV;
+
+	np = find_path_device("/cpus");
+	if (np == NULL)
+		return -ENODEV;
+	for (np = np->child; np != NULL; np = np->sibling) {
+		u32* num = (u32 *)get_property(np, "reg", NULL);
+		u32* rst = (u32 *)get_property(np, "soft-reset", NULL);
+		if (num == NULL || rst == NULL)
+			continue;
+		if (param == *num) {
+			reset_io = *rst;
+			break;
+		}
+	}
+	if (np == NULL || reset_io == 0)
+		return -ENODEV;
+
+	LOCK(flags);
+	MACIO_OUT8(reset_io, KEYLARGO_GPIO_OUTPUT_ENABLE);
+	(void)MACIO_IN8(reset_io);
+	udelay(1);
+	MACIO_OUT8(reset_io, 0);
+	(void)MACIO_IN8(reset_io);
+	UNLOCK(flags);
+
+	return 0;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * This can be called from pmac_smp so isn't static
+ *
+ * This takes the second CPU off the bus on dual CPU machines
+ * running UP
+ */
+void __pmac g5_phy_disable_cpu1(void)
+{
+	UN_OUT(U3_API_PHY_CONFIG_1, 0);
+}
+
+static long __pmac generic_get_mb_info(struct device_node* node, long param, long value)
+{
+	switch(param) {
+		case PMAC_MB_INFO_MODEL:
+			return pmac_mb.model_id;
+		case PMAC_MB_INFO_FLAGS:
+			return pmac_mb.board_flags;
+		case PMAC_MB_INFO_NAME:			
+			/* hack hack hack... but should work */
+			*((const char **)value) = pmac_mb.model_name;
+			return 0;
+	}
+	return -EINVAL;
+}
+
+
+/*
+ * Table definitions
+ */
+
+/* Used on any machine
+ */
+static struct feature_table_entry any_features[]  __pmacdata = {
+	{ PMAC_FTR_GET_MB_INFO,		generic_get_mb_info },
+	{ 0, NULL }
+};
+
+/* G5 features
+ */
+static struct feature_table_entry g5_features[]  __pmacdata = {
+	{ PMAC_FTR_GMAC_ENABLE,		g5_gmac_enable },
+	{ PMAC_FTR_1394_ENABLE,		g5_fw_enable },
+	{ PMAC_FTR_ENABLE_MPIC,		g5_mpic_enable },
+	{ PMAC_FTR_READ_GPIO,		g5_read_gpio },
+	{ PMAC_FTR_WRITE_GPIO,		g5_write_gpio },
+	{ PMAC_FTR_GMAC_PHY_RESET,	g5_eth_phy_reset },
+#ifdef CONFIG_SMP
+	{ PMAC_FTR_RESET_CPU,		g5_reset_cpu },
+#endif /* CONFIG_SMP */
+	{ 0, NULL }
+};
+
+static struct pmac_mb_def pmac_mb_defs[] __pmacdata = {
+	{	"PowerMac7,2",			"PowerMac G5",
+		PMAC_TYPE_POWERMAC_G5,		g5_features,
+		0,
+	},
+	{	"PowerMac7,3",			"PowerMac G5",
+		PMAC_TYPE_POWERMAC_G5,		g5_features,
+		0,
+	},
+	{	"PowerMac8,1",			"iMac G5",
+		PMAC_TYPE_IMAC_G5,		g5_features,
+		0,
+	},
+	{	"PowerMac9,1",			"PowerMac G5",
+		PMAC_TYPE_POWERMAC_G5_U3L,	g5_features,
+		0,
+	},
+	{       "RackMac3,1",                   "XServe G5",
+		PMAC_TYPE_XSERVE_G5,		g5_features,
+		0,
+	},
+};
+
+/*
+ * The toplevel feature_call callback
+ */
+long __pmac pmac_do_feature_call(unsigned int selector, ...)
+{
+	struct device_node* node;
+	long param, value;
+	int i;
+	feature_call func = NULL;
+	va_list args;
+
+	if (pmac_mb.features)
+		for (i=0; pmac_mb.features[i].function; i++)
+			if (pmac_mb.features[i].selector == selector) {
+				func = pmac_mb.features[i].function;
+				break;
+			}
+	if (!func)
+		for (i=0; any_features[i].function; i++)
+			if (any_features[i].selector == selector) {
+				func = any_features[i].function;
+				break;
+			}
+	if (!func)
+		return -ENODEV;
+
+	va_start(args, selector);
+	node = (struct device_node*)va_arg(args, void*);
+	param = va_arg(args, long);
+	value = va_arg(args, long);
+	va_end(args);
+
+	return func(node, param, value);
+}
+
+static int __init probe_motherboard(void)
+{
+	int i;
+	struct macio_chip* macio = &macio_chips[0];
+	const char* model = NULL;
+	struct device_node *dt;
+
+	/* Lookup known motherboard type in device-tree. First try an
+	 * exact match on the "model" property, then try a "compatible"
+	 * match is none is found.
+	 */
+	dt = find_devices("device-tree");
+	if (dt != NULL)
+		model = (const char *) get_property(dt, "model", NULL);
+	for(i=0; model && i<(sizeof(pmac_mb_defs)/sizeof(struct pmac_mb_def)); i++) {
+	    if (strcmp(model, pmac_mb_defs[i].model_string) == 0) {
+		pmac_mb = pmac_mb_defs[i];
+		goto found;
+	    }
+	}
+	for(i=0; i<(sizeof(pmac_mb_defs)/sizeof(struct pmac_mb_def)); i++) {
+	    if (machine_is_compatible(pmac_mb_defs[i].model_string)) {
+		pmac_mb = pmac_mb_defs[i];
+		goto found;
+	    }
+	}
+
+	/* Fallback to selection depending on mac-io chip type */
+	switch(macio->type) {
+	case macio_keylargo2:
+		pmac_mb.model_id = PMAC_TYPE_UNKNOWN_K2;
+		pmac_mb.model_name = "Unknown K2-based";
+	    	pmac_mb.features = g5_features;
+		
+	default:
+	    	return -ENODEV;
+	}
+found:
+	/* Check for "mobile" machine */
+	if (model && (strncmp(model, "PowerBook", 9) == 0
+		   || strncmp(model, "iBook", 5) == 0))
+		pmac_mb.board_flags |= PMAC_MB_MOBILE;
+
+
+	printk(KERN_INFO "PowerMac motherboard: %s\n", pmac_mb.model_name);
+	return 0;
+}
+
+/* Initialize the Core99 UniNorth host bridge and memory controller
+ */
+static void __init probe_uninorth(void)
+{
+	uninorth_node = of_find_node_by_name(NULL, "u3");
+	if (uninorth_node && uninorth_node->n_addrs > 0) {
+		/* Small hack until I figure out if parsing in prom.c is correct. I should
+		 * get rid of those pre-parsed junk anyway
+		 */
+		unsigned long address = uninorth_node->addrs[0].address;
+		uninorth_base = ioremap(address, 0x40000);
+		uninorth_rev = in_be32(UN_REG(UNI_N_VERSION));
+		u3_ht = ioremap(address + U3_HT_CONFIG_BASE, 0x1000);
+	} else
+		uninorth_node = NULL;
+
+	if (!uninorth_node)
+		return;
+
+	printk(KERN_INFO "Found U3 memory controller & host bridge, revision: %d\n",
+	       uninorth_rev);
+	printk(KERN_INFO "Mapped at 0x%08lx\n", (unsigned long)uninorth_base);
+
+}
+
+static void __init probe_one_macio(const char* name, const char* compat, int type)
+{
+	struct device_node*	node;
+	int			i;
+	volatile u32*		base;
+	u32*			revp;
+
+	node = find_devices(name);
+	if (!node || !node->n_addrs)
+		return;
+	if (compat)
+		do {
+			if (device_is_compatible(node, compat))
+				break;
+			node = node->next;
+		} while (node);
+	if (!node)
+		return;
+	for(i=0; i<MAX_MACIO_CHIPS; i++) {
+		if (!macio_chips[i].of_node)
+			break;
+		if (macio_chips[i].of_node == node)
+			return;
+	}
+	if (i >= MAX_MACIO_CHIPS) {
+		printk(KERN_ERR "pmac_feature: Please increase MAX_MACIO_CHIPS !\n");
+		printk(KERN_ERR "pmac_feature: %s skipped\n", node->full_name);
+		return;
+	}
+	base = (volatile u32*)ioremap(node->addrs[0].address, node->addrs[0].size);
+	if (!base) {
+		printk(KERN_ERR "pmac_feature: Can't map mac-io chip !\n");
+		return;
+	}
+	if (type == macio_keylargo) {
+		u32* did = (u32 *)get_property(node, "device-id", NULL);
+		if (*did == 0x00000025)
+			type = macio_pangea;
+		if (*did == 0x0000003e)
+			type = macio_intrepid;
+	}
+	macio_chips[i].of_node	= node;
+	macio_chips[i].type	= type;
+	macio_chips[i].base	= base;
+	macio_chips[i].flags	= MACIO_FLAG_SCCB_ON | MACIO_FLAG_SCCB_ON;
+	macio_chips[i].name 	= macio_names[type];
+	revp = (u32 *)get_property(node, "revision-id", NULL);
+	if (revp)
+		macio_chips[i].rev = *revp;
+	printk(KERN_INFO "Found a %s mac-io controller, rev: %d, mapped at 0x%p\n",
+		macio_names[type], macio_chips[i].rev, macio_chips[i].base);
+}
+
+static int __init
+probe_macios(void)
+{
+	probe_one_macio("mac-io", "K2-Keylargo", macio_keylargo2);
+
+	macio_chips[0].lbus.index = 0;
+	macio_chips[1].lbus.index = 1;
+
+	return (macio_chips[0].of_node == NULL) ? -ENODEV : 0;
+}
+
+static void __init
+set_initial_features(void)
+{
+	struct device_node *np;
+
+	if (macio_chips[0].type == macio_keylargo2) {
+#ifndef CONFIG_SMP
+		/* On SMP machines running UP, we have the second CPU eating
+		 * bus cycles. We need to take it off the bus. This is done
+		 * from pmac_smp for SMP kernels running on one CPU
+		 */
+		np = of_find_node_by_type(NULL, "cpu");
+		if (np != NULL)
+			np = of_find_node_by_type(np, "cpu");
+		if (np != NULL) {
+			g5_phy_disable_cpu1();
+			of_node_put(np);
+		}
+#endif /* CONFIG_SMP */
+		/* Enable GMAC for now for PCI probing. It will be disabled
+		 * later on after PCI probe
+		 */
+		np = of_find_node_by_name(NULL, "ethernet");
+		while(np) {
+			if (device_is_compatible(np, "K2-GMAC"))
+				g5_gmac_enable(np, 0, 1);
+			np = of_find_node_by_name(np, "ethernet");
+		}
+
+		/* Enable FW before PCI probe. Will be disabled later on
+		 * Note: We should have a batter way to check that we are
+		 * dealing with uninorth internal cell and not a PCI cell
+		 * on the external PCI. The code below works though.
+		 */
+		np = of_find_node_by_name(NULL, "firewire");
+		while(np) {
+			if (device_is_compatible(np, "pci106b,5811")) {
+				macio_chips[0].flags |= MACIO_FLAG_FW_SUPPORTED;
+				g5_fw_enable(np, 0, 1);
+			}
+			np = of_find_node_by_name(np, "firewire");
+		}
+	}
+}
+
+void __init
+pmac_feature_init(void)
+{
+	/* Detect the UniNorth memory controller */
+	probe_uninorth();
+
+	/* Probe mac-io controllers */
+	if (probe_macios()) {
+		printk(KERN_WARNING "No mac-io chip found\n");
+		return;
+	}
+
+	/* Setup low-level i2c stuffs */
+	pmac_init_low_i2c();
+
+	/* Probe machine type */
+	if (probe_motherboard())
+		printk(KERN_WARNING "Unknown PowerMac !\n");
+
+	/* Set some initial features (turn off some chips that will
+	 * be later turned on)
+	 */
+	set_initial_features();
+}
+
+int __init pmac_feature_late_init(void)
+{
+#if 0
+	struct device_node* np;
+
+	/* Request some resources late */
+	if (uninorth_node)
+		request_OF_resource(uninorth_node, 0, NULL);
+	np = find_devices("hammerhead");
+	if (np)
+		request_OF_resource(np, 0, NULL);
+	np = find_devices("interrupt-controller");
+	if (np)
+		request_OF_resource(np, 0, NULL);
+#endif
+	return 0;
+}
+
+device_initcall(pmac_feature_late_init);
+
+#if 0
+static void dump_HT_speeds(char *name, u32 cfg, u32 frq)
+{
+	int	freqs[16] = { 200,300,400,500,600,800,1000,0,0,0,0,0,0,0,0,0 };
+	int	bits[8] = { 8,16,0,32,2,4,0,0 };
+	int	freq = (frq >> 8) & 0xf;
+
+	if (freqs[freq] == 0)
+		printk("%s: Unknown HT link frequency %x\n", name, freq);
+	else
+		printk("%s: %d MHz on main link, (%d in / %d out) bits width\n",
+		       name, freqs[freq],
+		       bits[(cfg >> 28) & 0x7], bits[(cfg >> 24) & 0x7]);
+}
+#endif
+
+void __init pmac_check_ht_link(void)
+{
+#if 0 /* Disabled for now */
+	u32	ufreq, freq, ucfg, cfg;
+	struct device_node *pcix_node;
+	u8  	px_bus, px_devfn;
+	struct pci_controller *px_hose;
+
+	(void)in_be32(u3_ht + U3_HT_LINK_COMMAND);
+	ucfg = cfg = in_be32(u3_ht + U3_HT_LINK_CONFIG);
+	ufreq = freq = in_be32(u3_ht + U3_HT_LINK_FREQ);
+	dump_HT_speeds("U3 HyperTransport", cfg, freq);
+
+	pcix_node = of_find_compatible_node(NULL, "pci", "pci-x");
+	if (pcix_node == NULL) {
+		printk("No PCI-X bridge found\n");
+		return;
+	}
+	px_hose = pcix_node->phb;
+	px_bus = pcix_node->busno;
+	px_devfn = pcix_node->devfn;
+	
+	early_read_config_dword(px_hose, px_bus, px_devfn, 0xc4, &cfg);
+	early_read_config_dword(px_hose, px_bus, px_devfn, 0xcc, &freq);
+	dump_HT_speeds("PCI-X HT Uplink", cfg, freq);
+	early_read_config_dword(px_hose, px_bus, px_devfn, 0xc8, &cfg);
+	early_read_config_dword(px_hose, px_bus, px_devfn, 0xd0, &freq);
+	dump_HT_speeds("PCI-X HT Downlink", cfg, freq);
+#endif
+}
diff --git a/arch/ppc64/kernel/pmac_low_i2c.c b/arch/ppc64/kernel/pmac_low_i2c.c
new file mode 100644
index 00000000000..f3f39e8e337
--- /dev/null
+++ b/arch/ppc64/kernel/pmac_low_i2c.c
@@ -0,0 +1,523 @@
+/*
+ *  arch/ppc/platforms/pmac_low_i2c.c
+ *
+ *  Copyright (C) 2003 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ *  This file contains some low-level i2c access routines that
+ *  need to be used by various bits of the PowerMac platform code
+ *  at times where the real asynchronous & interrupt driven driver
+ *  cannot be used. The API borrows some semantics from the darwin
+ *  driver in order to ease the implementation of the platform
+ *  properties parser
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <asm/keylargo.h>
+#include <asm/uninorth.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/pmac_low_i2c.h>
+
+#define MAX_LOW_I2C_HOST	4
+
+#ifdef DEBUG
+#define DBG(x...) do {\
+		printk(KERN_DEBUG "KW:" x);	\
+	} while(0)
+#else
+#define DBG(x...)
+#endif
+
+struct low_i2c_host;
+
+typedef int (*low_i2c_func_t)(struct low_i2c_host *host, u8 addr, u8 sub, u8 *data, int len);
+
+struct low_i2c_host
+{
+	struct device_node	*np;		/* OF device node */
+	struct semaphore	mutex;		/* Access mutex for use by i2c-keywest */
+	low_i2c_func_t		func;		/* Access function */
+	unsigned int		is_open : 1;	/* Poor man's access control */
+	int			mode;		/* Current mode */
+	int			channel;	/* Current channel */
+	int			num_channels;	/* Number of channels */
+	void __iomem		*base;		/* For keywest-i2c, base address */
+	int			bsteps;		/* And register stepping */
+	int			speed;		/* And speed */
+};
+
+static struct low_i2c_host	low_i2c_hosts[MAX_LOW_I2C_HOST];
+
+/* No locking is necessary on allocation, we are running way before
+ * anything can race with us
+ */
+static struct low_i2c_host *find_low_i2c_host(struct device_node *np)
+{
+	int i;
+
+	for (i = 0; i < MAX_LOW_I2C_HOST; i++)
+		if (low_i2c_hosts[i].np == np)
+			return &low_i2c_hosts[i];
+	return NULL;
+}
+
+/*
+ *
+ * i2c-keywest implementation (UniNorth, U2, U3, Keylargo's)
+ *
+ */
+
+/*
+ * Keywest i2c definitions borrowed from drivers/i2c/i2c-keywest.h,
+ * should be moved somewhere in include/asm-ppc/
+ */
+/* Register indices */
+typedef enum {
+	reg_mode = 0,
+	reg_control,
+	reg_status,
+	reg_isr,
+	reg_ier,
+	reg_addr,
+	reg_subaddr,
+	reg_data
+} reg_t;
+
+
+/* Mode register */
+#define KW_I2C_MODE_100KHZ	0x00
+#define KW_I2C_MODE_50KHZ	0x01
+#define KW_I2C_MODE_25KHZ	0x02
+#define KW_I2C_MODE_DUMB	0x00
+#define KW_I2C_MODE_STANDARD	0x04
+#define KW_I2C_MODE_STANDARDSUB	0x08
+#define KW_I2C_MODE_COMBINED	0x0C
+#define KW_I2C_MODE_MODE_MASK	0x0C
+#define KW_I2C_MODE_CHAN_MASK	0xF0
+
+/* Control register */
+#define KW_I2C_CTL_AAK		0x01
+#define KW_I2C_CTL_XADDR	0x02
+#define KW_I2C_CTL_STOP		0x04
+#define KW_I2C_CTL_START	0x08
+
+/* Status register */
+#define KW_I2C_STAT_BUSY	0x01
+#define KW_I2C_STAT_LAST_AAK	0x02
+#define KW_I2C_STAT_LAST_RW	0x04
+#define KW_I2C_STAT_SDA		0x08
+#define KW_I2C_STAT_SCL		0x10
+
+/* IER & ISR registers */
+#define KW_I2C_IRQ_DATA		0x01
+#define KW_I2C_IRQ_ADDR		0x02
+#define KW_I2C_IRQ_STOP		0x04
+#define KW_I2C_IRQ_START	0x08
+#define KW_I2C_IRQ_MASK		0x0F
+
+/* State machine states */
+enum {
+	state_idle,
+	state_addr,
+	state_read,
+	state_write,
+	state_stop,
+	state_dead
+};
+
+#define WRONG_STATE(name) do {\
+		printk(KERN_DEBUG "KW: wrong state. Got %s, state: %s (isr: %02x)\n", \
+		       name, __kw_state_names[state], isr); \
+	} while(0)
+
+static const char *__kw_state_names[] = {
+	"state_idle",
+	"state_addr",
+	"state_read",
+	"state_write",
+	"state_stop",
+	"state_dead"
+};
+
+static inline u8 __kw_read_reg(struct low_i2c_host *host, reg_t reg)
+{
+	return readb(host->base + (((unsigned int)reg) << host->bsteps));
+}
+
+static inline void __kw_write_reg(struct low_i2c_host *host, reg_t reg, u8 val)
+{
+	writeb(val, host->base + (((unsigned)reg) << host->bsteps));
+	(void)__kw_read_reg(host, reg_subaddr);
+}
+
+#define kw_write_reg(reg, val)	__kw_write_reg(host, reg, val) 
+#define kw_read_reg(reg)	__kw_read_reg(host, reg) 
+
+
+/* Don't schedule, the g5 fan controller is too
+ * timing sensitive
+ */
+static u8 kw_wait_interrupt(struct low_i2c_host* host)
+{
+	int i, j;
+	u8 isr;
+	
+	for (i = 0; i < 100000; i++) {
+		isr = kw_read_reg(reg_isr) & KW_I2C_IRQ_MASK;
+		if (isr != 0)
+			return isr;
+
+		/* This code is used with the timebase frozen, we cannot rely
+		 * on udelay ! For now, just use a bogus loop
+		 */
+		for (j = 1; j < 10000; j++)
+			mb();
+	}
+	return isr;
+}
+
+static int kw_handle_interrupt(struct low_i2c_host *host, int state, int rw, int *rc, u8 **data, int *len, u8 isr)
+{
+	u8 ack;
+
+	DBG("kw_handle_interrupt(%s, isr: %x)\n", __kw_state_names[state], isr);
+
+	if (isr == 0) {
+		if (state != state_stop) {
+			DBG("KW: Timeout !\n");
+			*rc = -EIO;
+			goto stop;
+		}
+		if (state == state_stop) {
+			ack = kw_read_reg(reg_status);
+			if (!(ack & KW_I2C_STAT_BUSY)) {
+				state = state_idle;
+				kw_write_reg(reg_ier, 0x00);
+			}
+		}
+		return state;
+	}
+
+	if (isr & KW_I2C_IRQ_ADDR) {
+		ack = kw_read_reg(reg_status);
+		if (state != state_addr) {
+			kw_write_reg(reg_isr, KW_I2C_IRQ_ADDR);
+			WRONG_STATE("KW_I2C_IRQ_ADDR"); 
+			*rc = -EIO;
+			goto stop;
+		}
+		if ((ack & KW_I2C_STAT_LAST_AAK) == 0) {			
+			*rc = -ENODEV;
+			DBG("KW: NAK on address\n");
+			return state_stop;		     
+		} else {
+			if (rw) {
+				state = state_read;
+				if (*len > 1)
+					kw_write_reg(reg_control, KW_I2C_CTL_AAK);
+			} else {
+				state = state_write;
+				kw_write_reg(reg_data, **data);
+				(*data)++; (*len)--;
+			}
+		}
+		kw_write_reg(reg_isr, KW_I2C_IRQ_ADDR);
+	}
+
+	if (isr & KW_I2C_IRQ_DATA) {
+		if (state == state_read) {
+			**data = kw_read_reg(reg_data);
+			(*data)++; (*len)--;
+			kw_write_reg(reg_isr, KW_I2C_IRQ_DATA);
+			if ((*len) == 0)
+				state = state_stop;
+			else if ((*len) == 1)
+				kw_write_reg(reg_control, 0);
+		} else if (state == state_write) {
+			ack = kw_read_reg(reg_status);
+			if ((ack & KW_I2C_STAT_LAST_AAK) == 0) {
+				DBG("KW: nack on data write\n");
+				*rc = -EIO;
+				goto stop;
+			} else if (*len) {
+				kw_write_reg(reg_data, **data);
+				(*data)++; (*len)--;
+			} else {
+				kw_write_reg(reg_control, KW_I2C_CTL_STOP);
+				state = state_stop;
+				*rc = 0;
+			}
+			kw_write_reg(reg_isr, KW_I2C_IRQ_DATA);
+		} else {
+			kw_write_reg(reg_isr, KW_I2C_IRQ_DATA);
+			WRONG_STATE("KW_I2C_IRQ_DATA"); 
+			if (state != state_stop) {
+				*rc = -EIO;
+				goto stop;
+			}
+		}
+	}
+
+	if (isr & KW_I2C_IRQ_STOP) {
+		kw_write_reg(reg_isr, KW_I2C_IRQ_STOP);
+		if (state != state_stop) {
+			WRONG_STATE("KW_I2C_IRQ_STOP");
+			*rc = -EIO;
+		}
+		return state_idle;
+	}
+
+	if (isr & KW_I2C_IRQ_START)
+		kw_write_reg(reg_isr, KW_I2C_IRQ_START);
+
+	return state;
+
+ stop:
+	kw_write_reg(reg_control, KW_I2C_CTL_STOP);	
+	return state_stop;
+}
+
+static int keywest_low_i2c_func(struct low_i2c_host *host, u8 addr, u8 subaddr, u8 *data, int len)
+{
+	u8 mode_reg = host->speed;
+	int state = state_addr;
+	int rc = 0;
+
+	/* Setup mode & subaddress if any */
+	switch(host->mode) {
+	case pmac_low_i2c_mode_dumb:
+		printk(KERN_ERR "low_i2c: Dumb mode not supported !\n");
+		return -EINVAL;
+	case pmac_low_i2c_mode_std:
+		mode_reg |= KW_I2C_MODE_STANDARD;
+		break;
+	case pmac_low_i2c_mode_stdsub:
+		mode_reg |= KW_I2C_MODE_STANDARDSUB;
+		break;
+	case pmac_low_i2c_mode_combined:
+		mode_reg |= KW_I2C_MODE_COMBINED;
+		break;
+	}
+
+	/* Setup channel & clear pending irqs */
+	kw_write_reg(reg_isr, kw_read_reg(reg_isr));
+	kw_write_reg(reg_mode, mode_reg | (host->channel << 4));
+	kw_write_reg(reg_status, 0);
+
+	/* Set up address and r/w bit */
+	kw_write_reg(reg_addr, addr);
+
+	/* Set up the sub address */
+	if ((mode_reg & KW_I2C_MODE_MODE_MASK) == KW_I2C_MODE_STANDARDSUB
+	    || (mode_reg & KW_I2C_MODE_MODE_MASK) == KW_I2C_MODE_COMBINED)
+		kw_write_reg(reg_subaddr, subaddr);
+
+	/* Start sending address & disable interrupt*/
+	kw_write_reg(reg_ier, 0 /*KW_I2C_IRQ_MASK*/);
+	kw_write_reg(reg_control, KW_I2C_CTL_XADDR);
+
+	/* State machine, to turn into an interrupt handler */
+	while(state != state_idle) {
+		u8 isr = kw_wait_interrupt(host);
+		state = kw_handle_interrupt(host, state, addr & 1, &rc, &data, &len, isr);
+	}
+
+	return rc;
+}
+
+static void keywest_low_i2c_add(struct device_node *np)
+{
+	struct low_i2c_host	*host = find_low_i2c_host(NULL);
+	u32			*psteps, *prate, steps, aoffset = 0;
+	struct device_node	*parent;
+
+	if (host == NULL) {
+		printk(KERN_ERR "low_i2c: Can't allocate host for %s\n",
+		       np->full_name);
+		return;
+	}
+	memset(host, 0, sizeof(*host));
+
+	init_MUTEX(&host->mutex);
+	host->np = of_node_get(np);	
+	psteps = (u32 *)get_property(np, "AAPL,address-step", NULL);
+	steps = psteps ? (*psteps) : 0x10;
+	for (host->bsteps = 0; (steps & 0x01) == 0; host->bsteps++)
+		steps >>= 1;
+	parent = of_get_parent(np);
+	host->num_channels = 1;
+	if (parent && parent->name[0] == 'u') {
+		host->num_channels = 2;
+		aoffset = 3;
+	}
+	/* Select interface rate */
+	host->speed = KW_I2C_MODE_100KHZ;
+	prate = (u32 *)get_property(np, "AAPL,i2c-rate", NULL);
+	if (prate) switch(*prate) {
+	case 100:
+		host->speed = KW_I2C_MODE_100KHZ;
+		break;
+	case 50:
+		host->speed = KW_I2C_MODE_50KHZ;
+		break;
+	case 25:
+		host->speed = KW_I2C_MODE_25KHZ;
+		break;
+	}	
+
+	host->mode = pmac_low_i2c_mode_std;
+	host->base = ioremap(np->addrs[0].address + aoffset,
+						np->addrs[0].size);
+	host->func = keywest_low_i2c_func;
+}
+
+/*
+ *
+ * PMU implementation
+ *
+ */
+
+
+#ifdef CONFIG_ADB_PMU
+
+static int pmu_low_i2c_func(struct low_i2c_host *host, u8 addr, u8 sub, u8 *data, int len)
+{
+	// TODO
+	return -ENODEV;
+}
+
+static void pmu_low_i2c_add(struct device_node *np)
+{
+	struct low_i2c_host	*host = find_low_i2c_host(NULL);
+
+	if (host == NULL) {
+		printk(KERN_ERR "low_i2c: Can't allocate host for %s\n",
+		       np->full_name);
+		return;
+	}
+	memset(host, 0, sizeof(*host));
+
+	init_MUTEX(&host->mutex);
+	host->np = of_node_get(np);	
+	host->num_channels = 3;
+	host->mode = pmac_low_i2c_mode_std;
+	host->func = pmu_low_i2c_func;
+}
+
+#endif /* CONFIG_ADB_PMU */
+
+void __init pmac_init_low_i2c(void)
+{
+	struct device_node *np;
+
+	/* Probe keywest-i2c busses */
+	np = of_find_compatible_node(NULL, "i2c", "keywest-i2c");
+	while(np) {
+		keywest_low_i2c_add(np);
+		np = of_find_compatible_node(np, "i2c", "keywest-i2c");
+	}
+
+#ifdef CONFIG_ADB_PMU
+	/* Probe PMU busses */
+	np = of_find_node_by_name(NULL, "via-pmu");
+	if (np)
+		pmu_low_i2c_add(np);
+#endif /* CONFIG_ADB_PMU */
+
+	/* TODO: Add CUDA support as well */
+}
+
+int pmac_low_i2c_lock(struct device_node *np)
+{
+	struct low_i2c_host *host = find_low_i2c_host(np);
+
+	if (!host)
+		return -ENODEV;
+	down(&host->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(pmac_low_i2c_lock);
+
+int pmac_low_i2c_unlock(struct device_node *np)
+{
+	struct low_i2c_host *host = find_low_i2c_host(np);
+
+	if (!host)
+		return -ENODEV;
+	up(&host->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(pmac_low_i2c_unlock);
+
+
+int pmac_low_i2c_open(struct device_node *np, int channel)
+{
+	struct low_i2c_host *host = find_low_i2c_host(np);
+
+	if (!host)
+		return -ENODEV;
+
+	if (channel >= host->num_channels)
+		return -EINVAL;
+
+	down(&host->mutex);
+	host->is_open = 1;
+	host->channel = channel;
+
+	return 0;
+}
+EXPORT_SYMBOL(pmac_low_i2c_open);
+
+int pmac_low_i2c_close(struct device_node *np)
+{
+	struct low_i2c_host *host = find_low_i2c_host(np);
+
+	if (!host)
+		return -ENODEV;
+
+	host->is_open = 0;
+	up(&host->mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(pmac_low_i2c_close);
+
+int pmac_low_i2c_setmode(struct device_node *np, int mode)
+{
+	struct low_i2c_host *host = find_low_i2c_host(np);
+
+	if (!host)
+		return -ENODEV;
+	WARN_ON(!host->is_open);
+	host->mode = mode;
+
+	return 0;
+}
+EXPORT_SYMBOL(pmac_low_i2c_setmode);
+
+int pmac_low_i2c_xfer(struct device_node *np, u8 addrdir, u8 subaddr, u8 *data, int len)
+{
+	struct low_i2c_host *host = find_low_i2c_host(np);
+
+	if (!host)
+		return -ENODEV;
+	WARN_ON(!host->is_open);
+
+	return host->func(host, addrdir, subaddr, data, len);
+}
+EXPORT_SYMBOL(pmac_low_i2c_xfer);
+
diff --git a/arch/ppc64/kernel/pmac_nvram.c b/arch/ppc64/kernel/pmac_nvram.c
new file mode 100644
index 00000000000..e32a902236e
--- /dev/null
+++ b/arch/ppc64/kernel/pmac_nvram.c
@@ -0,0 +1,495 @@
+/*
+ *  arch/ppc/platforms/pmac_nvram.c
+ *
+ *  Copyright (C) 2002 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ *  Todo: - add support for the OF persistent properties
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/bootmem.h>
+#include <linux/completion.h>
+#include <linux/spinlock.h>
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/nvram.h>
+
+#define DEBUG
+
+#ifdef DEBUG
+#define DBG(x...) printk(x)
+#else
+#define DBG(x...)
+#endif
+
+#define NVRAM_SIZE		0x2000	/* 8kB of non-volatile RAM */
+
+#define CORE99_SIGNATURE	0x5a
+#define CORE99_ADLER_START	0x14
+
+/* On Core99, nvram is either a sharp, a micron or an AMD flash */
+#define SM_FLASH_STATUS_DONE	0x80
+#define SM_FLASH_STATUS_ERR	0x38
+
+#define SM_FLASH_CMD_ERASE_CONFIRM	0xd0
+#define SM_FLASH_CMD_ERASE_SETUP	0x20
+#define SM_FLASH_CMD_RESET		0xff
+#define SM_FLASH_CMD_WRITE_SETUP	0x40
+#define SM_FLASH_CMD_CLEAR_STATUS	0x50
+#define SM_FLASH_CMD_READ_STATUS	0x70
+
+/* CHRP NVRAM header */
+struct chrp_header {
+  u8		signature;
+  u8		cksum;
+  u16		len;
+  char          name[12];
+  u8		data[0];
+};
+
+struct core99_header {
+  struct chrp_header	hdr;
+  u32			adler;
+  u32			generation;
+  u32			reserved[2];
+};
+
+/*
+ * Read and write the non-volatile RAM on PowerMacs and CHRP machines.
+ */
+static volatile unsigned char *nvram_data;
+static int core99_bank = 0;
+// XXX Turn that into a sem
+static DEFINE_SPINLOCK(nv_lock);
+
+extern int system_running;
+
+static int (*core99_write_bank)(int bank, u8* datas);
+static int (*core99_erase_bank)(int bank);
+
+static char *nvram_image __pmacdata;
+
+
+static ssize_t __pmac core99_nvram_read(char *buf, size_t count, loff_t *index)
+{
+	int i;
+
+	if (nvram_image == NULL)
+		return -ENODEV;
+	if (*index > NVRAM_SIZE)
+		return 0;
+
+	i = *index;
+	if (i + count > NVRAM_SIZE)
+		count = NVRAM_SIZE - i;
+
+	memcpy(buf, &nvram_image[i], count);
+	*index = i + count;
+	return count;
+}
+
+static ssize_t __pmac core99_nvram_write(char *buf, size_t count, loff_t *index)
+{
+	int i;
+
+	if (nvram_image == NULL)
+		return -ENODEV;
+	if (*index > NVRAM_SIZE)
+		return 0;
+
+	i = *index;
+	if (i + count > NVRAM_SIZE)
+		count = NVRAM_SIZE - i;
+
+	memcpy(&nvram_image[i], buf, count);
+	*index = i + count;
+	return count;
+}
+
+static ssize_t __pmac core99_nvram_size(void)
+{
+	if (nvram_image == NULL)
+		return -ENODEV;
+	return NVRAM_SIZE;
+}
+
+static u8 __pmac chrp_checksum(struct chrp_header* hdr)
+{
+	u8 *ptr;
+	u16 sum = hdr->signature;
+	for (ptr = (u8 *)&hdr->len; ptr < hdr->data; ptr++)
+		sum += *ptr;
+	while (sum > 0xFF)
+		sum = (sum & 0xFF) + (sum>>8);
+	return sum;
+}
+
+static u32 __pmac core99_calc_adler(u8 *buffer)
+{
+	int cnt;
+	u32 low, high;
+
+   	buffer += CORE99_ADLER_START;
+	low = 1;
+	high = 0;
+	for (cnt=0; cnt<(NVRAM_SIZE-CORE99_ADLER_START); cnt++) {
+		if ((cnt % 5000) == 0) {
+			high  %= 65521UL;
+			high %= 65521UL;
+		}
+		low += buffer[cnt];
+		high += low;
+	}
+	low  %= 65521UL;
+	high %= 65521UL;
+
+	return (high << 16) | low;
+}
+
+static u32 __pmac core99_check(u8* datas)
+{
+	struct core99_header* hdr99 = (struct core99_header*)datas;
+
+	if (hdr99->hdr.signature != CORE99_SIGNATURE) {
+		DBG("Invalid signature\n");
+		return 0;
+	}
+	if (hdr99->hdr.cksum != chrp_checksum(&hdr99->hdr)) {
+		DBG("Invalid checksum\n");
+		return 0;
+	}
+	if (hdr99->adler != core99_calc_adler(datas)) {
+		DBG("Invalid adler\n");
+		return 0;
+	}
+	return hdr99->generation;
+}
+
+static int __pmac sm_erase_bank(int bank)
+{
+	int stat, i;
+	unsigned long timeout;
+
+	u8* base = (u8 *)nvram_data + core99_bank*NVRAM_SIZE;
+
+       	DBG("nvram: Sharp/Micron Erasing bank %d...\n", bank);
+
+	out_8(base, SM_FLASH_CMD_ERASE_SETUP);
+	out_8(base, SM_FLASH_CMD_ERASE_CONFIRM);
+	timeout = 0;
+	do {
+		if (++timeout > 1000000) {
+			printk(KERN_ERR "nvram: Sharp/Miron flash erase timeout !\n");
+			break;
+		}
+		out_8(base, SM_FLASH_CMD_READ_STATUS);
+		stat = in_8(base);
+	} while (!(stat & SM_FLASH_STATUS_DONE));
+
+	out_8(base, SM_FLASH_CMD_CLEAR_STATUS);
+	out_8(base, SM_FLASH_CMD_RESET);
+
+	for (i=0; i<NVRAM_SIZE; i++)
+		if (base[i] != 0xff) {
+			printk(KERN_ERR "nvram: Sharp/Micron flash erase failed !\n");
+			return -ENXIO;
+		}
+	return 0;
+}
+
+static int __pmac sm_write_bank(int bank, u8* datas)
+{
+	int i, stat = 0;
+	unsigned long timeout;
+
+	u8* base = (u8 *)nvram_data + core99_bank*NVRAM_SIZE;
+
+       	DBG("nvram: Sharp/Micron Writing bank %d...\n", bank);
+
+	for (i=0; i<NVRAM_SIZE; i++) {
+		out_8(base+i, SM_FLASH_CMD_WRITE_SETUP);
+		udelay(1);
+		out_8(base+i, datas[i]);
+		timeout = 0;
+		do {
+			if (++timeout > 1000000) {
+				printk(KERN_ERR "nvram: Sharp/Micron flash write timeout !\n");
+				break;
+			}
+			out_8(base, SM_FLASH_CMD_READ_STATUS);
+			stat = in_8(base);
+		} while (!(stat & SM_FLASH_STATUS_DONE));
+		if (!(stat & SM_FLASH_STATUS_DONE))
+			break;
+	}
+	out_8(base, SM_FLASH_CMD_CLEAR_STATUS);
+	out_8(base, SM_FLASH_CMD_RESET);
+	for (i=0; i<NVRAM_SIZE; i++)
+		if (base[i] != datas[i]) {
+			printk(KERN_ERR "nvram: Sharp/Micron flash write failed !\n");
+			return -ENXIO;
+		}
+	return 0;
+}
+
+static int __pmac amd_erase_bank(int bank)
+{
+	int i, stat = 0;
+	unsigned long timeout;
+
+	u8* base = (u8 *)nvram_data + core99_bank*NVRAM_SIZE;
+
+       	DBG("nvram: AMD Erasing bank %d...\n", bank);
+
+	/* Unlock 1 */
+	out_8(base+0x555, 0xaa);
+	udelay(1);
+	/* Unlock 2 */
+	out_8(base+0x2aa, 0x55);
+	udelay(1);
+
+	/* Sector-Erase */
+	out_8(base+0x555, 0x80);
+	udelay(1);
+	out_8(base+0x555, 0xaa);
+	udelay(1);
+	out_8(base+0x2aa, 0x55);
+	udelay(1);
+	out_8(base, 0x30);
+	udelay(1);
+
+	timeout = 0;
+	do {
+		if (++timeout > 1000000) {
+			printk(KERN_ERR "nvram: AMD flash erase timeout !\n");
+			break;
+		}
+		stat = in_8(base) ^ in_8(base);
+	} while (stat != 0);
+	
+	/* Reset */
+	out_8(base, 0xf0);
+	udelay(1);
+	
+	for (i=0; i<NVRAM_SIZE; i++)
+		if (base[i] != 0xff) {
+			printk(KERN_ERR "nvram: AMD flash erase failed !\n");
+			return -ENXIO;
+		}
+	return 0;
+}
+
+static int __pmac amd_write_bank(int bank, u8* datas)
+{
+	int i, stat = 0;
+	unsigned long timeout;
+
+	u8* base = (u8 *)nvram_data + core99_bank*NVRAM_SIZE;
+
+       	DBG("nvram: AMD Writing bank %d...\n", bank);
+
+	for (i=0; i<NVRAM_SIZE; i++) {
+		/* Unlock 1 */
+		out_8(base+0x555, 0xaa);
+		udelay(1);
+		/* Unlock 2 */
+		out_8(base+0x2aa, 0x55);
+		udelay(1);
+
+		/* Write single word */
+		out_8(base+0x555, 0xa0);
+		udelay(1);
+		out_8(base+i, datas[i]);
+		
+		timeout = 0;
+		do {
+			if (++timeout > 1000000) {
+				printk(KERN_ERR "nvram: AMD flash write timeout !\n");
+				break;
+			}
+			stat = in_8(base) ^ in_8(base);
+		} while (stat != 0);
+		if (stat != 0)
+			break;
+	}
+
+	/* Reset */
+	out_8(base, 0xf0);
+	udelay(1);
+
+	for (i=0; i<NVRAM_SIZE; i++)
+		if (base[i] != datas[i]) {
+			printk(KERN_ERR "nvram: AMD flash write failed !\n");
+			return -ENXIO;
+		}
+	return 0;
+}
+
+
+static int __pmac core99_nvram_sync(void)
+{
+	struct core99_header* hdr99;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nv_lock, flags);
+	if (!memcmp(nvram_image, (u8*)nvram_data + core99_bank*NVRAM_SIZE,
+		NVRAM_SIZE))
+		goto bail;
+
+	DBG("Updating nvram...\n");
+
+	hdr99 = (struct core99_header*)nvram_image;
+	hdr99->generation++;
+	hdr99->hdr.signature = CORE99_SIGNATURE;
+	hdr99->hdr.cksum = chrp_checksum(&hdr99->hdr);
+	hdr99->adler = core99_calc_adler(nvram_image);
+	core99_bank = core99_bank ? 0 : 1;
+	if (core99_erase_bank)
+		if (core99_erase_bank(core99_bank)) {
+			printk("nvram: Error erasing bank %d\n", core99_bank);
+			goto bail;
+		}
+	if (core99_write_bank)
+		if (core99_write_bank(core99_bank, nvram_image))
+			printk("nvram: Error writing bank %d\n", core99_bank);
+ bail:
+	spin_unlock_irqrestore(&nv_lock, flags);
+
+	return 0;
+}
+
+int __init pmac_nvram_init(void)
+{
+	struct device_node *dp;
+	u32 gen_bank0, gen_bank1;
+	int i;
+
+	dp = find_devices("nvram");
+	if (dp == NULL) {
+		printk(KERN_ERR "Can't find NVRAM device\n");
+		return -ENODEV;
+	}
+	if (!device_is_compatible(dp, "nvram,flash")) {
+		printk(KERN_ERR "Incompatible type of NVRAM\n");
+		return -ENXIO;
+	}
+
+	nvram_image = alloc_bootmem(NVRAM_SIZE);
+	if (nvram_image == NULL) {
+		printk(KERN_ERR "nvram: can't allocate ram image\n");
+		return -ENOMEM;
+	}
+	nvram_data = ioremap(dp->addrs[0].address, NVRAM_SIZE*2);
+	
+	DBG("nvram: Checking bank 0...\n");
+
+	gen_bank0 = core99_check((u8 *)nvram_data);
+	gen_bank1 = core99_check((u8 *)nvram_data + NVRAM_SIZE);
+	core99_bank = (gen_bank0 < gen_bank1) ? 1 : 0;
+
+	DBG("nvram: gen0=%d, gen1=%d\n", gen_bank0, gen_bank1);
+	DBG("nvram: Active bank is: %d\n", core99_bank);
+
+	for (i=0; i<NVRAM_SIZE; i++)
+		nvram_image[i] = nvram_data[i + core99_bank*NVRAM_SIZE];
+
+	ppc_md.nvram_read	= core99_nvram_read;
+	ppc_md.nvram_write	= core99_nvram_write;
+	ppc_md.nvram_size	= core99_nvram_size;
+	ppc_md.nvram_sync	= core99_nvram_sync;
+	
+	/* 
+	 * Maybe we could be smarter here though making an exclusive list
+	 * of known flash chips is a bit nasty as older OF didn't provide us
+	 * with a useful "compatible" entry. A solution would be to really
+	 * identify the chip using flash id commands and base ourselves on
+	 * a list of known chips IDs
+	 */
+	if (device_is_compatible(dp, "amd-0137")) {
+		core99_erase_bank = amd_erase_bank;
+		core99_write_bank = amd_write_bank;
+	} else {
+		core99_erase_bank = sm_erase_bank;
+		core99_write_bank = sm_write_bank;
+	}
+
+	return 0;
+}
+
+int __pmac pmac_get_partition(int partition)
+{
+	struct nvram_partition *part;
+	const char *name;
+	int sig;
+
+	switch(partition) {
+	case pmac_nvram_OF:
+		name = "common";
+		sig = NVRAM_SIG_SYS;
+		break;
+	case pmac_nvram_XPRAM:
+		name = "APL,MacOS75";
+		sig = NVRAM_SIG_OS;
+		break;
+	case pmac_nvram_NR:
+	default:
+		/* Oldworld stuff */
+		return -ENODEV;
+	}
+
+	part = nvram_find_partition(sig, name);
+	if (part == NULL)
+		return 0;
+
+	return part->index;
+}
+
+u8 __pmac pmac_xpram_read(int xpaddr)
+{
+	int offset = pmac_get_partition(pmac_nvram_XPRAM);
+	loff_t index;
+	u8 buf;
+	ssize_t count;
+
+	if (offset < 0 || xpaddr < 0 || xpaddr > 0x100)
+		return 0xff;
+	index = offset + xpaddr;
+
+	count = ppc_md.nvram_read(&buf, 1, &index);
+	if (count != 1)
+		return 0xff;
+	return buf;
+}
+
+void __pmac pmac_xpram_write(int xpaddr, u8 data)
+{
+	int offset = pmac_get_partition(pmac_nvram_XPRAM);
+	loff_t index;
+	u8 buf;
+
+	if (offset < 0 || xpaddr < 0 || xpaddr > 0x100)
+		return;
+	index = offset + xpaddr;
+	buf = data;
+
+	ppc_md.nvram_write(&buf, 1, &index);
+}
+
+EXPORT_SYMBOL(pmac_get_partition);
+EXPORT_SYMBOL(pmac_xpram_read);
+EXPORT_SYMBOL(pmac_xpram_write);
diff --git a/arch/ppc64/kernel/pmac_pci.c b/arch/ppc64/kernel/pmac_pci.c
new file mode 100644
index 00000000000..71fe911ad18
--- /dev/null
+++ b/arch/ppc64/kernel/pmac_pci.c
@@ -0,0 +1,793 @@
+/*
+ * Support for PCI bridges found on Power Macintoshes.
+ * At present the "bandit" and "chaos" bridges are supported.
+ * Fortunately you access configuration space in the same
+ * way with either bridge.
+ *
+ * Copyright (C) 2003 Benjamin Herrenschmuidt (benh@kernel.crashing.org)
+ * Copyright (C) 1997 Paul Mackerras (paulus@samba.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+#include <asm/iommu.h>
+
+#include "pci.h"
+#include "pmac.h"
+
+#define DEBUG
+
+#ifdef DEBUG
+#define DBG(x...) printk(x)
+#else
+#define DBG(x...)
+#endif
+
+/* XXX Could be per-controller, but I don't think we risk anything by
+ * assuming we won't have both UniNorth and Bandit */
+static int has_uninorth;
+static struct pci_controller *u3_agp;
+struct device_node *k2_skiplist[2];
+
+static int __init fixup_one_level_bus_range(struct device_node *node, int higher)
+{
+	for (; node != 0;node = node->sibling) {
+		int * bus_range;
+		unsigned int *class_code;
+		int len;
+
+		/* For PCI<->PCI bridges or CardBus bridges, we go down */
+		class_code = (unsigned int *) get_property(node, "class-code", NULL);
+		if (!class_code || ((*class_code >> 8) != PCI_CLASS_BRIDGE_PCI &&
+			(*class_code >> 8) != PCI_CLASS_BRIDGE_CARDBUS))
+			continue;
+		bus_range = (int *) get_property(node, "bus-range", &len);
+		if (bus_range != NULL && len > 2 * sizeof(int)) {
+			if (bus_range[1] > higher)
+				higher = bus_range[1];
+		}
+		higher = fixup_one_level_bus_range(node->child, higher);
+	}
+	return higher;
+}
+
+/* This routine fixes the "bus-range" property of all bridges in the
+ * system since they tend to have their "last" member wrong on macs
+ *
+ * Note that the bus numbers manipulated here are OF bus numbers, they
+ * are not Linux bus numbers.
+ */
+static void __init fixup_bus_range(struct device_node *bridge)
+{
+	int * bus_range;
+	int len;
+
+	/* Lookup the "bus-range" property for the hose */
+	bus_range = (int *) get_property(bridge, "bus-range", &len);
+	if (bus_range == NULL || len < 2 * sizeof(int)) {
+		printk(KERN_WARNING "Can't get bus-range for %s\n",
+			       bridge->full_name);
+		return;
+	}
+	bus_range[1] = fixup_one_level_bus_range(bridge->child, bus_range[1]);
+}
+
+/*
+ * Apple MacRISC (U3, UniNorth, Bandit, Chaos) PCI controllers.
+ *
+ * The "Bandit" version is present in all early PCI PowerMacs,
+ * and up to the first ones using Grackle. Some machines may
+ * have 2 bandit controllers (2 PCI busses).
+ *
+ * "Chaos" is used in some "Bandit"-type machines as a bridge
+ * for the separate display bus. It is accessed the same
+ * way as bandit, but cannot be probed for devices. It therefore
+ * has its own config access functions.
+ *
+ * The "UniNorth" version is present in all Core99 machines
+ * (iBook, G4, new IMacs, and all the recent Apple machines).
+ * It contains 3 controllers in one ASIC.
+ *
+ * The U3 is the bridge used on G5 machines. It contains on
+ * AGP bus which is dealt with the old UniNorth access routines
+ * and an HyperTransport bus which uses its own set of access
+ * functions.
+ */
+
+#define MACRISC_CFA0(devfn, off)	\
+	((1 << (unsigned long)PCI_SLOT(dev_fn)) \
+	| (((unsigned long)PCI_FUNC(dev_fn)) << 8) \
+	| (((unsigned long)(off)) & 0xFCUL))
+
+#define MACRISC_CFA1(bus, devfn, off)	\
+	((((unsigned long)(bus)) << 16) \
+	|(((unsigned long)(devfn)) << 8) \
+	|(((unsigned long)(off)) & 0xFCUL) \
+	|1UL)
+
+static unsigned long __pmac macrisc_cfg_access(struct pci_controller* hose,
+					       u8 bus, u8 dev_fn, u8 offset)
+{
+	unsigned int caddr;
+
+	if (bus == hose->first_busno) {
+		if (dev_fn < (11 << 3))
+			return 0;
+		caddr = MACRISC_CFA0(dev_fn, offset);
+	} else
+		caddr = MACRISC_CFA1(bus, dev_fn, offset);
+
+	/* Uninorth will return garbage if we don't read back the value ! */
+	do {
+		out_le32(hose->cfg_addr, caddr);
+	} while (in_le32(hose->cfg_addr) != caddr);
+
+	offset &= has_uninorth ? 0x07 : 0x03;
+	return ((unsigned long)hose->cfg_data) + offset;
+}
+
+static int __pmac macrisc_read_config(struct pci_bus *bus, unsigned int devfn,
+				      int offset, int len, u32 *val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = macrisc_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		*val = in_8((u8 *)addr);
+		break;
+	case 2:
+		*val = in_le16((u16 *)addr);
+		break;
+	default:
+		*val = in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int __pmac macrisc_write_config(struct pci_bus *bus, unsigned int devfn,
+				       int offset, int len, u32 val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = macrisc_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		out_8((u8 *)addr, val);
+		(void) in_8((u8 *)addr);
+		break;
+	case 2:
+		out_le16((u16 *)addr, val);
+		(void) in_le16((u16 *)addr);
+		break;
+	default:
+		out_le32((u32 *)addr, val);
+		(void) in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops macrisc_pci_ops =
+{
+	macrisc_read_config,
+	macrisc_write_config
+};
+
+/*
+ * These versions of U3 HyperTransport config space access ops do not
+ * implement self-view of the HT host yet
+ */
+
+/*
+ * This function deals with some "special cases" devices.
+ *
+ *  0 -> No special case
+ *  1 -> Skip the device but act as if the access was successfull
+ *       (return 0xff's on reads, eventually, cache config space
+ *       accesses in a later version)
+ * -1 -> Hide the device (unsuccessful acess)
+ */
+static int u3_ht_skip_device(struct pci_controller *hose,
+			     struct pci_bus *bus, unsigned int devfn)
+{
+	struct device_node *busdn, *dn;
+	int i;
+
+	/* We only allow config cycles to devices that are in OF device-tree
+	 * as we are apparently having some weird things going on with some
+	 * revs of K2 on recent G5s
+	 */
+	if (bus->self)
+		busdn = pci_device_to_OF_node(bus->self);
+	else
+		busdn = hose->arch_data;
+	for (dn = busdn->child; dn; dn = dn->sibling)
+		if (dn->devfn == devfn)
+			break;
+	if (dn == NULL)
+		return -1;
+
+	/*
+	 * When a device in K2 is powered down, we die on config
+	 * cycle accesses. Fix that here.
+	 */
+	for (i=0; i<2; i++)
+		if (k2_skiplist[i] == dn)
+			return 1;
+
+	return 0;
+}
+
+#define U3_HT_CFA0(devfn, off)		\
+		((((unsigned long)devfn) << 8) | offset)
+#define U3_HT_CFA1(bus, devfn, off)	\
+		(U3_HT_CFA0(devfn, off) \
+		+ (((unsigned long)bus) << 16) \
+		+ 0x01000000UL)
+
+static unsigned long __pmac u3_ht_cfg_access(struct pci_controller* hose,
+					     u8 bus, u8 devfn, u8 offset)
+{
+	if (bus == hose->first_busno) {
+		/* For now, we don't self probe U3 HT bridge */
+		if (PCI_SLOT(devfn) == 0)
+			return 0;
+		return ((unsigned long)hose->cfg_data) + U3_HT_CFA0(devfn, offset);
+	} else
+		return ((unsigned long)hose->cfg_data) + U3_HT_CFA1(bus, devfn, offset);
+}
+
+static int __pmac u3_ht_read_config(struct pci_bus *bus, unsigned int devfn,
+				    int offset, int len, u32 *val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+
+	hose = pci_bus_to_host(bus);      
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = u3_ht_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	switch (u3_ht_skip_device(hose, bus, devfn)) {
+	case 0:
+		break;
+	case 1:
+		switch (len) {
+		case 1:
+			*val = 0xff; break;
+		case 2:
+			*val = 0xffff; break;
+		default:
+			*val = 0xfffffffful; break;
+		}
+		return PCIBIOS_SUCCESSFUL;
+	default:
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	}
+
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		*val = in_8((u8 *)addr);
+		break;
+	case 2:
+		*val = in_le16((u16 *)addr);
+		break;
+	default:
+		*val = in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int __pmac u3_ht_write_config(struct pci_bus *bus, unsigned int devfn,
+				     int offset, int len, u32 val)
+{
+	struct pci_controller *hose;
+	unsigned long addr;
+
+	hose = pci_bus_to_host(bus);
+	if (hose == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	addr = u3_ht_cfg_access(hose, bus->number, devfn, offset);
+	if (!addr)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	switch (u3_ht_skip_device(hose, bus, devfn)) {
+	case 0:
+		break;
+	case 1:
+		return PCIBIOS_SUCCESSFUL;
+	default:
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	}
+
+	/*
+	 * Note: the caller has already checked that offset is
+	 * suitably aligned and that len is 1, 2 or 4.
+	 */
+	switch (len) {
+	case 1:
+		out_8((u8 *)addr, val);
+		(void) in_8((u8 *)addr);
+		break;
+	case 2:
+		out_le16((u16 *)addr, val);
+		(void) in_le16((u16 *)addr);
+		break;
+	default:
+		out_le32((u32 *)addr, val);
+		(void) in_le32((u32 *)addr);
+		break;
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops u3_ht_pci_ops =
+{
+	u3_ht_read_config,
+	u3_ht_write_config
+};
+
+static void __init setup_u3_agp(struct pci_controller* hose)
+{
+	/* On G5, we move AGP up to high bus number so we don't need
+	 * to reassign bus numbers for HT. If we ever have P2P bridges
+	 * on AGP, we'll have to move pci_assign_all_busses to the
+	 * pci_controller structure so we enable it for AGP and not for
+	 * HT childs.
+	 * We hard code the address because of the different size of
+	 * the reg address cell, we shall fix that by killing struct
+	 * reg_property and using some accessor functions instead
+	 */
+       	hose->first_busno = 0xf0;
+	hose->last_busno = 0xff;
+	has_uninorth = 1;
+	hose->ops = &macrisc_pci_ops;
+	hose->cfg_addr = ioremap(0xf0000000 + 0x800000, 0x1000);
+	hose->cfg_data = ioremap(0xf0000000 + 0xc00000, 0x1000);
+
+	u3_agp = hose;
+}
+
+static void __init setup_u3_ht(struct pci_controller* hose)
+{
+	struct device_node *np = (struct device_node *)hose->arch_data;
+	int i, cur;
+
+	hose->ops = &u3_ht_pci_ops;
+
+	/* We hard code the address because of the different size of
+	 * the reg address cell, we shall fix that by killing struct
+	 * reg_property and using some accessor functions instead
+	 */
+	hose->cfg_data = (volatile unsigned char *)ioremap(0xf2000000, 0x02000000);
+
+	/*
+	 * /ht node doesn't expose a "ranges" property, so we "remove" regions that
+	 * have been allocated to AGP. So far, this version of the code doesn't assign
+	 * any of the 0xfxxxxxxx "fine" memory regions to /ht.
+	 * We need to fix that sooner or later by either parsing all child "ranges"
+	 * properties or figuring out the U3 address space decoding logic and
+	 * then read it's configuration register (if any).
+	 */
+	hose->io_base_phys = 0xf4000000;
+	hose->io_base_virt = ioremap(hose->io_base_phys, 0x00400000);
+	isa_io_base = pci_io_base = (unsigned long) hose->io_base_virt;
+	hose->io_resource.name = np->full_name;
+	hose->io_resource.start = 0;
+	hose->io_resource.end = 0x003fffff;
+	hose->io_resource.flags = IORESOURCE_IO;
+	hose->pci_mem_offset = 0;
+	hose->first_busno = 0;
+	hose->last_busno = 0xef;
+	hose->mem_resources[0].name = np->full_name;
+	hose->mem_resources[0].start = 0x80000000;
+	hose->mem_resources[0].end = 0xefffffff;
+	hose->mem_resources[0].flags = IORESOURCE_MEM;
+
+	if (u3_agp == NULL) {
+		DBG("U3 has no AGP, using full resource range\n");
+		return;
+	}
+
+	/* We "remove" the AGP resources from the resources allocated to HT, that
+	 * is we create "holes". However, that code does assumptions that so far
+	 * happen to be true (cross fingers...), typically that resources in the
+	 * AGP node are properly ordered
+	 */
+	cur = 0;
+	for (i=0; i<3; i++) {
+		struct resource *res = &u3_agp->mem_resources[i];
+		if (res->flags != IORESOURCE_MEM)
+			continue;
+		/* We don't care about "fine" resources */
+		if (res->start >= 0xf0000000)
+			continue;
+		/* Check if it's just a matter of "shrinking" us in one direction */
+		if (hose->mem_resources[cur].start == res->start) {
+			DBG("U3/HT: shrink start of %d, %08lx -> %08lx\n",
+			    cur, hose->mem_resources[cur].start, res->end + 1);
+			hose->mem_resources[cur].start = res->end + 1;
+			continue;
+		}
+		if (hose->mem_resources[cur].end == res->end) {
+			DBG("U3/HT: shrink end of %d, %08lx -> %08lx\n",
+			    cur, hose->mem_resources[cur].end, res->start - 1);
+			hose->mem_resources[cur].end = res->start - 1;
+			continue;
+		}
+		/* No, it's not the case, we need a hole */
+		if (cur == 2) {
+			/* not enough resources for a hole, we drop part of the range */
+			printk(KERN_WARNING "Running out of resources for /ht host !\n");
+			hose->mem_resources[cur].end = res->start - 1;
+			continue;
+		}		
+		cur++;
+       		DBG("U3/HT: hole, %d end at %08lx, %d start at %08lx\n",
+		    cur-1, res->start - 1, cur, res->end + 1);
+		hose->mem_resources[cur].name = np->full_name;
+		hose->mem_resources[cur].flags = IORESOURCE_MEM;
+		hose->mem_resources[cur].start = res->end + 1;
+		hose->mem_resources[cur].end = hose->mem_resources[cur-1].end;
+		hose->mem_resources[cur-1].end = res->start - 1;
+	}
+}
+
+static void __init pmac_process_bridge_OF_ranges(struct pci_controller *hose,
+			   struct device_node *dev, int primary)
+{
+	static unsigned int static_lc_ranges[2024];
+	unsigned int *dt_ranges, *lc_ranges, *ranges, *prev;
+	unsigned int size;
+	int rlen = 0, orig_rlen;
+	int memno = 0;
+	struct resource *res;
+	int np, na = prom_n_addr_cells(dev);
+
+	np = na + 5;
+
+	/* First we try to merge ranges to fix a problem with some pmacs
+	 * that can have more than 3 ranges, fortunately using contiguous
+	 * addresses -- BenH
+	 */
+	dt_ranges = (unsigned int *) get_property(dev, "ranges", &rlen);
+	if (!dt_ranges)
+		return;
+	/*	lc_ranges = alloc_bootmem(rlen);*/
+	lc_ranges = static_lc_ranges;
+	if (!lc_ranges)
+		return; /* what can we do here ? */
+	memcpy(lc_ranges, dt_ranges, rlen);
+	orig_rlen = rlen;
+
+	/* Let's work on a copy of the "ranges" property instead of damaging
+	 * the device-tree image in memory
+	 */
+	ranges = lc_ranges;
+	prev = NULL;
+	while ((rlen -= np * sizeof(unsigned int)) >= 0) {
+		if (prev) {
+			if (prev[0] == ranges[0] && prev[1] == ranges[1] &&
+				(prev[2] + prev[na+4]) == ranges[2] &&
+				(prev[na+2] + prev[na+4]) == ranges[na+2]) {
+				prev[na+4] += ranges[na+4];
+				ranges[0] = 0;
+				ranges += np;
+				continue;
+			}
+		}
+		prev = ranges;
+		ranges += np;
+	}
+
+	/*
+	 * The ranges property is laid out as an array of elements,
+	 * each of which comprises:
+	 *   cells 0 - 2:	a PCI address
+	 *   cells 3 or 3+4:	a CPU physical address
+	 *			(size depending on dev->n_addr_cells)
+	 *   cells 4+5 or 5+6:	the size of the range
+	 */
+	ranges = lc_ranges;
+	rlen = orig_rlen;
+	while (ranges && (rlen -= np * sizeof(unsigned int)) >= 0) {
+		res = NULL;
+		size = ranges[na+4];
+		switch (ranges[0] >> 24) {
+		case 1:		/* I/O space */
+			if (ranges[2] != 0)
+				break;
+			hose->io_base_phys = ranges[na+2];
+			/* limit I/O space to 16MB */
+			if (size > 0x01000000)
+				size = 0x01000000;
+			hose->io_base_virt = ioremap(ranges[na+2], size);
+			if (primary)
+				isa_io_base = (unsigned long) hose->io_base_virt;
+			res = &hose->io_resource;
+			res->flags = IORESOURCE_IO;
+			res->start = ranges[2];
+			break;
+		case 2:		/* memory space */
+			memno = 0;
+			if (ranges[1] == 0 && ranges[2] == 0
+			    && ranges[na+4] <= (16 << 20)) {
+				/* 1st 16MB, i.e. ISA memory area */
+#if 0
+				if (primary)
+					isa_mem_base = ranges[na+2];
+#endif
+				memno = 1;
+			}
+			while (memno < 3 && hose->mem_resources[memno].flags)
+				++memno;
+			if (memno == 0)
+				hose->pci_mem_offset = ranges[na+2] - ranges[2];
+			if (memno < 3) {
+				res = &hose->mem_resources[memno];
+				res->flags = IORESOURCE_MEM;
+				res->start = ranges[na+2];
+			}
+			break;
+		}
+		if (res != NULL) {
+			res->name = dev->full_name;
+			res->end = res->start + size - 1;
+			res->parent = NULL;
+			res->sibling = NULL;
+			res->child = NULL;
+		}
+		ranges += np;
+	}
+}
+
+/*
+ * We assume that if we have a G3 powermac, we have one bridge called
+ * "pci" (a MPC106) and no bandit or chaos bridges, and contrariwise,
+ * if we have one or more bandit or chaos bridges, we don't have a MPC106.
+ */
+static int __init add_bridge(struct device_node *dev)
+{
+	int len;
+	struct pci_controller *hose;
+	char* disp_name;
+	int *bus_range;
+	int primary = 1;
+ 	struct property *of_prop;
+
+	DBG("Adding PCI host bridge %s\n", dev->full_name);
+
+       	bus_range = (int *) get_property(dev, "bus-range", &len);
+       	if (bus_range == NULL || len < 2 * sizeof(int)) {
+       		printk(KERN_WARNING "Can't get bus-range for %s, assume bus 0\n",
+       			       dev->full_name);
+       	}
+
+	hose = alloc_bootmem(sizeof(struct pci_controller));
+	if (hose == NULL)
+		return -ENOMEM;
+       	pci_setup_pci_controller(hose);
+
+       	hose->arch_data = dev;
+       	hose->first_busno = bus_range ? bus_range[0] : 0;
+       	hose->last_busno = bus_range ? bus_range[1] : 0xff;
+
+	of_prop = alloc_bootmem(sizeof(struct property) +
+				sizeof(hose->global_number));
+	if (of_prop) {
+		memset(of_prop, 0, sizeof(struct property));
+		of_prop->name = "linux,pci-domain";
+		of_prop->length = sizeof(hose->global_number);
+		of_prop->value = (unsigned char *)&of_prop[1];
+		memcpy(of_prop->value, &hose->global_number, sizeof(hose->global_number));
+		prom_add_property(dev, of_prop);
+	}
+
+	disp_name = NULL;
+       	if (device_is_compatible(dev, "u3-agp")) {
+       		setup_u3_agp(hose);
+       		disp_name = "U3-AGP";
+       		primary = 0;
+       	} else if (device_is_compatible(dev, "u3-ht")) {
+       		setup_u3_ht(hose);
+       		disp_name = "U3-HT";
+       		primary = 1;
+       	}
+       	printk(KERN_INFO "Found %s PCI host bridge. Firmware bus number: %d->%d\n",
+       		disp_name, hose->first_busno, hose->last_busno);
+
+       	/* Interpret the "ranges" property */
+       	/* This also maps the I/O region and sets isa_io/mem_base */
+       	pmac_process_bridge_OF_ranges(hose, dev, primary);
+
+       	/* Fixup "bus-range" OF property */
+       	fixup_bus_range(dev);
+
+	return 0;
+}
+
+/*
+ * We use our own read_irq_line here because PCI_INTERRUPT_PIN is
+ * crap on some of Apple ASICs. We unconditionally use the Open Firmware
+ * interrupt number as this is always right.
+ */
+static int pmac_pci_read_irq_line(struct pci_dev *pci_dev)
+{
+	struct device_node *node;
+
+	node = pci_device_to_OF_node(pci_dev);
+	if (node == NULL)
+		return -1;
+	if (node->n_intrs == 0)
+		return -1;
+	pci_dev->irq = node->intrs[0].line;
+	pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, pci_dev->irq);
+
+	return 0;
+}
+
+void __init pmac_pcibios_fixup(void)
+{
+	struct pci_dev *dev = NULL;
+
+	for_each_pci_dev(dev)
+		pmac_pci_read_irq_line(dev);
+}
+
+static void __init pmac_fixup_phb_resources(void)
+{
+	struct pci_controller *hose, *tmp;
+	
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		unsigned long offset = (unsigned long)hose->io_base_virt - pci_io_base;
+		hose->io_resource.start += offset;
+		hose->io_resource.end += offset;
+		printk(KERN_INFO "PCI Host %d, io start: %lx; io end: %lx\n",
+		       hose->global_number,
+		       hose->io_resource.start, hose->io_resource.end);
+	}
+}
+
+void __init pmac_pci_init(void)
+{
+	struct device_node *np, *root;
+	struct device_node *ht = NULL;
+
+	/* Probe root PCI hosts, that is on U3 the AGP host and the
+	 * HyperTransport host. That one is actually "kept" around
+	 * and actually added last as it's resource management relies
+	 * on the AGP resources to have been setup first
+	 */
+	root = of_find_node_by_path("/");
+	if (root == NULL) {
+		printk(KERN_CRIT "pmac_find_bridges: can't find root of device tree\n");
+		return;
+	}
+	for (np = NULL; (np = of_get_next_child(root, np)) != NULL;) {
+		if (np->name == NULL)
+			continue;
+		if (strcmp(np->name, "pci") == 0) {
+			if (add_bridge(np) == 0)
+				of_node_get(np);
+		}
+		if (strcmp(np->name, "ht") == 0) {
+			of_node_get(np);
+			ht = np;
+		}
+	}
+	of_node_put(root);
+
+	/* Now setup the HyperTransport host if we found any
+	 */
+	if (ht && add_bridge(ht) != 0)
+		of_node_put(ht);
+
+	/* Fixup the IO resources on our host bridges as the common code
+	 * does it only for childs of the host bridges
+	 */
+	pmac_fixup_phb_resources();
+
+	/* Setup the linkage between OF nodes and PHBs */ 
+	pci_devs_phb_init();
+
+	/* Fixup the PCI<->OF mapping for U3 AGP due to bus renumbering. We
+	 * assume there is no P2P bridge on the AGP bus, which should be a
+	 * safe assumptions hopefully.
+	 */
+	if (u3_agp) {
+		struct device_node *np = u3_agp->arch_data;
+		np->busno = 0xf0;
+		for (np = np->child; np; np = np->sibling)
+			np->busno = 0xf0;
+	}
+
+	pmac_check_ht_link();
+
+	/* Tell pci.c to not use the common resource allocation mecanism */
+	pci_probe_only = 1;
+	
+	/* Allow all IO */
+	io_page_mask = -1;
+}
+
+/*
+ * Disable second function on K2-SATA, it's broken
+ * and disable IO BARs on first one
+ */
+static void fixup_k2_sata(struct pci_dev* dev)
+{
+	int i;
+	u16 cmd;
+
+	if (PCI_FUNC(dev->devfn) > 0) {
+		pci_read_config_word(dev, PCI_COMMAND, &cmd);
+		cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+		for (i = 0; i < 6; i++) {
+			dev->resource[i].start = dev->resource[i].end = 0;
+			dev->resource[i].flags = 0;
+			pci_write_config_dword(dev, PCI_BASE_ADDRESS_0 + 4 * i, 0);
+		}
+	} else {
+		pci_read_config_word(dev, PCI_COMMAND, &cmd);
+		cmd &= ~PCI_COMMAND_IO;
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+		for (i = 0; i < 5; i++) {
+			dev->resource[i].start = dev->resource[i].end = 0;
+			dev->resource[i].flags = 0;
+			pci_write_config_dword(dev, PCI_BASE_ADDRESS_0 + 4 * i, 0);
+		}
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SERVERWORKS, 0x0240, fixup_k2_sata);
+
diff --git a/arch/ppc64/kernel/pmac_setup.c b/arch/ppc64/kernel/pmac_setup.c
new file mode 100644
index 00000000000..6cf03d387b9
--- /dev/null
+++ b/arch/ppc64/kernel/pmac_setup.c
@@ -0,0 +1,511 @@
+/*
+ *  arch/ppc/platforms/setup.c
+ *
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Adapted for Power Macintosh by Paul Mackerras
+ *    Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au)
+ *
+ *  Derived from "arch/alpha/kernel/setup.c"
+ *    Copyright (C) 1995 Linus Torvalds
+ *
+ *  Maintained by Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+/*
+ * bootup setup stuff..
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/tty.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/major.h>
+#include <linux/initrd.h>
+#include <linux/vt_kern.h>
+#include <linux/console.h>
+#include <linux/ide.h>
+#include <linux/pci.h>
+#include <linux/adb.h>
+#include <linux/cuda.h>
+#include <linux/pmu.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+#include <linux/bitops.h>
+
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/prom.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/dma.h>
+#include <asm/btext.h>
+#include <asm/cputable.h>
+#include <asm/pmac_feature.h>
+#include <asm/time.h>
+#include <asm/of_device.h>
+#include <asm/lmb.h>
+#include <asm/smu.h>
+
+#include "pmac.h"
+#include "mpic.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+static int current_root_goodness = -1;
+#define DEFAULT_ROOT_DEVICE Root_SDA1	/* sda1 - slightly silly choice */
+
+extern  int powersave_nap;
+int sccdbg;
+
+sys_ctrler_t sys_ctrler;
+EXPORT_SYMBOL(sys_ctrler);
+
+#ifdef CONFIG_PMAC_SMU
+unsigned long smu_cmdbuf_abs;
+EXPORT_SYMBOL(smu_cmdbuf_abs);
+#endif
+
+extern void udbg_init_scc(struct device_node *np);
+
+void __pmac pmac_show_cpuinfo(struct seq_file *m)
+{
+	struct device_node *np;
+	char *pp;
+	int plen;
+	char* mbname;
+	int mbmodel = pmac_call_feature(PMAC_FTR_GET_MB_INFO, NULL,
+					PMAC_MB_INFO_MODEL, 0);
+	unsigned int mbflags = pmac_call_feature(PMAC_FTR_GET_MB_INFO, NULL,
+						 PMAC_MB_INFO_FLAGS, 0);
+
+	if (pmac_call_feature(PMAC_FTR_GET_MB_INFO, NULL, PMAC_MB_INFO_NAME,
+			      (long)&mbname) != 0)
+		mbname = "Unknown";
+	
+	/* find motherboard type */
+	seq_printf(m, "machine\t\t: ");
+	np = find_devices("device-tree");
+	if (np != NULL) {
+		pp = (char *) get_property(np, "model", NULL);
+		if (pp != NULL)
+			seq_printf(m, "%s\n", pp);
+		else
+			seq_printf(m, "PowerMac\n");
+		pp = (char *) get_property(np, "compatible", &plen);
+		if (pp != NULL) {
+			seq_printf(m, "motherboard\t:");
+			while (plen > 0) {
+				int l = strlen(pp) + 1;
+				seq_printf(m, " %s", pp);
+				plen -= l;
+				pp += l;
+			}
+			seq_printf(m, "\n");
+		}
+	} else
+		seq_printf(m, "PowerMac\n");
+
+	/* print parsed model */
+	seq_printf(m, "detected as\t: %d (%s)\n", mbmodel, mbname);
+	seq_printf(m, "pmac flags\t: %08x\n", mbflags);
+
+	/* Indicate newworld */
+	seq_printf(m, "pmac-generation\t: NewWorld\n");
+}
+
+
+void __init pmac_setup_arch(void)
+{
+	/* init to some ~sane value until calibrate_delay() runs */
+	loops_per_jiffy = 50000000;
+
+	/* Probe motherboard chipset */
+	pmac_feature_init();
+#if 0
+	/* Lock-enable the SCC channel used for debug */
+	if (sccdbg) {
+		np = of_find_node_by_name(NULL, "escc");
+		if (np)
+			pmac_call_feature(PMAC_FTR_SCC_ENABLE, np,
+					  PMAC_SCC_ASYNC | PMAC_SCC_FLAG_XMON, 1);
+	}
+#endif
+	/* We can NAP */
+	powersave_nap = 1;
+
+#ifdef CONFIG_ADB_PMU
+	/* Initialize the PMU if any */
+	find_via_pmu();
+#endif
+#ifdef CONFIG_PMAC_SMU
+	/* Initialize the SMU if any */
+	smu_init();
+#endif
+
+	/* Init NVRAM access */
+	pmac_nvram_init();
+
+	/* Setup SMP callback */
+#ifdef CONFIG_SMP
+	pmac_setup_smp();
+#endif
+
+	/* Lookup PCI hosts */
+       	pmac_pci_init();
+
+#ifdef CONFIG_DUMMY_CONSOLE
+	conswitchp = &dummy_con;
+#endif
+}
+
+#ifdef CONFIG_SCSI
+void note_scsi_host(struct device_node *node, void *host)
+{
+	/* Obsolete */
+}
+#endif
+
+
+static int initializing = 1;
+
+static int pmac_late_init(void)
+{
+	initializing = 0;
+	return 0;
+}
+
+late_initcall(pmac_late_init);
+
+/* can't be __init - can be called whenever a disk is first accessed */
+void __pmac note_bootable_part(dev_t dev, int part, int goodness)
+{
+	extern dev_t boot_dev;
+	char *p;
+
+	if (!initializing)
+		return;
+	if ((goodness <= current_root_goodness) &&
+	    ROOT_DEV != DEFAULT_ROOT_DEVICE)
+		return;
+	p = strstr(saved_command_line, "root=");
+	if (p != NULL && (p == saved_command_line || p[-1] == ' '))
+		return;
+
+	if (!boot_dev || dev == boot_dev) {
+		ROOT_DEV = dev + part;
+		boot_dev = 0;
+		current_root_goodness = goodness;
+	}
+}
+
+void __pmac pmac_restart(char *cmd)
+{
+	switch(sys_ctrler) {
+#ifdef CONFIG_ADB_PMU
+	case SYS_CTRLER_PMU:
+		pmu_restart();
+		break;
+#endif
+
+#ifdef CONFIG_PMAC_SMU
+	case SYS_CTRLER_SMU:
+		smu_restart();
+		break;
+#endif
+	default:
+		;
+	}
+}
+
+void __pmac pmac_power_off(void)
+{
+	switch(sys_ctrler) {
+#ifdef CONFIG_ADB_PMU
+	case SYS_CTRLER_PMU:
+		pmu_shutdown();
+		break;
+#endif
+#ifdef CONFIG_PMAC_SMU
+	case SYS_CTRLER_SMU:
+		smu_shutdown();
+		break;
+#endif
+	default:
+		;
+	}
+}
+
+void __pmac pmac_halt(void)
+{
+	pmac_power_off();
+}
+
+#ifdef CONFIG_BOOTX_TEXT
+static int dummy_getc_poll(void)
+{
+	return -1;
+}
+
+static unsigned char dummy_getc(void)
+{
+	return 0;
+}
+
+static void btext_putc(unsigned char c)
+{
+	btext_drawchar(c);
+}
+
+static void __init init_boot_display(void)
+{
+	char *name;
+	struct device_node *np = NULL; 
+	int rc = -ENODEV;
+
+	printk("trying to initialize btext ...\n");
+
+	name = (char *)get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name != NULL) {
+		np = of_find_node_by_path(name);
+		if (np != NULL) {
+			if (strcmp(np->type, "display") != 0) {
+				printk("boot stdout isn't a display !\n");
+				of_node_put(np);
+				np = NULL;
+			}
+		}
+	}
+	if (np)
+		rc = btext_initialize(np);
+	if (rc == 0)
+		return;
+
+	for (np = NULL; (np = of_find_node_by_type(np, "display"));) {
+		if (get_property(np, "linux,opened", NULL)) {
+			printk("trying %s ...\n", np->full_name);
+			rc = btext_initialize(np);
+			printk("result: %d\n", rc);
+		}
+		if (rc == 0)
+			return;
+	}
+}
+#endif /* CONFIG_BOOTX_TEXT */
+
+/* 
+ * Early initialization.
+ */
+void __init pmac_init_early(void)
+{
+	DBG(" -> pmac_init_early\n");
+
+	/* Initialize hash table, from now on, we can take hash faults
+	 * and call ioremap
+	 */
+	hpte_init_native();
+
+	/* Init SCC */
+       	if (strstr(cmd_line, "sccdbg")) {
+		sccdbg = 1;
+       		udbg_init_scc(NULL);
+       	}
+
+	else {
+#ifdef CONFIG_BOOTX_TEXT
+		init_boot_display();
+
+		ppc_md.udbg_putc = btext_putc;
+		ppc_md.udbg_getc = dummy_getc;
+		ppc_md.udbg_getc_poll = dummy_getc_poll;
+#endif /* CONFIG_BOOTX_TEXT */
+	}
+
+	/* Setup interrupt mapping options */
+	ppc64_interrupt_controller = IC_OPEN_PIC;
+
+	iommu_init_early_u3();
+
+	DBG(" <- pmac_init_early\n");
+}
+
+static int pmac_u3_cascade(struct pt_regs *regs, void *data)
+{
+	return mpic_get_one_irq((struct mpic *)data, regs);
+}
+
+static __init void pmac_init_IRQ(void)
+{
+        struct device_node *irqctrler  = NULL;
+        struct device_node *irqctrler2 = NULL;
+	struct device_node *np = NULL;
+	struct mpic *mpic1, *mpic2;
+
+	/* We first try to detect Apple's new Core99 chipset, since mac-io
+	 * is quite different on those machines and contains an IBM MPIC2.
+	 */
+	while ((np = of_find_node_by_type(np, "open-pic")) != NULL) {
+		struct device_node *parent = of_get_parent(np);
+		if (parent && !strcmp(parent->name, "u3"))
+			irqctrler2 = of_node_get(np);
+		else
+			irqctrler = of_node_get(np);
+		of_node_put(parent);
+	}
+	if (irqctrler != NULL && irqctrler->n_addrs > 0) {
+		unsigned char senses[128];
+
+		printk(KERN_INFO "PowerMac using OpenPIC irq controller at 0x%08x\n",
+		       (unsigned int)irqctrler->addrs[0].address);
+
+		prom_get_irq_senses(senses, 0, 128);
+		mpic1 = mpic_alloc(irqctrler->addrs[0].address,
+				   MPIC_PRIMARY | MPIC_WANTS_RESET,
+				   0, 0, 128, 256, senses, 128, " K2-MPIC  ");
+		BUG_ON(mpic1 == NULL);
+		mpic_init(mpic1);		
+
+		if (irqctrler2 != NULL && irqctrler2->n_intrs > 0 &&
+		    irqctrler2->n_addrs > 0) {
+			printk(KERN_INFO "Slave OpenPIC at 0x%08x hooked on IRQ %d\n",
+			       (u32)irqctrler2->addrs[0].address,
+			       irqctrler2->intrs[0].line);
+
+			pmac_call_feature(PMAC_FTR_ENABLE_MPIC, irqctrler2, 0, 0);
+			prom_get_irq_senses(senses, 128, 128 + 128);
+
+			/* We don't need to set MPIC_BROKEN_U3 here since we don't have
+			 * hypertransport interrupts routed to it
+			 */
+			mpic2 = mpic_alloc(irqctrler2->addrs[0].address,
+					   MPIC_BIG_ENDIAN | MPIC_WANTS_RESET,
+					   0, 128, 128, 0, senses, 128, " U3-MPIC  ");
+			BUG_ON(mpic2 == NULL);
+			mpic_init(mpic2);
+			mpic_setup_cascade(irqctrler2->intrs[0].line,
+					   pmac_u3_cascade, mpic2);
+		}
+	}
+	of_node_put(irqctrler);
+	of_node_put(irqctrler2);
+}
+
+static void __init pmac_progress(char *s, unsigned short hex)
+{
+	if (sccdbg) {
+		udbg_puts(s);
+		udbg_puts("\n");
+	}
+#ifdef CONFIG_BOOTX_TEXT
+	else if (boot_text_mapped) {
+		btext_drawstring(s);
+		btext_drawstring("\n");
+	}
+#endif /* CONFIG_BOOTX_TEXT */
+}
+
+/*
+ * pmac has no legacy IO, anything calling this function has to
+ * fail or bad things will happen
+ */
+static int pmac_check_legacy_ioport(unsigned int baseport)
+{
+	return -ENODEV;
+}
+
+static int __init pmac_declare_of_platform_devices(void)
+{
+	struct device_node *np;
+
+	np = find_devices("u3");
+	if (np) {
+		for (np = np->child; np != NULL; np = np->sibling)
+			if (strncmp(np->name, "i2c", 3) == 0) {
+				of_platform_device_create(np, "u3-i2c");
+				break;
+			}
+	}
+
+	return 0;
+}
+
+device_initcall(pmac_declare_of_platform_devices);
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+static int __init pmac_probe(int platform)
+{
+	if (platform != PLATFORM_POWERMAC)
+		return 0;
+	/*
+	 * On U3, the DART (iommu) must be allocated now since it
+	 * has an impact on htab_initialize (due to the large page it
+	 * occupies having to be broken up so the DART itself is not
+	 * part of the cacheable linar mapping
+	 */
+	alloc_u3_dart_table();
+
+#ifdef CONFIG_PMAC_SMU
+	/*
+	 * SMU based G5s need some memory below 2Gb, at least the current
+	 * driver needs that. We have to allocate it now. We allocate 4k
+	 * (1 small page) for now.
+	 */
+	smu_cmdbuf_abs = lmb_alloc_base(4096, 4096, 0x80000000UL);
+#endif /* CONFIG_PMAC_SMU */
+
+	return 1;
+}
+
+struct machdep_calls __initdata pmac_md = {
+#ifdef CONFIG_HOTPLUG_CPU
+	.cpu_die		= generic_mach_cpu_die,
+#endif
+	.probe			= pmac_probe,
+	.setup_arch		= pmac_setup_arch,
+	.init_early		= pmac_init_early,
+       	.get_cpuinfo		= pmac_show_cpuinfo,
+	.init_IRQ		= pmac_init_IRQ,
+	.get_irq		= mpic_get_irq,
+	.pcibios_fixup		= pmac_pcibios_fixup,
+	.restart		= pmac_restart,
+	.power_off		= pmac_power_off,
+	.halt			= pmac_halt,
+       	.get_boot_time		= pmac_get_boot_time,
+       	.set_rtc_time		= pmac_set_rtc_time,
+       	.get_rtc_time		= pmac_get_rtc_time,
+      	.calibrate_decr		= pmac_calibrate_decr,
+	.feature_call		= pmac_do_feature_call,
+	.progress		= pmac_progress,
+	.check_legacy_ioport	= pmac_check_legacy_ioport
+};
diff --git a/arch/ppc64/kernel/pmac_smp.c b/arch/ppc64/kernel/pmac_smp.c
new file mode 100644
index 00000000000..c27588ede2f
--- /dev/null
+++ b/arch/ppc64/kernel/pmac_smp.c
@@ -0,0 +1,316 @@
+/*
+ * SMP support for power macintosh.
+ *
+ * We support both the old "powersurge" SMP architecture
+ * and the current Core99 (G4 PowerMac) machines.
+ *
+ * Note that we don't support the very first rev. of
+ * Apple/DayStar 2 CPUs board, the one with the funky
+ * watchdog. Hopefully, none of these should be there except
+ * maybe internally to Apple. I should probably still add some
+ * code to detect this card though and disable SMP. --BenH.
+ *
+ * Support Macintosh G4 SMP by Troy Benjegerdes (hozer@drgw.net)
+ * and Ben Herrenschmidt <benh@kernel.crashing.org>.
+ *
+ * Support for DayStar quad CPU cards
+ * Copyright (C) XLR8, Inc. 1994-2000
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/irq.h>
+
+#include <asm/ptrace.h>
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+#include <asm/time.h>
+#include <asm/cacheflush.h>
+#include <asm/keylargo.h>
+#include <asm/pmac_low_i2c.h>
+
+#include "mpic.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+extern void pmac_secondary_start_1(void);
+extern void pmac_secondary_start_2(void);
+extern void pmac_secondary_start_3(void);
+
+extern struct smp_ops_t *smp_ops;
+
+static void (*pmac_tb_freeze)(int freeze);
+static struct device_node *pmac_tb_clock_chip_host;
+static DEFINE_SPINLOCK(timebase_lock);
+static unsigned long timebase;
+
+static void smp_core99_cypress_tb_freeze(int freeze)
+{
+	u8 data;
+	int rc;
+
+	/* Strangely, the device-tree says address is 0xd2, but darwin
+	 * accesses 0xd0 ...
+	 */
+	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_combined);
+	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
+			       0xd0 | pmac_low_i2c_read,
+			       0x81, &data, 1);
+	if (rc != 0)
+		goto bail;
+
+	data = (data & 0xf3) | (freeze ? 0x00 : 0x0c);
+
+       	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_stdsub);
+	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
+			       0xd0 | pmac_low_i2c_write,
+			       0x81, &data, 1);
+
+ bail:
+	if (rc != 0) {
+		printk("Cypress Timebase %s rc: %d\n",
+		       freeze ? "freeze" : "unfreeze", rc);
+		panic("Timebase freeze failed !\n");
+	}
+}
+
+static void smp_core99_pulsar_tb_freeze(int freeze)
+{
+	u8 data;
+	int rc;
+
+	/* Strangely, the device-tree says address is 0xd2, but darwin
+	 * accesses 0xd0 ...
+	 */
+	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_combined);
+	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
+			       0xd4 | pmac_low_i2c_read,
+			       0x2e, &data, 1);
+	if (rc != 0)
+		goto bail;
+
+	data = (data & 0x88) | (freeze ? 0x11 : 0x22);
+
+	pmac_low_i2c_setmode(pmac_tb_clock_chip_host, pmac_low_i2c_mode_stdsub);
+	rc = pmac_low_i2c_xfer(pmac_tb_clock_chip_host,
+			       0xd4 | pmac_low_i2c_write,
+			       0x2e, &data, 1);
+ bail:
+	if (rc != 0) {
+		printk(KERN_ERR "Pulsar Timebase %s rc: %d\n",
+		       freeze ? "freeze" : "unfreeze", rc);
+		panic("Timebase freeze failed !\n");
+	}
+}
+
+
+static void smp_core99_give_timebase(void)
+{
+	/* Open i2c bus for synchronous access */
+	if (pmac_low_i2c_open(pmac_tb_clock_chip_host, 0))
+		panic("Can't open i2c for TB sync !\n");
+
+	spin_lock(&timebase_lock);
+	(*pmac_tb_freeze)(1);
+	mb();
+	timebase = get_tb();
+	spin_unlock(&timebase_lock);
+
+	while (timebase)
+		barrier();
+
+	spin_lock(&timebase_lock);
+	(*pmac_tb_freeze)(0);
+	spin_unlock(&timebase_lock);
+
+	/* Close i2c bus */
+	pmac_low_i2c_close(pmac_tb_clock_chip_host);
+}
+
+
+static void __devinit smp_core99_take_timebase(void)
+{
+	while (!timebase)
+		barrier();
+	spin_lock(&timebase_lock);
+	set_tb(timebase >> 32, timebase & 0xffffffff);
+	timebase = 0;
+	spin_unlock(&timebase_lock);
+}
+
+
+static int __init smp_core99_probe(void)
+{
+	struct device_node *cpus;	
+	struct device_node *cc;	
+	int ncpus = 0;
+
+	/* Maybe use systemconfiguration here ? */
+	if (ppc_md.progress) ppc_md.progress("smp_core99_probe", 0x345);
+
+	/* Count CPUs in the device-tree */
+       	for (cpus = NULL; (cpus = of_find_node_by_type(cpus, "cpu")) != NULL;)
+	       	++ncpus;
+
+	printk(KERN_INFO "PowerMac SMP probe found %d cpus\n", ncpus);
+
+	/* Nothing more to do if less than 2 of them */
+	if (ncpus <= 1)
+		return 1;
+
+	/* Look for the clock chip */
+	for (cc = NULL; (cc = of_find_node_by_name(cc, "i2c-hwclock")) != NULL;) {
+		struct device_node *p = of_get_parent(cc);
+		u32 *reg;
+		int ok;
+		ok = p && device_is_compatible(p, "uni-n-i2c");
+		if (!ok)
+			goto next;
+		reg = (u32 *)get_property(cc, "reg", NULL);
+		if (reg == NULL)
+			goto next;
+		switch (*reg) {
+		case 0xd2:
+			pmac_tb_freeze = smp_core99_cypress_tb_freeze;
+			printk(KERN_INFO "Timebase clock is Cypress chip\n");
+			break;
+		case 0xd4:
+			pmac_tb_freeze = smp_core99_pulsar_tb_freeze;
+			printk(KERN_INFO "Timebase clock is Pulsar chip\n");
+			break;
+		}
+		if (pmac_tb_freeze != NULL) {
+			pmac_tb_clock_chip_host = p;
+			smp_ops->give_timebase = smp_core99_give_timebase;
+			smp_ops->take_timebase = smp_core99_take_timebase;
+			break;
+		}
+	next:
+		of_node_put(p);
+	}
+
+	mpic_request_ipis();
+
+	return ncpus;
+}
+
+static void __init smp_core99_kick_cpu(int nr)
+{
+	int save_vector, j;
+	unsigned long new_vector;
+	unsigned long flags;
+	volatile unsigned int *vector
+		 = ((volatile unsigned int *)(KERNELBASE+0x100));
+
+	if (nr < 1 || nr > 3)
+		return;
+	if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu", 0x346);
+
+	local_irq_save(flags);
+	local_irq_disable();
+
+	/* Save reset vector */
+	save_vector = *vector;
+
+	/* Setup fake reset vector that does	
+	 *   b .pmac_secondary_start - KERNELBASE
+	 */
+	switch(nr) {
+	case 1:
+		new_vector = (unsigned long)pmac_secondary_start_1;
+		break;
+	case 2:
+		new_vector = (unsigned long)pmac_secondary_start_2;
+		break;			
+	case 3:
+	default:
+		new_vector = (unsigned long)pmac_secondary_start_3;
+		break;
+	}
+	*vector = 0x48000002 + (new_vector - KERNELBASE);
+
+	/* flush data cache and inval instruction cache */
+	flush_icache_range((unsigned long) vector, (unsigned long) vector + 4);
+
+	/* Put some life in our friend */
+	pmac_call_feature(PMAC_FTR_RESET_CPU, NULL, nr, 0);
+	paca[nr].cpu_start = 1;
+
+	/* FIXME: We wait a bit for the CPU to take the exception, I should
+	 * instead wait for the entry code to set something for me. Well,
+	 * ideally, all that crap will be done in prom.c and the CPU left
+	 * in a RAM-based wait loop like CHRP.
+	 */
+	for (j = 1; j < 1000000; j++)
+		mb();
+
+	/* Restore our exception vector */
+	*vector = save_vector;
+	flush_icache_range((unsigned long) vector, (unsigned long) vector + 4);
+
+	local_irq_restore(flags);
+	if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu done", 0x347);
+}
+
+static void __init smp_core99_setup_cpu(int cpu_nr)
+{
+	/* Setup MPIC */
+	mpic_setup_this_cpu();
+
+	if (cpu_nr == 0) {
+		extern void g5_phy_disable_cpu1(void);
+
+		/* If we didn't start the second CPU, we must take
+		 * it off the bus
+		 */
+		if (num_online_cpus() < 2)		
+			g5_phy_disable_cpu1();
+		if (ppc_md.progress) ppc_md.progress("smp_core99_setup_cpu 0 done", 0x349);
+	}
+}
+
+struct smp_ops_t core99_smp_ops __pmacdata = {
+	.message_pass	= smp_mpic_message_pass,
+	.probe		= smp_core99_probe,
+	.kick_cpu	= smp_core99_kick_cpu,
+	.setup_cpu	= smp_core99_setup_cpu,
+	.give_timebase	= smp_generic_give_timebase,
+	.take_timebase	= smp_generic_take_timebase,
+};
+
+void __init pmac_setup_smp(void)
+{
+	smp_ops = &core99_smp_ops;
+#ifdef CONFIG_HOTPLUG_CPU
+	smp_ops->cpu_enable = generic_cpu_enable;
+	smp_ops->cpu_disable = generic_cpu_disable;
+	smp_ops->cpu_die = generic_cpu_die;
+#endif
+}
diff --git a/arch/ppc64/kernel/pmac_time.c b/arch/ppc64/kernel/pmac_time.c
new file mode 100644
index 00000000000..f24827581dd
--- /dev/null
+++ b/arch/ppc64/kernel/pmac_time.c
@@ -0,0 +1,201 @@
+/*
+ * Support for periodic interrupts (100 per second) and for getting
+ * the current time from the RTC on Power Macintoshes.
+ *
+ * We use the decrementer register for our periodic interrupts.
+ *
+ * Paul Mackerras	August 1996.
+ * Copyright (C) 1996 Paul Mackerras.
+ * Copyright (C) 2003-2005 Benjamin Herrenschmidt.
+ *
+ */
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/adb.h>
+#include <linux/pmu.h>
+#include <linux/interrupt.h>
+
+#include <asm/sections.h>
+#include <asm/prom.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/time.h>
+#include <asm/nvram.h>
+#include <asm/smu.h>
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(x...) printk(x)
+#else
+#define DBG(x...)
+#endif
+
+extern void setup_default_decr(void);
+
+extern unsigned long ppc_tb_freq;
+extern unsigned long ppc_proc_freq;
+
+/* Apparently the RTC stores seconds since 1 Jan 1904 */
+#define RTC_OFFSET	2082844800
+
+/*
+ * Calibrate the decrementer frequency with the VIA timer 1.
+ */
+#define VIA_TIMER_FREQ_6	4700000	/* time 1 frequency * 6 */
+
+extern struct timezone sys_tz;
+extern void to_tm(int tim, struct rtc_time * tm);
+
+void __pmac pmac_get_rtc_time(struct rtc_time *tm)
+{
+	switch(sys_ctrler) {
+#ifdef CONFIG_ADB_PMU
+	case SYS_CTRLER_PMU: {
+		/* TODO: Move that to a function in the PMU driver */
+		struct adb_request req;
+		unsigned int now;
+
+		if (pmu_request(&req, NULL, 1, PMU_READ_RTC) < 0)
+			return;
+		pmu_wait_complete(&req);
+		if (req.reply_len != 4)
+			printk(KERN_ERR "pmac_get_rtc_time: PMU returned a %d"
+			       " bytes reply\n", req.reply_len);
+		now = (req.reply[0] << 24) + (req.reply[1] << 16)
+			+ (req.reply[2] << 8) + req.reply[3];
+		DBG("get: %u -> %u\n", (int)now, (int)(now - RTC_OFFSET));
+		now -= RTC_OFFSET;
+
+		to_tm(now, tm);
+		tm->tm_year -= 1900;
+		tm->tm_mon -= 1;
+	
+		DBG("-> tm_mday: %d, tm_mon: %d, tm_year: %d, %d:%02d:%02d\n",
+		    tm->tm_mday, tm->tm_mon, tm->tm_year,
+		    tm->tm_hour, tm->tm_min, tm->tm_sec);
+		break;
+	}
+#endif /* CONFIG_ADB_PMU */
+
+#ifdef CONFIG_PMAC_SMU
+	case SYS_CTRLER_SMU:
+		smu_get_rtc_time(tm);
+		break;
+#endif /* CONFIG_PMAC_SMU */
+	default:
+		;
+	}
+}
+
+int __pmac pmac_set_rtc_time(struct rtc_time *tm)
+{
+	switch(sys_ctrler) {
+#ifdef CONFIG_ADB_PMU
+	case SYS_CTRLER_PMU: {
+		/* TODO: Move that to a function in the PMU driver */
+		struct adb_request req;
+		unsigned int nowtime;
+
+		DBG("set: tm_mday: %d, tm_mon: %d, tm_year: %d,"
+		    " %d:%02d:%02d\n",
+		    tm->tm_mday, tm->tm_mon, tm->tm_year,
+		    tm->tm_hour, tm->tm_min, tm->tm_sec);
+
+		nowtime = mktime(tm->tm_year + 1900, tm->tm_mon + 1,
+				 tm->tm_mday, tm->tm_hour, tm->tm_min,
+				 tm->tm_sec);
+
+		DBG("-> %u -> %u\n", (int)nowtime,
+		    (int)(nowtime + RTC_OFFSET));
+		nowtime += RTC_OFFSET;
+
+		if (pmu_request(&req, NULL, 5, PMU_SET_RTC,
+				nowtime >> 24, nowtime >> 16,
+				nowtime >> 8, nowtime) < 0)
+			return -ENXIO;
+		pmu_wait_complete(&req);
+		if (req.reply_len != 0)
+			printk(KERN_ERR "pmac_set_rtc_time: PMU returned a %d"
+			       " bytes reply\n", req.reply_len);
+		return 0;
+	}
+#endif /* CONFIG_ADB_PMU */
+
+#ifdef CONFIG_PMAC_SMU
+	case SYS_CTRLER_SMU:
+		return smu_set_rtc_time(tm);
+#endif /* CONFIG_PMAC_SMU */
+	default:
+		return -ENODEV;
+	}
+}
+
+void __init pmac_get_boot_time(struct rtc_time *tm)
+{
+	pmac_get_rtc_time(tm);
+
+#ifdef disabled__CONFIG_NVRAM
+	s32 delta = 0;
+	int dst;
+	
+	delta = ((s32)pmac_xpram_read(PMAC_XPRAM_MACHINE_LOC + 0x9)) << 16;
+	delta |= ((s32)pmac_xpram_read(PMAC_XPRAM_MACHINE_LOC + 0xa)) << 8;
+	delta |= pmac_xpram_read(PMAC_XPRAM_MACHINE_LOC + 0xb);
+	if (delta & 0x00800000UL)
+		delta |= 0xFF000000UL;
+	dst = ((pmac_xpram_read(PMAC_XPRAM_MACHINE_LOC + 0x8) & 0x80) != 0);
+	printk("GMT Delta read from XPRAM: %d minutes, DST: %s\n", delta/60,
+		dst ? "on" : "off");
+#endif
+}
+
+/*
+ * Query the OF and get the decr frequency.
+ * This was taken from the pmac time_init() when merging the prep/pmac
+ * time functions.
+ */
+void __init pmac_calibrate_decr(void)
+{
+	struct device_node *cpu;
+	unsigned int freq, *fp;
+	struct div_result divres;
+
+	/*
+	 * The cpu node should have a timebase-frequency property
+	 * to tell us the rate at which the decrementer counts.
+	 */
+	cpu = find_type_devices("cpu");
+	if (cpu == 0)
+		panic("can't find cpu node in time_init");
+	fp = (unsigned int *) get_property(cpu, "timebase-frequency", NULL);
+	if (fp == 0)
+		panic("can't get cpu timebase frequency");
+	freq = *fp;
+	printk("time_init: decrementer frequency = %u.%.6u MHz\n",
+	       freq/1000000, freq%1000000);
+	tb_ticks_per_jiffy = freq / HZ;
+	tb_ticks_per_sec = tb_ticks_per_jiffy * HZ;
+	tb_ticks_per_usec = freq / 1000000;
+	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	div128_by_32( 1024*1024, 0, tb_ticks_per_sec, &divres );
+	tb_to_xs = divres.result_low;
+	ppc_tb_freq = freq;
+
+	fp = (unsigned int *)get_property(cpu, "clock-frequency", NULL);
+	if (fp == 0)
+		panic("can't get cpu processor frequency");
+	ppc_proc_freq = *fp;
+
+	setup_default_decr();
+}
+
diff --git a/arch/ppc64/kernel/pmc.c b/arch/ppc64/kernel/pmc.c
new file mode 100644
index 00000000000..67be773f9c0
--- /dev/null
+++ b/arch/ppc64/kernel/pmc.c
@@ -0,0 +1,67 @@
+/*
+ *  linux/arch/ppc64/kernel/pmc.c
+ *
+ *  Copyright (C) 2004 David Gibson, IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/processor.h>
+#include <asm/pmc.h>
+
+/* Ensure exceptions are disabled */
+static void dummy_perf(struct pt_regs *regs)
+{
+	unsigned int mmcr0 = mfspr(SPRN_MMCR0);
+
+	mmcr0 &= ~(MMCR0_PMXE|MMCR0_PMAO);
+	mtspr(SPRN_MMCR0, mmcr0);
+}
+
+static spinlock_t pmc_owner_lock = SPIN_LOCK_UNLOCKED;
+static void *pmc_owner_caller; /* mostly for debugging */
+perf_irq_t perf_irq = dummy_perf;
+
+int reserve_pmc_hardware(perf_irq_t new_perf_irq)
+{
+	int err = 0;
+
+	spin_lock(&pmc_owner_lock);
+
+	if (pmc_owner_caller) {
+		printk(KERN_WARNING "reserve_pmc_hardware: "
+		       "PMC hardware busy (reserved by caller %p)\n",
+		       pmc_owner_caller);
+		err = -EBUSY;
+		goto out;
+	}
+
+	pmc_owner_caller = __builtin_return_address(0);
+	perf_irq = new_perf_irq ? : dummy_perf;
+
+ out:
+	spin_unlock(&pmc_owner_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(reserve_pmc_hardware);
+
+void release_pmc_hardware(void)
+{
+	spin_lock(&pmc_owner_lock);
+
+	WARN_ON(! pmc_owner_caller);
+
+	pmc_owner_caller = NULL;
+	perf_irq = dummy_perf;
+
+	spin_unlock(&pmc_owner_lock);
+}
+EXPORT_SYMBOL_GPL(release_pmc_hardware);
diff --git a/arch/ppc64/kernel/ppc_ksyms.c b/arch/ppc64/kernel/ppc_ksyms.c
new file mode 100644
index 00000000000..6ced63a3439
--- /dev/null
+++ b/arch/ppc64/kernel/ppc_ksyms.c
@@ -0,0 +1,95 @@
+/* 
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <net/checksum.h>
+
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/hw_irq.h>
+#include <asm/abs_addr.h>
+#include <asm/cacheflush.h>
+#include <asm/iSeries/HvCallSc.h>
+
+EXPORT_SYMBOL(strcpy);
+EXPORT_SYMBOL(strncpy);
+EXPORT_SYMBOL(strcat);
+EXPORT_SYMBOL(strncat);
+EXPORT_SYMBOL(strchr);
+EXPORT_SYMBOL(strrchr);
+EXPORT_SYMBOL(strpbrk);
+EXPORT_SYMBOL(strstr);
+EXPORT_SYMBOL(strlen);
+EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(strcmp);
+EXPORT_SYMBOL(strncmp);
+
+EXPORT_SYMBOL(csum_partial);
+EXPORT_SYMBOL(csum_partial_copy_generic);
+EXPORT_SYMBOL(ip_fast_csum);
+EXPORT_SYMBOL(csum_tcpudp_magic);
+
+EXPORT_SYMBOL(__copy_tofrom_user);
+EXPORT_SYMBOL(__clear_user);
+EXPORT_SYMBOL(__strncpy_from_user);
+EXPORT_SYMBOL(__strnlen_user);
+
+EXPORT_SYMBOL(reloc_offset);
+
+#ifdef CONFIG_PPC_ISERIES
+EXPORT_SYMBOL(HvCall0);
+EXPORT_SYMBOL(HvCall1);
+EXPORT_SYMBOL(HvCall2);
+EXPORT_SYMBOL(HvCall3);
+EXPORT_SYMBOL(HvCall4);
+EXPORT_SYMBOL(HvCall5);
+EXPORT_SYMBOL(HvCall6);
+EXPORT_SYMBOL(HvCall7);
+#endif
+
+EXPORT_SYMBOL(_insb);
+EXPORT_SYMBOL(_outsb);
+EXPORT_SYMBOL(_insw);
+EXPORT_SYMBOL(_outsw);
+EXPORT_SYMBOL(_insl);
+EXPORT_SYMBOL(_outsl);
+EXPORT_SYMBOL(_insw_ns);
+EXPORT_SYMBOL(_outsw_ns);
+EXPORT_SYMBOL(_insl_ns);
+EXPORT_SYMBOL(_outsl_ns);
+
+EXPORT_SYMBOL(kernel_thread);
+
+EXPORT_SYMBOL(giveup_fpu);
+#ifdef CONFIG_ALTIVEC
+EXPORT_SYMBOL(giveup_altivec);
+#endif
+EXPORT_SYMBOL(flush_icache_range);
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_PPC_ISERIES
+EXPORT_SYMBOL(local_get_flags);
+EXPORT_SYMBOL(local_irq_disable);
+EXPORT_SYMBOL(local_irq_restore);
+#endif
+#endif
+
+EXPORT_SYMBOL(memcpy);
+EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(memmove);
+EXPORT_SYMBOL(memscan);
+EXPORT_SYMBOL(memcmp);
+EXPORT_SYMBOL(memchr);
+
+EXPORT_SYMBOL(timer_interrupt);
+EXPORT_SYMBOL(console_drivers);
diff --git a/arch/ppc64/kernel/proc_ppc64.c b/arch/ppc64/kernel/proc_ppc64.c
new file mode 100644
index 00000000000..0914b0669b0
--- /dev/null
+++ b/arch/ppc64/kernel/proc_ppc64.c
@@ -0,0 +1,128 @@
+/*
+ * arch/ppc64/kernel/proc_ppc64.c
+ *
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+
+#include <asm/systemcfg.h>
+#include <asm/rtas.h>
+#include <asm/uaccess.h>
+#include <asm/prom.h>
+
+static loff_t  page_map_seek( struct file *file, loff_t off, int whence);
+static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
+			      loff_t *ppos);
+static int     page_map_mmap( struct file *file, struct vm_area_struct *vma );
+
+static struct file_operations page_map_fops = {
+	.llseek	= page_map_seek,
+	.read	= page_map_read,
+	.mmap	= page_map_mmap
+};
+
+/*
+ * Create the ppc64 and ppc64/rtas directories early. This allows us to
+ * assume that they have been previously created in drivers.
+ */
+static int __init proc_ppc64_create(void)
+{
+	struct proc_dir_entry *root;
+
+	root = proc_mkdir("ppc64", NULL);
+	if (!root)
+		return 1;
+
+	if (!(systemcfg->platform & PLATFORM_PSERIES))
+		return 0;
+
+	if (!proc_mkdir("rtas", root))
+		return 1;
+
+	if (!proc_symlink("rtas", NULL, "ppc64/rtas"))
+		return 1;
+
+	return 0;
+}
+core_initcall(proc_ppc64_create);
+
+static int __init proc_ppc64_init(void)
+{
+	struct proc_dir_entry *pde;
+
+	pde = create_proc_entry("ppc64/systemcfg", S_IFREG|S_IRUGO, NULL);
+	if (!pde)
+		return 1;
+	pde->nlink = 1;
+	pde->data = systemcfg;
+	pde->size = PAGE_SIZE;
+	pde->proc_fops = &page_map_fops;
+
+	return 0;
+}
+__initcall(proc_ppc64_init);
+
+static loff_t page_map_seek( struct file *file, loff_t off, int whence)
+{
+	loff_t new;
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+
+	switch(whence) {
+	case 0:
+		new = off;
+		break;
+	case 1:
+		new = file->f_pos + off;
+		break;
+	case 2:
+		new = dp->size + off;
+		break;
+	default:
+		return -EINVAL;
+	}
+	if ( new < 0 || new > dp->size )
+		return -EINVAL;
+	return (file->f_pos = new);
+}
+
+static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
+			      loff_t *ppos)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size);
+}
+
+static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+
+	vma->vm_flags |= VM_SHM | VM_LOCKED;
+
+	if ((vma->vm_end - vma->vm_start) > dp->size)
+		return -EINVAL;
+
+	remap_pfn_range(vma, vma->vm_start, __pa(dp->data) >> PAGE_SHIFT,
+						dp->size, vma->vm_page_prot);
+	return 0;
+}
+
diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c
new file mode 100644
index 00000000000..8b068612273
--- /dev/null
+++ b/arch/ppc64/kernel/process.c
@@ -0,0 +1,688 @@
+/*
+ *  linux/arch/ppc64/kernel/process.c
+ *
+ *  Derived from "arch/i386/kernel/process.c"
+ *    Copyright (C) 1995  Linus Torvalds
+ *
+ *  Updated and modified by Cort Dougan (cort@cs.nmt.edu) and
+ *  Paul Mackerras (paulus@cs.anu.edu.au)
+ *
+ *  PowerPC version 
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/prctl.h>
+#include <linux/ptrace.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/utsname.h>
+
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/prom.h>
+#include <asm/ppcdebug.h>
+#include <asm/machdep.h>
+#include <asm/iSeries/HvCallHpt.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/tlbflush.h>
+#include <asm/time.h>
+
+#ifndef CONFIG_SMP
+struct task_struct *last_task_used_math = NULL;
+struct task_struct *last_task_used_altivec = NULL;
+#endif
+
+struct mm_struct ioremap_mm = {
+	.pgd		= ioremap_dir,
+	.mm_users	= ATOMIC_INIT(2),
+	.mm_count	= ATOMIC_INIT(1),
+	.cpu_vm_mask	= CPU_MASK_ALL,
+	.page_table_lock = SPIN_LOCK_UNLOCKED,
+};
+
+/*
+ * Make sure the floating-point register state in the
+ * the thread_struct is up to date for task tsk.
+ */
+void flush_fp_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		/*
+		 * We need to disable preemption here because if we didn't,
+		 * another process could get scheduled after the regs->msr
+		 * test but before we have finished saving the FP registers
+		 * to the thread_struct.  That process could take over the
+		 * FPU, and then when we get scheduled again we would store
+		 * bogus values for the remaining FP registers.
+		 */
+		preempt_disable();
+		if (tsk->thread.regs->msr & MSR_FP) {
+#ifdef CONFIG_SMP
+			/*
+			 * This should only ever be called for current or
+			 * for a stopped child process.  Since we save away
+			 * the FP register state on context switch on SMP,
+			 * there is something wrong if a stopped child appears
+			 * to still have its FP state in the CPU registers.
+			 */
+			BUG_ON(tsk != current);
+#endif
+			giveup_fpu(current);
+		}
+		preempt_enable();
+	}
+}
+
+void enable_kernel_fp(void)
+{
+	WARN_ON(preemptible());
+
+#ifdef CONFIG_SMP
+	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
+		giveup_fpu(current);
+	else
+		giveup_fpu(NULL);	/* just enables FP for kernel */
+#else
+	giveup_fpu(last_task_used_math);
+#endif /* CONFIG_SMP */
+}
+EXPORT_SYMBOL(enable_kernel_fp);
+
+int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs)
+{
+	if (!tsk->thread.regs)
+		return 0;
+	flush_fp_to_thread(current);
+
+	memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs));
+
+	return 1;
+}
+
+#ifdef CONFIG_ALTIVEC
+
+void enable_kernel_altivec(void)
+{
+	WARN_ON(preemptible());
+
+#ifdef CONFIG_SMP
+	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
+		giveup_altivec(current);
+	else
+		giveup_altivec(NULL);	/* just enables FP for kernel */
+#else
+	giveup_altivec(last_task_used_altivec);
+#endif /* CONFIG_SMP */
+}
+EXPORT_SYMBOL(enable_kernel_altivec);
+
+/*
+ * Make sure the VMX/Altivec register state in the
+ * the thread_struct is up to date for task tsk.
+ */
+void flush_altivec_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		preempt_disable();
+		if (tsk->thread.regs->msr & MSR_VEC) {
+#ifdef CONFIG_SMP
+			BUG_ON(tsk != current);
+#endif
+			giveup_altivec(current);
+		}
+		preempt_enable();
+	}
+}
+
+int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs)
+{
+	flush_altivec_to_thread(current);
+	memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs));
+	return 1;
+}
+
+#endif /* CONFIG_ALTIVEC */
+
+DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array);
+
+struct task_struct *__switch_to(struct task_struct *prev,
+				struct task_struct *new)
+{
+	struct thread_struct *new_thread, *old_thread;
+	unsigned long flags;
+	struct task_struct *last;
+
+#ifdef CONFIG_SMP
+	/* avoid complexity of lazy save/restore of fpu
+	 * by just saving it every time we switch out if
+	 * this task used the fpu during the last quantum.
+	 * 
+	 * If it tries to use the fpu again, it'll trap and
+	 * reload its fp regs.  So we don't have to do a restore
+	 * every switch, just a save.
+	 *  -- Cort
+	 */
+	if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP))
+		giveup_fpu(prev);
+#ifdef CONFIG_ALTIVEC
+	if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC))
+		giveup_altivec(prev);
+#endif /* CONFIG_ALTIVEC */
+#endif /* CONFIG_SMP */
+
+#if defined(CONFIG_ALTIVEC) && !defined(CONFIG_SMP)
+	/* Avoid the trap.  On smp this this never happens since
+	 * we don't set last_task_used_altivec -- Cort
+	 */
+	if (new->thread.regs && last_task_used_altivec == new)
+		new->thread.regs->msr |= MSR_VEC;
+#endif /* CONFIG_ALTIVEC */
+
+	flush_tlb_pending();
+
+	new_thread = &new->thread;
+	old_thread = &current->thread;
+
+/* Collect purr utilization data per process and per processor wise */
+/* purr is nothing but processor time base                          */
+
+#if defined(CONFIG_PPC_PSERIES)
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+		struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
+		long unsigned start_tb, current_tb;
+		start_tb = old_thread->start_tb;
+		cu->current_tb = current_tb = mfspr(SPRN_PURR);
+		old_thread->accum_tb += (current_tb - start_tb);
+		new_thread->start_tb = current_tb;
+	}
+#endif
+
+
+	local_irq_save(flags);
+	last = _switch(old_thread, new_thread);
+
+	local_irq_restore(flags);
+
+	return last;
+}
+
+static int instructions_to_print = 16;
+
+static void show_instructions(struct pt_regs *regs)
+{
+	int i;
+	unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
+			sizeof(int));
+
+	printk("Instruction dump:");
+
+	for (i = 0; i < instructions_to_print; i++) {
+		int instr;
+
+		if (!(i % 8))
+			printk("\n");
+
+		if (((REGION_ID(pc) != KERNEL_REGION_ID) &&
+		     (REGION_ID(pc) != VMALLOC_REGION_ID)) ||
+		     __get_user(instr, (unsigned int *)pc)) {
+			printk("XXXXXXXX ");
+		} else {
+			if (regs->nip == pc)
+				printk("<%08x> ", instr);
+			else
+				printk("%08x ", instr);
+		}
+
+		pc += sizeof(int);
+	}
+
+	printk("\n");
+}
+
+void show_regs(struct pt_regs * regs)
+{
+	int i;
+	unsigned long trap;
+
+	printk("NIP: %016lX XER: %08X LR: %016lX CTR: %016lX\n",
+	       regs->nip, (unsigned int)regs->xer, regs->link, regs->ctr);
+	printk("REGS: %p TRAP: %04lx   %s  (%s)\n",
+	       regs, regs->trap, print_tainted(), system_utsname.release);
+	printk("MSR: %016lx EE: %01x PR: %01x FP: %01x ME: %01x "
+	       "IR/DR: %01x%01x CR: %08X\n",
+	       regs->msr, regs->msr&MSR_EE ? 1 : 0, regs->msr&MSR_PR ? 1 : 0,
+	       regs->msr & MSR_FP ? 1 : 0,regs->msr&MSR_ME ? 1 : 0,
+	       regs->msr&MSR_IR ? 1 : 0,
+	       regs->msr&MSR_DR ? 1 : 0,
+	       (unsigned int)regs->ccr);
+	trap = TRAP(regs);
+	printk("DAR: %016lx DSISR: %016lx\n", regs->dar, regs->dsisr);
+	printk("TASK: %p[%d] '%s' THREAD: %p",
+	       current, current->pid, current->comm, current->thread_info);
+
+#ifdef CONFIG_SMP
+	printk(" CPU: %d", smp_processor_id());
+#endif /* CONFIG_SMP */
+
+	for (i = 0; i < 32; i++) {
+		if ((i % 4) == 0) {
+			printk("\n" KERN_INFO "GPR%02d: ", i);
+		}
+
+		printk("%016lX ", regs->gpr[i]);
+		if (i == 13 && !FULL_REGS(regs))
+			break;
+	}
+	printk("\n");
+	/*
+	 * Lookup NIP late so we have the best change of getting the
+	 * above info out without failing
+	 */
+	printk("NIP [%016lx] ", regs->nip);
+	print_symbol("%s\n", regs->nip);
+	printk("LR [%016lx] ", regs->link);
+	print_symbol("%s\n", regs->link);
+	show_stack(current, (unsigned long *)regs->gpr[1]);
+	if (!user_mode(regs))
+		show_instructions(regs);
+}
+
+void exit_thread(void)
+{
+#ifndef CONFIG_SMP
+	if (last_task_used_math == current)
+		last_task_used_math = NULL;
+#ifdef CONFIG_ALTIVEC
+	if (last_task_used_altivec == current)
+		last_task_used_altivec = NULL;
+#endif /* CONFIG_ALTIVEC */
+#endif /* CONFIG_SMP */
+}
+
+void flush_thread(void)
+{
+	struct thread_info *t = current_thread_info();
+
+	if (t->flags & _TIF_ABI_PENDING)
+		t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT);
+
+#ifndef CONFIG_SMP
+	if (last_task_used_math == current)
+		last_task_used_math = NULL;
+#ifdef CONFIG_ALTIVEC
+	if (last_task_used_altivec == current)
+		last_task_used_altivec = NULL;
+#endif /* CONFIG_ALTIVEC */
+#endif /* CONFIG_SMP */
+}
+
+void
+release_thread(struct task_struct *t)
+{
+}
+
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+}
+
+/*
+ * Copy a thread..
+ */
+int
+copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+	    unsigned long unused, struct task_struct *p, struct pt_regs *regs)
+{
+	struct pt_regs *childregs, *kregs;
+	extern void ret_from_fork(void);
+	unsigned long sp = (unsigned long)p->thread_info + THREAD_SIZE;
+
+	/* Copy registers */
+	sp -= sizeof(struct pt_regs);
+	childregs = (struct pt_regs *) sp;
+	*childregs = *regs;
+	if ((childregs->msr & MSR_PR) == 0) {
+		/* for kernel thread, set stackptr in new task */
+		childregs->gpr[1] = sp + sizeof(struct pt_regs);
+		p->thread.regs = NULL;	/* no user register state */
+		clear_ti_thread_flag(p->thread_info, TIF_32BIT);
+#ifdef CONFIG_PPC_ISERIES
+		set_ti_thread_flag(p->thread_info, TIF_RUN_LIGHT);
+#endif
+	} else {
+		childregs->gpr[1] = usp;
+		p->thread.regs = childregs;
+		if (clone_flags & CLONE_SETTLS) {
+			if (test_thread_flag(TIF_32BIT))
+				childregs->gpr[2] = childregs->gpr[6];
+			else
+				childregs->gpr[13] = childregs->gpr[6];
+		}
+	}
+	childregs->gpr[3] = 0;  /* Result from fork() */
+	sp -= STACK_FRAME_OVERHEAD;
+
+	/*
+	 * The way this works is that at some point in the future
+	 * some task will call _switch to switch to the new task.
+	 * That will pop off the stack frame created below and start
+	 * the new task running at ret_from_fork.  The new task will
+	 * do some house keeping and then return from the fork or clone
+	 * system call, using the stack frame created above.
+	 */
+	sp -= sizeof(struct pt_regs);
+	kregs = (struct pt_regs *) sp;
+	sp -= STACK_FRAME_OVERHEAD;
+	p->thread.ksp = sp;
+	if (cpu_has_feature(CPU_FTR_SLB)) {
+		unsigned long sp_vsid = get_kernel_vsid(sp);
+
+		sp_vsid <<= SLB_VSID_SHIFT;
+		sp_vsid |= SLB_VSID_KERNEL;
+		if (cpu_has_feature(CPU_FTR_16M_PAGE))
+			sp_vsid |= SLB_VSID_L;
+
+		p->thread.ksp_vsid = sp_vsid;
+	}
+
+	/*
+	 * The PPC64 ABI makes use of a TOC to contain function 
+	 * pointers.  The function (ret_from_except) is actually a pointer
+	 * to the TOC entry.  The first entry is a pointer to the actual
+	 * function.
+ 	 */
+	kregs->nip = *((unsigned long *)ret_from_fork);
+
+	return 0;
+}
+
+/*
+ * Set up a thread for executing a new program
+ */
+void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp)
+{
+	unsigned long entry, toc, load_addr = regs->gpr[2];
+
+	/* fdptr is a relocated pointer to the function descriptor for
+         * the elf _start routine.  The first entry in the function
+         * descriptor is the entry address of _start and the second
+         * entry is the TOC value we need to use.
+         */
+	set_fs(USER_DS);
+	__get_user(entry, (unsigned long __user *)fdptr);
+	__get_user(toc, (unsigned long __user *)fdptr+1);
+
+	/* Check whether the e_entry function descriptor entries
+	 * need to be relocated before we can use them.
+	 */
+	if (load_addr != 0) {
+		entry += load_addr;
+		toc   += load_addr;
+	}
+
+	/*
+	 * If we exec out of a kernel thread then thread.regs will not be
+	 * set. Do it now.
+	 */
+	if (!current->thread.regs) {
+		unsigned long childregs = (unsigned long)current->thread_info +
+						THREAD_SIZE;
+		childregs -= sizeof(struct pt_regs);
+		current->thread.regs = (struct pt_regs *)childregs;
+	}
+
+	regs->nip = entry;
+	regs->gpr[1] = sp;
+	regs->gpr[2] = toc;
+	regs->msr = MSR_USER64;
+#ifndef CONFIG_SMP
+	if (last_task_used_math == current)
+		last_task_used_math = 0;
+#endif /* CONFIG_SMP */
+	memset(current->thread.fpr, 0, sizeof(current->thread.fpr));
+	current->thread.fpscr = 0;
+#ifdef CONFIG_ALTIVEC
+#ifndef CONFIG_SMP
+	if (last_task_used_altivec == current)
+		last_task_used_altivec = 0;
+#endif /* CONFIG_SMP */
+	memset(current->thread.vr, 0, sizeof(current->thread.vr));
+	current->thread.vscr.u[0] = 0;
+	current->thread.vscr.u[1] = 0;
+	current->thread.vscr.u[2] = 0;
+	current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */
+	current->thread.vrsave = 0;
+	current->thread.used_vr = 0;
+#endif /* CONFIG_ALTIVEC */
+}
+EXPORT_SYMBOL(start_thread);
+
+int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
+{
+	struct pt_regs *regs = tsk->thread.regs;
+
+	if (val > PR_FP_EXC_PRECISE)
+		return -EINVAL;
+	tsk->thread.fpexc_mode = __pack_fe01(val);
+	if (regs != NULL && (regs->msr & MSR_FP) != 0)
+		regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1))
+			| tsk->thread.fpexc_mode;
+	return 0;
+}
+
+int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)
+{
+	unsigned int val;
+
+	val = __unpack_fe01(tsk->thread.fpexc_mode);
+	return put_user(val, (unsigned int __user *) adr);
+}
+
+int sys_clone(unsigned long clone_flags, unsigned long p2, unsigned long p3,
+	      unsigned long p4, unsigned long p5, unsigned long p6,
+	      struct pt_regs *regs)
+{
+	unsigned long parent_tidptr = 0;
+	unsigned long child_tidptr = 0;
+
+	if (p2 == 0)
+		p2 = regs->gpr[1];	/* stack pointer for child */
+
+	if (clone_flags & (CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+			   CLONE_CHILD_CLEARTID)) {
+		parent_tidptr = p3;
+		child_tidptr = p5;
+		if (test_thread_flag(TIF_32BIT)) {
+			parent_tidptr &= 0xffffffff;
+			child_tidptr &= 0xffffffff;
+		}
+	}
+
+	return do_fork(clone_flags, p2, regs, 0,
+		    (int __user *)parent_tidptr, (int __user *)child_tidptr);
+}
+
+int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3,
+	     unsigned long p4, unsigned long p5, unsigned long p6,
+	     struct pt_regs *regs)
+{
+	return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL);
+}
+
+int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3,
+	      unsigned long p4, unsigned long p5, unsigned long p6,
+	      struct pt_regs *regs)
+{
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1], regs, 0,
+	            NULL, NULL);
+}
+
+int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
+	       unsigned long a3, unsigned long a4, unsigned long a5,
+	       struct pt_regs *regs)
+{
+	int error;
+	char * filename;
+	
+	filename = getname((char __user *) a0);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename))
+		goto out;
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	error = do_execve(filename, (char __user * __user *) a1,
+				    (char __user * __user *) a2, regs);
+  
+	if (error == 0) {
+		task_lock(current);
+		current->ptrace &= ~PT_DTRACE;
+		task_unlock(current);
+	}
+	putname(filename);
+
+out:
+	return error;
+}
+
+static int kstack_depth_to_print = 64;
+
+static int validate_sp(unsigned long sp, struct task_struct *p,
+		       unsigned long nbytes)
+{
+	unsigned long stack_page = (unsigned long)p->thread_info;
+
+	if (sp >= stack_page + sizeof(struct thread_struct)
+	    && sp <= stack_page + THREAD_SIZE - nbytes)
+		return 1;
+
+#ifdef CONFIG_IRQSTACKS
+	stack_page = (unsigned long) hardirq_ctx[task_cpu(p)];
+	if (sp >= stack_page + sizeof(struct thread_struct)
+	    && sp <= stack_page + THREAD_SIZE - nbytes)
+		return 1;
+
+	stack_page = (unsigned long) softirq_ctx[task_cpu(p)];
+	if (sp >= stack_page + sizeof(struct thread_struct)
+	    && sp <= stack_page + THREAD_SIZE - nbytes)
+		return 1;
+#endif
+
+	return 0;
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long ip, sp;
+	int count = 0;
+
+	if (!p || p == current || p->state == TASK_RUNNING)
+		return 0;
+
+	sp = p->thread.ksp;
+	if (!validate_sp(sp, p, 112))
+		return 0;
+
+	do {
+		sp = *(unsigned long *)sp;
+		if (!validate_sp(sp, p, 112))
+			return 0;
+		if (count > 0) {
+			ip = *(unsigned long *)(sp + 16);
+			if (!in_sched_functions(ip))
+				return ip;
+		}
+	} while (count++ < 16);
+	return 0;
+}
+EXPORT_SYMBOL(get_wchan);
+
+void show_stack(struct task_struct *p, unsigned long *_sp)
+{
+	unsigned long ip, newsp, lr;
+	int count = 0;
+	unsigned long sp = (unsigned long)_sp;
+	int firstframe = 1;
+
+	if (sp == 0) {
+		if (p) {
+			sp = p->thread.ksp;
+		} else {
+			sp = __get_SP();
+			p = current;
+		}
+	}
+
+	lr = 0;
+	printk("Call Trace:\n");
+	do {
+		if (!validate_sp(sp, p, 112))
+			return;
+
+		_sp = (unsigned long *) sp;
+		newsp = _sp[0];
+		ip = _sp[2];
+		if (!firstframe || ip != lr) {
+			printk("[%016lx] [%016lx] ", sp, ip);
+			print_symbol("%s", ip);
+			if (firstframe)
+				printk(" (unreliable)");
+			printk("\n");
+		}
+		firstframe = 0;
+
+		/*
+		 * See if this is an exception frame.
+		 * We look for the "regshere" marker in the current frame.
+		 */
+		if (validate_sp(sp, p, sizeof(struct pt_regs) + 400)
+		    && _sp[12] == 0x7265677368657265ul) {
+			struct pt_regs *regs = (struct pt_regs *)
+				(sp + STACK_FRAME_OVERHEAD);
+			printk("--- Exception: %lx", regs->trap);
+			print_symbol(" at %s\n", regs->nip);
+			lr = regs->link;
+			print_symbol("    LR = %s\n", lr);
+			firstframe = 1;
+		}
+
+		sp = newsp;
+	} while (count++ < kstack_depth_to_print);
+}
+
+void dump_stack(void)
+{
+	show_stack(current, (unsigned long *)__get_SP());
+}
+EXPORT_SYMBOL(dump_stack);
diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c
new file mode 100644
index 00000000000..01739d5c47c
--- /dev/null
+++ b/arch/ppc64/kernel/prom.c
@@ -0,0 +1,1820 @@
+/*
+ * 
+ *
+ * Procedures for interfacing to Open Firmware.
+ *
+ * Paul Mackerras	August 1996.
+ * Copyright (C) 1996 Paul Mackerras.
+ * 
+ *  Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
+ *    {engebret|bergner}@us.ibm.com 
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <stdarg.h>
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/version.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/stringify.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/lmb.h>
+#include <asm/abs_addr.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/system.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pci.h>
+#include <asm/iommu.h>
+#include <asm/bootinfo.h>
+#include <asm/ppcdebug.h>
+#include <asm/btext.h>
+#include <asm/sections.h>
+#include <asm/machdep.h>
+#include <asm/pSeries_reconfig.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+struct pci_reg_property {
+	struct pci_address addr;
+	u32 size_hi;
+	u32 size_lo;
+};
+
+struct isa_reg_property {
+	u32 space;
+	u32 address;
+	u32 size;
+};
+
+
+typedef int interpret_func(struct device_node *, unsigned long *,
+			   int, int, int);
+
+extern struct rtas_t rtas;
+extern struct lmb lmb;
+extern unsigned long klimit;
+
+static int __initdata dt_root_addr_cells;
+static int __initdata dt_root_size_cells;
+static int __initdata iommu_is_off;
+int __initdata iommu_force_on;
+typedef u32 cell_t;
+
+#if 0
+static struct boot_param_header *initial_boot_params __initdata;
+#else
+struct boot_param_header *initial_boot_params;
+#endif
+
+static struct device_node *allnodes = NULL;
+
+/* use when traversing tree through the allnext, child, sibling,
+ * or parent members of struct device_node.
+ */
+static DEFINE_RWLOCK(devtree_lock);
+
+/* export that to outside world */
+struct device_node *of_chosen;
+
+/*
+ * Wrapper for allocating memory for various data that needs to be
+ * attached to device nodes as they are processed at boot or when
+ * added to the device tree later (e.g. DLPAR).  At boot there is
+ * already a region reserved so we just increment *mem_start by size;
+ * otherwise we call kmalloc.
+ */
+static void * prom_alloc(unsigned long size, unsigned long *mem_start)
+{
+	unsigned long tmp;
+
+	if (!mem_start)
+		return kmalloc(size, GFP_KERNEL);
+
+	tmp = *mem_start;
+	*mem_start += size;
+	return (void *)tmp;
+}
+
+/*
+ * Find the device_node with a given phandle.
+ */
+static struct device_node * find_phandle(phandle ph)
+{
+	struct device_node *np;
+
+	for (np = allnodes; np != 0; np = np->allnext)
+		if (np->linux_phandle == ph)
+			return np;
+	return NULL;
+}
+
+/*
+ * Find the interrupt parent of a node.
+ */
+static struct device_node * __devinit intr_parent(struct device_node *p)
+{
+	phandle *parp;
+
+	parp = (phandle *) get_property(p, "interrupt-parent", NULL);
+	if (parp == NULL)
+		return p->parent;
+	return find_phandle(*parp);
+}
+
+/*
+ * Find out the size of each entry of the interrupts property
+ * for a node.
+ */
+int __devinit prom_n_intr_cells(struct device_node *np)
+{
+	struct device_node *p;
+	unsigned int *icp;
+
+	for (p = np; (p = intr_parent(p)) != NULL; ) {
+		icp = (unsigned int *)
+			get_property(p, "#interrupt-cells", NULL);
+		if (icp != NULL)
+			return *icp;
+		if (get_property(p, "interrupt-controller", NULL) != NULL
+		    || get_property(p, "interrupt-map", NULL) != NULL) {
+			printk("oops, node %s doesn't have #interrupt-cells\n",
+			       p->full_name);
+			return 1;
+		}
+	}
+#ifdef DEBUG_IRQ
+	printk("prom_n_intr_cells failed for %s\n", np->full_name);
+#endif
+	return 1;
+}
+
+/*
+ * Map an interrupt from a device up to the platform interrupt
+ * descriptor.
+ */
+static int __devinit map_interrupt(unsigned int **irq, struct device_node **ictrler,
+				   struct device_node *np, unsigned int *ints,
+				   int nintrc)
+{
+	struct device_node *p, *ipar;
+	unsigned int *imap, *imask, *ip;
+	int i, imaplen, match;
+	int newintrc = 0, newaddrc = 0;
+	unsigned int *reg;
+	int naddrc;
+
+	reg = (unsigned int *) get_property(np, "reg", NULL);
+	naddrc = prom_n_addr_cells(np);
+	p = intr_parent(np);
+	while (p != NULL) {
+		if (get_property(p, "interrupt-controller", NULL) != NULL)
+			/* this node is an interrupt controller, stop here */
+			break;
+		imap = (unsigned int *)
+			get_property(p, "interrupt-map", &imaplen);
+		if (imap == NULL) {
+			p = intr_parent(p);
+			continue;
+		}
+		imask = (unsigned int *)
+			get_property(p, "interrupt-map-mask", NULL);
+		if (imask == NULL) {
+			printk("oops, %s has interrupt-map but no mask\n",
+			       p->full_name);
+			return 0;
+		}
+		imaplen /= sizeof(unsigned int);
+		match = 0;
+		ipar = NULL;
+		while (imaplen > 0 && !match) {
+			/* check the child-interrupt field */
+			match = 1;
+			for (i = 0; i < naddrc && match; ++i)
+				match = ((reg[i] ^ imap[i]) & imask[i]) == 0;
+			for (; i < naddrc + nintrc && match; ++i)
+				match = ((ints[i-naddrc] ^ imap[i]) & imask[i]) == 0;
+			imap += naddrc + nintrc;
+			imaplen -= naddrc + nintrc;
+			/* grab the interrupt parent */
+			ipar = find_phandle((phandle) *imap++);
+			--imaplen;
+			if (ipar == NULL) {
+				printk("oops, no int parent %x in map of %s\n",
+				       imap[-1], p->full_name);
+				return 0;
+			}
+			/* find the parent's # addr and intr cells */
+			ip = (unsigned int *)
+				get_property(ipar, "#interrupt-cells", NULL);
+			if (ip == NULL) {
+				printk("oops, no #interrupt-cells on %s\n",
+				       ipar->full_name);
+				return 0;
+			}
+			newintrc = *ip;
+			ip = (unsigned int *)
+				get_property(ipar, "#address-cells", NULL);
+			newaddrc = (ip == NULL)? 0: *ip;
+			imap += newaddrc + newintrc;
+			imaplen -= newaddrc + newintrc;
+		}
+		if (imaplen < 0) {
+			printk("oops, error decoding int-map on %s, len=%d\n",
+			       p->full_name, imaplen);
+			return 0;
+		}
+		if (!match) {
+#ifdef DEBUG_IRQ
+			printk("oops, no match in %s int-map for %s\n",
+			       p->full_name, np->full_name);
+#endif
+			return 0;
+		}
+		p = ipar;
+		naddrc = newaddrc;
+		nintrc = newintrc;
+		ints = imap - nintrc;
+		reg = ints - naddrc;
+	}
+	if (p == NULL) {
+#ifdef DEBUG_IRQ
+		printk("hmmm, int tree for %s doesn't have ctrler\n",
+		       np->full_name);
+#endif
+		return 0;
+	}
+	*irq = ints;
+	*ictrler = p;
+	return nintrc;
+}
+
+static int __devinit finish_node_interrupts(struct device_node *np,
+					    unsigned long *mem_start,
+					    int measure_only)
+{
+	unsigned int *ints;
+	int intlen, intrcells, intrcount;
+	int i, j, n;
+	unsigned int *irq, virq;
+	struct device_node *ic;
+
+	ints = (unsigned int *) get_property(np, "interrupts", &intlen);
+	if (ints == NULL)
+		return 0;
+	intrcells = prom_n_intr_cells(np);
+	intlen /= intrcells * sizeof(unsigned int);
+
+	np->intrs = prom_alloc(intlen * sizeof(*(np->intrs)), mem_start);
+	if (!np->intrs)
+		return -ENOMEM;
+
+	if (measure_only)
+		return 0;
+
+	intrcount = 0;
+	for (i = 0; i < intlen; ++i, ints += intrcells) {
+		n = map_interrupt(&irq, &ic, np, ints, intrcells);
+		if (n <= 0)
+			continue;
+
+		/* don't map IRQ numbers under a cascaded 8259 controller */
+		if (ic && device_is_compatible(ic, "chrp,iic")) {
+			np->intrs[intrcount].line = irq[0];
+		} else {
+			virq = virt_irq_create_mapping(irq[0]);
+			if (virq == NO_IRQ) {
+				printk(KERN_CRIT "Could not allocate interrupt"
+				       " number for %s\n", np->full_name);
+				continue;
+			}
+			np->intrs[intrcount].line = irq_offset_up(virq);
+		}
+
+		/* We offset irq numbers for the u3 MPIC by 128 in PowerMac */
+		if (systemcfg->platform == PLATFORM_POWERMAC && ic && ic->parent) {
+			char *name = get_property(ic->parent, "name", NULL);
+			if (name && !strcmp(name, "u3"))
+				np->intrs[intrcount].line += 128;
+		}
+		np->intrs[intrcount].sense = 1;
+		if (n > 1)
+			np->intrs[intrcount].sense = irq[1];
+		if (n > 2) {
+			printk("hmmm, got %d intr cells for %s:", n,
+			       np->full_name);
+			for (j = 0; j < n; ++j)
+				printk(" %d", irq[j]);
+			printk("\n");
+		}
+		++intrcount;
+	}
+	np->n_intrs = intrcount;
+
+	return 0;
+}
+
+static int __devinit interpret_pci_props(struct device_node *np,
+					 unsigned long *mem_start,
+					 int naddrc, int nsizec,
+					 int measure_only)
+{
+	struct address_range *adr;
+	struct pci_reg_property *pci_addrs;
+	int i, l, n_addrs;
+
+	pci_addrs = (struct pci_reg_property *)
+		get_property(np, "assigned-addresses", &l);
+	if (!pci_addrs)
+		return 0;
+
+	n_addrs = l / sizeof(*pci_addrs);
+
+	adr = prom_alloc(n_addrs * sizeof(*adr), mem_start);
+	if (!adr)
+		return -ENOMEM;
+
+ 	if (measure_only)
+ 		return 0;
+
+ 	np->addrs = adr;
+ 	np->n_addrs = n_addrs;
+
+ 	for (i = 0; i < n_addrs; i++) {
+ 		adr[i].space = pci_addrs[i].addr.a_hi;
+ 		adr[i].address = pci_addrs[i].addr.a_lo |
+			((u64)pci_addrs[i].addr.a_mid << 32);
+ 		adr[i].size = pci_addrs[i].size_lo;
+	}
+
+	return 0;
+}
+
+static int __init interpret_dbdma_props(struct device_node *np,
+					unsigned long *mem_start,
+					int naddrc, int nsizec,
+					int measure_only)
+{
+	struct reg_property32 *rp;
+	struct address_range *adr;
+	unsigned long base_address;
+	int i, l;
+	struct device_node *db;
+
+	base_address = 0;
+	if (!measure_only) {
+		for (db = np->parent; db != NULL; db = db->parent) {
+			if (!strcmp(db->type, "dbdma") && db->n_addrs != 0) {
+				base_address = db->addrs[0].address;
+				break;
+			}
+		}
+	}
+
+	rp = (struct reg_property32 *) get_property(np, "reg", &l);
+	if (rp != 0 && l >= sizeof(struct reg_property32)) {
+		i = 0;
+		adr = (struct address_range *) (*mem_start);
+		while ((l -= sizeof(struct reg_property32)) >= 0) {
+			if (!measure_only) {
+				adr[i].space = 2;
+				adr[i].address = rp[i].address + base_address;
+				adr[i].size = rp[i].size;
+			}
+			++i;
+		}
+		np->addrs = adr;
+		np->n_addrs = i;
+		(*mem_start) += i * sizeof(struct address_range);
+	}
+
+	return 0;
+}
+
+static int __init interpret_macio_props(struct device_node *np,
+					unsigned long *mem_start,
+					int naddrc, int nsizec,
+					int measure_only)
+{
+	struct reg_property32 *rp;
+	struct address_range *adr;
+	unsigned long base_address;
+	int i, l;
+	struct device_node *db;
+
+	base_address = 0;
+	if (!measure_only) {
+		for (db = np->parent; db != NULL; db = db->parent) {
+			if (!strcmp(db->type, "mac-io") && db->n_addrs != 0) {
+				base_address = db->addrs[0].address;
+				break;
+			}
+		}
+	}
+
+	rp = (struct reg_property32 *) get_property(np, "reg", &l);
+	if (rp != 0 && l >= sizeof(struct reg_property32)) {
+		i = 0;
+		adr = (struct address_range *) (*mem_start);
+		while ((l -= sizeof(struct reg_property32)) >= 0) {
+			if (!measure_only) {
+				adr[i].space = 2;
+				adr[i].address = rp[i].address + base_address;
+				adr[i].size = rp[i].size;
+			}
+			++i;
+		}
+		np->addrs = adr;
+		np->n_addrs = i;
+		(*mem_start) += i * sizeof(struct address_range);
+	}
+
+	return 0;
+}
+
+static int __init interpret_isa_props(struct device_node *np,
+				      unsigned long *mem_start,
+				      int naddrc, int nsizec,
+				      int measure_only)
+{
+	struct isa_reg_property *rp;
+	struct address_range *adr;
+	int i, l;
+
+	rp = (struct isa_reg_property *) get_property(np, "reg", &l);
+	if (rp != 0 && l >= sizeof(struct isa_reg_property)) {
+		i = 0;
+		adr = (struct address_range *) (*mem_start);
+		while ((l -= sizeof(struct isa_reg_property)) >= 0) {
+			if (!measure_only) {
+				adr[i].space = rp[i].space;
+				adr[i].address = rp[i].address;
+				adr[i].size = rp[i].size;
+			}
+			++i;
+		}
+		np->addrs = adr;
+		np->n_addrs = i;
+		(*mem_start) += i * sizeof(struct address_range);
+	}
+
+	return 0;
+}
+
+static int __init interpret_root_props(struct device_node *np,
+				       unsigned long *mem_start,
+				       int naddrc, int nsizec,
+				       int measure_only)
+{
+	struct address_range *adr;
+	int i, l;
+	unsigned int *rp;
+	int rpsize = (naddrc + nsizec) * sizeof(unsigned int);
+
+	rp = (unsigned int *) get_property(np, "reg", &l);
+	if (rp != 0 && l >= rpsize) {
+		i = 0;
+		adr = (struct address_range *) (*mem_start);
+		while ((l -= rpsize) >= 0) {
+			if (!measure_only) {
+				adr[i].space = 0;
+				adr[i].address = rp[naddrc - 1];
+				adr[i].size = rp[naddrc + nsizec - 1];
+			}
+			++i;
+			rp += naddrc + nsizec;
+		}
+		np->addrs = adr;
+		np->n_addrs = i;
+		(*mem_start) += i * sizeof(struct address_range);
+	}
+
+	return 0;
+}
+
+static int __devinit finish_node(struct device_node *np,
+				 unsigned long *mem_start,
+				 interpret_func *ifunc,
+				 int naddrc, int nsizec,
+				 int measure_only)
+{
+	struct device_node *child;
+	int *ip, rc = 0;
+
+	/* get the device addresses and interrupts */
+	if (ifunc != NULL)
+		rc = ifunc(np, mem_start, naddrc, nsizec, measure_only);
+	if (rc)
+		goto out;
+
+	rc = finish_node_interrupts(np, mem_start, measure_only);
+	if (rc)
+		goto out;
+
+	/* Look for #address-cells and #size-cells properties. */
+	ip = (int *) get_property(np, "#address-cells", NULL);
+	if (ip != NULL)
+		naddrc = *ip;
+	ip = (int *) get_property(np, "#size-cells", NULL);
+	if (ip != NULL)
+		nsizec = *ip;
+
+	/* the f50 sets the name to 'display' and 'compatible' to what we
+	 * expect for the name -- Cort
+	 */
+	if (!strcmp(np->name, "display"))
+		np->name = get_property(np, "compatible", NULL);
+
+	if (!strcmp(np->name, "device-tree") || np->parent == NULL)
+		ifunc = interpret_root_props;
+	else if (np->type == 0)
+		ifunc = NULL;
+	else if (!strcmp(np->type, "pci") || !strcmp(np->type, "vci"))
+		ifunc = interpret_pci_props;
+	else if (!strcmp(np->type, "dbdma"))
+		ifunc = interpret_dbdma_props;
+	else if (!strcmp(np->type, "mac-io") || ifunc == interpret_macio_props)
+		ifunc = interpret_macio_props;
+	else if (!strcmp(np->type, "isa"))
+		ifunc = interpret_isa_props;
+	else if (!strcmp(np->name, "uni-n") || !strcmp(np->name, "u3"))
+		ifunc = interpret_root_props;
+	else if (!((ifunc == interpret_dbdma_props
+		    || ifunc == interpret_macio_props)
+		   && (!strcmp(np->type, "escc")
+		       || !strcmp(np->type, "media-bay"))))
+		ifunc = NULL;
+
+	for (child = np->child; child != NULL; child = child->sibling) {
+		rc = finish_node(child, mem_start, ifunc,
+				 naddrc, nsizec, measure_only);
+		if (rc)
+			goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * finish_device_tree is called once things are running normally
+ * (i.e. with text and data mapped to the address they were linked at).
+ * It traverses the device tree and fills in some of the additional,
+ * fields in each node like {n_}addrs and {n_}intrs, the virt interrupt
+ * mapping is also initialized at this point.
+ */
+void __init finish_device_tree(void)
+{
+	unsigned long start, end, size = 0;
+
+	DBG(" -> finish_device_tree\n");
+
+	if (ppc64_interrupt_controller == IC_INVALID) {
+		DBG("failed to configure interrupt controller type\n");
+		panic("failed to configure interrupt controller type\n");
+	}
+	
+	/* Initialize virtual IRQ map */
+	virt_irq_init();
+
+	/*
+	 * Finish device-tree (pre-parsing some properties etc...)
+	 * We do this in 2 passes. One with "measure_only" set, which
+	 * will only measure the amount of memory needed, then we can
+	 * allocate that memory, and call finish_node again. However,
+	 * we must be careful as most routines will fail nowadays when
+	 * prom_alloc() returns 0, so we must make sure our first pass
+	 * doesn't start at 0. We pre-initialize size to 16 for that
+	 * reason and then remove those additional 16 bytes
+	 */
+	size = 16;
+	finish_node(allnodes, &size, NULL, 0, 0, 1);
+	size -= 16;
+	end = start = (unsigned long)abs_to_virt(lmb_alloc(size, 128));
+	finish_node(allnodes, &end, NULL, 0, 0, 0);
+	BUG_ON(end != start + size);
+
+	DBG(" <- finish_device_tree\n");
+}
+
+#ifdef DEBUG
+#define printk udbg_printf
+#endif
+
+static inline char *find_flat_dt_string(u32 offset)
+{
+	return ((char *)initial_boot_params) + initial_boot_params->off_dt_strings
+		+ offset;
+}
+
+/**
+ * This function is used to scan the flattened device-tree, it is
+ * used to extract the memory informations at boot before we can
+ * unflatten the tree
+ */
+static int __init scan_flat_dt(int (*it)(unsigned long node,
+					 const char *full_path, void *data),
+			       void *data)
+{
+	unsigned long p = ((unsigned long)initial_boot_params) +
+		initial_boot_params->off_dt_struct;
+	int rc = 0;
+
+	do {
+		u32 tag = *((u32 *)p);
+		char *pathp;
+		
+		p += 4;
+		if (tag == OF_DT_END_NODE)
+			continue;
+		if (tag == OF_DT_END)
+			break;
+		if (tag == OF_DT_PROP) {
+			u32 sz = *((u32 *)p);
+			p += 8;
+			p = _ALIGN(p, sz >= 8 ? 8 : 4);
+			p += sz;
+			p = _ALIGN(p, 4);
+			continue;
+		}
+		if (tag != OF_DT_BEGIN_NODE) {
+			printk(KERN_WARNING "Invalid tag %x scanning flattened"
+			       " device tree !\n", tag);
+			return -EINVAL;
+		}
+		pathp = (char *)p;
+		p = _ALIGN(p + strlen(pathp) + 1, 4);
+		rc = it(p, pathp, data);
+		if (rc != 0)
+			break;		
+	} while(1);
+
+	return rc;
+}
+
+/**
+ * This  function can be used within scan_flattened_dt callback to get
+ * access to properties
+ */
+static void* __init get_flat_dt_prop(unsigned long node, const char *name,
+				     unsigned long *size)
+{
+	unsigned long p = node;
+
+	do {
+		u32 tag = *((u32 *)p);
+		u32 sz, noff;
+		const char *nstr;
+
+		p += 4;
+		if (tag != OF_DT_PROP)
+			return NULL;
+
+		sz = *((u32 *)p);
+		noff = *((u32 *)(p + 4));
+		p += 8;
+		p = _ALIGN(p, sz >= 8 ? 8 : 4);
+
+		nstr = find_flat_dt_string(noff);
+		if (nstr == NULL) {
+			printk(KERN_WARNING "Can't find property index name !\n");
+			return NULL;
+		}
+		if (strcmp(name, nstr) == 0) {
+			if (size)
+				*size = sz;
+			return (void *)p;
+		}
+		p += sz;
+		p = _ALIGN(p, 4);
+	} while(1);
+}
+
+static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size,
+					       unsigned long align)
+{
+	void *res;
+
+	*mem = _ALIGN(*mem, align);
+	res = (void *)*mem;
+	*mem += size;
+
+	return res;
+}
+
+static unsigned long __init unflatten_dt_node(unsigned long mem,
+					      unsigned long *p,
+					      struct device_node *dad,
+					      struct device_node ***allnextpp)
+{
+	struct device_node *np;
+	struct property *pp, **prev_pp = NULL;
+	char *pathp;
+	u32 tag;
+	unsigned int l;
+
+	tag = *((u32 *)(*p));
+	if (tag != OF_DT_BEGIN_NODE) {
+		printk("Weird tag at start of node: %x\n", tag);
+		return mem;
+	}
+	*p += 4;
+	pathp = (char *)*p;
+	l = strlen(pathp) + 1;
+	*p = _ALIGN(*p + l, 4);
+
+	np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + l,
+				__alignof__(struct device_node));
+	if (allnextpp) {
+		memset(np, 0, sizeof(*np));
+		np->full_name = ((char*)np) + sizeof(struct device_node);
+		memcpy(np->full_name, pathp, l);
+		prev_pp = &np->properties;
+		**allnextpp = np;
+		*allnextpp = &np->allnext;
+		if (dad != NULL) {
+			np->parent = dad;
+			/* we temporarily use the `next' field as `last_child'. */
+			if (dad->next == 0)
+				dad->child = np;
+			else
+				dad->next->sibling = np;
+			dad->next = np;
+		}
+		kref_init(&np->kref);
+	}
+	while(1) {
+		u32 sz, noff;
+		char *pname;
+
+		tag = *((u32 *)(*p));
+		if (tag != OF_DT_PROP)
+			break;
+		*p += 4;
+		sz = *((u32 *)(*p));
+		noff = *((u32 *)((*p) + 4));
+		*p = _ALIGN((*p) + 8, sz >= 8 ? 8 : 4);
+
+		pname = find_flat_dt_string(noff);
+		if (pname == NULL) {
+			printk("Can't find property name in list !\n");
+			break;
+		}
+		l = strlen(pname) + 1;
+		pp = unflatten_dt_alloc(&mem, sizeof(struct property),
+					__alignof__(struct property));
+		if (allnextpp) {
+			if (strcmp(pname, "linux,phandle") == 0) {
+				np->node = *((u32 *)*p);
+				if (np->linux_phandle == 0)
+					np->linux_phandle = np->node;
+			}
+			if (strcmp(pname, "ibm,phandle") == 0)
+				np->linux_phandle = *((u32 *)*p);
+			pp->name = pname;
+			pp->length = sz;
+			pp->value = (void *)*p;
+			*prev_pp = pp;
+			prev_pp = &pp->next;
+		}
+		*p = _ALIGN((*p) + sz, 4);
+	}
+	if (allnextpp) {
+		*prev_pp = NULL;
+		np->name = get_property(np, "name", NULL);
+		np->type = get_property(np, "device_type", NULL);
+
+		if (!np->name)
+			np->name = "<NULL>";
+		if (!np->type)
+			np->type = "<NULL>";
+	}
+	while (tag == OF_DT_BEGIN_NODE) {
+		mem = unflatten_dt_node(mem, p, np, allnextpp);
+		tag = *((u32 *)(*p));
+	}
+	if (tag != OF_DT_END_NODE) {
+		printk("Weird tag at start of node: %x\n", tag);
+		return mem;
+	}
+	*p += 4;
+	return mem;
+}
+
+
+/**
+ * unflattens the device-tree passed by the firmware, creating the
+ * tree of struct device_node. It also fills the "name" and "type"
+ * pointers of the nodes so the normal device-tree walking functions
+ * can be used (this used to be done by finish_device_tree)
+ */
+void __init unflatten_device_tree(void)
+{
+	unsigned long start, mem, size;
+	struct device_node **allnextp = &allnodes;
+	char *p;
+	int l = 0;
+
+	DBG(" -> unflatten_device_tree()\n");
+
+	/* First pass, scan for size */
+	start = ((unsigned long)initial_boot_params) +
+		initial_boot_params->off_dt_struct;
+	size = unflatten_dt_node(0, &start, NULL, NULL);
+
+	DBG("  size is %lx, allocating...\n", size);
+
+	/* Allocate memory for the expanded device tree */
+	mem = (unsigned long)abs_to_virt(lmb_alloc(size,
+						   __alignof__(struct device_node)));
+	DBG("  unflattening...\n", mem);
+
+	/* Second pass, do actual unflattening */
+	start = ((unsigned long)initial_boot_params) +
+		initial_boot_params->off_dt_struct;
+	unflatten_dt_node(mem, &start, NULL, &allnextp);
+	if (*((u32 *)start) != OF_DT_END)
+		printk(KERN_WARNING "Weird tag at end of tree: %x\n", *((u32 *)start));
+	*allnextp = NULL;
+
+	/* Get pointer to OF "/chosen" node for use everywhere */
+	of_chosen = of_find_node_by_path("/chosen");
+
+	/* Retreive command line */
+	if (of_chosen != NULL) {
+		p = (char *)get_property(of_chosen, "bootargs", &l);
+		if (p != NULL && l > 0)
+			strlcpy(cmd_line, p, min(l, COMMAND_LINE_SIZE));
+	}
+#ifdef CONFIG_CMDLINE
+	if (l == 0 || (l == 1 && (*p) == 0))
+		strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+#endif /* CONFIG_CMDLINE */
+
+	DBG("Command line is: %s\n", cmd_line);
+
+	DBG(" <- unflatten_device_tree()\n");
+}
+
+
+static int __init early_init_dt_scan_cpus(unsigned long node,
+					  const char *full_path, void *data)
+{
+	char *type = get_flat_dt_prop(node, "device_type", NULL);
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	/* On LPAR, look for the first ibm,pft-size property for the  hash table size
+	 */
+	if (systemcfg->platform == PLATFORM_PSERIES_LPAR && ppc64_pft_size == 0) {
+		u32 *pft_size;
+		pft_size = (u32 *)get_flat_dt_prop(node, "ibm,pft-size", NULL);
+		if (pft_size != NULL) {
+			/* pft_size[0] is the NUMA CEC cookie */
+			ppc64_pft_size = pft_size[1];
+		}
+	}
+
+	if (initial_boot_params && initial_boot_params->version >= 2) {
+		/* version 2 of the kexec param format adds the phys cpuid
+		 * of booted proc.
+		 */
+		boot_cpuid_phys = initial_boot_params->boot_cpuid_phys;
+		boot_cpuid = 0;
+	} else {
+		/* Check if it's the boot-cpu, set it's hw index in paca now */
+		if (get_flat_dt_prop(node, "linux,boot-cpu", NULL) != NULL) {
+			u32 *prop = get_flat_dt_prop(node, "reg", NULL);
+			set_hard_smp_processor_id(0, prop == NULL ? 0 : *prop);
+			boot_cpuid_phys = get_hard_smp_processor_id(0);
+		}
+	}
+
+	return 0;
+}
+
+static int __init early_init_dt_scan_chosen(unsigned long node,
+					    const char *full_path, void *data)
+{
+	u32 *prop;
+	u64 *prop64;
+	extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
+
+	if (strcmp(full_path, "/chosen") != 0)
+		return 0;
+
+	/* get platform type */
+	prop = (u32 *)get_flat_dt_prop(node, "linux,platform", NULL);
+	if (prop == NULL)
+		return 0;
+	systemcfg->platform = *prop;
+
+	/* check if iommu is forced on or off */
+	if (get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL)
+		iommu_is_off = 1;
+	if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
+		iommu_force_on = 1;
+
+ 	prop64 = (u64*)get_flat_dt_prop(node, "linux,memory-limit", NULL);
+ 	if (prop64)
+ 		memory_limit = *prop64;
+
+ 	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
+ 	if (prop64)
+ 		tce_alloc_start = *prop64;
+
+ 	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
+ 	if (prop64)
+ 		tce_alloc_end = *prop64;
+
+#ifdef CONFIG_PPC_RTAS
+	/* To help early debugging via the front panel, we retreive a minimal
+	 * set of RTAS infos now if available
+	 */
+	{
+		u64 *basep, *entryp;
+
+		basep = (u64*)get_flat_dt_prop(node, "linux,rtas-base", NULL);
+		entryp = (u64*)get_flat_dt_prop(node, "linux,rtas-entry", NULL);
+		prop = (u32*)get_flat_dt_prop(node, "linux,rtas-size", NULL);
+		if (basep && entryp && prop) {
+			rtas.base = *basep;
+			rtas.entry = *entryp;
+			rtas.size = *prop;
+		}
+	}
+#endif /* CONFIG_PPC_RTAS */
+
+	/* break now */
+	return 1;
+}
+
+static int __init early_init_dt_scan_root(unsigned long node,
+					  const char *full_path, void *data)
+{
+	u32 *prop;
+
+	if (strcmp(full_path, "/") != 0)
+		return 0;
+
+	prop = (u32 *)get_flat_dt_prop(node, "#size-cells", NULL);
+	dt_root_size_cells = (prop == NULL) ? 1 : *prop;
+		
+	prop = (u32 *)get_flat_dt_prop(node, "#address-cells", NULL);
+	dt_root_addr_cells = (prop == NULL) ? 2 : *prop;
+	
+	/* break now */
+	return 1;
+}
+
+static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp)
+{
+	cell_t *p = *cellp;
+	unsigned long r = 0;
+
+	/* Ignore more than 2 cells */
+	while (s > 2) {
+		p++;
+		s--;
+	}
+	while (s) {
+		r <<= 32;
+		r |= *(p++);
+		s--;
+	}
+
+	*cellp = p;
+	return r;
+}
+
+
+static int __init early_init_dt_scan_memory(unsigned long node,
+					    const char *full_path, void *data)
+{
+	char *type = get_flat_dt_prop(node, "device_type", NULL);
+	cell_t *reg, *endp;
+	unsigned long l;
+
+	/* We are scanning "memory" nodes only */
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	reg = (cell_t *)get_flat_dt_prop(node, "reg", &l);
+	if (reg == NULL)
+		return 0;
+
+	endp = reg + (l / sizeof(cell_t));
+
+	DBG("memory scan node %s ...\n", full_path);
+	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+		unsigned long base, size;
+
+		base = dt_mem_next_cell(dt_root_addr_cells, &reg);
+		size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+		if (size == 0)
+			continue;
+		DBG(" - %lx ,  %lx\n", base, size);
+		if (iommu_is_off) {
+			if (base >= 0x80000000ul)
+				continue;
+			if ((base + size) > 0x80000000ul)
+				size = 0x80000000ul - base;
+		}
+		lmb_add(base, size);
+	}
+	return 0;
+}
+
+static void __init early_reserve_mem(void)
+{
+	u64 base, size;
+	u64 *reserve_map = (u64 *)(((unsigned long)initial_boot_params) +
+				   initial_boot_params->off_mem_rsvmap);
+	while (1) {
+		base = *(reserve_map++);
+		size = *(reserve_map++);
+		if (size == 0)
+			break;
+		DBG("reserving: %lx -> %lx\n", base, size);
+		lmb_reserve(base, size);
+	}
+
+#if 0
+	DBG("memory reserved, lmbs :\n");
+      	lmb_dump_all();
+#endif
+}
+
+void __init early_init_devtree(void *params)
+{
+	DBG(" -> early_init_devtree()\n");
+
+	/* Setup flat device-tree pointer */
+	initial_boot_params = params;
+
+	/* By default, hash size is not set */
+	ppc64_pft_size = 0;
+
+	/* Retreive various informations from the /chosen node of the
+	 * device-tree, including the platform type, initrd location and
+	 * size, TCE reserve, and more ...
+	 */
+	scan_flat_dt(early_init_dt_scan_chosen, NULL);
+
+	/* Scan memory nodes and rebuild LMBs */
+	lmb_init();
+	scan_flat_dt(early_init_dt_scan_root, NULL);
+	scan_flat_dt(early_init_dt_scan_memory, NULL);
+	lmb_enforce_memory_limit();
+	lmb_analyze();
+	systemcfg->physicalMemorySize = lmb_phys_mem_size();
+	lmb_reserve(0, __pa(klimit));
+
+	DBG("Phys. mem: %lx\n", systemcfg->physicalMemorySize);
+
+	/* Reserve LMB regions used by kernel, initrd, dt, etc... */
+	early_reserve_mem();
+
+	DBG("Scanning CPUs ...\n");
+
+	/* Retreive hash table size from flattened tree */
+	scan_flat_dt(early_init_dt_scan_cpus, NULL);
+
+	/* If hash size wasn't obtained above, we calculate it now based on
+	 * the total RAM size
+	 */
+	if (ppc64_pft_size == 0) {
+		unsigned long rnd_mem_size, pteg_count;
+
+		/* round mem_size up to next power of 2 */
+		rnd_mem_size = 1UL << __ilog2(systemcfg->physicalMemorySize);
+		if (rnd_mem_size < systemcfg->physicalMemorySize)
+			rnd_mem_size <<= 1;
+
+		/* # pages / 2 */
+		pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11);
+
+		ppc64_pft_size = __ilog2(pteg_count << 7);
+	}
+
+	DBG("Hash pftSize: %x\n", (int)ppc64_pft_size);
+	DBG(" <- early_init_devtree()\n");
+}
+
+#undef printk
+
+int
+prom_n_addr_cells(struct device_node* np)
+{
+	int* ip;
+	do {
+		if (np->parent)
+			np = np->parent;
+		ip = (int *) get_property(np, "#address-cells", NULL);
+		if (ip != NULL)
+			return *ip;
+	} while (np->parent);
+	/* No #address-cells property for the root node, default to 1 */
+	return 1;
+}
+
+int
+prom_n_size_cells(struct device_node* np)
+{
+	int* ip;
+	do {
+		if (np->parent)
+			np = np->parent;
+		ip = (int *) get_property(np, "#size-cells", NULL);
+		if (ip != NULL)
+			return *ip;
+	} while (np->parent);
+	/* No #size-cells property for the root node, default to 1 */
+	return 1;
+}
+
+/**
+ * Work out the sense (active-low level / active-high edge)
+ * of each interrupt from the device tree.
+ */
+void __init prom_get_irq_senses(unsigned char *senses, int off, int max)
+{
+	struct device_node *np;
+	int i, j;
+
+	/* default to level-triggered */
+	memset(senses, 1, max - off);
+
+	for (np = allnodes; np != 0; np = np->allnext) {
+		for (j = 0; j < np->n_intrs; j++) {
+			i = np->intrs[j].line;
+			if (i >= off && i < max)
+				senses[i-off] = np->intrs[j].sense ?
+					IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE :
+					IRQ_SENSE_EDGE | IRQ_POLARITY_POSITIVE;
+		}
+	}
+}
+
+/**
+ * Construct and return a list of the device_nodes with a given name.
+ */
+struct device_node *
+find_devices(const char *name)
+{
+	struct device_node *head, **prevp, *np;
+
+	prevp = &head;
+	for (np = allnodes; np != 0; np = np->allnext) {
+		if (np->name != 0 && strcasecmp(np->name, name) == 0) {
+			*prevp = np;
+			prevp = &np->next;
+		}
+	}
+	*prevp = NULL;
+	return head;
+}
+EXPORT_SYMBOL(find_devices);
+
+/**
+ * Construct and return a list of the device_nodes with a given type.
+ */
+struct device_node *
+find_type_devices(const char *type)
+{
+	struct device_node *head, **prevp, *np;
+
+	prevp = &head;
+	for (np = allnodes; np != 0; np = np->allnext) {
+		if (np->type != 0 && strcasecmp(np->type, type) == 0) {
+			*prevp = np;
+			prevp = &np->next;
+		}
+	}
+	*prevp = NULL;
+	return head;
+}
+EXPORT_SYMBOL(find_type_devices);
+
+/**
+ * Returns all nodes linked together
+ */
+struct device_node *
+find_all_nodes(void)
+{
+	struct device_node *head, **prevp, *np;
+
+	prevp = &head;
+	for (np = allnodes; np != 0; np = np->allnext) {
+		*prevp = np;
+		prevp = &np->next;
+	}
+	*prevp = NULL;
+	return head;
+}
+EXPORT_SYMBOL(find_all_nodes);
+
+/** Checks if the given "compat" string matches one of the strings in
+ * the device's "compatible" property
+ */
+int
+device_is_compatible(struct device_node *device, const char *compat)
+{
+	const char* cp;
+	int cplen, l;
+
+	cp = (char *) get_property(device, "compatible", &cplen);
+	if (cp == NULL)
+		return 0;
+	while (cplen > 0) {
+		if (strncasecmp(cp, compat, strlen(compat)) == 0)
+			return 1;
+		l = strlen(cp) + 1;
+		cp += l;
+		cplen -= l;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(device_is_compatible);
+
+
+/**
+ * Indicates whether the root node has a given value in its
+ * compatible property.
+ */
+int
+machine_is_compatible(const char *compat)
+{
+	struct device_node *root;
+	int rc = 0;
+
+	root = of_find_node_by_path("/");
+	if (root) {
+		rc = device_is_compatible(root, compat);
+		of_node_put(root);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(machine_is_compatible);
+
+/**
+ * Construct and return a list of the device_nodes with a given type
+ * and compatible property.
+ */
+struct device_node *
+find_compatible_devices(const char *type, const char *compat)
+{
+	struct device_node *head, **prevp, *np;
+
+	prevp = &head;
+	for (np = allnodes; np != 0; np = np->allnext) {
+		if (type != NULL
+		    && !(np->type != 0 && strcasecmp(np->type, type) == 0))
+			continue;
+		if (device_is_compatible(np, compat)) {
+			*prevp = np;
+			prevp = &np->next;
+		}
+	}
+	*prevp = NULL;
+	return head;
+}
+EXPORT_SYMBOL(find_compatible_devices);
+
+/**
+ * Find the device_node with a given full_name.
+ */
+struct device_node *
+find_path_device(const char *path)
+{
+	struct device_node *np;
+
+	for (np = allnodes; np != 0; np = np->allnext)
+		if (np->full_name != 0 && strcasecmp(np->full_name, path) == 0)
+			return np;
+	return NULL;
+}
+EXPORT_SYMBOL(find_path_device);
+
+/*******
+ *
+ * New implementation of the OF "find" APIs, return a refcounted
+ * object, call of_node_put() when done.  The device tree and list
+ * are protected by a rw_lock.
+ *
+ * Note that property management will need some locking as well,
+ * this isn't dealt with yet.
+ *
+ *******/
+
+/**
+ *	of_find_node_by_name - Find a node by its "name" property
+ *	@from:	The node to start searching from or NULL, the node
+ *		you pass will not be searched, only the next one
+ *		will; typically, you pass what the previous call
+ *		returned. of_node_put() will be called on it
+ *	@name:	The name string to match against
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_find_node_by_name(struct device_node *from,
+	const char *name)
+{
+	struct device_node *np;
+
+	read_lock(&devtree_lock);
+	np = from ? from->allnext : allnodes;
+	for (; np != 0; np = np->allnext)
+		if (np->name != 0 && strcasecmp(np->name, name) == 0
+		    && of_node_get(np))
+			break;
+	if (from)
+		of_node_put(from);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_node_by_name);
+
+/**
+ *	of_find_node_by_type - Find a node by its "device_type" property
+ *	@from:	The node to start searching from or NULL, the node
+ *		you pass will not be searched, only the next one
+ *		will; typically, you pass what the previous call
+ *		returned. of_node_put() will be called on it
+ *	@name:	The type string to match against
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_find_node_by_type(struct device_node *from,
+	const char *type)
+{
+	struct device_node *np;
+
+	read_lock(&devtree_lock);
+	np = from ? from->allnext : allnodes;
+	for (; np != 0; np = np->allnext)
+		if (np->type != 0 && strcasecmp(np->type, type) == 0
+		    && of_node_get(np))
+			break;
+	if (from)
+		of_node_put(from);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_node_by_type);
+
+/**
+ *	of_find_compatible_node - Find a node based on type and one of the
+ *                                tokens in its "compatible" property
+ *	@from:		The node to start searching from or NULL, the node
+ *			you pass will not be searched, only the next one
+ *			will; typically, you pass what the previous call
+ *			returned. of_node_put() will be called on it
+ *	@type:		The type string to match "device_type" or NULL to ignore
+ *	@compatible:	The string to match to one of the tokens in the device
+ *			"compatible" list.
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_find_compatible_node(struct device_node *from,
+	const char *type, const char *compatible)
+{
+	struct device_node *np;
+
+	read_lock(&devtree_lock);
+	np = from ? from->allnext : allnodes;
+	for (; np != 0; np = np->allnext) {
+		if (type != NULL
+		    && !(np->type != 0 && strcasecmp(np->type, type) == 0))
+			continue;
+		if (device_is_compatible(np, compatible) && of_node_get(np))
+			break;
+	}
+	if (from)
+		of_node_put(from);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_compatible_node);
+
+/**
+ *	of_find_node_by_path - Find a node matching a full OF path
+ *	@path:	The full path to match
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_find_node_by_path(const char *path)
+{
+	struct device_node *np = allnodes;
+
+	read_lock(&devtree_lock);
+	for (; np != 0; np = np->allnext)
+		if (np->full_name != 0 && strcasecmp(np->full_name, path) == 0
+		    && of_node_get(np))
+			break;
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_node_by_path);
+
+/**
+ *	of_find_node_by_phandle - Find a node given a phandle
+ *	@handle:	phandle of the node to find
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_find_node_by_phandle(phandle handle)
+{
+	struct device_node *np;
+
+	read_lock(&devtree_lock);
+	for (np = allnodes; np != 0; np = np->allnext)
+		if (np->linux_phandle == handle)
+			break;
+	if (np)
+		of_node_get(np);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_node_by_phandle);
+
+/**
+ *	of_find_all_nodes - Get next node in global list
+ *	@prev:	Previous node or NULL to start iteration
+ *		of_node_put() will be called on it
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_find_all_nodes(struct device_node *prev)
+{
+	struct device_node *np;
+
+	read_lock(&devtree_lock);
+	np = prev ? prev->allnext : allnodes;
+	for (; np != 0; np = np->allnext)
+		if (of_node_get(np))
+			break;
+	if (prev)
+		of_node_put(prev);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_all_nodes);
+
+/**
+ *	of_get_parent - Get a node's parent if any
+ *	@node:	Node to get parent
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_get_parent(const struct device_node *node)
+{
+	struct device_node *np;
+
+	if (!node)
+		return NULL;
+
+	read_lock(&devtree_lock);
+	np = of_node_get(node->parent);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_get_parent);
+
+/**
+ *	of_get_next_child - Iterate a node childs
+ *	@node:	parent node
+ *	@prev:	previous child of the parent node, or NULL to get first
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_get_next_child(const struct device_node *node,
+	struct device_node *prev)
+{
+	struct device_node *next;
+
+	read_lock(&devtree_lock);
+	next = prev ? prev->sibling : node->child;
+	for (; next != 0; next = next->sibling)
+		if (of_node_get(next))
+			break;
+	if (prev)
+		of_node_put(prev);
+	read_unlock(&devtree_lock);
+	return next;
+}
+EXPORT_SYMBOL(of_get_next_child);
+
+/**
+ *	of_node_get - Increment refcount of a node
+ *	@node:	Node to inc refcount, NULL is supported to
+ *		simplify writing of callers
+ *
+ *	Returns node.
+ */
+struct device_node *of_node_get(struct device_node *node)
+{
+	if (node)
+		kref_get(&node->kref);
+	return node;
+}
+EXPORT_SYMBOL(of_node_get);
+
+static inline struct device_node * kref_to_device_node(struct kref *kref)
+{
+	return container_of(kref, struct device_node, kref);
+}
+
+/**
+ *	of_node_release - release a dynamically allocated node
+ *	@kref:  kref element of the node to be released
+ *
+ *	In of_node_put() this function is passed to kref_put()
+ *	as the destructor.
+ */
+static void of_node_release(struct kref *kref)
+{
+	struct device_node *node = kref_to_device_node(kref);
+	struct property *prop = node->properties;
+
+	if (!OF_IS_DYNAMIC(node))
+		return;
+	while (prop) {
+		struct property *next = prop->next;
+		kfree(prop->name);
+		kfree(prop->value);
+		kfree(prop);
+		prop = next;
+	}
+	kfree(node->intrs);
+	kfree(node->addrs);
+	kfree(node->full_name);
+	kfree(node);
+}
+
+/**
+ *	of_node_put - Decrement refcount of a node
+ *	@node:	Node to dec refcount, NULL is supported to
+ *		simplify writing of callers
+ *
+ */
+void of_node_put(struct device_node *node)
+{
+	if (node)
+		kref_put(&node->kref, of_node_release);
+}
+EXPORT_SYMBOL(of_node_put);
+
+/*
+ * Fix up the uninitialized fields in a new device node:
+ * name, type, n_addrs, addrs, n_intrs, intrs, and pci-specific fields
+ *
+ * A lot of boot-time code is duplicated here, because functions such
+ * as finish_node_interrupts, interpret_pci_props, etc. cannot use the
+ * slab allocator.
+ *
+ * This should probably be split up into smaller chunks.
+ */
+
+static int of_finish_dynamic_node(struct device_node *node,
+				  unsigned long *unused1, int unused2,
+				  int unused3, int unused4)
+{
+	struct device_node *parent = of_get_parent(node);
+	int err = 0;
+	phandle *ibm_phandle;
+
+	node->name = get_property(node, "name", NULL);
+	node->type = get_property(node, "device_type", NULL);
+
+	if (!parent) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	/* We don't support that function on PowerMac, at least
+	 * not yet
+	 */
+	if (systemcfg->platform == PLATFORM_POWERMAC)
+		return -ENODEV;
+
+	/* fix up new node's linux_phandle field */
+	if ((ibm_phandle = (unsigned int *)get_property(node, "ibm,phandle", NULL)))
+		node->linux_phandle = *ibm_phandle;
+
+out:
+	of_node_put(parent);
+	return err;
+}
+
+/*
+ * Plug a device node into the tree and global list.
+ */
+void of_attach_node(struct device_node *np)
+{
+	write_lock(&devtree_lock);
+	np->sibling = np->parent->child;
+	np->allnext = allnodes;
+	np->parent->child = np;
+	allnodes = np;
+	write_unlock(&devtree_lock);
+}
+
+/*
+ * "Unplug" a node from the device tree.  The caller must hold
+ * a reference to the node.  The memory associated with the node
+ * is not freed until its refcount goes to zero.
+ */
+void of_detach_node(const struct device_node *np)
+{
+	struct device_node *parent;
+
+	write_lock(&devtree_lock);
+
+	parent = np->parent;
+
+	if (allnodes == np)
+		allnodes = np->allnext;
+	else {
+		struct device_node *prev;
+		for (prev = allnodes;
+		     prev->allnext != np;
+		     prev = prev->allnext)
+			;
+		prev->allnext = np->allnext;
+	}
+
+	if (parent->child == np)
+		parent->child = np->sibling;
+	else {
+		struct device_node *prevsib;
+		for (prevsib = np->parent->child;
+		     prevsib->sibling != np;
+		     prevsib = prevsib->sibling)
+			;
+		prevsib->sibling = np->sibling;
+	}
+
+	write_unlock(&devtree_lock);
+}
+
+static int prom_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	int err;
+
+	switch (action) {
+	case PSERIES_RECONFIG_ADD:
+		err = finish_node(node, NULL, of_finish_dynamic_node, 0, 0, 0);
+		if (err < 0) {
+			printk(KERN_ERR "finish_node returned %d\n", err);
+			err = NOTIFY_BAD;
+		}
+		break;
+	default:
+		err = NOTIFY_DONE;
+		break;
+	}
+	return err;
+}
+
+static struct notifier_block prom_reconfig_nb = {
+	.notifier_call = prom_reconfig_notifier,
+	.priority = 10, /* This one needs to run first */
+};
+
+static int __init prom_reconfig_setup(void)
+{
+	return pSeries_reconfig_notifier_register(&prom_reconfig_nb);
+}
+__initcall(prom_reconfig_setup);
+
+/*
+ * Find a property with a given name for a given node
+ * and return the value.
+ */
+unsigned char *
+get_property(struct device_node *np, const char *name, int *lenp)
+{
+	struct property *pp;
+
+	for (pp = np->properties; pp != 0; pp = pp->next)
+		if (strcmp(pp->name, name) == 0) {
+			if (lenp != 0)
+				*lenp = pp->length;
+			return pp->value;
+		}
+	return NULL;
+}
+EXPORT_SYMBOL(get_property);
+
+/*
+ * Add a property to a node
+ */
+void
+prom_add_property(struct device_node* np, struct property* prop)
+{
+	struct property **next = &np->properties;
+
+	prop->next = NULL;	
+	while (*next)
+		next = &(*next)->next;
+	*next = prop;
+}
+
+#if 0
+void
+print_properties(struct device_node *np)
+{
+	struct property *pp;
+	char *cp;
+	int i, n;
+
+	for (pp = np->properties; pp != 0; pp = pp->next) {
+		printk(KERN_INFO "%s", pp->name);
+		for (i = strlen(pp->name); i < 16; ++i)
+			printk(" ");
+		cp = (char *) pp->value;
+		for (i = pp->length; i > 0; --i, ++cp)
+			if ((i > 1 && (*cp < 0x20 || *cp > 0x7e))
+			    || (i == 1 && *cp != 0))
+				break;
+		if (i == 0 && pp->length > 1) {
+			/* looks like a string */
+			printk(" %s\n", (char *) pp->value);
+		} else {
+			/* dump it in hex */
+			n = pp->length;
+			if (n > 64)
+				n = 64;
+			if (pp->length % 4 == 0) {
+				unsigned int *p = (unsigned int *) pp->value;
+
+				n /= 4;
+				for (i = 0; i < n; ++i) {
+					if (i != 0 && (i % 4) == 0)
+						printk("\n                ");
+					printk(" %08x", *p++);
+				}
+			} else {
+				unsigned char *bp = pp->value;
+
+				for (i = 0; i < n; ++i) {
+					if (i != 0 && (i % 16) == 0)
+						printk("\n                ");
+					printk(" %02x", *bp++);
+				}
+			}
+			printk("\n");
+			if (pp->length > 64)
+				printk("                 ... (length = %d)\n",
+				       pp->length);
+		}
+	}
+}
+#endif
+
+
+
+
+
+
+
+
+
+
diff --git a/arch/ppc64/kernel/prom_init.c b/arch/ppc64/kernel/prom_init.c
new file mode 100644
index 00000000000..8dffa9ae262
--- /dev/null
+++ b/arch/ppc64/kernel/prom_init.c
@@ -0,0 +1,1838 @@
+/*
+ * 
+ *
+ * Procedures for interfacing to Open Firmware.
+ *
+ * Paul Mackerras	August 1996.
+ * Copyright (C) 1996 Paul Mackerras.
+ * 
+ *  Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
+ *    {engebret|bergner}@us.ibm.com 
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG_PROM
+
+#include <stdarg.h>
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/version.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/stringify.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/bitops.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/abs_addr.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/system.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pci.h>
+#include <asm/iommu.h>
+#include <asm/bootinfo.h>
+#include <asm/ppcdebug.h>
+#include <asm/btext.h>
+#include <asm/sections.h>
+#include <asm/machdep.h>
+
+#ifdef CONFIG_LOGO_LINUX_CLUT224
+#include <linux/linux_logo.h>
+extern const struct linux_logo logo_linux_clut224;
+#endif
+
+/*
+ * Properties whose value is longer than this get excluded from our
+ * copy of the device tree. This value does need to be big enough to
+ * ensure that we don't lose things like the interrupt-map property
+ * on a PCI-PCI bridge.
+ */
+#define MAX_PROPERTY_LENGTH	(1UL * 1024 * 1024)
+
+/*
+ * Eventually bump that one up
+ */
+#define DEVTREE_CHUNK_SIZE	0x100000
+
+/*
+ * This is the size of the local memory reserve map that gets copied
+ * into the boot params passed to the kernel. That size is totally
+ * flexible as the kernel just reads the list until it encounters an
+ * entry with size 0, so it can be changed without breaking binary
+ * compatibility
+ */
+#define MEM_RESERVE_MAP_SIZE	8
+
+/*
+ * prom_init() is called very early on, before the kernel text
+ * and data have been mapped to KERNELBASE.  At this point the code
+ * is running at whatever address it has been loaded at, so
+ * references to extern and static variables must be relocated
+ * explicitly.  The procedure reloc_offset() returns the address
+ * we're currently running at minus the address we were linked at.
+ * (Note that strings count as static variables.)
+ *
+ * Because OF may have mapped I/O devices into the area starting at
+ * KERNELBASE, particularly on CHRP machines, we can't safely call
+ * OF once the kernel has been mapped to KERNELBASE.  Therefore all
+ * OF calls should be done within prom_init(), and prom_init()
+ * and all routines called within it must be careful to relocate
+ * references as necessary.
+ *
+ * Note that the bss is cleared *after* prom_init runs, so we have
+ * to make sure that any static or extern variables it accesses
+ * are put in the data segment.
+ */
+
+
+#define PROM_BUG() do {						\
+        prom_printf("kernel BUG at %s line 0x%x!\n",		\
+		    RELOC(__FILE__), __LINE__);			\
+        __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR);	\
+} while (0)
+
+#ifdef DEBUG_PROM
+#define prom_debug(x...)	prom_printf(x)
+#else
+#define prom_debug(x...)
+#endif
+
+
+typedef u32 prom_arg_t;
+
+struct prom_args {
+        u32 service;
+        u32 nargs;
+        u32 nret;
+        prom_arg_t args[10];
+        prom_arg_t *rets;     /* Pointer to return values in args[16]. */
+};
+
+struct prom_t {
+	unsigned long entry;
+	ihandle root;
+	ihandle chosen;
+	int cpu;
+	ihandle stdout;
+	ihandle disp_node;
+	struct prom_args args;
+	unsigned long version;
+	unsigned long root_size_cells;
+	unsigned long root_addr_cells;
+};
+
+struct pci_reg_property {
+	struct pci_address addr;
+	u32 size_hi;
+	u32 size_lo;
+};
+
+struct mem_map_entry {
+	u64	base;
+	u64	size;
+};
+
+typedef u32 cell_t;
+
+extern void __start(unsigned long r3, unsigned long r4, unsigned long r5);
+
+extern void enter_prom(struct prom_args *args, unsigned long entry);
+extern void copy_and_flush(unsigned long dest, unsigned long src,
+			   unsigned long size, unsigned long offset);
+
+extern unsigned long klimit;
+
+/* prom structure */
+static struct prom_t __initdata prom;
+
+#define PROM_SCRATCH_SIZE 256
+
+static char __initdata of_stdout_device[256];
+static char __initdata prom_scratch[PROM_SCRATCH_SIZE];
+
+static unsigned long __initdata dt_header_start;
+static unsigned long __initdata dt_struct_start, dt_struct_end;
+static unsigned long __initdata dt_string_start, dt_string_end;
+
+static unsigned long __initdata prom_initrd_start, prom_initrd_end;
+
+static int __initdata iommu_force_on;
+static int __initdata ppc64_iommu_off;
+static int __initdata of_platform;
+
+static char __initdata prom_cmd_line[COMMAND_LINE_SIZE];
+
+static unsigned long __initdata prom_memory_limit;
+static unsigned long __initdata prom_tce_alloc_start;
+static unsigned long __initdata prom_tce_alloc_end;
+
+static unsigned long __initdata alloc_top;
+static unsigned long __initdata alloc_top_high;
+static unsigned long __initdata alloc_bottom;
+static unsigned long __initdata rmo_top;
+static unsigned long __initdata ram_top;
+
+static struct mem_map_entry __initdata mem_reserve_map[MEM_RESERVE_MAP_SIZE];
+static int __initdata mem_reserve_cnt;
+
+static cell_t __initdata regbuf[1024];
+
+
+#define MAX_CPU_THREADS 2
+
+/* TO GO */
+#ifdef CONFIG_HMT
+struct {
+	unsigned int pir;
+	unsigned int threadid;
+} hmt_thread_data[NR_CPUS];
+#endif /* CONFIG_HMT */
+
+/*
+ * This are used in calls to call_prom.  The 4th and following
+ * arguments to call_prom should be 32-bit values.  64 bit values
+ * are truncated to 32 bits (and fortunately don't get interpreted
+ * as two arguments).
+ */
+#define ADDR(x)		(u32) ((unsigned long)(x) - offset)
+
+/* This is the one and *ONLY* place where we actually call open
+ * firmware from, since we need to make sure we're running in 32b
+ * mode when we do.  We switch back to 64b mode upon return.
+ */
+
+#define PROM_ERROR	(-1)
+
+static int __init call_prom(const char *service, int nargs, int nret, ...)
+{
+	int i;
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+	va_list list;
+
+	_prom->args.service = ADDR(service);
+	_prom->args.nargs = nargs;
+	_prom->args.nret = nret;
+	_prom->args.rets = (prom_arg_t *)&(_prom->args.args[nargs]);
+
+	va_start(list, nret);
+	for (i=0; i < nargs; i++)
+		_prom->args.args[i] = va_arg(list, prom_arg_t);
+	va_end(list);
+
+	for (i=0; i < nret ;i++)
+		_prom->args.rets[i] = 0;
+
+	enter_prom(&_prom->args, _prom->entry);
+
+	return (nret > 0) ? _prom->args.rets[0] : 0;
+}
+
+
+static unsigned int __init prom_claim(unsigned long virt, unsigned long size,
+				unsigned long align)
+{
+	return (unsigned int)call_prom("claim", 3, 1,
+				       (prom_arg_t)virt, (prom_arg_t)size,
+				       (prom_arg_t)align);
+}
+
+static void __init prom_print(const char *msg)
+{
+	const char *p, *q;
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+
+	if (_prom->stdout == 0)
+		return;
+
+	for (p = msg; *p != 0; p = q) {
+		for (q = p; *q != 0 && *q != '\n'; ++q)
+			;
+		if (q > p)
+			call_prom("write", 3, 1, _prom->stdout, p, q - p);
+		if (*q == 0)
+			break;
+		++q;
+		call_prom("write", 3, 1, _prom->stdout, ADDR("\r\n"), 2);
+	}
+}
+
+
+static void __init prom_print_hex(unsigned long val)
+{
+	unsigned long offset = reloc_offset();
+	int i, nibbles = sizeof(val)*2;
+	char buf[sizeof(val)*2+1];
+	struct prom_t *_prom = PTRRELOC(&prom);
+
+	for (i = nibbles-1;  i >= 0;  i--) {
+		buf[i] = (val & 0xf) + '0';
+		if (buf[i] > '9')
+			buf[i] += ('a'-'0'-10);
+		val >>= 4;
+	}
+	buf[nibbles] = '\0';
+	call_prom("write", 3, 1, _prom->stdout, buf, nibbles);
+}
+
+
+static void __init prom_printf(const char *format, ...)
+{
+	unsigned long offset = reloc_offset();
+	const char *p, *q, *s;
+	va_list args;
+	unsigned long v;
+	struct prom_t *_prom = PTRRELOC(&prom);
+
+	va_start(args, format);
+	for (p = PTRRELOC(format); *p != 0; p = q) {
+		for (q = p; *q != 0 && *q != '\n' && *q != '%'; ++q)
+			;
+		if (q > p)
+			call_prom("write", 3, 1, _prom->stdout, p, q - p);
+		if (*q == 0)
+			break;
+		if (*q == '\n') {
+			++q;
+			call_prom("write", 3, 1, _prom->stdout,
+				  ADDR("\r\n"), 2);
+			continue;
+		}
+		++q;
+		if (*q == 0)
+			break;
+		switch (*q) {
+		case 's':
+			++q;
+			s = va_arg(args, const char *);
+			prom_print(s);
+			break;
+		case 'x':
+			++q;
+			v = va_arg(args, unsigned long);
+			prom_print_hex(v);
+			break;
+		}
+	}
+}
+
+
+static void __init __attribute__((noreturn)) prom_panic(const char *reason)
+{
+	unsigned long offset = reloc_offset();
+
+	prom_print(PTRRELOC(reason));
+	/* ToDo: should put up an SRC here */
+	call_prom("exit", 0, 0);
+
+	for (;;)			/* should never get here */
+		;
+}
+
+
+static int __init prom_next_node(phandle *nodep)
+{
+	phandle node;
+
+	if ((node = *nodep) != 0
+	    && (*nodep = call_prom("child", 1, 1, node)) != 0)
+		return 1;
+	if ((*nodep = call_prom("peer", 1, 1, node)) != 0)
+		return 1;
+	for (;;) {
+		if ((node = call_prom("parent", 1, 1, node)) == 0)
+			return 0;
+		if ((*nodep = call_prom("peer", 1, 1, node)) != 0)
+			return 1;
+	}
+}
+
+static int __init prom_getprop(phandle node, const char *pname,
+			       void *value, size_t valuelen)
+{
+	unsigned long offset = reloc_offset();
+
+	return call_prom("getprop", 4, 1, node, ADDR(pname),
+			 (u32)(unsigned long) value, (u32) valuelen);
+}
+
+static int __init prom_getproplen(phandle node, const char *pname)
+{
+	unsigned long offset = reloc_offset();
+
+	return call_prom("getproplen", 2, 1, node, ADDR(pname));
+}
+
+static int __init prom_setprop(phandle node, const char *pname,
+			       void *value, size_t valuelen)
+{
+	unsigned long offset = reloc_offset();
+
+	return call_prom("setprop", 4, 1, node, ADDR(pname),
+			 (u32)(unsigned long) value, (u32) valuelen);
+}
+
+/* We can't use the standard versions because of RELOC headaches. */
+#define isxdigit(c)	(('0' <= (c) && (c) <= '9') \
+			 || ('a' <= (c) && (c) <= 'f') \
+			 || ('A' <= (c) && (c) <= 'F'))
+
+#define isdigit(c)	('0' <= (c) && (c) <= '9')
+#define islower(c)	('a' <= (c) && (c) <= 'z')
+#define toupper(c)	(islower(c) ? ((c) - 'a' + 'A') : (c))
+
+unsigned long prom_strtoul(const char *cp, const char **endp)
+{
+	unsigned long result = 0, base = 10, value;
+
+	if (*cp == '0') {
+		base = 8;
+		cp++;
+		if (toupper(*cp) == 'X') {
+			cp++;
+			base = 16;
+		}
+	}
+
+	while (isxdigit(*cp) &&
+	       (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) {
+		result = result * base + value;
+		cp++;
+	}
+
+	if (endp)
+		*endp = cp;
+
+	return result;
+}
+
+unsigned long prom_memparse(const char *ptr, const char **retptr)
+{
+	unsigned long ret = prom_strtoul(ptr, retptr);
+	int shift = 0;
+
+	/*
+	 * We can't use a switch here because GCC *may* generate a
+	 * jump table which won't work, because we're not running at
+	 * the address we're linked at.
+	 */
+	if ('G' == **retptr || 'g' == **retptr)
+		shift = 30;
+
+	if ('M' == **retptr || 'm' == **retptr)
+		shift = 20;
+
+	if ('K' == **retptr || 'k' == **retptr)
+		shift = 10;
+
+	if (shift) {
+		ret <<= shift;
+		(*retptr)++;
+	}
+
+	return ret;
+}
+
+/*
+ * Early parsing of the command line passed to the kernel, used for
+ * "mem=x" and the options that affect the iommu
+ */
+static void __init early_cmdline_parse(void)
+{
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+	char *opt, *p;
+	int l = 0;
+
+	RELOC(prom_cmd_line[0]) = 0;
+	p = RELOC(prom_cmd_line);
+	if ((long)_prom->chosen > 0)
+		l = prom_getprop(_prom->chosen, "bootargs", p, COMMAND_LINE_SIZE-1);
+#ifdef CONFIG_CMDLINE
+	if (l == 0) /* dbl check */
+		strlcpy(RELOC(prom_cmd_line),
+			RELOC(CONFIG_CMDLINE), sizeof(prom_cmd_line));
+#endif /* CONFIG_CMDLINE */
+	prom_printf("command line: %s\n", RELOC(prom_cmd_line));
+
+	opt = strstr(RELOC(prom_cmd_line), RELOC("iommu="));
+	if (opt) {
+		prom_printf("iommu opt is: %s\n", opt);
+		opt += 6;
+		while (*opt && *opt == ' ')
+			opt++;
+		if (!strncmp(opt, RELOC("off"), 3))
+			RELOC(ppc64_iommu_off) = 1;
+		else if (!strncmp(opt, RELOC("force"), 5))
+			RELOC(iommu_force_on) = 1;
+	}
+
+	opt = strstr(RELOC(prom_cmd_line), RELOC("mem="));
+	if (opt) {
+		opt += 4;
+		RELOC(prom_memory_limit) = prom_memparse(opt, (const char **)&opt);
+		/* Align to 16 MB == size of large page */
+		RELOC(prom_memory_limit) = ALIGN(RELOC(prom_memory_limit), 0x1000000);
+	}
+}
+
+/*
+ * Memory allocation strategy... our layout is normally:
+ *
+ *  at 14Mb or more we vmlinux, then a gap and initrd. In some rare cases, initrd
+ *  might end up beeing before the kernel though. We assume this won't override
+ *  the final kernel at 0, we have no provision to handle that in this version,
+ *  but it should hopefully never happen.
+ *
+ *  alloc_top is set to the top of RMO, eventually shrink down if the TCEs overlap
+ *  alloc_bottom is set to the top of kernel/initrd
+ *
+ *  from there, allocations are done that way : rtas is allocated topmost, and
+ *  the device-tree is allocated from the bottom. We try to grow the device-tree
+ *  allocation as we progress. If we can't, then we fail, we don't currently have
+ *  a facility to restart elsewhere, but that shouldn't be necessary neither
+ *
+ *  Note that calls to reserve_mem have to be done explicitely, memory allocated
+ *  with either alloc_up or alloc_down isn't automatically reserved.
+ */
+
+
+/*
+ * Allocates memory in the RMO upward from the kernel/initrd
+ *
+ * When align is 0, this is a special case, it means to allocate in place
+ * at the current location of alloc_bottom or fail (that is basically
+ * extending the previous allocation). Used for the device-tree flattening
+ */
+static unsigned long __init alloc_up(unsigned long size, unsigned long align)
+{
+	unsigned long offset = reloc_offset();
+	unsigned long base = _ALIGN_UP(RELOC(alloc_bottom), align);
+	unsigned long addr = 0;
+
+	prom_debug("alloc_up(%x, %x)\n", size, align);
+	if (RELOC(ram_top) == 0)
+		prom_panic("alloc_up() called with mem not initialized\n");
+
+	if (align)
+		base = _ALIGN_UP(RELOC(alloc_bottom), align);
+	else
+		base = RELOC(alloc_bottom);
+
+	for(; (base + size) <= RELOC(alloc_top); 
+	    base = _ALIGN_UP(base + 0x100000, align)) {
+		prom_debug("    trying: 0x%x\n\r", base);
+		addr = (unsigned long)prom_claim(base, size, 0);
+		if ((int)addr != PROM_ERROR)
+			break;
+		addr = 0;
+		if (align == 0)
+			break;
+	}
+	if (addr == 0)
+		return 0;
+	RELOC(alloc_bottom) = addr;
+
+	prom_debug(" -> %x\n", addr);
+	prom_debug("  alloc_bottom : %x\n", RELOC(alloc_bottom));
+	prom_debug("  alloc_top    : %x\n", RELOC(alloc_top));
+	prom_debug("  alloc_top_hi : %x\n", RELOC(alloc_top_high));
+	prom_debug("  rmo_top      : %x\n", RELOC(rmo_top));
+	prom_debug("  ram_top      : %x\n", RELOC(ram_top));
+
+	return addr;
+}
+
+/*
+ * Allocates memory downard, either from top of RMO, or if highmem
+ * is set, from the top of RAM. Note that this one doesn't handle
+ * failures. In does claim memory if highmem is not set.
+ */
+static unsigned long __init alloc_down(unsigned long size, unsigned long align,
+				       int highmem)
+{
+	unsigned long offset = reloc_offset();
+	unsigned long base, addr = 0;
+
+	prom_debug("alloc_down(%x, %x, %s)\n", size, align,
+		   highmem ? RELOC("(high)") : RELOC("(low)"));
+	if (RELOC(ram_top) == 0)
+		prom_panic("alloc_down() called with mem not initialized\n");
+
+	if (highmem) {
+		/* Carve out storage for the TCE table. */
+		addr = _ALIGN_DOWN(RELOC(alloc_top_high) - size, align);
+		if (addr <= RELOC(alloc_bottom))
+			return 0;
+		else {
+			/* Will we bump into the RMO ? If yes, check out that we
+			 * didn't overlap existing allocations there, if we did,
+			 * we are dead, we must be the first in town !
+			 */
+			if (addr < RELOC(rmo_top)) {
+				/* Good, we are first */
+				if (RELOC(alloc_top) == RELOC(rmo_top))
+					RELOC(alloc_top) = RELOC(rmo_top) = addr;
+				else
+					return 0;
+			}
+			RELOC(alloc_top_high) = addr;
+		}
+		goto bail;
+	}
+
+	base = _ALIGN_DOWN(RELOC(alloc_top) - size, align);
+	for(; base > RELOC(alloc_bottom); base = _ALIGN_DOWN(base - 0x100000, align))  {
+		prom_debug("    trying: 0x%x\n\r", base);
+		addr = (unsigned long)prom_claim(base, size, 0);
+		if ((int)addr != PROM_ERROR)
+			break;
+		addr = 0;
+	}
+	if (addr == 0)
+		return 0;
+	RELOC(alloc_top) = addr;
+
+ bail:
+	prom_debug(" -> %x\n", addr);
+	prom_debug("  alloc_bottom : %x\n", RELOC(alloc_bottom));
+	prom_debug("  alloc_top    : %x\n", RELOC(alloc_top));
+	prom_debug("  alloc_top_hi : %x\n", RELOC(alloc_top_high));
+	prom_debug("  rmo_top      : %x\n", RELOC(rmo_top));
+	prom_debug("  ram_top      : %x\n", RELOC(ram_top));
+
+	return addr;
+}
+
+/*
+ * Parse a "reg" cell
+ */
+static unsigned long __init prom_next_cell(int s, cell_t **cellp)
+{
+	cell_t *p = *cellp;
+	unsigned long r = 0;
+
+	/* Ignore more than 2 cells */
+	while (s > 2) {
+		p++;
+		s--;
+	}
+	while (s) {
+		r <<= 32;
+		r |= *(p++);
+		s--;
+	}
+
+	*cellp = p;
+	return r;
+}
+
+/*
+ * Very dumb function for adding to the memory reserve list, but
+ * we don't need anything smarter at this point
+ *
+ * XXX Eventually check for collisions. They should NEVER happen
+ * if problems seem to show up, it would be a good start to track
+ * them down.
+ */
+static void reserve_mem(unsigned long base, unsigned long size)
+{
+	unsigned long offset = reloc_offset();
+	unsigned long top = base + size;
+	unsigned long cnt = RELOC(mem_reserve_cnt);
+
+	if (size == 0)
+		return;
+
+	/* We need to always keep one empty entry so that we
+	 * have our terminator with "size" set to 0 since we are
+	 * dumb and just copy this entire array to the boot params
+	 */
+	base = _ALIGN_DOWN(base, PAGE_SIZE);
+	top = _ALIGN_UP(top, PAGE_SIZE);
+	size = top - base;
+
+	if (cnt >= (MEM_RESERVE_MAP_SIZE - 1))
+		prom_panic("Memory reserve map exhausted !\n");
+	RELOC(mem_reserve_map)[cnt].base = base;
+	RELOC(mem_reserve_map)[cnt].size = size;
+	RELOC(mem_reserve_cnt) = cnt + 1;
+}
+
+/*
+ * Initialize memory allocation mecanism, parse "memory" nodes and
+ * obtain that way the top of memory and RMO to setup out local allocator
+ */
+static void __init prom_init_mem(void)
+{
+	phandle node;
+	char *path, type[64];
+	unsigned int plen;
+	cell_t *p, *endp;
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+
+	/*
+	 * We iterate the memory nodes to find
+	 * 1) top of RMO (first node)
+	 * 2) top of memory
+	 */
+	prom_debug("root_addr_cells: %x\n", (long)_prom->root_addr_cells);
+	prom_debug("root_size_cells: %x\n", (long)_prom->root_size_cells);
+
+	prom_debug("scanning memory:\n");
+	path = RELOC(prom_scratch);
+
+	for (node = 0; prom_next_node(&node); ) {
+		type[0] = 0;
+		prom_getprop(node, "device_type", type, sizeof(type));
+
+		if (strcmp(type, RELOC("memory")))
+			continue;
+	
+		plen = prom_getprop(node, "reg", RELOC(regbuf), sizeof(regbuf));
+		if (plen > sizeof(regbuf)) {
+			prom_printf("memory node too large for buffer !\n");
+			plen = sizeof(regbuf);
+		}
+		p = RELOC(regbuf);
+		endp = p + (plen / sizeof(cell_t));
+
+#ifdef DEBUG_PROM
+		memset(path, 0, PROM_SCRATCH_SIZE);
+		call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1);
+		prom_debug("  node %s :\n", path);
+#endif /* DEBUG_PROM */
+
+		while ((endp - p) >= (_prom->root_addr_cells + _prom->root_size_cells)) {
+			unsigned long base, size;
+
+			base = prom_next_cell(_prom->root_addr_cells, &p);
+			size = prom_next_cell(_prom->root_size_cells, &p);
+
+			if (size == 0)
+				continue;
+			prom_debug("    %x %x\n", base, size);
+			if (base == 0)
+				RELOC(rmo_top) = size;
+			if ((base + size) > RELOC(ram_top))
+				RELOC(ram_top) = base + size;
+		}
+	}
+
+	RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(klimit) - offset + 0x4000);
+
+	/* Check if we have an initrd after the kernel, if we do move our bottom
+	 * point to after it
+	 */
+	if (RELOC(prom_initrd_start)) {
+		if (RELOC(prom_initrd_end) > RELOC(alloc_bottom))
+			RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end));
+	}
+
+	/*
+	 * If prom_memory_limit is set we reduce the upper limits *except* for
+	 * alloc_top_high. This must be the real top of RAM so we can put
+	 * TCE's up there.
+	 */
+
+	RELOC(alloc_top_high) = RELOC(ram_top);
+
+	if (RELOC(prom_memory_limit)) {
+		if (RELOC(prom_memory_limit) <= RELOC(alloc_bottom)) {
+			prom_printf("Ignoring mem=%x <= alloc_bottom.\n",
+				RELOC(prom_memory_limit));
+			RELOC(prom_memory_limit) = 0;
+		} else if (RELOC(prom_memory_limit) >= RELOC(ram_top)) {
+			prom_printf("Ignoring mem=%x >= ram_top.\n",
+				RELOC(prom_memory_limit));
+			RELOC(prom_memory_limit) = 0;
+		} else {
+			RELOC(ram_top) = RELOC(prom_memory_limit);
+			RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(prom_memory_limit));
+		}
+	}
+
+	/*
+	 * Setup our top alloc point, that is top of RMO or top of
+	 * segment 0 when running non-LPAR.
+	 */
+	if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
+		RELOC(alloc_top) = RELOC(rmo_top);
+	else
+		RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
+
+	prom_printf("memory layout at init:\n");
+	prom_printf("  memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit));
+	prom_printf("  alloc_bottom : %x\n", RELOC(alloc_bottom));
+	prom_printf("  alloc_top    : %x\n", RELOC(alloc_top));
+	prom_printf("  alloc_top_hi : %x\n", RELOC(alloc_top_high));
+	prom_printf("  rmo_top      : %x\n", RELOC(rmo_top));
+	prom_printf("  ram_top      : %x\n", RELOC(ram_top));
+}
+
+
+/*
+ * Allocate room for and instanciate RTAS
+ */
+static void __init prom_instantiate_rtas(void)
+{
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+	phandle prom_rtas, rtas_node;
+	u32 base, entry = 0;
+	u32 size = 0;
+
+	prom_debug("prom_instantiate_rtas: start...\n");
+
+	prom_rtas = call_prom("finddevice", 1, 1, ADDR("/rtas"));
+	prom_debug("prom_rtas: %x\n", prom_rtas);
+	if (prom_rtas == (phandle) -1)
+		return;
+
+	prom_getprop(prom_rtas, "rtas-size", &size, sizeof(size));
+	if (size == 0)
+		return;
+
+	base = alloc_down(size, PAGE_SIZE, 0);
+	if (base == 0) {
+		prom_printf("RTAS allocation failed !\n");
+		return;
+	}
+	prom_printf("instantiating rtas at 0x%x", base);
+
+	rtas_node = call_prom("open", 1, 1, ADDR("/rtas"));
+	prom_printf("...");
+
+	if (call_prom("call-method", 3, 2,
+		      ADDR("instantiate-rtas"),
+		      rtas_node, base) != PROM_ERROR) {
+		entry = (long)_prom->args.rets[1];
+	}
+	if (entry == 0) {
+		prom_printf(" failed\n");
+		return;
+	}
+	prom_printf(" done\n");
+
+	reserve_mem(base, size);
+
+	prom_setprop(prom_rtas, "linux,rtas-base", &base, sizeof(base));
+	prom_setprop(prom_rtas, "linux,rtas-entry", &entry, sizeof(entry));
+
+	prom_debug("rtas base     = 0x%x\n", base);
+	prom_debug("rtas entry    = 0x%x\n", entry);
+	prom_debug("rtas size     = 0x%x\n", (long)size);
+
+	prom_debug("prom_instantiate_rtas: end...\n");
+}
+
+
+/*
+ * Allocate room for and initialize TCE tables
+ */
+static void __init prom_initialize_tce_table(void)
+{
+	phandle node;
+	ihandle phb_node;
+	unsigned long offset = reloc_offset();
+	char compatible[64], type[64], model[64];
+	char *path = RELOC(prom_scratch);
+	u64 base, align;
+	u32 minalign, minsize;
+	u64 tce_entry, *tce_entryp;
+	u64 local_alloc_top, local_alloc_bottom;
+	u64 i;
+
+	if (RELOC(ppc64_iommu_off))
+		return;
+
+	prom_debug("starting prom_initialize_tce_table\n");
+
+	/* Cache current top of allocs so we reserve a single block */
+	local_alloc_top = RELOC(alloc_top_high);
+	local_alloc_bottom = local_alloc_top;
+
+	/* Search all nodes looking for PHBs. */
+	for (node = 0; prom_next_node(&node); ) {
+		compatible[0] = 0;
+		type[0] = 0;
+		model[0] = 0;
+		prom_getprop(node, "compatible",
+			     compatible, sizeof(compatible));
+		prom_getprop(node, "device_type", type, sizeof(type));
+		prom_getprop(node, "model", model, sizeof(model));
+
+		if ((type[0] == 0) || (strstr(type, RELOC("pci")) == NULL))
+			continue;
+
+		/* Keep the old logic in tack to avoid regression. */
+		if (compatible[0] != 0) {
+			if ((strstr(compatible, RELOC("python")) == NULL) &&
+			    (strstr(compatible, RELOC("Speedwagon")) == NULL) &&
+			    (strstr(compatible, RELOC("Winnipeg")) == NULL))
+				continue;
+		} else if (model[0] != 0) {
+			if ((strstr(model, RELOC("ython")) == NULL) &&
+			    (strstr(model, RELOC("peedwagon")) == NULL) &&
+			    (strstr(model, RELOC("innipeg")) == NULL))
+				continue;
+		}
+
+		if (prom_getprop(node, "tce-table-minalign", &minalign,
+				 sizeof(minalign)) == PROM_ERROR)
+			minalign = 0;
+		if (prom_getprop(node, "tce-table-minsize", &minsize,
+				 sizeof(minsize)) == PROM_ERROR)
+			minsize = 4UL << 20;
+
+		/*
+		 * Even though we read what OF wants, we just set the table
+		 * size to 4 MB.  This is enough to map 2GB of PCI DMA space.
+		 * By doing this, we avoid the pitfalls of trying to DMA to
+		 * MMIO space and the DMA alias hole.
+		 *
+		 * On POWER4, firmware sets the TCE region by assuming
+		 * each TCE table is 8MB. Using this memory for anything
+		 * else will impact performance, so we always allocate 8MB.
+		 * Anton
+		 */
+		if (__is_processor(PV_POWER4) || __is_processor(PV_POWER4p))
+			minsize = 8UL << 20;
+		else
+			minsize = 4UL << 20;
+
+		/* Align to the greater of the align or size */
+		align = max(minalign, minsize);
+		base = alloc_down(minsize, align, 1);
+		if (base == 0)
+			prom_panic("ERROR, cannot find space for TCE table.\n");
+		if (base < local_alloc_bottom)
+			local_alloc_bottom = base;
+
+		/* Save away the TCE table attributes for later use. */
+		prom_setprop(node, "linux,tce-base", &base, sizeof(base));
+		prom_setprop(node, "linux,tce-size", &minsize, sizeof(minsize));
+
+		/* It seems OF doesn't null-terminate the path :-( */
+		memset(path, 0, sizeof(path));
+		/* Call OF to setup the TCE hardware */
+		if (call_prom("package-to-path", 3, 1, node,
+			      path, PROM_SCRATCH_SIZE-1) == PROM_ERROR) {
+			prom_printf("package-to-path failed\n");
+		}
+
+		prom_debug("TCE table: %s\n", path);
+		prom_debug("\tnode = 0x%x\n", node);
+		prom_debug("\tbase = 0x%x\n", base);
+		prom_debug("\tsize = 0x%x\n", minsize);
+
+		/* Initialize the table to have a one-to-one mapping
+		 * over the allocated size.
+		 */
+		tce_entryp = (unsigned long *)base;
+		for (i = 0; i < (minsize >> 3) ;tce_entryp++, i++) {
+			tce_entry = (i << PAGE_SHIFT);
+			tce_entry |= 0x3;
+			*tce_entryp = tce_entry;
+		}
+
+		prom_printf("opening PHB %s", path);
+		phb_node = call_prom("open", 1, 1, path);
+		if ( (long)phb_node <= 0)
+			prom_printf("... failed\n");
+		else
+			prom_printf("... done\n");
+
+		call_prom("call-method", 6, 0, ADDR("set-64-bit-addressing"),
+			  phb_node, -1, minsize,
+			  (u32) base, (u32) (base >> 32));
+		call_prom("close", 1, 0, phb_node);
+	}
+
+	reserve_mem(local_alloc_bottom, local_alloc_top - local_alloc_bottom);
+
+	if (RELOC(prom_memory_limit)) {
+		/*
+		 * We align the start to a 16MB boundary so we can map the TCE area
+		 * using large pages if possible. The end should be the top of RAM
+		 * so no need to align it.
+		 */
+		RELOC(prom_tce_alloc_start) = _ALIGN_DOWN(local_alloc_bottom, 0x1000000);
+		RELOC(prom_tce_alloc_end) = local_alloc_top;
+	}
+
+	/* Flag the first invalid entry */
+	prom_debug("ending prom_initialize_tce_table\n");
+}
+
+/*
+ * With CHRP SMP we need to use the OF to start the other
+ * processors so we can't wait until smp_boot_cpus (the OF is
+ * trashed by then) so we have to put the processors into
+ * a holding pattern controlled by the kernel (not OF) before
+ * we destroy the OF.
+ *
+ * This uses a chunk of low memory, puts some holding pattern
+ * code there and sends the other processors off to there until
+ * smp_boot_cpus tells them to do something.  The holding pattern
+ * checks that address until its cpu # is there, when it is that
+ * cpu jumps to __secondary_start().  smp_boot_cpus() takes care
+ * of setting those values.
+ *
+ * We also use physical address 0x4 here to tell when a cpu
+ * is in its holding pattern code.
+ *
+ * Fixup comment... DRENG / PPPBBB - Peter
+ *
+ * -- Cort
+ */
+static void __init prom_hold_cpus(void)
+{
+	unsigned long i;
+	unsigned int reg;
+	phandle node;
+	unsigned long offset = reloc_offset();
+	char type[64];
+	int cpuid = 0;
+	unsigned int interrupt_server[MAX_CPU_THREADS];
+	unsigned int cpu_threads, hw_cpu_num;
+	int propsize;
+	extern void __secondary_hold(void);
+	extern unsigned long __secondary_hold_spinloop;
+	extern unsigned long __secondary_hold_acknowledge;
+	unsigned long *spinloop
+		= (void *)virt_to_abs(&__secondary_hold_spinloop);
+	unsigned long *acknowledge
+		= (void *)virt_to_abs(&__secondary_hold_acknowledge);
+	unsigned long secondary_hold
+		= virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold));
+	struct prom_t *_prom = PTRRELOC(&prom);
+
+	prom_debug("prom_hold_cpus: start...\n");
+	prom_debug("    1) spinloop       = 0x%x\n", (unsigned long)spinloop);
+	prom_debug("    1) *spinloop      = 0x%x\n", *spinloop);
+	prom_debug("    1) acknowledge    = 0x%x\n",
+		   (unsigned long)acknowledge);
+	prom_debug("    1) *acknowledge   = 0x%x\n", *acknowledge);
+	prom_debug("    1) secondary_hold = 0x%x\n", secondary_hold);
+
+	/* Set the common spinloop variable, so all of the secondary cpus
+	 * will block when they are awakened from their OF spinloop.
+	 * This must occur for both SMP and non SMP kernels, since OF will
+	 * be trashed when we move the kernel.
+	 */
+	*spinloop = 0;
+
+#ifdef CONFIG_HMT
+	for (i=0; i < NR_CPUS; i++) {
+		RELOC(hmt_thread_data)[i].pir = 0xdeadbeef;
+	}
+#endif
+	/* look for cpus */
+	for (node = 0; prom_next_node(&node); ) {
+		type[0] = 0;
+		prom_getprop(node, "device_type", type, sizeof(type));
+		if (strcmp(type, RELOC("cpu")) != 0)
+			continue;
+
+		/* Skip non-configured cpus. */
+		if (prom_getprop(node, "status", type, sizeof(type)) > 0)
+			if (strcmp(type, RELOC("okay")) != 0)
+				continue;
+
+		reg = -1;
+		prom_getprop(node, "reg", &reg, sizeof(reg));
+
+		prom_debug("\ncpuid        = 0x%x\n", cpuid);
+		prom_debug("cpu hw idx   = 0x%x\n", reg);
+
+		/* Init the acknowledge var which will be reset by
+		 * the secondary cpu when it awakens from its OF
+		 * spinloop.
+		 */
+		*acknowledge = (unsigned long)-1;
+
+		propsize = prom_getprop(node, "ibm,ppc-interrupt-server#s",
+					&interrupt_server,
+					sizeof(interrupt_server));
+		if (propsize < 0) {
+			/* no property.  old hardware has no SMT */
+			cpu_threads = 1;
+			interrupt_server[0] = reg; /* fake it with phys id */
+		} else {
+			/* We have a threaded processor */
+			cpu_threads = propsize / sizeof(u32);
+			if (cpu_threads > MAX_CPU_THREADS) {
+				prom_printf("SMT: too many threads!\n"
+					    "SMT: found %x, max is %x\n",
+					    cpu_threads, MAX_CPU_THREADS);
+				cpu_threads = 1; /* ToDo: panic? */
+			}
+		}
+
+		hw_cpu_num = interrupt_server[0];
+		if (hw_cpu_num != _prom->cpu) {
+			/* Primary Thread of non-boot cpu */
+			prom_printf("%x : starting cpu hw idx %x... ", cpuid, reg);
+			call_prom("start-cpu", 3, 0, node,
+				  secondary_hold, reg);
+
+			for ( i = 0 ; (i < 100000000) && 
+			      (*acknowledge == ((unsigned long)-1)); i++ )
+				mb();
+
+			if (*acknowledge == reg) {
+				prom_printf("done\n");
+				/* We have to get every CPU out of OF,
+				 * even if we never start it. */
+				if (cpuid >= NR_CPUS)
+					goto next;
+			} else {
+				prom_printf("failed: %x\n", *acknowledge);
+			}
+		}
+#ifdef CONFIG_SMP
+		else
+			prom_printf("%x : boot cpu     %x\n", cpuid, reg);
+#endif
+next:
+#ifdef CONFIG_SMP
+		/* Init paca for secondary threads.   They start later. */
+		for (i=1; i < cpu_threads; i++) {
+			cpuid++;
+			if (cpuid >= NR_CPUS)
+				continue;
+		}
+#endif /* CONFIG_SMP */
+		cpuid++;
+	}
+#ifdef CONFIG_HMT
+	/* Only enable HMT on processors that provide support. */
+	if (__is_processor(PV_PULSAR) || 
+	    __is_processor(PV_ICESTAR) ||
+	    __is_processor(PV_SSTAR)) {
+		prom_printf("    starting secondary threads\n");
+
+		for (i = 0; i < NR_CPUS; i += 2) {
+			if (!cpu_online(i))
+				continue;
+
+			if (i == 0) {
+				unsigned long pir = mfspr(SPRN_PIR);
+				if (__is_processor(PV_PULSAR)) {
+					RELOC(hmt_thread_data)[i].pir = 
+						pir & 0x1f;
+				} else {
+					RELOC(hmt_thread_data)[i].pir = 
+						pir & 0x3ff;
+				}
+			}
+		}
+	} else {
+		prom_printf("Processor is not HMT capable\n");
+	}
+#endif
+
+	if (cpuid > NR_CPUS)
+		prom_printf("WARNING: maximum CPUs (" __stringify(NR_CPUS)
+			    ") exceeded: ignoring extras\n");
+
+	prom_debug("prom_hold_cpus: end...\n");
+}
+
+
+static void __init prom_init_client_services(unsigned long pp)
+{
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+
+	/* Get a handle to the prom entry point before anything else */
+	_prom->entry = pp;
+
+	/* Init default value for phys size */
+	_prom->root_size_cells = 1;
+	_prom->root_addr_cells = 2;
+
+	/* get a handle for the stdout device */
+	_prom->chosen = call_prom("finddevice", 1, 1, ADDR("/chosen"));
+	if ((long)_prom->chosen <= 0)
+		prom_panic("cannot find chosen"); /* msg won't be printed :( */
+
+	/* get device tree root */
+	_prom->root = call_prom("finddevice", 1, 1, ADDR("/"));
+	if ((long)_prom->root <= 0)
+		prom_panic("cannot find device tree root"); /* msg won't be printed :( */
+}
+
+static void __init prom_init_stdout(void)
+{
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+	char *path = RELOC(of_stdout_device);
+	char type[16];
+	u32 val;
+
+	if (prom_getprop(_prom->chosen, "stdout", &val, sizeof(val)) <= 0)
+		prom_panic("cannot find stdout");
+
+	_prom->stdout = val;
+
+	/* Get the full OF pathname of the stdout device */
+	memset(path, 0, 256);
+	call_prom("instance-to-path", 3, 1, _prom->stdout, path, 255);
+	val = call_prom("instance-to-package", 1, 1, _prom->stdout);
+	prom_setprop(_prom->chosen, "linux,stdout-package", &val, sizeof(val));
+	prom_printf("OF stdout device is: %s\n", RELOC(of_stdout_device));
+	prom_setprop(_prom->chosen, "linux,stdout-path",
+		     RELOC(of_stdout_device), strlen(RELOC(of_stdout_device))+1);
+
+	/* If it's a display, note it */
+	memset(type, 0, sizeof(type));
+	prom_getprop(val, "device_type", type, sizeof(type));
+	if (strcmp(type, RELOC("display")) == 0) {
+		_prom->disp_node = val;
+		prom_setprop(val, "linux,boot-display", NULL, 0);
+	}
+}
+
+static void __init prom_close_stdin(void)
+{
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+	ihandle val;
+
+	if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0)
+		call_prom("close", 1, 0, val);
+}
+
+static int __init prom_find_machine_type(void)
+{
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+	char compat[256];
+	int len, i = 0;
+	phandle rtas;
+
+	len = prom_getprop(_prom->root, "compatible",
+			   compat, sizeof(compat)-1);
+	if (len > 0) {
+		compat[len] = 0;
+		while (i < len) {
+			char *p = &compat[i];
+			int sl = strlen(p);
+			if (sl == 0)
+				break;
+			if (strstr(p, RELOC("Power Macintosh")) ||
+			    strstr(p, RELOC("MacRISC4")))
+				return PLATFORM_POWERMAC;
+			if (strstr(p, RELOC("Momentum,Maple")))
+				return PLATFORM_MAPLE;
+			i += sl + 1;
+		}
+	}
+	/* Default to pSeries. We need to know if we are running LPAR */
+	rtas = call_prom("finddevice", 1, 1, ADDR("/rtas"));
+	if (rtas != (phandle) -1) {
+		unsigned long x;
+		x = prom_getproplen(rtas, "ibm,hypertas-functions");
+		if (x != PROM_ERROR) {
+			prom_printf("Hypertas detected, assuming LPAR !\n");
+			return PLATFORM_PSERIES_LPAR;
+		}
+	}
+	return PLATFORM_PSERIES;
+}
+
+static int __init prom_set_color(ihandle ih, int i, int r, int g, int b)
+{
+	unsigned long offset = reloc_offset();
+
+	return call_prom("call-method", 6, 1, ADDR("color!"), ih, i, b, g, r);
+}
+
+/*
+ * If we have a display that we don't know how to drive,
+ * we will want to try to execute OF's open method for it
+ * later.  However, OF will probably fall over if we do that
+ * we've taken over the MMU.
+ * So we check whether we will need to open the display,
+ * and if so, open it now.
+ */
+static void __init prom_check_displays(void)
+{
+	unsigned long offset = reloc_offset();
+	struct prom_t *_prom = PTRRELOC(&prom);
+	char type[16], *path;
+	phandle node;
+	ihandle ih;
+	int i;
+
+	static unsigned char default_colors[] = {
+		0x00, 0x00, 0x00,
+		0x00, 0x00, 0xaa,
+		0x00, 0xaa, 0x00,
+		0x00, 0xaa, 0xaa,
+		0xaa, 0x00, 0x00,
+		0xaa, 0x00, 0xaa,
+		0xaa, 0xaa, 0x00,
+		0xaa, 0xaa, 0xaa,
+		0x55, 0x55, 0x55,
+		0x55, 0x55, 0xff,
+		0x55, 0xff, 0x55,
+		0x55, 0xff, 0xff,
+		0xff, 0x55, 0x55,
+		0xff, 0x55, 0xff,
+		0xff, 0xff, 0x55,
+		0xff, 0xff, 0xff
+	};
+	const unsigned char *clut;
+
+	prom_printf("Looking for displays\n");
+	for (node = 0; prom_next_node(&node); ) {
+		memset(type, 0, sizeof(type));
+		prom_getprop(node, "device_type", type, sizeof(type));
+		if (strcmp(type, RELOC("display")) != 0)
+			continue;
+
+		/* It seems OF doesn't null-terminate the path :-( */
+		path = RELOC(prom_scratch);
+		memset(path, 0, PROM_SCRATCH_SIZE);
+
+		/*
+		 * leave some room at the end of the path for appending extra
+		 * arguments
+		 */
+		if (call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-10) < 0)
+			continue;
+		prom_printf("found display   : %s, opening ... ", path);
+		
+		ih = call_prom("open", 1, 1, path);
+		if (ih == (ihandle)0 || ih == (ihandle)-1) {
+			prom_printf("failed\n");
+			continue;
+		}
+
+		/* Success */
+		prom_printf("done\n");
+		prom_setprop(node, "linux,opened", NULL, 0);
+
+		/*
+		 * stdout wasn't a display node, pick the first we can find
+		 * for btext
+		 */
+		if (_prom->disp_node == 0)
+			_prom->disp_node = node;
+
+		/* Setup a useable color table when the appropriate
+		 * method is available. Should update this to set-colors */
+		clut = RELOC(default_colors);
+		for (i = 0; i < 32; i++, clut += 3)
+			if (prom_set_color(ih, i, clut[0], clut[1],
+					   clut[2]) != 0)
+				break;
+
+#ifdef CONFIG_LOGO_LINUX_CLUT224
+		clut = PTRRELOC(RELOC(logo_linux_clut224.clut));
+		for (i = 0; i < RELOC(logo_linux_clut224.clutsize); i++, clut += 3)
+			if (prom_set_color(ih, i + 32, clut[0], clut[1],
+					   clut[2]) != 0)
+				break;
+#endif /* CONFIG_LOGO_LINUX_CLUT224 */
+	}
+}
+
+
+/* Return (relocated) pointer to this much memory: moves initrd if reqd. */
+static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end,
+			      unsigned long needed, unsigned long align)
+{
+	unsigned long offset = reloc_offset();
+	void *ret;
+
+	*mem_start = _ALIGN(*mem_start, align);
+	while ((*mem_start + needed) > *mem_end) {
+		unsigned long room, chunk;
+
+		prom_debug("Chunk exhausted, claiming more at %x...\n",
+			   RELOC(alloc_bottom));
+		room = RELOC(alloc_top) - RELOC(alloc_bottom);
+		if (room > DEVTREE_CHUNK_SIZE)
+			room = DEVTREE_CHUNK_SIZE;
+		if (room < PAGE_SIZE)
+			prom_panic("No memory for flatten_device_tree (no room)");
+		chunk = alloc_up(room, 0);
+		if (chunk == 0)
+			prom_panic("No memory for flatten_device_tree (claim failed)");
+		*mem_end = RELOC(alloc_top);
+	}
+
+	ret = (void *)*mem_start;
+	*mem_start += needed;
+
+	return ret;
+}
+
+#define dt_push_token(token, mem_start, mem_end) \
+	do { *((u32 *)make_room(mem_start, mem_end, 4, 4)) = token; } while(0)
+
+static unsigned long __init dt_find_string(char *str)
+{
+	unsigned long offset = reloc_offset();
+	char *s, *os;
+
+	s = os = (char *)RELOC(dt_string_start);
+	s += 4;
+	while (s <  (char *)RELOC(dt_string_end)) {
+		if (strcmp(s, str) == 0)
+			return s - os;
+		s += strlen(s) + 1;
+	}
+	return 0;
+}
+
+static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start,
+					 unsigned long *mem_end)
+{
+	unsigned long offset = reloc_offset();
+	char *prev_name, *namep, *sstart;
+	unsigned long soff;
+	phandle child;
+
+	sstart =  (char *)RELOC(dt_string_start);
+
+	/* get and store all property names */
+	prev_name = RELOC("");
+	for (;;) {
+		
+		/* 32 is max len of name including nul. */
+		namep = make_room(mem_start, mem_end, 32, 1);
+		if (call_prom("nextprop", 3, 1, node, prev_name, namep) <= 0) {
+			/* No more nodes: unwind alloc */
+			*mem_start = (unsigned long)namep;
+			break;
+		}
+		soff = dt_find_string(namep);
+		if (soff != 0) {
+			*mem_start = (unsigned long)namep;
+			namep = sstart + soff;
+		} else {
+			/* Trim off some if we can */
+			*mem_start = (unsigned long)namep + strlen(namep) + 1;
+			RELOC(dt_string_end) = *mem_start;
+		}
+		prev_name = namep;
+	}
+
+	/* do all our children */
+	child = call_prom("child", 1, 1, node);
+	while (child != (phandle)0) {
+		scan_dt_build_strings(child, mem_start, mem_end);
+		child = call_prom("peer", 1, 1, child);
+	}
+}
+
+static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
+					unsigned long *mem_end)
+{
+	int l, align;
+	phandle child;
+	char *namep, *prev_name, *sstart;
+	unsigned long soff;
+	unsigned char *valp;
+	unsigned long offset = reloc_offset();
+	char pname[32];
+	char *path;
+
+	path = RELOC(prom_scratch);
+
+	dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end);
+
+	/* get the node's full name */
+	namep = (char *)*mem_start;
+	l = call_prom("package-to-path", 3, 1, node,
+		      namep, *mem_end - *mem_start);
+	if (l >= 0) {
+		/* Didn't fit?  Get more room. */
+		if (l+1 > *mem_end - *mem_start) {
+			namep = make_room(mem_start, mem_end, l+1, 1);
+			call_prom("package-to-path", 3, 1, node, namep, l);
+		}
+		namep[l] = '\0';
+		*mem_start = _ALIGN(((unsigned long) namep) + strlen(namep) + 1, 4);
+	}
+
+	/* get it again for debugging */
+	memset(path, 0, PROM_SCRATCH_SIZE);
+	call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1);
+
+	/* get and store all properties */
+	prev_name = RELOC("");
+	sstart = (char *)RELOC(dt_string_start);
+	for (;;) {
+		if (call_prom("nextprop", 3, 1, node, prev_name, pname) <= 0)
+			break;
+
+		/* find string offset */
+		soff = dt_find_string(pname);
+		if (soff == 0) {
+			prom_printf("WARNING: Can't find string index for <%s>, node %s\n",
+				    pname, path);
+			break;
+		}
+		prev_name = sstart + soff;
+
+		/* get length */
+		l = call_prom("getproplen", 2, 1, node, pname);
+
+		/* sanity checks */
+		if (l < 0)
+			continue;
+		if (l > MAX_PROPERTY_LENGTH) {
+			prom_printf("WARNING: ignoring large property ");
+			/* It seems OF doesn't null-terminate the path :-( */
+			prom_printf("[%s] ", path);
+			prom_printf("%s length 0x%x\n", pname, l);
+			continue;
+		}
+
+		/* push property head */
+		dt_push_token(OF_DT_PROP, mem_start, mem_end);
+		dt_push_token(l, mem_start, mem_end);
+		dt_push_token(soff, mem_start, mem_end);
+
+		/* push property content */
+		align = (l >= 8) ? 8 : 4;
+		valp = make_room(mem_start, mem_end, l, align);
+		call_prom("getprop", 4, 1, node, pname, valp, l);
+		*mem_start = _ALIGN(*mem_start, 4);
+	}
+
+	/* Add a "linux,phandle" property. */
+	soff = dt_find_string(RELOC("linux,phandle"));
+	if (soff == 0)
+		prom_printf("WARNING: Can't find string index for <linux-phandle>"
+			    " node %s\n", path);
+	else {
+		dt_push_token(OF_DT_PROP, mem_start, mem_end);
+		dt_push_token(4, mem_start, mem_end);
+		dt_push_token(soff, mem_start, mem_end);
+		valp = make_room(mem_start, mem_end, 4, 4);
+		*(u32 *)valp = node;
+	}
+
+	/* do all our children */
+	child = call_prom("child", 1, 1, node);
+	while (child != (phandle)0) {
+		scan_dt_build_struct(child, mem_start, mem_end);
+		child = call_prom("peer", 1, 1, child);
+	}
+
+	dt_push_token(OF_DT_END_NODE, mem_start, mem_end);
+}
+
+static void __init flatten_device_tree(void)
+{
+	phandle root;
+	unsigned long offset = reloc_offset();
+	unsigned long mem_start, mem_end, room;
+	struct boot_param_header *hdr;
+	char *namep;
+	u64 *rsvmap;
+
+	/*
+	 * Check how much room we have between alloc top & bottom (+/- a
+	 * few pages), crop to 4Mb, as this is our "chuck" size
+	 */
+	room = RELOC(alloc_top) - RELOC(alloc_bottom) - 0x4000;
+	if (room > DEVTREE_CHUNK_SIZE)
+		room = DEVTREE_CHUNK_SIZE;
+	prom_debug("starting device tree allocs at %x\n", RELOC(alloc_bottom));
+
+	/* Now try to claim that */
+	mem_start = (unsigned long)alloc_up(room, PAGE_SIZE);
+	if (mem_start == 0)
+		prom_panic("Can't allocate initial device-tree chunk\n");
+	mem_end = RELOC(alloc_top);
+
+	/* Get root of tree */
+	root = call_prom("peer", 1, 1, (phandle)0);
+	if (root == (phandle)0)
+		prom_panic ("couldn't get device tree root\n");
+
+	/* Build header and make room for mem rsv map */ 
+	mem_start = _ALIGN(mem_start, 4);
+	hdr = make_room(&mem_start, &mem_end, sizeof(struct boot_param_header), 4);
+	RELOC(dt_header_start) = (unsigned long)hdr;
+	rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8);
+
+	/* Start of strings */
+	mem_start = PAGE_ALIGN(mem_start);
+	RELOC(dt_string_start) = mem_start;
+	mem_start += 4; /* hole */
+
+	/* Add "linux,phandle" in there, we'll need it */
+	namep = make_room(&mem_start, &mem_end, 16, 1);
+	strcpy(namep, RELOC("linux,phandle"));
+	mem_start = (unsigned long)namep + strlen(namep) + 1;
+	RELOC(dt_string_end) = mem_start;
+
+	/* Build string array */
+	prom_printf("Building dt strings...\n"); 
+	scan_dt_build_strings(root, &mem_start, &mem_end);
+
+	/* Build structure */
+	mem_start = PAGE_ALIGN(mem_start);
+	RELOC(dt_struct_start) = mem_start;
+	prom_printf("Building dt structure...\n"); 
+	scan_dt_build_struct(root, &mem_start, &mem_end);
+	dt_push_token(OF_DT_END, &mem_start, &mem_end);
+	RELOC(dt_struct_end) = PAGE_ALIGN(mem_start);
+
+	/* Finish header */
+	hdr->magic = OF_DT_HEADER;
+	hdr->totalsize = RELOC(dt_struct_end) - RELOC(dt_header_start);
+	hdr->off_dt_struct = RELOC(dt_struct_start) - RELOC(dt_header_start);
+	hdr->off_dt_strings = RELOC(dt_string_start) - RELOC(dt_header_start);
+	hdr->off_mem_rsvmap = ((unsigned long)rsvmap) - RELOC(dt_header_start);
+	hdr->version = OF_DT_VERSION;
+	hdr->last_comp_version = 1;
+
+	/* Reserve the whole thing and copy the reserve map in, we
+	 * also bump mem_reserve_cnt to cause further reservations to
+	 * fail since it's too late.
+	 */
+	reserve_mem(RELOC(dt_header_start), hdr->totalsize);
+	memcpy(rsvmap, RELOC(mem_reserve_map), sizeof(mem_reserve_map));
+
+#ifdef DEBUG_PROM
+	{
+		int i;
+		prom_printf("reserved memory map:\n");
+		for (i = 0; i < RELOC(mem_reserve_cnt); i++)
+			prom_printf("  %x - %x\n", RELOC(mem_reserve_map)[i].base,
+				    RELOC(mem_reserve_map)[i].size);
+	}
+#endif
+	RELOC(mem_reserve_cnt) = MEM_RESERVE_MAP_SIZE;
+
+	prom_printf("Device tree strings 0x%x -> 0x%x\n",
+		    RELOC(dt_string_start), RELOC(dt_string_end)); 
+	prom_printf("Device tree struct  0x%x -> 0x%x\n",
+		    RELOC(dt_struct_start), RELOC(dt_struct_end));
+
+ }
+
+static void __init prom_find_boot_cpu(void)
+{
+	unsigned long offset = reloc_offset();
+       	struct prom_t *_prom = PTRRELOC(&prom);
+	u32 getprop_rval;
+	ihandle prom_cpu;
+	phandle cpu_pkg;
+
+	if (prom_getprop(_prom->chosen, "cpu", &prom_cpu, sizeof(prom_cpu)) <= 0)
+		prom_panic("cannot find boot cpu");
+
+	cpu_pkg = call_prom("instance-to-package", 1, 1, prom_cpu);
+
+	prom_setprop(cpu_pkg, "linux,boot-cpu", NULL, 0);
+	prom_getprop(cpu_pkg, "reg", &getprop_rval, sizeof(getprop_rval));
+	_prom->cpu = getprop_rval;
+
+	prom_debug("Booting CPU hw index = 0x%x\n", _prom->cpu);
+}
+
+static void __init prom_check_initrd(unsigned long r3, unsigned long r4)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	unsigned long offset = reloc_offset();
+       	struct prom_t *_prom = PTRRELOC(&prom);
+
+	if ( r3 && r4 && r4 != 0xdeadbeef) {
+		u64 val;
+
+		RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? __pa(r3) : r3;
+		RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4;
+
+		val = (u64)RELOC(prom_initrd_start);
+		prom_setprop(_prom->chosen, "linux,initrd-start", &val, sizeof(val));
+		val = (u64)RELOC(prom_initrd_end);
+		prom_setprop(_prom->chosen, "linux,initrd-end", &val, sizeof(val));
+
+		reserve_mem(RELOC(prom_initrd_start),
+			    RELOC(prom_initrd_end) - RELOC(prom_initrd_start));
+
+		prom_debug("initrd_start=0x%x\n", RELOC(prom_initrd_start));
+		prom_debug("initrd_end=0x%x\n", RELOC(prom_initrd_end));
+	}
+#endif /* CONFIG_BLK_DEV_INITRD */
+}
+
+/*
+ * We enter here early on, when the Open Firmware prom is still
+ * handling exceptions and the MMU hash table for us.
+ */
+
+unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long pp,
+			       unsigned long r6, unsigned long r7)
+{	
+	unsigned long offset = reloc_offset();
+       	struct prom_t *_prom = PTRRELOC(&prom);
+	unsigned long phys = KERNELBASE - offset;
+	u32 getprop_rval;
+	
+	/*
+	 * First zero the BSS
+	 */
+	memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start);
+
+	/*
+	 * Init interface to Open Firmware, get some node references,
+	 * like /chosen
+	 */
+	prom_init_client_services(pp);
+
+	/*
+	 * Init prom stdout device
+	 */
+	prom_init_stdout();
+	prom_debug("klimit=0x%x\n", RELOC(klimit));
+	prom_debug("offset=0x%x\n", offset);
+
+	/*
+	 * Check for an initrd
+	 */
+	prom_check_initrd(r3, r4);
+
+	/*
+	 * Get default machine type. At this point, we do not differenciate
+	 * between pSeries SMP and pSeries LPAR
+	 */
+	RELOC(of_platform) = prom_find_machine_type();
+	getprop_rval = RELOC(of_platform);
+	prom_setprop(_prom->chosen, "linux,platform",
+		     &getprop_rval, sizeof(getprop_rval));
+
+	/*
+	 * On pSeries, copy the CPU hold code
+	 */
+       	if (RELOC(of_platform) & PLATFORM_PSERIES)
+       		copy_and_flush(0, KERNELBASE - offset, 0x100, 0);
+
+	/*
+	 * Get memory cells format
+	 */
+	getprop_rval = 1;
+	prom_getprop(_prom->root, "#size-cells",
+		     &getprop_rval, sizeof(getprop_rval));
+	_prom->root_size_cells = getprop_rval;
+	getprop_rval = 2;
+	prom_getprop(_prom->root, "#address-cells",
+		     &getprop_rval, sizeof(getprop_rval));
+	_prom->root_addr_cells = getprop_rval;
+
+	/*
+	 * Do early parsing of command line
+	 */
+	early_cmdline_parse();
+
+	/*
+	 * Initialize memory management within prom_init
+	 */
+	prom_init_mem();
+
+	/*
+	 * Determine which cpu is actually running right _now_
+	 */
+	prom_find_boot_cpu();
+
+	/* 
+	 * Initialize display devices
+	 */
+	prom_check_displays();
+
+	/*
+	 * Initialize IOMMU (TCE tables) on pSeries. Do that before anything else
+	 * that uses the allocator, we need to make sure we get the top of memory
+	 * available for us here...
+	 */
+	if (RELOC(of_platform) == PLATFORM_PSERIES)
+		prom_initialize_tce_table();
+
+	/*
+	 * On non-powermacs, try to instantiate RTAS and puts all CPUs
+	 * in spin-loops. PowerMacs don't have a working RTAS and use
+	 * a different way to spin CPUs
+	 */
+	if (RELOC(of_platform) != PLATFORM_POWERMAC) {
+		prom_instantiate_rtas();
+		prom_hold_cpus();
+	}
+
+	/*
+	 * Fill in some infos for use by the kernel later on
+	 */
+	if (RELOC(ppc64_iommu_off))
+		prom_setprop(_prom->chosen, "linux,iommu-off", NULL, 0);
+
+	if (RELOC(iommu_force_on))
+		prom_setprop(_prom->chosen, "linux,iommu-force-on", NULL, 0);
+
+	if (RELOC(prom_memory_limit))
+		prom_setprop(_prom->chosen, "linux,memory-limit",
+			PTRRELOC(&prom_memory_limit), sizeof(RELOC(prom_memory_limit)));
+
+	if (RELOC(prom_tce_alloc_start)) {
+		prom_setprop(_prom->chosen, "linux,tce-alloc-start",
+			PTRRELOC(&prom_tce_alloc_start), sizeof(RELOC(prom_tce_alloc_start)));
+		prom_setprop(_prom->chosen, "linux,tce-alloc-end",
+			PTRRELOC(&prom_tce_alloc_end), sizeof(RELOC(prom_tce_alloc_end)));
+	}
+
+	/*
+	 * Now finally create the flattened device-tree
+	 */
+       	prom_printf("copying OF device tree ...\n");
+       	flatten_device_tree();
+
+	/* in case stdin is USB and still active on IBM machines... */
+	prom_close_stdin();
+
+	/*
+	 * Call OF "quiesce" method to shut down pending DMA's from
+	 * devices etc...
+	 */
+	prom_printf("Calling quiesce ...\n");
+	call_prom("quiesce", 0, 0);
+
+	/*
+	 * And finally, call the kernel passing it the flattened device
+	 * tree and NULL as r5, thus triggering the new entry point which
+	 * is common to us and kexec
+	 */
+	prom_printf("returning from prom_init\n");
+	prom_debug("->dt_header_start=0x%x\n", RELOC(dt_header_start));
+	prom_debug("->phys=0x%x\n", phys);
+
+	__start(RELOC(dt_header_start), phys, 0);
+
+	return 0;
+}
+
diff --git a/arch/ppc64/kernel/ptrace.c b/arch/ppc64/kernel/ptrace.c
new file mode 100644
index 00000000000..354a287c67e
--- /dev/null
+++ b/arch/ppc64/kernel/ptrace.c
@@ -0,0 +1,328 @@
+/*
+ *  linux/arch/ppc64/kernel/ptrace.c
+ *
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Derived from "arch/m68k/kernel/ptrace.c"
+ *  Copyright (C) 1994 by Hamish Macdonald
+ *  Taken from linux/kernel/ptrace.c and modified for M680x0.
+ *  linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds
+ *
+ * Modified by Cort Dougan (cort@hq.fsmlabs.com)
+ * and Paul Mackerras (paulus@linuxcare.com.au).
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file README.legal in the main directory of
+ * this archive for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/ptrace-common.h>
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure single step bits etc are not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{
+	/* make sure the single step bit is not set. */
+	clear_single_step(child);
+}
+
+int sys_ptrace(long request, long pid, long addr, long data)
+{
+	struct task_struct *child;
+	int ret = -EPERM;
+
+	lock_kernel();
+	if (request == PTRACE_TRACEME) {
+		/* are we already being traced? */
+		if (current->ptrace & PT_PTRACED)
+			goto out;
+		ret = security_ptrace(current->parent, current);
+		if (ret)
+			goto out;
+		/* set the ptrace bit in the process flags. */
+		current->ptrace |= PT_PTRACED;
+		ret = 0;
+		goto out;
+	}
+	ret = -ESRCH;
+	read_lock(&tasklist_lock);
+	child = find_task_by_pid(pid);
+	if (child)
+		get_task_struct(child);
+	read_unlock(&tasklist_lock);
+	if (!child)
+		goto out;
+
+	ret = -EPERM;
+	if (pid == 1)		/* you may not mess with init */
+		goto out_tsk;
+
+	if (request == PTRACE_ATTACH) {
+		ret = ptrace_attach(child);
+		goto out_tsk;
+	}
+
+	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	if (ret < 0)
+		goto out_tsk;
+
+	switch (request) {
+	/* when I and D space are separate, these will need to be fixed. */
+	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
+	case PTRACE_PEEKDATA: {
+		unsigned long tmp;
+		int copied;
+
+		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+		ret = -EIO;
+		if (copied != sizeof(tmp))
+			break;
+		ret = put_user(tmp,(unsigned long __user *) data);
+		break;
+	}
+
+	/* read the word at location addr in the USER area. */
+	case PTRACE_PEEKUSR: {
+		unsigned long index;
+		unsigned long tmp;
+
+		ret = -EIO;
+		/* convert to index and check */
+		index = (unsigned long) addr >> 3;
+		if ((addr & 7) || (index > PT_FPSCR))
+			break;
+
+		if (index < PT_FPR0) {
+			tmp = get_reg(child, (int)index);
+		} else {
+			flush_fp_to_thread(child);
+			tmp = ((unsigned long *)child->thread.fpr)[index - PT_FPR0];
+		}
+		ret = put_user(tmp,(unsigned long __user *) data);
+		break;
+	}
+
+	/* If I and D space are separate, this will have to be fixed. */
+	case PTRACE_POKETEXT: /* write the word at location addr. */
+	case PTRACE_POKEDATA:
+		ret = 0;
+		if (access_process_vm(child, addr, &data, sizeof(data), 1)
+				== sizeof(data))
+			break;
+		ret = -EIO;
+		break;
+
+	/* write the word at location addr in the USER area */
+	case PTRACE_POKEUSR: {
+		unsigned long index;
+
+		ret = -EIO;
+		/* convert to index and check */
+		index = (unsigned long) addr >> 3;
+		if ((addr & 7) || (index > PT_FPSCR))
+			break;
+
+		if (index == PT_ORIG_R3)
+			break;
+		if (index < PT_FPR0) {
+			ret = put_reg(child, index, data);
+		} else {
+			flush_fp_to_thread(child);
+			((unsigned long *)child->thread.fpr)[index - PT_FPR0] = data;
+			ret = 0;
+		}
+		break;
+	}
+
+	case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
+	case PTRACE_CONT: { /* restart after signal. */
+		ret = -EIO;
+		if ((unsigned long) data > _NSIG)
+			break;
+		if (request == PTRACE_SYSCALL)
+			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		else
+			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		child->exit_code = data;
+		/* make sure the single step bit is not set. */
+		clear_single_step(child);
+		wake_up_process(child);
+		ret = 0;
+		break;
+	}
+
+	/*
+	 * make the child exit.  Best I can do is send it a sigkill.
+	 * perhaps it should be put in the status that it wants to
+	 * exit.
+	 */
+	case PTRACE_KILL: {
+		ret = 0;
+		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
+			break;
+		child->exit_code = SIGKILL;
+		/* make sure the single step bit is not set. */
+		clear_single_step(child);
+		wake_up_process(child);
+		break;
+	}
+
+	case PTRACE_SINGLESTEP: {  /* set the trap flag. */
+		ret = -EIO;
+		if ((unsigned long) data > _NSIG)
+			break;
+		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		set_single_step(child);
+		child->exit_code = data;
+		/* give it a chance to run. */
+		wake_up_process(child);
+		ret = 0;
+		break;
+	}
+
+	case PTRACE_DETACH:
+		ret = ptrace_detach(child, data);
+		break;
+
+	case PPC_PTRACE_GETREGS: { /* Get GPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.regs)[0];
+		unsigned long __user *tmp = (unsigned long __user *)addr;
+
+		for (i = 0; i < 32; i++) {
+			ret = put_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+	case PPC_PTRACE_SETREGS: { /* Set GPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.regs)[0];
+		unsigned long __user *tmp = (unsigned long __user *)addr;
+
+		for (i = 0; i < 32; i++) {
+			ret = get_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+	case PPC_PTRACE_GETFPREGS: { /* Get FPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.fpr)[0];
+		unsigned long __user *tmp = (unsigned long __user *)addr;
+
+		flush_fp_to_thread(child);
+
+		for (i = 0; i < 32; i++) {
+			ret = put_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+	case PPC_PTRACE_SETFPREGS: { /* Get FPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.fpr)[0];
+		unsigned long __user *tmp = (unsigned long __user *)addr;
+
+		flush_fp_to_thread(child);
+
+		for (i = 0; i < 32; i++) {
+			ret = get_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+	default:
+		ret = ptrace_request(child, request, addr, data);
+		break;
+	}
+out_tsk:
+	put_task_struct(child);
+out:
+	unlock_kernel();
+	return ret;
+}
+
+static void do_syscall_trace(void)
+{
+	/* the 0x80 provides a way for the tracing parent to distinguish
+	   between a syscall stop and SIGTRAP delivery */
+	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+				 ? 0x80 : 0));
+
+	/*
+	 * this isn't the same as continuing with a signal, but it will do
+	 * for normal use.  strace only continues with a signal if the
+	 * stopping signal is not SIGTRAP.  -brl
+	 */
+	if (current->exit_code) {
+		send_sig(current->exit_code, current, 1);
+		current->exit_code = 0;
+	}
+}
+
+void do_syscall_trace_enter(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
+
+void do_syscall_trace_leave(struct pt_regs *regs)
+{
+	secure_computing(regs->gpr[0]);
+
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, regs->result);
+
+	if ((test_thread_flag(TIF_SYSCALL_TRACE)
+	     || test_thread_flag(TIF_SINGLESTEP))
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
diff --git a/arch/ppc64/kernel/ptrace32.c b/arch/ppc64/kernel/ptrace32.c
new file mode 100644
index 00000000000..ee81b1b776c
--- /dev/null
+++ b/arch/ppc64/kernel/ptrace32.c
@@ -0,0 +1,420 @@
+/*
+ *  linux/arch/ppc64/kernel/ptrace32.c
+ *
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Derived from "arch/m68k/kernel/ptrace.c"
+ *  Copyright (C) 1994 by Hamish Macdonald
+ *  Taken from linux/kernel/ptrace.c and modified for M680x0.
+ *  linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds
+ *
+ * Modified by Cort Dougan (cort@hq.fsmlabs.com)
+ * and Paul Mackerras (paulus@linuxcare.com.au).
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file README.legal in the main directory of
+ * this archive for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/security.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/ptrace-common.h>
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+int sys32_ptrace(long request, long pid, unsigned long addr, unsigned long data)
+{
+	struct task_struct *child;
+	int ret = -EPERM;
+
+	lock_kernel();
+	if (request == PTRACE_TRACEME) {
+		/* are we already being traced? */
+		if (current->ptrace & PT_PTRACED)
+			goto out;
+		ret = security_ptrace(current->parent, current);
+		if (ret)
+			goto out;
+		/* set the ptrace bit in the process flags. */
+		current->ptrace |= PT_PTRACED;
+		ret = 0;
+		goto out;
+	}
+	ret = -ESRCH;
+	read_lock(&tasklist_lock);
+	child = find_task_by_pid(pid);
+	if (child)
+		get_task_struct(child);
+	read_unlock(&tasklist_lock);
+	if (!child)
+		goto out;
+
+	ret = -EPERM;
+	if (pid == 1)		/* you may not mess with init */
+		goto out_tsk;
+
+	if (request == PTRACE_ATTACH) {
+		ret = ptrace_attach(child);
+		goto out_tsk;
+	}
+
+	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	if (ret < 0)
+		goto out_tsk;
+
+	switch (request) {
+	/* when I and D space are separate, these will need to be fixed. */
+	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
+	case PTRACE_PEEKDATA: {
+		unsigned int tmp;
+		int copied;
+
+		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+		ret = -EIO;
+		if (copied != sizeof(tmp))
+			break;
+		ret = put_user(tmp, (u32 __user *)data);
+		break;
+	}
+
+	/*
+	 * Read 4 bytes of the other process' storage
+	 *  data is a pointer specifying where the user wants the
+	 *	4 bytes copied into
+	 *  addr is a pointer in the user's storage that contains an 8 byte
+	 *	address in the other process of the 4 bytes that is to be read
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 * when I and D space are separate, these will need to be fixed.
+	 */
+	case PPC_PTRACE_PEEKTEXT_3264:
+	case PPC_PTRACE_PEEKDATA_3264: {
+		u32 tmp;
+		int copied;
+		u32 __user * addrOthers;
+
+		ret = -EIO;
+
+		/* Get the addr in the other process that we want to read */
+		if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
+			break;
+
+		copied = access_process_vm(child, (u64)addrOthers, &tmp,
+				sizeof(tmp), 0);
+		if (copied != sizeof(tmp))
+			break;
+		ret = put_user(tmp, (u32 __user *)data);
+		break;
+	}
+
+	/* Read a register (specified by ADDR) out of the "user area" */
+	case PTRACE_PEEKUSR: {
+		int index;
+		unsigned long tmp;
+
+		ret = -EIO;
+		/* convert to index and check */
+		index = (unsigned long) addr >> 2;
+		if ((addr & 3) || (index > PT_FPSCR32))
+			break;
+
+		if (index < PT_FPR0) {
+			tmp = get_reg(child, index);
+		} else {
+			flush_fp_to_thread(child);
+			/*
+			 * the user space code considers the floating point
+			 * to be an array of unsigned int (32 bits) - the
+			 * index passed in is based on this assumption.
+			 */
+			tmp = ((unsigned int *)child->thread.fpr)[index - PT_FPR0];
+		}
+		ret = put_user((unsigned int)tmp, (u32 __user *)data);
+		break;
+	}
+  
+	/*
+	 * Read 4 bytes out of the other process' pt_regs area
+	 *  data is a pointer specifying where the user wants the
+	 *	4 bytes copied into
+	 *  addr is the offset into the other process' pt_regs structure
+	 *	that is to be read
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 */
+	case PPC_PTRACE_PEEKUSR_3264: {
+		u32 index;
+		u32 reg32bits;
+		u64 tmp;
+		u32 numReg;
+		u32 part;
+
+		ret = -EIO;
+		/* Determine which register the user wants */
+		index = (u64)addr >> 2;
+		numReg = index / 2;
+		/* Determine which part of the register the user wants */
+		if (index % 2)
+			part = 1;  /* want the 2nd half of the register (right-most). */
+		else
+			part = 0;  /* want the 1st half of the register (left-most). */
+
+		/* Validate the input - check to see if address is on the wrong boundary or beyond the end of the user area */
+		if ((addr & 3) || numReg > PT_FPSCR)
+			break;
+
+		if (numReg >= PT_FPR0) {
+			flush_fp_to_thread(child);
+			tmp = ((unsigned long int *)child->thread.fpr)[numReg - PT_FPR0];
+		} else { /* register within PT_REGS struct */
+			tmp = get_reg(child, numReg);
+		} 
+		reg32bits = ((u32*)&tmp)[part];
+		ret = put_user(reg32bits, (u32 __user *)data);
+		break;
+	}
+
+	/* If I and D space are separate, this will have to be fixed. */
+	case PTRACE_POKETEXT: /* write the word at location addr. */
+	case PTRACE_POKEDATA: {
+		unsigned int tmp;
+		tmp = data;
+		ret = 0;
+		if (access_process_vm(child, addr, &tmp, sizeof(tmp), 1)
+				== sizeof(tmp))
+			break;
+		ret = -EIO;
+		break;
+	}
+
+	/*
+	 * Write 4 bytes into the other process' storage
+	 *  data is the 4 bytes that the user wants written
+	 *  addr is a pointer in the user's storage that contains an
+	 *	8 byte address in the other process where the 4 bytes
+	 *	that is to be written
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 * when I and D space are separate, these will need to be fixed.
+	 */
+	case PPC_PTRACE_POKETEXT_3264:
+	case PPC_PTRACE_POKEDATA_3264: {
+		u32 tmp = data;
+		u32 __user * addrOthers;
+
+		/* Get the addr in the other process that we want to write into */
+		ret = -EIO;
+		if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
+			break;
+		ret = 0;
+		if (access_process_vm(child, (u64)addrOthers, &tmp,
+					sizeof(tmp), 1) == sizeof(tmp))
+			break;
+		ret = -EIO;
+		break;
+	}
+
+	/* write the word at location addr in the USER area */
+	case PTRACE_POKEUSR: {
+		unsigned long index;
+
+		ret = -EIO;
+		/* convert to index and check */
+		index = (unsigned long) addr >> 2;
+		if ((addr & 3) || (index > PT_FPSCR32))
+			break;
+
+		if (index == PT_ORIG_R3)
+			break;
+		if (index < PT_FPR0) {
+			ret = put_reg(child, index, data);
+		} else {
+			flush_fp_to_thread(child);
+			/*
+			 * the user space code considers the floating point
+			 * to be an array of unsigned int (32 bits) - the
+			 * index passed in is based on this assumption.
+			 */
+			((unsigned int *)child->thread.fpr)[index - PT_FPR0] = data;
+			ret = 0;
+		}
+		break;
+	}
+
+	/*
+	 * Write 4 bytes into the other process' pt_regs area
+	 *  data is the 4 bytes that the user wants written
+	 *  addr is the offset into the other process' pt_regs structure
+	 *	that is to be written into
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 */
+	case PPC_PTRACE_POKEUSR_3264: {
+		u32 index;
+		u32 numReg;
+
+		ret = -EIO;
+		/* Determine which register the user wants */
+		index = (u64)addr >> 2;
+		numReg = index / 2;
+		/*
+		 * Validate the input - check to see if address is on the
+		 * wrong boundary or beyond the end of the user area
+		 */
+		if ((addr & 3) || (numReg > PT_FPSCR))
+			break;
+		/* Insure it is a register we let them change */
+		if ((numReg == PT_ORIG_R3)
+				|| ((numReg > PT_CCR) && (numReg < PT_FPR0)))
+			break;
+		if (numReg >= PT_FPR0) {
+			flush_fp_to_thread(child);
+		}
+		if (numReg == PT_MSR)
+			data = (data & MSR_DEBUGCHANGE)
+				| (child->thread.regs->msr & ~MSR_DEBUGCHANGE);
+		((u32*)child->thread.regs)[index] = data;
+		ret = 0;
+		break;
+	}
+
+	case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
+	case PTRACE_CONT: { /* restart after signal. */
+		ret = -EIO;
+		if ((unsigned long) data > _NSIG)
+			break;
+		if (request == PTRACE_SYSCALL)
+			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		else
+			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		child->exit_code = data;
+		/* make sure the single step bit is not set. */
+		clear_single_step(child);
+		wake_up_process(child);
+		ret = 0;
+		break;
+	}
+
+	/*
+	 * make the child exit.  Best I can do is send it a sigkill.
+	 * perhaps it should be put in the status that it wants to
+	 * exit.
+	 */
+	case PTRACE_KILL: {
+		ret = 0;
+		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
+			break;
+		child->exit_code = SIGKILL;
+		/* make sure the single step bit is not set. */
+		clear_single_step(child);
+		wake_up_process(child);
+		break;
+	}
+
+	case PTRACE_SINGLESTEP: {  /* set the trap flag. */
+		ret = -EIO;
+		if ((unsigned long) data > _NSIG)
+			break;
+		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		set_single_step(child);
+		child->exit_code = data;
+		/* give it a chance to run. */
+		wake_up_process(child);
+		ret = 0;
+		break;
+	}
+
+	case PTRACE_DETACH:
+		ret = ptrace_detach(child, data);
+		break;
+
+	case PPC_PTRACE_GETREGS: { /* Get GPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.regs)[0];
+		unsigned int __user *tmp = (unsigned int __user *)addr;
+
+		for (i = 0; i < 32; i++) {
+			ret = put_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+	case PPC_PTRACE_SETREGS: { /* Set GPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.regs)[0];
+		unsigned int __user *tmp = (unsigned int __user *)addr;
+
+		for (i = 0; i < 32; i++) {
+			ret = get_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+	case PPC_PTRACE_GETFPREGS: { /* Get FPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.fpr)[0];
+		unsigned int __user *tmp = (unsigned int __user *)addr;
+
+		flush_fp_to_thread(child);
+
+		for (i = 0; i < 32; i++) {
+			ret = put_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+	case PPC_PTRACE_SETFPREGS: { /* Get FPRs 0 - 31. */
+		int i;
+		unsigned long *reg = &((unsigned long *)child->thread.fpr)[0];
+		unsigned int __user *tmp = (unsigned int __user *)addr;
+
+		flush_fp_to_thread(child);
+
+		for (i = 0; i < 32; i++) {
+			ret = get_user(*reg, tmp);
+			if (ret)
+				break;
+			reg++;
+			tmp++;
+		}
+		break;
+	}
+
+       case PTRACE_GETEVENTMSG:
+                ret = put_user(child->ptrace_message, (unsigned int __user *) data);
+                break;
+
+	default:
+		ret = ptrace_request(child, request, addr, data);
+		break;
+	}
+out_tsk:
+	put_task_struct(child);
+out:
+	unlock_kernel();
+	return ret;
+}
diff --git a/arch/ppc64/kernel/ras.c b/arch/ppc64/kernel/ras.c
new file mode 100644
index 00000000000..1c4c796b212
--- /dev/null
+++ b/arch/ppc64/kernel/ras.c
@@ -0,0 +1,356 @@
+/*
+ * ras.c
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+/* Change Activity:
+ * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support.
+ * End Change Activity 
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/kernel_stat.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/random.h>
+#include <linux/sysrq.h>
+#include <linux/bitops.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/cache.h>
+#include <asm/prom.h>
+#include <asm/ptrace.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/ppcdebug.h>
+
+static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(ras_log_buf_lock);
+
+char mce_data_buf[RTAS_ERROR_LOG_MAX]
+;
+/* This is true if we are using the firmware NMI handler (typically LPAR) */
+extern int fwnmi_active;
+
+extern void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr);
+
+static int ras_get_sensor_state_token;
+static int ras_check_exception_token;
+
+#define EPOW_SENSOR_TOKEN	9
+#define EPOW_SENSOR_INDEX	0
+#define RAS_VECTOR_OFFSET	0x500
+
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id,
+					struct pt_regs * regs);
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id,
+					struct pt_regs * regs);
+
+/* #define DEBUG */
+
+static void request_ras_irqs(struct device_node *np, char *propname,
+			irqreturn_t (*handler)(int, void *, struct pt_regs *),
+			const char *name)
+{
+	unsigned int *ireg, len, i;
+	int virq, n_intr;
+
+	ireg = (unsigned int *)get_property(np, propname, &len);
+	if (ireg == NULL)
+		return;
+	n_intr = prom_n_intr_cells(np);
+	len /= n_intr * sizeof(*ireg);
+
+	for (i = 0; i < len; i++) {
+		virq = virt_irq_create_mapping(*ireg);
+		if (virq == NO_IRQ) {
+			printk(KERN_ERR "Unable to allocate interrupt "
+			       "number for %s\n", np->full_name);
+			return;
+		}
+		if (request_irq(irq_offset_up(virq), handler, 0, name, NULL)) {
+			printk(KERN_ERR "Unable to request interrupt %d for "
+			       "%s\n", irq_offset_up(virq), np->full_name);
+			return;
+		}
+		ireg += n_intr;
+	}
+}
+
+/*
+ * Initialize handlers for the set of interrupts caused by hardware errors
+ * and power system events.
+ */
+static int __init init_ras_IRQ(void)
+{
+	struct device_node *np;
+
+	ras_get_sensor_state_token = rtas_token("get-sensor-state");
+	ras_check_exception_token = rtas_token("check-exception");
+
+	/* Internal Errors */
+	np = of_find_node_by_path("/event-sources/internal-errors");
+	if (np != NULL) {
+		request_ras_irqs(np, "open-pic-interrupt", ras_error_interrupt,
+				 "RAS_ERROR");
+		request_ras_irqs(np, "interrupts", ras_error_interrupt,
+				 "RAS_ERROR");
+		of_node_put(np);
+	}
+
+	/* EPOW Events */
+	np = of_find_node_by_path("/event-sources/epow-events");
+	if (np != NULL) {
+		request_ras_irqs(np, "open-pic-interrupt", ras_epow_interrupt,
+				 "RAS_EPOW");
+		request_ras_irqs(np, "interrupts", ras_epow_interrupt,
+				 "RAS_EPOW");
+		of_node_put(np);
+	}
+
+	return 1;
+}
+__initcall(init_ras_IRQ);
+
+/*
+ * Handle power subsystem events (EPOW).
+ *
+ * Presently we just log the event has occurred.  This should be fixed
+ * to examine the type of power failure and take appropriate action where
+ * the time horizon permits something useful to be done.
+ */
+static irqreturn_t
+ras_epow_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+	int status = 0xdeadbeef;
+	int state = 0;
+	int critical;
+
+	status = rtas_call(ras_get_sensor_state_token, 2, 2, &state,
+			   EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX);
+
+	if (state > 3)
+		critical = 1;  /* Time Critical */
+	else
+		critical = 0;
+
+	spin_lock(&ras_log_buf_lock);
+
+	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+			   RAS_VECTOR_OFFSET,
+			   virt_irq_to_real(irq_offset_down(irq)),
+			   RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS,
+			   critical, __pa(&ras_log_buf),
+				rtas_get_error_log_max());
+
+	udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n",
+		    *((unsigned long *)&ras_log_buf), status, state);
+	printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n",
+	       *((unsigned long *)&ras_log_buf), status, state);
+
+	/* format and print the extended information */
+	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
+
+	spin_unlock(&ras_log_buf_lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Handle hardware error interrupts.
+ *
+ * RTAS check-exception is called to collect data on the exception.  If
+ * the error is deemed recoverable, we log a warning and return.
+ * For nonrecoverable errors, an error is logged and we stop all processing
+ * as quickly as possible in order to prevent propagation of the failure.
+ */
+static irqreturn_t
+ras_error_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+	struct rtas_error_log *rtas_elog;
+	int status = 0xdeadbeef;
+	int fatal;
+
+	spin_lock(&ras_log_buf_lock);
+
+	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+			   RAS_VECTOR_OFFSET,
+			   virt_irq_to_real(irq_offset_down(irq)),
+			   RTAS_INTERNAL_ERROR, 1 /*Time Critical */,
+			   __pa(&ras_log_buf),
+				rtas_get_error_log_max());
+
+	rtas_elog = (struct rtas_error_log *)ras_log_buf;
+
+	if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC))
+		fatal = 1;
+	else
+		fatal = 0;
+
+	/* format and print the extended information */
+	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
+
+	if (fatal) {
+		udbg_printf("Fatal HW Error <0x%lx 0x%x>\n",
+			    *((unsigned long *)&ras_log_buf), status);
+		printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n",
+		       *((unsigned long *)&ras_log_buf), status);
+
+#ifndef DEBUG
+		/* Don't actually power off when debugging so we can test
+		 * without actually failing while injecting errors.
+		 * Error data will not be logged to syslog.
+		 */
+		ppc_md.power_off();
+#endif
+	} else {
+		udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n",
+			    *((unsigned long *)&ras_log_buf), status);
+		printk(KERN_WARNING
+		       "Warning: Recoverable hardware error <0x%lx 0x%x>\n",
+		       *((unsigned long *)&ras_log_buf), status);
+	}
+
+	spin_unlock(&ras_log_buf_lock);
+	return IRQ_HANDLED;
+}
+
+/* Get the error information for errors coming through the
+ * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
+ * the actual r3 if possible, and a ptr to the error log entry
+ * will be returned if found.
+ *
+ * The mce_data_buf does not have any locks or protection around it,
+ * if a second machine check comes in, or a system reset is done
+ * before we have logged the error, then we will get corruption in the
+ * error log.  This is preferable over holding off on calling
+ * ibm,nmi-interlock which would result in us checkstopping if a
+ * second machine check did come in.
+ */
+static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
+{
+	unsigned long errdata = regs->gpr[3];
+	struct rtas_error_log *errhdr = NULL;
+	unsigned long *savep;
+
+	if ((errdata >= 0x7000 && errdata < 0x7fff0) ||
+	    (errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) {
+		savep = __va(errdata);
+		regs->gpr[3] = savep[0];	/* restore original r3 */
+		memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+		memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX);
+		errhdr = (struct rtas_error_log *)mce_data_buf;
+	} else {
+		printk("FWNMI: corrupt r3\n");
+	}
+	return errhdr;
+}
+
+/* Call this when done with the data returned by FWNMI_get_errinfo.
+ * It will release the saved data area for other CPUs in the
+ * partition to receive FWNMI errors.
+ */
+static void fwnmi_release_errinfo(void)
+{
+	int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+	if (ret != 0)
+		printk("FWNMI: nmi-interlock failed: %d\n", ret);
+}
+
+void pSeries_system_reset_exception(struct pt_regs *regs)
+{
+	if (fwnmi_active) {
+		struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
+		if (errhdr) {
+			/* XXX Should look at FWNMI information */
+		}
+		fwnmi_release_errinfo();
+	}
+}
+
+/*
+ * See if we can recover from a machine check exception.
+ * This is only called on power4 (or above) and only via
+ * the Firmware Non-Maskable Interrupts (fwnmi) handler
+ * which provides the error analysis for us.
+ *
+ * Return 1 if corrected (or delivered a signal).
+ * Return 0 if there is nothing we can do.
+ */
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
+{
+	int nonfatal = 0;
+
+	if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+		/* Platform corrected itself */
+		nonfatal = 1;
+	} else if ((regs->msr & MSR_RI) &&
+		   user_mode(regs) &&
+		   err->severity == RTAS_SEVERITY_ERROR_SYNC &&
+		   err->disposition == RTAS_DISP_NOT_RECOVERED &&
+		   err->target == RTAS_TARGET_MEMORY &&
+		   err->type == RTAS_TYPE_ECC_UNCORR &&
+		   !(current->pid == 0 || current->pid == 1)) {
+		/* Kill off a user process with an ECC error */
+		printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
+		       current->pid);
+		/* XXX something better for ECC error? */
+		_exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
+		nonfatal = 1;
+	}
+
+ 	log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal);
+
+	return nonfatal;
+}
+
+/*
+ * Handle a machine check.
+ *
+ * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
+ * should be present.  If so the handler which called us tells us if the
+ * error was recovered (never true if RI=0).
+ *
+ * On hardware prior to Power 4 these exceptions were asynchronous which
+ * means we can't tell exactly where it occurred and so we can't recover.
+ */
+int pSeries_machine_check_exception(struct pt_regs *regs)
+{
+	struct rtas_error_log *errp;
+
+	if (fwnmi_active) {
+		errp = fwnmi_get_errinfo(regs);
+		fwnmi_release_errinfo();
+		if (errp && recover_mce(regs, errp))
+			return 1;
+	}
+
+	return 0;
+}
diff --git a/arch/ppc64/kernel/rtas-proc.c b/arch/ppc64/kernel/rtas-proc.c
new file mode 100644
index 00000000000..28b1f1521f2
--- /dev/null
+++ b/arch/ppc64/kernel/rtas-proc.c
@@ -0,0 +1,807 @@
+/*
+ *   arch/ppc64/kernel/rtas-proc.c
+ *   Copyright (C) 2000 Tilmann Bitterberg
+ *   (tilmann@bitterberg.de)
+ *
+ *   RTAS (Runtime Abstraction Services) stuff
+ *   Intention is to provide a clean user interface
+ *   to use the RTAS.
+ *
+ *   TODO:
+ *   Split off a header file and maybe move it to a different
+ *   location. Write Documentation on what the /proc/rtas/ entries
+ *   actually do.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/bitops.h>
+
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/machdep.h> /* for ppc_md */
+#include <asm/time.h>
+#include <asm/systemcfg.h>
+
+/* Token for Sensors */
+#define KEY_SWITCH		0x0001
+#define ENCLOSURE_SWITCH	0x0002
+#define THERMAL_SENSOR		0x0003
+#define LID_STATUS		0x0004
+#define POWER_SOURCE		0x0005
+#define BATTERY_VOLTAGE		0x0006
+#define BATTERY_REMAINING	0x0007
+#define BATTERY_PERCENTAGE	0x0008
+#define EPOW_SENSOR		0x0009
+#define BATTERY_CYCLESTATE	0x000a
+#define BATTERY_CHARGING	0x000b
+
+/* IBM specific sensors */
+#define IBM_SURVEILLANCE	0x2328 /* 9000 */
+#define IBM_FANRPM		0x2329 /* 9001 */
+#define IBM_VOLTAGE		0x232a /* 9002 */
+#define IBM_DRCONNECTOR		0x232b /* 9003 */
+#define IBM_POWERSUPPLY		0x232c /* 9004 */
+
+/* Status return values */
+#define SENSOR_CRITICAL_HIGH	13
+#define SENSOR_WARNING_HIGH	12
+#define SENSOR_NORMAL		11
+#define SENSOR_WARNING_LOW	10
+#define SENSOR_CRITICAL_LOW	 9
+#define SENSOR_SUCCESS		 0
+#define SENSOR_HW_ERROR		-1
+#define SENSOR_BUSY		-2
+#define SENSOR_NOT_EXIST	-3
+#define SENSOR_DR_ENTITY	-9000
+
+/* Location Codes */
+#define LOC_SCSI_DEV_ADDR	'A'
+#define LOC_SCSI_DEV_LOC	'B'
+#define LOC_CPU			'C'
+#define LOC_DISKETTE		'D'
+#define LOC_ETHERNET		'E'
+#define LOC_FAN			'F'
+#define LOC_GRAPHICS		'G'
+/* reserved / not used		'H' */
+#define LOC_IO_ADAPTER		'I'
+/* reserved / not used		'J' */
+#define LOC_KEYBOARD		'K'
+#define LOC_LCD			'L'
+#define LOC_MEMORY		'M'
+#define LOC_NV_MEMORY		'N'
+#define LOC_MOUSE		'O'
+#define LOC_PLANAR		'P'
+#define LOC_OTHER_IO		'Q'
+#define LOC_PARALLEL		'R'
+#define LOC_SERIAL		'S'
+#define LOC_DEAD_RING		'T'
+#define LOC_RACKMOUNTED		'U' /* for _u_nit is rack mounted */
+#define LOC_VOLTAGE		'V'
+#define LOC_SWITCH_ADAPTER	'W'
+#define LOC_OTHER		'X'
+#define LOC_FIRMWARE		'Y'
+#define LOC_SCSI		'Z'
+
+/* Tokens for indicators */
+#define TONE_FREQUENCY		0x0001 /* 0 - 1000 (HZ)*/
+#define TONE_VOLUME		0x0002 /* 0 - 100 (%) */
+#define SYSTEM_POWER_STATE	0x0003 
+#define WARNING_LIGHT		0x0004
+#define DISK_ACTIVITY_LIGHT	0x0005
+#define HEX_DISPLAY_UNIT	0x0006
+#define BATTERY_WARNING_TIME	0x0007
+#define CONDITION_CYCLE_REQUEST	0x0008
+#define SURVEILLANCE_INDICATOR	0x2328 /* 9000 */
+#define DR_ACTION		0x2329 /* 9001 */
+#define DR_INDICATOR		0x232a /* 9002 */
+/* 9003 - 9004: Vendor specific */
+/* 9006 - 9999: Vendor specific */
+
+/* other */
+#define MAX_SENSORS		 17  /* I only know of 17 sensors */    
+#define MAX_LINELENGTH          256
+#define SENSOR_PREFIX		"ibm,sensor-"
+#define cel_to_fahr(x)		((x*9/5)+32)
+
+
+/* Globals */
+static struct rtas_sensors sensors;
+static struct device_node *rtas_node = NULL;
+static unsigned long power_on_time = 0; /* Save the time the user set */
+static char progress_led[MAX_LINELENGTH];
+
+static unsigned long rtas_tone_frequency = 1000;
+static unsigned long rtas_tone_volume = 0;
+
+/* ****************STRUCTS******************************************* */
+struct individual_sensor {
+	unsigned int token;
+	unsigned int quant;
+};
+
+struct rtas_sensors {
+        struct individual_sensor sensor[MAX_SENSORS];
+	unsigned int quant;
+};
+
+/* ****************************************************************** */
+/* Declarations */
+static int ppc_rtas_sensors_show(struct seq_file *m, void *v);
+static int ppc_rtas_clock_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_clock_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_progress_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_progress_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_poweron_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_poweron_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+
+static ssize_t ppc_rtas_tone_freq_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_tone_freq_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_tone_volume_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v);
+static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v);
+
+static int sensors_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_sensors_show, NULL);
+}
+
+struct file_operations ppc_rtas_sensors_operations = {
+	.open		= sensors_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int poweron_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_poweron_show, NULL);
+}
+
+struct file_operations ppc_rtas_poweron_operations = {
+	.open		= poweron_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_poweron_write,
+	.release	= single_release,
+};
+
+static int progress_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_progress_show, NULL);
+}
+
+struct file_operations ppc_rtas_progress_operations = {
+	.open		= progress_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_progress_write,
+	.release	= single_release,
+};
+
+static int clock_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_clock_show, NULL);
+}
+
+struct file_operations ppc_rtas_clock_operations = {
+	.open		= clock_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_clock_write,
+	.release	= single_release,
+};
+
+static int tone_freq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_tone_freq_show, NULL);
+}
+
+struct file_operations ppc_rtas_tone_freq_operations = {
+	.open		= tone_freq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_tone_freq_write,
+	.release	= single_release,
+};
+
+static int tone_volume_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_tone_volume_show, NULL);
+}
+
+struct file_operations ppc_rtas_tone_volume_operations = {
+	.open		= tone_volume_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_tone_volume_write,
+	.release	= single_release,
+};
+
+static int rmo_buf_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_rmo_buf_show, NULL);
+}
+
+struct file_operations ppc_rtas_rmo_buf_ops = {
+	.open		= rmo_buf_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int ppc_rtas_find_all_sensors(void);
+static void ppc_rtas_process_sensor(struct seq_file *m,
+	struct individual_sensor *s, int state, int error, char *loc);
+static char *ppc_rtas_process_error(int error);
+static void get_location_code(struct seq_file *m,
+	struct individual_sensor *s, char *loc);
+static void check_location_string(struct seq_file *m, char *c);
+static void check_location(struct seq_file *m, char *c);
+
+static int __init proc_rtas_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	if (!(systemcfg->platform & PLATFORM_PSERIES))
+		return 1;
+
+	rtas_node = of_find_node_by_name(NULL, "rtas");
+	if (rtas_node == NULL)
+		return 1;
+
+	entry = create_proc_entry("ppc64/rtas/progress", S_IRUGO|S_IWUSR, NULL);
+	if (entry)
+		entry->proc_fops = &ppc_rtas_progress_operations;
+
+	entry = create_proc_entry("ppc64/rtas/clock", S_IRUGO|S_IWUSR, NULL);
+	if (entry)
+		entry->proc_fops = &ppc_rtas_clock_operations;
+
+	entry = create_proc_entry("ppc64/rtas/poweron", S_IWUSR|S_IRUGO, NULL);
+	if (entry)
+		entry->proc_fops = &ppc_rtas_poweron_operations;
+
+	entry = create_proc_entry("ppc64/rtas/sensors", S_IRUGO, NULL);
+	if (entry)
+		entry->proc_fops = &ppc_rtas_sensors_operations;
+
+	entry = create_proc_entry("ppc64/rtas/frequency", S_IWUSR|S_IRUGO,
+				  NULL);
+	if (entry)
+		entry->proc_fops = &ppc_rtas_tone_freq_operations;
+
+	entry = create_proc_entry("ppc64/rtas/volume", S_IWUSR|S_IRUGO, NULL);
+	if (entry)
+		entry->proc_fops = &ppc_rtas_tone_volume_operations;
+
+	entry = create_proc_entry("ppc64/rtas/rmo_buffer", S_IRUSR, NULL);
+	if (entry)
+		entry->proc_fops = &ppc_rtas_rmo_buf_ops;
+
+	return 0;
+}
+
+__initcall(proc_rtas_init);
+
+static int parse_number(const char __user *p, size_t count, unsigned long *val)
+{
+	char buf[40];
+	char *end;
+
+	if (count > 39)
+		return -EINVAL;
+
+	if (copy_from_user(buf, p, count))
+		return -EFAULT;
+
+	buf[count] = 0;
+
+	*val = simple_strtoul(buf, &end, 10);
+	if (*end && *end != '\n')
+		return -EINVAL;
+
+	return 0;
+}
+
+/* ****************************************************************** */
+/* POWER-ON-TIME                                                      */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_poweron_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct rtc_time tm;
+	unsigned long nowtime;
+	int error = parse_number(buf, count, &nowtime);
+	if (error)
+		return error;
+
+	power_on_time = nowtime; /* save the time */
+
+	to_tm(nowtime, &tm);
+
+	error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL, 
+			tm.tm_year, tm.tm_mon, tm.tm_mday, 
+			tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */);
+	if (error)
+		printk(KERN_WARNING "error: setting poweron time returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_poweron_show(struct seq_file *m, void *v)
+{
+	if (power_on_time == 0)
+		seq_printf(m, "Power on time not set\n");
+	else
+		seq_printf(m, "%lu\n",power_on_time);
+	return 0;
+}
+
+/* ****************************************************************** */
+/* PROGRESS                                                           */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_progress_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	unsigned long hex;
+
+	if (count >= MAX_LINELENGTH)
+		count = MAX_LINELENGTH -1;
+	if (copy_from_user(progress_led, buf, count)) { /* save the string */
+		return -EFAULT;
+	}
+	progress_led[count] = 0;
+
+	/* Lets see if the user passed hexdigits */
+	hex = simple_strtoul(progress_led, NULL, 10);
+
+	ppc_md.progress ((char *)progress_led, hex);
+	return count;
+
+	/* clear the line */
+	/* ppc_md.progress("                   ", 0xffff);*/
+}
+/* ****************************************************************** */
+static int ppc_rtas_progress_show(struct seq_file *m, void *v)
+{
+	if (progress_led)
+		seq_printf(m, "%s\n", progress_led);
+	return 0;
+}
+
+/* ****************************************************************** */
+/* CLOCK                                                              */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_clock_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct rtc_time tm;
+	unsigned long nowtime;
+	int error = parse_number(buf, count, &nowtime);
+	if (error)
+		return error;
+
+	to_tm(nowtime, &tm);
+	error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, 
+			tm.tm_year, tm.tm_mon, tm.tm_mday, 
+			tm.tm_hour, tm.tm_min, tm.tm_sec, 0);
+	if (error)
+		printk(KERN_WARNING "error: setting the clock returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_clock_show(struct seq_file *m, void *v)
+{
+	int ret[8];
+	int error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+
+	if (error) {
+		printk(KERN_WARNING "error: reading the clock returned: %s\n", 
+				ppc_rtas_process_error(error));
+		seq_printf(m, "0");
+	} else { 
+		unsigned int year, mon, day, hour, min, sec;
+		year = ret[0]; mon  = ret[1]; day  = ret[2];
+		hour = ret[3]; min  = ret[4]; sec  = ret[5];
+		seq_printf(m, "%lu\n",
+				mktime(year, mon, day, hour, min, sec));
+	}
+	return 0;
+}
+
+/* ****************************************************************** */
+/* SENSOR STUFF                                                       */
+/* ****************************************************************** */
+static int ppc_rtas_sensors_show(struct seq_file *m, void *v)
+{
+	int i,j;
+	int state, error;
+	int get_sensor_state = rtas_token("get-sensor-state");
+
+	seq_printf(m, "RTAS (RunTime Abstraction Services) Sensor Information\n");
+	seq_printf(m, "Sensor\t\tValue\t\tCondition\tLocation\n");
+	seq_printf(m, "********************************************************\n");
+
+	if (ppc_rtas_find_all_sensors() != 0) {
+		seq_printf(m, "\nNo sensors are available\n");
+		return 0;
+	}
+
+	for (i=0; i<sensors.quant; i++) {
+		struct individual_sensor *p = &sensors.sensor[i];
+		char rstr[64];
+		char *loc;
+		int llen, offs;
+
+		sprintf (rstr, SENSOR_PREFIX"%04d", p->token);
+		loc = (char *) get_property(rtas_node, rstr, &llen);
+
+		/* A sensor may have multiple instances */
+		for (j = 0, offs = 0; j <= p->quant; j++) {
+			error =	rtas_call(get_sensor_state, 2, 2, &state, 
+				  	  p->token, j);
+
+			ppc_rtas_process_sensor(m, p, state, error, loc);
+			seq_putc(m, '\n');
+			if (loc) {
+				offs += strlen(loc) + 1;
+				loc += strlen(loc) + 1;
+				if (offs >= llen)
+					loc = NULL;
+			}
+		}
+	}
+	return 0;
+}
+
+/* ****************************************************************** */
+
+static int ppc_rtas_find_all_sensors(void)
+{
+	unsigned int *utmp;
+	int len, i;
+
+	utmp = (unsigned int *) get_property(rtas_node, "rtas-sensors", &len);
+	if (utmp == NULL) {
+		printk (KERN_ERR "error: could not get rtas-sensors\n");
+		return 1;
+	}
+
+	sensors.quant = len / 8;      /* int + int */
+
+	for (i=0; i<sensors.quant; i++) {
+		sensors.sensor[i].token = *utmp++;
+		sensors.sensor[i].quant = *utmp++;
+	}
+	return 0;
+}
+
+/* ****************************************************************** */
+/*
+ * Builds a string of what rtas returned
+ */
+static char *ppc_rtas_process_error(int error)
+{
+	switch (error) {
+		case SENSOR_CRITICAL_HIGH:
+			return "(critical high)";
+		case SENSOR_WARNING_HIGH:
+			return "(warning high)";
+		case SENSOR_NORMAL:
+			return "(normal)";
+		case SENSOR_WARNING_LOW:
+			return "(warning low)";
+		case SENSOR_CRITICAL_LOW:
+			return "(critical low)";
+		case SENSOR_SUCCESS:
+			return "(read ok)";
+		case SENSOR_HW_ERROR:
+			return "(hardware error)";
+		case SENSOR_BUSY:
+			return "(busy)";
+		case SENSOR_NOT_EXIST:
+			return "(non existent)";
+		case SENSOR_DR_ENTITY:
+			return "(dr entity removed)";
+		default:
+			return "(UNKNOWN)";
+	}
+}
+
+/* ****************************************************************** */
+/*
+ * Builds a string out of what the sensor said
+ */
+
+static void ppc_rtas_process_sensor(struct seq_file *m,
+	struct individual_sensor *s, int state, int error, char *loc)
+{
+	/* Defined return vales */
+	const char * key_switch[]        = { "Off\t", "Normal\t", "Secure\t", 
+						"Maintenance" };
+	const char * enclosure_switch[]  = { "Closed", "Open" };
+	const char * lid_status[]        = { " ", "Open", "Closed" };
+	const char * power_source[]      = { "AC\t", "Battery", 
+		  				"AC & Battery" };
+	const char * battery_remaining[] = { "Very Low", "Low", "Mid", "High" };
+	const char * epow_sensor[]       = { 
+		"EPOW Reset", "Cooling warning", "Power warning",
+		"System shutdown", "System halt", "EPOW main enclosure",
+		"EPOW power off" };
+	const char * battery_cyclestate[]  = { "None", "In progress", 
+						"Requested" };
+	const char * battery_charging[]    = { "Charging", "Discharching", 
+						"No current flow" };
+	const char * ibm_drconnector[]     = { "Empty", "Present", "Unusable", 
+						"Exchange" };
+
+	int have_strings = 0;
+	int num_states = 0;
+	int temperature = 0;
+	int unknown = 0;
+
+	/* What kind of sensor do we have here? */
+	
+	switch (s->token) {
+		case KEY_SWITCH:
+			seq_printf(m, "Key switch:\t");
+			num_states = sizeof(key_switch) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", key_switch[state]);
+				have_strings = 1;
+			}
+			break;
+		case ENCLOSURE_SWITCH:
+			seq_printf(m, "Enclosure switch:\t");
+			num_states = sizeof(enclosure_switch) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						enclosure_switch[state]);
+				have_strings = 1;
+			}
+			break;
+		case THERMAL_SENSOR:
+			seq_printf(m, "Temp. (C/F):\t");
+			temperature = 1;
+			break;
+		case LID_STATUS:
+			seq_printf(m, "Lid status:\t");
+			num_states = sizeof(lid_status) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", lid_status[state]);
+				have_strings = 1;
+			}
+			break;
+		case POWER_SOURCE:
+			seq_printf(m, "Power source:\t");
+			num_states = sizeof(power_source) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						power_source[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_VOLTAGE:
+			seq_printf(m, "Battery voltage:\t");
+			break;
+		case BATTERY_REMAINING:
+			seq_printf(m, "Battery remaining:\t");
+			num_states = sizeof(battery_remaining) / sizeof(char *);
+			if (state < num_states)
+			{
+				seq_printf(m, "%s\t", 
+						battery_remaining[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_PERCENTAGE:
+			seq_printf(m, "Battery percentage:\t");
+			break;
+		case EPOW_SENSOR:
+			seq_printf(m, "EPOW Sensor:\t");
+			num_states = sizeof(epow_sensor) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", epow_sensor[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_CYCLESTATE:
+			seq_printf(m, "Battery cyclestate:\t");
+			num_states = sizeof(battery_cyclestate) / 
+				     	sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						battery_cyclestate[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_CHARGING:
+			seq_printf(m, "Battery Charging:\t");
+			num_states = sizeof(battery_charging) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						battery_charging[state]);
+				have_strings = 1;
+			}
+			break;
+		case IBM_SURVEILLANCE:
+			seq_printf(m, "Surveillance:\t");
+			break;
+		case IBM_FANRPM:
+			seq_printf(m, "Fan (rpm):\t");
+			break;
+		case IBM_VOLTAGE:
+			seq_printf(m, "Voltage (mv):\t");
+			break;
+		case IBM_DRCONNECTOR:
+			seq_printf(m, "DR connector:\t");
+			num_states = sizeof(ibm_drconnector) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						ibm_drconnector[state]);
+				have_strings = 1;
+			}
+			break;
+		case IBM_POWERSUPPLY:
+			seq_printf(m, "Powersupply:\t");
+			break;
+		default:
+			seq_printf(m,  "Unknown sensor (type %d), ignoring it\n",
+					s->token);
+			unknown = 1;
+			have_strings = 1;
+			break;
+	}
+	if (have_strings == 0) {
+		if (temperature) {
+			seq_printf(m, "%4d /%4d\t", state, cel_to_fahr(state));
+		} else
+			seq_printf(m, "%10d\t", state);
+	}
+	if (unknown == 0) {
+		seq_printf(m, "%s\t", ppc_rtas_process_error(error));
+		get_location_code(m, s, loc);
+	}
+}
+
+/* ****************************************************************** */
+
+static void check_location(struct seq_file *m, char *c)
+{
+	switch (c[0]) {
+		case LOC_PLANAR:
+			seq_printf(m, "Planar #%c", c[1]);
+			break;
+		case LOC_CPU:
+			seq_printf(m, "CPU #%c", c[1]);
+			break;
+		case LOC_FAN:
+			seq_printf(m, "Fan #%c", c[1]);
+			break;
+		case LOC_RACKMOUNTED:
+			seq_printf(m, "Rack #%c", c[1]);
+			break;
+		case LOC_VOLTAGE:
+			seq_printf(m, "Voltage #%c", c[1]);
+			break;
+		case LOC_LCD:
+			seq_printf(m, "LCD #%c", c[1]);
+			break;
+		case '.':
+			seq_printf(m, "- %c", c[1]);
+			break;
+		default:
+			seq_printf(m, "Unknown location");
+			break;
+	}
+}
+
+
+/* ****************************************************************** */
+/* 
+ * Format: 
+ * ${LETTER}${NUMBER}[[-/]${LETTER}${NUMBER} [ ... ] ]
+ * the '.' may be an abbrevation
+ */
+static void check_location_string(struct seq_file *m, char *c)
+{
+	while (*c) {
+		if (isalpha(*c) || *c == '.')
+			check_location(m, c);
+		else if (*c == '/' || *c == '-')
+			seq_printf(m, " at ");
+		c++;
+	}
+}
+
+
+/* ****************************************************************** */
+
+static void get_location_code(struct seq_file *m, struct individual_sensor *s, char *loc)
+{
+	if (!loc || !*loc) {
+		seq_printf(m, "---");/* does not have a location */
+	} else {
+		check_location_string(m, loc);
+	}
+	seq_putc(m, ' ');
+}
+/* ****************************************************************** */
+/* INDICATORS - Tone Frequency                                        */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_tone_freq_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	unsigned long freq;
+	int error = parse_number(buf, count, &freq);
+	if (error)
+		return error;
+
+	rtas_tone_frequency = freq; /* save it for later */
+	error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL,
+			TONE_FREQUENCY, 0, freq);
+	if (error)
+		printk(KERN_WARNING "error: setting tone frequency returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_tone_freq_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", rtas_tone_frequency);
+	return 0;
+}
+/* ****************************************************************** */
+/* INDICATORS - Tone Volume                                           */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_tone_volume_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	unsigned long volume;
+	int error = parse_number(buf, count, &volume);
+	if (error)
+		return error;
+
+	if (volume > 100)
+		volume = 100;
+	
+        rtas_tone_volume = volume; /* save it for later */
+	error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL,
+			TONE_VOLUME, 0, volume);
+	if (error)
+		printk(KERN_WARNING "error: setting tone volume returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", rtas_tone_volume);
+	return 0;
+}
+
+#define RMO_READ_BUF_MAX 30
+
+/* RTAS Userspace access */
+static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%016lx %x\n", rtas_rmo_buf, RTAS_RMOBUF_MAX);
+	return 0;
+}
diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c
new file mode 100644
index 00000000000..5575603def2
--- /dev/null
+++ b/arch/ppc64/kernel/rtas.c
@@ -0,0 +1,657 @@
+/*
+ *
+ * Procedures for interfacing to the RTAS on CHRP machines.
+ *
+ * Peter Bergner, IBM	March 2001.
+ * Copyright (C) 2001 IBM.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <stdarg.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/semaphore.h>
+#include <asm/machdep.h>
+#include <asm/page.h>
+#include <asm/param.h>
+#include <asm/system.h>
+#include <asm/abs_addr.h>
+#include <asm/udbg.h>
+#include <asm/delay.h>
+#include <asm/uaccess.h>
+#include <asm/systemcfg.h>
+
+struct flash_block_list_header rtas_firmware_flash_list = {0, NULL};
+
+struct rtas_t rtas = { 
+	.lock = SPIN_LOCK_UNLOCKED
+};
+
+EXPORT_SYMBOL(rtas);
+
+char rtas_err_buf[RTAS_ERROR_LOG_MAX];
+
+DEFINE_SPINLOCK(rtas_data_buf_lock);
+char rtas_data_buf[RTAS_DATA_BUF_SIZE]__page_aligned;
+unsigned long rtas_rmo_buf;
+
+void
+call_rtas_display_status(unsigned char c)
+{
+	struct rtas_args *args = &rtas.args;
+	unsigned long s;
+
+	if (!rtas.base)
+		return;
+	spin_lock_irqsave(&rtas.lock, s);
+
+	args->token = 10;
+	args->nargs = 1;
+	args->nret  = 1;
+	args->rets  = (rtas_arg_t *)&(args->args[1]);
+	args->args[0] = (int)c;
+
+	enter_rtas(__pa(args));
+
+	spin_unlock_irqrestore(&rtas.lock, s);
+}
+
+void
+call_rtas_display_status_delay(unsigned char c)
+{
+	static int pending_newline = 0;  /* did last write end with unprinted newline? */
+	static int width = 16;
+
+	if (c == '\n') {	
+		while (width-- > 0)
+			call_rtas_display_status(' ');
+		width = 16;
+		udelay(500000);
+		pending_newline = 1;
+	} else {
+		if (pending_newline) {
+			call_rtas_display_status('\r');
+			call_rtas_display_status('\n');
+		} 
+		pending_newline = 0;
+		if (width--) {
+			call_rtas_display_status(c);
+			udelay(10000);
+		}
+	}
+}
+
+int
+rtas_token(const char *service)
+{
+	int *tokp;
+	if (rtas.dev == NULL) {
+		PPCDBG(PPCDBG_RTAS,"\tNo rtas device in device-tree...\n");
+		return RTAS_UNKNOWN_SERVICE;
+	}
+	tokp = (int *) get_property(rtas.dev, service, NULL);
+	return tokp ? *tokp : RTAS_UNKNOWN_SERVICE;
+}
+
+/*
+ * Return the firmware-specified size of the error log buffer
+ *  for all rtas calls that require an error buffer argument.
+ *  This includes 'check-exception' and 'rtas-last-error'.
+ */
+int rtas_get_error_log_max(void)
+{
+	static int rtas_error_log_max;
+	if (rtas_error_log_max)
+		return rtas_error_log_max;
+
+	rtas_error_log_max = rtas_token ("rtas-error-log-max");
+	if ((rtas_error_log_max == RTAS_UNKNOWN_SERVICE) ||
+	    (rtas_error_log_max > RTAS_ERROR_LOG_MAX)) {
+		printk (KERN_WARNING "RTAS: bad log buffer size %d\n", rtas_error_log_max);
+		rtas_error_log_max = RTAS_ERROR_LOG_MAX;
+	}
+	return rtas_error_log_max;
+}
+
+
+/** Return a copy of the detailed error text associated with the
+ *  most recent failed call to rtas.  Because the error text
+ *  might go stale if there are any other intervening rtas calls,
+ *  this routine must be called atomically with whatever produced
+ *  the error (i.e. with rtas.lock still held from the previous call).
+ */
+static int
+__fetch_rtas_last_error(void)
+{
+	struct rtas_args err_args, save_args;
+	u32 bufsz;
+
+	bufsz = rtas_get_error_log_max();
+
+	err_args.token = rtas_token("rtas-last-error");
+	err_args.nargs = 2;
+	err_args.nret = 1;
+
+	err_args.args[0] = (rtas_arg_t)__pa(rtas_err_buf);
+	err_args.args[1] = bufsz;
+	err_args.args[2] = 0;
+
+	save_args = rtas.args;
+	rtas.args = err_args;
+
+	enter_rtas(__pa(&rtas.args));
+
+	err_args = rtas.args;
+	rtas.args = save_args;
+
+	return err_args.args[2];
+}
+
+int rtas_call(int token, int nargs, int nret, int *outputs, ...)
+{
+	va_list list;
+	int i, logit = 0;
+	unsigned long s;
+	struct rtas_args *rtas_args;
+	char * buff_copy = NULL;
+	int ret;
+
+	PPCDBG(PPCDBG_RTAS, "Entering rtas_call\n");
+	PPCDBG(PPCDBG_RTAS, "\ttoken    = 0x%x\n", token);
+	PPCDBG(PPCDBG_RTAS, "\tnargs    = %d\n", nargs);
+	PPCDBG(PPCDBG_RTAS, "\tnret     = %d\n", nret);
+	PPCDBG(PPCDBG_RTAS, "\t&outputs = 0x%lx\n", outputs);
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -1;
+
+	/* Gotta do something different here, use global lock for now... */
+	spin_lock_irqsave(&rtas.lock, s);
+	rtas_args = &rtas.args;
+
+	rtas_args->token = token;
+	rtas_args->nargs = nargs;
+	rtas_args->nret  = nret;
+	rtas_args->rets  = (rtas_arg_t *)&(rtas_args->args[nargs]);
+	va_start(list, outputs);
+	for (i = 0; i < nargs; ++i) {
+		rtas_args->args[i] = va_arg(list, rtas_arg_t);
+		PPCDBG(PPCDBG_RTAS, "\tnarg[%d] = 0x%x\n", i, rtas_args->args[i]);
+	}
+	va_end(list);
+
+	for (i = 0; i < nret; ++i)
+		rtas_args->rets[i] = 0;
+
+	PPCDBG(PPCDBG_RTAS, "\tentering rtas with 0x%lx\n",
+		__pa(rtas_args));
+	enter_rtas(__pa(rtas_args));
+	PPCDBG(PPCDBG_RTAS, "\treturned from rtas ...\n");
+
+	/* A -1 return code indicates that the last command couldn't
+	   be completed due to a hardware error. */
+	if (rtas_args->rets[0] == -1)
+		logit = (__fetch_rtas_last_error() == 0);
+
+	ifppcdebug(PPCDBG_RTAS) {
+		for(i=0; i < nret ;i++)
+			udbg_printf("\tnret[%d] = 0x%lx\n", i, (ulong)rtas_args->rets[i]);
+	}
+
+	if (nret > 1 && outputs != NULL)
+		for (i = 0; i < nret-1; ++i)
+			outputs[i] = rtas_args->rets[i+1];
+	ret = (nret > 0)? rtas_args->rets[0]: 0;
+
+	/* Log the error in the unlikely case that there was one. */
+	if (unlikely(logit)) {
+		buff_copy = rtas_err_buf;
+		if (mem_init_done) {
+			buff_copy = kmalloc(RTAS_ERROR_LOG_MAX, GFP_ATOMIC);
+			if (buff_copy)
+				memcpy(buff_copy, rtas_err_buf,
+				       RTAS_ERROR_LOG_MAX);
+		}
+	}
+
+	/* Gotta do something different here, use global lock for now... */
+	spin_unlock_irqrestore(&rtas.lock, s);
+
+	if (buff_copy) {
+		log_error(buff_copy, ERR_TYPE_RTAS_LOG, 0);
+		if (mem_init_done)
+			kfree(buff_copy);
+	}
+	return ret;
+}
+
+/* Given an RTAS status code of 990n compute the hinted delay of 10^n
+ * (last digit) milliseconds.  For now we bound at n=5 (100 sec).
+ */
+unsigned int
+rtas_extended_busy_delay_time(int status)
+{
+	int order = status - 9900;
+	unsigned long ms;
+
+	if (order < 0)
+		order = 0;	/* RTC depends on this for -2 clock busy */
+	else if (order > 5)
+		order = 5;	/* bound */
+
+	/* Use microseconds for reasonable accuracy */
+	for (ms=1; order > 0; order--)
+		ms *= 10;
+
+	return ms; 
+}
+
+int rtas_error_rc(int rtas_rc)
+{
+	int rc;
+
+	switch (rtas_rc) {
+		case -1: 		/* Hardware Error */
+			rc = -EIO;
+			break;
+		case -3:		/* Bad indicator/domain/etc */
+			rc = -EINVAL;
+			break;
+		case -9000:		/* Isolation error */
+			rc = -EFAULT;
+			break;
+		case -9001:		/* Outstanding TCE/PTE */
+			rc = -EEXIST;
+			break;
+		case -9002:		/* No usable slot */
+			rc = -ENODEV;
+			break;
+		default:
+			printk(KERN_ERR "%s: unexpected RTAS error %d\n",
+					__FUNCTION__, rtas_rc);
+			rc = -ERANGE;
+			break;
+	}
+	return rc;
+}
+
+int rtas_get_power_level(int powerdomain, int *level)
+{
+	int token = rtas_token("get-power-level");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	while ((rc = rtas_call(token, 1, 2, level, powerdomain)) == RTAS_BUSY)
+		udelay(1);
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+
+int rtas_set_power_level(int powerdomain, int level, int *setlevel)
+{
+	int token = rtas_token("set-power-level");
+	unsigned int wait_time;
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	while (1) {
+		rc = rtas_call(token, 2, 2, setlevel, powerdomain, level);
+		if (rc == RTAS_BUSY)
+			udelay(1);
+		else if (rtas_is_extended_busy(rc)) {
+			wait_time = rtas_extended_busy_delay_time(rc);
+			udelay(wait_time * 1000);
+		} else
+			break;
+	}
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+
+int rtas_get_sensor(int sensor, int index, int *state)
+{
+	int token = rtas_token("get-sensor-state");
+	unsigned int wait_time;
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	while (1) {
+		rc = rtas_call(token, 2, 2, state, sensor, index);
+		if (rc == RTAS_BUSY)
+			udelay(1);
+		else if (rtas_is_extended_busy(rc)) {
+			wait_time = rtas_extended_busy_delay_time(rc);
+			udelay(wait_time * 1000);
+		} else
+			break;
+	}
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+
+int rtas_set_indicator(int indicator, int index, int new_value)
+{
+	int token = rtas_token("set-indicator");
+	unsigned int wait_time;
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	while (1) {
+		rc = rtas_call(token, 3, 1, NULL, indicator, index, new_value);
+		if (rc == RTAS_BUSY)
+			udelay(1);
+		else if (rtas_is_extended_busy(rc)) {
+			wait_time = rtas_extended_busy_delay_time(rc);
+			udelay(wait_time * 1000);
+		}
+		else
+			break;
+	}
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+
+#define FLASH_BLOCK_LIST_VERSION (1UL)
+static void
+rtas_flash_firmware(void)
+{
+	unsigned long image_size;
+	struct flash_block_list *f, *next, *flist;
+	unsigned long rtas_block_list;
+	int i, status, update_token;
+
+	update_token = rtas_token("ibm,update-flash-64-and-reboot");
+	if (update_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ALERT "FLASH: ibm,update-flash-64-and-reboot is not available -- not a service partition?\n");
+		printk(KERN_ALERT "FLASH: firmware will not be flashed\n");
+		return;
+	}
+
+	/* NOTE: the "first" block list is a global var with no data
+	 * blocks in the kernel data segment.  We do this because
+	 * we want to ensure this block_list addr is under 4GB.
+	 */
+	rtas_firmware_flash_list.num_blocks = 0;
+	flist = (struct flash_block_list *)&rtas_firmware_flash_list;
+	rtas_block_list = virt_to_abs(flist);
+	if (rtas_block_list >= 4UL*1024*1024*1024) {
+		printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n");
+		return;
+	}
+
+	printk(KERN_ALERT "FLASH: preparing saved firmware image for flash\n");
+	/* Update the block_list in place. */
+	image_size = 0;
+	for (f = flist; f; f = next) {
+		/* Translate data addrs to absolute */
+		for (i = 0; i < f->num_blocks; i++) {
+			f->blocks[i].data = (char *)virt_to_abs(f->blocks[i].data);
+			image_size += f->blocks[i].length;
+		}
+		next = f->next;
+		/* Don't translate NULL pointer for last entry */
+		if (f->next)
+			f->next = (struct flash_block_list *)virt_to_abs(f->next);
+		else
+			f->next = NULL;
+		/* make num_blocks into the version/length field */
+		f->num_blocks = (FLASH_BLOCK_LIST_VERSION << 56) | ((f->num_blocks+1)*16);
+	}
+
+	printk(KERN_ALERT "FLASH: flash image is %ld bytes\n", image_size);
+	printk(KERN_ALERT "FLASH: performing flash and reboot\n");
+	ppc_md.progress("Flashing        \n", 0x0);
+	ppc_md.progress("Please Wait...  ", 0x0);
+	printk(KERN_ALERT "FLASH: this will take several minutes.  Do not power off!\n");
+	status = rtas_call(update_token, 1, 1, NULL, rtas_block_list);
+	switch (status) {	/* should only get "bad" status */
+	    case 0:
+		printk(KERN_ALERT "FLASH: success\n");
+		break;
+	    case -1:
+		printk(KERN_ALERT "FLASH: hardware error.  Firmware may not be not flashed\n");
+		break;
+	    case -3:
+		printk(KERN_ALERT "FLASH: image is corrupt or not correct for this platform.  Firmware not flashed\n");
+		break;
+	    case -4:
+		printk(KERN_ALERT "FLASH: flash failed when partially complete.  System may not reboot\n");
+		break;
+	    default:
+		printk(KERN_ALERT "FLASH: unknown flash return code %d\n", status);
+		break;
+	}
+}
+
+void rtas_flash_bypass_warning(void)
+{
+	printk(KERN_ALERT "FLASH: firmware flash requires a reboot\n");
+	printk(KERN_ALERT "FLASH: the firmware image will NOT be flashed\n");
+}
+
+
+void
+rtas_restart(char *cmd)
+{
+	if (rtas_firmware_flash_list.next)
+		rtas_flash_firmware();
+
+	printk("RTAS system-reboot returned %d\n",
+	       rtas_call(rtas_token("system-reboot"), 0, 1, NULL));
+	for (;;);
+}
+
+void
+rtas_power_off(void)
+{
+	if (rtas_firmware_flash_list.next)
+		rtas_flash_bypass_warning();
+	/* allow power on only with power button press */
+	printk("RTAS power-off returned %d\n",
+	       rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1));
+	for (;;);
+}
+
+void
+rtas_halt(void)
+{
+	if (rtas_firmware_flash_list.next)
+		rtas_flash_bypass_warning();
+	rtas_power_off();
+}
+
+/* Must be in the RMO region, so we place it here */
+static char rtas_os_term_buf[2048];
+
+void rtas_os_term(char *str)
+{
+	int status;
+
+	if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term"))
+		return;
+
+	snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str);
+
+	do {
+		status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL,
+				   __pa(rtas_os_term_buf));
+
+		if (status == RTAS_BUSY)
+			udelay(1);
+		else if (status != 0)
+			printk(KERN_EMERG "ibm,os-term call failed %d\n",
+			       status);
+	} while (status == RTAS_BUSY);
+}
+
+
+asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
+{
+	struct rtas_args args;
+	unsigned long flags;
+	char * buff_copy;
+	int nargs;
+	int err_rc = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&args, uargs, 3 * sizeof(u32)) != 0)
+		return -EFAULT;
+
+	nargs = args.nargs;
+	if (nargs > ARRAY_SIZE(args.args)
+	    || args.nret > ARRAY_SIZE(args.args)
+	    || nargs + args.nret > ARRAY_SIZE(args.args))
+		return -EINVAL;
+
+	/* Copy in args. */
+	if (copy_from_user(args.args, uargs->args,
+			   nargs * sizeof(rtas_arg_t)) != 0)
+		return -EFAULT;
+
+	buff_copy = kmalloc(RTAS_ERROR_LOG_MAX, GFP_KERNEL);
+
+	spin_lock_irqsave(&rtas.lock, flags);
+
+	rtas.args = args;
+	enter_rtas(__pa(&rtas.args));
+	args = rtas.args;
+
+	args.rets = &args.args[nargs];
+
+	/* A -1 return code indicates that the last command couldn't
+	   be completed due to a hardware error. */
+	if (args.rets[0] == -1) {
+		err_rc = __fetch_rtas_last_error();
+		if ((err_rc == 0) && buff_copy) {
+			memcpy(buff_copy, rtas_err_buf, RTAS_ERROR_LOG_MAX);
+		}
+	}
+
+	spin_unlock_irqrestore(&rtas.lock, flags);
+
+	if (buff_copy) {
+		if ((args.rets[0] == -1) && (err_rc == 0)) {
+			log_error(buff_copy, ERR_TYPE_RTAS_LOG, 0);
+		}
+		kfree(buff_copy);
+	}
+
+	/* Copy out args. */
+	if (copy_to_user(uargs->args + nargs,
+			 args.args + nargs,
+			 args.nret * sizeof(rtas_arg_t)) != 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+/* This version can't take the spinlock, because it never returns */
+
+struct rtas_args rtas_stop_self_args = {
+	/* The token is initialized for real in setup_system() */
+	.token = RTAS_UNKNOWN_SERVICE,
+	.nargs = 0,
+	.nret = 1,
+	.rets = &rtas_stop_self_args.args[0],
+};
+
+void rtas_stop_self(void)
+{
+	struct rtas_args *rtas_args = &rtas_stop_self_args;
+
+	local_irq_disable();
+
+	BUG_ON(rtas_args->token == RTAS_UNKNOWN_SERVICE);
+
+	printk("cpu %u (hwid %u) Ready to die...\n",
+	       smp_processor_id(), hard_smp_processor_id());
+	enter_rtas(__pa(rtas_args));
+
+	panic("Alas, I survived.\n");
+}
+
+/*
+ * Call early during boot, before mem init or bootmem, to retreive the RTAS
+ * informations from the device-tree and allocate the RMO buffer for userland
+ * accesses.
+ */
+void __init rtas_initialize(void)
+{
+	/* Get RTAS dev node and fill up our "rtas" structure with infos
+	 * about it.
+	 */
+	rtas.dev = of_find_node_by_name(NULL, "rtas");
+	if (rtas.dev) {
+		u32 *basep, *entryp;
+		u32 *sizep;
+
+		basep = (u32 *)get_property(rtas.dev, "linux,rtas-base", NULL);
+		sizep = (u32 *)get_property(rtas.dev, "rtas-size", NULL);
+		if (basep != NULL && sizep != NULL) {
+			rtas.base = *basep;
+			rtas.size = *sizep;
+			entryp = (u32 *)get_property(rtas.dev, "linux,rtas-entry", NULL);
+			if (entryp == NULL) /* Ugh */
+				rtas.entry = rtas.base;
+			else
+				rtas.entry = *entryp;
+		} else
+			rtas.dev = NULL;
+	}
+	/* If RTAS was found, allocate the RMO buffer for it and look for
+	 * the stop-self token if any
+	 */
+	if (rtas.dev) {
+		unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
+		if (systemcfg->platform == PLATFORM_PSERIES_LPAR)
+			rtas_region = min(lmb.rmo_size, RTAS_INSTANTIATE_MAX);
+
+		rtas_rmo_buf = lmb_alloc_base(RTAS_RMOBUF_MAX, PAGE_SIZE,
+							rtas_region);
+
+#ifdef CONFIG_HOTPLUG_CPU
+		rtas_stop_self_args.token = rtas_token("stop-self");
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+
+}
+
+
+EXPORT_SYMBOL(rtas_firmware_flash_list);
+EXPORT_SYMBOL(rtas_token);
+EXPORT_SYMBOL(rtas_call);
+EXPORT_SYMBOL(rtas_data_buf);
+EXPORT_SYMBOL(rtas_data_buf_lock);
+EXPORT_SYMBOL(rtas_extended_busy_delay_time);
+EXPORT_SYMBOL(rtas_get_sensor);
+EXPORT_SYMBOL(rtas_get_power_level);
+EXPORT_SYMBOL(rtas_set_power_level);
+EXPORT_SYMBOL(rtas_set_indicator);
+EXPORT_SYMBOL(rtas_get_error_log_max);
diff --git a/arch/ppc64/kernel/rtas_flash.c b/arch/ppc64/kernel/rtas_flash.c
new file mode 100644
index 00000000000..3213837282c
--- /dev/null
+++ b/arch/ppc64/kernel/rtas_flash.c
@@ -0,0 +1,725 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * /proc/ppc64/rtas/firmware_flash interface
+ *
+ * This file implements a firmware_flash interface to pump a firmware
+ * image into the kernel.  At reboot time rtas_restart() will see the
+ * firmware image and flash it as it reboots (see rtas.c).
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <asm/delay.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "rtas_flash"
+
+#define FIRMWARE_FLASH_NAME "firmware_flash"   
+#define FIRMWARE_UPDATE_NAME "firmware_update"
+#define MANAGE_FLASH_NAME "manage_flash"
+#define VALIDATE_FLASH_NAME "validate_flash"
+
+/* General RTAS Status Codes */
+#define RTAS_RC_SUCCESS  0
+#define RTAS_RC_HW_ERR	-1
+#define RTAS_RC_BUSY	-2
+
+/* Flash image status values */
+#define FLASH_AUTH           -9002 /* RTAS Not Service Authority Partition */
+#define FLASH_NO_OP          -1099 /* No operation initiated by user */	
+#define FLASH_IMG_SHORT	     -1005 /* Flash image shorter than expected */
+#define FLASH_IMG_BAD_LEN    -1004 /* Bad length value in flash list block */
+#define FLASH_IMG_NULL_DATA  -1003 /* Bad data value in flash list block */
+#define FLASH_IMG_READY      0     /* Firmware img ready for flash on reboot */
+
+/* Manage image status values */
+#define MANAGE_AUTH          -9002 /* RTAS Not Service Authority Partition */
+#define MANAGE_ACTIVE_ERR    -9001 /* RTAS Cannot Overwrite Active Img */
+#define MANAGE_NO_OP         -1099 /* No operation initiated by user */
+#define MANAGE_PARAM_ERR     -3    /* RTAS Parameter Error */
+#define MANAGE_HW_ERR        -1    /* RTAS Hardware Error */
+
+/* Validate image status values */
+#define VALIDATE_AUTH          -9002 /* RTAS Not Service Authority Partition */
+#define VALIDATE_NO_OP         -1099 /* No operation initiated by the user */
+#define VALIDATE_INCOMPLETE    -1002 /* User copied < VALIDATE_BUF_SIZE */
+#define VALIDATE_READY	       -1001 /* Firmware image ready for validation */
+#define VALIDATE_PARAM_ERR     -3    /* RTAS Parameter Error */
+#define VALIDATE_HW_ERR        -1    /* RTAS Hardware Error */
+#define VALIDATE_TMP_UPDATE    0     /* Validate Return Status */
+#define VALIDATE_FLASH_AUTH    1     /* Validate Return Status */
+#define VALIDATE_INVALID_IMG   2     /* Validate Return Status */
+#define VALIDATE_CUR_UNKNOWN   3     /* Validate Return Status */
+#define VALIDATE_TMP_COMMIT_DL 4     /* Validate Return Status */
+#define VALIDATE_TMP_COMMIT    5     /* Validate Return Status */
+#define VALIDATE_TMP_UPDATE_DL 6     /* Validate Return Status */
+
+/* ibm,manage-flash-image operation tokens */
+#define RTAS_REJECT_TMP_IMG   0
+#define RTAS_COMMIT_TMP_IMG   1
+
+/* Array sizes */
+#define VALIDATE_BUF_SIZE 4096    
+#define RTAS_MSG_MAXLEN   64
+
+/* Local copy of the flash block list.
+ * We only allow one open of the flash proc file and create this
+ * list as we go.  This list will be put in the kernel's
+ * rtas_firmware_flash_list global var once it is fully read.
+ *
+ * For convenience as we build the list we use virtual addrs,
+ * we do not fill in the version number, and the length field
+ * is treated as the number of entries currently in the block
+ * (i.e. not a byte count).  This is all fixed on release.
+ */
+
+/* Status int must be first member of struct */
+struct rtas_update_flash_t
+{
+	int status;			/* Flash update status */
+	struct flash_block_list *flist; /* Local copy of flash block list */
+};
+
+/* Status int must be first member of struct */
+struct rtas_manage_flash_t
+{
+	int status;			/* Returned status */
+	unsigned int op;		/* Reject or commit image */
+};
+
+/* Status int must be first member of struct */
+struct rtas_validate_flash_t
+{
+	int status;		 	/* Returned status */	
+	char buf[VALIDATE_BUF_SIZE]; 	/* Candidate image buffer */
+	unsigned int buf_size;		/* Size of image buf */
+	unsigned int update_results;	/* Update results token */
+};
+
+static DEFINE_SPINLOCK(flash_file_open_lock);
+static struct proc_dir_entry *firmware_flash_pde;
+static struct proc_dir_entry *firmware_update_pde;
+static struct proc_dir_entry *validate_pde;
+static struct proc_dir_entry *manage_pde;
+
+/* Do simple sanity checks on the flash image. */
+static int flash_list_valid(struct flash_block_list *flist)
+{
+	struct flash_block_list *f;
+	int i;
+	unsigned long block_size, image_size;
+
+	/* Paranoid self test here.  We also collect the image size. */
+	image_size = 0;
+	for (f = flist; f; f = f->next) {
+		for (i = 0; i < f->num_blocks; i++) {
+			if (f->blocks[i].data == NULL) {
+				return FLASH_IMG_NULL_DATA;
+			}
+			block_size = f->blocks[i].length;
+			if (block_size <= 0 || block_size > PAGE_SIZE) {
+				return FLASH_IMG_BAD_LEN;
+			}
+			image_size += block_size;
+		}
+	}
+
+	if (image_size < (256 << 10)) {
+		if (image_size < 2) 
+			return FLASH_NO_OP;
+	}
+
+	printk(KERN_INFO "FLASH: flash image with %ld bytes stored for hardware flash on reboot\n", image_size);
+
+	return FLASH_IMG_READY;
+}
+
+static void free_flash_list(struct flash_block_list *f)
+{
+	struct flash_block_list *next;
+	int i;
+
+	while (f) {
+		for (i = 0; i < f->num_blocks; i++)
+			free_page((unsigned long)(f->blocks[i].data));
+		next = f->next;
+		free_page((unsigned long)f);
+		f = next;
+	}
+}
+
+static int rtas_flash_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_update_flash_t *uf;
+	
+	uf = (struct rtas_update_flash_t *) dp->data;
+	if (uf->flist) {    
+		/* File was opened in write mode for a new flash attempt */
+		/* Clear saved list */
+		if (rtas_firmware_flash_list.next) {
+			free_flash_list(rtas_firmware_flash_list.next);
+			rtas_firmware_flash_list.next = NULL;
+		}
+
+		if (uf->status != FLASH_AUTH)  
+			uf->status = flash_list_valid(uf->flist);
+
+		if (uf->status == FLASH_IMG_READY) 
+			rtas_firmware_flash_list.next = uf->flist;
+		else
+			free_flash_list(uf->flist);
+
+		uf->flist = NULL;
+	}
+
+	atomic_dec(&dp->count);
+	return 0;
+}
+
+static void get_flash_status_msg(int status, char *buf)
+{
+	char *msg;
+
+	switch (status) {
+	case FLASH_AUTH:
+		msg = "error: this partition does not have service authority\n";
+		break;
+	case FLASH_NO_OP:
+		msg = "info: no firmware image for flash\n";
+		break;
+	case FLASH_IMG_SHORT:
+		msg = "error: flash image short\n";
+		break;
+	case FLASH_IMG_BAD_LEN:
+		msg = "error: internal error bad length\n";
+		break;
+	case FLASH_IMG_NULL_DATA:
+		msg = "error: internal error null data\n";
+		break;
+	case FLASH_IMG_READY:
+		msg = "ready: firmware image ready for flash on reboot\n";
+		break;
+	default:
+		sprintf(buf, "error: unexpected status value %d\n", status);
+		return;
+	}
+
+	strcpy(buf, msg);	
+}
+
+/* Reading the proc file will show status (not the firmware contents) */
+static ssize_t rtas_flash_read(struct file *file, char *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_update_flash_t *uf;
+	char msg[RTAS_MSG_MAXLEN];
+	int msglen;
+
+	uf = (struct rtas_update_flash_t *) dp->data;
+
+	if (!strcmp(dp->name, FIRMWARE_FLASH_NAME)) {
+		get_flash_status_msg(uf->status, msg);
+	} else {	   /* FIRMWARE_UPDATE_NAME */
+		sprintf(msg, "%d\n", uf->status);
+	}
+	msglen = strlen(msg);
+	if (msglen > count)
+		msglen = count;
+
+	if (ppos && *ppos != 0)
+		return 0;	/* be cheap */
+
+	if (!access_ok(VERIFY_WRITE, buf, msglen))
+		return -EINVAL;
+
+	if (copy_to_user(buf, msg, msglen))
+		return -EFAULT;
+
+	if (ppos)
+		*ppos = msglen;
+	return msglen;
+}
+
+/* We could be much more efficient here.  But to keep this function
+ * simple we allocate a page to the block list no matter how small the
+ * count is.  If the system is low on memory it will be just as well
+ * that we fail....
+ */
+static ssize_t rtas_flash_write(struct file *file, const char *buffer,
+				size_t count, loff_t *off)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_update_flash_t *uf;
+	char *p;
+	int next_free;
+	struct flash_block_list *fl;
+
+	uf = (struct rtas_update_flash_t *) dp->data;
+
+	if (uf->status == FLASH_AUTH || count == 0)
+		return count;	/* discard data */
+
+	/* In the case that the image is not ready for flashing, the memory
+	 * allocated for the block list will be freed upon the release of the 
+	 * proc file
+	 */
+	if (uf->flist == NULL) {
+		uf->flist = (struct flash_block_list *) get_zeroed_page(GFP_KERNEL);
+		if (!uf->flist)
+			return -ENOMEM;
+	}
+
+	fl = uf->flist;
+	while (fl->next)
+		fl = fl->next; /* seek to last block_list for append */
+	next_free = fl->num_blocks;
+	if (next_free == FLASH_BLOCKS_PER_NODE) {
+		/* Need to allocate another block_list */
+		fl->next = (struct flash_block_list *)get_zeroed_page(GFP_KERNEL);
+		if (!fl->next)
+			return -ENOMEM;
+		fl = fl->next;
+		next_free = 0;
+	}
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+	p = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	
+	if(copy_from_user(p, buffer, count)) {
+		free_page((unsigned long)p);
+		return -EFAULT;
+	}
+	fl->blocks[next_free].data = p;
+	fl->blocks[next_free].length = count;
+	fl->num_blocks++;
+
+	return count;
+}
+
+static int rtas_excl_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+
+	/* Enforce exclusive open with use count of PDE */
+	spin_lock(&flash_file_open_lock);
+	if (atomic_read(&dp->count) > 1) {
+		spin_unlock(&flash_file_open_lock);
+		return -EBUSY;
+	}
+
+	atomic_inc(&dp->count);
+	spin_unlock(&flash_file_open_lock);
+	
+	return 0;
+}
+
+static int rtas_excl_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+
+	atomic_dec(&dp->count);
+
+	return 0;
+}
+
+static void manage_flash(struct rtas_manage_flash_t *args_buf)
+{
+	unsigned int wait_time;
+	s32 rc;
+
+	while (1) {
+		rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 
+			       1, NULL, args_buf->op);
+		if (rc == RTAS_RC_BUSY)
+			udelay(1);
+		else if (rtas_is_extended_busy(rc)) {
+			wait_time = rtas_extended_busy_delay_time(rc);
+			udelay(wait_time * 1000);
+		} else
+			break;
+	}
+
+	args_buf->status = rc;
+}
+
+static ssize_t manage_flash_read(struct file *file, char *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_manage_flash_t *args_buf;
+	char msg[RTAS_MSG_MAXLEN];
+	int msglen;
+
+	args_buf = (struct rtas_manage_flash_t *) dp->data;
+	if (args_buf == NULL)
+		return 0;
+
+	msglen = sprintf(msg, "%d\n", args_buf->status);
+	if (msglen > count)
+		msglen = count;
+
+	if (ppos && *ppos != 0)
+		return 0;	/* be cheap */
+
+	if (!access_ok(VERIFY_WRITE, buf, msglen))
+		return -EINVAL;
+
+	if (copy_to_user(buf, msg, msglen))
+		return -EFAULT;
+
+	if (ppos)
+		*ppos = msglen;
+	return msglen;
+}
+
+static ssize_t manage_flash_write(struct file *file, const char *buf,
+				size_t count, loff_t *off)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_manage_flash_t *args_buf;
+	const char reject_str[] = "0";
+	const char commit_str[] = "1";
+	char stkbuf[10];
+	int op;
+
+	args_buf = (struct rtas_manage_flash_t *) dp->data;
+	if ((args_buf->status == MANAGE_AUTH) || (count == 0))
+		return count;
+		
+	op = -1;
+	if (buf) {
+		if (count > 9) count = 9;
+		if (copy_from_user (stkbuf, buf, count)) {
+			return -EFAULT;
+		}
+		if (strncmp(stkbuf, reject_str, strlen(reject_str)) == 0) 
+			op = RTAS_REJECT_TMP_IMG;
+		else if (strncmp(stkbuf, commit_str, strlen(commit_str)) == 0) 
+			op = RTAS_COMMIT_TMP_IMG;
+	}
+	
+	if (op == -1)   /* buf is empty, or contains invalid string */
+		return -EINVAL;
+
+	args_buf->op = op;
+	manage_flash(args_buf);
+
+	return count;
+}
+
+static void validate_flash(struct rtas_validate_flash_t *args_buf)
+{
+	int token = rtas_token("ibm,validate-flash-image");
+	unsigned int wait_time;
+	int update_results;
+	s32 rc;	
+
+	rc = 0;
+	while(1) {
+		spin_lock(&rtas_data_buf_lock);
+		memcpy(rtas_data_buf, args_buf->buf, VALIDATE_BUF_SIZE);
+		rc = rtas_call(token, 2, 2, &update_results, 
+			       (u32) __pa(rtas_data_buf), args_buf->buf_size);
+		memcpy(args_buf->buf, rtas_data_buf, VALIDATE_BUF_SIZE);
+		spin_unlock(&rtas_data_buf_lock);
+			
+		if (rc == RTAS_RC_BUSY)
+			udelay(1);
+		else if (rtas_is_extended_busy(rc)) {
+			wait_time = rtas_extended_busy_delay_time(rc);
+			udelay(wait_time * 1000);
+		} else
+			break;
+	}
+
+	args_buf->status = rc;
+	args_buf->update_results = update_results;
+}
+
+static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, 
+		                   char *msg)
+{
+	int n;
+
+	if (args_buf->status >= VALIDATE_TMP_UPDATE) { 
+		n = sprintf(msg, "%d\n", args_buf->update_results);
+		if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) ||
+		    (args_buf->update_results == VALIDATE_TMP_UPDATE))
+			n += sprintf(msg + n, "%s\n", args_buf->buf);
+	} else {
+		n = sprintf(msg, "%d\n", args_buf->status);
+	}
+	return n;
+}
+
+static ssize_t validate_flash_read(struct file *file, char *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_validate_flash_t *args_buf;
+	char msg[RTAS_MSG_MAXLEN];
+	int msglen;
+
+	args_buf = (struct rtas_validate_flash_t *) dp->data;
+
+	if (ppos && *ppos != 0)
+		return 0;	/* be cheap */
+	
+	msglen = get_validate_flash_msg(args_buf, msg);
+	if (msglen > count)
+		msglen = count;
+
+	if (!access_ok(VERIFY_WRITE, buf, msglen))
+		return -EINVAL;
+
+	if (copy_to_user(buf, msg, msglen))
+		return -EFAULT;
+
+	if (ppos)
+		*ppos = msglen;
+	return msglen;
+}
+
+static ssize_t validate_flash_write(struct file *file, const char *buf,
+				    size_t count, loff_t *off)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_validate_flash_t *args_buf;
+	int rc;
+
+	args_buf = (struct rtas_validate_flash_t *) dp->data;
+
+	if (dp->data == NULL) {
+		dp->data = kmalloc(sizeof(struct rtas_validate_flash_t), 
+				GFP_KERNEL);
+		if (dp->data == NULL) 
+			return -ENOMEM;
+	}
+
+	/* We are only interested in the first 4K of the
+	 * candidate image */
+	if ((*off >= VALIDATE_BUF_SIZE) || 
+		(args_buf->status == VALIDATE_AUTH)) {
+		*off += count;
+		return count;
+	}
+
+	if (*off + count >= VALIDATE_BUF_SIZE)  {
+		count = VALIDATE_BUF_SIZE - *off;
+		args_buf->status = VALIDATE_READY;	
+	} else {
+		args_buf->status = VALIDATE_INCOMPLETE;
+	}
+
+	if (!access_ok(VERIFY_READ, buf, count)) {
+		rc = -EFAULT;
+		goto done;
+	}
+	if (copy_from_user(args_buf->buf + *off, buf, count)) {
+		rc = -EFAULT;
+		goto done;
+	}
+
+	*off += count;
+	rc = count;
+done:
+	if (rc < 0) {
+		kfree(dp->data);
+		dp->data = NULL;
+	}
+	return rc;
+}
+
+static int validate_flash_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode);
+	struct rtas_validate_flash_t *args_buf;
+
+	args_buf = (struct rtas_validate_flash_t *) dp->data;
+
+	if (args_buf->status == VALIDATE_READY) {
+		args_buf->buf_size = VALIDATE_BUF_SIZE;
+		validate_flash(args_buf);
+	}
+
+	/* The matching atomic_inc was in rtas_excl_open() */
+	atomic_dec(&dp->count);
+
+	return 0;
+}
+
+static void remove_flash_pde(struct proc_dir_entry *dp)
+{
+	if (dp) {
+		if (dp->data != NULL)
+			kfree(dp->data);
+		dp->owner = NULL;
+		remove_proc_entry(dp->name, dp->parent);
+	}
+}
+
+static int initialize_flash_pde_data(const char *rtas_call_name,
+				     size_t buf_size,
+				     struct proc_dir_entry *dp)
+{
+	int *status;
+	int token;
+
+	dp->data = kmalloc(buf_size, GFP_KERNEL);
+	if (dp->data == NULL) {
+		remove_flash_pde(dp);
+		return -ENOMEM;
+	}
+
+	memset(dp->data, 0, buf_size);
+
+	/*
+	 * This code assumes that the status int is the first member of the
+	 * struct 
+	 */
+	status = (int *) dp->data;
+	token = rtas_token(rtas_call_name);
+	if (token == RTAS_UNKNOWN_SERVICE)
+		*status = FLASH_AUTH;
+	else
+		*status = FLASH_NO_OP;
+
+	return 0;
+}
+
+static struct proc_dir_entry *create_flash_pde(const char *filename,
+					       struct file_operations *fops)
+{
+	struct proc_dir_entry *ent = NULL;
+
+	ent = create_proc_entry(filename, S_IRUSR | S_IWUSR, NULL);
+	if (ent != NULL) {
+		ent->nlink = 1;
+		ent->proc_fops = fops;
+		ent->owner = THIS_MODULE;
+	}
+
+	return ent;
+}
+
+static struct file_operations rtas_flash_operations = {
+	.read		= rtas_flash_read,
+	.write		= rtas_flash_write,
+	.open		= rtas_excl_open,
+	.release	= rtas_flash_release,
+};
+
+static struct file_operations manage_flash_operations = {
+	.read		= manage_flash_read,
+	.write		= manage_flash_write,
+	.open		= rtas_excl_open,
+	.release	= rtas_excl_release,
+};
+
+static struct file_operations validate_flash_operations = {
+	.read		= validate_flash_read,
+	.write		= validate_flash_write,
+	.open		= rtas_excl_open,
+	.release	= validate_flash_release,
+};
+
+int __init rtas_flash_init(void)
+{
+	int rc;
+
+	if (rtas_token("ibm,update-flash-64-and-reboot") ==
+		       RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ERR "rtas_flash: no firmware flash support\n");
+		return 1;
+	}
+
+	firmware_flash_pde = create_flash_pde("ppc64/rtas/"
+					      FIRMWARE_FLASH_NAME,
+					      &rtas_flash_operations);
+	if (firmware_flash_pde == NULL) {
+		rc = -ENOMEM;
+		goto cleanup;
+	}
+
+	rc = initialize_flash_pde_data("ibm,update-flash-64-and-reboot",
+			 	       sizeof(struct rtas_update_flash_t), 
+				       firmware_flash_pde);
+	if (rc != 0)
+		goto cleanup;
+
+	firmware_update_pde = create_flash_pde("ppc64/rtas/"
+					       FIRMWARE_UPDATE_NAME,
+					       &rtas_flash_operations);
+	if (firmware_update_pde == NULL) {
+		rc = -ENOMEM;
+		goto cleanup;
+	}
+
+	rc = initialize_flash_pde_data("ibm,update-flash-64-and-reboot",
+			 	       sizeof(struct rtas_update_flash_t), 
+				       firmware_update_pde);
+	if (rc != 0)
+		goto cleanup;
+
+	validate_pde = create_flash_pde("ppc64/rtas/" VALIDATE_FLASH_NAME,
+			      		&validate_flash_operations);
+	if (validate_pde == NULL) {
+		rc = -ENOMEM;
+		goto cleanup;
+	}
+
+	rc = initialize_flash_pde_data("ibm,validate-flash-image",
+		                       sizeof(struct rtas_validate_flash_t), 
+				       validate_pde);
+	if (rc != 0)
+		goto cleanup;
+
+	manage_pde = create_flash_pde("ppc64/rtas/" MANAGE_FLASH_NAME,
+				      &manage_flash_operations);
+	if (manage_pde == NULL) {
+		rc = -ENOMEM;
+		goto cleanup;
+	}
+
+	rc = initialize_flash_pde_data("ibm,manage-flash-image",
+			               sizeof(struct rtas_manage_flash_t),
+				       manage_pde);
+	if (rc != 0)
+		goto cleanup;
+
+	return 0;
+
+cleanup:
+	remove_flash_pde(firmware_flash_pde);
+	remove_flash_pde(firmware_update_pde);
+	remove_flash_pde(validate_pde);
+	remove_flash_pde(manage_pde);
+
+	return rc;
+}
+
+void __exit rtas_flash_cleanup(void)
+{
+	remove_flash_pde(firmware_flash_pde);
+	remove_flash_pde(firmware_update_pde);
+	remove_flash_pde(validate_pde);
+	remove_flash_pde(manage_pde);
+}
+
+module_init(rtas_flash_init);
+module_exit(rtas_flash_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/ppc64/kernel/rtasd.c b/arch/ppc64/kernel/rtasd.c
new file mode 100644
index 00000000000..ff65dc33320
--- /dev/null
+++ b/arch/ppc64/kernel/rtasd.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Communication to userspace based on kernel/printk.c
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/nvram.h>
+#include <asm/atomic.h>
+#include <asm/systemcfg.h>
+
+#if 0
+#define DEBUG(A...)	printk(KERN_ERR A)
+#else
+#define DEBUG(A...)
+#endif
+
+static DEFINE_SPINLOCK(rtasd_log_lock);
+
+DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
+
+static char *rtas_log_buf;
+static unsigned long rtas_log_start;
+static unsigned long rtas_log_size;
+
+static int surveillance_timeout = -1;
+static unsigned int rtas_event_scan_rate;
+static unsigned int rtas_error_log_max;
+static unsigned int rtas_error_log_buffer_max;
+
+static int full_rtas_msgs = 0;
+
+extern int no_logging;
+
+volatile int error_log_cnt = 0;
+
+/*
+ * Since we use 32 bit RTAS, the physical address of this must be below
+ * 4G or else bad things happen. Allocate this in the kernel data and
+ * make it big enough.
+ */
+static unsigned char logdata[RTAS_ERROR_LOG_MAX];
+
+static int get_eventscan_parms(void);
+
+static char *rtas_type[] = {
+	"Unknown", "Retry", "TCE Error", "Internal Device Failure",
+	"Timeout", "Data Parity", "Address Parity", "Cache Parity",
+	"Address Invalid", "ECC Uncorrected", "ECC Corrupted",
+};
+
+static char *rtas_event_type(int type)
+{
+	if ((type > 0) && (type < 11))
+		return rtas_type[type];
+
+	switch (type) {
+		case RTAS_TYPE_EPOW:
+			return "EPOW";
+		case RTAS_TYPE_PLATFORM:
+			return "Platform Error";
+		case RTAS_TYPE_IO:
+			return "I/O Event";
+		case RTAS_TYPE_INFO:
+			return "Platform Information Event";
+		case RTAS_TYPE_DEALLOC:
+			return "Resource Deallocation Event";
+		case RTAS_TYPE_DUMP:
+			return "Dump Notification Event";
+	}
+
+	return rtas_type[0];
+}
+
+/* To see this info, grep RTAS /var/log/messages and each entry
+ * will be collected together with obvious begin/end.
+ * There will be a unique identifier on the begin and end lines.
+ * This will persist across reboots.
+ *
+ * format of error logs returned from RTAS:
+ * bytes	(size)	: contents
+ * --------------------------------------------------------
+ * 0-7		(8)	: rtas_error_log
+ * 8-47		(40)	: extended info
+ * 48-51	(4)	: vendor id
+ * 52-1023 (vendor specific) : location code and debug data
+ */
+static void printk_log_rtas(char *buf, int len)
+{
+
+	int i,j,n = 0;
+	int perline = 16;
+	char buffer[64];
+	char * str = "RTAS event";
+
+	if (full_rtas_msgs) {
+		printk(RTAS_DEBUG "%d -------- %s begin --------\n",
+		       error_log_cnt, str);
+
+		/*
+		 * Print perline bytes on each line, each line will start
+		 * with RTAS and a changing number, so syslogd will
+		 * print lines that are otherwise the same.  Separate every
+		 * 4 bytes with a space.
+		 */
+		for (i = 0; i < len; i++) {
+			j = i % perline;
+			if (j == 0) {
+				memset(buffer, 0, sizeof(buffer));
+				n = sprintf(buffer, "RTAS %d:", i/perline);
+			}
+
+			if ((i % 4) == 0)
+				n += sprintf(buffer+n, " ");
+
+			n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
+
+			if (j == (perline-1))
+				printk(KERN_DEBUG "%s\n", buffer);
+		}
+		if ((i % perline) != 0)
+			printk(KERN_DEBUG "%s\n", buffer);
+
+		printk(RTAS_DEBUG "%d -------- %s end ----------\n",
+		       error_log_cnt, str);
+	} else {
+		struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
+
+		printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
+		       error_log_cnt, rtas_event_type(errlog->type),
+		       errlog->severity);
+	}
+}
+
+static int log_rtas_len(char * buf)
+{
+	int len;
+	struct rtas_error_log *err;
+
+	/* rtas fixed header */
+	len = 8;
+	err = (struct rtas_error_log *)buf;
+	if (err->extended_log_length) {
+
+		/* extended header */
+		len += err->extended_log_length;
+	}
+
+	if (rtas_error_log_max == 0) {
+		get_eventscan_parms();
+	}
+	if (len > rtas_error_log_max)
+		len = rtas_error_log_max;
+
+	return len;
+}
+
+/*
+ * First write to nvram, if fatal error, that is the only
+ * place we log the info.  The error will be picked up
+ * on the next reboot by rtasd.  If not fatal, run the
+ * method for the type of error.  Currently, only RTAS
+ * errors have methods implemented, but in the future
+ * there might be a need to store data in nvram before a
+ * call to panic().
+ *
+ * XXX We write to nvram periodically, to indicate error has
+ * been written and sync'd, but there is a possibility
+ * that if we don't shutdown correctly, a duplicate error
+ * record will be created on next reboot.
+ */
+void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
+{
+	unsigned long offset;
+	unsigned long s;
+	int len = 0;
+
+	DEBUG("logging event\n");
+	if (buf == NULL)
+		return;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+
+	/* get length and increase count */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		len = log_rtas_len(buf);
+		if (!(err_type & ERR_FLAG_BOOT))
+			error_log_cnt++;
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+	/* Write error to NVRAM */
+	if (!no_logging && !(err_type & ERR_FLAG_BOOT))
+		nvram_write_error_log(buf, len, err_type);
+
+	/*
+	 * rtas errors can occur during boot, and we do want to capture
+	 * those somewhere, even if nvram isn't ready (why not?), and even
+	 * if rtasd isn't ready. Put them into the boot log, at least.
+	 */
+	if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
+		printk_log_rtas(buf, len);
+
+	/* Check to see if we need to or have stopped logging */
+	if (fatal || no_logging) {
+		no_logging = 1;
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+	/* call type specific method for error */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		offset = rtas_error_log_buffer_max *
+			((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
+
+		/* First copy over sequence number */
+		memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
+
+		/* Second copy over error log data */
+		offset += sizeof(int);
+		memcpy(&rtas_log_buf[offset], buf, len);
+
+		if (rtas_log_size < LOG_NUMBER)
+			rtas_log_size += 1;
+		else
+			rtas_log_start += 1;
+
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		wake_up_interruptible(&rtas_log_wait);
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+}
+
+
+static int rtas_log_open(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+static int rtas_log_release(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+/* This will check if all events are logged, if they are then, we
+ * know that we can safely clear the events in NVRAM.
+ * Next we'll sit and wait for something else to log.
+ */
+static ssize_t rtas_log_read(struct file * file, char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	int error;
+	char *tmp;
+	unsigned long s;
+	unsigned long offset;
+
+	if (!buf || count < rtas_error_log_buffer_max)
+		return -EINVAL;
+
+	count = rtas_error_log_buffer_max;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+	/* if it's 0, then we know we got the last one (the one in NVRAM) */
+	if (rtas_log_size == 0 && !no_logging)
+		nvram_clear_error_log();
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+
+	error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
+	if (error)
+		goto out;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+	offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
+	memcpy(tmp, &rtas_log_buf[offset], count);
+
+	rtas_log_start += 1;
+	rtas_log_size -= 1;
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+	error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
+out:
+	kfree(tmp);
+	return error;
+}
+
+static unsigned int rtas_log_poll(struct file *file, poll_table * wait)
+{
+	poll_wait(file, &rtas_log_wait, wait);
+	if (rtas_log_size)
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+struct file_operations proc_rtas_log_operations = {
+	.read =		rtas_log_read,
+	.poll =		rtas_log_poll,
+	.open =		rtas_log_open,
+	.release =	rtas_log_release,
+};
+
+static int enable_surveillance(int timeout)
+{
+	int error;
+
+	error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
+
+	if (error == 0)
+		return 0;
+
+	if (error == -EINVAL) {
+		printk(KERN_INFO "rtasd: surveillance not supported\n");
+		return 0;
+	}
+
+	printk(KERN_ERR "rtasd: could not update surveillance\n");
+	return -1;
+}
+
+static int get_eventscan_parms(void)
+{
+	struct device_node *node;
+	int *ip;
+
+	node = of_find_node_by_path("/rtas");
+
+	ip = (int *)get_property(node, "rtas-event-scan-rate", NULL);
+	if (ip == NULL) {
+		printk(KERN_ERR "rtasd: no rtas-event-scan-rate\n");
+		of_node_put(node);
+		return -1;
+	}
+	rtas_event_scan_rate = *ip;
+	DEBUG("rtas-event-scan-rate %d\n", rtas_event_scan_rate);
+
+	/* Make room for the sequence number */
+	rtas_error_log_max = rtas_get_error_log_max();
+	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
+
+	of_node_put(node);
+
+	return 0;
+}
+
+static void do_event_scan(int event_scan)
+{
+	int error;
+	do {
+		memset(logdata, 0, rtas_error_log_max);
+		error = rtas_call(event_scan, 4, 1, NULL,
+				  RTAS_EVENT_SCAN_ALL_EVENTS, 0,
+				  __pa(logdata), rtas_error_log_max);
+		if (error == -1) {
+			printk(KERN_ERR "event-scan failed\n");
+			break;
+		}
+
+		if (error == 0)
+			pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+
+	} while(error == 0);
+}
+
+static void do_event_scan_all_cpus(long delay)
+{
+	int cpu;
+
+	lock_cpu_hotplug();
+	cpu = first_cpu(cpu_online_map);
+	for (;;) {
+		set_cpus_allowed(current, cpumask_of_cpu(cpu));
+		do_event_scan(rtas_token("event-scan"));
+		set_cpus_allowed(current, CPU_MASK_ALL);
+
+		/* Drop hotplug lock, and sleep for the specified delay */
+		unlock_cpu_hotplug();
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(delay);
+		lock_cpu_hotplug();
+
+		cpu = next_cpu(cpu, cpu_online_map);
+		if (cpu == NR_CPUS)
+			break;
+	}
+	unlock_cpu_hotplug();
+}
+
+static int rtasd(void *unused)
+{
+	unsigned int err_type;
+	int event_scan = rtas_token("event-scan");
+	int rc;
+
+	daemonize("rtasd");
+
+	if (event_scan == RTAS_UNKNOWN_SERVICE || get_eventscan_parms() == -1)
+		goto error;
+
+	rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
+	if (!rtas_log_buf) {
+		printk(KERN_ERR "rtasd: no memory\n");
+		goto error;
+	}
+
+	printk(KERN_ERR "RTAS daemon started\n");
+
+	DEBUG("will sleep for %d jiffies\n", (HZ*60/rtas_event_scan_rate) / 2);
+
+	/* See if we have any error stored in NVRAM */
+	memset(logdata, 0, rtas_error_log_max);
+
+	rc = nvram_read_error_log(logdata, rtas_error_log_max, &err_type);
+
+	/* We can use rtas_log_buf now */
+	no_logging = 0;
+
+	if (!rc) {
+		if (err_type != ERR_FLAG_ALREADY_LOGGED) {
+			pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
+		}
+	}
+
+	/* First pass. */
+	do_event_scan_all_cpus(HZ);
+
+	if (surveillance_timeout != -1) {
+		DEBUG("enabling surveillance\n");
+		enable_surveillance(surveillance_timeout);
+		DEBUG("surveillance enabled\n");
+	}
+
+	/* Delay should be at least one second since some
+	 * machines have problems if we call event-scan too
+	 * quickly. */
+	for (;;)
+		do_event_scan_all_cpus((HZ*60/rtas_event_scan_rate) / 2);
+
+error:
+	/* Should delete proc entries */
+	return -EINVAL;
+}
+
+static int __init rtas_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	/* No RTAS, only warn if we are on a pSeries box  */
+	if (rtas_token("event-scan") == RTAS_UNKNOWN_SERVICE) {
+		if (systemcfg->platform & PLATFORM_PSERIES)
+			printk(KERN_ERR "rtasd: no event-scan on system\n");
+		return 1;
+	}
+
+	entry = create_proc_entry("ppc64/rtas/error_log", S_IRUSR, NULL);
+	if (entry)
+		entry->proc_fops = &proc_rtas_log_operations;
+	else
+		printk(KERN_ERR "Failed to create error_log proc entry\n");
+
+	if (kernel_thread(rtasd, NULL, CLONE_FS) < 0)
+		printk(KERN_ERR "Failed to start RTAS daemon\n");
+
+	return 0;
+}
+
+static int __init surveillance_setup(char *str)
+{
+	int i;
+
+	if (get_option(&str,&i)) {
+		if (i >= 0 && i <= 255)
+			surveillance_timeout = i;
+	}
+
+	return 1;
+}
+
+static int __init rtasmsgs_setup(char *str)
+{
+	if (strcmp(str, "on") == 0)
+		full_rtas_msgs = 1;
+	else if (strcmp(str, "off") == 0)
+		full_rtas_msgs = 0;
+
+	return 1;
+}
+__initcall(rtas_init);
+__setup("surveillance=", surveillance_setup);
+__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/ppc64/kernel/rtc.c b/arch/ppc64/kernel/rtc.c
new file mode 100644
index 00000000000..3e70b91375f
--- /dev/null
+++ b/arch/ppc64/kernel/rtc.c
@@ -0,0 +1,440 @@
+/*
+ *	Real Time Clock interface for PPC64.
+ *
+ *	Based on rtc.c by Paul Gortmaker
+ *
+ *	This driver allows use of the real time clock
+ *	from user space. It exports the /dev/rtc
+ *	interface supporting various ioctl() and also the
+ *	/proc/driver/rtc pseudo-file for status information.
+ *
+ * 	Interface does not support RTC interrupts nor an alarm.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *      1.0	Mike Corrigan:    IBM iSeries rtc support
+ *      1.1	Dave Engebretsen: IBM pSeries rtc support
+ */
+
+#define RTC_VERSION		"1.1"
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/miscdevice.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/mc146818rtc.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/bcd.h>
+#include <linux/interrupt.h>
+
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/time.h>
+#include <asm/rtas.h>
+
+#include <asm/iSeries/LparData.h>
+#include <asm/iSeries/mf.h>
+#include <asm/machdep.h>
+#include <asm/iSeries/ItSpCommArea.h>
+
+extern int piranha_simulator;
+
+/*
+ *	We sponge a minor off of the misc major. No need slurping
+ *	up another valuable major dev number for this. If you add
+ *	an ioctl, make sure you don't conflict with SPARC's RTC
+ *	ioctls.
+ */
+
+static ssize_t rtc_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos);
+
+static int rtc_ioctl(struct inode *inode, struct file *file,
+		     unsigned int cmd, unsigned long arg);
+
+static int rtc_read_proc(char *page, char **start, off_t off,
+                         int count, int *eof, void *data);
+
+/*
+ *	If this driver ever becomes modularised, it will be really nice
+ *	to make the epoch retain its value across module reload...
+ */
+
+static unsigned long epoch = 1900;	/* year corresponding to 0x00	*/
+
+static const unsigned char days_in_mo[] = 
+{0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
+
+/*
+ *	Now all the various file operations that we export.
+ */
+
+static ssize_t rtc_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return -EIO;
+}
+
+static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		     unsigned long arg)
+{
+	struct rtc_time wtime; 
+
+	switch (cmd) {
+	case RTC_RD_TIME:	/* Read the time/date from RTC	*/
+	{
+		memset(&wtime, 0, sizeof(struct rtc_time));
+		ppc_md.get_rtc_time(&wtime);
+		break;
+	}
+	case RTC_SET_TIME:	/* Set the RTC */
+	{
+		struct rtc_time rtc_tm;
+		unsigned char mon, day, hrs, min, sec, leap_yr;
+		unsigned int yrs;
+
+		if (!capable(CAP_SYS_TIME))
+			return -EACCES;
+
+		if (copy_from_user(&rtc_tm, (struct rtc_time __user *)arg,
+				   sizeof(struct rtc_time)))
+			return -EFAULT;
+
+		yrs = rtc_tm.tm_year;
+		mon = rtc_tm.tm_mon + 1;   /* tm_mon starts at zero */
+		day = rtc_tm.tm_mday;
+		hrs = rtc_tm.tm_hour;
+		min = rtc_tm.tm_min;
+		sec = rtc_tm.tm_sec;
+
+		if (yrs < 70)
+			return -EINVAL;
+
+		leap_yr = ((!(yrs % 4) && (yrs % 100)) || !(yrs % 400));
+
+		if ((mon > 12) || (day == 0))
+			return -EINVAL;
+
+		if (day > (days_in_mo[mon] + ((mon == 2) && leap_yr)))
+			return -EINVAL;
+			
+		if ((hrs >= 24) || (min >= 60) || (sec >= 60))
+			return -EINVAL;
+
+		if ( yrs > 169 )
+			return -EINVAL;
+
+		ppc_md.set_rtc_time(&rtc_tm);
+		
+		return 0;
+	}
+	case RTC_EPOCH_READ:	/* Read the epoch.	*/
+	{
+		return put_user (epoch, (unsigned long __user *)arg);
+	}
+	case RTC_EPOCH_SET:	/* Set the epoch.	*/
+	{
+		/* 
+		 * There were no RTC clocks before 1900.
+		 */
+		if (arg < 1900)
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_TIME))
+			return -EACCES;
+
+		epoch = arg;
+		return 0;
+	}
+	default:
+		return -EINVAL;
+	}
+	return copy_to_user((void __user *)arg, &wtime, sizeof wtime) ? -EFAULT : 0;
+}
+
+static int rtc_open(struct inode *inode, struct file *file)
+{
+	nonseekable_open(inode, file);
+	return 0;
+}
+
+static int rtc_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+/*
+ *	The various file operations we support.
+ */
+static struct file_operations rtc_fops = {
+	.owner =	THIS_MODULE,
+	.llseek =	no_llseek,
+	.read =		rtc_read,
+	.ioctl =	rtc_ioctl,
+	.open =		rtc_open,
+	.release =	rtc_release,
+};
+
+static struct miscdevice rtc_dev = {
+	.minor =	RTC_MINOR,
+	.name =		"rtc",
+	.fops =		&rtc_fops
+};
+
+static int __init rtc_init(void)
+{
+	int retval;
+
+	retval = misc_register(&rtc_dev);
+	if(retval < 0)
+		return retval;
+
+#ifdef CONFIG_PROC_FS
+	if (create_proc_read_entry("driver/rtc", 0, NULL, rtc_read_proc, NULL)
+			== NULL) {
+		misc_deregister(&rtc_dev);
+		return -ENOMEM;
+	}
+#endif
+
+	printk(KERN_INFO "i/pSeries Real Time Clock Driver v" RTC_VERSION "\n");
+
+	return 0;
+}
+
+static void __exit rtc_exit (void)
+{
+	remove_proc_entry ("driver/rtc", NULL);
+	misc_deregister(&rtc_dev);
+}
+
+module_init(rtc_init);
+module_exit(rtc_exit);
+
+/*
+ *	Info exported via "/proc/driver/rtc".
+ */
+
+static int rtc_proc_output (char *buf)
+{
+	
+	char *p;
+	struct rtc_time tm;
+	
+	p = buf;
+
+	ppc_md.get_rtc_time(&tm);
+
+	/*
+	 * There is no way to tell if the luser has the RTC set for local
+	 * time or for Universal Standard Time (GMT). Probably local though.
+	 */
+	p += sprintf(p,
+		     "rtc_time\t: %02d:%02d:%02d\n"
+		     "rtc_date\t: %04d-%02d-%02d\n"
+	 	     "rtc_epoch\t: %04lu\n",
+		     tm.tm_hour, tm.tm_min, tm.tm_sec,
+		     tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, epoch);
+
+	p += sprintf(p,
+		     "DST_enable\t: no\n"
+		     "BCD\t\t: yes\n"
+		     "24hr\t\t: yes\n" );
+
+	return  p - buf;
+}
+
+static int rtc_read_proc(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+        int len = rtc_proc_output (page);
+        if (len <= off+count) *eof = 1;
+        *start = page + off;
+        len -= off;
+        if (len>count) len = count;
+        if (len<0) len = 0;
+        return len;
+}
+
+#ifdef CONFIG_PPC_ISERIES
+/*
+ * Get the RTC from the virtual service processor
+ * This requires flowing LpEvents to the primary partition
+ */
+void iSeries_get_rtc_time(struct rtc_time *rtc_tm)
+{
+	if (piranha_simulator)
+		return;
+
+	mf_get_rtc(rtc_tm);
+	rtc_tm->tm_mon--;
+}
+
+/*
+ * Set the RTC in the virtual service processor
+ * This requires flowing LpEvents to the primary partition
+ */
+int iSeries_set_rtc_time(struct rtc_time *tm)
+{
+	mf_set_rtc(tm);
+	return 0;
+}
+
+void iSeries_get_boot_time(struct rtc_time *tm)
+{
+	unsigned long time;
+	static unsigned long lastsec = 1;
+
+	u32 dataWord1 = *((u32 *)(&xSpCommArea.xBcdTimeAtIplStart));
+	u32 dataWord2 = *(((u32 *)&(xSpCommArea.xBcdTimeAtIplStart)) + 1);
+	int year = 1970;
+	int year1 = ( dataWord1 >> 24 ) & 0x000000FF;
+	int year2 = ( dataWord1 >> 16 ) & 0x000000FF;
+	int sec = ( dataWord1 >> 8 ) & 0x000000FF;
+	int min = dataWord1 & 0x000000FF;
+	int hour = ( dataWord2 >> 24 ) & 0x000000FF;
+	int day = ( dataWord2 >> 8 ) & 0x000000FF;
+	int mon = dataWord2 & 0x000000FF;
+
+	if ( piranha_simulator )
+		return;
+
+	BCD_TO_BIN(sec);
+	BCD_TO_BIN(min);
+	BCD_TO_BIN(hour);
+	BCD_TO_BIN(day);
+	BCD_TO_BIN(mon);
+	BCD_TO_BIN(year1);
+	BCD_TO_BIN(year2);
+	year = year1 * 100 + year2;
+
+	time = mktime(year, mon, day, hour, min, sec);
+	time += ( jiffies / HZ );
+
+	/* Now THIS is a nasty hack!
+	* It ensures that the first two calls get different answers.  
+	* That way the loop in init_time (time.c) will not think
+	* the clock is stuck.
+	*/
+	if ( lastsec ) {
+		time -= lastsec;
+		--lastsec;
+	}
+
+	to_tm(time, tm); 
+	tm->tm_year -= 1900;
+	tm->tm_mon  -= 1;
+}
+#endif
+
+#ifdef CONFIG_PPC_RTAS
+#define MAX_RTC_WAIT 5000	/* 5 sec */
+#define RTAS_CLOCK_BUSY (-2)
+void pSeries_get_boot_time(struct rtc_time *rtc_tm)
+{
+	int ret[8];
+	int error, wait_time;
+	unsigned long max_wait_tb;
+
+	max_wait_tb = __get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
+	do {
+		error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+		if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) {
+			wait_time = rtas_extended_busy_delay_time(error);
+			/* This is boot time so we spin. */
+			udelay(wait_time*1000);
+			error = RTAS_CLOCK_BUSY;
+		}
+	} while (error == RTAS_CLOCK_BUSY && (__get_tb() < max_wait_tb));
+
+	if (error != 0 && printk_ratelimit()) {
+		printk(KERN_WARNING "error: reading the clock failed (%d)\n",
+			error);
+		return;
+	}
+
+	rtc_tm->tm_sec = ret[5];
+	rtc_tm->tm_min = ret[4];
+	rtc_tm->tm_hour = ret[3];
+	rtc_tm->tm_mday = ret[2];
+	rtc_tm->tm_mon = ret[1] - 1;
+	rtc_tm->tm_year = ret[0] - 1900;
+}
+
+/* NOTE: get_rtc_time will get an error if executed in interrupt context
+ * and if a delay is needed to read the clock.  In this case we just
+ * silently return without updating rtc_tm.
+ */
+void pSeries_get_rtc_time(struct rtc_time *rtc_tm)
+{
+        int ret[8];
+	int error, wait_time;
+	unsigned long max_wait_tb;
+
+	max_wait_tb = __get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
+	do {
+		error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+		if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) {
+			if (in_interrupt() && printk_ratelimit()) {
+				printk(KERN_WARNING "error: reading clock would delay interrupt\n");
+				return;	/* delay not allowed */
+			}
+			wait_time = rtas_extended_busy_delay_time(error);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(wait_time);
+			error = RTAS_CLOCK_BUSY;
+		}
+	} while (error == RTAS_CLOCK_BUSY && (__get_tb() < max_wait_tb));
+
+        if (error != 0 && printk_ratelimit()) {
+                printk(KERN_WARNING "error: reading the clock failed (%d)\n",
+		       error);
+		return;
+        }
+
+	rtc_tm->tm_sec = ret[5];
+	rtc_tm->tm_min = ret[4];
+	rtc_tm->tm_hour = ret[3];
+	rtc_tm->tm_mday = ret[2];
+	rtc_tm->tm_mon = ret[1] - 1;
+	rtc_tm->tm_year = ret[0] - 1900;
+}
+
+int pSeries_set_rtc_time(struct rtc_time *tm)
+{
+	int error, wait_time;
+	unsigned long max_wait_tb;
+
+	max_wait_tb = __get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
+	do {
+	        error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL,
+				  tm->tm_year + 1900, tm->tm_mon + 1, 
+				  tm->tm_mday, tm->tm_hour, tm->tm_min, 
+				  tm->tm_sec, 0);
+		if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) {
+			if (in_interrupt())
+				return 1;	/* probably decrementer */
+			wait_time = rtas_extended_busy_delay_time(error);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(wait_time);
+			error = RTAS_CLOCK_BUSY;
+		}
+	} while (error == RTAS_CLOCK_BUSY && (__get_tb() < max_wait_tb));
+
+        if (error != 0 && printk_ratelimit())
+                printk(KERN_WARNING "error: setting the clock failed (%d)\n",
+		       error); 
+
+        return 0;
+}
+#endif
diff --git a/arch/ppc64/kernel/scanlog.c b/arch/ppc64/kernel/scanlog.c
new file mode 100644
index 00000000000..189b81a4198
--- /dev/null
+++ b/arch/ppc64/kernel/scanlog.c
@@ -0,0 +1,245 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * scan-log-data driver for PPC64  Todd Inglett <tinglett@vnet.ibm.com>
+ *
+ * When ppc64 hardware fails the service processor dumps internal state
+ * of the system.  After a reboot the operating system can access a dump
+ * of this data using this driver.  A dump exists if the device-tree
+ * /chosen/ibm,scan-log-data property exists.
+ *
+ * This driver exports /proc/ppc64/scan-log-dump which can be read.
+ * The driver supports only sequential reads.
+ *
+ * The driver looks at a write to the driver for the single word "reset".
+ * If given, the driver will reset the scanlog so the platform can free it.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "scanlog"
+
+/* Status returns from ibm,scan-log-dump */
+#define SCANLOG_COMPLETE 0
+#define SCANLOG_HWERROR -1
+#define SCANLOG_CONTINUE 1
+
+#define DEBUG(A...) do { if (scanlog_debug) printk(KERN_ERR "scanlog: " A); } while (0)
+
+static int scanlog_debug;
+static unsigned int ibm_scan_log_dump;			/* RTAS token */
+static struct proc_dir_entry *proc_ppc64_scan_log_dump;	/* The proc file */
+
+static ssize_t scanlog_read(struct file *file, char *buf,
+			    size_t count, loff_t *ppos)
+{
+        struct inode * inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *dp;
+	unsigned int *data;
+	int status;
+	unsigned long len, off;
+	unsigned int wait_time;
+
+        dp = PDE(inode);
+ 	data = (unsigned int *)dp->data;
+
+	if (!data) {
+		printk(KERN_ERR "scanlog: read failed no data\n");
+		return -EIO;
+	}
+
+	if (count > RTAS_DATA_BUF_SIZE)
+		count = RTAS_DATA_BUF_SIZE;
+
+	if (count < 1024) {
+		/* This is the min supported by this RTAS call.  Rather
+		 * than do all the buffering we insist the user code handle
+		 * larger reads.  As long as cp works... :)
+		 */
+		printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count);
+		return -EINVAL;
+	}
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	for (;;) {
+		wait_time = HZ/2;	/* default wait if no data */
+		spin_lock(&rtas_data_buf_lock);
+		memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE);
+		status = rtas_call(ibm_scan_log_dump, 2, 1, NULL,
+				   (u32) __pa(rtas_data_buf), (u32) count);
+		memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+		spin_unlock(&rtas_data_buf_lock);
+
+		DEBUG("status=%d, data[0]=%x, data[1]=%x, data[2]=%x\n",
+		      status, data[0], data[1], data[2]);
+		switch (status) {
+		    case SCANLOG_COMPLETE:
+			DEBUG("hit eof\n");
+			return 0;
+		    case SCANLOG_HWERROR:
+			DEBUG("hardware error reading scan log data\n");
+			return -EIO;
+		    case SCANLOG_CONTINUE:
+			/* We may or may not have data yet */
+			len = data[1];
+			off = data[2];
+			if (len > 0) {
+				if (copy_to_user(buf, ((char *)data)+off, len))
+					return -EFAULT;
+				return len;
+			}
+			/* Break to sleep default time */
+			break;
+		    default:
+			if (status > 9900 && status <= 9905) {
+				/* No data.  RTAS is hinting at a delay required
+				 * between 1-100000 milliseconds
+				 */
+				int ms = 1;
+				for (; status > 9900; status--)
+					ms = ms * 10;
+				/* Use microseconds for reasonable accuracy */
+				ms *= 1000;
+				wait_time = ms / (1000000/HZ); /* round down is fine */
+				/* Fall through to sleep */
+			} else {
+				printk(KERN_ERR "scanlog: unknown error from rtas: %d\n", status);
+				return -EIO;
+			}
+		}
+		/* Apparently no data yet.  Wait and try again. */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(wait_time);
+	}
+	/*NOTREACHED*/
+}
+
+static ssize_t scanlog_write(struct file * file, const char * buf,
+			     size_t count, loff_t *ppos)
+{
+	char stkbuf[20];
+	int status;
+
+	if (count > 19) count = 19;
+	if (copy_from_user (stkbuf, buf, count)) {
+		return -EFAULT;
+	}
+	stkbuf[count] = 0;
+
+	if (buf) {
+		if (strncmp(stkbuf, "reset", 5) == 0) {
+			DEBUG("reset scanlog\n");
+			status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0);
+			DEBUG("rtas returns %d\n", status);
+		} else if (strncmp(stkbuf, "debugon", 7) == 0) {
+			printk(KERN_ERR "scanlog: debug on\n");
+			scanlog_debug = 1;
+		} else if (strncmp(stkbuf, "debugoff", 8) == 0) {
+			printk(KERN_ERR "scanlog: debug off\n");
+			scanlog_debug = 0;
+		}
+	}
+	return count;
+}
+
+static int scanlog_open(struct inode * inode, struct file * file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+	unsigned int *data = (unsigned int *)dp->data;
+
+	if (!data) {
+		printk(KERN_ERR "scanlog: open failed no data\n");
+		return -EIO;
+	}
+
+	if (data[0] != 0) {
+		/* This imperfect test stops a second copy of the
+		 * data (or a reset while data is being copied)
+		 */
+		return -EBUSY;
+	}
+
+	data[0] = 0;	/* re-init so we restart the scan */
+
+	return 0;
+}
+
+static int scanlog_release(struct inode * inode, struct file * file)
+{
+	struct proc_dir_entry *dp = PDE(inode);
+	unsigned int *data = (unsigned int *)dp->data;
+
+	if (!data) {
+		printk(KERN_ERR "scanlog: release failed no data\n");
+		return -EIO;
+	}
+	data[0] = 0;
+
+	return 0;
+}
+
+struct file_operations scanlog_fops = {
+	.owner		= THIS_MODULE,
+	.read		= scanlog_read,
+	.write		= scanlog_write,
+	.open		= scanlog_open,
+	.release	= scanlog_release,
+};
+
+int __init scanlog_init(void)
+{
+	struct proc_dir_entry *ent;
+
+	ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
+	if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ERR "scan-log-dump not implemented on this system\n");
+		return -EIO;
+	}
+
+        ent = create_proc_entry("ppc64/rtas/scan-log-dump",  S_IRUSR, NULL);
+	if (ent) {
+		ent->proc_fops = &scanlog_fops;
+		/* Ideally we could allocate a buffer < 4G */
+		ent->data = kmalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+		if (!ent->data) {
+			printk(KERN_ERR "Failed to allocate a buffer\n");
+			remove_proc_entry("scan-log-dump", ent->parent);
+			return -ENOMEM;
+		}
+		((unsigned int *)ent->data)[0] = 0;
+	} else {
+		printk(KERN_ERR "Failed to create ppc64/scan-log-dump proc entry\n");
+		return -EIO;
+	}
+	proc_ppc64_scan_log_dump = ent;
+
+	return 0;
+}
+
+void __exit scanlog_cleanup(void)
+{
+	if (proc_ppc64_scan_log_dump) {
+		if (proc_ppc64_scan_log_dump->data)
+			kfree(proc_ppc64_scan_log_dump->data);
+		remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent);
+	}
+}
+
+module_init(scanlog_init);
+module_exit(scanlog_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/ppc64/kernel/semaphore.c b/arch/ppc64/kernel/semaphore.c
new file mode 100644
index 00000000000..a1c1db573e9
--- /dev/null
+++ b/arch/ppc64/kernel/semaphore.c
@@ -0,0 +1,136 @@
+/*
+ * 
+ *
+ * PowerPC-specific semaphore code.
+ *
+ * Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * April 2001 - Reworked by Paul Mackerras <paulus@samba.org>
+ * to eliminate the SMP races in the old version between the updates
+ * of `count' and `waking'.  Now we use negative `count' values to
+ * indicate that some process(es) are waiting for the semaphore.
+ */
+
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+#include <asm/errno.h>
+
+/*
+ * Atomically update sem->count.
+ * This does the equivalent of the following:
+ *
+ *	old_count = sem->count;
+ *	tmp = MAX(old_count, 0) + incr;
+ *	sem->count = tmp;
+ *	return old_count;
+ */
+static inline int __sem_update_count(struct semaphore *sem, int incr)
+{
+	int old_count, tmp;
+
+	__asm__ __volatile__("\n"
+"1:	lwarx	%0,0,%3\n"
+"	srawi	%1,%0,31\n"
+"	andc	%1,%0,%1\n"
+"	add	%1,%1,%4\n"
+"	stwcx.	%1,0,%3\n"
+"	bne	1b"
+	: "=&r" (old_count), "=&r" (tmp), "=m" (sem->count)
+	: "r" (&sem->count), "r" (incr), "m" (sem->count)
+	: "cc");
+
+	return old_count;
+}
+
+void __up(struct semaphore *sem)
+{
+	/*
+	 * Note that we incremented count in up() before we came here,
+	 * but that was ineffective since the result was <= 0, and
+	 * any negative value of count is equivalent to 0.
+	 * This ends up setting count to 1, unless count is now > 0
+	 * (i.e. because some other cpu has called up() in the meantime),
+	 * in which case we just increment count.
+	 */
+	__sem_update_count(sem, 1);
+	wake_up(&sem->wait);
+}
+EXPORT_SYMBOL(__up);
+
+/*
+ * Note that when we come in to __down or __down_interruptible,
+ * we have already decremented count, but that decrement was
+ * ineffective since the result was < 0, and any negative value
+ * of count is equivalent to 0.
+ * Thus it is only when we decrement count from some value > 0
+ * that we have actually got the semaphore.
+ */
+void __sched __down(struct semaphore *sem)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+	add_wait_queue_exclusive(&sem->wait, &wait);
+
+	/*
+	 * Try to get the semaphore.  If the count is > 0, then we've
+	 * got the semaphore; we decrement count and exit the loop.
+	 * If the count is 0 or negative, we set it to -1, indicating
+	 * that we are asleep, and then sleep.
+	 */
+	while (__sem_update_count(sem, -1) <= 0) {
+		schedule();
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+	}
+	remove_wait_queue(&sem->wait, &wait);
+	__set_task_state(tsk, TASK_RUNNING);
+
+	/*
+	 * If there are any more sleepers, wake one of them up so
+	 * that it can either get the semaphore, or set count to -1
+	 * indicating that there are still processes sleeping.
+	 */
+	wake_up(&sem->wait);
+}
+EXPORT_SYMBOL(__down);
+
+int __sched __down_interruptible(struct semaphore * sem)
+{
+	int retval = 0;
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	__set_task_state(tsk, TASK_INTERRUPTIBLE);
+	add_wait_queue_exclusive(&sem->wait, &wait);
+
+	while (__sem_update_count(sem, -1) <= 0) {
+		if (signal_pending(current)) {
+			/*
+			 * A signal is pending - give up trying.
+			 * Set sem->count to 0 if it is negative,
+			 * since we are no longer sleeping.
+			 */
+			__sem_update_count(sem, 0);
+			retval = -EINTR;
+			break;
+		}
+		schedule();
+		set_task_state(tsk, TASK_INTERRUPTIBLE);
+	}
+	remove_wait_queue(&sem->wait, &wait);
+	__set_task_state(tsk, TASK_RUNNING);
+
+	wake_up(&sem->wait);
+	return retval;
+}
+EXPORT_SYMBOL(__down_interruptible);
diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c
new file mode 100644
index 00000000000..21c57f539c2
--- /dev/null
+++ b/arch/ppc64/kernel/setup.c
@@ -0,0 +1,1392 @@
+/*
+ * 
+ * Common boot and setup code.
+ *
+ * Copyright (C) 2001 PPC64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/ide.h>
+#include <linux/seq_file.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/version.h>
+#include <linux/tty.h>
+#include <linux/root_dev.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/unistd.h>
+#include <linux/serial.h>
+#include <linux/serial_8250.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/bootinfo.h>
+#include <asm/smp.h>
+#include <asm/elf.h>
+#include <asm/machdep.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/paca.h>
+#include <asm/ppcdebug.h>
+#include <asm/time.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/btext.h>
+#include <asm/nvram.h>
+#include <asm/setup.h>
+#include <asm/system.h>
+#include <asm/rtas.h>
+#include <asm/iommu.h>
+#include <asm/serial.h>
+#include <asm/cache.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * Here are some early debugging facilities. You can enable one
+ * but your kernel will not boot on anything else if you do so
+ */
+
+/* This one is for use on LPAR machines that support an HVC console
+ * on vterm 0
+ */
+extern void udbg_init_debug_lpar(void);
+/* This one is for use on Apple G5 machines
+ */
+extern void udbg_init_pmac_realmode(void);
+/* That's RTAS panel debug */
+extern void call_rtas_display_status_delay(unsigned char c);
+/* Here's maple real mode debug */
+extern void udbg_init_maple_realmode(void);
+
+#define EARLY_DEBUG_INIT() do {} while(0)
+
+#if 0
+#define EARLY_DEBUG_INIT() udbg_init_debug_lpar()
+#define EARLY_DEBUG_INIT() udbg_init_maple_realmode()
+#define EARLY_DEBUG_INIT() udbg_init_pmac_realmode()
+#define EARLY_DEBUG_INIT()						\
+	do { ppc_md.udbg_putc = call_rtas_display_status_delay; } while(0)
+#endif
+
+/* extern void *stab; */
+extern unsigned long klimit;
+
+extern void mm_init_ppc64(void);
+extern int  idle_setup(void);
+extern void stab_initialize(unsigned long stab);
+extern void htab_initialize(void);
+extern void early_init_devtree(void *flat_dt);
+extern void unflatten_device_tree(void);
+
+extern void smp_release_cpus(void);
+
+unsigned long decr_overclock = 1;
+unsigned long decr_overclock_proc0 = 1;
+unsigned long decr_overclock_set = 0;
+unsigned long decr_overclock_proc0_set = 0;
+
+int have_of = 1;
+int boot_cpuid = 0;
+int boot_cpuid_phys = 0;
+dev_t boot_dev;
+u64 ppc64_pft_size;
+u64 ppc64_debug_switch;
+
+struct ppc64_caches ppc64_caches;
+EXPORT_SYMBOL_GPL(ppc64_caches);
+
+/*
+ * These are used in binfmt_elf.c to put aux entries on the stack
+ * for each elf executable being started.
+ */
+int dcache_bsize;
+int icache_bsize;
+int ucache_bsize;
+
+/* The main machine-dep calls structure
+ */
+struct machdep_calls ppc_md;
+EXPORT_SYMBOL(ppc_md);
+
+#ifdef CONFIG_MAGIC_SYSRQ
+unsigned long SYSRQ_KEY;
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+
+static int ppc64_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block ppc64_panic_block = {
+	.notifier_call = ppc64_panic_event,
+	.priority = INT_MIN /* may not return; must be done last */
+};
+
+/*
+ * Perhaps we can put the pmac screen_info[] here
+ * on pmac as well so we don't need the ifdef's.
+ * Until we get multiple-console support in here
+ * that is.  -- Cort
+ * Maybe tie it to serial consoles, since this is really what
+ * these processors use on existing boards.  -- Dan
+ */ 
+struct screen_info screen_info = {
+	.orig_x = 0,
+	.orig_y = 25,
+	.orig_video_cols = 80,
+	.orig_video_lines = 25,
+	.orig_video_isVGA = 1,
+	.orig_video_points = 16
+};
+
+/*
+ * Initialize the PPCDBG state.  Called before relocation has been enabled.
+ */
+void __init ppcdbg_initialize(void)
+{
+	ppc64_debug_switch = PPC_DEBUG_DEFAULT; /* | PPCDBG_BUSWALK | */
+	/* PPCDBG_PHBINIT | PPCDBG_MM | PPCDBG_MMINIT | PPCDBG_TCEINIT | PPCDBG_TCE */;
+}
+
+/*
+ * Early boot console based on udbg
+ */
+static struct console udbg_console = {
+	.name	= "udbg",
+	.write	= udbg_console_write,
+	.flags	= CON_PRINTBUFFER,
+	.index	= -1,
+};
+static int early_console_initialized;
+
+void __init disable_early_printk(void)
+{
+	if (!early_console_initialized)
+		return;
+	unregister_console(&udbg_console);
+	early_console_initialized = 0;
+}
+
+#if defined(CONFIG_PPC_MULTIPLATFORM) && defined(CONFIG_SMP)
+
+static int smt_enabled_cmdline;
+
+/* Look for ibm,smt-enabled OF option */
+static void check_smt_enabled(void)
+{
+	struct device_node *dn;
+	char *smt_option;
+
+	/* Allow the command line to overrule the OF option */
+	if (smt_enabled_cmdline)
+		return;
+
+	dn = of_find_node_by_path("/options");
+
+	if (dn) {
+		smt_option = (char *)get_property(dn, "ibm,smt-enabled", NULL);
+
+                if (smt_option) {
+			if (!strcmp(smt_option, "on"))
+				smt_enabled_at_boot = 1;
+			else if (!strcmp(smt_option, "off"))
+				smt_enabled_at_boot = 0;
+                }
+        }
+}
+
+/* Look for smt-enabled= cmdline option */
+static int __init early_smt_enabled(char *p)
+{
+	smt_enabled_cmdline = 1;
+
+	if (!p)
+		return 0;
+
+	if (!strcmp(p, "on") || !strcmp(p, "1"))
+		smt_enabled_at_boot = 1;
+	else if (!strcmp(p, "off") || !strcmp(p, "0"))
+		smt_enabled_at_boot = 0;
+
+	return 0;
+}
+early_param("smt-enabled", early_smt_enabled);
+
+/**
+ * setup_cpu_maps - initialize the following cpu maps:
+ *                  cpu_possible_map
+ *                  cpu_present_map
+ *                  cpu_sibling_map
+ *
+ * Having the possible map set up early allows us to restrict allocations
+ * of things like irqstacks to num_possible_cpus() rather than NR_CPUS.
+ *
+ * We do not initialize the online map here; cpus set their own bits in
+ * cpu_online_map as they come up.
+ *
+ * This function is valid only for Open Firmware systems.  finish_device_tree
+ * must be called before using this.
+ *
+ * While we're here, we may as well set the "physical" cpu ids in the paca.
+ */
+static void __init setup_cpu_maps(void)
+{
+	struct device_node *dn = NULL;
+	int cpu = 0;
+	int swap_cpuid = 0;
+
+	check_smt_enabled();
+
+	while ((dn = of_find_node_by_type(dn, "cpu")) && cpu < NR_CPUS) {
+		u32 *intserv;
+		int j, len = sizeof(u32), nthreads;
+
+		intserv = (u32 *)get_property(dn, "ibm,ppc-interrupt-server#s",
+					      &len);
+		if (!intserv)
+			intserv = (u32 *)get_property(dn, "reg", NULL);
+
+		nthreads = len / sizeof(u32);
+
+		for (j = 0; j < nthreads && cpu < NR_CPUS; j++) {
+			cpu_set(cpu, cpu_present_map);
+			set_hard_smp_processor_id(cpu, intserv[j]);
+
+			if (intserv[j] == boot_cpuid_phys)
+				swap_cpuid = cpu;
+			cpu_set(cpu, cpu_possible_map);
+			cpu++;
+		}
+	}
+
+	/* Swap CPU id 0 with boot_cpuid_phys, so we can always assume that
+	 * boot cpu is logical 0.
+	 */
+	if (boot_cpuid_phys != get_hard_smp_processor_id(0)) {
+		u32 tmp;
+		tmp = get_hard_smp_processor_id(0);
+		set_hard_smp_processor_id(0, boot_cpuid_phys);
+		set_hard_smp_processor_id(swap_cpuid, tmp);
+	}
+
+	/*
+	 * On pSeries LPAR, we need to know how many cpus
+	 * could possibly be added to this partition.
+	 */
+	if (systemcfg->platform == PLATFORM_PSERIES_LPAR &&
+				(dn = of_find_node_by_path("/rtas"))) {
+		int num_addr_cell, num_size_cell, maxcpus;
+		unsigned int *ireg;
+
+		num_addr_cell = prom_n_addr_cells(dn);
+		num_size_cell = prom_n_size_cells(dn);
+
+		ireg = (unsigned int *)
+			get_property(dn, "ibm,lrdr-capacity", NULL);
+
+		if (!ireg)
+			goto out;
+
+		maxcpus = ireg[num_addr_cell + num_size_cell];
+
+		/* Double maxcpus for processors which have SMT capability */
+		if (cpu_has_feature(CPU_FTR_SMT))
+			maxcpus *= 2;
+
+		if (maxcpus > NR_CPUS) {
+			printk(KERN_WARNING
+			       "Partition configured for %d cpus, "
+			       "operating system maximum is %d.\n",
+			       maxcpus, NR_CPUS);
+			maxcpus = NR_CPUS;
+		} else
+			printk(KERN_INFO "Partition configured for %d cpus.\n",
+			       maxcpus);
+
+		for (cpu = 0; cpu < maxcpus; cpu++)
+			cpu_set(cpu, cpu_possible_map);
+	out:
+		of_node_put(dn);
+	}
+
+	/*
+	 * Do the sibling map; assume only two threads per processor.
+	 */
+	for_each_cpu(cpu) {
+		cpu_set(cpu, cpu_sibling_map[cpu]);
+		if (cpu_has_feature(CPU_FTR_SMT))
+			cpu_set(cpu ^ 0x1, cpu_sibling_map[cpu]);
+	}
+
+	systemcfg->processorCount = num_present_cpus();
+}
+#endif /* defined(CONFIG_PPC_MULTIPLATFORM) && defined(CONFIG_SMP) */
+
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+
+extern struct machdep_calls pSeries_md;
+extern struct machdep_calls pmac_md;
+extern struct machdep_calls maple_md;
+
+/* Ultimately, stuff them in an elf section like initcalls... */
+static struct machdep_calls __initdata *machines[] = {
+#ifdef CONFIG_PPC_PSERIES
+	&pSeries_md,
+#endif /* CONFIG_PPC_PSERIES */
+#ifdef CONFIG_PPC_PMAC
+	&pmac_md,
+#endif /* CONFIG_PPC_PMAC */
+#ifdef CONFIG_PPC_MAPLE
+	&maple_md,
+#endif /* CONFIG_PPC_MAPLE */
+	NULL
+};
+
+/*
+ * Early initialization entry point. This is called by head.S
+ * with MMU translation disabled. We rely on the "feature" of
+ * the CPU that ignores the top 2 bits of the address in real
+ * mode so we can access kernel globals normally provided we
+ * only toy with things in the RMO region. From here, we do
+ * some early parsing of the device-tree to setup out LMB
+ * data structures, and allocate & initialize the hash table
+ * and segment tables so we can start running with translation
+ * enabled.
+ *
+ * It is this function which will call the probe() callback of
+ * the various platform types and copy the matching one to the
+ * global ppc_md structure. Your platform can eventually do
+ * some very early initializations from the probe() routine, but
+ * this is not recommended, be very careful as, for example, the
+ * device-tree is not accessible via normal means at this point.
+ */
+
+void __init early_setup(unsigned long dt_ptr)
+{
+	struct paca_struct *lpaca = get_paca();
+	static struct machdep_calls **mach;
+
+	/*
+	 * Enable early debugging if any specified (see top of
+	 * this file)
+	 */
+	EARLY_DEBUG_INIT();
+
+	DBG(" -> early_setup()\n");
+
+	/*
+	 * Fill the default DBG level (do we want to keep
+	 * that old mecanism around forever ?)
+	 */
+	ppcdbg_initialize();
+
+	/*
+	 * Do early initializations using the flattened device
+	 * tree, like retreiving the physical memory map or
+	 * calculating/retreiving the hash table size
+	 */
+	early_init_devtree(__va(dt_ptr));
+
+	/*
+	 * Iterate all ppc_md structures until we find the proper
+	 * one for the current machine type
+	 */
+	DBG("Probing machine type for platform %x...\n",
+	    systemcfg->platform);
+
+	for (mach = machines; *mach; mach++) {
+		if ((*mach)->probe(systemcfg->platform))
+			break;
+	}
+	/* What can we do if we didn't find ? */
+	if (*mach == NULL) {
+		DBG("No suitable machine found !\n");
+		for (;;);
+	}
+	ppc_md = **mach;
+
+	/* our udbg callbacks got overriden by the above, let's put them
+	 * back in. Ultimately, I want those things to be split from the
+	 * main ppc_md
+	 */
+	EARLY_DEBUG_INIT();
+
+	DBG("Found, Initializing memory management...\n");
+
+	/*
+	 * Initialize stab / SLB management
+	 */
+	stab_initialize(lpaca->stab_real);
+
+	/*
+	 * Initialize the MMU Hash table and create the linear mapping
+	 * of memory
+	 */
+	htab_initialize();
+
+	DBG(" <- early_setup()\n");
+}
+
+
+/*
+ * Initialize some remaining members of the ppc64_caches and systemcfg structures
+ * (at least until we get rid of them completely). This is mostly some
+ * cache informations about the CPU that will be used by cache flush
+ * routines and/or provided to userland
+ */
+static void __init initialize_cache_info(void)
+{
+	struct device_node *np;
+	unsigned long num_cpus = 0;
+
+	DBG(" -> initialize_cache_info()\n");
+
+	for (np = NULL; (np = of_find_node_by_type(np, "cpu"));) {
+		num_cpus += 1;
+
+		/* We're assuming *all* of the CPUs have the same
+		 * d-cache and i-cache sizes... -Peter
+		 */
+
+		if ( num_cpus == 1 ) {
+			u32 *sizep, *lsizep;
+			u32 size, lsize;
+			const char *dc, *ic;
+
+			/* Then read cache informations */
+			if (systemcfg->platform == PLATFORM_POWERMAC) {
+				dc = "d-cache-block-size";
+				ic = "i-cache-block-size";
+			} else {
+				dc = "d-cache-line-size";
+				ic = "i-cache-line-size";
+			}
+
+			size = 0;
+			lsize = cur_cpu_spec->dcache_bsize;
+			sizep = (u32 *)get_property(np, "d-cache-size", NULL);
+			if (sizep != NULL)
+				size = *sizep;
+			lsizep = (u32 *) get_property(np, dc, NULL);
+			if (lsizep != NULL)
+				lsize = *lsizep;
+			if (sizep == 0 || lsizep == 0)
+				DBG("Argh, can't find dcache properties ! "
+				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
+
+			systemcfg->dcache_size = ppc64_caches.dsize = size;
+			systemcfg->dcache_line_size =
+				ppc64_caches.dline_size = lsize;
+			ppc64_caches.log_dline_size = __ilog2(lsize);
+			ppc64_caches.dlines_per_page = PAGE_SIZE / lsize;
+
+			size = 0;
+			lsize = cur_cpu_spec->icache_bsize;
+			sizep = (u32 *)get_property(np, "i-cache-size", NULL);
+			if (sizep != NULL)
+				size = *sizep;
+			lsizep = (u32 *)get_property(np, ic, NULL);
+			if (lsizep != NULL)
+				lsize = *lsizep;
+			if (sizep == 0 || lsizep == 0)
+				DBG("Argh, can't find icache properties ! "
+				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
+
+			systemcfg->icache_size = ppc64_caches.isize = size;
+			systemcfg->icache_line_size =
+				ppc64_caches.iline_size = lsize;
+			ppc64_caches.log_iline_size = __ilog2(lsize);
+			ppc64_caches.ilines_per_page = PAGE_SIZE / lsize;
+		}
+	}
+
+	/* Add an eye catcher and the systemcfg layout version number */
+	strcpy(systemcfg->eye_catcher, "SYSTEMCFG:PPC64");
+	systemcfg->version.major = SYSTEMCFG_MAJOR;
+	systemcfg->version.minor = SYSTEMCFG_MINOR;
+	systemcfg->processor = mfspr(SPRN_PVR);
+
+	DBG(" <- initialize_cache_info()\n");
+}
+
+static void __init check_for_initrd(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	u64 *prop;
+
+	DBG(" -> check_for_initrd()\n");
+
+	prop = (u64 *)get_property(of_chosen, "linux,initrd-start", NULL);
+	if (prop != NULL) {
+		initrd_start = (unsigned long)__va(*prop);
+		prop = (u64 *)get_property(of_chosen, "linux,initrd-end", NULL);
+		if (prop != NULL) {
+			initrd_end = (unsigned long)__va(*prop);
+			initrd_below_start_ok = 1;
+		} else
+			initrd_start = 0;
+	}
+
+	/* If we were passed an initrd, set the ROOT_DEV properly if the values
+	 * look sensible. If not, clear initrd reference.
+	 */
+	if (initrd_start >= KERNELBASE && initrd_end >= KERNELBASE &&
+	    initrd_end > initrd_start)
+		ROOT_DEV = Root_RAM0;
+	else
+		initrd_start = initrd_end = 0;
+
+	if (initrd_start)
+		printk("Found initrd at 0x%lx:0x%lx\n", initrd_start, initrd_end);
+
+	DBG(" <- check_for_initrd()\n");
+#endif /* CONFIG_BLK_DEV_INITRD */
+}
+
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+/*
+ * Do some initial setup of the system.  The parameters are those which 
+ * were passed in from the bootloader.
+ */
+void __init setup_system(void)
+{
+	DBG(" -> setup_system()\n");
+
+#ifdef CONFIG_PPC_ISERIES
+	/* pSeries systems are identified in prom.c via OF. */
+	if (itLpNaca.xLparInstalled == 1)
+		systemcfg->platform = PLATFORM_ISERIES_LPAR;
+
+	ppc_md.init_early();
+#else /* CONFIG_PPC_ISERIES */
+
+	/*
+	 * Unflatten the device-tree passed by prom_init or kexec
+	 */
+	unflatten_device_tree();
+
+	/*
+	 * Fill the ppc64_caches & systemcfg structures with informations
+	 * retreived from the device-tree. Need to be called before
+	 * finish_device_tree() since the later requires some of the
+	 * informations filled up here to properly parse the interrupt
+	 * tree.
+	 * It also sets up the cache line sizes which allows to call
+	 * routines like flush_icache_range (used by the hash init
+	 * later on).
+	 */
+	initialize_cache_info();
+
+#ifdef CONFIG_PPC_RTAS
+	/*
+	 * Initialize RTAS if available
+	 */
+	rtas_initialize();
+#endif /* CONFIG_PPC_RTAS */
+
+	/*
+	 * Check if we have an initrd provided via the device-tree
+	 */
+	check_for_initrd();
+
+	/*
+	 * Do some platform specific early initializations, that includes
+	 * setting up the hash table pointers. It also sets up some interrupt-mapping
+	 * related options that will be used by finish_device_tree()
+	 */
+	ppc_md.init_early();
+
+	/*
+	 * "Finish" the device-tree, that is do the actual parsing of
+	 * some of the properties like the interrupt map
+	 */
+	finish_device_tree();
+
+	/*
+	 * Initialize xmon
+	 */
+#ifdef CONFIG_XMON_DEFAULT
+	xmon_init();
+#endif
+	/*
+	 * Register early console
+	 */
+	early_console_initialized = 1;
+	register_console(&udbg_console);
+
+	/* Save unparsed command line copy for /proc/cmdline */
+	strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE);
+
+	parse_early_param();
+#endif /* !CONFIG_PPC_ISERIES */
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_PPC_ISERIES)
+	/*
+	 * iSeries has already initialized the cpu maps at this point.
+	 */
+	setup_cpu_maps();
+
+	/* Release secondary cpus out of their spinloops at 0x60 now that
+	 * we can map physical -> logical CPU ids
+	 */
+	smp_release_cpus();
+#endif /* defined(CONFIG_SMP) && !defined(CONFIG_PPC_ISERIES) */
+
+	printk("Starting Linux PPC64 %s\n", UTS_RELEASE);
+
+	printk("-----------------------------------------------------\n");
+	printk("ppc64_pft_size                = 0x%lx\n", ppc64_pft_size);
+	printk("ppc64_debug_switch            = 0x%lx\n", ppc64_debug_switch);
+	printk("ppc64_interrupt_controller    = 0x%ld\n", ppc64_interrupt_controller);
+	printk("systemcfg                     = 0x%p\n", systemcfg);
+	printk("systemcfg->platform           = 0x%x\n", systemcfg->platform);
+	printk("systemcfg->processorCount     = 0x%lx\n", systemcfg->processorCount);
+	printk("systemcfg->physicalMemorySize = 0x%lx\n", systemcfg->physicalMemorySize);
+	printk("ppc64_caches.dcache_line_size = 0x%x\n",
+			ppc64_caches.dline_size);
+	printk("ppc64_caches.icache_line_size = 0x%x\n",
+			ppc64_caches.iline_size);
+	printk("htab_address                  = 0x%p\n", htab_address);
+	printk("htab_hash_mask                = 0x%lx\n", htab_hash_mask);
+	printk("-----------------------------------------------------\n");
+
+	mm_init_ppc64();
+
+	DBG(" <- setup_system()\n");
+}
+
+
+void machine_restart(char *cmd)
+{
+	if (ppc_md.nvram_sync)
+		ppc_md.nvram_sync();
+	ppc_md.restart(cmd);
+}
+
+EXPORT_SYMBOL(machine_restart);
+  
+void machine_power_off(void)
+{
+	if (ppc_md.nvram_sync)
+		ppc_md.nvram_sync();
+	ppc_md.power_off();
+}
+
+EXPORT_SYMBOL(machine_power_off);
+  
+void machine_halt(void)
+{
+	if (ppc_md.nvram_sync)
+		ppc_md.nvram_sync();
+	ppc_md.halt();
+}
+
+EXPORT_SYMBOL(machine_halt);
+
+unsigned long ppc_proc_freq;
+unsigned long ppc_tb_freq;
+
+static int ppc64_panic_event(struct notifier_block *this,
+                             unsigned long event, void *ptr)
+{
+	ppc_md.panic((char *)ptr);  /* May not return */
+	return NOTIFY_DONE;
+}
+
+
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(unsigned int, pvr);
+#endif
+
+static int show_cpuinfo(struct seq_file *m, void *v)
+{
+	unsigned long cpu_id = (unsigned long)v - 1;
+	unsigned int pvr;
+	unsigned short maj;
+	unsigned short min;
+
+	if (cpu_id == NR_CPUS) {
+		seq_printf(m, "timebase\t: %lu\n", ppc_tb_freq);
+
+		if (ppc_md.get_cpuinfo != NULL)
+			ppc_md.get_cpuinfo(m);
+
+		return 0;
+	}
+
+	/* We only show online cpus: disable preempt (overzealous, I
+	 * knew) to prevent cpu going down. */
+	preempt_disable();
+	if (!cpu_online(cpu_id)) {
+		preempt_enable();
+		return 0;
+	}
+
+#ifdef CONFIG_SMP
+	pvr = per_cpu(pvr, cpu_id);
+#else
+	pvr = mfspr(SPRN_PVR);
+#endif
+	maj = (pvr >> 8) & 0xFF;
+	min = pvr & 0xFF;
+
+	seq_printf(m, "processor\t: %lu\n", cpu_id);
+	seq_printf(m, "cpu\t\t: ");
+
+	if (cur_cpu_spec->pvr_mask)
+		seq_printf(m, "%s", cur_cpu_spec->cpu_name);
+	else
+		seq_printf(m, "unknown (%08x)", pvr);
+
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		seq_printf(m, ", altivec supported");
+#endif /* CONFIG_ALTIVEC */
+
+	seq_printf(m, "\n");
+
+	/*
+	 * Assume here that all clock rates are the same in a
+	 * smp system.  -- Cort
+	 */
+	seq_printf(m, "clock\t\t: %lu.%06luMHz\n", ppc_proc_freq / 1000000,
+		   ppc_proc_freq % 1000000);
+
+	seq_printf(m, "revision\t: %hd.%hd\n\n", maj, min);
+
+	preempt_enable();
+	return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	return *pos <= NR_CPUS ? (void *)((*pos)+1) : NULL;
+}
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return c_start(m, pos);
+}
+static void c_stop(struct seq_file *m, void *v)
+{
+}
+struct seq_operations cpuinfo_op = {
+	.start =c_start,
+	.next =	c_next,
+	.stop =	c_stop,
+	.show =	show_cpuinfo,
+};
+
+/*
+ * These three variables are used to save values passed to us by prom_init()
+ * via the device tree. The TCE variables are needed because with a memory_limit
+ * in force we may need to explicitly map the TCE are at the top of RAM.
+ */
+unsigned long memory_limit;
+unsigned long tce_alloc_start;
+unsigned long tce_alloc_end;
+
+#ifdef CONFIG_PPC_ISERIES
+/*
+ * On iSeries we just parse the mem=X option from the command line.
+ * On pSeries it's a bit more complicated, see prom_init_mem()
+ */
+static int __init early_parsemem(char *p)
+{
+	if (!p)
+		return 0;
+
+	memory_limit = ALIGN(memparse(p, &p), PAGE_SIZE);
+
+	return 0;
+}
+early_param("mem", early_parsemem);
+#endif /* CONFIG_PPC_ISERIES */
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+static int __init set_preferred_console(void)
+{
+	struct device_node *prom_stdout = NULL;
+	char *name;
+	u32 *spd;
+	int offset = 0;
+
+	DBG(" -> set_preferred_console()\n");
+
+	/* The user has requested a console so this is already set up. */
+	if (strstr(saved_command_line, "console=")) {
+		DBG(" console was specified !\n");
+		return -EBUSY;
+	}
+
+	if (!of_chosen) {
+		DBG(" of_chosen is NULL !\n");
+		return -ENODEV;
+	}
+	/* We are getting a weird phandle from OF ... */
+	/* ... So use the full path instead */
+	name = (char *)get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name == NULL) {
+		DBG(" no linux,stdout-path !\n");
+		return -ENODEV;
+	}
+	prom_stdout = of_find_node_by_path(name);
+	if (!prom_stdout) {
+		DBG(" can't find stdout package %s !\n", name);
+		return -ENODEV;
+	}	
+	DBG("stdout is %s\n", prom_stdout->full_name);
+
+	name = (char *)get_property(prom_stdout, "name", NULL);
+	if (!name) {
+		DBG(" stdout package has no name !\n");
+		goto not_found;
+	}
+	spd = (u32 *)get_property(prom_stdout, "current-speed", NULL);
+
+	if (0)
+		;
+#ifdef CONFIG_SERIAL_8250_CONSOLE
+	else if (strcmp(name, "serial") == 0) {
+		int i;
+		u32 *reg = (u32 *)get_property(prom_stdout, "reg", &i);
+		if (i > 8) {
+			switch (reg[1]) {
+				case 0x3f8:
+					offset = 0;
+					break;
+				case 0x2f8:
+					offset = 1;
+					break;
+				case 0x898:
+					offset = 2;
+					break;
+				case 0x890:
+					offset = 3;
+					break;
+				default:
+					/* We dont recognise the serial port */
+					goto not_found;
+			}
+		}
+	}
+#endif /* CONFIG_SERIAL_8250_CONSOLE */
+#ifdef CONFIG_PPC_PSERIES
+	else if (strcmp(name, "vty") == 0) {
+ 		u32 *reg = (u32 *)get_property(prom_stdout, "reg", NULL);
+ 		char *compat = (char *)get_property(prom_stdout, "compatible", NULL);
+
+ 		if (reg && compat && (strcmp(compat, "hvterm-protocol") == 0)) {
+ 			/* Host Virtual Serial Interface */
+ 			int offset;
+ 			switch (reg[0]) {
+ 				case 0x30000000:
+ 					offset = 0;
+ 					break;
+ 				case 0x30000001:
+ 					offset = 1;
+ 					break;
+ 				default:
+					goto not_found;
+ 			}
+			of_node_put(prom_stdout);
+			DBG("Found hvsi console at offset %d\n", offset);
+ 			return add_preferred_console("hvsi", offset, NULL);
+ 		} else {
+ 			/* pSeries LPAR virtual console */
+			of_node_put(prom_stdout);
+			DBG("Found hvc console\n");
+ 			return add_preferred_console("hvc", 0, NULL);
+ 		}
+	}
+#endif /* CONFIG_PPC_PSERIES */
+#ifdef CONFIG_SERIAL_PMACZILOG_CONSOLE
+	else if (strcmp(name, "ch-a") == 0)
+		offset = 0;
+	else if (strcmp(name, "ch-b") == 0)
+		offset = 1;
+#endif /* CONFIG_SERIAL_PMACZILOG_CONSOLE */
+	else
+		goto not_found;
+	of_node_put(prom_stdout);
+
+	DBG("Found serial console at ttyS%d\n", offset);
+
+	if (spd) {
+		static char __initdata opt[16];
+		sprintf(opt, "%d", *spd);
+		return add_preferred_console("ttyS", offset, opt);
+	} else
+		return add_preferred_console("ttyS", offset, NULL);
+
+ not_found:
+	DBG("No preferred console found !\n");
+	of_node_put(prom_stdout);
+	return -ENODEV;
+}
+console_initcall(set_preferred_console);
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+#ifdef CONFIG_IRQSTACKS
+static void __init irqstack_early_init(void)
+{
+	unsigned int i;
+
+	/*
+	 * interrupt stacks must be under 256MB, we cannot afford to take
+	 * SLB misses on them.
+	 */
+	for_each_cpu(i) {
+		softirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE,
+					THREAD_SIZE, 0x10000000));
+		hardirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE,
+					THREAD_SIZE, 0x10000000));
+	}
+}
+#else
+#define irqstack_early_init()
+#endif
+
+/*
+ * Stack space used when we detect a bad kernel stack pointer, and
+ * early in SMP boots before relocation is enabled.
+ */
+static void __init emergency_stack_init(void)
+{
+	unsigned long limit;
+	unsigned int i;
+
+	/*
+	 * Emergency stacks must be under 256MB, we cannot afford to take
+	 * SLB misses on them. The ABI also requires them to be 128-byte
+	 * aligned.
+	 *
+	 * Since we use these as temporary stacks during secondary CPU
+	 * bringup, we need to get at them in real mode. This means they
+	 * must also be within the RMO region.
+	 */
+	limit = min(0x10000000UL, lmb.rmo_size);
+
+	for_each_cpu(i)
+		paca[i].emergency_sp = __va(lmb_alloc_base(PAGE_SIZE, 128,
+						limit)) + PAGE_SIZE;
+}
+
+/*
+ * Called from setup_arch to initialize the bitmap of available
+ * syscalls in the systemcfg page
+ */
+void __init setup_syscall_map(void)
+{
+	unsigned int i, count64 = 0, count32 = 0;
+	extern unsigned long *sys_call_table;
+	extern unsigned long *sys_call_table32;
+	extern unsigned long sys_ni_syscall;
+
+
+	for (i = 0; i < __NR_syscalls; i++) {
+		if (sys_call_table[i] == sys_ni_syscall)
+			continue;
+		count64++;
+		systemcfg->syscall_map_64[i >> 5] |= 0x80000000UL >> (i & 0x1f);
+	}
+	for (i = 0; i < __NR_syscalls; i++) {
+		if (sys_call_table32[i] == sys_ni_syscall)
+			continue;
+		count32++;
+		systemcfg->syscall_map_32[i >> 5] |= 0x80000000UL >> (i & 0x1f);
+	}
+	printk(KERN_INFO "Syscall map setup, %d 32 bits and %d 64 bits syscalls\n",
+	       count32, count64);
+}
+
+/*
+ * Called into from start_kernel, after lock_kernel has been called.
+ * Initializes bootmem, which is unsed to manage page allocation until
+ * mem_init is called.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+	extern void do_init_bootmem(void);
+
+	ppc64_boot_msg(0x12, "Setup Arch");
+
+	*cmdline_p = cmd_line;
+
+	/*
+	 * Set cache line size based on type of cpu as a default.
+	 * Systems with OF can look in the properties on the cpu node(s)
+	 * for a possibly more accurate value.
+	 */
+	dcache_bsize = ppc64_caches.dline_size;
+	icache_bsize = ppc64_caches.iline_size;
+
+	/* reboot on panic */
+	panic_timeout = 180;
+
+	if (ppc_md.panic)
+		notifier_chain_register(&panic_notifier_list, &ppc64_panic_block);
+
+	init_mm.start_code = PAGE_OFFSET;
+	init_mm.end_code = (unsigned long) _etext;
+	init_mm.end_data = (unsigned long) _edata;
+	init_mm.brk = klimit;
+	
+	irqstack_early_init();
+	emergency_stack_init();
+
+	/* set up the bootmem stuff with available memory */
+	do_init_bootmem();
+
+	/* initialize the syscall map in systemcfg */
+	setup_syscall_map();
+
+	ppc_md.setup_arch();
+
+	/* Select the correct idle loop for the platform. */
+	idle_setup();
+
+	paging_init();
+	ppc64_boot_msg(0x15, "Setup Done");
+}
+
+
+/* ToDo: do something useful if ppc_md is not yet setup. */
+#define PPC64_LINUX_FUNCTION 0x0f000000
+#define PPC64_IPL_MESSAGE 0xc0000000
+#define PPC64_TERM_MESSAGE 0xb0000000
+#define PPC64_ATTN_MESSAGE 0xa0000000
+#define PPC64_DUMP_MESSAGE 0xd0000000
+
+static void ppc64_do_msg(unsigned int src, const char *msg)
+{
+	if (ppc_md.progress) {
+		char buf[32];
+
+		sprintf(buf, "%08x        \n", src);
+		ppc_md.progress(buf, 0);
+		sprintf(buf, "%-16s", msg);
+		ppc_md.progress(buf, 0);
+	}
+}
+
+/* Print a boot progress message. */
+void ppc64_boot_msg(unsigned int src, const char *msg)
+{
+	ppc64_do_msg(PPC64_LINUX_FUNCTION|PPC64_IPL_MESSAGE|src, msg);
+	printk("[boot]%04x %s\n", src, msg);
+}
+
+/* Print a termination message (print only -- does not stop the kernel) */
+void ppc64_terminate_msg(unsigned int src, const char *msg)
+{
+	ppc64_do_msg(PPC64_LINUX_FUNCTION|PPC64_TERM_MESSAGE|src, msg);
+	printk("[terminate]%04x %s\n", src, msg);
+}
+
+/* Print something that needs attention (device error, etc) */
+void ppc64_attention_msg(unsigned int src, const char *msg)
+{
+	ppc64_do_msg(PPC64_LINUX_FUNCTION|PPC64_ATTN_MESSAGE|src, msg);
+	printk("[attention]%04x %s\n", src, msg);
+}
+
+/* Print a dump progress message. */
+void ppc64_dump_msg(unsigned int src, const char *msg)
+{
+	ppc64_do_msg(PPC64_LINUX_FUNCTION|PPC64_DUMP_MESSAGE|src, msg);
+	printk("[dump]%04x %s\n", src, msg);
+}
+
+int set_spread_lpevents( char * str )
+{
+	/* The parameter is the number of processors to share in processing lp events */
+	unsigned long i;
+	unsigned long val = simple_strtoul( str, NULL, 0 );
+	if ( ( val > 0 ) && ( val <= NR_CPUS ) ) {
+		for ( i=1; i<val; ++i )
+			paca[i].lpqueue_ptr = paca[0].lpqueue_ptr;
+		printk("lpevent processing spread over %ld processors\n", val);
+	}
+	else
+		printk("invalid spreaqd_lpevents %ld\n", val);
+	return 1;
+}	
+
+/* This should only be called on processor 0 during calibrate decr */
+void setup_default_decr(void)
+{
+	struct paca_struct *lpaca = get_paca();
+
+	if ( decr_overclock_set && !decr_overclock_proc0_set )
+		decr_overclock_proc0 = decr_overclock;
+
+	lpaca->default_decr = tb_ticks_per_jiffy / decr_overclock_proc0;	
+	lpaca->next_jiffy_update_tb = get_tb() + tb_ticks_per_jiffy;
+}
+
+int set_decr_overclock_proc0( char * str )
+{
+	unsigned long val = simple_strtoul( str, NULL, 0 );
+	if ( ( val >= 1 ) && ( val <= 48 ) ) {
+		decr_overclock_proc0_set = 1;
+		decr_overclock_proc0 = val;
+		printk("proc 0 decrementer overclock factor of %ld\n", val);
+	}
+	else
+		printk("invalid proc 0 decrementer overclock factor of %ld\n", val);
+	return 1;
+}
+
+int set_decr_overclock( char * str )
+{
+	unsigned long val = simple_strtoul( str, NULL, 0 );
+	if ( ( val >= 1 ) && ( val <= 48 ) ) {
+		decr_overclock_set = 1;
+		decr_overclock = val;
+		printk("decrementer overclock factor of %ld\n", val);
+	}
+	else
+		printk("invalid decrementer overclock factor of %ld\n", val);
+	return 1;
+
+}
+
+__setup("spread_lpevents=", set_spread_lpevents );
+__setup("decr_overclock_proc0=", set_decr_overclock_proc0 );
+__setup("decr_overclock=", set_decr_overclock );
+
+#ifndef CONFIG_PPC_ISERIES
+/*
+ * This function can be used by platforms to "find" legacy serial ports.
+ * It works for "serial" nodes under an "isa" node, and will try to
+ * respect the "ibm,aix-loc" property if any. It works with up to 8
+ * ports.
+ */
+
+#define MAX_LEGACY_SERIAL_PORTS	8
+static struct plat_serial8250_port serial_ports[MAX_LEGACY_SERIAL_PORTS+1];
+static unsigned int old_serial_count;
+
+void __init generic_find_legacy_serial_ports(u64 *physport,
+		unsigned int *default_speed)
+{
+	struct device_node *np;
+	u32 *sizeprop;
+
+	struct isa_reg_property {
+		u32 space;
+		u32 address;
+		u32 size;
+	};
+	struct pci_reg_property {
+		struct pci_address addr;
+		u32 size_hi;
+		u32 size_lo;
+	};                                                                        
+
+	DBG(" -> generic_find_legacy_serial_port()\n");
+
+	*physport = 0;
+	if (default_speed)
+		*default_speed = 0;
+
+	np = of_find_node_by_path("/");
+	if (!np)
+		return;
+
+	/* First fill our array */
+	for (np = NULL; (np = of_find_node_by_type(np, "serial"));) {
+		struct device_node *isa, *pci;
+		struct isa_reg_property *reg;
+		unsigned long phys_size, addr_size, io_base;
+		u32 *rangesp;
+		u32 *interrupts, *clk, *spd;
+		char *typep;
+		int index, rlen, rentsize;
+
+		/* Ok, first check if it's under an "isa" parent */
+		isa = of_get_parent(np);
+		if (!isa || strcmp(isa->name, "isa")) {
+			DBG("%s: no isa parent found\n", np->full_name);
+			continue;
+		}
+		
+		/* Now look for an "ibm,aix-loc" property that gives us ordering
+		 * if any...
+		 */
+	 	typep = (char *)get_property(np, "ibm,aix-loc", NULL);
+
+		/* Get the ISA port number */
+		reg = (struct isa_reg_property *)get_property(np, "reg", NULL);	
+		if (reg == NULL)
+			goto next_port;
+		/* We assume the interrupt number isn't translated ... */
+		interrupts = (u32 *)get_property(np, "interrupts", NULL);
+		/* get clock freq. if present */
+		clk = (u32 *)get_property(np, "clock-frequency", NULL);
+		/* get default speed if present */
+		spd = (u32 *)get_property(np, "current-speed", NULL);
+		/* Default to locate at end of array */
+		index = old_serial_count; /* end of the array by default */
+
+		/* If we have a location index, then use it */
+		if (typep && *typep == 'S') {
+			index = simple_strtol(typep+1, NULL, 0) - 1;
+			/* if index is out of range, use end of array instead */
+			if (index >= MAX_LEGACY_SERIAL_PORTS)
+				index = old_serial_count;
+			/* if our index is still out of range, that mean that
+			 * array is full, we could scan for a free slot but that
+			 * make little sense to bother, just skip the port
+			 */
+			if (index >= MAX_LEGACY_SERIAL_PORTS)
+				goto next_port;
+			if (index >= old_serial_count)
+				old_serial_count = index + 1;
+			/* Check if there is a port who already claimed our slot */
+			if (serial_ports[index].iobase != 0) {
+				/* if we still have some room, move it, else override */
+				if (old_serial_count < MAX_LEGACY_SERIAL_PORTS) {
+					DBG("Moved legacy port %d -> %d\n", index,
+					    old_serial_count);
+					serial_ports[old_serial_count++] =
+						serial_ports[index];
+				} else {
+					DBG("Replacing legacy port %d\n", index);
+				}
+			}
+		}
+		if (index >= MAX_LEGACY_SERIAL_PORTS)
+			goto next_port;
+		if (index >= old_serial_count)
+			old_serial_count = index + 1;
+
+		/* Now fill the entry */
+		memset(&serial_ports[index], 0, sizeof(struct plat_serial8250_port));
+		serial_ports[index].uartclk = clk ? *clk : BASE_BAUD * 16;
+		serial_ports[index].iobase = reg->address;
+		serial_ports[index].irq = interrupts ? interrupts[0] : 0;
+		serial_ports[index].flags = ASYNC_BOOT_AUTOCONF;
+
+		DBG("Added legacy port, index: %d, port: %x, irq: %d, clk: %d\n",
+		    index,
+		    serial_ports[index].iobase,
+		    serial_ports[index].irq,
+		    serial_ports[index].uartclk);
+
+		/* Get phys address of IO reg for port 1 */
+		if (index != 0)
+			goto next_port;
+
+		pci = of_get_parent(isa);
+		if (!pci) {
+			DBG("%s: no pci parent found\n", np->full_name);
+			goto next_port;
+		}
+
+		rangesp = (u32 *)get_property(pci, "ranges", &rlen);
+		if (rangesp == NULL) {
+			of_node_put(pci);
+			goto next_port;
+		}
+		rlen /= 4;
+
+		/* we need the #size-cells of the PCI bridge node itself */
+		phys_size = 1;
+		sizeprop = (u32 *)get_property(pci, "#size-cells", NULL);
+		if (sizeprop != NULL)
+			phys_size = *sizeprop;
+		/* we need the parent #addr-cells */
+		addr_size = prom_n_addr_cells(pci);
+		rentsize = 3 + addr_size + phys_size;
+		io_base = 0;
+		for (;rlen >= rentsize; rlen -= rentsize,rangesp += rentsize) {
+			if (((rangesp[0] >> 24) & 0x3) != 1)
+				continue; /* not IO space */
+			io_base = rangesp[3];
+			if (addr_size == 2)
+				io_base = (io_base << 32) | rangesp[4];
+		}
+		if (io_base != 0) {
+			*physport = io_base + reg->address;
+			if (default_speed && spd)
+				*default_speed = *spd;
+		}
+		of_node_put(pci);
+	next_port:
+		of_node_put(isa);
+	}
+
+	DBG(" <- generic_find_legacy_serial_port()\n");
+}
+
+static struct platform_device serial_device = {
+	.name	= "serial8250",
+	.id	= 0,
+	.dev	= {
+		.platform_data = serial_ports,
+	},
+};
+
+static int __init serial_dev_init(void)
+{
+	return platform_device_register(&serial_device);
+}
+arch_initcall(serial_dev_init);
+
+#endif /* CONFIG_PPC_ISERIES */
+
+int check_legacy_ioport(unsigned long base_port)
+{
+	if (ppc_md.check_legacy_ioport == NULL)
+		return 0;
+	return ppc_md.check_legacy_ioport(base_port);
+}
+EXPORT_SYMBOL(check_legacy_ioport);
+
+#ifdef CONFIG_XMON
+static int __init early_xmon(char *p)
+{
+	/* ensure xmon is enabled */
+	if (p) {
+		if (strncmp(p, "on", 2) == 0)
+			xmon_init();
+		if (strncmp(p, "early", 5) != 0)
+			return 0;
+	}
+	xmon_init();
+	debugger(NULL);
+
+	return 0;
+}
+early_param("xmon", early_xmon);
+#endif
+
+void cpu_die(void)
+{
+	if (ppc_md.cpu_die)
+		ppc_md.cpu_die();
+}
diff --git a/arch/ppc64/kernel/signal.c b/arch/ppc64/kernel/signal.c
new file mode 100644
index 00000000000..a95a2b49a1d
--- /dev/null
+++ b/arch/ppc64/kernel/signal.c
@@ -0,0 +1,575 @@
+/*
+ *  linux/arch/ppc64/kernel/signal.c
+ *
+ *  PowerPC version 
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Derived from "arch/i386/kernel/signal.c"
+ *    Copyright (C) 1991, 1992 Linus Torvalds
+ *    1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/elf.h>
+#include <linux/ptrace.h>
+#include <linux/module.h>
+
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/ppcdebug.h>
+#include <asm/unistd.h>
+#include <asm/cacheflush.h>
+#include <asm/vdso.h>
+
+#define DEBUG_SIG 0
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+#ifndef MIN
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define GP_REGS_SIZE	MIN(sizeof(elf_gregset_t), sizeof(struct pt_regs))
+#define FP_REGS_SIZE	sizeof(elf_fpregset_t)
+
+#define TRAMP_TRACEBACK	3
+#define TRAMP_SIZE	6
+
+/*
+ * When we have signals to deliver, we set up on the user stack,
+ * going down from the original stack pointer:
+ *	1) a rt_sigframe struct which contains the ucontext	
+ *	2) a gap of __SIGNAL_FRAMESIZE bytes which acts as a dummy caller
+ *	   frame for the signal handler.
+ */
+
+struct rt_sigframe {
+	/* sys_rt_sigreturn requires the ucontext be the first field */
+	struct ucontext uc;
+	unsigned long _unused[2];
+	unsigned int tramp[TRAMP_SIZE];
+	struct siginfo *pinfo;
+	void *puc;
+	struct siginfo info;
+	/* 64 bit ABI allows for 288 bytes below sp before decrementing it. */
+	char abigap[288];
+} __attribute__ ((aligned (16)));
+
+
+/*
+ * Atomically swap in the new signal mask, and wait for a signal.
+ */
+long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, int p3, int p4,
+		       int p6, int p7, struct pt_regs *regs)
+{
+	sigset_t saveset, newset;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&newset, unewset, sizeof(newset)))
+		return -EFAULT;
+	sigdelsetmask(&newset, ~_BLOCKABLE);
+
+	spin_lock_irq(&current->sighand->siglock);
+	saveset = current->blocked;
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	regs->result = -EINTR;
+	regs->gpr[3] = EINTR;
+	regs->ccr |= 0x10000000;
+	while (1) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule();
+		if (do_signal(&saveset, regs))
+			return 0;
+	}
+}
+
+long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long r5,
+		     unsigned long r6, unsigned long r7, unsigned long r8,
+		     struct pt_regs *regs)
+{
+	return do_sigaltstack(uss, uoss, regs->gpr[1]);
+}
+
+
+/*
+ * Set up the sigcontext for the signal frame.
+ */
+
+static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
+		 int signr, sigset_t *set, unsigned long handler)
+{
+	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
+	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
+	 * the context). This is very important because we must ensure we
+	 * don't lose the VRSAVE content that may have been set prior to
+	 * the process doing its first vector operation
+	 * Userland shall check AT_HWCAP to know wether it can rely on the
+	 * v_regs pointer or not
+	 */
+#ifdef CONFIG_ALTIVEC
+	elf_vrreg_t __user *v_regs = (elf_vrreg_t __user *)(((unsigned long)sc->vmx_reserve + 15) & ~0xful);
+#endif
+	long err = 0;
+
+	flush_fp_to_thread(current);
+
+	/* Make sure signal doesn't get spurrious FP exceptions */
+	current->thread.fpscr = 0;
+
+#ifdef CONFIG_ALTIVEC
+	err |= __put_user(v_regs, &sc->v_regs);
+
+	/* save altivec registers */
+	if (current->thread.used_vr) {
+		flush_altivec_to_thread(current);
+		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
+		err |= __copy_to_user(v_regs, current->thread.vr, 33 * sizeof(vector128));
+		/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
+		 * contains valid data.
+		 */
+		regs->msr |= MSR_VEC;
+	}
+	/* We always copy to/from vrsave, it's 0 if we don't have or don't
+	 * use altivec.
+	 */
+	err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]);
+#else /* CONFIG_ALTIVEC */
+	err |= __put_user(0, &sc->v_regs);
+#endif /* CONFIG_ALTIVEC */
+	err |= __put_user(&sc->gp_regs, &sc->regs);
+	err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE);
+	err |= __copy_to_user(&sc->fp_regs, &current->thread.fpr, FP_REGS_SIZE);
+	err |= __put_user(signr, &sc->signal);
+	err |= __put_user(handler, &sc->handler);
+	if (set != NULL)
+		err |=  __put_user(set->sig[0], &sc->oldmask);
+
+	return err;
+}
+
+/*
+ * Restore the sigcontext from the signal frame.
+ */
+
+static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
+			      struct sigcontext __user *sc)
+{
+#ifdef CONFIG_ALTIVEC
+	elf_vrreg_t __user *v_regs;
+#endif
+	unsigned long err = 0;
+	unsigned long save_r13 = 0;
+	elf_greg_t *gregs = (elf_greg_t *)regs;
+#ifdef CONFIG_ALTIVEC
+	unsigned long msr;
+#endif
+	int i;
+
+	/* If this is not a signal return, we preserve the TLS in r13 */
+	if (!sig)
+		save_r13 = regs->gpr[13];
+
+	/* copy everything before MSR */
+	err |= __copy_from_user(regs, &sc->gp_regs,
+				PT_MSR*sizeof(unsigned long));
+
+	/* skip MSR and SOFTE */
+	for (i = PT_MSR+1; i <= PT_RESULT; i++) {
+		if (i == PT_SOFTE)
+			continue;
+		err |= __get_user(gregs[i], &sc->gp_regs[i]);
+	}
+
+	if (!sig)
+		regs->gpr[13] = save_r13;
+	err |= __copy_from_user(&current->thread.fpr, &sc->fp_regs, FP_REGS_SIZE);
+	if (set != NULL)
+		err |=  __get_user(set->sig[0], &sc->oldmask);
+
+#ifdef CONFIG_ALTIVEC
+	err |= __get_user(v_regs, &sc->v_regs);
+	err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+	if (err)
+		return err;
+	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
+	if (v_regs != 0 && (msr & MSR_VEC) != 0)
+		err |= __copy_from_user(current->thread.vr, v_regs,
+					33 * sizeof(vector128));
+	else if (current->thread.used_vr)
+		memset(current->thread.vr, 0, 33 * sizeof(vector128));
+	/* Always get VRSAVE back */
+	if (v_regs != 0)
+		err |= __get_user(current->thread.vrsave, (u32 __user *)&v_regs[33]);
+	else
+		current->thread.vrsave = 0;
+#endif /* CONFIG_ALTIVEC */
+
+#ifndef CONFIG_SMP
+	preempt_disable();
+	if (last_task_used_math == current)
+		last_task_used_math = NULL;
+	if (last_task_used_altivec == current)
+		last_task_used_altivec = NULL;
+	preempt_enable();
+#endif
+	/* Force reload of FP/VEC */
+	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
+
+	return err;
+}
+
+/*
+ * Allocate space for the signal frame
+ */
+static inline void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
+				  size_t frame_size)
+{
+        unsigned long newsp;
+
+        /* Default to using normal stack */
+        newsp = regs->gpr[1];
+
+	if (ka->sa.sa_flags & SA_ONSTACK) {
+		if (! on_sig_stack(regs->gpr[1]))
+			newsp = (current->sas_ss_sp + current->sas_ss_size);
+	}
+
+        return (void __user *)((newsp - frame_size) & -16ul);
+}
+
+/*
+ * Setup the trampoline code on the stack
+ */
+static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp)
+{
+	int i;
+	long err = 0;
+
+	/* addi r1, r1, __SIGNAL_FRAMESIZE  # Pop the dummy stackframe */
+	err |= __put_user(0x38210000UL | (__SIGNAL_FRAMESIZE & 0xffff), &tramp[0]);
+	/* li r0, __NR_[rt_]sigreturn| */
+	err |= __put_user(0x38000000UL | (syscall & 0xffff), &tramp[1]);
+	/* sc */
+	err |= __put_user(0x44000002UL, &tramp[2]);
+
+	/* Minimal traceback info */
+	for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
+		err |= __put_user(0, &tramp[i]);
+
+	if (!err)
+		flush_icache_range((unsigned long) &tramp[0],
+			   (unsigned long) &tramp[TRAMP_SIZE]);
+
+	return err;
+}
+
+/*
+ * Restore the user process's signal mask (also used by signal32.c)
+ */
+void restore_sigmask(sigset_t *set)
+{
+	sigdelsetmask(set, ~_BLOCKABLE);
+	spin_lock_irq(&current->sighand->siglock);
+	current->blocked = *set;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+
+/*
+ * Handle {get,set,swap}_context operations
+ */
+int sys_swapcontext(struct ucontext __user *old_ctx,
+		    struct ucontext __user *new_ctx,
+		    long ctx_size, long r6, long r7, long r8, struct pt_regs *regs)
+{
+	unsigned char tmp;
+	sigset_t set;
+
+	/* Context size is for future use. Right now, we only make sure
+	 * we are passed something we understand
+	 */
+	if (ctx_size < sizeof(struct ucontext))
+		return -EINVAL;
+
+	if (old_ctx != NULL) {
+		if (!access_ok(VERIFY_WRITE, old_ctx, sizeof(*old_ctx))
+		    || setup_sigcontext(&old_ctx->uc_mcontext, regs, 0, NULL, 0)
+		    || __copy_to_user(&old_ctx->uc_sigmask,
+				      &current->blocked, sizeof(sigset_t)))
+			return -EFAULT;
+	}
+	if (new_ctx == NULL)
+		return 0;
+	if (!access_ok(VERIFY_READ, new_ctx, sizeof(*new_ctx))
+	    || __get_user(tmp, (u8 __user *) new_ctx)
+	    || __get_user(tmp, (u8 __user *) (new_ctx + 1) - 1))
+		return -EFAULT;
+
+	/*
+	 * If we get a fault copying the context into the kernel's
+	 * image of the user's registers, we can't just return -EFAULT
+	 * because the user's registers will be corrupted.  For instance
+	 * the NIP value may have been updated but not some of the
+	 * other registers.  Given that we have done the access_ok
+	 * and successfully read the first and last bytes of the region
+	 * above, this should only happen in an out-of-memory situation
+	 * or if another thread unmaps the region containing the context.
+	 * We kill the task with a SIGSEGV in this situation.
+	 */
+
+	if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
+		do_exit(SIGSEGV);
+	restore_sigmask(&set);
+	if (restore_sigcontext(regs, NULL, 0, &new_ctx->uc_mcontext))
+		do_exit(SIGSEGV);
+
+	/* This returns like rt_sigreturn */
+	return 0;
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
+		     unsigned long r6, unsigned long r7, unsigned long r8,
+		     struct pt_regs *regs)
+{
+	struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1];
+	sigset_t set;
+
+	/* Always make any pending restarted system calls return -EINTR */
+	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+	if (!access_ok(VERIFY_READ, uc, sizeof(*uc)))
+		goto badframe;
+
+	if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
+		goto badframe;
+	restore_sigmask(&set);
+	if (restore_sigcontext(regs, NULL, 1, &uc->uc_mcontext))
+		goto badframe;
+
+	/* do_sigaltstack expects a __user pointer and won't modify
+	 * what's in there anyway
+	 */
+	do_sigaltstack(&uc->uc_stack, NULL, regs->gpr[1]);
+
+	return regs->result;
+
+badframe:
+#if DEBUG_SIG
+	printk("badframe in sys_rt_sigreturn, regs=%p uc=%p &uc->uc_mcontext=%p\n",
+	       regs, uc, &uc->uc_mcontext);
+#endif
+	force_sig(SIGSEGV, current);
+	return 0;
+}
+
+static int setup_rt_frame(int signr, struct k_sigaction *ka, siginfo_t *info,
+		sigset_t *set, struct pt_regs *regs)
+{
+	/* Handler is *really* a pointer to the function descriptor for
+	 * the signal routine.  The first entry in the function
+	 * descriptor is the entry address of signal and the second
+	 * entry is the TOC value we need to use.
+	 */
+	func_descr_t __user *funct_desc_ptr;
+	struct rt_sigframe __user *frame;
+	unsigned long newsp = 0;
+	long err = 0;
+
+	frame = get_sigframe(ka, regs, sizeof(*frame));
+
+	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+		goto badframe;
+
+	err |= __put_user(&frame->info, &frame->pinfo);
+	err |= __put_user(&frame->uc, &frame->puc);
+	err |= copy_siginfo_to_user(&frame->info, info);
+	if (err)
+		goto badframe;
+
+	/* Create the ucontext.  */
+	err |= __put_user(0, &frame->uc.uc_flags);
+	err |= __put_user(0, &frame->uc.uc_link);
+	err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+	err |= __put_user(sas_ss_flags(regs->gpr[1]),
+			  &frame->uc.uc_stack.ss_flags);
+	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+	err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr, NULL,
+				(unsigned long)ka->sa.sa_handler);
+	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+	if (err)
+		goto badframe;
+
+	/* Set up to return from userspace. */
+	if (vdso64_rt_sigtramp && current->thread.vdso_base) {
+		regs->link = current->thread.vdso_base + vdso64_rt_sigtramp;
+	} else {
+		err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
+		if (err)
+			goto badframe;
+		regs->link = (unsigned long) &frame->tramp[0];
+	}
+	funct_desc_ptr = (func_descr_t __user *) ka->sa.sa_handler;
+
+	/* Allocate a dummy caller frame for the signal handler. */
+	newsp = (unsigned long)frame - __SIGNAL_FRAMESIZE;
+	err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
+
+	/* Set up "regs" so we "return" to the signal handler. */
+	err |= get_user(regs->nip, &funct_desc_ptr->entry);
+	regs->gpr[1] = newsp;
+	err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
+	regs->gpr[3] = signr;
+	regs->result = 0;
+	if (ka->sa.sa_flags & SA_SIGINFO) {
+		err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
+		err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);
+		regs->gpr[6] = (unsigned long) frame;
+	} else {
+		regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext;
+	}
+	if (err)
+		goto badframe;
+
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
+
+	return 1;
+
+badframe:
+#if DEBUG_SIG
+	printk("badframe in setup_rt_frame, regs=%p frame=%p newsp=%lx\n",
+	       regs, frame, newsp);
+#endif
+	force_sigsegv(signr, current);
+	return 0;
+}
+
+
+/*
+ * OK, we're invoking a handler
+ */
+static int handle_signal(unsigned long sig, struct k_sigaction *ka,
+			 siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
+{
+	int ret;
+
+	/* Set up Signal Frame */
+	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+
+	if (ret && !(ka->sa.sa_flags & SA_NODEFER)) {
+		spin_lock_irq(&current->sighand->siglock);
+		sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+		sigaddset(&current->blocked,sig);
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+
+	return ret;
+}
+
+static inline void syscall_restart(struct pt_regs *regs, struct k_sigaction *ka)
+{
+	switch ((int)regs->result) {
+	case -ERESTART_RESTARTBLOCK:
+	case -ERESTARTNOHAND:
+		/* ERESTARTNOHAND means that the syscall should only be
+		 * restarted if there was no handler for the signal, and since
+		 * we only get here if there is a handler, we dont restart.
+		 */
+		regs->result = -EINTR;
+		break;
+	case -ERESTARTSYS:
+		/* ERESTARTSYS means to restart the syscall if there is no
+		 * handler or the handler was registered with SA_RESTART
+		 */
+		if (!(ka->sa.sa_flags & SA_RESTART)) {
+			regs->result = -EINTR;
+			break;
+		}
+		/* fallthrough */
+	case -ERESTARTNOINTR:
+		/* ERESTARTNOINTR means that the syscall should be
+		 * called again after the signal handler returns.
+		 */
+		regs->gpr[3] = regs->orig_gpr3;
+		regs->nip -= 4;
+		regs->result = 0;
+		break;
+	}
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+int do_signal(sigset_t *oldset, struct pt_regs *regs)
+{
+	siginfo_t info;
+	int signr;
+	struct k_sigaction ka;
+
+	/*
+	 * If the current thread is 32 bit - invoke the
+	 * 32 bit signal handling code
+	 */
+	if (test_thread_flag(TIF_32BIT))
+		return do_signal32(oldset, regs);
+
+	if (!oldset)
+		oldset = &current->blocked;
+
+	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+	if (signr > 0) {
+		/* Whee!  Actually deliver the signal.  */
+		if (TRAP(regs) == 0x0C00)
+			syscall_restart(regs, &ka);
+		return handle_signal(signr, &ka, &info, oldset, regs);
+	}
+
+	if (TRAP(regs) == 0x0C00) {	/* System Call! */
+		if ((int)regs->result == -ERESTARTNOHAND ||
+		    (int)regs->result == -ERESTARTSYS ||
+		    (int)regs->result == -ERESTARTNOINTR) {
+			regs->gpr[3] = regs->orig_gpr3;
+			regs->nip -= 4; /* Back up & retry system call */
+			regs->result = 0;
+		} else if ((int)regs->result == -ERESTART_RESTARTBLOCK) {
+			regs->gpr[0] = __NR_restart_syscall;
+			regs->nip -= 4;
+			regs->result = 0;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(do_signal);
diff --git a/arch/ppc64/kernel/signal32.c b/arch/ppc64/kernel/signal32.c
new file mode 100644
index 00000000000..b0e167db6af
--- /dev/null
+++ b/arch/ppc64/kernel/signal32.c
@@ -0,0 +1,989 @@
+/*
+ * signal32.c: Support 32bit signal syscalls.
+ *
+ * Copyright (C) 2001 IBM
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/mm.h> 
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/syscalls.h>
+#include <linux/errno.h>
+#include <linux/elf.h>
+#include <linux/compat.h>
+#include <linux/ptrace.h>
+#include <asm/ppc32.h>
+#include <asm/uaccess.h>
+#include <asm/ppcdebug.h>
+#include <asm/unistd.h>
+#include <asm/cacheflush.h>
+#include <asm/vdso.h>
+
+#define DEBUG_SIG 0
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+#define GP_REGS_SIZE32	min(sizeof(elf_gregset_t32), sizeof(struct pt_regs32))
+
+/*
+ * When we have signals to deliver, we set up on the
+ * user stack, going down from the original stack pointer:
+ *	a sigregs32 struct
+ *	a sigcontext32 struct
+ *	a gap of __SIGNAL_FRAMESIZE32 bytes
+ *
+ * Each of these things must be a multiple of 16 bytes in size.
+ *
+ */
+struct sigregs32 {
+	struct mcontext32	mctx;		/* all the register values */
+	/*
+	 * Programs using the rs6000/xcoff abi can save up to 19 gp
+	 * regs and 18 fp regs below sp before decrementing it.
+	 */
+	int			abigap[56];
+};
+
+/* We use the mc_pad field for the signal return trampoline. */
+#define tramp	mc_pad
+
+/*
+ *  When we have rt signals to deliver, we set up on the
+ *  user stack, going down from the original stack pointer:
+ *	one rt_sigframe32 struct (siginfo + ucontext + ABI gap)
+ *	a gap of __SIGNAL_FRAMESIZE32+16 bytes
+ *  (the +16 is to get the siginfo and ucontext32 in the same
+ *  positions as in older kernels).
+ *
+ *  Each of these things must be a multiple of 16 bytes in size.
+ *
+ */
+struct rt_sigframe32 {
+	compat_siginfo_t	info;
+	struct ucontext32	uc;
+	/*
+	 * Programs using the rs6000/xcoff abi can save up to 19 gp
+	 * regs and 18 fp regs below sp before decrementing it.
+	 */
+	int			abigap[56];
+};
+
+
+/*
+ * Common utility functions used by signal and context support
+ *
+ */
+
+/*
+ * Restore the user process's signal mask
+ * (implemented in signal.c)
+ */
+extern void restore_sigmask(sigset_t *set);
+
+/*
+ * Functions for flipping sigsets (thanks to brain dead generic
+ * implementation that makes things simple for little endian only
+ */
+static inline void compat_from_sigset(compat_sigset_t *compat, sigset_t *set)
+{
+	switch (_NSIG_WORDS) {
+	case 4: compat->sig[5] = set->sig[3] & 0xffffffffull ;
+		compat->sig[7] = set->sig[3] >> 32; 
+	case 3: compat->sig[4] = set->sig[2] & 0xffffffffull ;
+		compat->sig[5] = set->sig[2] >> 32; 
+	case 2: compat->sig[2] = set->sig[1] & 0xffffffffull ;
+		compat->sig[3] = set->sig[1] >> 32; 
+	case 1: compat->sig[0] = set->sig[0] & 0xffffffffull ;
+		compat->sig[1] = set->sig[0] >> 32; 
+	}
+}
+
+static inline void sigset_from_compat(sigset_t *set, compat_sigset_t *compat)
+{
+	switch (_NSIG_WORDS) {
+	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32);
+	case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32);
+	case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32);
+	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32);
+	}
+}
+
+
+/*
+ * Save the current user registers on the user stack.
+ * We only save the altivec registers if the process has used
+ * altivec instructions at some point.
+ */
+static int save_user_regs(struct pt_regs *regs, struct mcontext32 __user *frame, int sigret)
+{
+	elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
+	int i, err = 0;
+
+	/* Make sure floating point registers are stored in regs */
+	flush_fp_to_thread(current);
+
+	/* save general and floating-point registers */
+	for (i = 0; i <= PT_RESULT; i ++)
+		err |= __put_user((unsigned int)gregs[i], &frame->mc_gregs[i]);
+	err |= __copy_to_user(&frame->mc_fregs, current->thread.fpr,
+			      ELF_NFPREG * sizeof(double));
+	if (err)
+		return 1;
+
+	current->thread.fpscr = 0;	/* turn off all fp exceptions */
+
+#ifdef CONFIG_ALTIVEC
+	/* save altivec registers */
+	if (current->thread.used_vr) {
+		flush_altivec_to_thread(current);
+		if (__copy_to_user(&frame->mc_vregs, current->thread.vr,
+				   ELF_NVRREG32 * sizeof(vector128)))
+			return 1;
+		/* set MSR_VEC in the saved MSR value to indicate that
+		   frame->mc_vregs contains valid data */
+		if (__put_user(regs->msr | MSR_VEC, &frame->mc_gregs[PT_MSR]))
+			return 1;
+	}
+	/* else assert((regs->msr & MSR_VEC) == 0) */
+
+	/* We always copy to/from vrsave, it's 0 if we don't have or don't
+	 * use altivec. Since VSCR only contains 32 bits saved in the least
+	 * significant bits of a vector, we "cheat" and stuff VRSAVE in the
+	 * most significant bits of that same vector. --BenH
+	 */
+	if (__put_user(current->thread.vrsave, (u32 __user *)&frame->mc_vregs[32]))
+		return 1;
+#endif /* CONFIG_ALTIVEC */
+
+	if (sigret) {
+		/* Set up the sigreturn trampoline: li r0,sigret; sc */
+		if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
+		    || __put_user(0x44000002UL, &frame->tramp[1]))
+			return 1;
+		flush_icache_range((unsigned long) &frame->tramp[0],
+				   (unsigned long) &frame->tramp[2]);
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the current user register values from the user stack,
+ * (except for MSR).
+ */
+static long restore_user_regs(struct pt_regs *regs,
+			      struct mcontext32 __user *sr, int sig)
+{
+	elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
+	int i;
+	long err = 0;
+	unsigned int save_r2 = 0;
+#ifdef CONFIG_ALTIVEC
+	unsigned long msr;
+#endif
+
+	/*
+	 * restore general registers but not including MSR or SOFTE. Also
+	 * take care of keeping r2 (TLS) intact if not a signal
+	 */
+	if (!sig)
+		save_r2 = (unsigned int)regs->gpr[2];
+	for (i = 0; i <= PT_RESULT; i++) {
+		if ((i == PT_MSR) || (i == PT_SOFTE))
+			continue;
+		err |= __get_user(gregs[i], &sr->mc_gregs[i]);
+	}
+	if (!sig)
+		regs->gpr[2] = (unsigned long) save_r2;
+	if (err)
+		return 1;
+
+	/* force the process to reload the FP registers from
+	   current->thread when it next does FP instructions */
+	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);
+	if (__copy_from_user(current->thread.fpr, &sr->mc_fregs,
+			     sizeof(sr->mc_fregs)))
+		return 1;
+
+#ifdef CONFIG_ALTIVEC
+	/* force the process to reload the altivec registers from
+	   current->thread when it next does altivec instructions */
+	regs->msr &= ~MSR_VEC;
+	if (!__get_user(msr, &sr->mc_gregs[PT_MSR]) && (msr & MSR_VEC) != 0) {
+		/* restore altivec registers from the stack */
+		if (__copy_from_user(current->thread.vr, &sr->mc_vregs,
+				     sizeof(sr->mc_vregs)))
+			return 1;
+	} else if (current->thread.used_vr)
+		memset(current->thread.vr, 0, ELF_NVRREG32 * sizeof(vector128));
+
+	/* Always get VRSAVE back */
+	if (__get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32]))
+		return 1;
+#endif /* CONFIG_ALTIVEC */
+
+#ifndef CONFIG_SMP
+	preempt_disable();
+	if (last_task_used_math == current)
+		last_task_used_math = NULL;
+	if (last_task_used_altivec == current)
+		last_task_used_altivec = NULL;
+	preempt_enable();
+#endif
+	return 0;
+}
+
+
+/*
+ *  Start of nonRT signal support
+ *
+ *     sigset_t is 32 bits for non-rt signals
+ *
+ *  System Calls
+ *       sigaction                sys32_sigaction
+ *       sigreturn                sys32_sigreturn
+ *
+ *  Note sigsuspend has no special 32 bit routine - uses the 64 bit routine
+ *
+ *  Other routines
+ *        setup_frame32
+ */
+
+/*
+ * Atomically swap in the new signal mask, and wait for a signal.
+ */
+long sys32_sigsuspend(old_sigset_t mask, int p2, int p3, int p4, int p6, int p7,
+	       struct pt_regs *regs)
+{
+	sigset_t saveset;
+
+	mask &= _BLOCKABLE;
+	spin_lock_irq(&current->sighand->siglock);
+	saveset = current->blocked;
+	siginitset(&current->blocked, mask);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	regs->result = -EINTR;
+	regs->gpr[3] = EINTR;
+	regs->ccr |= 0x10000000;
+	while (1) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule();
+		if (do_signal32(&saveset, regs))
+			/*
+			 * Returning 0 means we return to userspace via
+			 * ret_from_except and thus restore all user
+			 * registers from *regs.  This is what we need
+			 * to do when a signal has been delivered.
+			 */
+			return 0;
+	}
+}
+
+long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
+		struct old_sigaction32 __user *oact)
+{
+	struct k_sigaction new_ka, old_ka;
+	int ret;
+	
+	if (sig < 0)
+		sig = -sig;
+
+	if (act) {
+		compat_old_sigset_t mask;
+		compat_uptr_t handler, restorer;
+
+		if (get_user(handler, &act->sa_handler) ||
+		    __get_user(restorer, &act->sa_restorer) ||
+		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+		    __get_user(mask, &act->sa_mask))
+			return -EFAULT;
+		new_ka.sa.sa_handler = compat_ptr(handler);
+		new_ka.sa.sa_restorer = compat_ptr(restorer);
+		siginitset(&new_ka.sa.sa_mask, mask);
+	}
+
+	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+	if (!ret && oact) {
+		if (put_user((long)old_ka.sa.sa_handler, &oact->sa_handler) ||
+		    __put_user((long)old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+			return -EFAULT;
+	}
+
+	return ret;
+}
+
+
+
+/*
+ *  Start of RT signal support
+ *
+ *     sigset_t is 64 bits for rt signals
+ *
+ *  System Calls
+ *       sigaction                sys32_rt_sigaction
+ *       sigpending               sys32_rt_sigpending
+ *       sigprocmask              sys32_rt_sigprocmask
+ *       sigreturn                sys32_rt_sigreturn
+ *       sigqueueinfo             sys32_rt_sigqueueinfo
+ *       sigsuspend               sys32_rt_sigsuspend
+ *
+ *  Other routines
+ *        setup_rt_frame32
+ *        copy_siginfo_to_user32
+ *        siginfo32to64
+ */
+
+
+long sys32_rt_sigaction(int sig, const struct sigaction32 __user *act,
+		struct sigaction32 __user *oact, size_t sigsetsize)
+{
+	struct k_sigaction new_ka, old_ka;
+	int ret;
+	compat_sigset_t set32;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(compat_sigset_t))
+		return -EINVAL;
+
+	if (act) {
+		compat_uptr_t handler;
+
+		ret = get_user(handler, &act->sa_handler);
+		new_ka.sa.sa_handler = compat_ptr(handler);
+		ret |= __copy_from_user(&set32, &act->sa_mask,
+					sizeof(compat_sigset_t));
+		sigset_from_compat(&new_ka.sa.sa_mask, &set32);
+		ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+		if (ret)
+			return -EFAULT;
+	}
+
+	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+	if (!ret && oact) {
+		compat_from_sigset(&set32, &old_ka.sa.sa_mask);
+		ret = put_user((long)old_ka.sa.sa_handler, &oact->sa_handler);
+		ret |= __copy_to_user(&oact->sa_mask, &set32,
+				      sizeof(compat_sigset_t));
+		ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+	}
+	return ret;
+}
+
+/*
+ * Note: it is necessary to treat how as an unsigned int, with the
+ * corresponding cast to a signed int to insure that the proper
+ * conversion (sign extension) between the register representation
+ * of a signed int (msr in 32-bit mode) and the register representation
+ * of a signed int (msr in 64-bit mode) is performed.
+ */
+long sys32_rt_sigprocmask(u32 how, compat_sigset_t __user *set,
+		compat_sigset_t __user *oset, size_t sigsetsize)
+{
+	sigset_t s;
+	sigset_t __user *up;
+	compat_sigset_t s32;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+
+	if (set) {
+		if (copy_from_user (&s32, set, sizeof(compat_sigset_t)))
+			return -EFAULT;    
+		sigset_from_compat(&s, &s32);
+	}
+	
+	set_fs(KERNEL_DS);
+	/* This is valid because of the set_fs() */
+	up = (sigset_t __user *) &s;
+	ret = sys_rt_sigprocmask((int)how, set ? up : NULL, oset ? up : NULL,
+				 sigsetsize); 
+	set_fs(old_fs);
+	if (ret)
+		return ret;
+	if (oset) {
+		compat_from_sigset(&s32, &s);
+		if (copy_to_user (oset, &s32, sizeof(compat_sigset_t)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+long sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
+{
+	sigset_t s;
+	compat_sigset_t s32;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	/* The __user pointer cast is valid because of the set_fs() */
+	ret = sys_rt_sigpending((sigset_t __user *) &s, sigsetsize);
+	set_fs(old_fs);
+	if (!ret) {
+		compat_from_sigset(&s32, &s);
+		if (copy_to_user (set, &s32, sizeof(compat_sigset_t)))
+			return -EFAULT;
+	}
+	return ret;
+}
+
+
+int copy_siginfo_to_user32(struct compat_siginfo __user *d, siginfo_t *s)
+{
+	int err;
+
+	if (!access_ok (VERIFY_WRITE, d, sizeof(*d)))
+		return -EFAULT;
+
+	/* If you change siginfo_t structure, please be sure
+	 * this code is fixed accordingly.
+	 * It should never copy any pad contained in the structure
+	 * to avoid security leaks, but must copy the generic
+	 * 3 ints plus the relevant union member.
+	 * This routine must convert siginfo from 64bit to 32bit as well
+	 * at the same time.
+	 */
+	err = __put_user(s->si_signo, &d->si_signo);
+	err |= __put_user(s->si_errno, &d->si_errno);
+	err |= __put_user((short)s->si_code, &d->si_code);
+	if (s->si_code < 0)
+		err |= __copy_to_user(&d->_sifields._pad, &s->_sifields._pad,
+				      SI_PAD_SIZE32);
+	else switch(s->si_code >> 16) {
+	case __SI_CHLD >> 16:
+		err |= __put_user(s->si_pid, &d->si_pid);
+		err |= __put_user(s->si_uid, &d->si_uid);
+		err |= __put_user(s->si_utime, &d->si_utime);
+		err |= __put_user(s->si_stime, &d->si_stime);
+		err |= __put_user(s->si_status, &d->si_status);
+		break;
+	case __SI_FAULT >> 16:
+		err |= __put_user((unsigned int)(unsigned long)s->si_addr,
+				  &d->si_addr);
+		break;
+	case __SI_POLL >> 16:
+		err |= __put_user(s->si_band, &d->si_band);
+		err |= __put_user(s->si_fd, &d->si_fd);
+		break;
+	case __SI_TIMER >> 16:
+		err |= __put_user(s->si_tid, &d->si_tid);
+		err |= __put_user(s->si_overrun, &d->si_overrun);
+		err |= __put_user(s->si_int, &d->si_int);
+		break;
+	case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+	case __SI_MESGQ >> 16:
+		err |= __put_user(s->si_int, &d->si_int);
+		/* fallthrough */
+	case __SI_KILL >> 16:
+	default:
+		err |= __put_user(s->si_pid, &d->si_pid);
+		err |= __put_user(s->si_uid, &d->si_uid);
+		break;
+	}
+	return err;
+}
+
+/*
+ * Note: it is necessary to treat pid and sig as unsigned ints, with the
+ * corresponding cast to a signed int to insure that the proper conversion
+ * (sign extension) between the register representation of a signed int
+ * (msr in 32-bit mode) and the register representation of a signed int
+ * (msr in 64-bit mode) is performed.
+ */
+long sys32_rt_sigqueueinfo(u32 pid, u32 sig, compat_siginfo_t __user *uinfo)
+{
+	siginfo_t info;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+	
+	if (copy_from_user (&info, uinfo, 3*sizeof(int)) ||
+	    copy_from_user (info._sifields._pad, uinfo->_sifields._pad, SI_PAD_SIZE32))
+		return -EFAULT;
+	set_fs (KERNEL_DS);
+	/* The __user pointer cast is valid becasuse of the set_fs() */
+	ret = sys_rt_sigqueueinfo((int)pid, (int)sig, (siginfo_t __user *) &info);
+	set_fs (old_fs);
+	return ret;
+}
+
+int sys32_rt_sigsuspend(compat_sigset_t __user * unewset, size_t sigsetsize, int p3,
+		int p4, int p6, int p7, struct pt_regs *regs)
+{
+	sigset_t saveset, newset;
+	compat_sigset_t s32;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&s32, unewset, sizeof(s32)))
+		return -EFAULT;
+
+	/*
+	 * Swap the 2 words of the 64-bit sigset_t (they are stored
+	 * in the "wrong" endian in 32-bit user storage).
+	 */
+	sigset_from_compat(&newset, &s32);
+
+	sigdelsetmask(&newset, ~_BLOCKABLE);
+	spin_lock_irq(&current->sighand->siglock);
+	saveset = current->blocked;
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	regs->result = -EINTR;
+	regs->gpr[3] = EINTR;
+	regs->ccr |= 0x10000000;
+	while (1) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule();
+		if (do_signal32(&saveset, regs))
+			/*
+			 * Returning 0 means we return to userspace via
+			 * ret_from_except and thus restore all user
+			 * registers from *regs.  This is what we need
+			 * to do when a signal has been delivered.
+			 */
+			return 0;
+	}
+}
+
+/*
+ *  Start Alternate signal stack support
+ *
+ *  System Calls
+ *       sigaltatck               sys32_sigaltstack
+ */
+
+int sys32_sigaltstack(u32 __new, u32 __old, int r5,
+		      int r6, int r7, int r8, struct pt_regs *regs)
+{
+	stack_32_t __user * newstack = (stack_32_t __user *)(long) __new;
+	stack_32_t __user * oldstack = (stack_32_t __user *)(long) __old;
+	stack_t uss, uoss;
+	int ret;
+	mm_segment_t old_fs;
+	unsigned long sp;
+	compat_uptr_t ss_sp;
+
+	/*
+	 * set sp to the user stack on entry to the system call
+	 * the system call router sets R9 to the saved registers
+	 */
+	sp = regs->gpr[1];
+
+	/* Put new stack info in local 64 bit stack struct */
+	if (newstack) {
+		if (get_user(ss_sp, &newstack->ss_sp) ||
+		    __get_user(uss.ss_flags, &newstack->ss_flags) ||
+		    __get_user(uss.ss_size, &newstack->ss_size))
+			return -EFAULT;
+		uss.ss_sp = compat_ptr(ss_sp);
+	}
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	/* The __user pointer casts are valid because of the set_fs() */
+	ret = do_sigaltstack(
+		newstack ? (stack_t __user *) &uss : NULL,
+		oldstack ? (stack_t __user *) &uoss : NULL,
+		sp);
+	set_fs(old_fs);
+	/* Copy the stack information to the user output buffer */
+	if (!ret && oldstack  &&
+		(put_user((long)uoss.ss_sp, &oldstack->ss_sp) ||
+		 __put_user(uoss.ss_flags, &oldstack->ss_flags) ||
+		 __put_user(uoss.ss_size, &oldstack->ss_size)))
+		return -EFAULT;
+	return ret;
+}
+
+
+/*
+ * Set up a signal frame for a "real-time" signal handler
+ * (one which gets siginfo).
+ */
+static int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
+			      siginfo_t *info, sigset_t *oldset,
+			      struct pt_regs * regs, unsigned long newsp)
+{
+	struct rt_sigframe32 __user *rt_sf;
+	struct mcontext32 __user *frame;
+	unsigned long origsp = newsp;
+	compat_sigset_t c_oldset;
+
+	/* Set up Signal Frame */
+	/* Put a Real Time Context onto stack */
+	newsp -= sizeof(*rt_sf);
+	rt_sf = (struct rt_sigframe32 __user *)newsp;
+
+	/* create a stack frame for the caller of the handler */
+	newsp -= __SIGNAL_FRAMESIZE32 + 16;
+
+	if (!access_ok(VERIFY_WRITE, (void __user *)newsp, origsp - newsp))
+		goto badframe;
+
+	compat_from_sigset(&c_oldset, oldset);
+
+	/* Put the siginfo & fill in most of the ucontext */
+	if (copy_siginfo_to_user32(&rt_sf->info, info)
+	    || __put_user(0, &rt_sf->uc.uc_flags)
+	    || __put_user(0, &rt_sf->uc.uc_link)
+	    || __put_user(current->sas_ss_sp, &rt_sf->uc.uc_stack.ss_sp)
+	    || __put_user(sas_ss_flags(regs->gpr[1]),
+			  &rt_sf->uc.uc_stack.ss_flags)
+	    || __put_user(current->sas_ss_size, &rt_sf->uc.uc_stack.ss_size)
+	    || __put_user((u32)(u64)&rt_sf->uc.uc_mcontext, &rt_sf->uc.uc_regs)
+	    || __copy_to_user(&rt_sf->uc.uc_sigmask, &c_oldset, sizeof(c_oldset)))
+		goto badframe;
+
+	/* Save user registers on the stack */
+	frame = &rt_sf->uc.uc_mcontext;
+	if (put_user(regs->gpr[1], (unsigned long __user *)newsp))
+		goto badframe;
+
+	if (vdso32_rt_sigtramp && current->thread.vdso_base) {
+		if (save_user_regs(regs, frame, 0))
+			goto badframe;
+		regs->link = current->thread.vdso_base + vdso32_rt_sigtramp;
+	} else {
+		if (save_user_regs(regs, frame, __NR_rt_sigreturn))
+			goto badframe;
+		regs->link = (unsigned long) frame->tramp;
+	}
+	regs->gpr[1] = (unsigned long) newsp;
+	regs->gpr[3] = sig;
+	regs->gpr[4] = (unsigned long) &rt_sf->info;
+	regs->gpr[5] = (unsigned long) &rt_sf->uc;
+	regs->gpr[6] = (unsigned long) rt_sf;
+	regs->nip = (unsigned long) ka->sa.sa_handler;
+	regs->trap = 0;
+	regs->result = 0;
+
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
+
+	return 1;
+
+badframe:
+#if DEBUG_SIG
+	printk("badframe in handle_rt_signal, regs=%p frame=%p newsp=%lx\n",
+	       regs, frame, newsp);
+#endif
+	force_sigsegv(sig, current);
+	return 0;
+}
+
+static long do_setcontext32(struct ucontext32 __user *ucp, struct pt_regs *regs, int sig)
+{
+	compat_sigset_t c_set;
+	sigset_t set;
+	u32 mcp;
+
+	if (__copy_from_user(&c_set, &ucp->uc_sigmask, sizeof(c_set))
+	    || __get_user(mcp, &ucp->uc_regs))
+		return -EFAULT;
+	sigset_from_compat(&set, &c_set);
+	restore_sigmask(&set);
+	if (restore_user_regs(regs, (struct mcontext32 __user *)(u64)mcp, sig))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Handle {get,set,swap}_context operations for 32 bits processes
+ */
+
+long sys32_swapcontext(struct ucontext32 __user *old_ctx,
+		       struct ucontext32 __user *new_ctx,
+		       int ctx_size, int r6, int r7, int r8, struct pt_regs *regs)
+{
+	unsigned char tmp;
+	compat_sigset_t c_set;
+
+	/* Context size is for future use. Right now, we only make sure
+	 * we are passed something we understand
+	 */
+	if (ctx_size < sizeof(struct ucontext32))
+		return -EINVAL;
+
+	if (old_ctx != NULL) {
+		compat_from_sigset(&c_set, &current->blocked);
+		if (!access_ok(VERIFY_WRITE, old_ctx, sizeof(*old_ctx))
+		    || save_user_regs(regs, &old_ctx->uc_mcontext, 0)
+		    || __copy_to_user(&old_ctx->uc_sigmask, &c_set, sizeof(c_set))
+		    || __put_user((u32)(u64)&old_ctx->uc_mcontext, &old_ctx->uc_regs))
+			return -EFAULT;
+	}
+	if (new_ctx == NULL)
+		return 0;
+	if (!access_ok(VERIFY_READ, new_ctx, sizeof(*new_ctx))
+	    || __get_user(tmp, (u8 __user *) new_ctx)
+	    || __get_user(tmp, (u8 __user *) (new_ctx + 1) - 1))
+		return -EFAULT;
+
+	/*
+	 * If we get a fault copying the context into the kernel's
+	 * image of the user's registers, we can't just return -EFAULT
+	 * because the user's registers will be corrupted.  For instance
+	 * the NIP value may have been updated but not some of the
+	 * other registers.  Given that we have done the access_ok
+	 * and successfully read the first and last bytes of the region
+	 * above, this should only happen in an out-of-memory situation
+	 * or if another thread unmaps the region containing the context.
+	 * We kill the task with a SIGSEGV in this situation.
+	 */
+	if (do_setcontext32(new_ctx, regs, 0))
+		do_exit(SIGSEGV);
+
+	return 0;
+}
+
+long sys32_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
+		     struct pt_regs *regs)
+{
+	struct rt_sigframe32 __user *rt_sf;
+	int ret;
+
+
+	/* Always make any pending restarted system calls return -EINTR */
+	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+	rt_sf = (struct rt_sigframe32 __user *)
+		(regs->gpr[1] + __SIGNAL_FRAMESIZE32 + 16);
+	if (!access_ok(VERIFY_READ, rt_sf, sizeof(*rt_sf)))
+		goto bad;
+	if (do_setcontext32(&rt_sf->uc, regs, 1))
+		goto bad;
+
+	/*
+	 * It's not clear whether or why it is desirable to save the
+	 * sigaltstack setting on signal delivery and restore it on
+	 * signal return.  But other architectures do this and we have
+	 * always done it up until now so it is probably better not to
+	 * change it.  -- paulus
+	 * We use the sys32_ version that does the 32/64 bits conversion
+	 * and takes userland pointer directly. What about error checking ?
+	 * nobody does any...
+	 */
+       	sys32_sigaltstack((u32)(u64)&rt_sf->uc.uc_stack, 0, 0, 0, 0, 0, regs);
+
+	ret = regs->result;
+
+	return ret;
+
+ bad:
+	force_sig(SIGSEGV, current);
+	return 0;
+}
+
+
+/*
+ * OK, we're invoking a handler
+ */
+static int handle_signal32(unsigned long sig, struct k_sigaction *ka,
+			    siginfo_t *info, sigset_t *oldset,
+			    struct pt_regs * regs, unsigned long newsp)
+{
+	struct sigcontext32 __user *sc;
+	struct sigregs32 __user *frame;
+	unsigned long origsp = newsp;
+
+	/* Set up Signal Frame */
+	newsp -= sizeof(struct sigregs32);
+	frame = (struct sigregs32 __user *) newsp;
+
+	/* Put a sigcontext on the stack */
+	newsp -= sizeof(*sc);
+	sc = (struct sigcontext32 __user *) newsp;
+
+	/* create a stack frame for the caller of the handler */
+	newsp -= __SIGNAL_FRAMESIZE32;
+
+	if (!access_ok(VERIFY_WRITE, (void __user *) newsp, origsp - newsp))
+		goto badframe;
+
+#if _NSIG != 64
+#error "Please adjust handle_signal32()"
+#endif
+	if (__put_user((u32)(u64)ka->sa.sa_handler, &sc->handler)
+	    || __put_user(oldset->sig[0], &sc->oldmask)
+	    || __put_user((oldset->sig[0] >> 32), &sc->_unused[3])
+	    || __put_user((u32)(u64)frame, &sc->regs)
+	    || __put_user(sig, &sc->signal))
+		goto badframe;
+
+	if (vdso32_sigtramp && current->thread.vdso_base) {
+		if (save_user_regs(regs, &frame->mctx, 0))
+			goto badframe;
+		regs->link = current->thread.vdso_base + vdso32_sigtramp;
+	} else {
+		if (save_user_regs(regs, &frame->mctx, __NR_sigreturn))
+			goto badframe;
+		regs->link = (unsigned long) frame->mctx.tramp;
+	}
+
+	if (put_user(regs->gpr[1], (unsigned long __user *)newsp))
+		goto badframe;
+	regs->gpr[1] = (unsigned long) newsp;
+	regs->gpr[3] = sig;
+	regs->gpr[4] = (unsigned long) sc;
+	regs->nip = (unsigned long) ka->sa.sa_handler;
+	regs->trap = 0;
+	regs->result = 0;
+
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
+
+	return 1;
+
+badframe:
+#if DEBUG_SIG
+	printk("badframe in handle_signal, regs=%p frame=%x newsp=%x\n",
+	       regs, frame, *newspp);
+#endif
+	force_sigsegv(sig, current);
+	return 0;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+long sys32_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
+		       struct pt_regs *regs)
+{
+	struct sigcontext32 __user *sc;
+	struct sigcontext32 sigctx;
+	struct mcontext32 __user *sr;
+	sigset_t set;
+	int ret;
+
+	/* Always make any pending restarted system calls return -EINTR */
+	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+	sc = (struct sigcontext32 __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE32);
+	if (copy_from_user(&sigctx, sc, sizeof(sigctx)))
+		goto badframe;
+
+	/*
+	 * Note that PPC32 puts the upper 32 bits of the sigmask in the
+	 * unused part of the signal stackframe
+	 */
+	set.sig[0] = sigctx.oldmask + ((long)(sigctx._unused[3]) << 32);
+	restore_sigmask(&set);
+
+	sr = (struct mcontext32 __user *)(u64)sigctx.regs;
+	if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
+	    || restore_user_regs(regs, sr, 1))
+		goto badframe;
+
+	ret = regs->result;
+	return ret;
+
+badframe:
+	force_sig(SIGSEGV, current);
+	return 0;
+}
+
+
+
+/*
+ *  Start of do_signal32 routine
+ *
+ *   This routine gets control when a pending signal needs to be processed
+ *     in the 32 bit target thread -
+ *
+ *   It handles both rt and non-rt signals
+ */
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+
+int do_signal32(sigset_t *oldset, struct pt_regs *regs)
+{
+	siginfo_t info;
+	unsigned int frame, newsp;
+	int signr, ret;
+	struct k_sigaction ka;
+
+	if (!oldset)
+		oldset = &current->blocked;
+
+	newsp = frame = 0;
+
+	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+
+	if (TRAP(regs) == 0x0C00		/* System Call! */
+	    && regs->ccr & 0x10000000		/* error signalled */
+	    && ((ret = regs->gpr[3]) == ERESTARTSYS
+		|| ret == ERESTARTNOHAND || ret == ERESTARTNOINTR
+		|| ret == ERESTART_RESTARTBLOCK)) {
+
+		if (signr > 0
+		    && (ret == ERESTARTNOHAND || ret == ERESTART_RESTARTBLOCK
+			|| (ret == ERESTARTSYS
+			    && !(ka.sa.sa_flags & SA_RESTART)))) {
+			/* make the system call return an EINTR error */
+			regs->result = -EINTR;
+			regs->gpr[3] = EINTR;
+			/* note that the cr0.SO bit is already set */
+		} else {
+			regs->nip -= 4;	/* Back up & retry system call */
+			regs->result = 0;
+			regs->trap = 0;
+			if (ret == ERESTART_RESTARTBLOCK)
+				regs->gpr[0] = __NR_restart_syscall;
+			else
+				regs->gpr[3] = regs->orig_gpr3;
+		}
+	}
+
+	if (signr == 0)
+		return 0;		/* no signals delivered */
+
+	if ((ka.sa.sa_flags & SA_ONSTACK) && current->sas_ss_size
+	    && (!on_sig_stack(regs->gpr[1])))
+		newsp = (current->sas_ss_sp + current->sas_ss_size);
+	else
+		newsp = regs->gpr[1];
+	newsp &= ~0xfUL;
+
+	/* Whee!  Actually deliver the signal.  */
+	if (ka.sa.sa_flags & SA_SIGINFO)
+		ret = handle_rt_signal32(signr, &ka, &info, oldset, regs, newsp);
+	else
+		ret = handle_signal32(signr, &ka, &info, oldset, regs, newsp);
+
+	if (ret && !(ka.sa.sa_flags & SA_NODEFER)) {
+		spin_lock_irq(&current->sighand->siglock);
+		sigorsets(&current->blocked, &current->blocked,
+			  &ka.sa.sa_mask);
+		sigaddset(&current->blocked, signr);
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+
+	return ret;
+}
diff --git a/arch/ppc64/kernel/smp-tbsync.c b/arch/ppc64/kernel/smp-tbsync.c
new file mode 100644
index 00000000000..7d8ec9996b3
--- /dev/null
+++ b/arch/ppc64/kernel/smp-tbsync.c
@@ -0,0 +1,179 @@
+/*
+ * Smp timebase synchronization for ppc.
+ *
+ * Copyright (C) 2003 Samuel Rydh (samuel@ibrium.se)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/time.h>
+
+#define NUM_ITER		300
+
+enum {
+	kExit=0, kSetAndTest, kTest
+};
+
+static struct {
+	volatile long		tb;
+	volatile long		mark;
+	volatile int		cmd;
+	volatile int		handshake;
+	int			filler[3];
+
+	volatile int		ack;
+	int			filler2[7];
+
+	volatile int		race_result;
+} *tbsync;
+
+static volatile int		running;
+
+static void __devinit
+enter_contest( long mark, long add )
+{
+	while( (long)(mftb() - mark) < 0 )
+		tbsync->race_result = add;
+}
+
+void __devinit
+smp_generic_take_timebase( void )
+{
+	int cmd;
+	long tb;
+
+	local_irq_disable();
+	while( !running )
+		;
+	rmb();
+
+	for( ;; ) {
+		tbsync->ack = 1;
+		while( !tbsync->handshake )
+			;
+		rmb();
+
+		cmd = tbsync->cmd;
+		tb = tbsync->tb;
+		tbsync->ack = 0;
+		if( cmd == kExit )
+			return;
+
+		if( cmd == kSetAndTest ) {
+			while( tbsync->handshake )
+				;
+			asm volatile ("mttbl %0" :: "r" (tb & 0xfffffffful) );
+			asm volatile ("mttbu %0" :: "r" (tb >> 32) );
+		} else {
+			while( tbsync->handshake )
+				;
+		}
+		enter_contest( tbsync->mark, -1 );
+	}
+	local_irq_enable();
+}
+
+static int __devinit
+start_contest( int cmd, long offset, long num )
+{
+	int i, score=0;
+	long tb, mark;
+
+	tbsync->cmd = cmd;
+
+	local_irq_disable();
+	for( i=-3; i<num; ) {
+		tb = (long)mftb() + 400;
+		tbsync->tb = tb + offset;
+		tbsync->mark = mark = tb + 400;
+
+		wmb();
+
+		tbsync->handshake = 1;
+		while( tbsync->ack )
+			;
+
+		while( (long)(mftb() - tb) <= 0 )
+			;
+		tbsync->handshake = 0;
+		enter_contest( mark, 1 );
+
+		while( !tbsync->ack )
+			;
+
+	       	if ((tbsync->tb ^ (long)mftb()) & 0x8000000000000000ul)
+			continue;
+		if( i++ > 0 )
+			score += tbsync->race_result;
+	}
+	local_irq_enable();
+	return score;
+}
+
+void __devinit
+smp_generic_give_timebase( void )
+{
+	int i, score, score2, old, min=0, max=5000, offset=1000;
+
+	printk("Synchronizing timebase\n");
+
+	/* if this fails then this kernel won't work anyway... */
+	tbsync = kmalloc( sizeof(*tbsync), GFP_KERNEL );
+	memset( tbsync, 0, sizeof(*tbsync) );
+	mb();
+	running = 1;
+
+	while( !tbsync->ack )
+		;
+
+	printk("Got ack\n");
+
+	/* binary search */
+	for( old=-1 ; old != offset ; offset=(min+max)/2 ) {
+		score = start_contest( kSetAndTest, offset, NUM_ITER );
+
+		printk("score %d, offset %d\n", score, offset );
+
+		if( score > 0 )
+			max = offset;
+		else
+			min = offset;
+		old = offset;
+	}
+	score = start_contest( kSetAndTest, min, NUM_ITER );
+	score2 = start_contest( kSetAndTest, max, NUM_ITER );
+
+	printk( "Min %d (score %d), Max %d (score %d)\n", min, score, max, score2 );
+	score = abs( score );
+	score2 = abs( score2 );
+	offset = (score < score2) ? min : max;
+
+	/* guard against inaccurate mttb */
+	for( i=0; i<10; i++ ) {
+		start_contest( kSetAndTest, offset, NUM_ITER/10 );
+
+		if( (score2=start_contest(kTest, offset, NUM_ITER)) < 0 )
+			score2 = -score2;
+		if( score2 <= score || score2 < 20 )
+			break;
+	}
+	printk("Final offset: %d (%d/%d)\n", offset, score2, NUM_ITER );
+
+	/* exiting */
+	tbsync->cmd = kExit;
+	wmb();
+	tbsync->handshake = 1;
+	while( tbsync->ack )
+		;
+	tbsync->handshake = 0;
+	kfree( tbsync );
+	tbsync = NULL;
+	running = 0;
+}
diff --git a/arch/ppc64/kernel/smp.c b/arch/ppc64/kernel/smp.c
new file mode 100644
index 00000000000..1c92da3e452
--- /dev/null
+++ b/arch/ppc64/kernel/smp.c
@@ -0,0 +1,622 @@
+/*
+ * SMP support for ppc.
+ *
+ * Written by Cort Dougan (cort@cs.nmt.edu) borrowing a great
+ * deal of code from the sparc and intel versions.
+ *
+ * Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu>
+ *
+ * PowerPC-64 Support added by Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+#include <asm/ptrace.h>
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/time.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/system.h>
+#include <asm/abs_addr.h>
+
+#include "mpic.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+cpumask_t cpu_possible_map = CPU_MASK_NONE;
+cpumask_t cpu_online_map = CPU_MASK_NONE;
+cpumask_t cpu_sibling_map[NR_CPUS] = { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
+
+EXPORT_SYMBOL(cpu_online_map);
+EXPORT_SYMBOL(cpu_possible_map);
+
+struct smp_ops_t *smp_ops;
+
+static volatile unsigned int cpu_callin_map[NR_CPUS];
+
+extern unsigned char stab_array[];
+
+void smp_call_function_interrupt(void);
+
+int smt_enabled_at_boot = 1;
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+void smp_mpic_message_pass(int target, int msg)
+{
+	/* make sure we're sending something that translates to an IPI */
+	if ( msg > 0x3 ){
+		printk("SMP %d: smp_message_pass: unknown msg %d\n",
+		       smp_processor_id(), msg);
+		return;
+	}
+	switch ( target )
+	{
+	case MSG_ALL:
+		mpic_send_ipi(msg, 0xffffffff);
+		break;
+	case MSG_ALL_BUT_SELF:
+		mpic_send_ipi(msg, 0xffffffff & ~(1 << smp_processor_id()));
+		break;
+	default:
+		mpic_send_ipi(msg, 1 << target);
+		break;
+	}
+}
+
+int __init smp_mpic_probe(void)
+{
+	int nr_cpus;
+
+	DBG("smp_mpic_probe()...\n");
+
+	nr_cpus = cpus_weight(cpu_possible_map);
+
+	DBG("nr_cpus: %d\n", nr_cpus);
+
+	if (nr_cpus > 1)
+		mpic_request_ipis();
+
+	return nr_cpus;
+}
+
+void __devinit smp_mpic_setup_cpu(int cpu)
+{
+	mpic_setup_this_cpu();
+}
+
+void __devinit smp_generic_kick_cpu(int nr)
+{
+	BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+	/*
+	 * The processor is currently spinning, waiting for the
+	 * cpu_start field to become non-zero After we set cpu_start,
+	 * the processor will continue on to secondary_start
+	 */
+	paca[nr].cpu_start = 1;
+	mb();
+}
+
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+static void __init smp_space_timers(unsigned int max_cpus)
+{
+	int i;
+	unsigned long offset = tb_ticks_per_jiffy / max_cpus;
+	unsigned long previous_tb = paca[boot_cpuid].next_jiffy_update_tb;
+
+	for_each_cpu(i) {
+		if (i != boot_cpuid) {
+			paca[i].next_jiffy_update_tb =
+				previous_tb + offset;
+			previous_tb = paca[i].next_jiffy_update_tb;
+		}
+	}
+}
+
+void smp_message_recv(int msg, struct pt_regs *regs)
+{
+	switch(msg) {
+	case PPC_MSG_CALL_FUNCTION:
+		smp_call_function_interrupt();
+		break;
+	case PPC_MSG_RESCHEDULE: 
+		/* XXX Do we have to do this? */
+		set_need_resched();
+		break;
+#if 0
+	case PPC_MSG_MIGRATE_TASK:
+		/* spare */
+		break;
+#endif
+#ifdef CONFIG_DEBUGGER
+	case PPC_MSG_DEBUGGER_BREAK:
+		debugger_ipi(regs);
+		break;
+#endif
+	default:
+		printk("SMP %d: smp_message_recv(): unknown msg %d\n",
+		       smp_processor_id(), msg);
+		break;
+	}
+}
+
+void smp_send_reschedule(int cpu)
+{
+	smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
+}
+
+#ifdef CONFIG_DEBUGGER
+void smp_send_debugger_break(int cpu)
+{
+	smp_ops->message_pass(cpu, PPC_MSG_DEBUGGER_BREAK);
+}
+#endif
+
+static void stop_this_cpu(void *dummy)
+{
+	local_irq_disable();
+	while (1)
+		;
+}
+
+void smp_send_stop(void)
+{
+	smp_call_function(stop_this_cpu, NULL, 1, 0);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ * Stolen from the i386 version.
+ */
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
+
+static struct call_data_struct {
+	void (*func) (void *info);
+	void *info;
+	atomic_t started;
+	atomic_t finished;
+	int wait;
+} *call_data;
+
+/* delay of at least 8 seconds on 1GHz cpu */
+#define SMP_CALL_TIMEOUT (1UL << (30 + 3))
+
+/*
+ * This function sends a 'generic call function' IPI to all other CPUs
+ * in the system.
+ *
+ * [SUMMARY] Run a function on all other CPUs.
+ * <func> The function to run. This must be fast and non-blocking.
+ * <info> An arbitrary pointer to pass to the function.
+ * <nonatomic> currently unused.
+ * <wait> If true, wait (atomically) until function has completed on other CPUs.
+ * [RETURNS] 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute <<func>> or are or have executed.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+		       int wait)
+{ 
+	struct call_data_struct data;
+	int ret = -1, cpus;
+	unsigned long timeout;
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	spin_lock(&call_lock);
+	/* Must grab online cpu count with preempt disabled, otherwise
+	 * it can change. */
+	cpus = num_online_cpus() - 1;
+	if (!cpus) {
+		ret = 0;
+		goto out;
+	}
+
+	call_data = &data;
+	wmb();
+	/* Send a message to all other CPUs and wait for them to respond */
+	smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_CALL_FUNCTION);
+
+	/* Wait for response */
+	timeout = SMP_CALL_TIMEOUT;
+	while (atomic_read(&data.started) != cpus) {
+		HMT_low();
+		if (--timeout == 0) {
+			printk("smp_call_function on cpu %d: other cpus not "
+			       "responding (%d)\n", smp_processor_id(),
+			       atomic_read(&data.started));
+			debugger(NULL);
+			goto out;
+		}
+	}
+
+	if (wait) {
+		timeout = SMP_CALL_TIMEOUT;
+		while (atomic_read(&data.finished) != cpus) {
+			HMT_low();
+			if (--timeout == 0) {
+				printk("smp_call_function on cpu %d: other "
+				       "cpus not finishing (%d/%d)\n",
+				       smp_processor_id(),
+				       atomic_read(&data.finished),
+				       atomic_read(&data.started));
+				debugger(NULL);
+				goto out;
+			}
+		}
+	}
+
+	ret = 0;
+
+out:
+	call_data = NULL;
+	HMT_medium();
+	spin_unlock(&call_lock);
+	return ret;
+}
+
+EXPORT_SYMBOL(smp_call_function);
+
+void smp_call_function_interrupt(void)
+{
+	void (*func) (void *info);
+	void *info;
+	int wait;
+
+	/* call_data will be NULL if the sender timed out while
+	 * waiting on us to receive the call.
+	 */
+	if (!call_data)
+		return;
+
+	func = call_data->func;
+	info = call_data->info;
+	wait = call_data->wait;
+
+	if (!wait)
+		smp_mb__before_atomic_inc();
+
+	/*
+	 * Notify initiating CPU that I've grabbed the data and am
+	 * about to execute the function
+	 */
+	atomic_inc(&call_data->started);
+	/*
+	 * At this point the info structure may be out of scope unless wait==1
+	 */
+	(*func)(info);
+	if (wait) {
+		smp_mb__before_atomic_inc();
+		atomic_inc(&call_data->finished);
+	}
+}
+
+extern unsigned long decr_overclock;
+extern struct gettimeofday_struct do_gtod;
+
+struct thread_info *current_set[NR_CPUS];
+
+DECLARE_PER_CPU(unsigned int, pvr);
+
+static void __devinit smp_store_cpu_info(int id)
+{
+	per_cpu(pvr, id) = mfspr(SPRN_PVR);
+}
+
+static void __init smp_create_idle(unsigned int cpu)
+{
+	struct task_struct *p;
+
+	/* create a process for the processor */
+	p = fork_idle(cpu);
+	if (IS_ERR(p))
+		panic("failed fork for CPU %u: %li", cpu, PTR_ERR(p));
+	paca[cpu].__current = p;
+	current_set[cpu] = p->thread_info;
+}
+
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+	unsigned int cpu;
+
+	DBG("smp_prepare_cpus\n");
+
+	/* 
+	 * setup_cpu may need to be called on the boot cpu. We havent
+	 * spun any cpus up but lets be paranoid.
+	 */
+	BUG_ON(boot_cpuid != smp_processor_id());
+
+	/* Fixup boot cpu */
+	smp_store_cpu_info(boot_cpuid);
+	cpu_callin_map[boot_cpuid] = 1;
+
+#ifndef CONFIG_PPC_ISERIES
+	paca[boot_cpuid].next_jiffy_update_tb = tb_last_stamp = get_tb();
+
+	/*
+	 * Should update do_gtod.stamp_xsec.
+	 * For now we leave it which means the time can be some
+	 * number of msecs off until someone does a settimeofday()
+	 */
+	do_gtod.varp->tb_orig_stamp = tb_last_stamp;
+	systemcfg->tb_orig_stamp = tb_last_stamp;
+#endif
+
+	max_cpus = smp_ops->probe();
+ 
+	smp_space_timers(max_cpus);
+
+	for_each_cpu(cpu)
+		if (cpu != boot_cpuid)
+			smp_create_idle(cpu);
+}
+
+void __devinit smp_prepare_boot_cpu(void)
+{
+	BUG_ON(smp_processor_id() != boot_cpuid);
+
+	cpu_set(boot_cpuid, cpu_online_map);
+
+	paca[boot_cpuid].__current = current;
+	current_set[boot_cpuid] = current->thread_info;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* State of each CPU during hotplug phases */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+int generic_cpu_disable(void)
+{
+	unsigned int cpu = smp_processor_id();
+
+	if (cpu == boot_cpuid)
+		return -EBUSY;
+
+	systemcfg->processorCount--;
+	cpu_clear(cpu, cpu_online_map);
+	fixup_irqs(cpu_online_map);
+	return 0;
+}
+
+int generic_cpu_enable(unsigned int cpu)
+{
+	/* Do the normal bootup if we haven't
+	 * already bootstrapped. */
+	if (system_state != SYSTEM_RUNNING)
+		return -ENOSYS;
+
+	/* get the target out of it's holding state */
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	wmb();
+
+	while (!cpu_online(cpu))
+		cpu_relax();
+
+	fixup_irqs(cpu_online_map);
+	/* counter the irq disable in fixup_irqs */
+	local_irq_enable();
+	return 0;
+}
+
+void generic_cpu_die(unsigned int cpu)
+{
+	int i;
+
+	for (i = 0; i < 100; i++) {
+		rmb();
+		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+			return;
+		msleep(100);
+	}
+	printk(KERN_ERR "CPU%d didn't die...\n", cpu);
+}
+
+void generic_mach_cpu_die(void)
+{
+	unsigned int cpu;
+
+	local_irq_disable();
+	cpu = smp_processor_id();
+	printk(KERN_DEBUG "CPU%d offline\n", cpu);
+	__get_cpu_var(cpu_state) = CPU_DEAD;
+	wmb();
+	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+		cpu_relax();
+
+	flush_tlb_pending();
+	cpu_set(cpu, cpu_online_map);
+	local_irq_enable();
+}
+#endif
+
+static int __devinit cpu_enable(unsigned int cpu)
+{
+	if (smp_ops->cpu_enable)
+		return smp_ops->cpu_enable(cpu);
+
+	return -ENOSYS;
+}
+
+int __devinit __cpu_up(unsigned int cpu)
+{
+	int c;
+
+	if (!cpu_enable(cpu))
+		return 0;
+
+	if (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu))
+		return -EINVAL;
+
+	paca[cpu].default_decr = tb_ticks_per_jiffy / decr_overclock;
+
+	if (!cpu_has_feature(CPU_FTR_SLB)) {
+		void *tmp;
+
+		/* maximum of 48 CPUs on machines with a segment table */
+		if (cpu >= 48)
+			BUG();
+
+		tmp = &stab_array[PAGE_SIZE * cpu];
+		memset(tmp, 0, PAGE_SIZE); 
+		paca[cpu].stab_addr = (unsigned long)tmp;
+		paca[cpu].stab_real = virt_to_abs(tmp);
+	}
+
+	/* Make sure callin-map entry is 0 (can be leftover a CPU
+	 * hotplug
+	 */
+	cpu_callin_map[cpu] = 0;
+
+	/* The information for processor bringup must
+	 * be written out to main store before we release
+	 * the processor.
+	 */
+	mb();
+
+	/* wake up cpus */
+	DBG("smp: kicking cpu %d\n", cpu);
+	smp_ops->kick_cpu(cpu);
+
+	/*
+	 * wait to see if the cpu made a callin (is actually up).
+	 * use this value that I found through experimentation.
+	 * -- Cort
+	 */
+	if (system_state < SYSTEM_RUNNING)
+		for (c = 5000; c && !cpu_callin_map[cpu]; c--)
+			udelay(100);
+#ifdef CONFIG_HOTPLUG_CPU
+	else
+		/*
+		 * CPUs can take much longer to come up in the
+		 * hotplug case.  Wait five seconds.
+		 */
+		for (c = 25; c && !cpu_callin_map[cpu]; c--) {
+			msleep(200);
+		}
+#endif
+
+	if (!cpu_callin_map[cpu]) {
+		printk("Processor %u is stuck.\n", cpu);
+		return -ENOENT;
+	}
+
+	printk("Processor %u found.\n", cpu);
+
+	if (smp_ops->give_timebase)
+		smp_ops->give_timebase();
+
+	/* Wait until cpu puts itself in the online map */
+	while (!cpu_online(cpu))
+		cpu_relax();
+
+	return 0;
+}
+
+
+/* Activate a secondary processor. */
+int __devinit start_secondary(void *unused)
+{
+	unsigned int cpu = smp_processor_id();
+
+	atomic_inc(&init_mm.mm_count);
+	current->active_mm = &init_mm;
+
+	smp_store_cpu_info(cpu);
+	set_dec(paca[cpu].default_decr);
+	cpu_callin_map[cpu] = 1;
+
+	smp_ops->setup_cpu(cpu);
+	if (smp_ops->take_timebase)
+		smp_ops->take_timebase();
+
+	spin_lock(&call_lock);
+	cpu_set(cpu, cpu_online_map);
+	spin_unlock(&call_lock);
+
+	local_irq_enable();
+
+	cpu_idle();
+	return 0;
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+	return 0;
+}
+
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+	cpumask_t old_mask;
+
+	/* We want the setup_cpu() here to be called from CPU 0, but our
+	 * init thread may have been "borrowed" by another CPU in the meantime
+	 * se we pin us down to CPU 0 for a short while
+	 */
+	old_mask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(boot_cpuid));
+	
+	smp_ops->setup_cpu(boot_cpuid);
+
+	set_cpus_allowed(current, old_mask);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int __cpu_disable(void)
+{
+	if (smp_ops->cpu_disable)
+		return smp_ops->cpu_disable();
+
+	return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+	if (smp_ops->cpu_die)
+		smp_ops->cpu_die(cpu);
+}
+#endif
diff --git a/arch/ppc64/kernel/sys_ppc32.c b/arch/ppc64/kernel/sys_ppc32.c
new file mode 100644
index 00000000000..7cf7a960002
--- /dev/null
+++ b/arch/ppc64/kernel/sys_ppc32.c
@@ -0,0 +1,1329 @@
+/*
+ * sys_ppc32.c: Conversion between 32bit and 64bit native syscalls.
+ *
+ * Copyright (C) 2001 IBM
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h> 
+#include <linux/mm.h> 
+#include <linux/file.h> 
+#include <linux/signal.h>
+#include <linux/resource.h>
+#include <linux/times.h>
+#include <linux/utsname.h>
+#include <linux/timex.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/aio.h>
+#include <linux/nfs_fs.h>
+#include <linux/module.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfsd/nfsd.h>
+#include <linux/nfsd/cache.h>
+#include <linux/nfsd/xdr.h>
+#include <linux/nfsd/syscall.h>
+#include <linux/poll.h>
+#include <linux/personality.h>
+#include <linux/stat.h>
+#include <linux/filter.h>
+#include <linux/highmem.h>
+#include <linux/highuid.h>
+#include <linux/mman.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/icmpv6.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/sysctl.h>
+#include <linux/binfmts.h>
+#include <linux/dnotify.h>
+#include <linux/security.h>
+#include <linux/compat.h>
+#include <linux/ptrace.h>
+#include <linux/aio_abi.h>
+#include <linux/elf.h>
+
+#include <net/scm.h>
+#include <net/sock.h>
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <asm/ipc.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+#include <asm/ppcdebug.h>
+#include <asm/time.h>
+#include <asm/mmu_context.h>
+#include <asm/systemcfg.h>
+
+#include "pci.h"
+
+/* readdir & getdents */
+#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
+#define ROUND_UP(x) (((x)+sizeof(u32)-1) & ~(sizeof(u32)-1))
+
+struct old_linux_dirent32 {
+	u32		d_ino;
+	u32		d_offset;
+	unsigned short	d_namlen;
+	char		d_name[1];
+};
+
+struct readdir_callback32 {
+	struct old_linux_dirent32 __user * dirent;
+	int count;
+};
+
+static int fillonedir(void * __buf, const char * name, int namlen,
+		                  off_t offset, ino_t ino, unsigned int d_type)
+{
+	struct readdir_callback32 * buf = (struct readdir_callback32 *) __buf;
+	struct old_linux_dirent32 __user * dirent;
+
+	if (buf->count)
+		return -EINVAL;
+	buf->count++;
+	dirent = buf->dirent;
+	put_user(ino, &dirent->d_ino);
+	put_user(offset, &dirent->d_offset);
+	put_user(namlen, &dirent->d_namlen);
+	copy_to_user(dirent->d_name, name, namlen);
+	put_user(0, dirent->d_name + namlen);
+	return 0;
+}
+
+asmlinkage int old32_readdir(unsigned int fd, struct old_linux_dirent32 __user *dirent, unsigned int count)
+{
+	int error = -EBADF;
+	struct file * file;
+	struct readdir_callback32 buf;
+
+	file = fget(fd);
+	if (!file)
+		goto out;
+
+	buf.count = 0;
+	buf.dirent = dirent;
+
+	error = vfs_readdir(file, (filldir_t)fillonedir, &buf);
+	if (error < 0)
+		goto out_putf;
+	error = buf.count;
+
+out_putf:
+	fput(file);
+out:
+	return error;
+}
+
+struct linux_dirent32 {
+	u32		d_ino;
+	u32		d_off;
+	unsigned short	d_reclen;
+	char		d_name[1];
+};
+
+struct getdents_callback32 {
+	struct linux_dirent32 __user * current_dir;
+	struct linux_dirent32 __user * previous;
+	int count;
+	int error;
+};
+
+static int filldir(void * __buf, const char * name, int namlen, off_t offset,
+		   ino_t ino, unsigned int d_type)
+{
+	struct linux_dirent32 __user * dirent;
+	struct getdents_callback32 * buf = (struct getdents_callback32 *) __buf;
+	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2);
+
+	buf->error = -EINVAL;	/* only used if we fail.. */
+	if (reclen > buf->count)
+		return -EINVAL;
+	dirent = buf->previous;
+	if (dirent) {
+		if (__put_user(offset, &dirent->d_off))
+			goto efault;
+	}
+	dirent = buf->current_dir;
+	if (__put_user(ino, &dirent->d_ino))
+		goto efault;
+	if (__put_user(reclen, &dirent->d_reclen))
+		goto efault;
+	if (copy_to_user(dirent->d_name, name, namlen))
+		goto efault;
+	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	if (__put_user(d_type, (char __user *) dirent + reclen - 1))
+		goto efault;
+	buf->previous = dirent;
+	dirent = (void __user *)dirent + reclen;
+	buf->current_dir = dirent;
+	buf->count -= reclen;
+	return 0;
+efault:
+	buf->error = -EFAULT;
+	return -EFAULT;
+}
+
+asmlinkage long sys32_getdents(unsigned int fd, struct linux_dirent32 __user *dirent,
+		    unsigned int count)
+{
+	struct file * file;
+	struct linux_dirent32 __user * lastdirent;
+	struct getdents_callback32 buf;
+	int error;
+
+	error = -EFAULT;
+	if (!access_ok(VERIFY_WRITE, dirent, count))
+		goto out;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+
+	buf.current_dir = dirent;
+	buf.previous = NULL;
+	buf.count = count;
+	buf.error = 0;
+
+	error = vfs_readdir(file, (filldir_t)filldir, &buf);
+	if (error < 0)
+		goto out_putf;
+	error = buf.error;
+	lastdirent = buf.previous;
+	if (lastdirent) {
+		if (put_user(file->f_pos, &lastdirent->d_off))
+			error = -EFAULT;
+		else
+			error = count - buf.count;
+	}
+
+out_putf:
+	fput(file);
+out:
+	return error;
+}
+
+asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp,
+		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+		compat_uptr_t tvp_x)
+{
+	/* sign extend n */
+	return compat_sys_select((int)n, inp, outp, exp, compat_ptr(tvp_x));
+}
+
+int cp_compat_stat(struct kstat *stat, struct compat_stat __user *statbuf)
+{
+	long err;
+
+	if (stat->size > MAX_NON_LFS || !new_valid_dev(stat->dev) ||
+	    !new_valid_dev(stat->rdev))
+		return -EOVERFLOW;
+
+	err  = access_ok(VERIFY_WRITE, statbuf, sizeof(*statbuf)) ? 0 : -EFAULT;
+	err |= __put_user(new_encode_dev(stat->dev), &statbuf->st_dev);
+	err |= __put_user(stat->ino, &statbuf->st_ino);
+	err |= __put_user(stat->mode, &statbuf->st_mode);
+	err |= __put_user(stat->nlink, &statbuf->st_nlink);
+	err |= __put_user(stat->uid, &statbuf->st_uid);
+	err |= __put_user(stat->gid, &statbuf->st_gid);
+	err |= __put_user(new_encode_dev(stat->rdev), &statbuf->st_rdev);
+	err |= __put_user(stat->size, &statbuf->st_size);
+	err |= __put_user(stat->atime.tv_sec, &statbuf->st_atime);
+	err |= __put_user(stat->atime.tv_nsec, &statbuf->st_atime_nsec);
+	err |= __put_user(stat->mtime.tv_sec, &statbuf->st_mtime);
+	err |= __put_user(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec);
+	err |= __put_user(stat->ctime.tv_sec, &statbuf->st_ctime);
+	err |= __put_user(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec);
+	err |= __put_user(stat->blksize, &statbuf->st_blksize);
+	err |= __put_user(stat->blocks, &statbuf->st_blocks);
+	err |= __put_user(0, &statbuf->__unused4[0]);
+	err |= __put_user(0, &statbuf->__unused4[1]);
+
+	return err;
+}
+
+/* Note: it is necessary to treat option as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sysfs(u32 option, u32 arg1, u32 arg2)
+{
+	return sys_sysfs((int)option, arg1, arg2);
+}
+
+/* Handle adjtimex compatibility. */
+struct timex32 {
+	u32 modes;
+	s32 offset, freq, maxerror, esterror;
+	s32 status, constant, precision, tolerance;
+	struct compat_timeval time;
+	s32 tick;
+	s32 ppsfreq, jitter, shift, stabil;
+	s32 jitcnt, calcnt, errcnt, stbcnt;
+	s32  :32; s32  :32; s32  :32; s32  :32;
+	s32  :32; s32  :32; s32  :32; s32  :32;
+	s32  :32; s32  :32; s32  :32; s32  :32;
+};
+
+extern int do_adjtimex(struct timex *);
+extern void ppc_adjtimex(void);
+
+asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
+{
+	struct timex txc;
+	int ret;
+	
+	memset(&txc, 0, sizeof(struct timex));
+
+	if(get_user(txc.modes, &utp->modes) ||
+	   __get_user(txc.offset, &utp->offset) ||
+	   __get_user(txc.freq, &utp->freq) ||
+	   __get_user(txc.maxerror, &utp->maxerror) ||
+	   __get_user(txc.esterror, &utp->esterror) ||
+	   __get_user(txc.status, &utp->status) ||
+	   __get_user(txc.constant, &utp->constant) ||
+	   __get_user(txc.precision, &utp->precision) ||
+	   __get_user(txc.tolerance, &utp->tolerance) ||
+	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+	   __get_user(txc.tick, &utp->tick) ||
+	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
+	   __get_user(txc.jitter, &utp->jitter) ||
+	   __get_user(txc.shift, &utp->shift) ||
+	   __get_user(txc.stabil, &utp->stabil) ||
+	   __get_user(txc.jitcnt, &utp->jitcnt) ||
+	   __get_user(txc.calcnt, &utp->calcnt) ||
+	   __get_user(txc.errcnt, &utp->errcnt) ||
+	   __get_user(txc.stbcnt, &utp->stbcnt))
+		return -EFAULT;
+
+	ret = do_adjtimex(&txc);
+
+	/* adjust the conversion of TB to time of day to track adjtimex */
+	ppc_adjtimex();
+
+	if(put_user(txc.modes, &utp->modes) ||
+	   __put_user(txc.offset, &utp->offset) ||
+	   __put_user(txc.freq, &utp->freq) ||
+	   __put_user(txc.maxerror, &utp->maxerror) ||
+	   __put_user(txc.esterror, &utp->esterror) ||
+	   __put_user(txc.status, &utp->status) ||
+	   __put_user(txc.constant, &utp->constant) ||
+	   __put_user(txc.precision, &utp->precision) ||
+	   __put_user(txc.tolerance, &utp->tolerance) ||
+	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+	   __put_user(txc.tick, &utp->tick) ||
+	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
+	   __put_user(txc.jitter, &utp->jitter) ||
+	   __put_user(txc.shift, &utp->shift) ||
+	   __put_user(txc.stabil, &utp->stabil) ||
+	   __put_user(txc.jitcnt, &utp->jitcnt) ||
+	   __put_user(txc.calcnt, &utp->calcnt) ||
+	   __put_user(txc.errcnt, &utp->errcnt) ||
+	   __put_user(txc.stbcnt, &utp->stbcnt))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+
+/* These are here just in case some old sparc32 binary calls it. */
+asmlinkage long sys32_pause(void)
+{
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	
+	return -ERESTARTNOHAND;
+}
+
+
+
+static inline long get_ts32(struct timespec *o, struct compat_timeval __user *i)
+{
+	long usec;
+
+	if (!access_ok(VERIFY_READ, i, sizeof(*i)))
+		return -EFAULT;
+	if (__get_user(o->tv_sec, &i->tv_sec))
+		return -EFAULT;
+	if (__get_user(usec, &i->tv_usec))
+		return -EFAULT;
+	o->tv_nsec = usec * 1000;
+	return 0;
+}
+
+static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
+{
+	return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
+		(__put_user(i->tv_sec, &o->tv_sec) |
+		 __put_user(i->tv_usec, &o->tv_usec)));
+}
+
+struct sysinfo32 {
+        s32 uptime;
+        u32 loads[3];
+        u32 totalram;
+        u32 freeram;
+        u32 sharedram;
+        u32 bufferram;
+        u32 totalswap;
+        u32 freeswap;
+        unsigned short procs;
+	unsigned short pad;
+	u32 totalhigh;
+	u32 freehigh;
+	u32 mem_unit;
+	char _f[20-2*sizeof(int)-sizeof(int)];
+};
+
+asmlinkage long sys32_sysinfo(struct sysinfo32 __user *info)
+{
+	struct sysinfo s;
+	int ret, err;
+	int bitcount=0;
+	mm_segment_t old_fs = get_fs ();
+	
+	/* The __user cast is valid due to set_fs() */
+	set_fs (KERNEL_DS);
+	ret = sys_sysinfo((struct sysinfo __user *)&s);
+	set_fs (old_fs);
+
+	/* Check to see if any memory value is too large for 32-bit and
+         * scale down if needed.
+         */
+	if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+	    while (s.mem_unit < PAGE_SIZE) {
+		s.mem_unit <<= 1;
+		bitcount++;
+	    }
+	    s.totalram >>=bitcount;
+	    s.freeram >>= bitcount;
+	    s.sharedram >>= bitcount;
+	    s.bufferram >>= bitcount;
+	    s.totalswap >>= bitcount;
+	    s.freeswap >>= bitcount;
+	    s.totalhigh >>= bitcount;
+	    s.freehigh >>= bitcount;
+	}
+
+	err = put_user (s.uptime, &info->uptime);
+	err |= __put_user (s.loads[0], &info->loads[0]);
+	err |= __put_user (s.loads[1], &info->loads[1]);
+	err |= __put_user (s.loads[2], &info->loads[2]);
+	err |= __put_user (s.totalram, &info->totalram);
+	err |= __put_user (s.freeram, &info->freeram);
+	err |= __put_user (s.sharedram, &info->sharedram);
+	err |= __put_user (s.bufferram, &info->bufferram);
+	err |= __put_user (s.totalswap, &info->totalswap);
+	err |= __put_user (s.freeswap, &info->freeswap);
+	err |= __put_user (s.procs, &info->procs);
+	err |= __put_user (s.totalhigh, &info->totalhigh);
+	err |= __put_user (s.freehigh, &info->freehigh);
+	err |= __put_user (s.mem_unit, &info->mem_unit);
+	if (err)
+		return -EFAULT;
+	
+	return ret;
+}
+
+
+
+
+/* Translations due to time_t size differences.  Which affects all
+   sorts of things, like timeval and itimerval.  */
+extern struct timezone sys_tz;
+
+asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+{
+	if (tv) {
+		struct timeval ktv;
+		do_gettimeofday(&ktv);
+		if (put_tv32(tv, &ktv))
+			return -EFAULT;
+	}
+	if (tz) {
+		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+			return -EFAULT;
+	}
+	
+	return 0;
+}
+
+
+
+asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+{
+	struct timespec kts;
+	struct timezone ktz;
+	
+ 	if (tv) {
+		if (get_ts32(&kts, tv))
+			return -EFAULT;
+	}
+	if (tz) {
+		if (copy_from_user(&ktz, tz, sizeof(ktz)))
+			return -EFAULT;
+	}
+
+	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+}
+
+#ifdef CONFIG_SYSVIPC
+long sys32_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr,
+	       u32 fifth)
+{
+	int version;
+
+	version = call >> 16; /* hack for backward compatibility */
+	call &= 0xffff;
+
+	switch (call) {
+
+	case SEMTIMEDOP:
+		if (fifth)
+			/* sign extend semid */
+			return compat_sys_semtimedop((int)first,
+						     compat_ptr(ptr), second,
+						     compat_ptr(fifth));
+		/* else fall through for normal semop() */
+	case SEMOP:
+		/* struct sembuf is the same on 32 and 64bit :)) */
+		/* sign extend semid */
+		return sys_semtimedop((int)first, compat_ptr(ptr), second,
+				      NULL);
+	case SEMGET:
+		/* sign extend key, nsems */
+		return sys_semget((int)first, (int)second, third);
+	case SEMCTL:
+		/* sign extend semid, semnum */
+		return compat_sys_semctl((int)first, (int)second, third,
+					 compat_ptr(ptr));
+
+	case MSGSND:
+		/* sign extend msqid */
+		return compat_sys_msgsnd((int)first, (int)second, third,
+					 compat_ptr(ptr));
+	case MSGRCV:
+		/* sign extend msqid, msgtyp */
+		return compat_sys_msgrcv((int)first, second, (int)fifth,
+					 third, version, compat_ptr(ptr));
+	case MSGGET:
+		/* sign extend key */
+		return sys_msgget((int)first, second);
+	case MSGCTL:
+		/* sign extend msqid */
+		return compat_sys_msgctl((int)first, second, compat_ptr(ptr));
+
+	case SHMAT:
+		/* sign extend shmid */
+		return compat_sys_shmat((int)first, second, third, version,
+					compat_ptr(ptr));
+	case SHMDT:
+		return sys_shmdt(compat_ptr(ptr));
+	case SHMGET:
+		/* sign extend key_t */
+		return sys_shmget((int)first, second, third);
+	case SHMCTL:
+		/* sign extend shmid */
+		return compat_sys_shmctl((int)first, second, compat_ptr(ptr));
+
+	default:
+		return -ENOSYS;
+	}
+
+	return -ENOSYS;
+}
+#endif
+
+/* Note: it is necessary to treat out_fd and in_fd as unsigned ints, 
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sendfile(u32 out_fd, u32 in_fd, compat_off_t __user * offset, u32 count)
+{
+	mm_segment_t old_fs = get_fs();
+	int ret;
+	off_t of;
+	off_t __user *up;
+
+	if (offset && get_user(of, offset))
+		return -EFAULT;
+
+	/* The __user pointer cast is valid because of the set_fs() */		
+	set_fs(KERNEL_DS);
+	up = offset ? (off_t __user *) &of : NULL;
+	ret = sys_sendfile((int)out_fd, (int)in_fd, up, count);
+	set_fs(old_fs);
+	
+	if (offset && put_user(of, offset))
+		return -EFAULT;
+		
+	return ret;
+}
+
+asmlinkage int sys32_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, s32 count)
+{
+	mm_segment_t old_fs = get_fs();
+	int ret;
+	loff_t lof;
+	loff_t __user *up;
+	
+	if (offset && get_user(lof, offset))
+		return -EFAULT;
+		
+	/* The __user pointer cast is valid because of the set_fs() */		
+	set_fs(KERNEL_DS);
+	up = offset ? (loff_t __user *) &lof : NULL;
+	ret = sys_sendfile64(out_fd, in_fd, up, count);
+	set_fs(old_fs);
+	
+	if (offset && put_user(lof, offset))
+		return -EFAULT;
+		
+	return ret;
+}
+
+long sys32_execve(unsigned long a0, unsigned long a1, unsigned long a2,
+		  unsigned long a3, unsigned long a4, unsigned long a5,
+		  struct pt_regs *regs)
+{
+	int error;
+	char * filename;
+	
+	filename = getname((char __user *) a0);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename))
+		goto out;
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+
+	error = compat_do_execve(filename, compat_ptr(a1), compat_ptr(a2), regs);
+
+	if (error == 0) {
+		task_lock(current);
+		current->ptrace &= ~PT_DTRACE;
+		task_unlock(current);
+	}
+	putname(filename);
+
+out:
+	return error;
+}
+
+/* Set up a thread for executing a new program. */
+void start_thread32(struct pt_regs* regs, unsigned long nip, unsigned long sp)
+{
+	set_fs(USER_DS);
+
+	/*
+	 * If we exec out of a kernel thread then thread.regs will not be
+	 * set. Do it now.
+	 */
+	if (!current->thread.regs) {
+		unsigned long childregs = (unsigned long)current->thread_info +
+						THREAD_SIZE;
+		childregs -= sizeof(struct pt_regs);
+		current->thread.regs = (struct pt_regs *)childregs;
+	}
+
+	/*
+	 * ELF_PLAT_INIT already clears all registers but it also sets r2.
+	 * So just clear r2 here.
+	 */
+	regs->gpr[2] = 0;
+
+	regs->nip = nip;
+	regs->gpr[1] = sp;
+	regs->msr = MSR_USER32;
+#ifndef CONFIG_SMP
+	if (last_task_used_math == current)
+		last_task_used_math = 0;
+#endif /* CONFIG_SMP */
+	current->thread.fpscr = 0;
+	memset(current->thread.fpr, 0, sizeof(current->thread.fpr));
+#ifdef CONFIG_ALTIVEC
+#ifndef CONFIG_SMP
+	if (last_task_used_altivec == current)
+		last_task_used_altivec = 0;
+#endif /* CONFIG_SMP */
+	memset(current->thread.vr, 0, sizeof(current->thread.vr));
+	current->thread.vscr.u[0] = 0;
+	current->thread.vscr.u[1] = 0;
+	current->thread.vscr.u[2] = 0;
+	current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */
+	current->thread.vrsave = 0;
+	current->thread.used_vr = 0;
+#endif /* CONFIG_ALTIVEC */
+}
+
+/* Note: it is necessary to treat option as an unsigned int, 
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_prctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5)
+{
+	return sys_prctl((int)option,
+			 (unsigned long) arg2,
+			 (unsigned long) arg3,
+			 (unsigned long) arg4,
+			 (unsigned long) arg5);
+}
+
+/* Note: it is necessary to treat pid as an unsigned int, 
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sched_rr_get_interval(u32 pid, struct compat_timespec __user *interval)
+{
+	struct timespec t;
+	int ret;
+	mm_segment_t old_fs = get_fs ();
+
+	/* The __user pointer cast is valid because of the set_fs() */
+	set_fs (KERNEL_DS);
+	ret = sys_sched_rr_get_interval((int)pid, (struct timespec __user *) &t);
+	set_fs (old_fs);
+	if (put_compat_timespec(&t, interval))
+		return -EFAULT;
+	return ret;
+}
+
+asmlinkage int sys32_pciconfig_read(u32 bus, u32 dfn, u32 off, u32 len, u32 ubuf)
+{
+	return sys_pciconfig_read((unsigned long) bus,
+				  (unsigned long) dfn,
+				  (unsigned long) off,
+				  (unsigned long) len,
+				  compat_ptr(ubuf));
+}
+
+asmlinkage int sys32_pciconfig_write(u32 bus, u32 dfn, u32 off, u32 len, u32 ubuf)
+{
+	return sys_pciconfig_write((unsigned long) bus,
+				   (unsigned long) dfn,
+				   (unsigned long) off,
+				   (unsigned long) len,
+				   compat_ptr(ubuf));
+}
+
+#define IOBASE_BRIDGE_NUMBER	0
+#define IOBASE_MEMORY		1
+#define IOBASE_IO		2
+#define IOBASE_ISA_IO		3
+#define IOBASE_ISA_MEM		4
+
+asmlinkage int sys32_pciconfig_iobase(u32 which, u32 in_bus, u32 in_devfn)
+{
+	struct pci_controller* hose;
+	struct list_head *ln;
+	struct pci_bus *bus = NULL;
+	struct device_node *hose_node;
+
+	/* Argh ! Please forgive me for that hack, but that's the
+	 * simplest way to get existing XFree to not lockup on some
+	 * G5 machines... So when something asks for bus 0 io base
+	 * (bus 0 is HT root), we return the AGP one instead.
+	 */
+#ifdef CONFIG_PPC_PMAC
+	if (systemcfg->platform == PLATFORM_POWERMAC &&
+	    machine_is_compatible("MacRISC4"))
+		if (in_bus == 0)
+			in_bus = 0xf0;
+#endif /* CONFIG_PPC_PMAC */
+
+	/* That syscall isn't quite compatible with PCI domains, but it's
+	 * used on pre-domains setup. We return the first match
+	 */
+
+	for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) {
+		bus = pci_bus_b(ln);
+		if (in_bus >= bus->number && in_bus < (bus->number + bus->subordinate))
+			break;
+		bus = NULL;
+	}
+	if (bus == NULL || bus->sysdata == NULL)
+		return -ENODEV;
+
+	hose_node = (struct device_node *)bus->sysdata;
+	hose = hose_node->phb;
+
+	switch (which) {
+	case IOBASE_BRIDGE_NUMBER:
+		return (long)hose->first_busno;
+	case IOBASE_MEMORY:
+		return (long)hose->pci_mem_offset;
+	case IOBASE_IO:
+		return (long)hose->io_base_phys;
+	case IOBASE_ISA_IO:
+		return (long)isa_io_base;
+	case IOBASE_ISA_MEM:
+		return -EINVAL;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+
+asmlinkage int ppc64_newuname(struct new_utsname __user * name)
+{
+	int errno = sys_newuname(name);
+
+	if (current->personality == PER_LINUX32 && !errno) {
+		if(copy_to_user(name->machine, "ppc\0\0", 8)) {
+			errno = -EFAULT;
+		}
+	}
+	return errno;
+}
+
+asmlinkage int ppc64_personality(unsigned long personality)
+{
+	int ret;
+	if (current->personality == PER_LINUX32 && personality == PER_LINUX)
+		personality = PER_LINUX32;
+	ret = sys_personality(personality);
+	if (ret == PER_LINUX32)
+		ret = PER_LINUX;
+	return ret;
+}
+
+
+
+/* Note: it is necessary to treat mode as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_access(const char __user * filename, u32 mode)
+{
+	return sys_access(filename, (int)mode);
+}
+
+
+/* Note: it is necessary to treat mode as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_creat(const char __user * pathname, u32 mode)
+{
+	return sys_creat(pathname, (int)mode);
+}
+
+
+/* Note: it is necessary to treat pid and options as unsigned ints,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_waitpid(u32 pid, unsigned int __user * stat_addr, u32 options)
+{
+	return sys_waitpid((int)pid, stat_addr, (int)options);
+}
+
+
+/* Note: it is necessary to treat gidsetsize as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_getgroups(u32 gidsetsize, gid_t __user *grouplist)
+{
+	return sys_getgroups((int)gidsetsize, grouplist);
+}
+
+
+/* Note: it is necessary to treat pid as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_getpgid(u32 pid)
+{
+	return sys_getpgid((int)pid);
+}
+
+
+/* Note: it is necessary to treat which and who as unsigned ints,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_getpriority(u32 which, u32 who)
+{
+	return sys_getpriority((int)which, (int)who);
+}
+
+
+/* Note: it is necessary to treat pid as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_getsid(u32 pid)
+{
+	return sys_getsid((int)pid);
+}
+
+
+/* Note: it is necessary to treat pid and sig as unsigned ints,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_kill(u32 pid, u32 sig)
+{
+	return sys_kill((int)pid, (int)sig);
+}
+
+
+/* Note: it is necessary to treat mode as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_mkdir(const char __user * pathname, u32 mode)
+{
+	return sys_mkdir(pathname, (int)mode);
+}
+
+long sys32_nice(u32 increment)
+{
+	/* sign extend increment */
+	return sys_nice((int)increment);
+}
+
+off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin)
+{
+	/* sign extend n */
+	return sys_lseek(fd, (int)offset, origin);
+}
+
+/*
+ * This is just a version for 32-bit applications which does
+ * not force O_LARGEFILE on.
+ */
+asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
+{
+	char * tmp;
+	int fd, error;
+
+	tmp = getname(filename);
+	fd = PTR_ERR(tmp);
+	if (!IS_ERR(tmp)) {
+		fd = get_unused_fd();
+		if (fd >= 0) {
+			struct file * f = filp_open(tmp, flags, mode);
+			error = PTR_ERR(f);
+			if (IS_ERR(f))
+				goto out_error;
+			fd_install(fd, f);
+		}
+out:
+		putname(tmp);
+	}
+	return fd;
+
+out_error:
+	put_unused_fd(fd);
+	fd = error;
+	goto out;
+}
+
+/* Note: it is necessary to treat bufsiz as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_readlink(const char __user * path, char __user * buf, u32 bufsiz)
+{
+	return sys_readlink(path, buf, (int)bufsiz);
+}
+
+/* Note: it is necessary to treat option as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sched_get_priority_max(u32 policy)
+{
+	return sys_sched_get_priority_max((int)policy);
+}
+
+
+/* Note: it is necessary to treat policy as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sched_get_priority_min(u32 policy)
+{
+	return sys_sched_get_priority_min((int)policy);
+}
+
+
+/* Note: it is necessary to treat pid as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sched_getparam(u32 pid, struct sched_param __user *param)
+{
+	return sys_sched_getparam((int)pid, param);
+}
+
+
+/* Note: it is necessary to treat pid as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sched_getscheduler(u32 pid)
+{
+	return sys_sched_getscheduler((int)pid);
+}
+
+
+/* Note: it is necessary to treat pid as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sched_setparam(u32 pid, struct sched_param __user *param)
+{
+	return sys_sched_setparam((int)pid, param);
+}
+
+
+/* Note: it is necessary to treat pid and policy as unsigned ints,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_sched_setscheduler(u32 pid, u32 policy, struct sched_param __user *param)
+{
+	return sys_sched_setscheduler((int)pid, (int)policy, param);
+}
+
+
+/* Note: it is necessary to treat len as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_setdomainname(char __user *name, u32 len)
+{
+	return sys_setdomainname(name, (int)len);
+}
+
+
+/* Note: it is necessary to treat gidsetsize as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_setgroups(u32 gidsetsize, gid_t __user *grouplist)
+{
+	return sys_setgroups((int)gidsetsize, grouplist);
+}
+
+
+asmlinkage long sys32_sethostname(char __user *name, u32 len)
+{
+	/* sign extend len */
+	return sys_sethostname(name, (int)len);
+}
+
+
+/* Note: it is necessary to treat pid and pgid as unsigned ints,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_setpgid(u32 pid, u32 pgid)
+{
+	return sys_setpgid((int)pid, (int)pgid);
+}
+
+
+long sys32_setpriority(u32 which, u32 who, u32 niceval)
+{
+	/* sign extend which, who and niceval */
+	return sys_setpriority((int)which, (int)who, (int)niceval);
+}
+
+/* Note: it is necessary to treat newmask as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_ssetmask(u32 newmask)
+{
+	return sys_ssetmask((int) newmask);
+}
+
+asmlinkage long sys32_syslog(u32 type, char __user * buf, u32 len)
+{
+	/* sign extend len */
+	return sys_syslog(type, buf, (int)len);
+}
+
+
+/* Note: it is necessary to treat mask as an unsigned int,
+ * with the corresponding cast to a signed int to insure that the 
+ * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
+ * and the register representation of a signed int (msr in 64-bit mode) is performed.
+ */
+asmlinkage long sys32_umask(u32 mask)
+{
+	return sys_umask((int)mask);
+}
+
+#ifdef CONFIG_SYSCTL
+struct __sysctl_args32 {
+	u32 name;
+	int nlen;
+	u32 oldval;
+	u32 oldlenp;
+	u32 newval;
+	u32 newlen;
+	u32 __unused[4];
+};
+
+asmlinkage long sys32_sysctl(struct __sysctl_args32 __user *args)
+{
+	struct __sysctl_args32 tmp;
+	int error;
+	size_t oldlen;
+	size_t __user *oldlenp = NULL;
+	unsigned long addr = (((unsigned long)&args->__unused[0]) + 7) & ~7;
+
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+
+	if (tmp.oldval && tmp.oldlenp) {
+		/* Duh, this is ugly and might not work if sysctl_args
+		   is in read-only memory, but do_sysctl does indirectly
+		   a lot of uaccess in both directions and we'd have to
+		   basically copy the whole sysctl.c here, and
+		   glibc's __sysctl uses rw memory for the structure
+		   anyway.  */
+		oldlenp = (size_t __user *)addr;
+		if (get_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp)) ||
+		    put_user(oldlen, oldlenp))
+			return -EFAULT;
+	}
+
+	lock_kernel();
+	error = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
+			  compat_ptr(tmp.oldval), oldlenp,
+			  compat_ptr(tmp.newval), tmp.newlen);
+	unlock_kernel();
+	if (oldlenp) {
+		if (!error) {
+			if (get_user(oldlen, oldlenp) ||
+			    put_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp)))
+				error = -EFAULT;
+		}
+		copy_to_user(args->__unused, tmp.__unused, sizeof(tmp.__unused));
+	}
+	return error;
+}
+#endif
+
+asmlinkage int sys32_olduname(struct oldold_utsname __user * name)
+{
+	int error;
+	
+	if (!name)
+		return -EFAULT;
+	if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+		return -EFAULT;
+  
+	down_read(&uts_sem);
+	error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+	error -= __put_user(0,name->sysname+__OLD_UTS_LEN);
+	error -= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+	error -= __put_user(0,name->nodename+__OLD_UTS_LEN);
+	error -= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+	error -= __put_user(0,name->release+__OLD_UTS_LEN);
+	error -= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+	error -= __put_user(0,name->version+__OLD_UTS_LEN);
+	error -= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
+	error = __put_user(0,name->machine+__OLD_UTS_LEN);
+	up_read(&uts_sem);
+
+	error = error ? -EFAULT : 0;
+	
+	return error;
+}
+
+unsigned long sys32_mmap2(unsigned long addr, size_t len,
+			  unsigned long prot, unsigned long flags,
+			  unsigned long fd, unsigned long pgoff)
+{
+	/* This should remain 12 even if PAGE_SIZE changes */
+	return sys_mmap(addr, len, prot, flags, fd, pgoff << 12);
+}
+
+int get_compat_timeval(struct timeval *tv, struct compat_timeval __user *ctv)
+{
+	return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+		__get_user(tv->tv_sec, &ctv->tv_sec) ||
+		__get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+asmlinkage long sys32_utimes(char __user *filename, struct compat_timeval __user *tvs)
+{
+	struct timeval ktvs[2], *ptr;
+
+	ptr = NULL;
+	if (tvs) {
+		if (get_compat_timeval(&ktvs[0], &tvs[0]) ||
+		    get_compat_timeval(&ktvs[1], &tvs[1]))
+			return -EFAULT;
+		ptr = ktvs;
+	}
+
+	return do_utimes(filename, ptr);
+}
+
+long sys32_tgkill(u32 tgid, u32 pid, int sig)
+{
+	/* sign extend tgid, pid */
+	return sys_tgkill((int)tgid, (int)pid, sig);
+}
+
+/* 
+ * long long munging:
+ * The 32 bit ABI passes long longs in an odd even register pair.
+ */
+
+compat_ssize_t sys32_pread64(unsigned int fd, char __user *ubuf, compat_size_t count,
+			     u32 reg6, u32 poshi, u32 poslo)
+{
+	return sys_pread64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
+}
+
+compat_ssize_t sys32_pwrite64(unsigned int fd, char __user *ubuf, compat_size_t count,
+			      u32 reg6, u32 poshi, u32 poslo)
+{
+	return sys_pwrite64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
+}
+
+compat_ssize_t sys32_readahead(int fd, u32 r4, u32 offhi, u32 offlo, u32 count)
+{
+	return sys_readahead(fd, ((loff_t)offhi << 32) | offlo, count);
+}
+
+asmlinkage int sys32_truncate64(const char __user * path, u32 reg4,
+				unsigned long high, unsigned long low)
+{
+	return sys_truncate(path, (high << 32) | low);
+}
+
+asmlinkage int sys32_ftruncate64(unsigned int fd, u32 reg4, unsigned long high,
+				 unsigned long low)
+{
+	return sys_ftruncate(fd, (high << 32) | low);
+}
+
+long ppc32_lookup_dcookie(u32 cookie_high, u32 cookie_low, char __user *buf,
+			  size_t len)
+{
+	return sys_lookup_dcookie((u64)cookie_high << 32 | cookie_low,
+				  buf, len);
+}
+
+long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low,
+		     size_t len, int advice)
+{
+	return sys_fadvise64(fd, (u64)offset_high << 32 | offset_low, len,
+			     advice);
+}
+
+long ppc32_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
+			u32 len_high, u32 len_low)
+{
+	return sys_fadvise64(fd, (u64)offset_high << 32 | offset_low,
+			     (u64)len_high << 32 | len_low, advice);
+}
+
+extern asmlinkage long sys_timer_create(clockid_t, sigevent_t __user *, timer_t __user *);
+
+long ppc32_timer_create(clockid_t clock,
+			struct compat_sigevent __user *ev32,
+			timer_t __user *timer_id)
+{
+	sigevent_t event;
+	timer_t t;
+	long err;
+	mm_segment_t savefs;
+
+	if (ev32 == NULL)
+		return sys_timer_create(clock, NULL, timer_id);
+
+	if (get_compat_sigevent(&event, ev32))
+		return -EFAULT;
+
+	if (!access_ok(VERIFY_WRITE, timer_id, sizeof(timer_t)))
+		return -EFAULT;
+
+	savefs = get_fs();
+	set_fs(KERNEL_DS);
+	/* The __user pointer casts are valid due to the set_fs() */
+	err = sys_timer_create(clock,
+		(sigevent_t __user *) &event,
+		(timer_t __user *) &t);
+	set_fs(savefs);
+
+	if (err == 0)
+		err = __put_user(t, timer_id);
+
+	return err;
+}
+
+asmlinkage long sys32_add_key(const char __user *_type,
+			      const char __user *_description,
+			      const void __user *_payload,
+			      u32 plen,
+			      u32 ringid)
+{
+	return sys_add_key(_type, _description, _payload, plen, ringid);
+}
+
+asmlinkage long sys32_request_key(const char __user *_type,
+				  const char __user *_description,
+				  const char __user *_callout_info,
+				  u32 destringid)
+{
+	return sys_request_key(_type, _description, _callout_info, destringid);
+}
+
diff --git a/arch/ppc64/kernel/syscalls.c b/arch/ppc64/kernel/syscalls.c
new file mode 100644
index 00000000000..f2865ff8d2f
--- /dev/null
+++ b/arch/ppc64/kernel/syscalls.c
@@ -0,0 +1,258 @@
+/*
+ * linux/arch/ppc64/kernel/sys_ppc.c
+ *
+ *  PowerPC version 
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Derived from "arch/i386/kernel/sys_i386.c"
+ * Adapted from the i386 version by Gary Thomas
+ * Modified by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras (paulus@cs.anu.edu.au).
+ *
+ * This file contains various random system calls that
+ * have a non-standard calling sequence on the Linux/PPC
+ * platform.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/sys.h>
+#include <linux/ipc.h>
+#include <linux/utsname.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/personality.h>
+
+#include <asm/uaccess.h>
+#include <asm/ipc.h>
+#include <asm/semaphore.h>
+#include <asm/time.h>
+#include <asm/unistd.h>
+
+extern unsigned long wall_jiffies;
+
+void
+check_bugs(void)
+{
+}
+
+/*
+ * sys_ipc() is the de-multiplexer for the SysV IPC calls..
+ *
+ * This is really horribly ugly.
+ */
+asmlinkage int 
+sys_ipc (uint call, int first, unsigned long second, long third,
+	 void __user *ptr, long fifth)
+{
+	int version, ret;
+
+	version = call >> 16; /* hack for backward compatibility */
+	call &= 0xffff;
+
+	ret = -ENOSYS;
+	switch (call) {
+	case SEMOP:
+		ret = sys_semtimedop(first, (struct sembuf __user *)ptr,
+				      (unsigned)second, NULL);
+		break;
+	case SEMTIMEDOP:
+		ret = sys_semtimedop(first, (struct sembuf __user *)ptr,
+				      (unsigned)second,
+				      (const struct timespec __user *) fifth);
+		break;
+	case SEMGET:
+		ret = sys_semget (first, (int)second, third);
+		break;
+	case SEMCTL: {
+		union semun fourth;
+
+		ret = -EINVAL;
+		if (!ptr)
+			break;
+		if ((ret = get_user(fourth.__pad, (void __user * __user *)ptr)))
+			break;
+		ret = sys_semctl(first, (int)second, third, fourth);
+		break;
+	}
+	case MSGSND:
+		ret = sys_msgsnd(first, (struct msgbuf __user *)ptr,
+				  (size_t)second, third);
+		break;
+	case MSGRCV:
+		switch (version) {
+		case 0: {
+			struct ipc_kludge tmp;
+
+			ret = -EINVAL;
+			if (!ptr)
+				break;
+			if ((ret = copy_from_user(&tmp,
+						(struct ipc_kludge __user *) ptr,
+						sizeof (tmp)) ? -EFAULT : 0))
+				break;
+			ret = sys_msgrcv(first, tmp.msgp, (size_t) second,
+					  tmp.msgtyp, third);
+			break;
+		}
+		default:
+			ret = sys_msgrcv (first, (struct msgbuf __user *) ptr,
+					  (size_t)second, fifth, third);
+			break;
+		}
+		break;
+	case MSGGET:
+		ret = sys_msgget ((key_t)first, (int)second);
+		break;
+	case MSGCTL:
+		ret = sys_msgctl(first, (int)second,
+				  (struct msqid_ds __user *)ptr);
+		break;
+	case SHMAT:
+		switch (version) {
+		default: {
+			ulong raddr;
+			ret = do_shmat(first, (char __user *) ptr,
+					(int)second, &raddr);
+			if (ret)
+				break;
+			ret = put_user (raddr, (ulong __user *) third);
+			break;
+		}
+		case 1:	/* iBCS2 emulator entry point */
+			ret = -EINVAL;
+			if (!segment_eq(get_fs(), get_ds()))
+				break;
+			ret = do_shmat(first, (char __user *)ptr,
+					(int)second, (ulong *)third);
+			break;
+		}
+		break;
+	case SHMDT: 
+		ret = sys_shmdt ((char __user *)ptr);
+		break;
+	case SHMGET:
+		ret = sys_shmget (first, (size_t)second, third);
+		break;
+	case SHMCTL:
+		ret = sys_shmctl(first, (int)second,
+				  (struct shmid_ds __user *)ptr);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * sys_pipe() is the normal C calling standard for creating
+ * a pipe. It's not the way unix traditionally does this, though.
+ */
+asmlinkage int sys_pipe(int __user *fildes)
+{
+	int fd[2];
+	int error;
+	
+	error = do_pipe(fd);
+	if (!error) {
+		if (copy_to_user(fildes, fd, 2*sizeof(int)))
+			error = -EFAULT;
+	}
+	
+	return error;
+}
+
+unsigned long sys_mmap(unsigned long addr, size_t len,
+		       unsigned long prot, unsigned long flags,
+		       unsigned long fd, off_t offset)
+{
+	struct file * file = NULL;
+	unsigned long ret = -EBADF;
+
+	if (!(flags & MAP_ANONYMOUS)) {
+		if (!(file = fget(fd)))
+			goto out;
+	}
+
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	down_write(&current->mm->mmap_sem);
+	ret = do_mmap(file, addr, len, prot, flags, offset);
+	up_write(&current->mm->mmap_sem);
+	if (file)
+		fput(file);
+
+out:
+	return ret;
+}
+
+static int __init set_fakeppc(char *str)
+{
+	if (*str)
+		return 0;
+	init_task.personality = PER_LINUX32;
+	return 1;
+}
+__setup("fakeppc", set_fakeppc);
+
+asmlinkage int sys_uname(struct old_utsname __user * name)
+{
+	int err = -EFAULT;
+	
+	down_read(&uts_sem);
+	if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
+		err = 0;
+	up_read(&uts_sem);
+	
+	return err;
+}
+
+asmlinkage time_t sys64_time(time_t __user * tloc)
+{
+	time_t secs;
+	time_t usecs;
+
+	long tb_delta = tb_ticks_since(tb_last_stamp);
+	tb_delta += (jiffies - wall_jiffies) * tb_ticks_per_jiffy;
+
+	secs  = xtime.tv_sec;  
+	usecs = (xtime.tv_nsec/1000) + tb_delta / tb_ticks_per_usec;
+	while (usecs >= USEC_PER_SEC) {
+		++secs;
+		usecs -= USEC_PER_SEC;
+	}
+
+	if (tloc) {
+		if (put_user(secs,tloc))
+			secs = -EFAULT;
+	}
+
+	return secs;
+}
+
+void do_show_syscall(unsigned long r3, unsigned long r4, unsigned long r5,
+		     unsigned long r6, unsigned long r7, unsigned long r8,
+		     struct pt_regs *regs)
+{
+	printk("syscall %ld(%lx, %lx, %lx, %lx, %lx, %lx) regs=%p current=%p"
+	       " cpu=%d\n", regs->gpr[0], r3, r4, r5, r6, r7, r8, regs,
+	       current, smp_processor_id());
+}
+
+void do_show_syscall_exit(unsigned long r3)
+{
+	printk(" -> %lx, current=%p cpu=%d\n", r3, current, smp_processor_id());
+}
diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c
new file mode 100644
index 00000000000..0925694c3ce
--- /dev/null
+++ b/arch/ppc64/kernel/sysfs.c
@@ -0,0 +1,431 @@
+#include <linux/config.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/cpumask.h>
+#include <linux/notifier.h>
+
+#include <asm/current.h>
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/hvcall.h>
+#include <asm/prom.h>
+#include <asm/systemcfg.h>
+#include <asm/paca.h>
+#include <asm/lppaca.h>
+#include <asm/machdep.h>
+
+static DEFINE_PER_CPU(struct cpu, cpu_devices);
+
+/* SMT stuff */
+
+#ifdef CONFIG_PPC_MULTIPLATFORM
+/* default to snooze disabled */
+DEFINE_PER_CPU(unsigned long, smt_snooze_delay);
+
+static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
+				      size_t count)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	ssize_t ret;
+	unsigned long snooze;
+
+	ret = sscanf(buf, "%lu", &snooze);
+	if (ret != 1)
+		return -EINVAL;
+
+	per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze;
+
+	return count;
+}
+
+static ssize_t show_smt_snooze_delay(struct sys_device *dev, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+
+	return sprintf(buf, "%lu\n", per_cpu(smt_snooze_delay, cpu->sysdev.id));
+}
+
+static SYSDEV_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
+		   store_smt_snooze_delay);
+
+/* Only parse OF options if the matching cmdline option was not specified */
+static int smt_snooze_cmdline;
+
+static int __init smt_setup(void)
+{
+	struct device_node *options;
+	unsigned int *val;
+	unsigned int cpu;
+
+	if (!cpu_has_feature(CPU_FTR_SMT))
+		return 1;
+
+	options = find_path_device("/options");
+	if (!options)
+		return 1;
+
+	val = (unsigned int *)get_property(options, "ibm,smt-snooze-delay",
+					   NULL);
+	if (!smt_snooze_cmdline && val) {
+		for_each_cpu(cpu)
+			per_cpu(smt_snooze_delay, cpu) = *val;
+	}
+
+	return 1;
+}
+__initcall(smt_setup);
+
+static int __init setup_smt_snooze_delay(char *str)
+{
+	unsigned int cpu;
+	int snooze;
+
+	if (!cpu_has_feature(CPU_FTR_SMT))
+		return 1;
+
+	smt_snooze_cmdline = 1;
+
+	if (get_option(&str, &snooze)) {
+		for_each_cpu(cpu)
+			per_cpu(smt_snooze_delay, cpu) = snooze;
+	}
+
+	return 1;
+}
+__setup("smt-snooze-delay=", setup_smt_snooze_delay);
+
+/*
+ * Enabling PMCs will slow partition context switch times so we only do
+ * it the first time we write to the PMCs.
+ */
+
+static DEFINE_PER_CPU(char, pmcs_enabled);
+
+void ppc64_enable_pmcs(void)
+{
+	unsigned long hid0;
+#ifdef CONFIG_PPC_PSERIES
+	unsigned long set, reset;
+	int ret;
+	unsigned int ctrl;
+#endif /* CONFIG_PPC_PSERIES */
+
+	/* Only need to enable them once */
+	if (__get_cpu_var(pmcs_enabled))
+		return;
+
+	__get_cpu_var(pmcs_enabled) = 1;
+
+	switch (systemcfg->platform) {
+	case PLATFORM_PSERIES:
+	case PLATFORM_POWERMAC:
+		hid0 = mfspr(HID0);
+		hid0 |= 1UL << (63 - 20);
+
+		/* POWER4 requires the following sequence */
+		asm volatile(
+			     "sync\n"
+			     "mtspr	%1, %0\n"
+			     "mfspr	%0, %1\n"
+			     "mfspr	%0, %1\n"
+			     "mfspr	%0, %1\n"
+			     "mfspr	%0, %1\n"
+			     "mfspr	%0, %1\n"
+			     "mfspr	%0, %1\n"
+			     "isync" : "=&r" (hid0) : "i" (HID0), "0" (hid0):
+			     "memory");
+		break;
+
+#ifdef CONFIG_PPC_PSERIES
+	case PLATFORM_PSERIES_LPAR:
+		set = 1UL << 63;
+		reset = 0;
+		ret = plpar_hcall_norets(H_PERFMON, set, reset);
+		if (ret)
+			printk(KERN_ERR "H_PERFMON call on cpu %u "
+			       "returned %d\n",
+			       smp_processor_id(), ret);
+		break;
+#endif /* CONFIG_PPC_PSERIES */
+
+	default:
+		break;
+	}
+
+#ifdef CONFIG_PPC_PSERIES
+	/* instruct hypervisor to maintain PMCs */
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR)
+		get_paca()->lppaca.pmcregs_in_use = 1;
+
+	/*
+	 * On SMT machines we have to set the run latch in the ctrl register
+	 * in order to make PMC6 spin.
+	 */
+	if (cpu_has_feature(CPU_FTR_SMT)) {
+		ctrl = mfspr(CTRLF);
+		ctrl |= RUNLATCH;
+		mtspr(CTRLT, ctrl);
+	}
+#endif /* CONFIG_PPC_PSERIES */
+}
+
+#else
+
+/* PMC stuff */
+void ppc64_enable_pmcs(void)
+{
+	/* XXX Implement for iseries */
+}
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
+EXPORT_SYMBOL(ppc64_enable_pmcs);
+
+/* XXX convert to rusty's on_one_cpu */
+static unsigned long run_on_cpu(unsigned long cpu,
+			        unsigned long (*func)(unsigned long),
+				unsigned long arg)
+{
+	cpumask_t old_affinity = current->cpus_allowed;
+	unsigned long ret;
+
+	/* should return -EINVAL to userspace */
+	if (set_cpus_allowed(current, cpumask_of_cpu(cpu)))
+		return 0;
+
+	ret = func(arg);
+
+	set_cpus_allowed(current, old_affinity);
+
+	return ret;
+}
+
+#define SYSFS_PMCSETUP(NAME, ADDRESS) \
+static unsigned long read_##NAME(unsigned long junk) \
+{ \
+	return mfspr(ADDRESS); \
+} \
+static unsigned long write_##NAME(unsigned long val) \
+{ \
+	ppc64_enable_pmcs(); \
+	mtspr(ADDRESS, val); \
+	return 0; \
+} \
+static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+{ \
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	unsigned long val = run_on_cpu(cpu->sysdev.id, read_##NAME, 0); \
+	return sprintf(buf, "%lx\n", val); \
+} \
+static ssize_t __attribute_used__ \
+	store_##NAME(struct sys_device *dev, const char *buf, size_t count) \
+{ \
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	unsigned long val; \
+	int ret = sscanf(buf, "%lx", &val); \
+	if (ret != 1) \
+		return -EINVAL; \
+	run_on_cpu(cpu->sysdev.id, write_##NAME, val); \
+	return count; \
+}
+
+SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0);
+SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1);
+SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
+SYSFS_PMCSETUP(pmc1, SPRN_PMC1);
+SYSFS_PMCSETUP(pmc2, SPRN_PMC2);
+SYSFS_PMCSETUP(pmc3, SPRN_PMC3);
+SYSFS_PMCSETUP(pmc4, SPRN_PMC4);
+SYSFS_PMCSETUP(pmc5, SPRN_PMC5);
+SYSFS_PMCSETUP(pmc6, SPRN_PMC6);
+SYSFS_PMCSETUP(pmc7, SPRN_PMC7);
+SYSFS_PMCSETUP(pmc8, SPRN_PMC8);
+SYSFS_PMCSETUP(purr, SPRN_PURR);
+
+static SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0);
+static SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1);
+static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
+static SYSDEV_ATTR(pmc1, 0600, show_pmc1, store_pmc1);
+static SYSDEV_ATTR(pmc2, 0600, show_pmc2, store_pmc2);
+static SYSDEV_ATTR(pmc3, 0600, show_pmc3, store_pmc3);
+static SYSDEV_ATTR(pmc4, 0600, show_pmc4, store_pmc4);
+static SYSDEV_ATTR(pmc5, 0600, show_pmc5, store_pmc5);
+static SYSDEV_ATTR(pmc6, 0600, show_pmc6, store_pmc6);
+static SYSDEV_ATTR(pmc7, 0600, show_pmc7, store_pmc7);
+static SYSDEV_ATTR(pmc8, 0600, show_pmc8, store_pmc8);
+static SYSDEV_ATTR(purr, 0600, show_purr, NULL);
+
+static void register_cpu_online(unsigned int cpu)
+{
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+	struct sys_device *s = &c->sysdev;
+
+#ifndef CONFIG_PPC_ISERIES
+	if (cpu_has_feature(CPU_FTR_SMT))
+		sysdev_create_file(s, &attr_smt_snooze_delay);
+#endif
+
+	/* PMC stuff */
+
+	sysdev_create_file(s, &attr_mmcr0);
+	sysdev_create_file(s, &attr_mmcr1);
+
+	if (cpu_has_feature(CPU_FTR_MMCRA))
+		sysdev_create_file(s, &attr_mmcra);
+
+	sysdev_create_file(s, &attr_pmc1);
+	sysdev_create_file(s, &attr_pmc2);
+	sysdev_create_file(s, &attr_pmc3);
+	sysdev_create_file(s, &attr_pmc4);
+	sysdev_create_file(s, &attr_pmc5);
+	sysdev_create_file(s, &attr_pmc6);
+
+	if (cpu_has_feature(CPU_FTR_PMC8)) {
+		sysdev_create_file(s, &attr_pmc7);
+		sysdev_create_file(s, &attr_pmc8);
+	}
+
+	if (cpu_has_feature(CPU_FTR_SMT))
+		sysdev_create_file(s, &attr_purr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void unregister_cpu_online(unsigned int cpu)
+{
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+	struct sys_device *s = &c->sysdev;
+
+	BUG_ON(c->no_control);
+
+#ifndef CONFIG_PPC_ISERIES
+	if (cpu_has_feature(CPU_FTR_SMT))
+		sysdev_remove_file(s, &attr_smt_snooze_delay);
+#endif
+
+	/* PMC stuff */
+
+	sysdev_remove_file(s, &attr_mmcr0);
+	sysdev_remove_file(s, &attr_mmcr1);
+
+	if (cpu_has_feature(CPU_FTR_MMCRA))
+		sysdev_remove_file(s, &attr_mmcra);
+
+	sysdev_remove_file(s, &attr_pmc1);
+	sysdev_remove_file(s, &attr_pmc2);
+	sysdev_remove_file(s, &attr_pmc3);
+	sysdev_remove_file(s, &attr_pmc4);
+	sysdev_remove_file(s, &attr_pmc5);
+	sysdev_remove_file(s, &attr_pmc6);
+
+	if (cpu_has_feature(CPU_FTR_PMC8)) {
+		sysdev_remove_file(s, &attr_pmc7);
+		sysdev_remove_file(s, &attr_pmc8);
+	}
+
+	if (cpu_has_feature(CPU_FTR_SMT))
+		sysdev_remove_file(s, &attr_purr);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __devinit sysfs_cpu_notify(struct notifier_block *self,
+				      unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned int)(long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+		register_cpu_online(cpu);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		unregister_cpu_online(cpu);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata sysfs_cpu_nb = {
+	.notifier_call	= sysfs_cpu_notify,
+};
+
+/* NUMA stuff */
+
+#ifdef CONFIG_NUMA
+static struct node node_devices[MAX_NUMNODES];
+
+static void register_nodes(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (node_online(i)) {
+			int p_node = parent_node(i);
+			struct node *parent = NULL;
+
+			if (p_node != i)
+				parent = &node_devices[p_node];
+
+			register_node(&node_devices[i], i, parent);
+		}
+	}
+}
+#else
+static void register_nodes(void)
+{
+	return;
+}
+#endif
+
+/* Only valid if CPU is present. */
+static ssize_t show_physical_id(struct sys_device *dev, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+
+	return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->sysdev.id));
+}
+static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL);
+
+static int __init topology_init(void)
+{
+	int cpu;
+	struct node *parent = NULL;
+
+	register_nodes();
+
+	register_cpu_notifier(&sysfs_cpu_nb);
+
+	for_each_cpu(cpu) {
+		struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+#ifdef CONFIG_NUMA
+		parent = &node_devices[cpu_to_node(cpu)];
+#endif
+		/*
+		 * For now, we just see if the system supports making
+		 * the RTAS calls for CPU hotplug.  But, there may be a
+		 * more comprehensive way to do this for an individual
+		 * CPU.  For instance, the boot cpu might never be valid
+		 * for hotplugging.
+		 */
+		if (!ppc_md.cpu_die)
+			c->no_control = 1;
+
+		if (cpu_online(cpu) || (c->no_control == 0)) {
+			register_cpu(c, cpu, parent);
+
+			sysdev_create_file(&c->sysdev, &attr_physical_id);
+		}
+
+		if (cpu_online(cpu))
+			register_cpu_online(cpu);
+	}
+
+	return 0;
+}
+__initcall(topology_init);
diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
new file mode 100644
index 00000000000..77ded5a363b
--- /dev/null
+++ b/arch/ppc64/kernel/time.c
@@ -0,0 +1,827 @@
+/*
+ * 
+ * Common time routines among all ppc machines.
+ *
+ * Written by Cort Dougan (cort@cs.nmt.edu) to merge
+ * Paul Mackerras' version and mine for PReP and Pmac.
+ * MPC8xx/MBX changes by Dan Malek (dmalek@jlc.net).
+ * Converted for 64-bit by Mike Corrigan (mikejc@us.ibm.com)
+ *
+ * First round of bugfixes by Gabriel Paubert (paubert@iram.es)
+ * to make clock more stable (2.4.0-test5). The only thing
+ * that this code assumes is that the timebases have been synchronized
+ * by firmware on SMP and are never stopped (never do sleep
+ * on SMP then, nap and doze are OK).
+ * 
+ * Speeded up do_gettimeofday by getting rid of references to
+ * xtime (which required locks for consistency). (mikejc@us.ibm.com)
+ *
+ * TODO (not necessarily in this file):
+ * - improve precision and reproducibility of timebase frequency
+ * measurement at boot time. (for iSeries, we calibrate the timebase
+ * against the Titan chip's clock.)
+ * - for astronomical applications: add a new function to get
+ * non ambiguous timestamps even around leap seconds. This needs
+ * a new timestamp format and a good name.
+ *
+ * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
+ *             "A Kernel Model for Precision Timekeeping" by Dave Mills
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/profile.h>
+#include <linux/cpu.h>
+#include <linux/security.h>
+
+#include <asm/segment.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/nvram.h>
+#include <asm/cache.h>
+#include <asm/machdep.h>
+#ifdef CONFIG_PPC_ISERIES
+#include <asm/iSeries/ItLpQueue.h>
+#include <asm/iSeries/HvCallXm.h>
+#endif
+#include <asm/uaccess.h>
+#include <asm/time.h>
+#include <asm/ppcdebug.h>
+#include <asm/prom.h>
+#include <asm/sections.h>
+#include <asm/systemcfg.h>
+
+u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
+/* keep track of when we need to update the rtc */
+time_t last_rtc_update;
+extern int piranha_simulator;
+#ifdef CONFIG_PPC_ISERIES
+unsigned long iSeries_recal_titan = 0;
+unsigned long iSeries_recal_tb = 0; 
+static unsigned long first_settimeofday = 1;
+#endif
+
+#define XSEC_PER_SEC (1024*1024)
+
+unsigned long tb_ticks_per_jiffy;
+unsigned long tb_ticks_per_usec = 100; /* sane default */
+EXPORT_SYMBOL(tb_ticks_per_usec);
+unsigned long tb_ticks_per_sec;
+unsigned long tb_to_xs;
+unsigned      tb_to_us;
+unsigned long processor_freq;
+DEFINE_SPINLOCK(rtc_lock);
+
+unsigned long tb_to_ns_scale;
+unsigned long tb_to_ns_shift;
+
+struct gettimeofday_struct do_gtod;
+
+extern unsigned long wall_jiffies;
+extern unsigned long lpevent_count;
+extern int smp_tb_synchronized;
+
+extern struct timezone sys_tz;
+
+void ppc_adjtimex(void);
+
+static unsigned adjusting_time = 0;
+
+static __inline__ void timer_check_rtc(void)
+{
+        /*
+         * update the rtc when needed, this should be performed on the
+         * right fraction of a second. Half or full second ?
+         * Full second works on mk48t59 clocks, others need testing.
+         * Note that this update is basically only used through 
+         * the adjtimex system calls. Setting the HW clock in
+         * any other way is a /dev/rtc and userland business.
+         * This is still wrong by -0.5/+1.5 jiffies because of the
+         * timer interrupt resolution and possible delay, but here we 
+         * hit a quantization limit which can only be solved by higher
+         * resolution timers and decoupling time management from timer
+         * interrupts. This is also wrong on the clocks
+         * which require being written at the half second boundary.
+         * We should have an rtc call that only sets the minutes and
+         * seconds like on Intel to avoid problems with non UTC clocks.
+         */
+        if ( (time_status & STA_UNSYNC) == 0 &&
+             xtime.tv_sec - last_rtc_update >= 659 &&
+             abs((xtime.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ &&
+             jiffies - wall_jiffies == 1) {
+	    struct rtc_time tm;
+	    to_tm(xtime.tv_sec+1, &tm);
+	    tm.tm_year -= 1900;
+	    tm.tm_mon -= 1;
+            if (ppc_md.set_rtc_time(&tm) == 0)
+                last_rtc_update = xtime.tv_sec+1;
+            else
+                /* Try again one minute later */
+                last_rtc_update += 60;
+        }
+}
+
+/*
+ * This version of gettimeofday has microsecond resolution.
+ */
+static inline void __do_gettimeofday(struct timeval *tv, unsigned long tb_val)
+{
+	unsigned long sec, usec, tb_ticks;
+	unsigned long xsec, tb_xsec;
+	struct gettimeofday_vars * temp_varp;
+	unsigned long temp_tb_to_xs, temp_stamp_xsec;
+
+	/*
+	 * These calculations are faster (gets rid of divides)
+	 * if done in units of 1/2^20 rather than microseconds.
+	 * The conversion to microseconds at the end is done
+	 * without a divide (and in fact, without a multiply)
+	 */
+	temp_varp = do_gtod.varp;
+	tb_ticks = tb_val - temp_varp->tb_orig_stamp;
+	temp_tb_to_xs = temp_varp->tb_to_xs;
+	temp_stamp_xsec = temp_varp->stamp_xsec;
+	tb_xsec = mulhdu( tb_ticks, temp_tb_to_xs );
+	xsec = temp_stamp_xsec + tb_xsec;
+	sec = xsec / XSEC_PER_SEC;
+	xsec -= sec * XSEC_PER_SEC;
+	usec = (xsec * USEC_PER_SEC)/XSEC_PER_SEC;
+
+	tv->tv_sec = sec;
+	tv->tv_usec = usec;
+}
+
+void do_gettimeofday(struct timeval *tv)
+{
+	__do_gettimeofday(tv, get_tb());
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+
+/* Synchronize xtime with do_gettimeofday */ 
+
+static inline void timer_sync_xtime(unsigned long cur_tb)
+{
+	struct timeval my_tv;
+
+	__do_gettimeofday(&my_tv, cur_tb);
+
+	if (xtime.tv_sec <= my_tv.tv_sec) {
+		xtime.tv_sec = my_tv.tv_sec;
+		xtime.tv_nsec = my_tv.tv_usec * 1000;
+	}
+}
+
+/*
+ * When the timebase - tb_orig_stamp gets too big, we do a manipulation
+ * between tb_orig_stamp and stamp_xsec. The goal here is to keep the
+ * difference tb - tb_orig_stamp small enough to always fit inside a
+ * 32 bits number. This is a requirement of our fast 32 bits userland
+ * implementation in the vdso. If we "miss" a call to this function
+ * (interrupt latency, CPU locked in a spinlock, ...) and we end up
+ * with a too big difference, then the vdso will fallback to calling
+ * the syscall
+ */
+static __inline__ void timer_recalc_offset(unsigned long cur_tb)
+{
+	struct gettimeofday_vars * temp_varp;
+	unsigned temp_idx;
+	unsigned long offset, new_stamp_xsec, new_tb_orig_stamp;
+
+	if (((cur_tb - do_gtod.varp->tb_orig_stamp) & 0x80000000u) == 0)
+		return;
+
+	temp_idx = (do_gtod.var_idx == 0);
+	temp_varp = &do_gtod.vars[temp_idx];
+
+	new_tb_orig_stamp = cur_tb;
+	offset = new_tb_orig_stamp - do_gtod.varp->tb_orig_stamp;
+	new_stamp_xsec = do_gtod.varp->stamp_xsec + mulhdu(offset, do_gtod.varp->tb_to_xs);
+
+	temp_varp->tb_to_xs = do_gtod.varp->tb_to_xs;
+	temp_varp->tb_orig_stamp = new_tb_orig_stamp;
+	temp_varp->stamp_xsec = new_stamp_xsec;
+	mb();
+	do_gtod.varp = temp_varp;
+	do_gtod.var_idx = temp_idx;
+
+	++(systemcfg->tb_update_count);
+	wmb();
+	systemcfg->tb_orig_stamp = new_tb_orig_stamp;
+	systemcfg->stamp_xsec = new_stamp_xsec;
+	wmb();
+	++(systemcfg->tb_update_count);
+}
+
+#ifdef CONFIG_SMP
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+	if (in_lock_functions(pc))
+		return regs->link;
+
+	return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+#endif
+
+#ifdef CONFIG_PPC_ISERIES
+
+/* 
+ * This function recalibrates the timebase based on the 49-bit time-of-day
+ * value in the Titan chip.  The Titan is much more accurate than the value
+ * returned by the service processor for the timebase frequency.  
+ */
+
+static void iSeries_tb_recal(void)
+{
+	struct div_result divres;
+	unsigned long titan, tb;
+	tb = get_tb();
+	titan = HvCallXm_loadTod();
+	if ( iSeries_recal_titan ) {
+		unsigned long tb_ticks = tb - iSeries_recal_tb;
+		unsigned long titan_usec = (titan - iSeries_recal_titan) >> 12;
+		unsigned long new_tb_ticks_per_sec   = (tb_ticks * USEC_PER_SEC)/titan_usec;
+		unsigned long new_tb_ticks_per_jiffy = (new_tb_ticks_per_sec+(HZ/2))/HZ;
+		long tick_diff = new_tb_ticks_per_jiffy - tb_ticks_per_jiffy;
+		char sign = '+';		
+		/* make sure tb_ticks_per_sec and tb_ticks_per_jiffy are consistent */
+		new_tb_ticks_per_sec = new_tb_ticks_per_jiffy * HZ;
+
+		if ( tick_diff < 0 ) {
+			tick_diff = -tick_diff;
+			sign = '-';
+		}
+		if ( tick_diff ) {
+			if ( tick_diff < tb_ticks_per_jiffy/25 ) {
+				printk( "Titan recalibrate: new tb_ticks_per_jiffy = %lu (%c%ld)\n",
+						new_tb_ticks_per_jiffy, sign, tick_diff );
+				tb_ticks_per_jiffy = new_tb_ticks_per_jiffy;
+				tb_ticks_per_sec   = new_tb_ticks_per_sec;
+				div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
+				do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
+				tb_to_xs = divres.result_low;
+				do_gtod.varp->tb_to_xs = tb_to_xs;
+				systemcfg->tb_ticks_per_sec = tb_ticks_per_sec;
+				systemcfg->tb_to_xs = tb_to_xs;
+			}
+			else {
+				printk( "Titan recalibrate: FAILED (difference > 4 percent)\n"
+					"                   new tb_ticks_per_jiffy = %lu\n"
+					"                   old tb_ticks_per_jiffy = %lu\n",
+					new_tb_ticks_per_jiffy, tb_ticks_per_jiffy );
+			}
+		}
+	}
+	iSeries_recal_titan = titan;
+	iSeries_recal_tb = tb;
+}
+#endif
+
+/*
+ * For iSeries shared processors, we have to let the hypervisor
+ * set the hardware decrementer.  We set a virtual decrementer
+ * in the lppaca and call the hypervisor if the virtual
+ * decrementer is less than the current value in the hardware
+ * decrementer. (almost always the new decrementer value will
+ * be greater than the current hardware decementer so the hypervisor
+ * call will not be needed)
+ */
+
+unsigned long tb_last_stamp __cacheline_aligned_in_smp;
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+int timer_interrupt(struct pt_regs * regs)
+{
+	int next_dec;
+	unsigned long cur_tb;
+	struct paca_struct *lpaca = get_paca();
+	unsigned long cpu = smp_processor_id();
+
+	irq_enter();
+
+#ifndef CONFIG_PPC_ISERIES
+	profile_tick(CPU_PROFILING, regs);
+#endif
+
+	lpaca->lppaca.int_dword.fields.decr_int = 0;
+
+	while (lpaca->next_jiffy_update_tb <= (cur_tb = get_tb())) {
+		/*
+		 * We cannot disable the decrementer, so in the period
+		 * between this cpu's being marked offline in cpu_online_map
+		 * and calling stop-self, it is taking timer interrupts.
+		 * Avoid calling into the scheduler rebalancing code if this
+		 * is the case.
+		 */
+		if (!cpu_is_offline(cpu))
+			update_process_times(user_mode(regs));
+		/*
+		 * No need to check whether cpu is offline here; boot_cpuid
+		 * should have been fixed up by now.
+		 */
+		if (cpu == boot_cpuid) {
+			write_seqlock(&xtime_lock);
+			tb_last_stamp = lpaca->next_jiffy_update_tb;
+			timer_recalc_offset(lpaca->next_jiffy_update_tb);
+			do_timer(regs);
+			timer_sync_xtime(lpaca->next_jiffy_update_tb);
+			timer_check_rtc();
+			write_sequnlock(&xtime_lock);
+			if ( adjusting_time && (time_adjust == 0) )
+				ppc_adjtimex();
+		}
+		lpaca->next_jiffy_update_tb += tb_ticks_per_jiffy;
+	}
+	
+	next_dec = lpaca->next_jiffy_update_tb - cur_tb;
+	if (next_dec > lpaca->default_decr)
+        	next_dec = lpaca->default_decr;
+	set_dec(next_dec);
+
+#ifdef CONFIG_PPC_ISERIES
+	{
+		struct ItLpQueue *lpq = lpaca->lpqueue_ptr;
+		if (lpq && ItLpQueue_isLpIntPending(lpq))
+			lpevent_count += ItLpQueue_process(lpq, regs);
+	}
+#endif
+
+/* collect purr register values often, for accurate calculations */
+#if defined(CONFIG_PPC_PSERIES)
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+		struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
+		cu->current_tb = mfspr(SPRN_PURR);
+	}
+#endif
+
+	irq_exit();
+
+	return 1;
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ *
+ * Note: mulhdu(a, b) (multiply high double unsigned) returns
+ * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
+ * are 64-bit unsigned numbers.
+ */
+unsigned long long sched_clock(void)
+{
+	return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift;
+}
+
+int do_settimeofday(struct timespec *tv)
+{
+	time_t wtm_sec, new_sec = tv->tv_sec;
+	long wtm_nsec, new_nsec = tv->tv_nsec;
+	unsigned long flags;
+	unsigned long delta_xsec;
+	long int tb_delta;
+	unsigned long new_xsec;
+
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	/* Updating the RTC is not the job of this code. If the time is
+	 * stepped under NTP, the RTC will be update after STA_UNSYNC
+	 * is cleared. Tool like clock/hwclock either copy the RTC
+	 * to the system time, in which case there is no point in writing
+	 * to the RTC again, or write to the RTC but then they don't call
+	 * settimeofday to perform this operation.
+	 */
+#ifdef CONFIG_PPC_ISERIES
+	if ( first_settimeofday ) {
+		iSeries_tb_recal();
+		first_settimeofday = 0;
+	}
+#endif
+	tb_delta = tb_ticks_since(tb_last_stamp);
+	tb_delta += (jiffies - wall_jiffies) * tb_ticks_per_jiffy;
+
+	new_nsec -= tb_delta / tb_ticks_per_usec / 1000;
+
+	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec);
+	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec);
+
+ 	set_normalized_timespec(&xtime, new_sec, new_nsec);
+	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+	/* In case of a large backwards jump in time with NTP, we want the 
+	 * clock to be updated as soon as the PLL is again in lock.
+	 */
+	last_rtc_update = new_sec - 658;
+
+	time_adjust = 0;                /* stop active adjtime() */
+	time_status |= STA_UNSYNC;
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_esterror = NTP_PHASE_LIMIT;
+
+	delta_xsec = mulhdu( (tb_last_stamp-do_gtod.varp->tb_orig_stamp),
+			     do_gtod.varp->tb_to_xs );
+
+	new_xsec = (new_nsec * XSEC_PER_SEC) / NSEC_PER_SEC;
+	new_xsec += new_sec * XSEC_PER_SEC;
+	if ( new_xsec > delta_xsec ) {
+		do_gtod.varp->stamp_xsec = new_xsec - delta_xsec;
+		systemcfg->stamp_xsec = new_xsec - delta_xsec;
+	}
+	else {
+		/* This is only for the case where the user is setting the time
+		 * way back to a time such that the boot time would have been
+		 * before 1970 ... eg. we booted ten days ago, and we are setting
+		 * the time to Jan 5, 1970 */
+		do_gtod.varp->stamp_xsec = new_xsec;
+		do_gtod.varp->tb_orig_stamp = tb_last_stamp;
+		systemcfg->stamp_xsec = new_xsec;
+		systemcfg->tb_orig_stamp = tb_last_stamp;
+	}
+
+	systemcfg->tz_minuteswest = sys_tz.tz_minuteswest;
+	systemcfg->tz_dsttime = sys_tz.tz_dsttime;
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+	clock_was_set();
+	return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+void __init time_init(void)
+{
+	/* This function is only called on the boot processor */
+	unsigned long flags;
+	struct rtc_time tm;
+	struct div_result res;
+	unsigned long scale, shift;
+
+	ppc_md.calibrate_decr();
+
+	/*
+	 * Compute scale factor for sched_clock.
+	 * The calibrate_decr() function has set tb_ticks_per_sec,
+	 * which is the timebase frequency.
+	 * We compute 1e9 * 2^64 / tb_ticks_per_sec and interpret
+	 * the 128-bit result as a 64.64 fixed-point number.
+	 * We then shift that number right until it is less than 1.0,
+	 * giving us the scale factor and shift count to use in
+	 * sched_clock().
+	 */
+	div128_by_32(1000000000, 0, tb_ticks_per_sec, &res);
+	scale = res.result_low;
+	for (shift = 0; res.result_high != 0; ++shift) {
+		scale = (scale >> 1) | (res.result_high << 63);
+		res.result_high >>= 1;
+	}
+	tb_to_ns_scale = scale;
+	tb_to_ns_shift = shift;
+
+#ifdef CONFIG_PPC_ISERIES
+	if (!piranha_simulator)
+#endif
+		ppc_md.get_boot_time(&tm);
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	xtime.tv_sec = mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+			      tm.tm_hour, tm.tm_min, tm.tm_sec);
+	tb_last_stamp = get_tb();
+	do_gtod.varp = &do_gtod.vars[0];
+	do_gtod.var_idx = 0;
+	do_gtod.varp->tb_orig_stamp = tb_last_stamp;
+	do_gtod.varp->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC;
+	do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
+	do_gtod.varp->tb_to_xs = tb_to_xs;
+	do_gtod.tb_to_us = tb_to_us;
+	systemcfg->tb_orig_stamp = tb_last_stamp;
+	systemcfg->tb_update_count = 0;
+	systemcfg->tb_ticks_per_sec = tb_ticks_per_sec;
+	systemcfg->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC;
+	systemcfg->tb_to_xs = tb_to_xs;
+
+	time_freq = 0;
+
+	xtime.tv_nsec = 0;
+	last_rtc_update = xtime.tv_sec;
+	set_normalized_timespec(&wall_to_monotonic,
+	                        -xtime.tv_sec, -xtime.tv_nsec);
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* Not exact, but the timer interrupt takes care of this */
+	set_dec(tb_ticks_per_jiffy);
+}
+
+/* 
+ * After adjtimex is called, adjust the conversion of tb ticks
+ * to microseconds to keep do_gettimeofday synchronized 
+ * with ntpd.
+ *
+ * Use the time_adjust, time_freq and time_offset computed by adjtimex to 
+ * adjust the frequency.
+ */
+
+/* #define DEBUG_PPC_ADJTIMEX 1 */
+
+void ppc_adjtimex(void)
+{
+	unsigned long den, new_tb_ticks_per_sec, tb_ticks, old_xsec, new_tb_to_xs, new_xsec, new_stamp_xsec;
+	unsigned long tb_ticks_per_sec_delta;
+	long delta_freq, ltemp;
+	struct div_result divres; 
+	unsigned long flags;
+	struct gettimeofday_vars * temp_varp;
+	unsigned temp_idx;
+	long singleshot_ppm = 0;
+
+	/* Compute parts per million frequency adjustment to accomplish the time adjustment
+	   implied by time_offset to be applied over the elapsed time indicated by time_constant.
+	   Use SHIFT_USEC to get it into the same units as time_freq. */
+	if ( time_offset < 0 ) {
+		ltemp = -time_offset;
+		ltemp <<= SHIFT_USEC - SHIFT_UPDATE;
+		ltemp >>= SHIFT_KG + time_constant;
+		ltemp = -ltemp;
+	}
+	else {
+		ltemp = time_offset;
+		ltemp <<= SHIFT_USEC - SHIFT_UPDATE;
+		ltemp >>= SHIFT_KG + time_constant;
+	}
+	
+	/* If there is a single shot time adjustment in progress */
+	if ( time_adjust ) {
+#ifdef DEBUG_PPC_ADJTIMEX
+		printk("ppc_adjtimex: ");
+		if ( adjusting_time == 0 )
+			printk("starting ");
+		printk("single shot time_adjust = %ld\n", time_adjust);
+#endif	
+	
+		adjusting_time = 1;
+		
+		/* Compute parts per million frequency adjustment to match time_adjust */
+		singleshot_ppm = tickadj * HZ;	
+		/*
+		 * The adjustment should be tickadj*HZ to match the code in
+		 * linux/kernel/timer.c, but experiments show that this is too
+		 * large. 3/4 of tickadj*HZ seems about right
+		 */
+		singleshot_ppm -= singleshot_ppm / 4;
+		/* Use SHIFT_USEC to get it into the same units as time_freq */	
+		singleshot_ppm <<= SHIFT_USEC;
+		if ( time_adjust < 0 )
+			singleshot_ppm = -singleshot_ppm;
+	}
+	else {
+#ifdef DEBUG_PPC_ADJTIMEX
+		if ( adjusting_time )
+			printk("ppc_adjtimex: ending single shot time_adjust\n");
+#endif
+		adjusting_time = 0;
+	}
+	
+	/* Add up all of the frequency adjustments */
+	delta_freq = time_freq + ltemp + singleshot_ppm;
+	
+	/* Compute a new value for tb_ticks_per_sec based on the frequency adjustment */
+	den = 1000000 * (1 << (SHIFT_USEC - 8));
+	if ( delta_freq < 0 ) {
+		tb_ticks_per_sec_delta = ( tb_ticks_per_sec * ( (-delta_freq) >> (SHIFT_USEC - 8))) / den;
+		new_tb_ticks_per_sec = tb_ticks_per_sec + tb_ticks_per_sec_delta;
+	}
+	else {
+		tb_ticks_per_sec_delta = ( tb_ticks_per_sec * ( delta_freq >> (SHIFT_USEC - 8))) / den;
+		new_tb_ticks_per_sec = tb_ticks_per_sec - tb_ticks_per_sec_delta;
+	}
+	
+#ifdef DEBUG_PPC_ADJTIMEX
+	printk("ppc_adjtimex: ltemp = %ld, time_freq = %ld, singleshot_ppm = %ld\n", ltemp, time_freq, singleshot_ppm);
+	printk("ppc_adjtimex: tb_ticks_per_sec - base = %ld  new = %ld\n", tb_ticks_per_sec, new_tb_ticks_per_sec);
+#endif
+				
+	/* Compute a new value of tb_to_xs (used to convert tb to microseconds and a new value of 
+	   stamp_xsec which is the time (in 1/2^20 second units) corresponding to tb_orig_stamp.  This 
+	   new value of stamp_xsec compensates for the change in frequency (implied by the new tb_to_xs)
+	   which guarantees that the current time remains the same */ 
+	write_seqlock_irqsave( &xtime_lock, flags );
+	tb_ticks = get_tb() - do_gtod.varp->tb_orig_stamp;
+	div128_by_32( 1024*1024, 0, new_tb_ticks_per_sec, &divres );
+	new_tb_to_xs = divres.result_low;
+	new_xsec = mulhdu( tb_ticks, new_tb_to_xs );
+
+	old_xsec = mulhdu( tb_ticks, do_gtod.varp->tb_to_xs );
+	new_stamp_xsec = do_gtod.varp->stamp_xsec + old_xsec - new_xsec;
+
+	/* There are two copies of tb_to_xs and stamp_xsec so that no lock is needed to access and use these
+	   values in do_gettimeofday.  We alternate the copies and as long as a reasonable time elapses between
+	   changes, there will never be inconsistent values.  ntpd has a minimum of one minute between updates */
+
+	temp_idx = (do_gtod.var_idx == 0);
+	temp_varp = &do_gtod.vars[temp_idx];
+
+	temp_varp->tb_to_xs = new_tb_to_xs;
+	temp_varp->stamp_xsec = new_stamp_xsec;
+	temp_varp->tb_orig_stamp = do_gtod.varp->tb_orig_stamp;
+	mb();
+	do_gtod.varp = temp_varp;
+	do_gtod.var_idx = temp_idx;
+
+	/*
+	 * tb_update_count is used to allow the problem state gettimeofday code
+	 * to assure itself that it sees a consistent view of the tb_to_xs and
+	 * stamp_xsec variables.  It reads the tb_update_count, then reads
+	 * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
+	 * the two values of tb_update_count match and are even then the
+	 * tb_to_xs and stamp_xsec values are consistent.  If not, then it
+	 * loops back and reads them again until this criteria is met.
+	 */
+	++(systemcfg->tb_update_count);
+	wmb();
+	systemcfg->tb_to_xs = new_tb_to_xs;
+	systemcfg->stamp_xsec = new_stamp_xsec;
+	wmb();
+	++(systemcfg->tb_update_count);
+
+	write_sequnlock_irqrestore( &xtime_lock, flags );
+
+}
+
+
+#define TICK_SIZE tick
+#define FEBRUARY	2
+#define	STARTOFTIME	1970
+#define SECDAY		86400L
+#define SECYR		(SECDAY * 365)
+#define	leapyear(year)		((year) % 4 == 0)
+#define	days_in_year(a) 	(leapyear(a) ? 366 : 365)
+#define	days_in_month(a) 	(month_days[(a) - 1])
+
+static int month_days[12] = {
+	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+/*
+ * This only works for the Gregorian calendar - i.e. after 1752 (in the UK)
+ */
+void GregorianDay(struct rtc_time * tm)
+{
+	int leapsToDate;
+	int lastYear;
+	int day;
+	int MonthOffset[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 };
+
+	lastYear=tm->tm_year-1;
+
+	/*
+	 * Number of leap corrections to apply up to end of last year
+	 */
+	leapsToDate = lastYear/4 - lastYear/100 + lastYear/400;
+
+	/*
+	 * This year is a leap year if it is divisible by 4 except when it is
+	 * divisible by 100 unless it is divisible by 400
+	 *
+	 * e.g. 1904 was a leap year, 1900 was not, 1996 is, and 2000 will be
+	 */
+	if((tm->tm_year%4==0) &&
+	   ((tm->tm_year%100!=0) || (tm->tm_year%400==0)) &&
+	   (tm->tm_mon>2))
+	{
+		/*
+		 * We are past Feb. 29 in a leap year
+		 */
+		day=1;
+	}
+	else
+	{
+		day=0;
+	}
+
+	day += lastYear*365 + leapsToDate + MonthOffset[tm->tm_mon-1] +
+		   tm->tm_mday;
+
+	tm->tm_wday=day%7;
+}
+
+void to_tm(int tim, struct rtc_time * tm)
+{
+	register int    i;
+	register long   hms, day;
+
+	day = tim / SECDAY;
+	hms = tim % SECDAY;
+
+	/* Hours, minutes, seconds are easy */
+	tm->tm_hour = hms / 3600;
+	tm->tm_min = (hms % 3600) / 60;
+	tm->tm_sec = (hms % 3600) % 60;
+
+	/* Number of years in days */
+	for (i = STARTOFTIME; day >= days_in_year(i); i++)
+		day -= days_in_year(i);
+	tm->tm_year = i;
+
+	/* Number of months in days left */
+	if (leapyear(tm->tm_year))
+		days_in_month(FEBRUARY) = 29;
+	for (i = 1; day >= days_in_month(i); i++)
+		day -= days_in_month(i);
+	days_in_month(FEBRUARY) = 28;
+	tm->tm_mon = i;
+
+	/* Days are what is left over (+1) from all that. */
+	tm->tm_mday = day + 1;
+
+	/*
+	 * Determine the day of week
+	 */
+	GregorianDay(tm);
+}
+
+/* Auxiliary function to compute scaling factors */
+/* Actually the choice of a timebase running at 1/4 the of the bus
+ * frequency giving resolution of a few tens of nanoseconds is quite nice.
+ * It makes this computation very precise (27-28 bits typically) which
+ * is optimistic considering the stability of most processor clock
+ * oscillators and the precision with which the timebase frequency
+ * is measured but does not harm.
+ */
+unsigned mulhwu_scale_factor(unsigned inscale, unsigned outscale) {
+        unsigned mlt=0, tmp, err;
+        /* No concern for performance, it's done once: use a stupid
+         * but safe and compact method to find the multiplier.
+         */
+  
+        for (tmp = 1U<<31; tmp != 0; tmp >>= 1) {
+                if (mulhwu(inscale, mlt|tmp) < outscale) mlt|=tmp;
+        }
+  
+        /* We might still be off by 1 for the best approximation.
+         * A side effect of this is that if outscale is too large
+         * the returned value will be zero.
+         * Many corner cases have been checked and seem to work,
+         * some might have been forgotten in the test however.
+         */
+  
+        err = inscale*(mlt+1);
+        if (err <= inscale/2) mlt++;
+        return mlt;
+  }
+
+/*
+ * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
+ * result.
+ */
+
+void div128_by_32( unsigned long dividend_high, unsigned long dividend_low,
+		   unsigned divisor, struct div_result *dr )
+{
+	unsigned long a,b,c,d, w,x,y,z, ra,rb,rc;
+
+	a = dividend_high >> 32;
+	b = dividend_high & 0xffffffff;
+	c = dividend_low >> 32;
+	d = dividend_low & 0xffffffff;
+
+	w = a/divisor;
+	ra = (a - (w * divisor)) << 32;
+
+	x = (ra + b)/divisor;
+	rb = ((ra + b) - (x * divisor)) << 32;
+
+	y = (rb + c)/divisor;
+	rc = ((rb + b) - (y * divisor)) << 32;
+
+	z = (rc + d)/divisor;
+
+	dr->result_high = (w << 32) + x;
+	dr->result_low  = (y << 32) + z;
+
+}
+
diff --git a/arch/ppc64/kernel/traps.c b/arch/ppc64/kernel/traps.c
new file mode 100644
index 00000000000..10fc61f3f6a
--- /dev/null
+++ b/arch/ppc64/kernel/traps.c
@@ -0,0 +1,565 @@
+/*
+ *  linux/arch/ppc64/kernel/traps.c
+ *
+ *  Copyright (C) 1995-1996  Gary Thomas (gdt@linuxppc.org)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ *  Modified by Cort Dougan (cort@cs.nmt.edu)
+ *  and Paul Mackerras (paulus@cs.anu.edu.au)
+ */
+
+/*
+ * This file handles the architecture-dependent parts of hardware exceptions
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <asm/kdebug.h>
+
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/ppcdebug.h>
+#include <asm/rtas.h>
+#include <asm/systemcfg.h>
+#include <asm/machdep.h>
+#include <asm/pmc.h>
+
+#ifdef CONFIG_DEBUGGER
+int (*__debugger)(struct pt_regs *regs);
+int (*__debugger_ipi)(struct pt_regs *regs);
+int (*__debugger_bpt)(struct pt_regs *regs);
+int (*__debugger_sstep)(struct pt_regs *regs);
+int (*__debugger_iabr_match)(struct pt_regs *regs);
+int (*__debugger_dabr_match)(struct pt_regs *regs);
+int (*__debugger_fault_handler)(struct pt_regs *regs);
+
+EXPORT_SYMBOL(__debugger);
+EXPORT_SYMBOL(__debugger_ipi);
+EXPORT_SYMBOL(__debugger_bpt);
+EXPORT_SYMBOL(__debugger_sstep);
+EXPORT_SYMBOL(__debugger_iabr_match);
+EXPORT_SYMBOL(__debugger_dabr_match);
+EXPORT_SYMBOL(__debugger_fault_handler);
+#endif
+
+struct notifier_block *ppc64_die_chain;
+static DEFINE_SPINLOCK(die_notifier_lock);
+
+int register_die_notifier(struct notifier_block *nb)
+{
+	int err = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&die_notifier_lock, flags);
+	err = notifier_chain_register(&ppc64_die_chain, nb);
+	spin_unlock_irqrestore(&die_notifier_lock, flags);
+	return err;
+}
+
+/*
+ * Trap & Exception support
+ */
+
+static DEFINE_SPINLOCK(die_lock);
+
+int die(const char *str, struct pt_regs *regs, long err)
+{
+	static int die_counter;
+	int nl = 0;
+
+	if (debugger(regs))
+		return 1;
+
+	console_verbose();
+	spin_lock_irq(&die_lock);
+	bust_spinlocks(1);
+	printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
+#ifdef CONFIG_PREEMPT
+	printk("PREEMPT ");
+	nl = 1;
+#endif
+#ifdef CONFIG_SMP
+	printk("SMP NR_CPUS=%d ", NR_CPUS);
+	nl = 1;
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk("DEBUG_PAGEALLOC ");
+	nl = 1;
+#endif
+#ifdef CONFIG_NUMA
+	printk("NUMA ");
+	nl = 1;
+#endif
+	switch(systemcfg->platform) {
+		case PLATFORM_PSERIES:
+			printk("PSERIES ");
+			nl = 1;
+			break;
+		case PLATFORM_PSERIES_LPAR:
+			printk("PSERIES LPAR ");
+			nl = 1;
+			break;
+		case PLATFORM_ISERIES_LPAR:
+			printk("ISERIES LPAR ");
+			nl = 1;
+			break;
+		case PLATFORM_POWERMAC:
+			printk("POWERMAC ");
+			nl = 1;
+			break;
+	}
+	if (nl)
+		printk("\n");
+	print_modules();
+	show_regs(regs);
+	bust_spinlocks(0);
+	spin_unlock_irq(&die_lock);
+
+	if (in_interrupt())
+		panic("Fatal exception in interrupt");
+
+	if (panic_on_oops) {
+		printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
+		ssleep(5);
+		panic("Fatal exception");
+	}
+	do_exit(SIGSEGV);
+
+	return 0;
+}
+
+void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
+{
+	siginfo_t info;
+
+	if (!user_mode(regs)) {
+		if (die("Exception in kernel mode", regs, signr))
+			return;
+	}
+
+	memset(&info, 0, sizeof(info));
+	info.si_signo = signr;
+	info.si_code = code;
+	info.si_addr = (void __user *) addr;
+	force_sig_info(signr, &info, current);
+}
+
+void system_reset_exception(struct pt_regs *regs)
+{
+	/* See if any machine dependent calls */
+	if (ppc_md.system_reset_exception)
+		ppc_md.system_reset_exception(regs);
+
+	die("System Reset", regs, 0);
+
+	/* Must die if the interrupt is not recoverable */
+	if (!(regs->msr & MSR_RI))
+		panic("Unrecoverable System Reset");
+
+	/* What should we do here? We could issue a shutdown or hard reset. */
+}
+
+void machine_check_exception(struct pt_regs *regs)
+{
+	int recover = 0;
+
+	/* See if any machine dependent calls */
+	if (ppc_md.machine_check_exception)
+		recover = ppc_md.machine_check_exception(regs);
+
+	if (recover)
+		return;
+
+	if (debugger_fault_handler(regs))
+		return;
+	die("Machine check", regs, 0);
+
+	/* Must die if the interrupt is not recoverable */
+	if (!(regs->msr & MSR_RI))
+		panic("Unrecoverable Machine check");
+}
+
+void unknown_exception(struct pt_regs *regs)
+{
+	printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
+	       regs->nip, regs->msr, regs->trap);
+
+	_exception(SIGTRAP, regs, 0, 0);
+}
+
+void instruction_breakpoint_exception(struct pt_regs *regs)
+{
+	if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5,
+					5, SIGTRAP) == NOTIFY_STOP)
+		return;
+	if (debugger_iabr_match(regs))
+		return;
+	_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
+}
+
+void single_step_exception(struct pt_regs *regs)
+{
+	regs->msr &= ~MSR_SE;  /* Turn off 'trace' bit */
+
+	if (notify_die(DIE_SSTEP, "single_step", regs, 5,
+					5, SIGTRAP) == NOTIFY_STOP)
+		return;
+	if (debugger_sstep(regs))
+		return;
+
+	_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+}
+
+/*
+ * After we have successfully emulated an instruction, we have to
+ * check if the instruction was being single-stepped, and if so,
+ * pretend we got a single-step exception.  This was pointed out
+ * by Kumar Gala.  -- paulus
+ */
+static inline void emulate_single_step(struct pt_regs *regs)
+{
+	if (regs->msr & MSR_SE)
+		single_step_exception(regs);
+}
+
+static void parse_fpe(struct pt_regs *regs)
+{
+	int code = 0;
+	unsigned long fpscr;
+
+	flush_fp_to_thread(current);
+
+	fpscr = current->thread.fpscr;
+
+	/* Invalid operation */
+	if ((fpscr & FPSCR_VE) && (fpscr & FPSCR_VX))
+		code = FPE_FLTINV;
+
+	/* Overflow */
+	else if ((fpscr & FPSCR_OE) && (fpscr & FPSCR_OX))
+		code = FPE_FLTOVF;
+
+	/* Underflow */
+	else if ((fpscr & FPSCR_UE) && (fpscr & FPSCR_UX))
+		code = FPE_FLTUND;
+
+	/* Divide by zero */
+	else if ((fpscr & FPSCR_ZE) && (fpscr & FPSCR_ZX))
+		code = FPE_FLTDIV;
+
+	/* Inexact result */
+	else if ((fpscr & FPSCR_XE) && (fpscr & FPSCR_XX))
+		code = FPE_FLTRES;
+
+	_exception(SIGFPE, regs, code, regs->nip);
+}
+
+/*
+ * Illegal instruction emulation support.  Return non-zero if we can't
+ * emulate, or -EFAULT if the associated memory access caused an access
+ * fault.  Return zero on success.
+ */
+
+#define INST_MFSPR_PVR		0x7c1f42a6
+#define INST_MFSPR_PVR_MASK	0xfc1fffff
+
+#define INST_DCBA		0x7c0005ec
+#define INST_DCBA_MASK		0x7c0007fe
+
+#define INST_MCRXR		0x7c000400
+#define INST_MCRXR_MASK		0x7c0007fe
+
+static int emulate_instruction(struct pt_regs *regs)
+{
+	unsigned int instword;
+
+	if (!user_mode(regs))
+		return -EINVAL;
+
+	CHECK_FULL_REGS(regs);
+
+	if (get_user(instword, (unsigned int __user *)(regs->nip)))
+		return -EFAULT;
+
+	/* Emulate the mfspr rD, PVR. */
+	if ((instword & INST_MFSPR_PVR_MASK) == INST_MFSPR_PVR) {
+		unsigned int rd;
+
+		rd = (instword >> 21) & 0x1f;
+		regs->gpr[rd] = mfspr(SPRN_PVR);
+		return 0;
+	}
+
+	/* Emulating the dcba insn is just a no-op.  */
+	if ((instword & INST_DCBA_MASK) == INST_DCBA) {
+		static int warned;
+
+		if (!warned) {
+			printk(KERN_WARNING
+			       "process %d (%s) uses obsolete 'dcba' insn\n",
+			       current->pid, current->comm);
+			warned = 1;
+		}
+		return 0;
+	}
+
+	/* Emulate the mcrxr insn.  */
+	if ((instword & INST_MCRXR_MASK) == INST_MCRXR) {
+		static int warned;
+		unsigned int shift;
+
+		if (!warned) {
+			printk(KERN_WARNING
+			       "process %d (%s) uses obsolete 'mcrxr' insn\n",
+			       current->pid, current->comm);
+			warned = 1;
+		}
+
+		shift = (instword >> 21) & 0x1c;
+		regs->ccr &= ~(0xf0000000 >> shift);
+		regs->ccr |= (regs->xer & 0xf0000000) >> shift;
+		regs->xer &= ~0xf0000000;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Look through the list of trap instructions that are used for BUG(),
+ * BUG_ON() and WARN_ON() and see if we hit one.  At this point we know
+ * that the exception was caused by a trap instruction of some kind.
+ * Returns 1 if we should continue (i.e. it was a WARN_ON) or 0
+ * otherwise.
+ */
+extern struct bug_entry __start___bug_table[], __stop___bug_table[];
+
+#ifndef CONFIG_MODULES
+#define module_find_bug(x)	NULL
+#endif
+
+struct bug_entry *find_bug(unsigned long bugaddr)
+{
+	struct bug_entry *bug;
+
+	for (bug = __start___bug_table; bug < __stop___bug_table; ++bug)
+		if (bugaddr == bug->bug_addr)
+			return bug;
+	return module_find_bug(bugaddr);
+}
+
+static int
+check_bug_trap(struct pt_regs *regs)
+{
+	struct bug_entry *bug;
+	unsigned long addr;
+
+	if (regs->msr & MSR_PR)
+		return 0;	/* not in kernel */
+	addr = regs->nip;	/* address of trap instruction */
+	if (addr < PAGE_OFFSET)
+		return 0;
+	bug = find_bug(regs->nip);
+	if (bug == NULL)
+		return 0;
+	if (bug->line & BUG_WARNING_TRAP) {
+		/* this is a WARN_ON rather than BUG/BUG_ON */
+		printk(KERN_ERR "Badness in %s at %s:%d\n",
+		       bug->function, bug->file,
+		      (unsigned int)bug->line & ~BUG_WARNING_TRAP);
+		show_stack(current, (void *)regs->gpr[1]);
+		return 1;
+	}
+	printk(KERN_CRIT "kernel BUG in %s at %s:%d!\n",
+	       bug->function, bug->file, (unsigned int)bug->line);
+	return 0;
+}
+
+void program_check_exception(struct pt_regs *regs)
+{
+	if (debugger_fault_handler(regs))
+		return;
+
+	if (regs->msr & 0x100000) {
+		/* IEEE FP exception */
+		parse_fpe(regs);
+	} else if (regs->msr & 0x20000) {
+		/* trap exception */
+
+		if (notify_die(DIE_BPT, "breakpoint", regs, 5,
+					5, SIGTRAP) == NOTIFY_STOP)
+			return;
+		if (debugger_bpt(regs))
+			return;
+
+		if (check_bug_trap(regs)) {
+			regs->nip += 4;
+			return;
+		}
+		_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
+
+	} else {
+		/* Privileged or illegal instruction; try to emulate it. */
+		switch (emulate_instruction(regs)) {
+		case 0:
+			regs->nip += 4;
+			emulate_single_step(regs);
+			break;
+
+		case -EFAULT:
+			_exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
+			break;
+
+		default:
+			if (regs->msr & 0x40000)
+				/* priveleged */
+				_exception(SIGILL, regs, ILL_PRVOPC, regs->nip);
+			else
+				/* illegal */
+				_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+			break;
+		}
+	}
+}
+
+void kernel_fp_unavailable_exception(struct pt_regs *regs)
+{
+	printk(KERN_EMERG "Unrecoverable FP Unavailable Exception "
+			  "%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable FP Unavailable Exception", regs, SIGABRT);
+}
+
+void altivec_unavailable_exception(struct pt_regs *regs)
+{
+#ifndef CONFIG_ALTIVEC
+	if (user_mode(regs)) {
+		/* A user program has executed an altivec instruction,
+		   but this kernel doesn't support altivec. */
+		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+		return;
+	}
+#endif
+	printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception "
+			  "%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT);
+}
+
+extern perf_irq_t perf_irq;
+
+void performance_monitor_exception(struct pt_regs *regs)
+{
+	perf_irq(regs);
+}
+
+void alignment_exception(struct pt_regs *regs)
+{
+	int fixed;
+
+	fixed = fix_alignment(regs);
+
+	if (fixed == 1) {
+		regs->nip += 4;	/* skip over emulated instruction */
+		emulate_single_step(regs);
+		return;
+	}
+
+	/* Operand address was bad */	
+	if (fixed == -EFAULT) {
+		if (user_mode(regs)) {
+			_exception(SIGSEGV, regs, SEGV_MAPERR, regs->dar);
+		} else {
+			/* Search exception table */
+			bad_page_fault(regs, regs->dar, SIGSEGV);
+		}
+
+		return;
+	}
+
+	_exception(SIGBUS, regs, BUS_ADRALN, regs->nip);
+}
+
+#ifdef CONFIG_ALTIVEC
+void altivec_assist_exception(struct pt_regs *regs)
+{
+	int err;
+	siginfo_t info;
+
+	if (!user_mode(regs)) {
+		printk(KERN_EMERG "VMX/Altivec assist exception in kernel mode"
+		       " at %lx\n", regs->nip);
+		die("Kernel VMX/Altivec assist exception", regs, SIGILL);
+	}
+
+	flush_altivec_to_thread(current);
+
+	err = emulate_altivec(regs);
+	if (err == 0) {
+		regs->nip += 4;		/* skip emulated instruction */
+		emulate_single_step(regs);
+		return;
+	}
+
+	if (err == -EFAULT) {
+		/* got an error reading the instruction */
+		info.si_signo = SIGSEGV;
+		info.si_errno = 0;
+		info.si_code = SEGV_MAPERR;
+		info.si_addr = (void __user *) regs->nip;
+		force_sig_info(SIGSEGV, &info, current);
+	} else {
+		/* didn't recognize the instruction */
+		/* XXX quick hack for now: set the non-Java bit in the VSCR */
+		if (printk_ratelimit())
+			printk(KERN_ERR "Unrecognized altivec instruction "
+			       "in %s at %lx\n", current->comm, regs->nip);
+		current->thread.vscr.u[3] |= 0x10000;
+	}
+}
+#endif /* CONFIG_ALTIVEC */
+
+/*
+ * We enter here if we get an unrecoverable exception, that is, one
+ * that happened at a point where the RI (recoverable interrupt) bit
+ * in the MSR is 0.  This indicates that SRR0/1 are live, and that
+ * we therefore lost state by taking this exception.
+ */
+void unrecoverable_exception(struct pt_regs *regs)
+{
+	printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n",
+	       regs->trap, regs->nip);
+	die("Unrecoverable exception", regs, SIGABRT);
+}
+
+/*
+ * We enter here if we discover during exception entry that we are
+ * running in supervisor mode with a userspace value in the stack pointer.
+ */
+void kernel_bad_stack(struct pt_regs *regs)
+{
+	printk(KERN_EMERG "Bad kernel stack pointer %lx at %lx\n",
+	       regs->gpr[1], regs->nip);
+	die("Bad kernel stack pointer", regs, SIGABRT);
+}
+
+void __init trap_init(void)
+{
+}
diff --git a/arch/ppc64/kernel/u3_iommu.c b/arch/ppc64/kernel/u3_iommu.c
new file mode 100644
index 00000000000..b6e3bca4102
--- /dev/null
+++ b/arch/ppc64/kernel/u3_iommu.c
@@ -0,0 +1,349 @@
+/*
+ * arch/ppc64/kernel/u3_iommu.c
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@austin.ibm.com>, IBM Corporation
+ *
+ * Based on pSeries_iommu.c:
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2004 Olof Johansson <olof@austin.ibm.com>, IBM Corporation
+ *
+ * Dynamic DMA mapping support, Apple U3 & IBM CPC925 "DART" iommu.
+ *
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/ppcdebug.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/cacheflush.h>
+#include <asm/lmb.h>
+
+#include "pci.h"
+
+extern int iommu_force_on;
+
+/* physical base of DART registers */
+#define DART_BASE        0xf8033000UL
+
+/* Offset from base to control register */
+#define DARTCNTL   0
+/* Offset from base to exception register */
+#define DARTEXCP   0x10
+/* Offset from base to TLB tag registers */
+#define DARTTAG    0x1000
+
+
+/* Control Register fields */
+
+/* base address of table (pfn) */
+#define DARTCNTL_BASE_MASK    0xfffff
+#define DARTCNTL_BASE_SHIFT   12
+
+#define DARTCNTL_FLUSHTLB     0x400
+#define DARTCNTL_ENABLE       0x200
+
+/* size of table in pages */
+#define DARTCNTL_SIZE_MASK    0x1ff
+#define DARTCNTL_SIZE_SHIFT   0
+
+/* DART table fields */
+#define DARTMAP_VALID   0x80000000
+#define DARTMAP_RPNMASK 0x00ffffff
+
+/* Physical base address and size of the DART table */
+unsigned long dart_tablebase; /* exported to htab_initialize */
+static unsigned long dart_tablesize;
+
+/* Virtual base address of the DART table */
+static u32 *dart_vbase;
+
+/* Mapped base address for the dart */
+static unsigned int *dart; 
+
+/* Dummy val that entries are set to when unused */
+static unsigned int dart_emptyval;
+
+static struct iommu_table iommu_table_u3;
+static int iommu_table_u3_inited;
+static int dart_dirty;
+
+#define DBG(...)
+
+static inline void dart_tlb_invalidate_all(void)
+{
+	unsigned long l = 0;
+	unsigned int reg;
+	unsigned long limit;
+
+	DBG("dart: flush\n");
+
+	/* To invalidate the DART, set the DARTCNTL_FLUSHTLB bit in the
+	 * control register and wait for it to clear.
+	 *
+	 * Gotcha: Sometimes, the DART won't detect that the bit gets
+	 * set. If so, clear it and set it again.
+	 */ 
+
+	limit = 0;
+
+retry:
+	reg = in_be32((unsigned int *)dart+DARTCNTL);
+	reg |= DARTCNTL_FLUSHTLB;
+	out_be32((unsigned int *)dart+DARTCNTL, reg);
+
+	l = 0;
+	while ((in_be32((unsigned int *)dart+DARTCNTL) & DARTCNTL_FLUSHTLB) &&
+		l < (1L<<limit)) {
+		l++;
+	}
+	if (l == (1L<<limit)) {
+		if (limit < 4) {
+			limit++;
+		        reg = in_be32((unsigned int *)dart+DARTCNTL);
+		        reg &= ~DARTCNTL_FLUSHTLB;
+		        out_be32((unsigned int *)dart+DARTCNTL, reg);
+			goto retry;
+		} else
+			panic("U3-DART: TLB did not flush after waiting a long "
+			      "time. Buggy U3 ?");
+	}
+}
+
+static void dart_flush(struct iommu_table *tbl)
+{
+	if (dart_dirty)
+		dart_tlb_invalidate_all();
+	dart_dirty = 0;
+}
+
+static void dart_build(struct iommu_table *tbl, long index, 
+		       long npages, unsigned long uaddr,
+		       enum dma_data_direction direction)
+{
+	unsigned int *dp;
+	unsigned int rpn;
+
+	DBG("dart: build at: %lx, %lx, addr: %x\n", index, npages, uaddr);
+
+	dp = ((unsigned int*)tbl->it_base) + index;
+	
+	/* On U3, all memory is contigous, so we can move this
+	 * out of the loop.
+	 */
+	while (npages--) {
+		rpn = virt_to_abs(uaddr) >> PAGE_SHIFT;
+
+		*(dp++) = DARTMAP_VALID | (rpn & DARTMAP_RPNMASK);
+
+		rpn++;
+		uaddr += PAGE_SIZE;
+	}
+
+	dart_dirty = 1;
+}
+
+
+static void dart_free(struct iommu_table *tbl, long index, long npages)
+{
+	unsigned int *dp;
+	
+	/* We don't worry about flushing the TLB cache. The only drawback of
+	 * not doing it is that we won't catch buggy device drivers doing
+	 * bad DMAs, but then no 32-bit architecture ever does either.
+	 */
+
+	DBG("dart: free at: %lx, %lx\n", index, npages);
+
+	dp  = ((unsigned int *)tbl->it_base) + index;
+		
+	while (npages--)
+		*(dp++) = dart_emptyval;
+}
+
+
+static int dart_init(struct device_node *dart_node)
+{
+	unsigned int regword;
+	unsigned int i;
+	unsigned long tmp;
+
+	if (dart_tablebase == 0 || dart_tablesize == 0) {
+		printk(KERN_INFO "U3-DART: table not allocated, using direct DMA\n");
+		return -ENODEV;
+	}
+
+	/* Make sure nothing from the DART range remains in the CPU cache
+	 * from a previous mapping that existed before the kernel took
+	 * over
+	 */
+	flush_dcache_phys_range(dart_tablebase, dart_tablebase + dart_tablesize);
+
+	/* Allocate a spare page to map all invalid DART pages. We need to do
+	 * that to work around what looks like a problem with the HT bridge
+	 * prefetching into invalid pages and corrupting data
+	 */
+	tmp = lmb_alloc(PAGE_SIZE, PAGE_SIZE);
+	if (!tmp)
+		panic("U3-DART: Cannot allocate spare page!");
+	dart_emptyval = DARTMAP_VALID | ((tmp >> PAGE_SHIFT) & DARTMAP_RPNMASK);
+
+	/* Map in DART registers. FIXME: Use device node to get base address */
+	dart = ioremap(DART_BASE, 0x7000);
+	if (dart == NULL)
+		panic("U3-DART: Cannot map registers!");
+
+	/* Set initial control register contents: table base, 
+	 * table size and enable bit
+	 */
+	regword = DARTCNTL_ENABLE | 
+		((dart_tablebase >> PAGE_SHIFT) << DARTCNTL_BASE_SHIFT) |
+		(((dart_tablesize >> PAGE_SHIFT) & DARTCNTL_SIZE_MASK)
+				 << DARTCNTL_SIZE_SHIFT);
+	dart_vbase = ioremap(virt_to_abs(dart_tablebase), dart_tablesize);
+
+	/* Fill initial table */
+	for (i = 0; i < dart_tablesize/4; i++)
+		dart_vbase[i] = dart_emptyval;
+
+	/* Initialize DART with table base and enable it. */
+	out_be32((unsigned int *)dart, regword);
+
+	/* Invalidate DART to get rid of possible stale TLBs */
+	dart_tlb_invalidate_all();
+
+	printk(KERN_INFO "U3/CPC925 DART IOMMU initialized\n");
+
+	return 0;
+}
+
+static void iommu_table_u3_setup(void)
+{
+	iommu_table_u3.it_busno = 0;
+	iommu_table_u3.it_offset = 0;
+	/* it_size is in number of entries */
+	iommu_table_u3.it_size = dart_tablesize / sizeof(u32);
+
+	/* Initialize the common IOMMU code */
+	iommu_table_u3.it_base = (unsigned long)dart_vbase;
+	iommu_table_u3.it_index = 0;
+	iommu_table_u3.it_blocksize = 1;
+	iommu_init_table(&iommu_table_u3);
+
+	/* Reserve the last page of the DART to avoid possible prefetch
+	 * past the DART mapped area
+	 */
+	set_bit(iommu_table_u3.it_size - 1, iommu_table_u3.it_map);
+}
+
+static void iommu_dev_setup_u3(struct pci_dev *dev)
+{
+	struct device_node *dn;
+
+	/* We only have one iommu table on the mac for now, which makes
+	 * things simple. Setup all PCI devices to point to this table
+	 *
+	 * We must use pci_device_to_OF_node() to make sure that
+	 * we get the real "final" pointer to the device in the
+	 * pci_dev sysdata and not the temporary PHB one
+	 */
+	dn = pci_device_to_OF_node(dev);
+
+	if (dn)
+		dn->iommu_table = &iommu_table_u3;
+}
+
+static void iommu_bus_setup_u3(struct pci_bus *bus)
+{
+	struct device_node *dn;
+
+	if (!iommu_table_u3_inited) {
+		iommu_table_u3_inited = 1;
+		iommu_table_u3_setup();
+	}
+
+	dn = pci_bus_to_OF_node(bus);
+
+	if (dn)
+		dn->iommu_table = &iommu_table_u3;
+}
+
+static void iommu_dev_setup_null(struct pci_dev *dev) { }
+static void iommu_bus_setup_null(struct pci_bus *bus) { }
+
+void iommu_init_early_u3(void)
+{
+	struct device_node *dn;
+
+	/* Find the DART in the device-tree */
+	dn = of_find_compatible_node(NULL, "dart", "u3-dart");
+	if (dn == NULL)
+		return;
+
+	/* Setup low level TCE operations for the core IOMMU code */
+	ppc_md.tce_build = dart_build;
+	ppc_md.tce_free  = dart_free;
+	ppc_md.tce_flush = dart_flush;
+
+	/* Initialize the DART HW */
+	if (dart_init(dn)) {
+		/* If init failed, use direct iommu and null setup functions */
+		ppc_md.iommu_dev_setup = iommu_dev_setup_null;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_null;
+
+		/* Setup pci_dma ops */
+		pci_direct_iommu_init();
+	} else {
+		ppc_md.iommu_dev_setup = iommu_dev_setup_u3;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_u3;
+
+		/* Setup pci_dma ops */
+		pci_iommu_init();
+	}
+}
+
+
+void __init alloc_u3_dart_table(void)
+{
+	/* Only reserve DART space if machine has more than 2GB of RAM
+	 * or if requested with iommu=on on cmdline.
+	 */
+	if (lmb_end_of_DRAM() <= 0x80000000ull && !iommu_force_on)
+		return;
+
+	/* 512 pages (2MB) is max DART tablesize. */
+	dart_tablesize = 1UL << 21;
+	/* 16MB (1 << 24) alignment. We allocate a full 16Mb chuck since we
+	 * will blow up an entire large page anyway in the kernel mapping
+	 */
+	dart_tablebase = (unsigned long)
+		abs_to_virt(lmb_alloc_base(1UL<<24, 1UL<<24, 0x80000000L));
+
+	printk(KERN_INFO "U3-DART allocated at: %lx\n", dart_tablebase);
+}
diff --git a/arch/ppc64/kernel/udbg.c b/arch/ppc64/kernel/udbg.c
new file mode 100644
index 00000000000..d4ccd6f1ef4
--- /dev/null
+++ b/arch/ppc64/kernel/udbg.c
@@ -0,0 +1,360 @@
+/*
+ * NS16550 Serial Port (uart) debugging stuff.
+ *
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <stdarg.h>
+#define WANT_PPCDBG_TAB /* Only defined here */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <asm/ppcdebug.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pmac_feature.h>
+
+extern u8 real_readb(volatile u8 __iomem  *addr);
+extern void real_writeb(u8 data, volatile u8 __iomem *addr);
+
+struct NS16550 {
+	/* this struct must be packed */
+	unsigned char rbr;  /* 0 */
+	unsigned char ier;  /* 1 */
+	unsigned char fcr;  /* 2 */
+	unsigned char lcr;  /* 3 */
+	unsigned char mcr;  /* 4 */
+	unsigned char lsr;  /* 5 */
+	unsigned char msr;  /* 6 */
+	unsigned char scr;  /* 7 */
+};
+
+#define thr rbr
+#define iir fcr
+#define dll rbr
+#define dlm ier
+#define dlab lcr
+
+#define LSR_DR   0x01  /* Data ready */
+#define LSR_OE   0x02  /* Overrun */
+#define LSR_PE   0x04  /* Parity error */
+#define LSR_FE   0x08  /* Framing error */
+#define LSR_BI   0x10  /* Break */
+#define LSR_THRE 0x20  /* Xmit holding register empty */
+#define LSR_TEMT 0x40  /* Xmitter empty */
+#define LSR_ERR  0x80  /* Error */
+
+static volatile struct NS16550 __iomem *udbg_comport;
+
+void udbg_init_uart(void __iomem *comport, unsigned int speed)
+{
+	u16 dll = speed ? (115200 / speed) : 12;
+
+	if (comport) {
+		udbg_comport = (struct NS16550 __iomem *)comport;
+		out_8(&udbg_comport->lcr, 0x00);
+		out_8(&udbg_comport->ier, 0xff);
+		out_8(&udbg_comport->ier, 0x00);
+		out_8(&udbg_comport->lcr, 0x80);	/* Access baud rate */
+		out_8(&udbg_comport->dll, dll & 0xff);	/* 1 = 115200,  2 = 57600,
+							   3 = 38400, 12 = 9600 baud */
+		out_8(&udbg_comport->dlm, dll >> 8);	/* dll >> 8 which should be zero
+							   for fast rates; */
+		out_8(&udbg_comport->lcr, 0x03);	/* 8 data, 1 stop, no parity */
+		out_8(&udbg_comport->mcr, 0x03);	/* RTS/DTR */
+		out_8(&udbg_comport->fcr ,0x07);	/* Clear & enable FIFOs */
+	}
+}
+
+#ifdef CONFIG_PPC_PMAC
+
+#define	SCC_TXRDY	4
+#define SCC_RXRDY	1
+
+static volatile u8 __iomem *sccc;
+static volatile u8 __iomem *sccd;
+
+static unsigned char scc_inittab[] = {
+    13, 0,		/* set baud rate divisor */
+    12, 0,
+    14, 1,		/* baud rate gen enable, src=rtxc */
+    11, 0x50,		/* clocks = br gen */
+    5,  0xea,		/* tx 8 bits, assert DTR & RTS */
+    4,  0x46,		/* x16 clock, 1 stop */
+    3,  0xc1,		/* rx enable, 8 bits */
+};
+
+void udbg_init_scc(struct device_node *np)
+{
+	u32 *reg;
+	unsigned long addr;
+	int i, x;
+
+	if (np == NULL)
+		np = of_find_node_by_name(NULL, "escc");
+	if (np == NULL || np->parent == NULL)
+		return;
+
+	udbg_printf("found SCC...\n");
+	/* Get address within mac-io ASIC */ 
+	reg = (u32 *)get_property(np, "reg", NULL);
+	if (reg == NULL)
+		return;
+	addr = reg[0];
+	udbg_printf("local addr: %lx\n", addr);
+	/* Get address of mac-io PCI itself */
+	reg = (u32 *)get_property(np->parent, "assigned-addresses", NULL);
+	if (reg == NULL)
+		return;
+	addr += reg[2];
+	udbg_printf("final addr: %lx\n", addr);
+
+	/* Setup for 57600 8N1 */
+	addr += 0x20;
+	sccc = (volatile u8 * __iomem) ioremap(addr & PAGE_MASK, PAGE_SIZE) ;
+	sccc += addr & ~PAGE_MASK;
+	sccd = sccc + 0x10;
+
+	udbg_printf("ioremap result sccc: %p\n", sccc);
+	mb();
+
+	for (i = 20000; i != 0; --i)
+		x = in_8(sccc);
+	out_8(sccc, 0x09);		/* reset A or B side */
+	out_8(sccc, 0xc0);
+	for (i = 0; i < sizeof(scc_inittab); ++i)
+		out_8(sccc, scc_inittab[i]);
+
+	ppc_md.udbg_putc = udbg_putc;
+	ppc_md.udbg_getc = udbg_getc;
+	ppc_md.udbg_getc_poll = udbg_getc_poll;
+
+	udbg_puts("Hello World !\n");
+}
+
+#endif /* CONFIG_PPC_PMAC */
+
+#if CONFIG_PPC_PMAC
+static void udbg_real_putc(unsigned char c)
+{
+	while ((real_readb(sccc) & SCC_TXRDY) == 0)
+		;
+	real_writeb(c, sccd);
+	if (c == '\n')
+		udbg_real_putc('\r');
+}
+
+void udbg_init_pmac_realmode(void)
+{
+	sccc = (volatile u8 __iomem *)0x80013020ul;
+	sccd = (volatile u8 __iomem *)0x80013030ul;
+
+	ppc_md.udbg_putc = udbg_real_putc;
+	ppc_md.udbg_getc = NULL;
+	ppc_md.udbg_getc_poll = NULL;
+}
+#endif /* CONFIG_PPC_PMAC */
+
+#ifdef CONFIG_PPC_MAPLE
+void udbg_maple_real_putc(unsigned char c)
+{
+	if (udbg_comport) {
+		while ((real_readb(&udbg_comport->lsr) & LSR_THRE) == 0)
+			/* wait for idle */;
+		real_writeb(c, &udbg_comport->thr); eieio();
+		if (c == '\n') {
+			/* Also put a CR.  This is for convenience. */
+			while ((real_readb(&udbg_comport->lsr) & LSR_THRE) == 0)
+				/* wait for idle */;
+			real_writeb('\r', &udbg_comport->thr); eieio();
+		}
+	}
+}
+
+void udbg_init_maple_realmode(void)
+{
+	udbg_comport = (volatile struct NS16550 __iomem *)0xf40003f8;
+
+	ppc_md.udbg_putc = udbg_maple_real_putc;
+	ppc_md.udbg_getc = NULL;
+	ppc_md.udbg_getc_poll = NULL;
+}
+#endif /* CONFIG_PPC_MAPLE */
+
+void udbg_putc(unsigned char c)
+{
+	if (udbg_comport) {
+		while ((in_8(&udbg_comport->lsr) & LSR_THRE) == 0)
+			/* wait for idle */;
+		out_8(&udbg_comport->thr, c);
+		if (c == '\n') {
+			/* Also put a CR.  This is for convenience. */
+			while ((in_8(&udbg_comport->lsr) & LSR_THRE) == 0)
+				/* wait for idle */; 
+			out_8(&udbg_comport->thr, '\r');
+		}
+	}
+#ifdef CONFIG_PPC_PMAC
+	else if (sccc) {
+		while ((in_8(sccc) & SCC_TXRDY) == 0)
+			;
+		out_8(sccd,  c);		
+		if (c == '\n')
+			udbg_putc('\r');
+	}
+#endif /* CONFIG_PPC_PMAC */
+}
+
+int udbg_getc_poll(void)
+{
+	if (udbg_comport) {
+		if ((in_8(&udbg_comport->lsr) & LSR_DR) != 0)
+			return in_8(&udbg_comport->rbr);
+		else
+			return -1;
+	}
+#ifdef CONFIG_PPC_PMAC
+	else if (sccc) {
+		if ((in_8(sccc) & SCC_RXRDY) != 0)
+			return in_8(sccd);
+		else
+			return -1;
+	}
+#endif /* CONFIG_PPC_PMAC */
+	return -1;
+}
+
+unsigned char udbg_getc(void)
+{
+	if (udbg_comport) {
+		while ((in_8(&udbg_comport->lsr) & LSR_DR) == 0)
+			/* wait for char */;
+		return in_8(&udbg_comport->rbr);
+	}
+#ifdef CONFIG_PPC_PMAC
+	else if (sccc) {
+		while ((in_8(sccc) & SCC_RXRDY) == 0)
+			;
+		return in_8(sccd);
+	}
+#endif /* CONFIG_PPC_PMAC */
+	return 0;
+}
+
+void udbg_puts(const char *s)
+{
+	if (ppc_md.udbg_putc) {
+		char c;
+
+		if (s && *s != '\0') {
+			while ((c = *s++) != '\0')
+				ppc_md.udbg_putc(c);
+		}
+	}
+#if 0
+	else {
+		printk("%s", s);
+	}
+#endif
+}
+
+int udbg_write(const char *s, int n)
+{
+	int remain = n;
+	char c;
+
+	if (!ppc_md.udbg_putc)
+		return 0;
+
+	if (s && *s != '\0') {
+		while (((c = *s++) != '\0') && (remain-- > 0)) {
+			ppc_md.udbg_putc(c);
+		}
+	}
+
+	return n - remain;
+}
+
+int udbg_read(char *buf, int buflen)
+{
+	char c, *p = buf;
+	int i;
+
+	if (!ppc_md.udbg_getc)
+		return 0;
+
+	for (i = 0; i < buflen; ++i) {
+		do {
+			c = ppc_md.udbg_getc();
+		} while (c == 0x11 || c == 0x13);
+		if (c == 0)
+			break;
+		*p++ = c;
+	}
+
+	return i;
+}
+
+void udbg_console_write(struct console *con, const char *s, unsigned int n)
+{
+	udbg_write(s, n);
+}
+
+#define UDBG_BUFSIZE 256
+void udbg_printf(const char *fmt, ...)
+{
+	unsigned char buf[UDBG_BUFSIZE];
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(buf, UDBG_BUFSIZE, fmt, args);
+	udbg_puts(buf);
+	va_end(args);
+}
+
+/* Special print used by PPCDBG() macro */
+void udbg_ppcdbg(unsigned long debug_flags, const char *fmt, ...)
+{
+	unsigned long active_debugs = debug_flags & ppc64_debug_switch;
+
+	if (active_debugs) {
+		va_list ap;
+		unsigned char buf[UDBG_BUFSIZE];
+		unsigned long i, len = 0;
+
+		for (i=0; i < PPCDBG_NUM_FLAGS; i++) {
+			if (((1U << i) & active_debugs) && 
+			    trace_names[i]) {
+				len += strlen(trace_names[i]); 
+				udbg_puts(trace_names[i]);
+				break;
+			}
+		}
+
+		snprintf(buf, UDBG_BUFSIZE, " [%s]: ", current->comm);
+		len += strlen(buf); 
+		udbg_puts(buf);
+
+		while (len < 18) {
+			udbg_puts(" ");
+			len++;
+		}
+
+		va_start(ap, fmt);
+		vsnprintf(buf, UDBG_BUFSIZE, fmt, ap);
+		udbg_puts(buf);
+		va_end(ap);
+	}
+}
+
+unsigned long udbg_ifdebug(unsigned long flags)
+{
+	return (flags & ppc64_debug_switch);
+}
diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c
new file mode 100644
index 00000000000..8c4597224b7
--- /dev/null
+++ b/arch/ppc64/kernel/vdso.c
@@ -0,0 +1,614 @@
+/*
+ *  linux/arch/ppc64/kernel/vdso.c
+ *
+ *    Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
+ *			 <benh@kernel.crashing.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/security.h>
+#include <linux/bootmem.h>
+
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/vdso.h>
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(fmt...) printk(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+
+/*
+ * The vDSOs themselves are here
+ */
+extern char vdso64_start, vdso64_end;
+extern char vdso32_start, vdso32_end;
+
+static void *vdso64_kbase = &vdso64_start;
+static void *vdso32_kbase = &vdso32_start;
+
+unsigned int vdso64_pages;
+unsigned int vdso32_pages;
+
+/* Signal trampolines user addresses */
+
+unsigned long vdso64_rt_sigtramp;
+unsigned long vdso32_sigtramp;
+unsigned long vdso32_rt_sigtramp;
+
+/* Format of the patch table */
+struct vdso_patch_def
+{
+	u32		pvr_mask, pvr_value;
+	const char	*gen_name;
+	const char	*fix_name;
+};
+
+/* Table of functions to patch based on the CPU type/revision
+ *
+ * TODO: Improve by adding whole lists for each entry
+ */
+static struct vdso_patch_def vdso_patches[] = {
+	{
+		0xffff0000, 0x003a0000,		/* POWER5 */
+		"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
+	},
+	{
+		0xffff0000, 0x003b0000,		/* POWER5 */
+		"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
+	},
+};
+
+/*
+ * Some infos carried around for each of them during parsing at
+ * boot time.
+ */
+struct lib32_elfinfo
+{
+	Elf32_Ehdr	*hdr;		/* ptr to ELF */
+	Elf32_Sym	*dynsym;	/* ptr to .dynsym section */
+	unsigned long	dynsymsize;	/* size of .dynsym section */
+	char		*dynstr;	/* ptr to .dynstr section */
+	unsigned long	text;		/* offset of .text section in .so */
+};
+
+struct lib64_elfinfo
+{
+	Elf64_Ehdr	*hdr;
+	Elf64_Sym	*dynsym;
+	unsigned long	dynsymsize;
+	char		*dynstr;
+	unsigned long	text;
+};
+
+
+#ifdef __DEBUG
+static void dump_one_vdso_page(struct page *pg, struct page *upg)
+{
+	printk("kpg: %p (c:%d,f:%08lx)", __va(page_to_pfn(pg) << PAGE_SHIFT),
+	       page_count(pg),
+	       pg->flags);
+	if (upg/* && pg != upg*/) {
+		printk(" upg: %p (c:%d,f:%08lx)", __va(page_to_pfn(upg) << PAGE_SHIFT),
+		       page_count(upg),
+		       upg->flags);
+	}
+	printk("\n");
+}
+
+static void dump_vdso_pages(struct vm_area_struct * vma)
+{
+	int i;
+
+	if (!vma || test_thread_flag(TIF_32BIT)) {
+		printk("vDSO32 @ %016lx:\n", (unsigned long)vdso32_kbase);
+		for (i=0; i<vdso32_pages; i++) {
+			struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
+			struct page *upg = (vma && vma->vm_mm) ?
+				follow_page(vma->vm_mm, vma->vm_start + i*PAGE_SIZE, 0)
+				: NULL;
+			dump_one_vdso_page(pg, upg);
+		}
+	}
+	if (!vma || !test_thread_flag(TIF_32BIT)) {
+		printk("vDSO64 @ %016lx:\n", (unsigned long)vdso64_kbase);
+		for (i=0; i<vdso64_pages; i++) {
+			struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
+			struct page *upg = (vma && vma->vm_mm) ?
+				follow_page(vma->vm_mm, vma->vm_start + i*PAGE_SIZE, 0)
+				: NULL;
+			dump_one_vdso_page(pg, upg);
+		}
+	}
+}
+#endif /* DEBUG */
+
+/*
+ * Keep a dummy vma_close for now, it will prevent VMA merging.
+ */
+static void vdso_vma_close(struct vm_area_struct * vma)
+{
+}
+
+/*
+ * Our nopage() function, maps in the actual vDSO kernel pages, they will
+ * be mapped read-only by do_no_page(), and eventually COW'ed, either
+ * right away for an initial write access, or by do_wp_page().
+ */
+static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
+				     unsigned long address, int *type)
+{
+	unsigned long offset = address - vma->vm_start;
+	struct page *pg;
+	void *vbase = test_thread_flag(TIF_32BIT) ? vdso32_kbase : vdso64_kbase;
+
+	DBG("vdso_vma_nopage(current: %s, address: %016lx, off: %lx)\n",
+	    current->comm, address, offset);
+
+	if (address < vma->vm_start || address > vma->vm_end)
+		return NOPAGE_SIGBUS;
+
+	/*
+	 * Last page is systemcfg, special handling here, no get_page() a
+	 * this is a reserved page
+	 */
+	if ((vma->vm_end - address) <= PAGE_SIZE)
+		return virt_to_page(systemcfg);
+
+	pg = virt_to_page(vbase + offset);
+	get_page(pg);
+	DBG(" ->page count: %d\n", page_count(pg));
+
+	return pg;
+}
+
+static struct vm_operations_struct vdso_vmops = {
+	.close	= vdso_vma_close,
+	.nopage	= vdso_vma_nopage,
+};
+
+/*
+ * This is called from binfmt_elf, we create the special vma for the
+ * vDSO and insert it into the mm struct tree
+ */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long vdso_pages;
+	unsigned long vdso_base;
+
+	if (test_thread_flag(TIF_32BIT)) {
+		vdso_pages = vdso32_pages;
+		vdso_base = VDSO32_MBASE;
+	} else {
+		vdso_pages = vdso64_pages;
+		vdso_base = VDSO64_MBASE;
+	}
+
+	/* vDSO has a problem and was disabled, just don't "enable" it for the
+	 * process
+	 */
+	if (vdso_pages == 0) {
+		current->thread.vdso_base = 0;
+		return 0;
+	}
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (vma == NULL)
+		return -ENOMEM;
+	if (security_vm_enough_memory(vdso_pages)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return -ENOMEM;
+	}
+	memset(vma, 0, sizeof(*vma));
+
+	/*
+	 * pick a base address for the vDSO in process space. We have a default
+	 * base of 1Mb on which we had a random offset up to 1Mb.
+	 * XXX: Add possibility for a program header to specify that location
+	 */
+	current->thread.vdso_base = vdso_base;
+	/*  + ((unsigned long)vma & 0x000ff000); */
+
+	vma->vm_mm = mm;
+	vma->vm_start = current->thread.vdso_base;
+
+	/*
+	 * the VMA size is one page more than the vDSO since systemcfg
+	 * is mapped in the last one
+	 */
+	vma->vm_end = vma->vm_start + ((vdso_pages + 1) << PAGE_SHIFT);
+
+	/*
+	 * our vma flags don't have VM_WRITE so by default, the process isn't allowed
+	 * to write those pages.
+	 * gdb can break that with ptrace interface, and thus trigger COW on those
+	 * pages but it's then your responsibility to never do that on the "data" page
+	 * of the vDSO or you'll stop getting kernel updates and your nice userland
+	 * gettimeofday will be totally dead. It's fine to use that for setting
+	 * breakpoints in the vDSO code pages though
+	 */
+	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+	vma->vm_flags |= mm->def_flags;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
+	vma->vm_ops = &vdso_vmops;
+
+	down_write(&mm->mmap_sem);
+	insert_vm_struct(mm, vma);
+	mm->total_vm += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	up_write(&mm->mmap_sem);
+
+	return 0;
+}
+
+static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname,
+				  unsigned long *size)
+{
+	Elf32_Shdr *sechdrs;
+	unsigned int i;
+	char *secnames;
+
+	/* Grab section headers and strings so we can tell who is who */
+	sechdrs = (void *)ehdr + ehdr->e_shoff;
+	secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset;
+
+	/* Find the section they want */
+	for (i = 1; i < ehdr->e_shnum; i++) {
+		if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) {
+			if (size)
+				*size = sechdrs[i].sh_size;
+			return (void *)ehdr + sechdrs[i].sh_offset;
+		}
+	}
+	*size = 0;
+	return NULL;
+}
+
+static void * __init find_section64(Elf64_Ehdr *ehdr, const char *secname,
+				  unsigned long *size)
+{
+	Elf64_Shdr *sechdrs;
+	unsigned int i;
+	char *secnames;
+
+	/* Grab section headers and strings so we can tell who is who */
+	sechdrs = (void *)ehdr + ehdr->e_shoff;
+	secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset;
+
+	/* Find the section they want */
+	for (i = 1; i < ehdr->e_shnum; i++) {
+		if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) {
+			if (size)
+				*size = sechdrs[i].sh_size;
+			return (void *)ehdr + sechdrs[i].sh_offset;
+		}
+	}
+	if (size)
+		*size = 0;
+	return NULL;
+}
+
+static Elf32_Sym * __init find_symbol32(struct lib32_elfinfo *lib, const char *symname)
+{
+	unsigned int i;
+	char name[32], *c;
+
+	for (i = 0; i < (lib->dynsymsize / sizeof(Elf32_Sym)); i++) {
+		if (lib->dynsym[i].st_name == 0)
+			continue;
+		strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, 32);
+		c = strchr(name, '@');
+		if (c)
+			*c = 0;
+		if (strcmp(symname, name) == 0)
+			return &lib->dynsym[i];
+	}
+	return NULL;
+}
+
+static Elf64_Sym * __init find_symbol64(struct lib64_elfinfo *lib, const char *symname)
+{
+	unsigned int i;
+	char name[32], *c;
+
+	for (i = 0; i < (lib->dynsymsize / sizeof(Elf64_Sym)); i++) {
+		if (lib->dynsym[i].st_name == 0)
+			continue;
+		strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, 32);
+		c = strchr(name, '@');
+		if (c)
+			*c = 0;
+		if (strcmp(symname, name) == 0)
+			return &lib->dynsym[i];
+	}
+	return NULL;
+}
+
+/* Note that we assume the section is .text and the symbol is relative to
+ * the library base
+ */
+static unsigned long __init find_function32(struct lib32_elfinfo *lib, const char *symname)
+{
+	Elf32_Sym *sym = find_symbol32(lib, symname);
+
+	if (sym == NULL) {
+		printk(KERN_WARNING "vDSO32: function %s not found !\n", symname);
+		return 0;
+	}
+	return sym->st_value - VDSO32_LBASE;
+}
+
+/* Note that we assume the section is .text and the symbol is relative to
+ * the library base
+ */
+static unsigned long __init find_function64(struct lib64_elfinfo *lib, const char *symname)
+{
+	Elf64_Sym *sym = find_symbol64(lib, symname);
+
+	if (sym == NULL) {
+		printk(KERN_WARNING "vDSO64: function %s not found !\n", symname);
+		return 0;
+	}
+#ifdef VDS64_HAS_DESCRIPTORS
+	return *((u64 *)(vdso64_kbase + sym->st_value - VDSO64_LBASE)) - VDSO64_LBASE;
+#else
+	return sym->st_value - VDSO64_LBASE;
+#endif
+}
+
+
+static __init int vdso_do_find_sections(struct lib32_elfinfo *v32,
+					struct lib64_elfinfo *v64)
+{
+	void *sect;
+
+	/*
+	 * Locate symbol tables & text section
+	 */
+
+	v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize);
+	v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL);
+	if (v32->dynsym == NULL || v32->dynstr == NULL) {
+		printk(KERN_ERR "vDSO32: a required symbol section was not found\n");
+		return -1;
+	}
+	sect = find_section32(v32->hdr, ".text", NULL);
+	if (sect == NULL) {
+		printk(KERN_ERR "vDSO32: the .text section was not found\n");
+		return -1;
+	}
+	v32->text = sect - vdso32_kbase;
+
+	v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize);
+	v64->dynstr = find_section64(v64->hdr, ".dynstr", NULL);
+	if (v64->dynsym == NULL || v64->dynstr == NULL) {
+		printk(KERN_ERR "vDSO64: a required symbol section was not found\n");
+		return -1;
+	}
+	sect = find_section64(v64->hdr, ".text", NULL);
+	if (sect == NULL) {
+		printk(KERN_ERR "vDSO64: the .text section was not found\n");
+		return -1;
+	}
+	v64->text = sect - vdso64_kbase;
+
+	return 0;
+}
+
+static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
+					  struct lib64_elfinfo *v64)
+{
+	/*
+	 * Find signal trampolines
+	 */
+
+	vdso64_rt_sigtramp	= find_function64(v64, "__kernel_sigtramp_rt64");
+	vdso32_sigtramp		= find_function32(v32, "__kernel_sigtramp32");
+	vdso32_rt_sigtramp	= find_function32(v32, "__kernel_sigtramp_rt32");
+}
+
+static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64)
+{
+	Elf32_Sym *sym32;
+	Elf64_Sym *sym64;
+
+	sym32 = find_symbol32(v32, "__kernel_datapage_offset");
+	if (sym32 == NULL) {
+		printk(KERN_ERR "vDSO32: Can't find symbol __kernel_datapage_offset !\n");
+		return -1;
+	}
+	*((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
+		(vdso32_pages << PAGE_SHIFT) - (sym32->st_value - VDSO32_LBASE);
+
+       	sym64 = find_symbol64(v64, "__kernel_datapage_offset");
+	if (sym64 == NULL) {
+		printk(KERN_ERR "vDSO64: Can't find symbol __kernel_datapage_offset !\n");
+		return -1;
+	}
+	*((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) =
+		(vdso64_pages << PAGE_SHIFT) - (sym64->st_value - VDSO64_LBASE);
+
+	return 0;
+}
+
+static int vdso_do_func_patch32(struct lib32_elfinfo *v32,
+				struct lib64_elfinfo *v64,
+				const char *orig, const char *fix)
+{
+	Elf32_Sym *sym32_gen, *sym32_fix;
+
+	sym32_gen = find_symbol32(v32, orig);
+	if (sym32_gen == NULL) {
+		printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", orig);
+		return -1;
+	}
+	sym32_fix = find_symbol32(v32, fix);
+	if (sym32_fix == NULL) {
+		printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", fix);
+		return -1;
+	}
+	sym32_gen->st_value = sym32_fix->st_value;
+	sym32_gen->st_size = sym32_fix->st_size;
+	sym32_gen->st_info = sym32_fix->st_info;
+	sym32_gen->st_other = sym32_fix->st_other;
+	sym32_gen->st_shndx = sym32_fix->st_shndx;
+
+	return 0;
+}
+
+static int vdso_do_func_patch64(struct lib32_elfinfo *v32,
+				struct lib64_elfinfo *v64,
+				const char *orig, const char *fix)
+{
+	Elf64_Sym *sym64_gen, *sym64_fix;
+
+	sym64_gen = find_symbol64(v64, orig);
+	if (sym64_gen == NULL) {
+		printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", orig);
+		return -1;
+	}
+	sym64_fix = find_symbol64(v64, fix);
+	if (sym64_fix == NULL) {
+		printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", fix);
+		return -1;
+	}
+	sym64_gen->st_value = sym64_fix->st_value;
+	sym64_gen->st_size = sym64_fix->st_size;
+	sym64_gen->st_info = sym64_fix->st_info;
+	sym64_gen->st_other = sym64_fix->st_other;
+	sym64_gen->st_shndx = sym64_fix->st_shndx;
+
+	return 0;
+}
+
+static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64)
+{
+	u32 pvr;
+	int i;
+
+	pvr = mfspr(SPRN_PVR);
+	for (i = 0; i < ARRAY_SIZE(vdso_patches); i++) {
+		struct vdso_patch_def *patch = &vdso_patches[i];
+		int match = (pvr & patch->pvr_mask) == patch->pvr_value;
+
+		DBG("patch %d (mask: %x, pvr: %x) : %s\n",
+		    i, patch->pvr_mask, patch->pvr_value, match ? "match" : "skip");
+
+		if (!match)
+			continue;
+
+		DBG("replacing %s with %s...\n", patch->gen_name, patch->fix_name);
+
+		/*
+		 * Patch the 32 bits and 64 bits symbols. Note that we do not patch
+		 * the "." symbol on 64 bits. It would be easy to do, but doesn't
+		 * seem to be necessary, patching the OPD symbol is enough.
+		 */
+		vdso_do_func_patch32(v32, v64, patch->gen_name, patch->fix_name);
+		vdso_do_func_patch64(v32, v64, patch->gen_name, patch->fix_name);
+	}
+
+	return 0;
+}
+
+
+static __init int vdso_setup(void)
+{
+	struct lib32_elfinfo	v32;
+	struct lib64_elfinfo	v64;
+
+	v32.hdr = vdso32_kbase;
+	v64.hdr = vdso64_kbase;
+
+	if (vdso_do_find_sections(&v32, &v64))
+		return -1;
+
+	if (vdso_fixup_datapage(&v32, &v64))
+		return -1;
+
+	if (vdso_fixup_alt_funcs(&v32, &v64))
+		return -1;
+
+	vdso_setup_trampolines(&v32, &v64);
+
+	return 0;
+}
+
+void __init vdso_init(void)
+{
+	int i;
+
+	vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
+	vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
+
+	DBG("vdso64_kbase: %p, 0x%x pages, vdso32_kbase: %p, 0x%x pages\n",
+	       vdso64_kbase, vdso64_pages, vdso32_kbase, vdso32_pages);
+
+	/*
+	 * Initialize the vDSO images in memory, that is do necessary
+	 * fixups of vDSO symbols, locate trampolines, etc...
+	 */
+	if (vdso_setup()) {
+		printk(KERN_ERR "vDSO setup failure, not enabled !\n");
+		/* XXX should free pages here ? */
+		vdso64_pages = vdso32_pages = 0;
+		return;
+	}
+
+	/* Make sure pages are in the correct state */
+	for (i = 0; i < vdso64_pages; i++) {
+		struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
+		ClearPageReserved(pg);
+		get_page(pg);
+	}
+	for (i = 0; i < vdso32_pages; i++) {
+		struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
+		ClearPageReserved(pg);
+		get_page(pg);
+	}
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+	return 0;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+	return 0;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+	return NULL;
+}
+
diff --git a/arch/ppc64/kernel/vdso32/Makefile b/arch/ppc64/kernel/vdso32/Makefile
new file mode 100644
index 00000000000..ede2f7e477c
--- /dev/null
+++ b/arch/ppc64/kernel/vdso32/Makefile
@@ -0,0 +1,36 @@
+
+# List of files in the vdso, has to be asm only for now
+
+obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o
+
+# Build rules
+
+targets := $(obj-vdso32) vdso32.so
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+
+EXTRA_CFLAGS := -shared -s -fno-common -fno-builtin
+EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso32.so.1
+EXTRA_AFLAGS := -D__VDSO32__ -s
+
+obj-y += vdso32_wrapper.o
+extra-y += vdso32.lds
+CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+# link rule for the .so file, .lds has to be first
+$(obj)/vdso32.so: $(src)/vdso32.lds $(obj-vdso32)
+	$(call if_changed,vdso32ld)
+
+# assembly rules for the .S files
+$(obj-vdso32): %.o: %.S
+	$(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32ld = VDSO32L $@
+      cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@
+quiet_cmd_vdso32as = VDSO32A $@
+      cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $<
+
diff --git a/arch/ppc64/kernel/vdso32/cacheflush.S b/arch/ppc64/kernel/vdso32/cacheflush.S
new file mode 100644
index 00000000000..c74fddb6afd
--- /dev/null
+++ b/arch/ppc64/kernel/vdso32/cacheflush.S
@@ -0,0 +1,65 @@
+/*
+ * vDSO provided cache flush routines
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org),
+ *                    IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/offsets.h>
+
+	.text
+
+/*
+ * Default "generic" version of __kernel_sync_dicache.
+ *
+ * void __kernel_sync_dicache(unsigned long start, unsigned long end)
+ *
+ * Flushes the data cache & invalidate the instruction cache for the
+ * provided range [start, end[
+ *
+ * Note: all CPUs supported by this kernel have a 128 bytes cache
+ * line size so we don't have to peek that info from the datapage
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache)
+  .cfi_startproc
+	li	r5,127
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	srwi.	r8,r8,7			/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+	mr	r3,r6
+1:	dcbst	0,r3
+	addi	r3,r3,128
+	bdnz	1b
+	sync
+	mtctr	r8
+1:	icbi	0,r6
+	addi	r6,r6,128
+	bdnz	1b
+	isync
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache)
+
+
+/*
+ * POWER5 version of __kernel_sync_dicache
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache_p5)
+  .cfi_startproc
+	sync
+	isync
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache_p5)
+
diff --git a/arch/ppc64/kernel/vdso32/datapage.S b/arch/ppc64/kernel/vdso32/datapage.S
new file mode 100644
index 00000000000..29b6bd32e1f
--- /dev/null
+++ b/arch/ppc64/kernel/vdso32/datapage.S
@@ -0,0 +1,68 @@
+/*
+ * Access to the shared data page by the vDSO & syscall map
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+	.text
+V_FUNCTION_BEGIN(__get_datapage)
+  .cfi_startproc
+	/* We don't want that exposed or overridable as we want other objects
+	 * to be able to bl directly to here
+	 */
+	.protected __get_datapage
+	.hidden __get_datapage
+
+	mflr	r0
+  .cfi_register lr,r0
+
+	bcl	20,31,1f
+	.global	__kernel_datapage_offset;
+__kernel_datapage_offset:
+	.long	0
+1:
+	mflr	r3
+	mtlr	r0
+	lwz	r0,0(r3)
+	add	r3,r0,r3
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__get_datapage)
+
+/*
+ * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
+ *
+ * returns a pointer to the syscall map. the map is agnostic to the
+ * size of "long", unlike kernel bitops, it stores bits from top to
+ * bottom so that memory actually contains a linear bitmap
+ * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of
+ * 32 bits int at N >> 5.
+ */
+V_FUNCTION_BEGIN(__kernel_get_syscall_map)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r4,r3
+	bl	__get_datapage@local
+	mtlr	r12
+	addi	r3,r3,CFG_SYSCALL_MAP32
+	cmpli	cr0,r4,0
+	beqlr
+	li	r0,__NR_syscalls
+	stw	r0,0(r4)
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_get_syscall_map)
diff --git a/arch/ppc64/kernel/vdso32/gettimeofday.S b/arch/ppc64/kernel/vdso32/gettimeofday.S
new file mode 100644
index 00000000000..ca7f415195c
--- /dev/null
+++ b/arch/ppc64/kernel/vdso32/gettimeofday.S
@@ -0,0 +1,139 @@
+/*
+ * Userland implementation of gettimeofday() for 32 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/offsets.h>
+#include <asm/unistd.h>
+
+	.text
+/*
+ * Exact prototype of gettimeofday
+ *
+ * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_gettimeofday)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r10,r3			/* r10 saves tv */
+	mr	r11,r4			/* r11 saves tz */
+	bl	__get_datapage@local	/* get data page */
+	mr	r9, r3			/* datapage ptr in r9 */
+	bl	__do_get_xsec@local	/* get xsec from tb & kernel */
+	bne-	2f			/* out of line -> do syscall */
+
+	/* seconds are xsec >> 20 */
+	rlwinm	r5,r4,12,20,31
+	rlwimi	r5,r3,12,0,19
+	stw	r5,TVAL32_TV_SEC(r10)
+
+	/* get remaining xsec and convert to usec. we scale
+	 * up remaining xsec by 12 bits and get the top 32 bits
+	 * of the multiplication
+	 */
+	rlwinm	r5,r4,12,0,19
+	lis	r6,1000000@h
+	ori	r6,r6,1000000@l
+	mulhwu	r5,r5,r6
+	stw	r5,TVAL32_TV_USEC(r10)
+
+	cmpli	cr0,r11,0		/* check if tz is NULL */
+	beq	1f
+	lwz	r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */
+	lwz	r5,CFG_TZ_DSTTIME(r9)
+	stw	r4,TZONE_TZ_MINWEST(r11)
+	stw	r5,TZONE_TZ_DSTTIME(r11)
+
+1:	mtlr	r12
+	blr
+
+2:	mr	r3,r10
+	mr	r4,r11
+	li	r0,__NR_gettimeofday
+	sc
+	b	1b
+  .cfi_endproc
+V_FUNCTION_END(__kernel_gettimeofday)
+
+/*
+ * This is the core of gettimeofday(), it returns the xsec
+ * value in r3 & r4 and expects the datapage ptr (non clobbered)
+ * in r9. clobbers r0,r4,r5,r6,r7,r8
+*/
+__do_get_xsec:
+  .cfi_startproc
+	/* Check for update count & load values. We use the low
+	 * order 32 bits of the update count
+	 */
+1:	lwz	r8,(CFG_TB_UPDATE_COUNT+4)(r9)
+	andi.	r0,r8,1			/* pending update ? loop */
+	bne-	1b
+	xor	r0,r8,r8		/* create dependency */
+	add	r9,r9,r0
+
+	/* Load orig stamp (offset to TB) */
+	lwz	r5,CFG_TB_ORIG_STAMP(r9)
+	lwz	r6,(CFG_TB_ORIG_STAMP+4)(r9)
+
+	/* Get a stable TB value */
+2:	mftbu	r3
+	mftbl	r4
+	mftbu	r0
+	cmpl	cr0,r3,r0
+	bne-	2b
+
+	/* Substract tb orig stamp. If the high part is non-zero, we jump to the
+	 * slow path which call the syscall. If it's ok, then we have our 32 bits
+	 * tb_ticks value in r7
+	 */
+	subfc	r7,r6,r4
+	subfe.	r0,r5,r3
+	bne-	3f
+
+	/* Load scale factor & do multiplication */
+	lwz	r5,CFG_TB_TO_XS(r9)	/* load values */
+	lwz	r6,(CFG_TB_TO_XS+4)(r9)
+	mulhwu	r4,r7,r5
+	mulhwu	r6,r7,r6
+	mullw	r6,r7,r5
+	addc	r6,r6,r0
+
+	/* At this point, we have the scaled xsec value in r4 + XER:CA
+	 * we load & add the stamp since epoch
+	 */
+	lwz	r5,CFG_STAMP_XSEC(r9)
+	lwz	r6,(CFG_STAMP_XSEC+4)(r9)
+	adde	r4,r4,r6
+	addze	r3,r5
+
+	/* We now have our result in r3,r4. We create a fake dependency
+	 * on that result and re-check the counter
+	 */
+	xor	r0,r4,r4
+	add	r9,r9,r0
+	lwz	r0,(CFG_TB_UPDATE_COUNT+4)(r9)
+        cmpl    cr0,r8,r0		/* check if updated */
+	bne-	1b
+
+	/* Warning ! The caller expects CR:EQ to be set to indicate a
+	 * successful calculation (so it won't fallback to the syscall
+	 * method). We have overriden that CR bit in the counter check,
+	 * but fortunately, the loop exit condition _is_ CR:EQ set, so
+	 * we can exit safely here. If you change this code, be careful
+	 * of that side effect.
+	 */
+3:	blr
+  .cfi_endproc
diff --git a/arch/ppc64/kernel/vdso32/sigtramp.S b/arch/ppc64/kernel/vdso32/sigtramp.S
new file mode 100644
index 00000000000..e0464278191
--- /dev/null
+++ b/arch/ppc64/kernel/vdso32/sigtramp.S
@@ -0,0 +1,300 @@
+/*
+ * Signal trampolines for 32 bits processes in a ppc64 kernel for
+ * use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+	.text
+
+/* The nop here is a hack.  The dwarf2 unwind routines subtract 1 from
+   the return address to get an address in the middle of the presumed
+   call instruction.  Since we don't have a call here, we artifically
+   extend the range covered by the unwind info by adding a nop before
+   the real start.  */
+	nop
+V_FUNCTION_BEGIN(__kernel_sigtramp32)
+.Lsig_start = . - 4
+	li	r0,__NR_sigreturn
+	sc
+.Lsig_end:
+V_FUNCTION_END(__kernel_sigtramp32)
+
+.Lsigrt_start:
+	nop
+V_FUNCTION_BEGIN(__kernel_sigtramp_rt32)
+	li	r0,__NR_rt_sigreturn
+	sc
+.Lsigrt_end:
+V_FUNCTION_END(__kernel_sigtramp_rt32)
+
+	.section .eh_frame,"a",@progbits
+
+/* Register r1 can be found at offset 4 of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define cfa_save \
+  .byte 0x0f;			/* DW_CFA_def_cfa_expression */		\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 RSIZE;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+9:
+
+/* Register REGNO can be found at offset OFS of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define rsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .ifne ofs;								\
+    .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+  .endif;								\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  The VMX reg struct is at offset VREGS of
+   the pt_regs struct.  This macro is for REGNO == 0, and contains
+   'subroutines' that the other macros jump to.  */
+#define vsave_msr0(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit0 */			\
+2:									\
+  .byte 0x40;			/*     DW_OP_lit16 */			\
+  .byte 0x1e;			/*     DW_OP_mul */			\
+3:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x12;			/*     DW_OP_dup */			\
+  .byte 0x23;			/*     DW_OP_plus_uconst */		\
+    .uleb128 33*RSIZE;		/*       msr offset */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x0c; .long 1 << 25;	/*     DW_OP_const4u */			\
+  .byte 0x1a;			/*     DW_OP_and */			\
+  .byte 0x12;			/*     DW_OP_dup, ret 0 if bra taken */	\
+  .byte 0x30;			/*     DW_OP_lit0 */			\
+  .byte 0x29;			/*     DW_OP_eq */			\
+  .byte 0x28; .short 0x7fff;	/*     DW_OP_bra to end */		\
+  .byte 0x13;			/*     DW_OP_drop, pop the 0 */		\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x22;			/*     DW_OP_plus */			\
+  .byte 0x2f; .short 0x7fff;	/*     DW_OP_skip to end */		\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  REGNO is 1 thru 31.  */
+#define vsave_msr1(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit n */			\
+  .byte 0x2f; .short 2b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of
+   the VMX save block.  */
+#define vsave_msr2(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x0a; .short ofs;	/*     DW_OP_const2u */			\
+  .byte 0x2f; .short 3b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* VMX register REGNO is at offset OFS of the VMX save area.  */
+#define vsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+9:
+
+/* This is where the pt_regs pointer can be found on the stack.  */
+#define PTREGS 64+28
+
+/* Size of regs.  */
+#define RSIZE 4
+
+/* This is the offset of the VMX regs.  */
+#define VREGS 48*RSIZE+34*8
+
+/* Describe where general purpose regs are saved.  */
+#define EH_FRAME_GEN \
+  cfa_save;								\
+  rsave ( 0,  0*RSIZE);							\
+  rsave ( 2,  2*RSIZE);							\
+  rsave ( 3,  3*RSIZE);							\
+  rsave ( 4,  4*RSIZE);							\
+  rsave ( 5,  5*RSIZE);							\
+  rsave ( 6,  6*RSIZE);							\
+  rsave ( 7,  7*RSIZE);							\
+  rsave ( 8,  8*RSIZE);							\
+  rsave ( 9,  9*RSIZE);							\
+  rsave (10, 10*RSIZE);							\
+  rsave (11, 11*RSIZE);							\
+  rsave (12, 12*RSIZE);							\
+  rsave (13, 13*RSIZE);							\
+  rsave (14, 14*RSIZE);							\
+  rsave (15, 15*RSIZE);							\
+  rsave (16, 16*RSIZE);							\
+  rsave (17, 17*RSIZE);							\
+  rsave (18, 18*RSIZE);							\
+  rsave (19, 19*RSIZE);							\
+  rsave (20, 20*RSIZE);							\
+  rsave (21, 21*RSIZE);							\
+  rsave (22, 22*RSIZE);							\
+  rsave (23, 23*RSIZE);							\
+  rsave (24, 24*RSIZE);							\
+  rsave (25, 25*RSIZE);							\
+  rsave (26, 26*RSIZE);							\
+  rsave (27, 27*RSIZE);							\
+  rsave (28, 28*RSIZE);							\
+  rsave (29, 29*RSIZE);							\
+  rsave (30, 30*RSIZE);							\
+  rsave (31, 31*RSIZE);							\
+  rsave (67, 32*RSIZE);		/* ap, used as temp for nip */		\
+  rsave (65, 36*RSIZE);		/* lr */				\
+  rsave (70, 38*RSIZE)		/* cr */
+
+/* Describe where the FP regs are saved.  */
+#define EH_FRAME_FP \
+  rsave (32, 48*RSIZE +  0*8);						\
+  rsave (33, 48*RSIZE +  1*8);						\
+  rsave (34, 48*RSIZE +  2*8);						\
+  rsave (35, 48*RSIZE +  3*8);						\
+  rsave (36, 48*RSIZE +  4*8);						\
+  rsave (37, 48*RSIZE +  5*8);						\
+  rsave (38, 48*RSIZE +  6*8);						\
+  rsave (39, 48*RSIZE +  7*8);						\
+  rsave (40, 48*RSIZE +  8*8);						\
+  rsave (41, 48*RSIZE +  9*8);						\
+  rsave (42, 48*RSIZE + 10*8);						\
+  rsave (43, 48*RSIZE + 11*8);						\
+  rsave (44, 48*RSIZE + 12*8);						\
+  rsave (45, 48*RSIZE + 13*8);						\
+  rsave (46, 48*RSIZE + 14*8);						\
+  rsave (47, 48*RSIZE + 15*8);						\
+  rsave (48, 48*RSIZE + 16*8);						\
+  rsave (49, 48*RSIZE + 17*8);						\
+  rsave (50, 48*RSIZE + 18*8);						\
+  rsave (51, 48*RSIZE + 19*8);						\
+  rsave (52, 48*RSIZE + 20*8);						\
+  rsave (53, 48*RSIZE + 21*8);						\
+  rsave (54, 48*RSIZE + 22*8);						\
+  rsave (55, 48*RSIZE + 23*8);						\
+  rsave (56, 48*RSIZE + 24*8);						\
+  rsave (57, 48*RSIZE + 25*8);						\
+  rsave (58, 48*RSIZE + 26*8);						\
+  rsave (59, 48*RSIZE + 27*8);						\
+  rsave (60, 48*RSIZE + 28*8);						\
+  rsave (61, 48*RSIZE + 29*8);						\
+  rsave (62, 48*RSIZE + 30*8);						\
+  rsave (63, 48*RSIZE + 31*8)
+
+/* Describe where the VMX regs are saved.  */
+#ifdef CONFIG_ALTIVEC
+#define EH_FRAME_VMX \
+  vsave_msr0 ( 0);							\
+  vsave_msr1 ( 1);							\
+  vsave_msr1 ( 2);							\
+  vsave_msr1 ( 3);							\
+  vsave_msr1 ( 4);							\
+  vsave_msr1 ( 5);							\
+  vsave_msr1 ( 6);							\
+  vsave_msr1 ( 7);							\
+  vsave_msr1 ( 8);							\
+  vsave_msr1 ( 9);							\
+  vsave_msr1 (10);							\
+  vsave_msr1 (11);							\
+  vsave_msr1 (12);							\
+  vsave_msr1 (13);							\
+  vsave_msr1 (14);							\
+  vsave_msr1 (15);							\
+  vsave_msr1 (16);							\
+  vsave_msr1 (17);							\
+  vsave_msr1 (18);							\
+  vsave_msr1 (19);							\
+  vsave_msr1 (20);							\
+  vsave_msr1 (21);							\
+  vsave_msr1 (22);							\
+  vsave_msr1 (23);							\
+  vsave_msr1 (24);							\
+  vsave_msr1 (25);							\
+  vsave_msr1 (26);							\
+  vsave_msr1 (27);							\
+  vsave_msr1 (28);							\
+  vsave_msr1 (29);							\
+  vsave_msr1 (30);							\
+  vsave_msr1 (31);							\
+  vsave_msr2 (33, 32*16+12);						\
+  vsave      (32, 32*16)
+#else
+#define EH_FRAME_VMX
+#endif
+
+.Lcie:
+	.long .Lcie_end - .Lcie_start
+.Lcie_start:
+	.long 0			/* CIE ID */
+	.byte 1			/* Version number */
+	.string "zR"		/* NUL-terminated augmentation string */
+	.uleb128 4		/* Code alignment factor */
+	.sleb128 -4		/* Data alignment factor */
+	.byte 67		/* Return address register column, ap */
+	.uleb128 1		/* Augmentation value length */
+	.byte 0x1b		/* DW_EH_PE_pcrel | DW_EH_PE_sdata4. */
+	.byte 0x0c,1,0		/* DW_CFA_def_cfa: r1 ofs 0 */
+	.balign 4
+.Lcie_end:
+
+	.long .Lfde0_end - .Lfde0_start
+.Lfde0_start:
+	.long .Lfde0_start - .Lcie	/* CIE pointer. */
+	.long .Lsig_start - .		/* PC start, length */
+	.long .Lsig_end - .Lsig_start
+	.uleb128 0			/* Augmentation */
+	EH_FRAME_GEN
+	EH_FRAME_FP
+	EH_FRAME_VMX
+	.balign 4
+.Lfde0_end:
+
+/* We have a different stack layout for rt_sigreturn.  */
+#undef PTREGS
+#define PTREGS 64+16+128+20+28
+
+	.long .Lfde1_end - .Lfde1_start
+.Lfde1_start:
+	.long .Lfde1_start - .Lcie	/* CIE pointer. */
+	.long .Lsigrt_start - .		/* PC start, length */
+	.long .Lsigrt_end - .Lsigrt_start
+	.uleb128 0			/* Augmentation */
+	EH_FRAME_GEN
+	EH_FRAME_FP
+	EH_FRAME_VMX
+	.balign 4
+.Lfde1_end:
diff --git a/arch/ppc64/kernel/vdso32/vdso32.lds.S b/arch/ppc64/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 00000000000..cca27bd03a5
--- /dev/null
+++ b/arch/ppc64/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,111 @@
+
+/*
+ * This is the infamous ld script for the 32 bits vdso
+ * library
+ */
+#include <asm/vdso.h>
+
+/* Default link addresses for the vDSOs */
+OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc")
+OUTPUT_ARCH(powerpc:common)
+ENTRY(_start)
+
+SECTIONS
+{
+  . = VDSO32_LBASE + SIZEOF_HEADERS;
+  .hash           : { *(.hash) }			:text
+  .dynsym         : { *(.dynsym) }
+  .dynstr         : { *(.dynstr) }
+  .gnu.version    : { *(.gnu.version) }
+  .gnu.version_d  : { *(.gnu.version_d) }
+  .gnu.version_r  : { *(.gnu.version_r) }
+
+  . = ALIGN (16);
+  .text :
+  {
+    *(.text .stub .text.* .gnu.linkonce.t.*)
+  }
+  PROVIDE (__etext = .);
+  PROVIDE (_etext = .);
+  PROVIDE (etext = .);
+
+  /* Other stuff is appended to the text segment: */
+  .rodata		: { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+  .rodata1		: { *(.rodata1) }
+
+  .eh_frame_hdr		: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+  .eh_frame		: { KEEP (*(.eh_frame)) }	:text
+  .gcc_except_table	: { *(.gcc_except_table) }
+  .fixup		: { *(.fixup) }
+
+  .got ALIGN(4)		: { *(.got.plt) *(.got) }
+
+  .dynamic		: { *(.dynamic) }		:text	:dynamic
+
+  _end = .;
+  __end = .;
+  PROVIDE (end = .);
+
+
+  /* Stabs debugging sections are here too
+   */
+  .stab 0 : { *(.stab) }
+  .stabstr 0 : { *(.stabstr) }
+  .stab.excl 0 : { *(.stab.excl) }
+  .stab.exclstr 0 : { *(.stab.exclstr) }
+  .stab.index 0 : { *(.stab.index) }
+  .stab.indexstr 0 : { *(.stab.indexstr) }
+  .comment 0 : { *(.comment) }
+  .debug 0 : { *(.debug) }
+  .line 0 : { *(.line) }
+
+  .debug_srcinfo 0 : { *(.debug_srcinfo) }
+  .debug_sfnames 0 : { *(.debug_sfnames) }
+
+  .debug_aranges 0 : { *(.debug_aranges) }
+  .debug_pubnames 0 : { *(.debug_pubnames) }
+
+  .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
+  .debug_abbrev 0 : { *(.debug_abbrev) }
+  .debug_line 0 : { *(.debug_line) }
+  .debug_frame 0 : { *(.debug_frame) }
+  .debug_str 0 : { *(.debug_str) }
+  .debug_loc 0 : { *(.debug_loc) }
+  .debug_macinfo 0 : { *(.debug_macinfo) }
+
+  .debug_weaknames 0 : { *(.debug_weaknames) }
+  .debug_funcnames 0 : { *(.debug_funcnames) }
+  .debug_typenames 0 : { *(.debug_typenames) }
+  .debug_varnames 0 : { *(.debug_varnames) }
+
+  /DISCARD/ : { *(.note.GNU-stack) }
+  /DISCARD/ : { *(.data .data.* .gnu.linkonce.d.* .sdata*) }
+  /DISCARD/ : { *(.bss .sbss .dynbss .dynsbss) }
+}
+
+
+PHDRS
+{
+  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+  VDSO_VERSION_STRING {
+    global:
+	__kernel_datapage_offset; /* Has to be there for the kernel to find it */
+	__kernel_get_syscall_map;
+	__kernel_gettimeofday;
+	__kernel_sync_dicache;
+	__kernel_sync_dicache_p5;
+	__kernel_sigtramp32;
+	__kernel_sigtramp_rt32;
+    local: *;
+  };
+}
diff --git a/arch/ppc64/kernel/vdso32/vdso32_wrapper.S b/arch/ppc64/kernel/vdso32/vdso32_wrapper.S
new file mode 100644
index 00000000000..76ca28e09d2
--- /dev/null
+++ b/arch/ppc64/kernel/vdso32/vdso32_wrapper.S
@@ -0,0 +1,13 @@
+#include <linux/init.h>
+#include <asm/page.h>
+
+	.section ".data.page_aligned"
+
+	.globl vdso32_start, vdso32_end
+	.balign PAGE_SIZE
+vdso32_start:
+	.incbin "arch/ppc64/kernel/vdso32/vdso32.so"
+	.balign PAGE_SIZE
+vdso32_end:
+
+	.previous
diff --git a/arch/ppc64/kernel/vdso64/Makefile b/arch/ppc64/kernel/vdso64/Makefile
new file mode 100644
index 00000000000..bd3f70b1a38
--- /dev/null
+++ b/arch/ppc64/kernel/vdso64/Makefile
@@ -0,0 +1,35 @@
+# List of files in the vdso, has to be asm only for now
+
+obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o
+
+# Build rules
+
+targets := $(obj-vdso64) vdso64.so
+obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
+
+EXTRA_CFLAGS := -shared -s -fno-common -fno-builtin
+EXTRA_CFLAGS +=  -nostdlib -Wl,-soname=linux-vdso64.so.1
+EXTRA_AFLAGS := -D__VDSO64__ -s
+
+obj-y += vdso64_wrapper.o
+extra-y += vdso64.lds
+CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
+
+# Force dependency (incbin is bad)
+$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
+
+# link rule for the .so file, .lds has to be first
+$(obj)/vdso64.so: $(src)/vdso64.lds $(obj-vdso64)
+	$(call if_changed,vdso64ld)
+
+# assembly rules for the .S files
+$(obj-vdso64): %.o: %.S
+	$(call if_changed_dep,vdso64as)
+
+# actual build commands
+quiet_cmd_vdso64ld = VDSO64L $@
+      cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@
+quiet_cmd_vdso64as = VDSO64A $@
+      cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $<
+
+
diff --git a/arch/ppc64/kernel/vdso64/cacheflush.S b/arch/ppc64/kernel/vdso64/cacheflush.S
new file mode 100644
index 00000000000..d9696ffcf33
--- /dev/null
+++ b/arch/ppc64/kernel/vdso64/cacheflush.S
@@ -0,0 +1,64 @@
+/*
+ * vDSO provided cache flush routines
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org),
+ *                    IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/offsets.h>
+
+	.text
+
+/*
+ * Default "generic" version of __kernel_sync_dicache.
+ *
+ * void __kernel_sync_dicache(unsigned long start, unsigned long end)
+ *
+ * Flushes the data cache & invalidate the instruction cache for the
+ * provided range [start, end[
+ *
+ * Note: all CPUs supported by this kernel have a 128 bytes cache
+ * line size so we don't have to peek that info from the datapage
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache)
+  .cfi_startproc
+	li	r5,127
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	srwi.	r8,r8,7			/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+	mr	r3,r6
+1:	dcbst	0,r3
+	addi	r3,r3,128
+	bdnz	1b
+	sync
+	mtctr	r8
+1:	icbi	0,r6
+	addi	r6,r6,128
+	bdnz	1b
+	isync
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache)
+
+
+/*
+ * POWER5 version of __kernel_sync_dicache
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache_p5)
+  .cfi_startproc
+	sync
+	isync
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache_p5)
diff --git a/arch/ppc64/kernel/vdso64/datapage.S b/arch/ppc64/kernel/vdso64/datapage.S
new file mode 100644
index 00000000000..18afd971c9d
--- /dev/null
+++ b/arch/ppc64/kernel/vdso64/datapage.S
@@ -0,0 +1,68 @@
+/*
+ * Access to the shared data page by the vDSO & syscall map
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/offsets.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+	.text
+V_FUNCTION_BEGIN(__get_datapage)
+  .cfi_startproc
+	/* We don't want that exposed or overridable as we want other objects
+	 * to be able to bl directly to here
+	 */
+	.protected __get_datapage
+	.hidden __get_datapage
+
+	mflr	r0
+  .cfi_register lr,r0
+
+	bcl	20,31,1f
+	.global	__kernel_datapage_offset;
+__kernel_datapage_offset:
+	.long	0
+1:
+	mflr	r3
+	mtlr	r0
+	lwz	r0,0(r3)
+	add	r3,r0,r3
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__get_datapage)
+
+/*
+ * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
+ *
+ * returns a pointer to the syscall map. the map is agnostic to the
+ * size of "long", unlike kernel bitops, it stores bits from top to
+ * bottom so that memory actually contains a linear bitmap
+ * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of
+ * 32 bits int at N >> 5.
+ */
+V_FUNCTION_BEGIN(__kernel_get_syscall_map)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r4,r3
+	bl	V_LOCAL_FUNC(__get_datapage)
+	mtlr	r12
+	addi	r3,r3,CFG_SYSCALL_MAP64
+	cmpli	cr0,r4,0
+	beqlr
+	li	r0,__NR_syscalls
+	stw	r0,0(r4)
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_get_syscall_map)
diff --git a/arch/ppc64/kernel/vdso64/gettimeofday.S b/arch/ppc64/kernel/vdso64/gettimeofday.S
new file mode 100644
index 00000000000..ed3f970ff05
--- /dev/null
+++ b/arch/ppc64/kernel/vdso64/gettimeofday.S
@@ -0,0 +1,91 @@
+/*
+ * Userland implementation of gettimeofday() for 64 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org),
+ *                    IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/offsets.h>
+
+	.text
+/*
+ * Exact prototype of gettimeofday
+ *
+ * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_gettimeofday)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r11,r3			/* r11 holds tv */
+	mr	r10,r4			/* r10 holds tz */
+	bl	V_LOCAL_FUNC(__get_datapage)		/* get data page */
+	bl	V_LOCAL_FUNC(__do_get_xsec)		/* get xsec from tb & kernel */
+	lis     r7,15			/* r7 = 1000000 = USEC_PER_SEC */
+	ori     r7,r7,16960
+	rldicl  r5,r4,44,20		/* r5 = sec = xsec / XSEC_PER_SEC */
+	rldicr  r6,r5,20,43		/* r6 = sec * XSEC_PER_SEC */
+	std	r5,TVAL64_TV_SEC(r11)	/* store sec in tv */
+	subf	r0,r6,r4		/* r0 = xsec = (xsec - r6) */
+	mulld   r0,r0,r7		/* usec = (xsec * USEC_PER_SEC) / XSEC_PER_SEC */
+	rldicl  r0,r0,44,20
+	cmpldi	cr0,r10,0		/* check if tz is NULL */
+	std	r0,TVAL64_TV_USEC(r11)	/* store usec in tv */
+	beq	1f
+	lwz	r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */
+	lwz	r5,CFG_TZ_DSTTIME(r3)
+	stw	r4,TZONE_TZ_MINWEST(r10)
+	stw	r5,TZONE_TZ_DSTTIME(r10)
+1:	mtlr	r12
+	li	r3,0			/* always success */
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_gettimeofday)
+
+
+/*
+ * This is the core of gettimeofday(), it returns the xsec
+ * value in r4 and expects the datapage ptr (non clobbered)
+ * in r3. clobbers r0,r4,r5,r6,r7,r8
+*/
+V_FUNCTION_BEGIN(__do_get_xsec)
+  .cfi_startproc
+	/* check for update count & load values */
+1:	ld	r7,CFG_TB_UPDATE_COUNT(r3)
+	andi.	r0,r4,1			/* pending update ? loop */
+	bne-	1b
+	xor	r0,r4,r4		/* create dependency */
+	add	r3,r3,r0
+
+	/* Get TB & offset it */
+	mftb	r8
+	ld	r9,CFG_TB_ORIG_STAMP(r3)
+	subf	r8,r9,r8
+
+	/* Scale result */
+	ld	r5,CFG_TB_TO_XS(r3)
+	mulhdu	r8,r8,r5
+
+	/* Add stamp since epoch */
+	ld	r6,CFG_STAMP_XSEC(r3)
+	add	r4,r6,r8
+
+	xor	r0,r4,r4
+	add	r3,r3,r0
+	ld	r0,CFG_TB_UPDATE_COUNT(r3)
+        cmpld   cr0,r0,r7		/* check if updated */
+	bne-	1b
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__do_get_xsec)
diff --git a/arch/ppc64/kernel/vdso64/sigtramp.S b/arch/ppc64/kernel/vdso64/sigtramp.S
new file mode 100644
index 00000000000..8ae8f205e47
--- /dev/null
+++ b/arch/ppc64/kernel/vdso64/sigtramp.S
@@ -0,0 +1,294 @@
+/*
+ * Signal trampoline for 64 bits processes in a ppc64 kernel for
+ * use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+	.text
+
+/* The nop here is a hack.  The dwarf2 unwind routines subtract 1 from
+   the return address to get an address in the middle of the presumed
+   call instruction.  Since we don't have a call here, we artifically
+   extend the range covered by the unwind info by padding before the
+   real start.  */
+	nop
+	.balign 8
+V_FUNCTION_BEGIN(__kernel_sigtramp_rt64)
+.Lsigrt_start = . - 4
+	addi	r1, r1, __SIGNAL_FRAMESIZE
+	li	r0,__NR_rt_sigreturn
+	sc
+.Lsigrt_end:
+V_FUNCTION_END(__kernel_sigtramp_rt64)
+/* The ".balign 8" above and the following zeros mimic the old stack
+   trampoline layout.  The last magic value is the ucontext pointer,
+   chosen in such a way that older libgcc unwind code returns a zero
+   for a sigcontext pointer.  */
+	.long 0,0,0
+	.quad 0,-21*8
+
+/* Register r1 can be found at offset 8 of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define cfa_save \
+  .byte 0x0f;			/* DW_CFA_def_cfa_expression */		\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 RSIZE;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+9:
+
+/* Register REGNO can be found at offset OFS of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define rsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .ifne ofs;								\
+    .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+  .endif;								\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  A pointer to the VMX reg struct is at VREGS in
+   the pt_regs struct.  This macro is for REGNO == 0, and contains
+   'subroutines' that the other macros jump to.  */
+#define vsave_msr0(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit0 */			\
+2:									\
+  .byte 0x40;			/*     DW_OP_lit16 */			\
+  .byte 0x1e;			/*     DW_OP_mul */			\
+3:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x12;			/*     DW_OP_dup */			\
+  .byte 0x23;			/*     DW_OP_plus_uconst */		\
+    .uleb128 33*RSIZE;		/*       msr offset */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x0c; .long 1 << 25;	/*     DW_OP_const4u */			\
+  .byte 0x1a;			/*     DW_OP_and */			\
+  .byte 0x12;			/*     DW_OP_dup, ret 0 if bra taken */	\
+  .byte 0x30;			/*     DW_OP_lit0 */			\
+  .byte 0x29;			/*     DW_OP_eq */			\
+  .byte 0x28; .short 0x7fff;	/*     DW_OP_bra to end */		\
+  .byte 0x13;			/*     DW_OP_drop, pop the 0 */		\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x22;			/*     DW_OP_plus */			\
+  .byte 0x2f; .short 0x7fff;	/*     DW_OP_skip to end */		\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  REGNO is 1 thru 31.  */
+#define vsave_msr1(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit n */			\
+  .byte 0x2f; .short 2b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of
+   the VMX save block.  */
+#define vsave_msr2(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x0a; .short ofs;	/*     DW_OP_const2u */			\
+  .byte 0x2f; .short 3b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* VMX register REGNO is at offset OFS of the VMX save area.  */
+#define vsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+9:
+
+/* This is where the pt_regs pointer can be found on the stack.  */
+#define PTREGS 128+168+56
+
+/* Size of regs.  */
+#define RSIZE 8
+
+/* This is the offset of the VMX reg pointer.  */
+#define VREGS 48*RSIZE+33*8
+
+/* Describe where general purpose regs are saved.  */
+#define EH_FRAME_GEN \
+  cfa_save;								\
+  rsave ( 0,  0*RSIZE);							\
+  rsave ( 2,  2*RSIZE);							\
+  rsave ( 3,  3*RSIZE);							\
+  rsave ( 4,  4*RSIZE);							\
+  rsave ( 5,  5*RSIZE);							\
+  rsave ( 6,  6*RSIZE);							\
+  rsave ( 7,  7*RSIZE);							\
+  rsave ( 8,  8*RSIZE);							\
+  rsave ( 9,  9*RSIZE);							\
+  rsave (10, 10*RSIZE);							\
+  rsave (11, 11*RSIZE);							\
+  rsave (12, 12*RSIZE);							\
+  rsave (13, 13*RSIZE);							\
+  rsave (14, 14*RSIZE);							\
+  rsave (15, 15*RSIZE);							\
+  rsave (16, 16*RSIZE);							\
+  rsave (17, 17*RSIZE);							\
+  rsave (18, 18*RSIZE);							\
+  rsave (19, 19*RSIZE);							\
+  rsave (20, 20*RSIZE);							\
+  rsave (21, 21*RSIZE);							\
+  rsave (22, 22*RSIZE);							\
+  rsave (23, 23*RSIZE);							\
+  rsave (24, 24*RSIZE);							\
+  rsave (25, 25*RSIZE);							\
+  rsave (26, 26*RSIZE);							\
+  rsave (27, 27*RSIZE);							\
+  rsave (28, 28*RSIZE);							\
+  rsave (29, 29*RSIZE);							\
+  rsave (30, 30*RSIZE);							\
+  rsave (31, 31*RSIZE);							\
+  rsave (67, 32*RSIZE);		/* ap, used as temp for nip */		\
+  rsave (65, 36*RSIZE);		/* lr */				\
+  rsave (70, 38*RSIZE)		/* cr */
+
+/* Describe where the FP regs are saved.  */
+#define EH_FRAME_FP \
+  rsave (32, 48*RSIZE +  0*8);						\
+  rsave (33, 48*RSIZE +  1*8);						\
+  rsave (34, 48*RSIZE +  2*8);						\
+  rsave (35, 48*RSIZE +  3*8);						\
+  rsave (36, 48*RSIZE +  4*8);						\
+  rsave (37, 48*RSIZE +  5*8);						\
+  rsave (38, 48*RSIZE +  6*8);						\
+  rsave (39, 48*RSIZE +  7*8);						\
+  rsave (40, 48*RSIZE +  8*8);						\
+  rsave (41, 48*RSIZE +  9*8);						\
+  rsave (42, 48*RSIZE + 10*8);						\
+  rsave (43, 48*RSIZE + 11*8);						\
+  rsave (44, 48*RSIZE + 12*8);						\
+  rsave (45, 48*RSIZE + 13*8);						\
+  rsave (46, 48*RSIZE + 14*8);						\
+  rsave (47, 48*RSIZE + 15*8);						\
+  rsave (48, 48*RSIZE + 16*8);						\
+  rsave (49, 48*RSIZE + 17*8);						\
+  rsave (50, 48*RSIZE + 18*8);						\
+  rsave (51, 48*RSIZE + 19*8);						\
+  rsave (52, 48*RSIZE + 20*8);						\
+  rsave (53, 48*RSIZE + 21*8);						\
+  rsave (54, 48*RSIZE + 22*8);						\
+  rsave (55, 48*RSIZE + 23*8);						\
+  rsave (56, 48*RSIZE + 24*8);						\
+  rsave (57, 48*RSIZE + 25*8);						\
+  rsave (58, 48*RSIZE + 26*8);						\
+  rsave (59, 48*RSIZE + 27*8);						\
+  rsave (60, 48*RSIZE + 28*8);						\
+  rsave (61, 48*RSIZE + 29*8);						\
+  rsave (62, 48*RSIZE + 30*8);						\
+  rsave (63, 48*RSIZE + 31*8)
+
+/* Describe where the VMX regs are saved.  */
+#ifdef CONFIG_ALTIVEC
+#define EH_FRAME_VMX \
+  vsave_msr0 ( 0);							\
+  vsave_msr1 ( 1);							\
+  vsave_msr1 ( 2);							\
+  vsave_msr1 ( 3);							\
+  vsave_msr1 ( 4);							\
+  vsave_msr1 ( 5);							\
+  vsave_msr1 ( 6);							\
+  vsave_msr1 ( 7);							\
+  vsave_msr1 ( 8);							\
+  vsave_msr1 ( 9);							\
+  vsave_msr1 (10);							\
+  vsave_msr1 (11);							\
+  vsave_msr1 (12);							\
+  vsave_msr1 (13);							\
+  vsave_msr1 (14);							\
+  vsave_msr1 (15);							\
+  vsave_msr1 (16);							\
+  vsave_msr1 (17);							\
+  vsave_msr1 (18);							\
+  vsave_msr1 (19);							\
+  vsave_msr1 (20);							\
+  vsave_msr1 (21);							\
+  vsave_msr1 (22);							\
+  vsave_msr1 (23);							\
+  vsave_msr1 (24);							\
+  vsave_msr1 (25);							\
+  vsave_msr1 (26);							\
+  vsave_msr1 (27);							\
+  vsave_msr1 (28);							\
+  vsave_msr1 (29);							\
+  vsave_msr1 (30);							\
+  vsave_msr1 (31);							\
+  vsave_msr2 (33, 32*16+12);						\
+  vsave      (32, 33*16)
+#else
+#define EH_FRAME_VMX
+#endif
+
+	.section .eh_frame,"a",@progbits
+.Lcie:
+	.long .Lcie_end - .Lcie_start
+.Lcie_start:
+	.long 0			/* CIE ID */
+	.byte 1			/* Version number */
+	.string "zR"		/* NUL-terminated augmentation string */
+	.uleb128 4		/* Code alignment factor */
+	.sleb128 -8		/* Data alignment factor */
+	.byte 67		/* Return address register column, ap */
+	.uleb128 1		/* Augmentation value length */
+	.byte 0x14		/* DW_EH_PE_pcrel | DW_EH_PE_udata8. */
+	.byte 0x0c,1,0		/* DW_CFA_def_cfa: r1 ofs 0 */
+	.balign 8
+.Lcie_end:
+
+	.long .Lfde0_end - .Lfde0_start
+.Lfde0_start:
+	.long .Lfde0_start - .Lcie	/* CIE pointer. */
+	.quad .Lsigrt_start - .		/* PC start, length */
+	.quad .Lsigrt_end - .Lsigrt_start
+	.uleb128 0			/* Augmentation */
+	EH_FRAME_GEN
+	EH_FRAME_FP
+	EH_FRAME_VMX
+# Do we really need to describe the frame at this point?  ie. will
+# we ever have some call chain that returns somewhere past the addi?
+# I don't think so, since gcc doesn't support async signals.
+#	.byte 0x41		/* DW_CFA_advance_loc 1*4 */
+#undef PTREGS
+#define PTREGS 168+56
+#	EH_FRAME_GEN
+#	EH_FRAME_FP
+#	EH_FRAME_VMX
+	.balign 8
+.Lfde0_end:
diff --git a/arch/ppc64/kernel/vdso64/vdso64.lds.S b/arch/ppc64/kernel/vdso64/vdso64.lds.S
new file mode 100644
index 00000000000..942c815c7bc
--- /dev/null
+++ b/arch/ppc64/kernel/vdso64/vdso64.lds.S
@@ -0,0 +1,110 @@
+/*
+ * This is the infamous ld script for the 64 bits vdso
+ * library
+ */
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc")
+OUTPUT_ARCH(powerpc:common64)
+ENTRY(_start)
+
+SECTIONS
+{
+  . = VDSO64_LBASE + SIZEOF_HEADERS;
+  .hash           : { *(.hash) }		:text
+  .dynsym         : { *(.dynsym) }
+  .dynstr         : { *(.dynstr) }
+  .gnu.version    : { *(.gnu.version) }
+  .gnu.version_d  : { *(.gnu.version_d) }
+  .gnu.version_r  : { *(.gnu.version_r) }
+
+  . = ALIGN (16);
+  .text           :
+  {
+    *(.text .stub .text.* .gnu.linkonce.t.*)
+    *(.sfpr .glink)
+  }
+  PROVIDE (__etext = .);
+  PROVIDE (_etext = .);
+  PROVIDE (etext = .);
+
+  /* Other stuff is appended to the text segment: */
+  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+  .rodata1        : { *(.rodata1) }
+  .eh_frame_hdr   : { *(.eh_frame_hdr) }	:text	:eh_frame_hdr
+  .eh_frame       : { KEEP (*(.eh_frame)) }	:text
+  .gcc_except_table   : { *(.gcc_except_table) }
+
+  .opd           ALIGN(8) : { KEEP (*(.opd)) }
+  .got		 ALIGN(8) : { *(.got .toc) }
+  .rela.dyn	 ALIGN(8) : { *(.rela.dyn) }
+
+  .dynamic        : { *(.dynamic) }		:text	:dynamic
+
+  _end = .;
+  PROVIDE (end = .);
+
+  /* Stabs debugging sections are here too
+   */
+  .stab          0 : { *(.stab) }
+  .stabstr       0 : { *(.stabstr) }
+  .stab.excl     0 : { *(.stab.excl) }
+  .stab.exclstr  0 : { *(.stab.exclstr) }
+  .stab.index    0 : { *(.stab.index) }
+  .stab.indexstr 0 : { *(.stab.indexstr) }
+  .comment       0 : { *(.comment) }
+  /* DWARF debug sectio/ns.
+     Symbols in the DWARF debugging sections are relative to the beginning
+     of the section so we begin them at 0.  */
+  /* DWARF 1 */
+  .debug          0 : { *(.debug) }
+  .line           0 : { *(.line) }
+  /* GNU DWARF 1 extensions */
+  .debug_srcinfo  0 : { *(.debug_srcinfo) }
+  .debug_sfnames  0 : { *(.debug_sfnames) }
+  /* DWARF 1.1 and DWARF 2 */
+  .debug_aranges  0 : { *(.debug_aranges) }
+  .debug_pubnames 0 : { *(.debug_pubnames) }
+  /* DWARF 2 */
+  .debug_info     0 : { *(.debug_info .gnu.linkonce.wi.*) }
+  .debug_abbrev   0 : { *(.debug_abbrev) }
+  .debug_line     0 : { *(.debug_line) }
+  .debug_frame    0 : { *(.debug_frame) }
+  .debug_str      0 : { *(.debug_str) }
+  .debug_loc      0 : { *(.debug_loc) }
+  .debug_macinfo  0 : { *(.debug_macinfo) }
+  /* SGI/MIPS DWARF 2 extensions */
+  .debug_weaknames 0 : { *(.debug_weaknames) }
+  .debug_funcnames 0 : { *(.debug_funcnames) }
+  .debug_typenames 0 : { *(.debug_typenames) }
+  .debug_varnames  0 : { *(.debug_varnames) }
+
+  /DISCARD/ : { *(.note.GNU-stack) }
+  /DISCARD/ : { *(.branch_lt) }
+  /DISCARD/ : { *(.data .data.* .gnu.linkonce.d.*) }
+  /DISCARD/ : { *(.bss .sbss .dynbss .dynsbss) }
+}
+
+PHDRS
+{
+  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+  VDSO_VERSION_STRING {
+    global:
+	__kernel_datapage_offset; /* Has to be there for the kernel to find it */
+	__kernel_get_syscall_map;
+    	__kernel_gettimeofday;
+	__kernel_sync_dicache;
+	__kernel_sync_dicache_p5;
+	__kernel_sigtramp_rt64;
+    local: *;
+  };
+}
diff --git a/arch/ppc64/kernel/vdso64/vdso64_wrapper.S b/arch/ppc64/kernel/vdso64/vdso64_wrapper.S
new file mode 100644
index 00000000000..771c2741c49
--- /dev/null
+++ b/arch/ppc64/kernel/vdso64/vdso64_wrapper.S
@@ -0,0 +1,13 @@
+#include <linux/init.h>
+#include <asm/page.h>
+
+	.section ".data.page_aligned"
+
+	.globl vdso64_start, vdso64_end
+	.balign PAGE_SIZE
+vdso64_start:
+	.incbin "arch/ppc64/kernel/vdso64/vdso64.so"
+	.balign PAGE_SIZE
+vdso64_end:
+
+	.previous
diff --git a/arch/ppc64/kernel/vecemu.c b/arch/ppc64/kernel/vecemu.c
new file mode 100644
index 00000000000..cb207629f21
--- /dev/null
+++ b/arch/ppc64/kernel/vecemu.c
@@ -0,0 +1,346 @@
+/*
+ * Routines to emulate some Altivec/VMX instructions, specifically
+ * those that can trap when given denormalized operands in Java mode.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+/* Functions in vector.S */
+extern void vaddfp(vector128 *dst, vector128 *a, vector128 *b);
+extern void vsubfp(vector128 *dst, vector128 *a, vector128 *b);
+extern void vmaddfp(vector128 *dst, vector128 *a, vector128 *b, vector128 *c);
+extern void vnmsubfp(vector128 *dst, vector128 *a, vector128 *b, vector128 *c);
+extern void vrefp(vector128 *dst, vector128 *src);
+extern void vrsqrtefp(vector128 *dst, vector128 *src);
+extern void vexptep(vector128 *dst, vector128 *src);
+
+static unsigned int exp2s[8] = {
+	0x800000,
+	0x8b95c2,
+	0x9837f0,
+	0xa5fed7,
+	0xb504f3,
+	0xc5672a,
+	0xd744fd,
+	0xeac0c7
+};
+
+/*
+ * Computes an estimate of 2^x.  The `s' argument is the 32-bit
+ * single-precision floating-point representation of x.
+ */
+static unsigned int eexp2(unsigned int s)
+{
+	int exp, pwr;
+	unsigned int mant, frac;
+
+	/* extract exponent field from input */
+	exp = ((s >> 23) & 0xff) - 127;
+	if (exp > 7) {
+		/* check for NaN input */
+		if (exp == 128 && (s & 0x7fffff) != 0)
+			return s | 0x400000;	/* return QNaN */
+		/* 2^-big = 0, 2^+big = +Inf */
+		return (s & 0x80000000)? 0: 0x7f800000;	/* 0 or +Inf */
+	}
+	if (exp < -23)
+		return 0x3f800000;	/* 1.0 */
+
+	/* convert to fixed point integer in 9.23 representation */
+	pwr = (s & 0x7fffff) | 0x800000;
+	if (exp > 0)
+		pwr <<= exp;
+	else
+		pwr >>= -exp;
+	if (s & 0x80000000)
+		pwr = -pwr;
+
+	/* extract integer part, which becomes exponent part of result */
+	exp = (pwr >> 23) + 126;
+	if (exp >= 254)
+		return 0x7f800000;
+	if (exp < -23)
+		return 0;
+
+	/* table lookup on top 3 bits of fraction to get mantissa */
+	mant = exp2s[(pwr >> 20) & 7];
+
+	/* linear interpolation using remaining 20 bits of fraction */
+	asm("mulhwu %0,%1,%2" : "=r" (frac)
+	    : "r" (pwr << 12), "r" (0x172b83ff));
+	asm("mulhwu %0,%1,%2" : "=r" (frac) : "r" (frac), "r" (mant));
+	mant += frac;
+
+	if (exp >= 0)
+		return mant + (exp << 23);
+
+	/* denormalized result */
+	exp = -exp;
+	mant += 1 << (exp - 1);
+	return mant >> exp;
+}
+
+/*
+ * Computes an estimate of log_2(x).  The `s' argument is the 32-bit
+ * single-precision floating-point representation of x.
+ */
+static unsigned int elog2(unsigned int s)
+{
+	int exp, mant, lz, frac;
+
+	exp = s & 0x7f800000;
+	mant = s & 0x7fffff;
+	if (exp == 0x7f800000) {	/* Inf or NaN */
+		if (mant != 0)
+			s |= 0x400000;	/* turn NaN into QNaN */
+		return s;
+	}
+	if ((exp | mant) == 0)		/* +0 or -0 */
+		return 0xff800000;	/* return -Inf */
+
+	if (exp == 0) {
+		/* denormalized */
+		asm("cntlzw %0,%1" : "=r" (lz) : "r" (mant));
+		mant <<= lz - 8;
+		exp = (-118 - lz) << 23;
+	} else {
+		mant |= 0x800000;
+		exp -= 127 << 23;
+	}
+
+	if (mant >= 0xb504f3) {				/* 2^0.5 * 2^23 */
+		exp |= 0x400000;			/* 0.5 * 2^23 */
+		asm("mulhwu %0,%1,%2" : "=r" (mant)
+		    : "r" (mant), "r" (0xb504f334));	/* 2^-0.5 * 2^32 */
+	}
+	if (mant >= 0x9837f0) {				/* 2^0.25 * 2^23 */
+		exp |= 0x200000;			/* 0.25 * 2^23 */
+		asm("mulhwu %0,%1,%2" : "=r" (mant)
+		    : "r" (mant), "r" (0xd744fccb));	/* 2^-0.25 * 2^32 */
+	}
+	if (mant >= 0x8b95c2) {				/* 2^0.125 * 2^23 */
+		exp |= 0x100000;			/* 0.125 * 2^23 */
+		asm("mulhwu %0,%1,%2" : "=r" (mant)
+		    : "r" (mant), "r" (0xeac0c6e8));	/* 2^-0.125 * 2^32 */
+	}
+	if (mant > 0x800000) {				/* 1.0 * 2^23 */
+		/* calculate (mant - 1) * 1.381097463 */
+		/* 1.381097463 == 0.125 / (2^0.125 - 1) */
+		asm("mulhwu %0,%1,%2" : "=r" (frac)
+		    : "r" ((mant - 0x800000) << 1), "r" (0xb0c7cd3a));
+		exp += frac;
+	}
+	s = exp & 0x80000000;
+	if (exp != 0) {
+		if (s)
+			exp = -exp;
+		asm("cntlzw %0,%1" : "=r" (lz) : "r" (exp));
+		lz = 8 - lz;
+		if (lz > 0)
+			exp >>= lz;
+		else if (lz < 0)
+			exp <<= -lz;
+		s += ((lz + 126) << 23) + exp;
+	}
+	return s;
+}
+
+#define VSCR_SAT	1
+
+static int ctsxs(unsigned int x, int scale, unsigned int *vscrp)
+{
+	int exp, mant;
+
+	exp = (x >> 23) & 0xff;
+	mant = x & 0x7fffff;
+	if (exp == 255 && mant != 0)
+		return 0;		/* NaN -> 0 */
+	exp = exp - 127 + scale;
+	if (exp < 0)
+		return 0;		/* round towards zero */
+	if (exp >= 31) {
+		/* saturate, unless the result would be -2^31 */
+		if (x + (scale << 23) != 0xcf000000)
+			*vscrp |= VSCR_SAT;
+		return (x & 0x80000000)? 0x80000000: 0x7fffffff;
+	}
+	mant |= 0x800000;
+	mant = (mant << 7) >> (30 - exp);
+	return (x & 0x80000000)? -mant: mant;
+}
+
+static unsigned int ctuxs(unsigned int x, int scale, unsigned int *vscrp)
+{
+	int exp;
+	unsigned int mant;
+
+	exp = (x >> 23) & 0xff;
+	mant = x & 0x7fffff;
+	if (exp == 255 && mant != 0)
+		return 0;		/* NaN -> 0 */
+	exp = exp - 127 + scale;
+	if (exp < 0)
+		return 0;		/* round towards zero */
+	if (x & 0x80000000) {
+		/* negative => saturate to 0 */
+		*vscrp |= VSCR_SAT;
+		return 0;
+	}
+	if (exp >= 32) {
+		/* saturate */
+		*vscrp |= VSCR_SAT;
+		return 0xffffffff;
+	}
+	mant |= 0x800000;
+	mant = (mant << 8) >> (31 - exp);
+	return mant;
+}
+
+/* Round to floating integer, towards 0 */
+static unsigned int rfiz(unsigned int x)
+{
+	int exp;
+
+	exp = ((x >> 23) & 0xff) - 127;
+	if (exp == 128 && (x & 0x7fffff) != 0)
+		return x | 0x400000;	/* NaN -> make it a QNaN */
+	if (exp >= 23)
+		return x;		/* it's an integer already (or Inf) */
+	if (exp < 0)
+		return x & 0x80000000;	/* |x| < 1.0 rounds to 0 */
+	return x & ~(0x7fffff >> exp);
+}
+
+/* Round to floating integer, towards +/- Inf */
+static unsigned int rfii(unsigned int x)
+{
+	int exp, mask;
+
+	exp = ((x >> 23) & 0xff) - 127;
+	if (exp == 128 && (x & 0x7fffff) != 0)
+		return x | 0x400000;	/* NaN -> make it a QNaN */
+	if (exp >= 23)
+		return x;		/* it's an integer already (or Inf) */
+	if ((x & 0x7fffffff) == 0)
+		return x;		/* +/-0 -> +/-0 */
+	if (exp < 0)
+		/* 0 < |x| < 1.0 rounds to +/- 1.0 */
+		return (x & 0x80000000) | 0x3f800000;
+	mask = 0x7fffff >> exp;
+	/* mantissa overflows into exponent - that's OK,
+	   it can't overflow into the sign bit */
+	return (x + mask) & ~mask;
+}
+
+/* Round to floating integer, to nearest */
+static unsigned int rfin(unsigned int x)
+{
+	int exp, half;
+
+	exp = ((x >> 23) & 0xff) - 127;
+	if (exp == 128 && (x & 0x7fffff) != 0)
+		return x | 0x400000;	/* NaN -> make it a QNaN */
+	if (exp >= 23)
+		return x;		/* it's an integer already (or Inf) */
+	if (exp < -1)
+		return x & 0x80000000;	/* |x| < 0.5 -> +/-0 */
+	if (exp == -1)
+		/* 0.5 <= |x| < 1.0 rounds to +/- 1.0 */
+		return (x & 0x80000000) | 0x3f800000;
+	half = 0x400000 >> exp;
+	/* add 0.5 to the magnitude and chop off the fraction bits */
+	return (x + half) & ~(0x7fffff >> exp);
+}
+
+int
+emulate_altivec(struct pt_regs *regs)
+{
+	unsigned int instr, i;
+	unsigned int va, vb, vc, vd;
+	vector128 *vrs;
+
+	if (get_user(instr, (unsigned int __user *) regs->nip))
+		return -EFAULT;
+	if ((instr >> 26) != 4)
+		return -EINVAL;		/* not an altivec instruction */
+	vd = (instr >> 21) & 0x1f;
+	va = (instr >> 16) & 0x1f;
+	vb = (instr >> 11) & 0x1f;
+	vc = (instr >> 6) & 0x1f;
+
+	vrs = current->thread.vr;
+	switch (instr & 0x3f) {
+	case 10:
+		switch (vc) {
+		case 0:	/* vaddfp */
+			vaddfp(&vrs[vd], &vrs[va], &vrs[vb]);
+			break;
+		case 1:	/* vsubfp */
+			vsubfp(&vrs[vd], &vrs[va], &vrs[vb]);
+			break;
+		case 4:	/* vrefp */
+			vrefp(&vrs[vd], &vrs[vb]);
+			break;
+		case 5:	/* vrsqrtefp */
+			vrsqrtefp(&vrs[vd], &vrs[vb]);
+			break;
+		case 6:	/* vexptefp */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = eexp2(vrs[vb].u[i]);
+			break;
+		case 7:	/* vlogefp */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = elog2(vrs[vb].u[i]);
+			break;
+		case 8:		/* vrfin */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = rfin(vrs[vb].u[i]);
+			break;
+		case 9:		/* vrfiz */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = rfiz(vrs[vb].u[i]);
+			break;
+		case 10:	/* vrfip */
+			for (i = 0; i < 4; ++i) {
+				u32 x = vrs[vb].u[i];
+				x = (x & 0x80000000)? rfiz(x): rfii(x);
+				vrs[vd].u[i] = x;
+			}
+			break;
+		case 11:	/* vrfim */
+			for (i = 0; i < 4; ++i) {
+				u32 x = vrs[vb].u[i];
+				x = (x & 0x80000000)? rfii(x): rfiz(x);
+				vrs[vd].u[i] = x;
+			}
+			break;
+		case 14:	/* vctuxs */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = ctuxs(vrs[vb].u[i], va,
+						&current->thread.vscr.u[3]);
+			break;
+		case 15:	/* vctsxs */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = ctsxs(vrs[vb].u[i], va,
+						&current->thread.vscr.u[3]);
+			break;
+		default:
+			return -EINVAL;
+		}
+		break;
+	case 46:	/* vmaddfp */
+		vmaddfp(&vrs[vd], &vrs[va], &vrs[vb], &vrs[vc]);
+		break;
+	case 47:	/* vnmsubfp */
+		vnmsubfp(&vrs[vd], &vrs[va], &vrs[vb], &vrs[vc]);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/arch/ppc64/kernel/vector.S b/arch/ppc64/kernel/vector.S
new file mode 100644
index 00000000000..b79d33e4001
--- /dev/null
+++ b/arch/ppc64/kernel/vector.S
@@ -0,0 +1,172 @@
+#include <asm/ppc_asm.h>
+#include <asm/processor.h>
+
+/*
+ * The routines below are in assembler so we can closely control the
+ * usage of floating-point registers.  These routines must be called
+ * with preempt disabled.
+ */
+	.section ".toc","aw"
+fpzero:
+	.tc	FD_0_0[TC],0
+fpone:
+	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
+fphalf:
+	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */
+
+	.text
+/*
+ * Internal routine to enable floating point and set FPSCR to 0.
+ * Don't call it from C; it doesn't use the normal calling convention.
+ */
+fpenable:
+	mfmsr	r10
+	ori	r11,r10,MSR_FP
+	mtmsr	r11
+	isync
+	stfd	fr31,-8(r1)
+	stfd	fr0,-16(r1)
+	stfd	fr1,-24(r1)
+	mffs	fr31
+	lfd	fr1,fpzero@toc(r2)
+	mtfsf	0xff,fr1
+	blr
+
+fpdisable:
+	mtlr	r12
+	mtfsf	0xff,fr31
+	lfd	fr1,-24(r1)
+	lfd	fr0,-16(r1)
+	lfd	fr31,-8(r1)
+	mtmsr	r10
+	isync
+	blr
+
+/*
+ * Vector add, floating point.
+ */
+_GLOBAL(vaddfp)
+	mflr	r12
+	bl	fpenable
+	li	r0,4
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	lfsx	fr1,r5,r6
+	fadds	fr0,fr0,fr1
+	stfsx	fr0,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	b	fpdisable
+
+/*
+ * Vector subtract, floating point.
+ */
+_GLOBAL(vsubfp)
+	mflr	r12
+	bl	fpenable
+	li	r0,4
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	lfsx	fr1,r5,r6
+	fsubs	fr0,fr0,fr1
+	stfsx	fr0,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	b	fpdisable
+
+/*
+ * Vector multiply and add, floating point.
+ */
+_GLOBAL(vmaddfp)
+	mflr	r12
+	bl	fpenable
+	stfd	fr2,-32(r1)
+	li	r0,4
+	mtctr	r0
+	li	r7,0
+1:	lfsx	fr0,r4,r7
+	lfsx	fr1,r5,r7
+	lfsx	fr2,r6,r7
+	fmadds	fr0,fr0,fr2,fr1
+	stfsx	fr0,r3,r7
+	addi	r7,r7,4
+	bdnz	1b
+	lfd	fr2,-32(r1)
+	b	fpdisable
+
+/*
+ * Vector negative multiply and subtract, floating point.
+ */
+_GLOBAL(vnmsubfp)
+	mflr	r12
+	bl	fpenable
+	stfd	fr2,-32(r1)
+	li	r0,4
+	mtctr	r0
+	li	r7,0
+1:	lfsx	fr0,r4,r7
+	lfsx	fr1,r5,r7
+	lfsx	fr2,r6,r7
+	fnmsubs	fr0,fr0,fr2,fr1
+	stfsx	fr0,r3,r7
+	addi	r7,r7,4
+	bdnz	1b
+	lfd	fr2,-32(r1)
+	b	fpdisable
+
+/*
+ * Vector reciprocal estimate.  We just compute 1.0/x.
+ * r3 -> destination, r4 -> source.
+ */
+_GLOBAL(vrefp)
+	mflr	r12
+	bl	fpenable
+	li	r0,4
+	lfd	fr1,fpone@toc(r2)
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	fdivs	fr0,fr1,fr0
+	stfsx	fr0,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	b	fpdisable
+
+/*
+ * Vector reciprocal square-root estimate, floating point.
+ * We use the frsqrte instruction for the initial estimate followed
+ * by 2 iterations of Newton-Raphson to get sufficient accuracy.
+ * r3 -> destination, r4 -> source.
+ */
+_GLOBAL(vrsqrtefp)
+	mflr	r12
+	bl	fpenable
+	stfd	fr2,-32(r1)
+	stfd	fr3,-40(r1)
+	stfd	fr4,-48(r1)
+	stfd	fr5,-56(r1)
+	li	r0,4
+	lfd	fr4,fpone@toc(r2)
+	lfd	fr5,fphalf@toc(r2)
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	frsqrte	fr1,fr0		/* r = frsqrte(s) */
+	fmuls	fr3,fr1,fr0	/* r * s */
+	fmuls	fr2,fr1,fr5	/* r * 0.5 */
+	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
+	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
+	fmuls	fr3,fr1,fr0	/* r * s */
+	fmuls	fr2,fr1,fr5	/* r * 0.5 */
+	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
+	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
+	stfsx	fr1,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	lfd	fr5,-56(r1)
+	lfd	fr4,-48(r1)
+	lfd	fr3,-40(r1)
+	lfd	fr2,-32(r1)
+	b	fpdisable
diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c
new file mode 100644
index 00000000000..cdd830cb276
--- /dev/null
+++ b/arch/ppc64/kernel/vio.c
@@ -0,0 +1,640 @@
+/*
+ * IBM PowerPC Virtual I/O Infrastructure Support.
+ *
+ *    Copyright (c) 2003 IBM Corp.
+ *     Dave Engebretsen engebret@us.ibm.com
+ *     Santiago Leon santil@us.ibm.com
+ *     Hollis Blanchard <hollisb@us.ibm.com>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <asm/rtas.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/ppcdebug.h>
+#include <asm/vio.h>
+#include <asm/hvcall.h>
+#include <asm/iSeries/vio.h>
+#include <asm/iSeries/HvTypes.h>
+#include <asm/iSeries/HvCallXm.h>
+#include <asm/iSeries/HvLpConfig.h>
+
+#define DBGENTER() pr_debug("%s entered\n", __FUNCTION__)
+
+extern struct subsystem devices_subsys; /* needed for vio_find_name() */
+
+static const struct vio_device_id *vio_match_device(
+		const struct vio_device_id *, const struct vio_dev *);
+
+#ifdef CONFIG_PPC_PSERIES
+static struct iommu_table *vio_build_iommu_table(struct vio_dev *);
+static int vio_num_address_cells;
+#endif
+static struct vio_dev *vio_bus_device; /* fake "parent" device */
+
+#ifdef CONFIG_PPC_ISERIES
+static struct vio_dev *__init vio_register_device_iseries(char *type,
+		uint32_t unit_num);
+
+static struct iommu_table veth_iommu_table;
+static struct iommu_table vio_iommu_table;
+
+static struct vio_dev _vio_dev  = {
+	.iommu_table = &vio_iommu_table,
+	.dev.bus = &vio_bus_type
+};
+struct device *iSeries_vio_dev = &_vio_dev.dev;
+EXPORT_SYMBOL(iSeries_vio_dev);
+
+#define device_is_compatible(a, b)	1
+
+#endif
+
+/* convert from struct device to struct vio_dev and pass to driver.
+ * dev->driver has already been set by generic code because vio_bus_match
+ * succeeded. */
+static int vio_bus_probe(struct device *dev)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	const struct vio_device_id *id;
+	int error = -ENODEV;
+
+	DBGENTER();
+
+	if (!viodrv->probe)
+		return error;
+
+	id = vio_match_device(viodrv->id_table, viodev);
+	if (id) {
+		error = viodrv->probe(viodev, id);
+	}
+
+	return error;
+}
+
+/* convert from struct device to struct vio_dev and pass to driver. */
+static int vio_bus_remove(struct device *dev)
+{
+	struct vio_dev *viodev = to_vio_dev(dev);
+	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+
+	DBGENTER();
+
+	if (viodrv->remove) {
+		return viodrv->remove(viodev);
+	}
+
+	/* driver can't remove */
+	return 1;
+}
+
+/**
+ * vio_register_driver: - Register a new vio driver
+ * @drv:	The vio_driver structure to be registered.
+ */
+int vio_register_driver(struct vio_driver *viodrv)
+{
+	printk(KERN_DEBUG "%s: driver %s registering\n", __FUNCTION__,
+		viodrv->name);
+
+	/* fill in 'struct driver' fields */
+	viodrv->driver.name = viodrv->name;
+	viodrv->driver.bus = &vio_bus_type;
+	viodrv->driver.probe = vio_bus_probe;
+	viodrv->driver.remove = vio_bus_remove;
+
+	return driver_register(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_register_driver);
+
+/**
+ * vio_unregister_driver - Remove registration of vio driver.
+ * @driver:	The vio_driver struct to be removed form registration
+ */
+void vio_unregister_driver(struct vio_driver *viodrv)
+{
+	driver_unregister(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_unregister_driver);
+
+/**
+ * vio_match_device: - Tell if a VIO device has a matching VIO device id structure.
+ * @ids: 	array of VIO device id structures to search in
+ * @dev: 	the VIO device structure to match against
+ *
+ * Used by a driver to check whether a VIO device present in the
+ * system is in its list of supported devices. Returns the matching
+ * vio_device_id structure or NULL if there is no match.
+ */
+static const struct vio_device_id * vio_match_device(const struct vio_device_id *ids,
+	const struct vio_dev *dev)
+{
+	DBGENTER();
+
+	while (ids->type) {
+		if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) &&
+			device_is_compatible(dev->dev.platform_data, ids->compat))
+			return ids;
+		ids++;
+	}
+	return NULL;
+}
+
+#ifdef CONFIG_PPC_ISERIES
+void __init iommu_vio_init(void)
+{
+	struct iommu_table *t;
+	struct iommu_table_cb cb;
+	unsigned long cbp;
+	unsigned long itc_entries;
+
+	cb.itc_busno = 255;    /* Bus 255 is the virtual bus */
+	cb.itc_virtbus = 0xff; /* Ask for virtual bus */
+
+	cbp = virt_to_abs(&cb);
+	HvCallXm_getTceTableParms(cbp);
+
+	itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry);
+	veth_iommu_table.it_size        = itc_entries / 2;
+	veth_iommu_table.it_busno       = cb.itc_busno;
+	veth_iommu_table.it_offset      = cb.itc_offset;
+	veth_iommu_table.it_index       = cb.itc_index;
+	veth_iommu_table.it_type        = TCE_VB;
+	veth_iommu_table.it_blocksize	= 1;
+
+	t = iommu_init_table(&veth_iommu_table);
+
+	if (!t)
+		printk("Virtual Bus VETH TCE table failed.\n");
+
+	vio_iommu_table.it_size         = itc_entries - veth_iommu_table.it_size;
+	vio_iommu_table.it_busno        = cb.itc_busno;
+	vio_iommu_table.it_offset       = cb.itc_offset +
+					  veth_iommu_table.it_size;
+	vio_iommu_table.it_index        = cb.itc_index;
+	vio_iommu_table.it_type         = TCE_VB;
+	vio_iommu_table.it_blocksize	= 1;
+
+	t = iommu_init_table(&vio_iommu_table);
+
+	if (!t)
+		printk("Virtual Bus VIO TCE table failed.\n");
+}
+#endif
+
+#ifdef CONFIG_PPC_PSERIES
+static void probe_bus_pseries(void)
+{
+	struct device_node *node_vroot, *of_node;
+
+	node_vroot = find_devices("vdevice");
+	if ((node_vroot == NULL) || (node_vroot->child == NULL))
+		/* this machine doesn't do virtual IO, and that's ok */
+		return;
+
+	vio_num_address_cells = prom_n_addr_cells(node_vroot->child);
+
+	/*
+	 * Create struct vio_devices for each virtual device in the device tree.
+	 * Drivers will associate with them later.
+	 */
+	for (of_node = node_vroot->child; of_node != NULL;
+			of_node = of_node->sibling) {
+		printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node);
+		vio_register_device_node(of_node);
+	}
+}
+#endif
+
+#ifdef CONFIG_PPC_ISERIES
+static void probe_bus_iseries(void)
+{
+	HvLpIndexMap vlan_map = HvLpConfig_getVirtualLanIndexMap();
+	struct vio_dev *viodev;
+	int i;
+
+	/* there is only one of each of these */
+	vio_register_device_iseries("viocons", 0);
+	vio_register_device_iseries("vscsi", 0);
+
+	vlan_map = HvLpConfig_getVirtualLanIndexMap();
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) {
+		if ((vlan_map & (0x8000 >> i)) == 0)
+			continue;
+		viodev = vio_register_device_iseries("vlan", i);
+		/* veth is special and has it own iommu_table */
+		viodev->iommu_table = &veth_iommu_table;
+	}
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALDISKS; i++)
+		vio_register_device_iseries("viodasd", i);
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALCDROMS; i++)
+		vio_register_device_iseries("viocd", i);
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALTAPES; i++)
+		vio_register_device_iseries("viotape", i);
+}
+#endif
+
+/**
+ * vio_bus_init: - Initialize the virtual IO bus
+ */
+static int __init vio_bus_init(void)
+{
+	int err;
+
+	err = bus_register(&vio_bus_type);
+	if (err) {
+		printk(KERN_ERR "failed to register VIO bus\n");
+		return err;
+	}
+
+	/* the fake parent of all vio devices, just to give us a nice directory */
+	vio_bus_device = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
+	if (!vio_bus_device) {
+		return 1;
+	}
+	memset(vio_bus_device, 0, sizeof(struct vio_dev));
+	strcpy(vio_bus_device->dev.bus_id, "vio");
+
+	err = device_register(&vio_bus_device->dev);
+	if (err) {
+		printk(KERN_WARNING "%s: device_register returned %i\n", __FUNCTION__,
+			err);
+		kfree(vio_bus_device);
+		return err;
+	}
+
+#ifdef CONFIG_PPC_PSERIES
+	probe_bus_pseries();
+#endif
+#ifdef CONFIG_PPC_ISERIES
+	probe_bus_iseries();
+#endif
+
+	return 0;
+}
+
+__initcall(vio_bus_init);
+
+/* vio_dev refcount hit 0 */
+static void __devinit vio_dev_release(struct device *dev)
+{
+	DBGENTER();
+
+#ifdef CONFIG_PPC_PSERIES
+	/* XXX free TCE table */
+	of_node_put(dev->platform_data);
+#endif
+	kfree(to_vio_dev(dev));
+}
+
+#ifdef CONFIG_PPC_PSERIES
+static ssize_t viodev_show_devspec(struct device *dev, char *buf)
+{
+	struct device_node *of_node = dev->platform_data;
+
+	return sprintf(buf, "%s\n", of_node->full_name);
+}
+DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL);
+#endif
+
+static ssize_t viodev_show_name(struct device *dev, char *buf)
+{
+	return sprintf(buf, "%s\n", to_vio_dev(dev)->name);
+}
+DEVICE_ATTR(name, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_name, NULL);
+
+static struct vio_dev * __devinit vio_register_device_common(
+		struct vio_dev *viodev, char *name, char *type,
+		uint32_t unit_address, struct iommu_table *iommu_table)
+{
+	DBGENTER();
+
+	viodev->name = name;
+	viodev->type = type;
+	viodev->unit_address = unit_address;
+	viodev->iommu_table = iommu_table;
+	/* init generic 'struct device' fields: */
+	viodev->dev.parent = &vio_bus_device->dev;
+	viodev->dev.bus = &vio_bus_type;
+	viodev->dev.release = vio_dev_release;
+
+	/* register with generic device framework */
+	if (device_register(&viodev->dev)) {
+		printk(KERN_ERR "%s: failed to register device %s\n",
+				__FUNCTION__, viodev->dev.bus_id);
+		return NULL;
+	}
+	device_create_file(&viodev->dev, &dev_attr_name);
+
+	return viodev;
+}
+
+#ifdef CONFIG_PPC_PSERIES
+/**
+ * vio_register_device_node: - Register a new vio device.
+ * @of_node:	The OF node for this device.
+ *
+ * Creates and initializes a vio_dev structure from the data in
+ * of_node (dev.platform_data) and adds it to the list of virtual devices.
+ * Returns a pointer to the created vio_dev or NULL if node has
+ * NULL device_type or compatible fields.
+ */
+struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node)
+{
+	struct vio_dev *viodev;
+	unsigned int *unit_address;
+	unsigned int *irq_p;
+
+	DBGENTER();
+
+	/* we need the 'device_type' property, in order to match with drivers */
+	if ((NULL == of_node->type)) {
+		printk(KERN_WARNING
+			"%s: node %s missing 'device_type'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	unit_address = (unsigned int *)get_property(of_node, "reg", NULL);
+	if (!unit_address) {
+		printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	/* allocate a vio_dev for this node */
+	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
+	if (!viodev) {
+		return NULL;
+	}
+	memset(viodev, 0, sizeof(struct vio_dev));
+
+	viodev->dev.platform_data = of_node_get(of_node);
+
+	viodev->irq = NO_IRQ;
+	irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL);
+	if (irq_p) {
+		int virq = virt_irq_create_mapping(*irq_p);
+		if (virq == NO_IRQ) {
+			printk(KERN_ERR "Unable to allocate interrupt "
+			       "number for %s\n", of_node->full_name);
+		} else
+			viodev->irq = irq_offset_up(virq);
+	}
+
+	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address);
+
+	/* register with generic device framework */
+	if (vio_register_device_common(viodev, of_node->name, of_node->type,
+				*unit_address, vio_build_iommu_table(viodev))
+			== NULL) {
+		/* XXX free TCE table */
+		kfree(viodev);
+		return NULL;
+	}
+	device_create_file(&viodev->dev, &dev_attr_devspec);
+
+	return viodev;
+}
+EXPORT_SYMBOL(vio_register_device_node);
+#endif
+
+#ifdef CONFIG_PPC_ISERIES
+/**
+ * vio_register_device: - Register a new vio device.
+ * @voidev:	The device to register.
+ */
+static struct vio_dev *__init vio_register_device_iseries(char *type,
+		uint32_t unit_num)
+{
+	struct vio_dev *viodev;
+
+	DBGENTER();
+
+	/* allocate a vio_dev for this node */
+	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
+	if (!viodev)
+		return NULL;
+	memset(viodev, 0, sizeof(struct vio_dev));
+
+	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num);
+
+	return vio_register_device_common(viodev, viodev->dev.bus_id, type,
+			unit_num, &vio_iommu_table);
+}
+#endif
+
+void __devinit vio_unregister_device(struct vio_dev *viodev)
+{
+	DBGENTER();
+#ifdef CONFIG_PPC_PSERIES
+	device_remove_file(&viodev->dev, &dev_attr_devspec);
+#endif
+	device_remove_file(&viodev->dev, &dev_attr_name);
+	device_unregister(&viodev->dev);
+}
+EXPORT_SYMBOL(vio_unregister_device);
+
+#ifdef CONFIG_PPC_PSERIES
+/**
+ * vio_get_attribute: - get attribute for virtual device
+ * @vdev:	The vio device to get property.
+ * @which:	The property/attribute to be extracted.
+ * @length:	Pointer to length of returned data size (unused if NULL).
+ *
+ * Calls prom.c's get_property() to return the value of the
+ * attribute specified by the preprocessor constant @which
+*/
+const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length)
+{
+	return get_property(vdev->dev.platform_data, (char*)which, length);
+}
+EXPORT_SYMBOL(vio_get_attribute);
+
+/* vio_find_name() - internal because only vio.c knows how we formatted the
+ * kobject name
+ * XXX once vio_bus_type.devices is actually used as a kset in
+ * drivers/base/bus.c, this function should be removed in favor of
+ * "device_find(kobj_name, &vio_bus_type)"
+ */
+static struct vio_dev *vio_find_name(const char *kobj_name)
+{
+	struct kobject *found;
+
+	found = kset_find_obj(&devices_subsys.kset, kobj_name);
+	if (!found)
+		return NULL;
+
+	return to_vio_dev(container_of(found, struct device, kobj));
+}
+
+/**
+ * vio_find_node - find an already-registered vio_dev
+ * @vnode: device_node of the virtual device we're looking for
+ */
+struct vio_dev *vio_find_node(struct device_node *vnode)
+{
+	uint32_t *unit_address;
+	char kobj_name[BUS_ID_SIZE];
+
+	/* construct the kobject name from the device node */
+	unit_address = (uint32_t *)get_property(vnode, "reg", NULL);
+	if (!unit_address)
+		return NULL;
+	snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address);
+
+	return vio_find_name(kobj_name);
+}
+EXPORT_SYMBOL(vio_find_node);
+
+/**
+ * vio_build_iommu_table: - gets the dma information from OF and builds the TCE tree.
+ * @dev: the virtual device.
+ *
+ * Returns a pointer to the built tce tree, or NULL if it can't
+ * find property.
+*/
+static struct iommu_table * vio_build_iommu_table(struct vio_dev *dev)
+{
+	unsigned int *dma_window;
+	struct iommu_table *newTceTable;
+	unsigned long offset;
+	int dma_window_property_size;
+
+	dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size);
+	if(!dma_window) {
+		return NULL;
+	}
+
+	newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+	/*  There should be some code to extract the phys-encoded offset
+		using prom_n_addr_cells(). However, according to a comment
+		on earlier versions, it's always zero, so we don't bother */
+	offset = dma_window[1] >>  PAGE_SHIFT;
+
+	/* TCE table size - measured in tce entries */
+	newTceTable->it_size		= dma_window[4] >> PAGE_SHIFT;
+	/* offset for VIO should always be 0 */
+	newTceTable->it_offset		= offset;
+	newTceTable->it_busno		= 0;
+	newTceTable->it_index		= (unsigned long)dma_window[0];
+	newTceTable->it_type		= TCE_VB;
+
+	return iommu_init_table(newTceTable);
+}
+
+int vio_enable_interrupts(struct vio_dev *dev)
+{
+	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
+	if (rc != H_Success) {
+		printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(vio_enable_interrupts);
+
+int vio_disable_interrupts(struct vio_dev *dev)
+{
+	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
+	if (rc != H_Success) {
+		printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(vio_disable_interrupts);
+#endif
+
+static dma_addr_t vio_map_single(struct device *dev, void *vaddr,
+			  size_t size, enum dma_data_direction direction)
+{
+	return iommu_map_single(to_vio_dev(dev)->iommu_table, vaddr, size,
+			direction);
+}
+
+static void vio_unmap_single(struct device *dev, dma_addr_t dma_handle,
+		      size_t size, enum dma_data_direction direction)
+{
+	iommu_unmap_single(to_vio_dev(dev)->iommu_table, dma_handle, size,
+			direction);
+}
+
+static int vio_map_sg(struct device *dev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	return iommu_map_sg(dev, to_vio_dev(dev)->iommu_table, sglist,
+			nelems, direction);
+}
+
+static void vio_unmap_sg(struct device *dev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
+{
+	iommu_unmap_sg(to_vio_dev(dev)->iommu_table, sglist, nelems, direction);
+}
+
+static void *vio_alloc_coherent(struct device *dev, size_t size,
+			   dma_addr_t *dma_handle, unsigned int __nocast flag)
+{
+	return iommu_alloc_coherent(to_vio_dev(dev)->iommu_table, size,
+			dma_handle, flag);
+}
+
+static void vio_free_coherent(struct device *dev, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	iommu_free_coherent(to_vio_dev(dev)->iommu_table, size, vaddr,
+			dma_handle);
+}
+
+static int vio_dma_supported(struct device *dev, u64 mask)
+{
+	return 1;
+}
+
+struct dma_mapping_ops vio_dma_ops = {
+	.alloc_coherent = vio_alloc_coherent,
+	.free_coherent = vio_free_coherent,
+	.map_single = vio_map_single,
+	.unmap_single = vio_unmap_single,
+	.map_sg = vio_map_sg,
+	.unmap_sg = vio_unmap_sg,
+	.dma_supported = vio_dma_supported,
+};
+
+static int vio_bus_match(struct device *dev, struct device_driver *drv)
+{
+	const struct vio_dev *vio_dev = to_vio_dev(dev);
+	struct vio_driver *vio_drv = to_vio_driver(drv);
+	const struct vio_device_id *ids = vio_drv->id_table;
+	const struct vio_device_id *found_id;
+
+	DBGENTER();
+
+	if (!ids)
+		return 0;
+
+	found_id = vio_match_device(ids, vio_dev);
+	if (found_id)
+		return 1;
+
+	return 0;
+}
+
+struct bus_type vio_bus_type = {
+	.name = "vio",
+	.match = vio_bus_match,
+};
+
+EXPORT_SYMBOL(vio_bus_type);
diff --git a/arch/ppc64/kernel/viopath.c b/arch/ppc64/kernel/viopath.c
new file mode 100644
index 00000000000..2ed8ee07568
--- /dev/null
+++ b/arch/ppc64/kernel/viopath.c
@@ -0,0 +1,675 @@
+/* -*- linux-c -*-
+ *  arch/ppc64/kernel/viopath.c
+ *
+ *  iSeries Virtual I/O Message Path code
+ *
+ *  Authors: Dave Boutcher <boutcher@us.ibm.com>
+ *           Ryan Arnold <ryanarn@us.ibm.com>
+ *           Colin Devilbiss <devilbis@us.ibm.com>
+ *
+ * (C) Copyright 2000-2003 IBM Corporation
+ *
+ * This code is used by the iSeries virtual disk, cd,
+ * tape, and console to communicate with OS/400 in another
+ * partition.
+ *
+ * This program is free software;  you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) anyu later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/dma-mapping.h>
+#include <linux/wait.h>
+#include <linux/seq_file.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/iSeries/HvTypes.h>
+#include <asm/iSeries/LparData.h>
+#include <asm/iSeries/HvLpEvent.h>
+#include <asm/iSeries/HvLpConfig.h>
+#include <asm/iSeries/HvCallCfg.h>
+#include <asm/iSeries/mf.h>
+#include <asm/iSeries/iSeries_proc.h>
+#include <asm/iSeries/vio.h>
+
+/* Status of the path to each other partition in the system.
+ * This is overkill, since we will only ever establish connections
+ * to our hosting partition and the primary partition on the system.
+ * But this allows for other support in the future.
+ */
+static struct viopathStatus {
+	int isOpen;		/* Did we open the path?            */
+	int isActive;		/* Do we have a mon msg outstanding */
+	int users[VIO_MAX_SUBTYPES];
+	HvLpInstanceId mSourceInst;
+	HvLpInstanceId mTargetInst;
+	int numberAllocated;
+} viopathStatus[HVMAXARCHITECTEDLPS];
+
+static DEFINE_SPINLOCK(statuslock);
+
+/*
+ * For each kind of event we allocate a buffer that is
+ * guaranteed not to cross a page boundary
+ */
+static unsigned char event_buffer[VIO_MAX_SUBTYPES * 256] __page_aligned;
+static atomic_t event_buffer_available[VIO_MAX_SUBTYPES];
+static int event_buffer_initialised;
+
+static void handleMonitorEvent(struct HvLpEvent *event);
+
+/*
+ * We use this structure to handle asynchronous responses.  The caller
+ * blocks on the semaphore and the handler posts the semaphore.  However,
+ * if system_state is not SYSTEM_RUNNING, then wait_atomic is used ...
+ */
+struct alloc_parms {
+	struct semaphore sem;
+	int number;
+	atomic_t wait_atomic;
+	int used_wait_atomic;
+};
+
+/* Put a sequence number in each mon msg.  The value is not
+ * important.  Start at something other than 0 just for
+ * readability.  wrapping this is ok.
+ */
+static u8 viomonseq = 22;
+
+/* Our hosting logical partition.  We get this at startup
+ * time, and different modules access this variable directly.
+ */
+HvLpIndex viopath_hostLp = HvLpIndexInvalid;
+EXPORT_SYMBOL(viopath_hostLp);
+HvLpIndex viopath_ourLp = HvLpIndexInvalid;
+EXPORT_SYMBOL(viopath_ourLp);
+
+/* For each kind of incoming event we set a pointer to a
+ * routine to call.
+ */
+static vio_event_handler_t *vio_handler[VIO_MAX_SUBTYPES];
+
+#define VIOPATH_KERN_WARN	KERN_WARNING "viopath: "
+#define VIOPATH_KERN_INFO	KERN_INFO "viopath: "
+
+static int proc_viopath_show(struct seq_file *m, void *v)
+{
+	char *buf;
+	u16 vlanMap;
+	dma_addr_t handle;
+	HvLpEvent_Rc hvrc;
+	DECLARE_MUTEX_LOCKED(Semaphore);
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return 0;
+	memset(buf, 0, PAGE_SIZE);
+
+	handle = dma_map_single(iSeries_vio_dev, buf, PAGE_SIZE,
+				DMA_FROM_DEVICE);
+
+	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
+			HvLpEvent_Type_VirtualIo,
+			viomajorsubtype_config | vioconfigget,
+			HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
+			viopath_sourceinst(viopath_hostLp),
+			viopath_targetinst(viopath_hostLp),
+			(u64)(unsigned long)&Semaphore, VIOVERSION << 16,
+			((u64)handle) << 32, PAGE_SIZE, 0, 0);
+
+	if (hvrc != HvLpEvent_Rc_Good)
+		printk(VIOPATH_KERN_WARN "hv error on op %d\n", (int)hvrc);
+
+	down(&Semaphore);
+
+	vlanMap = HvLpConfig_getVirtualLanIndexMap();
+
+	buf[PAGE_SIZE-1] = '\0';
+	seq_printf(m, "%s", buf);
+	seq_printf(m, "AVAILABLE_VETH=%x\n", vlanMap);
+	seq_printf(m, "SRLNBR=%c%c%c%c%c%c%c\n",
+		   e2a(xItExtVpdPanel.mfgID[2]),
+		   e2a(xItExtVpdPanel.mfgID[3]),
+		   e2a(xItExtVpdPanel.systemSerial[1]),
+		   e2a(xItExtVpdPanel.systemSerial[2]),
+		   e2a(xItExtVpdPanel.systemSerial[3]),
+		   e2a(xItExtVpdPanel.systemSerial[4]),
+		   e2a(xItExtVpdPanel.systemSerial[5]));
+
+	dma_unmap_single(iSeries_vio_dev, handle, PAGE_SIZE, DMA_FROM_DEVICE);
+	kfree(buf);
+
+	return 0;
+}
+
+static int proc_viopath_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_viopath_show, NULL);
+}
+
+static struct file_operations proc_viopath_operations = {
+	.open		= proc_viopath_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init vio_proc_init(void)
+{
+	struct proc_dir_entry *e;
+
+	e = create_proc_entry("iSeries/config", 0, NULL);
+	if (e)
+		e->proc_fops = &proc_viopath_operations;
+
+        return 0;
+}
+__initcall(vio_proc_init);
+
+/* See if a given LP is active.  Allow for invalid lps to be passed in
+ * and just return invalid
+ */
+int viopath_isactive(HvLpIndex lp)
+{
+	if (lp == HvLpIndexInvalid)
+		return 0;
+	if (lp < HVMAXARCHITECTEDLPS)
+		return viopathStatus[lp].isActive;
+	else
+		return 0;
+}
+EXPORT_SYMBOL(viopath_isactive);
+
+/*
+ * We cache the source and target instance ids for each
+ * partition.
+ */
+HvLpInstanceId viopath_sourceinst(HvLpIndex lp)
+{
+	return viopathStatus[lp].mSourceInst;
+}
+EXPORT_SYMBOL(viopath_sourceinst);
+
+HvLpInstanceId viopath_targetinst(HvLpIndex lp)
+{
+	return viopathStatus[lp].mTargetInst;
+}
+EXPORT_SYMBOL(viopath_targetinst);
+
+/*
+ * Send a monitor message.  This is a message with the acknowledge
+ * bit on that the other side will NOT explicitly acknowledge.  When
+ * the other side goes down, the hypervisor will acknowledge any
+ * outstanding messages....so we will know when the other side dies.
+ */
+static void sendMonMsg(HvLpIndex remoteLp)
+{
+	HvLpEvent_Rc hvrc;
+
+	viopathStatus[remoteLp].mSourceInst =
+		HvCallEvent_getSourceLpInstanceId(remoteLp,
+				HvLpEvent_Type_VirtualIo);
+	viopathStatus[remoteLp].mTargetInst =
+		HvCallEvent_getTargetLpInstanceId(remoteLp,
+				HvLpEvent_Type_VirtualIo);
+
+	/*
+	 * Deliberately ignore the return code here.  if we call this
+	 * more than once, we don't care.
+	 */
+	vio_setHandler(viomajorsubtype_monitor, handleMonitorEvent);
+
+	hvrc = HvCallEvent_signalLpEventFast(remoteLp, HvLpEvent_Type_VirtualIo,
+			viomajorsubtype_monitor, HvLpEvent_AckInd_DoAck,
+			HvLpEvent_AckType_DeferredAck,
+			viopathStatus[remoteLp].mSourceInst,
+			viopathStatus[remoteLp].mTargetInst,
+			viomonseq++, 0, 0, 0, 0, 0);
+
+	if (hvrc == HvLpEvent_Rc_Good)
+		viopathStatus[remoteLp].isActive = 1;
+	else {
+		printk(VIOPATH_KERN_WARN "could not connect to partition %d\n",
+				remoteLp);
+		viopathStatus[remoteLp].isActive = 0;
+	}
+}
+
+static void handleMonitorEvent(struct HvLpEvent *event)
+{
+	HvLpIndex remoteLp;
+	int i;
+
+	/*
+	 * This handler is _also_ called as part of the loop
+	 * at the end of this routine, so it must be able to
+	 * ignore NULL events...
+	 */
+	if (!event)
+		return;
+
+	/*
+	 * First see if this is just a normal monitor message from the
+	 * other partition
+	 */
+	if (event->xFlags.xFunction == HvLpEvent_Function_Int) {
+		remoteLp = event->xSourceLp;
+		if (!viopathStatus[remoteLp].isActive)
+			sendMonMsg(remoteLp);
+		return;
+	}
+
+	/*
+	 * This path is for an acknowledgement; the other partition
+	 * died
+	 */
+	remoteLp = event->xTargetLp;
+	if ((event->xSourceInstanceId != viopathStatus[remoteLp].mSourceInst) ||
+	    (event->xTargetInstanceId != viopathStatus[remoteLp].mTargetInst)) {
+		printk(VIOPATH_KERN_WARN "ignoring ack....mismatched instances\n");
+		return;
+	}
+
+	printk(VIOPATH_KERN_WARN "partition %d ended\n", remoteLp);
+
+	viopathStatus[remoteLp].isActive = 0;
+
+	/*
+	 * For each active handler, pass them a NULL
+	 * message to indicate that the other partition
+	 * died
+	 */
+	for (i = 0; i < VIO_MAX_SUBTYPES; i++) {
+		if (vio_handler[i] != NULL)
+			(*vio_handler[i])(NULL);
+	}
+}
+
+int vio_setHandler(int subtype, vio_event_handler_t *beh)
+{
+	subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT;
+	if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES))
+		return -EINVAL;
+	if (vio_handler[subtype] != NULL)
+		return -EBUSY;
+	vio_handler[subtype] = beh;
+	return 0;
+}
+EXPORT_SYMBOL(vio_setHandler);
+
+int vio_clearHandler(int subtype)
+{
+	subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT;
+	if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES))
+		return -EINVAL;
+	if (vio_handler[subtype] == NULL)
+		return -EAGAIN;
+	vio_handler[subtype] = NULL;
+	return 0;
+}
+EXPORT_SYMBOL(vio_clearHandler);
+
+static void handleConfig(struct HvLpEvent *event)
+{
+	if (!event)
+		return;
+	if (event->xFlags.xFunction == HvLpEvent_Function_Int) {
+		printk(VIOPATH_KERN_WARN
+		       "unexpected config request from partition %d",
+		       event->xSourceLp);
+
+		if ((event->xFlags.xFunction == HvLpEvent_Function_Int) &&
+		    (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck)) {
+			event->xRc = HvLpEvent_Rc_InvalidSubtype;
+			HvCallEvent_ackLpEvent(event);
+		}
+		return;
+	}
+
+	up((struct semaphore *)event->xCorrelationToken);
+}
+
+/*
+ * Initialization of the hosting partition
+ */
+void vio_set_hostlp(void)
+{
+	/*
+	 * If this has already been set then we DON'T want to either change
+	 * it or re-register the proc file system
+	 */
+	if (viopath_hostLp != HvLpIndexInvalid)
+		return;
+
+	/*
+	 * Figure out our hosting partition.  This isn't allowed to change
+	 * while we're active
+	 */
+	viopath_ourLp = HvLpConfig_getLpIndex();
+	viopath_hostLp = HvCallCfg_getHostingLpIndex(viopath_ourLp);
+
+	if (viopath_hostLp != HvLpIndexInvalid)
+		vio_setHandler(viomajorsubtype_config, handleConfig);
+}
+EXPORT_SYMBOL(vio_set_hostlp);
+
+static void vio_handleEvent(struct HvLpEvent *event, struct pt_regs *regs)
+{
+	HvLpIndex remoteLp;
+	int subtype = (event->xSubtype & VIOMAJOR_SUBTYPE_MASK)
+		>> VIOMAJOR_SUBTYPE_SHIFT;
+
+	if (event->xFlags.xFunction == HvLpEvent_Function_Int) {
+		remoteLp = event->xSourceLp;
+		/*
+		 * The isActive is checked because if the hosting partition
+		 * went down and came back up it would not be active but it
+		 * would have different source and target instances, in which
+		 * case we'd want to reset them.  This case really protects
+		 * against an unauthorized active partition sending interrupts
+		 * or acks to this linux partition.
+		 */
+		if (viopathStatus[remoteLp].isActive
+		    && (event->xSourceInstanceId !=
+			viopathStatus[remoteLp].mTargetInst)) {
+			printk(VIOPATH_KERN_WARN
+			       "message from invalid partition. "
+			       "int msg rcvd, source inst (%d) doesnt match (%d)\n",
+			       viopathStatus[remoteLp].mTargetInst,
+			       event->xSourceInstanceId);
+			return;
+		}
+
+		if (viopathStatus[remoteLp].isActive
+		    && (event->xTargetInstanceId !=
+			viopathStatus[remoteLp].mSourceInst)) {
+			printk(VIOPATH_KERN_WARN
+			       "message from invalid partition. "
+			       "int msg rcvd, target inst (%d) doesnt match (%d)\n",
+			       viopathStatus[remoteLp].mSourceInst,
+			       event->xTargetInstanceId);
+			return;
+		}
+	} else {
+		remoteLp = event->xTargetLp;
+		if (event->xSourceInstanceId !=
+		    viopathStatus[remoteLp].mSourceInst) {
+			printk(VIOPATH_KERN_WARN
+			       "message from invalid partition. "
+			       "ack msg rcvd, source inst (%d) doesnt match (%d)\n",
+			       viopathStatus[remoteLp].mSourceInst,
+			       event->xSourceInstanceId);
+			return;
+		}
+
+		if (event->xTargetInstanceId !=
+		    viopathStatus[remoteLp].mTargetInst) {
+			printk(VIOPATH_KERN_WARN
+			       "message from invalid partition. "
+			       "viopath: ack msg rcvd, target inst (%d) doesnt match (%d)\n",
+			       viopathStatus[remoteLp].mTargetInst,
+			       event->xTargetInstanceId);
+			return;
+		}
+	}
+
+	if (vio_handler[subtype] == NULL) {
+		printk(VIOPATH_KERN_WARN
+		       "unexpected virtual io event subtype %d from partition %d\n",
+		       event->xSubtype, remoteLp);
+		/* No handler.  Ack if necessary */
+		if ((event->xFlags.xFunction == HvLpEvent_Function_Int) &&
+		    (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck)) {
+			event->xRc = HvLpEvent_Rc_InvalidSubtype;
+			HvCallEvent_ackLpEvent(event);
+		}
+		return;
+	}
+
+	/* This innocuous little line is where all the real work happens */
+	(*vio_handler[subtype])(event);
+}
+
+static void viopath_donealloc(void *parm, int number)
+{
+	struct alloc_parms *parmsp = parm;
+
+	parmsp->number = number;
+	if (parmsp->used_wait_atomic)
+		atomic_set(&parmsp->wait_atomic, 0);
+	else
+		up(&parmsp->sem);
+}
+
+static int allocateEvents(HvLpIndex remoteLp, int numEvents)
+{
+	struct alloc_parms parms;
+
+	if (system_state != SYSTEM_RUNNING) {
+		parms.used_wait_atomic = 1;
+		atomic_set(&parms.wait_atomic, 1);
+	} else {
+		parms.used_wait_atomic = 0;
+		init_MUTEX_LOCKED(&parms.sem);
+	}
+	mf_allocate_lp_events(remoteLp, HvLpEvent_Type_VirtualIo, 250,	/* It would be nice to put a real number here! */
+			    numEvents, &viopath_donealloc, &parms);
+	if (system_state != SYSTEM_RUNNING) {
+		while (atomic_read(&parms.wait_atomic))
+			mb();
+	} else
+		down(&parms.sem);
+	return parms.number;
+}
+
+int viopath_open(HvLpIndex remoteLp, int subtype, int numReq)
+{
+	int i;
+	unsigned long flags;
+	int tempNumAllocated;
+
+	if ((remoteLp >= HvMaxArchitectedLps) || (remoteLp == HvLpIndexInvalid))
+		return -EINVAL;
+
+	subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT;
+	if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES))
+		return -EINVAL;
+
+	spin_lock_irqsave(&statuslock, flags);
+
+	if (!event_buffer_initialised) {
+		for (i = 0; i < VIO_MAX_SUBTYPES; i++)
+			atomic_set(&event_buffer_available[i], 1);
+		event_buffer_initialised = 1;
+	}
+
+	viopathStatus[remoteLp].users[subtype]++;
+
+	if (!viopathStatus[remoteLp].isOpen) {
+		viopathStatus[remoteLp].isOpen = 1;
+		HvCallEvent_openLpEventPath(remoteLp, HvLpEvent_Type_VirtualIo);
+
+		/*
+		 * Don't hold the spinlock during an operation that
+		 * can sleep.
+		 */
+		spin_unlock_irqrestore(&statuslock, flags);
+		tempNumAllocated = allocateEvents(remoteLp, 1);
+		spin_lock_irqsave(&statuslock, flags);
+
+		viopathStatus[remoteLp].numberAllocated += tempNumAllocated;
+
+		if (viopathStatus[remoteLp].numberAllocated == 0) {
+			HvCallEvent_closeLpEventPath(remoteLp,
+					HvLpEvent_Type_VirtualIo);
+
+			spin_unlock_irqrestore(&statuslock, flags);
+			return -ENOMEM;
+		}
+
+		viopathStatus[remoteLp].mSourceInst =
+			HvCallEvent_getSourceLpInstanceId(remoteLp,
+					HvLpEvent_Type_VirtualIo);
+		viopathStatus[remoteLp].mTargetInst =
+			HvCallEvent_getTargetLpInstanceId(remoteLp,
+					HvLpEvent_Type_VirtualIo);
+		HvLpEvent_registerHandler(HvLpEvent_Type_VirtualIo,
+					  &vio_handleEvent);
+		sendMonMsg(remoteLp);
+		printk(VIOPATH_KERN_INFO "opening connection to partition %d, "
+				"setting sinst %d, tinst %d\n",
+				remoteLp, viopathStatus[remoteLp].mSourceInst,
+				viopathStatus[remoteLp].mTargetInst);
+	}
+
+	spin_unlock_irqrestore(&statuslock, flags);
+	tempNumAllocated = allocateEvents(remoteLp, numReq);
+	spin_lock_irqsave(&statuslock, flags);
+	viopathStatus[remoteLp].numberAllocated += tempNumAllocated;
+	spin_unlock_irqrestore(&statuslock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(viopath_open);
+
+int viopath_close(HvLpIndex remoteLp, int subtype, int numReq)
+{
+	unsigned long flags;
+	int i;
+	int numOpen;
+	struct alloc_parms parms;
+
+	if ((remoteLp >= HvMaxArchitectedLps) || (remoteLp == HvLpIndexInvalid))
+		return -EINVAL;
+
+	subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT;
+	if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES))
+		return -EINVAL;
+
+	spin_lock_irqsave(&statuslock, flags);
+	/*
+	 * If the viopath_close somehow gets called before a
+	 * viopath_open it could decrement to -1 which is a non
+	 * recoverable state so we'll prevent this from
+	 * happening.
+	 */
+	if (viopathStatus[remoteLp].users[subtype] > 0)
+		viopathStatus[remoteLp].users[subtype]--;
+
+	spin_unlock_irqrestore(&statuslock, flags);
+
+	parms.used_wait_atomic = 0;
+	init_MUTEX_LOCKED(&parms.sem);
+	mf_deallocate_lp_events(remoteLp, HvLpEvent_Type_VirtualIo,
+			      numReq, &viopath_donealloc, &parms);
+	down(&parms.sem);
+
+	spin_lock_irqsave(&statuslock, flags);
+	for (i = 0, numOpen = 0; i < VIO_MAX_SUBTYPES; i++)
+		numOpen += viopathStatus[remoteLp].users[i];
+
+	if ((viopathStatus[remoteLp].isOpen) && (numOpen == 0)) {
+		printk(VIOPATH_KERN_INFO "closing connection to partition %d",
+				remoteLp);
+
+		HvCallEvent_closeLpEventPath(remoteLp,
+					     HvLpEvent_Type_VirtualIo);
+		viopathStatus[remoteLp].isOpen = 0;
+		viopathStatus[remoteLp].isActive = 0;
+
+		for (i = 0; i < VIO_MAX_SUBTYPES; i++)
+			atomic_set(&event_buffer_available[i], 0);
+		event_buffer_initialised = 0;
+	}
+	spin_unlock_irqrestore(&statuslock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(viopath_close);
+
+void *vio_get_event_buffer(int subtype)
+{
+	subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT;
+	if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES))
+		return NULL;
+
+	if (atomic_dec_if_positive(&event_buffer_available[subtype]) == 0)
+		return &event_buffer[subtype * 256];
+	else
+		return NULL;
+}
+EXPORT_SYMBOL(vio_get_event_buffer);
+
+void vio_free_event_buffer(int subtype, void *buffer)
+{
+	subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT;
+	if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES)) {
+		printk(VIOPATH_KERN_WARN
+		       "unexpected subtype %d freeing event buffer\n", subtype);
+		return;
+	}
+
+	if (atomic_read(&event_buffer_available[subtype]) != 0) {
+		printk(VIOPATH_KERN_WARN
+		       "freeing unallocated event buffer, subtype %d\n",
+		       subtype);
+		return;
+	}
+
+	if (buffer != &event_buffer[subtype * 256]) {
+		printk(VIOPATH_KERN_WARN
+		       "freeing invalid event buffer, subtype %d\n", subtype);
+	}
+
+	atomic_set(&event_buffer_available[subtype], 1);
+}
+EXPORT_SYMBOL(vio_free_event_buffer);
+
+static const struct vio_error_entry vio_no_error =
+    { 0, 0, "Non-VIO Error" };
+static const struct vio_error_entry vio_unknown_error =
+    { 0, EIO, "Unknown Error" };
+
+static const struct vio_error_entry vio_default_errors[] = {
+	{0x0001, EIO, "No Connection"},
+	{0x0002, EIO, "No Receiver"},
+	{0x0003, EIO, "No Buffer Available"},
+	{0x0004, EBADRQC, "Invalid Message Type"},
+	{0x0000, 0, NULL},
+};
+
+const struct vio_error_entry *vio_lookup_rc(
+		const struct vio_error_entry *local_table, u16 rc)
+{
+	const struct vio_error_entry *cur;
+
+	if (!rc)
+		return &vio_no_error;
+	if (local_table)
+		for (cur = local_table; cur->rc; ++cur)
+			if (cur->rc == rc)
+				return cur;
+	for (cur = vio_default_errors; cur->rc; ++cur)
+		if (cur->rc == rc)
+			return cur;
+	return &vio_unknown_error;
+}
+EXPORT_SYMBOL(vio_lookup_rc);
diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S
new file mode 100644
index 00000000000..4103cc13f8d
--- /dev/null
+++ b/arch/ppc64/kernel/vmlinux.lds.S
@@ -0,0 +1,145 @@
+#include <asm-generic/vmlinux.lds.h>
+
+OUTPUT_ARCH(powerpc:common64)
+jiffies = jiffies_64;
+SECTIONS
+{
+  /* Sections to be discarded. */
+  /DISCARD/ : {
+	*(.exitcall.exit)
+	}
+
+
+  /* Read-only sections, merged into text segment: */
+  .text : {
+	*(.text .text.*)
+	SCHED_TEXT
+	LOCK_TEXT
+	*(.fixup)
+	. = ALIGN(4096);
+	_etext = .;
+	}
+
+  __ex_table : {
+	__start___ex_table = .;
+	*(__ex_table)
+	__stop___ex_table = .;
+	}
+
+  __bug_table : {
+	__start___bug_table = .;
+	*(__bug_table)
+	__stop___bug_table = .;
+	}
+
+  __ftr_fixup : {
+	__start___ftr_fixup = .;
+	*(__ftr_fixup)
+	__stop___ftr_fixup = .;
+	}
+
+  RODATA
+
+
+  /* will be freed after init */
+  . = ALIGN(4096);
+  __init_begin = .;
+
+  .init.text : {
+	_sinittext = .;
+	*(.init.text)
+	_einittext = .;
+	}
+
+  .init.data : {
+	*(.init.data)
+	}
+
+  . = ALIGN(16);
+  .init.setup : {
+	__setup_start = .;
+	*(.init.setup)
+	__setup_end = .;
+	}
+
+  .initcall.init : {
+	__initcall_start = .;
+	*(.initcall1.init)
+	*(.initcall2.init)
+	*(.initcall3.init)
+	*(.initcall4.init)
+	*(.initcall5.init)
+	*(.initcall6.init)
+	*(.initcall7.init)
+	__initcall_end = .;
+	}
+
+  .con_initcall.init : {
+	__con_initcall_start = .;
+	*(.con_initcall.init)
+	__con_initcall_end = .;
+	}
+
+  SECURITY_INIT
+
+  . = ALIGN(4096);
+  .init.ramfs : {
+	__initramfs_start = .;
+	*(.init.ramfs)
+	__initramfs_end = .;
+	}
+
+  .data.percpu : {
+	__per_cpu_start = .;
+	*(.data.percpu)
+	__per_cpu_end = .;
+	}
+
+  . = ALIGN(16384);
+  __init_end = .;
+  /* freed after init ends here */
+
+
+  /* Read/write sections */
+  . = ALIGN(16384);
+  /* The initial task and kernel stack */
+  .data.init_task : {
+	*(.data.init_task)
+	}
+
+  .data.page_aligned : {
+	*(.data.page_aligned)
+	}
+
+  .data.cacheline_aligned : {
+	*(.data.cacheline_aligned)
+	}
+
+  .data : {
+	*(.data .data.rel* .toc1)
+	*(.branch_lt)
+	}
+
+  .opd : {
+	*(.opd)
+	}
+
+  .got : {
+	__toc_start = .;
+	*(.got)
+	*(.toc)
+	. = ALIGN(4096);
+	_edata = .;
+	}
+
+
+  . = ALIGN(4096);
+  .bss : {
+	__bss_start = .;
+	*(.bss)
+	__bss_stop = .;
+	}
+
+  . = ALIGN(4096);
+  _end = . ;
+}
diff --git a/arch/ppc64/kernel/xics.c b/arch/ppc64/kernel/xics.c
new file mode 100644
index 00000000000..eedd1d3c2a1
--- /dev/null
+++ b/arch/ppc64/kernel/xics.c
@@ -0,0 +1,713 @@
+/* 
+ * arch/ppc64/kernel/xics.c
+ *
+ * Copyright 2000 IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/signal.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/radix-tree.h>
+#include <linux/cpu.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/smp.h>
+#include <asm/rtas.h>
+#include <asm/xics.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+
+#include "i8259.h"
+
+static unsigned int xics_startup(unsigned int irq);
+static void xics_enable_irq(unsigned int irq);
+static void xics_disable_irq(unsigned int irq);
+static void xics_mask_and_ack_irq(unsigned int irq);
+static void xics_end_irq(unsigned int irq);
+static void xics_set_affinity(unsigned int irq_nr, cpumask_t cpumask);
+
+struct hw_interrupt_type xics_pic = {
+	.typename = " XICS     ",
+	.startup = xics_startup,
+	.enable = xics_enable_irq,
+	.disable = xics_disable_irq,
+	.ack = xics_mask_and_ack_irq,
+	.end = xics_end_irq,
+	.set_affinity = xics_set_affinity
+};
+
+struct hw_interrupt_type xics_8259_pic = {
+	.typename = " XICS/8259",
+	.ack = xics_mask_and_ack_irq,
+};
+
+/* This is used to map real irq numbers to virtual */
+static struct radix_tree_root irq_map = RADIX_TREE_INIT(GFP_ATOMIC);
+
+#define XICS_IPI		2
+#define XICS_IRQ_SPURIOUS	0
+
+/* Want a priority other than 0.  Various HW issues require this. */
+#define	DEFAULT_PRIORITY	5
+
+/* 
+ * Mark IPIs as higher priority so we can take them inside interrupts that
+ * arent marked SA_INTERRUPT
+ */
+#define IPI_PRIORITY		4
+
+struct xics_ipl {
+	union {
+		u32 word;
+		u8 bytes[4];
+	} xirr_poll;
+	union {
+		u32 word;
+		u8 bytes[4];
+	} xirr;
+	u32 dummy;
+	union {
+		u32 word;
+		u8 bytes[4];
+	} qirr;
+};
+
+static struct xics_ipl __iomem *xics_per_cpu[NR_CPUS];
+
+static int xics_irq_8259_cascade = 0;
+static int xics_irq_8259_cascade_real = 0;
+static unsigned int default_server = 0xFF;
+/* also referenced in smp.c... */
+unsigned int default_distrib_server = 0;
+unsigned int interrupt_server_size = 8;
+
+/*
+ * XICS only has a single IPI, so encode the messages per CPU
+ */
+struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned;
+
+/* RTAS service tokens */
+int ibm_get_xive;
+int ibm_set_xive;
+int ibm_int_on;
+int ibm_int_off;
+
+typedef struct {
+	int (*xirr_info_get)(int cpu);
+	void (*xirr_info_set)(int cpu, int val);
+	void (*cppr_info)(int cpu, u8 val);
+	void (*qirr_info)(int cpu, u8 val);
+} xics_ops;
+
+
+/* SMP */
+
+static int pSeries_xirr_info_get(int n_cpu)
+{
+	return in_be32(&xics_per_cpu[n_cpu]->xirr.word);
+}
+
+static void pSeries_xirr_info_set(int n_cpu, int value)
+{
+	out_be32(&xics_per_cpu[n_cpu]->xirr.word, value);
+}
+
+static void pSeries_cppr_info(int n_cpu, u8 value)
+{
+	out_8(&xics_per_cpu[n_cpu]->xirr.bytes[0], value);
+}
+
+static void pSeries_qirr_info(int n_cpu, u8 value)
+{
+	out_8(&xics_per_cpu[n_cpu]->qirr.bytes[0], value);
+}
+
+static xics_ops pSeries_ops = {
+	pSeries_xirr_info_get,
+	pSeries_xirr_info_set,
+	pSeries_cppr_info,
+	pSeries_qirr_info
+};
+
+static xics_ops *ops = &pSeries_ops;
+
+
+/* LPAR */
+
+static inline long plpar_eoi(unsigned long xirr)
+{
+	return plpar_hcall_norets(H_EOI, xirr);
+}
+
+static inline long plpar_cppr(unsigned long cppr)
+{
+	return plpar_hcall_norets(H_CPPR, cppr);
+}
+
+static inline long plpar_ipi(unsigned long servernum, unsigned long mfrr)
+{
+	return plpar_hcall_norets(H_IPI, servernum, mfrr);
+}
+
+static inline long plpar_xirr(unsigned long *xirr_ret)
+{
+	unsigned long dummy;
+	return plpar_hcall(H_XIRR, 0, 0, 0, 0, xirr_ret, &dummy, &dummy);
+}
+
+static int pSeriesLP_xirr_info_get(int n_cpu)
+{
+	unsigned long lpar_rc;
+	unsigned long return_value; 
+
+	lpar_rc = plpar_xirr(&return_value);
+	if (lpar_rc != H_Success)
+		panic(" bad return code xirr - rc = %lx \n", lpar_rc); 
+	return (int)return_value;
+}
+
+static void pSeriesLP_xirr_info_set(int n_cpu, int value)
+{
+	unsigned long lpar_rc;
+	unsigned long val64 = value & 0xffffffff;
+
+	lpar_rc = plpar_eoi(val64);
+	if (lpar_rc != H_Success)
+		panic("bad return code EOI - rc = %ld, value=%lx\n", lpar_rc,
+		      val64); 
+}
+
+void pSeriesLP_cppr_info(int n_cpu, u8 value)
+{
+	unsigned long lpar_rc;
+
+	lpar_rc = plpar_cppr(value);
+	if (lpar_rc != H_Success)
+		panic("bad return code cppr - rc = %lx\n", lpar_rc); 
+}
+
+static void pSeriesLP_qirr_info(int n_cpu , u8 value)
+{
+	unsigned long lpar_rc;
+
+	lpar_rc = plpar_ipi(get_hard_smp_processor_id(n_cpu), value);
+	if (lpar_rc != H_Success)
+		panic("bad return code qirr - rc = %lx\n", lpar_rc); 
+}
+
+xics_ops pSeriesLP_ops = {
+	pSeriesLP_xirr_info_get,
+	pSeriesLP_xirr_info_set,
+	pSeriesLP_cppr_info,
+	pSeriesLP_qirr_info
+};
+
+static unsigned int xics_startup(unsigned int virq)
+{
+	unsigned int irq;
+
+	irq = irq_offset_down(virq);
+	if (radix_tree_insert(&irq_map, virt_irq_to_real(irq),
+			      &virt_irq_to_real_map[irq]) == -ENOMEM)
+		printk(KERN_CRIT "Out of memory creating real -> virtual"
+		       " IRQ mapping for irq %u (real 0x%x)\n",
+		       virq, virt_irq_to_real(irq));
+	xics_enable_irq(virq);
+	return 0;	/* return value is ignored */
+}
+
+static unsigned int real_irq_to_virt(unsigned int real_irq)
+{
+	unsigned int *ptr;
+
+	ptr = radix_tree_lookup(&irq_map, real_irq);
+	if (ptr == NULL)
+		return NO_IRQ;
+	return ptr - virt_irq_to_real_map;
+}
+
+#ifdef CONFIG_SMP
+static int get_irq_server(unsigned int irq)
+{
+	unsigned int server;
+	/* For the moment only implement delivery to all cpus or one cpu */
+	cpumask_t cpumask = irq_affinity[irq];
+	cpumask_t tmp = CPU_MASK_NONE;
+
+	if (!distribute_irqs)
+		return default_server;
+
+	if (cpus_equal(cpumask, CPU_MASK_ALL)) {
+		server = default_distrib_server;
+	} else {
+		cpus_and(tmp, cpu_online_map, cpumask);
+
+		if (cpus_empty(tmp))
+			server = default_distrib_server;
+		else
+			server = get_hard_smp_processor_id(first_cpu(tmp));
+	}
+
+	return server;
+
+}
+#else
+static int get_irq_server(unsigned int irq)
+{
+	return default_server;
+}
+#endif
+
+static void xics_enable_irq(unsigned int virq)
+{
+	unsigned int irq;
+	int call_status;
+	unsigned int server;
+
+	irq = virt_irq_to_real(irq_offset_down(virq));
+	if (irq == XICS_IPI)
+		return;
+
+	server = get_irq_server(virq);
+	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server,
+				DEFAULT_PRIORITY);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_enable_irq: irq=%d: ibm_set_xive "
+		       "returned %x\n", irq, call_status);
+		return;
+	}
+
+	/* Now unmask the interrupt (often a no-op) */
+	call_status = rtas_call(ibm_int_on, 1, 1, NULL, irq);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_enable_irq: irq=%d: ibm_int_on "
+		       "returned %x\n", irq, call_status);
+		return;
+	}
+}
+
+static void xics_disable_real_irq(unsigned int irq)
+{
+	int call_status;
+	unsigned int server;
+
+	if (irq == XICS_IPI)
+		return;
+
+	call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_disable_real_irq: irq=%d: "
+		       "ibm_int_off returned %x\n", irq, call_status);
+		return;
+	}
+
+	server = get_irq_server(irq);
+	/* Have to set XIVE to 0xff to be able to remove a slot */
+	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server, 0xff);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_disable_irq: irq=%d: ibm_set_xive(0xff)"
+		       " returned %x\n", irq, call_status);
+		return;
+	}
+}
+
+static void xics_disable_irq(unsigned int virq)
+{
+	unsigned int irq;
+
+	irq = virt_irq_to_real(irq_offset_down(virq));
+	xics_disable_real_irq(irq);
+}
+
+static void xics_end_irq(unsigned int irq)
+{
+	int cpu = smp_processor_id();
+
+	iosync();
+	ops->xirr_info_set(cpu, ((0xff << 24) |
+				 (virt_irq_to_real(irq_offset_down(irq)))));
+
+}
+
+static void xics_mask_and_ack_irq(unsigned int irq)
+{
+	int cpu = smp_processor_id();
+
+	if (irq < irq_offset_value()) {
+		i8259_pic.ack(irq);
+		iosync();
+		ops->xirr_info_set(cpu, ((0xff<<24) |
+					 xics_irq_8259_cascade_real));
+		iosync();
+	}
+}
+
+int xics_get_irq(struct pt_regs *regs)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned int vec;
+	int irq;
+
+	vec = ops->xirr_info_get(cpu);
+	/*  (vec >> 24) == old priority */
+	vec &= 0x00ffffff;
+
+	/* for sanity, this had better be < NR_IRQS - 16 */
+	if (vec == xics_irq_8259_cascade_real) {
+		irq = i8259_irq(cpu);
+		if (irq == -1) {
+			/* Spurious cascaded interrupt.  Still must ack xics */
+			xics_end_irq(irq_offset_up(xics_irq_8259_cascade));
+
+			irq = -1;
+		}
+	} else if (vec == XICS_IRQ_SPURIOUS) {
+		irq = -1;
+	} else {
+		irq = real_irq_to_virt(vec);
+		if (irq == NO_IRQ)
+			irq = real_irq_to_virt_slowpath(vec);
+		if (irq == NO_IRQ) {
+			printk(KERN_ERR "Interrupt %d (real) is invalid,"
+			       " disabling it.\n", vec);
+			xics_disable_real_irq(vec);
+		} else
+			irq = irq_offset_up(irq);
+	}
+	return irq;
+}
+
+#ifdef CONFIG_SMP
+
+irqreturn_t xics_ipi_action(int irq, void *dev_id, struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+
+	ops->qirr_info(cpu, 0xff);
+
+	WARN_ON(cpu_is_offline(cpu));
+
+	while (xics_ipi_message[cpu].value) {
+		if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_CALL_FUNCTION, regs);
+		}
+		if (test_and_clear_bit(PPC_MSG_RESCHEDULE,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_RESCHEDULE, regs);
+		}
+#if 0
+		if (test_and_clear_bit(PPC_MSG_MIGRATE_TASK,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_MIGRATE_TASK, regs);
+		}
+#endif
+#ifdef CONFIG_DEBUGGER
+		if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK,
+				       &xics_ipi_message[cpu].value)) {
+			mb();
+			smp_message_recv(PPC_MSG_DEBUGGER_BREAK, regs);
+		}
+#endif
+	}
+	return IRQ_HANDLED;
+}
+
+void xics_cause_IPI(int cpu)
+{
+	ops->qirr_info(cpu, IPI_PRIORITY);
+}
+
+void xics_setup_cpu(void)
+{
+	int cpu = smp_processor_id();
+
+	ops->cppr_info(cpu, 0xff);
+	iosync();
+}
+
+#endif /* CONFIG_SMP */
+
+void xics_init_IRQ(void)
+{
+	int i;
+	unsigned long intr_size = 0;
+	struct device_node *np;
+	uint *ireg, ilen, indx = 0;
+	unsigned long intr_base = 0;
+	struct xics_interrupt_node {
+		unsigned long addr;
+		unsigned long size;
+	} intnodes[NR_CPUS]; 
+
+	ppc64_boot_msg(0x20, "XICS Init");
+
+	ibm_get_xive = rtas_token("ibm,get-xive");
+	ibm_set_xive = rtas_token("ibm,set-xive");
+	ibm_int_on  = rtas_token("ibm,int-on");
+	ibm_int_off = rtas_token("ibm,int-off");
+
+	np = of_find_node_by_type(NULL, "PowerPC-External-Interrupt-Presentation");
+	if (!np)
+		panic("xics_init_IRQ: can't find interrupt presentation");
+
+nextnode:
+	ireg = (uint *)get_property(np, "ibm,interrupt-server-ranges", NULL);
+	if (ireg) {
+		/*
+		 * set node starting index for this node
+		 */
+		indx = *ireg;
+	}
+
+	ireg = (uint *)get_property(np, "reg", &ilen);
+	if (!ireg)
+		panic("xics_init_IRQ: can't find interrupt reg property");
+	
+	while (ilen) {
+		intnodes[indx].addr = (unsigned long)*ireg++ << 32;
+		ilen -= sizeof(uint);
+		intnodes[indx].addr |= *ireg++;
+		ilen -= sizeof(uint);
+		intnodes[indx].size = (unsigned long)*ireg++ << 32;
+		ilen -= sizeof(uint);
+		intnodes[indx].size |= *ireg++;
+		ilen -= sizeof(uint);
+		indx++;
+		if (indx >= NR_CPUS) break;
+	}
+
+	np = of_find_node_by_type(np, "PowerPC-External-Interrupt-Presentation");
+	if ((indx < NR_CPUS) && np) goto nextnode;
+
+	/* Find the server numbers for the boot cpu. */
+	for (np = of_find_node_by_type(NULL, "cpu");
+	     np;
+	     np = of_find_node_by_type(np, "cpu")) {
+		ireg = (uint *)get_property(np, "reg", &ilen);
+		if (ireg && ireg[0] == boot_cpuid_phys) {
+			ireg = (uint *)get_property(np, "ibm,ppc-interrupt-gserver#s",
+						    &ilen);
+			i = ilen / sizeof(int);
+			if (ireg && i > 0) {
+				default_server = ireg[0];
+				default_distrib_server = ireg[i-1]; /* take last element */
+			}
+			ireg = (uint *)get_property(np,
+					"ibm,interrupt-server#-size", NULL);
+			if (ireg)
+				interrupt_server_size = *ireg;
+			break;
+		}
+	}
+	of_node_put(np);
+
+	intr_base = intnodes[0].addr;
+	intr_size = intnodes[0].size;
+
+	np = of_find_node_by_type(NULL, "interrupt-controller");
+	if (!np) {
+		printk(KERN_WARNING "xics: no ISA interrupt controller\n");
+		xics_irq_8259_cascade_real = -1;
+		xics_irq_8259_cascade = -1;
+	} else {
+		ireg = (uint *) get_property(np, "interrupts", NULL);
+		if (!ireg)
+			panic("xics_init_IRQ: can't find ISA interrupts property");
+
+		xics_irq_8259_cascade_real = *ireg;
+		xics_irq_8259_cascade
+			= virt_irq_create_mapping(xics_irq_8259_cascade_real);
+		of_node_put(np);
+	}
+
+	if (systemcfg->platform == PLATFORM_PSERIES) {
+#ifdef CONFIG_SMP
+		for_each_cpu(i) {
+			int hard_id;
+
+			/* FIXME: Do this dynamically! --RR */
+			if (!cpu_present(i))
+				continue;
+
+			hard_id = get_hard_smp_processor_id(i);
+			xics_per_cpu[i] = ioremap(intnodes[hard_id].addr, 
+						  intnodes[hard_id].size);
+		}
+#else
+		xics_per_cpu[0] = ioremap(intr_base, intr_size);
+#endif /* CONFIG_SMP */
+	} else if (systemcfg->platform == PLATFORM_PSERIES_LPAR) {
+		ops = &pSeriesLP_ops;
+	}
+
+	xics_8259_pic.enable = i8259_pic.enable;
+	xics_8259_pic.disable = i8259_pic.disable;
+	for (i = 0; i < 16; ++i)
+		get_irq_desc(i)->handler = &xics_8259_pic;
+	for (; i < NR_IRQS; ++i)
+		get_irq_desc(i)->handler = &xics_pic;
+
+	ops->cppr_info(boot_cpuid, 0xff);
+	iosync();
+
+	ppc64_boot_msg(0x21, "XICS Done");
+}
+
+/*
+ * We cant do this in init_IRQ because we need the memory subsystem up for
+ * request_irq()
+ */
+static int __init xics_setup_i8259(void)
+{
+	if (ppc64_interrupt_controller == IC_PPC_XIC &&
+	    xics_irq_8259_cascade != -1) {
+		if (request_irq(irq_offset_up(xics_irq_8259_cascade),
+				no_action, 0, "8259 cascade", NULL))
+			printk(KERN_ERR "xics_setup_i8259: couldn't get 8259 "
+					"cascade\n");
+		i8259_init(0);
+	}
+	return 0;
+}
+arch_initcall(xics_setup_i8259);
+
+#ifdef CONFIG_SMP
+void xics_request_IPIs(void)
+{
+	virt_irq_to_real_map[XICS_IPI] = XICS_IPI;
+
+	/* IPIs are marked SA_INTERRUPT as they must run with irqs disabled */
+	request_irq(irq_offset_up(XICS_IPI), xics_ipi_action, SA_INTERRUPT,
+		    "IPI", NULL);
+	get_irq_desc(irq_offset_up(XICS_IPI))->status |= IRQ_PER_CPU;
+}
+#endif
+
+static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
+{
+	unsigned int irq;
+	int status;
+	int xics_status[2];
+	unsigned long newmask;
+	cpumask_t tmp = CPU_MASK_NONE;
+
+	irq = virt_irq_to_real(irq_offset_down(virq));
+	if (irq == XICS_IPI || irq == NO_IRQ)
+		return;
+
+	status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
+
+	if (status) {
+		printk(KERN_ERR "xics_set_affinity: irq=%d ibm,get-xive "
+		       "returns %d\n", irq, status);
+		return;
+	}
+
+	/* For the moment only implement delivery to all cpus or one cpu */
+	if (cpus_equal(cpumask, CPU_MASK_ALL)) {
+		newmask = default_distrib_server;
+	} else {
+		cpus_and(tmp, cpu_online_map, cpumask);
+		if (cpus_empty(tmp))
+			return;
+		newmask = get_hard_smp_processor_id(first_cpu(tmp));
+	}
+
+	status = rtas_call(ibm_set_xive, 3, 1, NULL,
+				irq, newmask, xics_status[1]);
+
+	if (status) {
+		printk(KERN_ERR "xics_set_affinity: irq=%d ibm,set-xive "
+		       "returns %d\n", irq, status);
+		return;
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Interrupts are disabled. */
+void xics_migrate_irqs_away(void)
+{
+	int status;
+	unsigned int irq, virq, cpu = smp_processor_id();
+
+	/* Reject any interrupt that was queued to us... */
+	ops->cppr_info(cpu, 0);
+	iosync();
+
+	/* remove ourselves from the global interrupt queue */
+	status = rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE,
+		(1UL << interrupt_server_size) - 1 - default_distrib_server, 0);
+	WARN_ON(status < 0);
+
+	/* Allow IPIs again... */
+	ops->cppr_info(cpu, DEFAULT_PRIORITY);
+	iosync();
+
+	for_each_irq(virq) {
+		irq_desc_t *desc;
+		int xics_status[2];
+		unsigned long flags;
+
+		/* We cant set affinity on ISA interrupts */
+		if (virq < irq_offset_value())
+			continue;
+
+		desc = get_irq_desc(virq);
+		irq = virt_irq_to_real(irq_offset_down(virq));
+
+		/* We need to get IPIs still. */
+		if (irq == XICS_IPI || irq == NO_IRQ)
+			continue;
+
+		/* We only need to migrate enabled IRQS */
+		if (desc == NULL || desc->handler == NULL
+		    || desc->action == NULL
+		    || desc->handler->set_affinity == NULL)
+			continue;
+
+		spin_lock_irqsave(&desc->lock, flags);
+
+		status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
+		if (status) {
+			printk(KERN_ERR "migrate_irqs_away: irq=%d "
+					"ibm,get-xive returns %d\n",
+					virq, status);
+			goto unlock;
+		}
+
+		/*
+		 * We only support delivery to all cpus or to one cpu.
+		 * The irq has to be migrated only in the single cpu
+		 * case.
+		 */
+		if (xics_status[0] != get_hard_smp_processor_id(cpu))
+			goto unlock;
+
+		printk(KERN_WARNING "IRQ %d affinity broken off cpu %u\n",
+		       virq, cpu);
+
+		/* Reset affinity to all cpus */
+		desc->handler->set_affinity(virq, CPU_MASK_ALL);
+		irq_affinity[virq] = CPU_MASK_ALL;
+unlock:
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+}
+#endif
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ppc64/kernel