10 files changed, 2323 insertions, 0 deletions
diff --git a/arch/parisc/lib/Makefile b/arch/parisc/lib/Makefile
new file mode 100644
index 00000000000..7bf70567629
--- /dev/null
+++ b/arch/parisc/lib/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for parisc-specific library files
+#
+# Objects collected under lib-y:
+lib-y	:= lusercopy.o bitops.o checksum.o io.o memset.o fixup.o memcpy.o
+# iomap.o is listed under obj-y instead of lib-y:
+obj-y	:= iomap.o
+# debuglocks.o joins lib-y only when CONFIG_SMP expands to "y":
+lib-$(CONFIG_SMP) += debuglocks.o
diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c
new file mode 100644
index 00000000000..2de182f6fe8
--- /dev/null
+++ b/arch/parisc/lib/bitops.c
@@ -0,0 +1,84 @@
+/*
+ * bitops.c: atomic operations which got too long to be inlined all over
+ *      the place.
+ * 
+ * Copyright 1999 Philipp Rumpf (prumpf@tux.org)
+ * Copyright 2000 Grant Grundler (grundler@cup.hp.com)
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
+#include <asm/atomic.h>
+
+#ifdef CONFIG_SMP	/* the lock table below is only built for SMP kernels */
+spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = {
+	[0 ... (ATOMIC_HASH_SIZE-1)]  = SPIN_LOCK_UNLOCKED	/* every slot starts out unlocked */
+};	/* NOTE(review): presumably the table the _atomic_spin_lock_*() helpers pick from -- confirm in <asm/atomic.h> */
+#endif /* CONFIG_SMP */
+
+#ifdef __LP64__
+/* Store x into *ptr and return what *ptr held before, all under the atomic lock for ptr. */
+unsigned long __xchg64(unsigned long x, unsigned long *ptr)
+{
+	unsigned long flags, previous;
+	_atomic_spin_lock_irqsave(ptr, flags);
+	previous = *ptr;
+	*ptr = x;
+	_atomic_spin_unlock_irqrestore(ptr, flags);
+	return previous;
+}
+#endif /* __LP64__ */
+
+/* Store x into the int at *ptr and return the old value, under the atomic lock for ptr. */
+unsigned long __xchg32(int x, int *ptr)
+{
+	unsigned long flags;
+	long previous;
+	_atomic_spin_lock_irqsave(ptr, flags);
+	previous = (long) *ptr;	/* XXX - sign extension wanted? (int is widened to long here) */
+	*ptr = x;
+	_atomic_spin_unlock_irqrestore(ptr, flags);
+	return (unsigned long)previous;
+}
+
+
+/* Store x into the char at *ptr and return the old value, under the atomic lock for ptr. */
+unsigned long __xchg8(char x, char *ptr)
+{
+	unsigned long flags;
+	long previous;
+	_atomic_spin_lock_irqsave(ptr, flags);
+	previous = (long) *ptr;	/* XXX - sign extension wanted? (char is widened to long here) */
+	*ptr = x;
+	_atomic_spin_unlock_irqrestore(ptr, flags);
+	return (unsigned long)previous;
+}
+
+
+#ifdef __LP64__
+/* If *ptr equals old, store new; always return the value that was read from *ptr. */
+unsigned long __cmpxchg_u64(volatile unsigned long *ptr, unsigned long old, unsigned long new)
+{
+	unsigned long flags, seen;
+	_atomic_spin_lock_irqsave(ptr, flags);
+	seen = *ptr;
+	if (seen == old)
+		*ptr = new;
+	_atomic_spin_unlock_irqrestore(ptr, flags);
+	return seen;
+}
+#endif /* __LP64__ */
+/* 32-bit compare-and-exchange: write new only if *ptr == old; returns the value read. */
+unsigned long __cmpxchg_u32(volatile unsigned int *ptr, unsigned int old, unsigned int new)
+{
+	unsigned long flags;
+	unsigned int seen;
+	_atomic_spin_lock_irqsave(ptr, flags);
+	seen = *ptr;
+	if (seen == old)
+		*ptr = new;
+	_atomic_spin_unlock_irqrestore(ptr, flags);
+	return (unsigned long)seen;
+}
diff --git a/arch/parisc/lib/checksum.c b/arch/parisc/lib/checksum.c
new file mode 100644
index 00000000000..8a1e08068e7
--- /dev/null
+++ b/arch/parisc/lib/checksum.c
@@ -0,0 +1,148 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IP/TCP/UDP checksumming routines for PA-RISC (derived from the MIPS version)
+ *
+ * Authors:	Ralf Baechle, <ralf@waldorf-gmbh.de>
+ *		Lots of code moved from tcp.c and ip.c; see those files
+ *		for more names.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * $Id: checksum.c,v 1.3 1997/12/01 17:57:34 ralf Exp $
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <net/checksum.h>
+#include <asm/byteorder.h>
+#include <asm/string.h>
+#include <asm/uaccess.h>
+/* _t += _r, then add the carry out of that add back in.  No trailing ';': the caller supplies it. */
+#define addc(_t,_r)                     \
+	__asm__ __volatile__ (          \
+"       add             %0, %1, %0\n"   \
+"       addc            %0, %%r0, %0\n" \
+	: "=r"(_t)                      \
+	: "r"(_r), "0"(_t))
+
+static inline unsigned short from32to16(unsigned int x)
+{
+	/* add the upper half to the lower half; the sum may carry into bit 16 */
+	unsigned int folded = (x >> 16) + (x & 0xffff);
+	/* add once more so that carry is absorbed into the low 16 bits */
+	folded = (folded >> 16) + (folded & 0xffff);
+	return (unsigned short)folded;
+}
+/* Sum len bytes at buff as 16-bit words with end-around carry; returns the sum folded to 16 bits. */
+static inline unsigned int do_csum(const unsigned char * buff, int len)
+{
+	int odd, count;
+	unsigned int result = 0;
+
+	if (len <= 0)
+		goto out;	/* nothing to sum: the result stays 0 */
+	odd = 1 & (unsigned long) buff;	/* does buff start on an odd address? */
+	if (odd) {
+		result = be16_to_cpu(*buff);	/* consume the single leading byte */
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {	/* one 16-bit word to reach 4-byte alignment */
+			result += *(unsigned short *) buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;		/* nr of 32-bit words.. */
+		if (count) {
+			while (count >= 4) {	/* unrolled: four 32-bit words per pass */
+				unsigned int r1, r2, r3, r4;
+				r1 = *(unsigned int *)(buff + 0);
+				r2 = *(unsigned int *)(buff + 4);
+				r3 = *(unsigned int *)(buff + 8);
+				r4 = *(unsigned int *)(buff + 12);
+				addc(result, r1);
+				addc(result, r2);
+				addc(result, r3);
+				addc(result, r4);
+				count -= 4;
+				buff += 16;
+			}
+			while (count) {	/* remaining 32-bit words, one at a time */
+				unsigned int w = *(unsigned int *) buff;
+				count--;
+				buff += 4;
+				addc(result, w);
+			}
+			result = (result & 0xffff) + (result >> 16);	/* fold so the additions below cannot overflow */
+		}
+		if (len & 2) {	/* a trailing 16-bit word */
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)	/* a trailing single byte */
+		result += le16_to_cpu(*buff);
+	result = from32to16(result);
+	if (odd)	/* odd start address: swap the two bytes of the folded sum */
+		result = swab16(result);
+out:
+	return result;
+}
+
+/*
+ * Partial checksum (e.g. for TCP/UDP fragments): do_csum(buff, len) plus sum, folded to 16 bits.
+ */
+unsigned int csum_partial(const unsigned char *buff, int len, unsigned int sum)
+{
+	unsigned int partial = do_csum(buff, len);
+	addc(sum, partial);
+	return from32to16(sum);
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * Checksum len bytes at src (continuing from sum) and copy them to dst.
+ */
+unsigned int csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst,
+				       int len, unsigned int sum)
+{
+	/*
+	 * Done as two passes (checksum first, then memcpy) instead of one
+	 * fused loop, so this is slower than a real copy-and-checksum.
+	 */
+	unsigned int csum = csum_partial(src, len, sum);
+
+	memcpy(dst, src, len);
+	return csum;
+}
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+
+/*
+ * Copy len bytes from user space to dst, then checksum dst.  If part of
+ * the copy faults, the uncopied tail of dst is zero-filled and -EFAULT is
+ * stored through err_ptr; err_ptr is not written when the copy succeeds.
+ */
+unsigned int csum_partial_copy_from_user(const unsigned char __user *src,
+					unsigned char *dst, int len,
+					unsigned int sum, int *err_ptr)
+{
+	int uncopied = copy_from_user(dst, src, len);
+
+	if (uncopied != 0) {
+		/* the bytes that could not be copied are the last ones */
+		memset(&dst[len - uncopied], 0, uncopied);
+		*err_ptr = -EFAULT;
+	}
+	return csum_partial(dst, len, sum);
+}
+EXPORT_SYMBOL(csum_partial_copy_from_user);
diff --git a/arch/parisc/lib/debuglocks.c b/arch/parisc/lib/debuglocks.c
new file mode 100644
index 00000000000..1b33fe6e5b7
--- /dev/null
+++ b/arch/parisc/lib/debuglocks.c
@@ -0,0 +1,277 @@
+/* 
+ *    Debugging versions of SMP locking primitives.
+ *
+ *    Copyright (C) 2004 Thibaut VARENE <varenet@parisc-linux.org>
+ *
+ *    Some code stolen from alpha & sparc64 ;)
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    We use pdc_printf() throughout the file for all output messages, to avoid
+ *    losing messages because of disabled interrupts. Since we're using these
+ *    messages for debugging purposes, it makes sense not to send them to the
+ *    linux console.
+ */
+
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>	/* in_interrupt() */
+#include <asm/system.h>
+#include <asm/hardirq.h>	/* in_interrupt() */
+#include <asm/pdc.h>
+
+#undef INIT_STUCK
+#define INIT_STUCK (1L << 30)	/* initial value of the "stuck" countdowns; parenthesized so it is safe in any expression */
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+
+/* Debug spin_lock: spin until the lock word is obtained, report via pdc_printf() each time the "stuck" countdown runs out, then record the new owner. */
+void _dbg_spin_lock(spinlock_t * lock, const char *base_file, int line_no)
+{
+	volatile unsigned int *a;
+	long stuck = INIT_STUCK;
+	void *inline_pc = __builtin_return_address(0);
+	unsigned long started = jiffies;
+	int printed = 0;
+	int cpu = smp_processor_id();
+
+try_again:
+
+	/* Do the actual locking */
+	/* <T-Bone> ggg: we can't get stuck on the outer loop?
+	 * <ggg> T-Bone: We can hit the outer loop
+	 *	a lot if multiple CPUs are constantly racing for a lock
+	 *	and the backplane is NOT fair about which CPU sees
+	 *	the update first. But it won't hang since every failed
+	 *	attempt will drop us back into the inner loop and
+	 *	decrement `stuck'.
+	 * <ggg> K-class and some of the others are NOT fair in the HW
+	 * 	implementation so we could see false positives.
+	 * 	But fixing the lock contention is easier than
+	 * 	fixing the HW to be fair.
+	 * <tausq> __ldcw() returns 1 if we get the lock; otherwise we
+	 * 	spin until the value of the lock changes, or we time out.
+	 */
+	mb();
+	a = __ldcw_align(lock);
+	while (stuck && (__ldcw(a) == 0))	/* outer loop: try to take the lock word */
+		while ((*a == 0) && --stuck);	/* inner loop: plain reads until it changes or the countdown ends */
+	mb();
+
+	if (unlikely(stuck <= 0)) {	/* countdown ran out without the lock: report holder, start over */
+		pdc_printf(
+			"%s:%d: spin_lock(%s/%p) stuck in %s at %p(%d)"
+			" owned by %s:%d in %s at %p(%d)\n",
+			base_file, line_no, lock->module, lock,
+			current->comm, inline_pc, cpu,
+			lock->bfile, lock->bline, lock->task->comm,
+			lock->previous, lock->oncpu);
+		stuck = INIT_STUCK;
+		printed = 1;
+		goto try_again;
+	}
+
+	/* Exiting.  Got the lock.  Record who holds it for later reports. */
+	lock->oncpu = cpu;
+	lock->previous = inline_pc;
+	lock->task = current;
+	lock->bfile = (char *)base_file;
+	lock->bline = line_no;
+
+	if (unlikely(printed)) {	/* a "stuck" report was printed earlier: say how long it took */
+		pdc_printf(
+			"%s:%d: spin_lock grabbed in %s at %p(%d) %ld ticks\n",
+			base_file, line_no, current->comm, inline_pc,
+			cpu, jiffies - started);
+	}
+}
+/* Debug spin_unlock: complain (while lock->babble is non-zero) if the word was not locked, then release it. */
+void _dbg_spin_unlock(spinlock_t * lock, const char *base_file, int line_no)
+{
+	volatile unsigned int *a;	/* declared first: no declaration after a statement */
+
+	CHECK_LOCK(lock);
+	mb();
+	a = __ldcw_align(lock);
+	if (unlikely((*a != 0) && lock->babble)) {	/* a non-zero word means nobody holds it */
+		lock->babble--;
+		pdc_printf("%s:%d: spin_unlock(%s:%p) not locked\n",
+			base_file, line_no, lock->module, lock);
+	}
+	*a = 1;	/* release: the lock word is non-zero again */
+	mb();
+}
+
+/* Debug spin_trylock: one attempt, no spinning.  Returns 1 if the lock was taken, 0 otherwise. */
+int _dbg_spin_trylock(spinlock_t * lock, const char *base_file, int line_no)
+{
+	int ret;
+	volatile unsigned int *a;
+	mb();
+	a = __ldcw_align(lock);
+	ret = (__ldcw(a) != 0);
+	mb();
+	if (ret) {	/* these fields describe the holder, so only a successful caller writes them */
+		lock->oncpu = smp_processor_id();
+		lock->previous = __builtin_return_address(0);
+		lock->task = current;
+		lock->bfile = (char *)base_file;
+		lock->bline = line_no;
+	}
+	return ret;
+}
+
+#endif /* CONFIG_DEBUG_SPINLOCK */
+
+#ifdef CONFIG_DEBUG_RWLOCK
+
+/* Interrupts trouble detailed explanation, thx Grant:
+ *
+ * o writer (wants to modify data) attempts to acquire the rwlock
+ * o He gets the write lock.
+ * o Interrupts are still enabled, we take an interrupt with the
+ *   writer still holding the lock.
+ * o interrupt handler tries to acquire the rwlock for read.
+ * o deadlock since the writer can't release it at this point.
+ * 
+ * In general, any use of spinlocks that competes between "base"
+ * level and interrupt level code will risk deadlock. Interrupts
+ * need to be disabled in the base level routines to avoid it.
+ * Or more precisely, only the IRQ the base level routine
+ * is competing with for the lock.  But it's more efficient/faster
+ * to just disable all interrupts on that CPU to guarantee
+ * once it gets the lock it can release it quickly too.
+ */
+ 
+/*
+ * Debug write_lock.  rw->lock is taken and, on success, left held for
+ * as long as the write lock is owned; counter == -1 marks the writer.
+ * While counter is non-zero the spinlock is dropped and counter is
+ * polled until it reads zero.  Calling this from interrupt context BUG()s.
+ */
+void _dbg_write_lock(rwlock_t *rw, const char *bfile, int bline)
+{
+	void *inline_pc = __builtin_return_address(0);
+	unsigned long started = jiffies;
+	long stuck = INIT_STUCK;
+	int printed = 0;
+	int cpu = smp_processor_id();
+
+	if (unlikely(in_interrupt())) {
+		/* acquiring write lock in interrupt context, bad idea */
+		pdc_printf("write_lock caller: %s:%d, IRQs enabled,\n", bfile, bline);
+		BUG();
+	}
+
+	/* Note: if interrupts are disabled (which is most likely), the printk
+	will never show on the console. We might need a polling method to flush
+	the dmesg buffer anyhow. */
+	for (;;) {
+		_raw_spin_lock(&rw->lock);
+		if (rw->counter == 0)
+			break;		/* free: leave the loop with rw->lock still held */
+
+		/* this basically never happens */
+		_raw_spin_unlock(&rw->lock);
+		stuck--;
+		if (unlikely(stuck <= 0)) {
+			if (rw->counter < 0)
+				pdc_printf(
+					"%s:%d: write_lock stuck on writer"
+					" in %s at %p(%d) %ld ticks\n",
+					bfile, bline, current->comm, inline_pc,
+					cpu, jiffies - started);
+			else
+				pdc_printf(
+					"%s:%d: write_lock stuck on reader"
+					" in %s at %p(%d) %ld ticks\n",
+					bfile, bline, current->comm, inline_pc,
+					cpu, jiffies - started);
+			stuck = INIT_STUCK;
+			printed = 1;
+		}
+		while (rw->counter != 0)
+			;	/* wait, without the spinlock, until the lock looks free */
+	}
+
+	rw->counter = -1;	/* remember we are locked */
+	if (unlikely(printed)) {
+		pdc_printf(
+			"%s:%d: write_lock grabbed in %s at %p(%d) %ld ticks\n",
+			bfile, bline, current->comm, inline_pc,
+			cpu, jiffies - started);
+	}
+}
+
+/* Debug write_trylock: one attempt.  Returns 1 holding rw->lock with counter == -1, else 0. */
+int _dbg_write_trylock(rwlock_t *rw, const char *bfile, int bline)
+{
+#if 0
+	void *inline_pc = __builtin_return_address(0);
+	int cpu = smp_processor_id();
+#endif
+
+	if (unlikely(in_interrupt())) {
+		/* acquiring write lock in interrupt context, bad idea */
+		pdc_printf("write_lock caller: %s:%d, IRQs enabled,\n", bfile, bline);
+		BUG();
+	}
+
+	/* Note: if interrupts are disabled (which is most likely), the printk
+	will never show on the console. We might need a polling method to flush
+	the dmesg buffer anyhow. */
+	_raw_spin_lock(&rw->lock);
+	if (rw->counter == 0) {
+		/* got it.  now leave without unlocking */
+		rw->counter = -1;	/* remember we are locked */
+#if 0
+		pdc_printf("%s:%d: try write_lock grabbed in %s at %p(%d)\n",
+			   bfile, bline, current->comm, inline_pc, cpu);
+#endif
+		return 1;
+	}
+
+	/* counter != 0, someone else has it (this basically never happens): back out */
+	_raw_spin_unlock(&rw->lock);
+	return 0;
+}
+
+/* Debug read_lock: bump the reader count under rw->lock with local interrupts disabled. */
+void _dbg_read_lock(rwlock_t * rw, const char *bfile, int bline)
+{
+	unsigned long flags;
+#if 0
+	void *inline_pc = __builtin_return_address(0);
+	unsigned long started = jiffies;
+	int cpu = smp_processor_id();
+#endif
+
+	local_irq_save(flags);
+	_raw_spin_lock(&rw->lock);
+	rw->counter += 1;	/* one more reader */
+#if 0
+	pdc_printf(
+		"%s:%d: read_lock grabbed in %s at %p(%d) %ld ticks\n",
+		bfile, bline, current->comm, inline_pc,
+		cpu, jiffies - started);
+#endif
+	_raw_spin_unlock(&rw->lock);
+	local_irq_restore(flags);
+}
+
+#endif /* CONFIG_DEBUG_RWLOCK */
diff --git a/arch/parisc/lib/fixup.S b/arch/parisc/lib/fixup.S
new file mode 100644
index 00000000000..134f0cd240f
--- /dev/null
+++ b/arch/parisc/lib/fixup.S
@@ -0,0 +1,89 @@
+/*
+ * Linux/PA-RISC Project (http://www.parisc-linux.org/)
+ *
+ *  Copyright (C) 2004  Randolph Chung <tausq@debian.org>
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2, or (at your option)
+ *    any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * 
+ * Fixup routines for kernel exception handling.
+ */
+#include <linux/config.h>
+#include <asm/offsets.h>
+#include <asm/assembly.h>
+#include <asm/errno.h>
+/* get_fault_ip t1 t2: t1 = fault_ip of this CPU's exception_data; t2 is scratch; the addil results are consumed through %r1. */
+#ifdef CONFIG_SMP
+	.macro  get_fault_ip t1 t2
+	addil LT%__per_cpu_offset,%r27
+	LDREG RT%__per_cpu_offset(%r1),\t1	/* t1 = address of the __per_cpu_offset table */
+	/* t2 = smp_processor_id() */
+	mfctl 30,\t2
+	ldw TI_CPU(\t2),\t2
+#ifdef __LP64__
+	extrd,u \t2,63,32,\t2
+#endif
+	/* t2 = __per_cpu_offset[smp_processor_id()] (the load fetches the entry itself) */
+	LDREG,s \t2(\t1),\t2 
+	addil LT%per_cpu__exception_data,%r27
+	LDREG RT%per_cpu__exception_data(%r1),\t1
+	/* t1 = &__get_cpu_var(exception_data) */
+	add,l \t1,\t2,\t1
+	/* t1 = t1->fault_ip */
+	LDREG EXCDATA_IP(\t1), \t1
+	.endm
+#else
+	.macro  get_fault_ip t1 t2
+	/* t2 = &__get_cpu_var(exception_data); no per-cpu offset is applied in this variant */
+	addil LT%per_cpu__exception_data,%r27
+	LDREG RT%per_cpu__exception_data(%r1),\t2
+	/* t1 = t2->fault_ip */
+	LDREG EXCDATA_IP(\t2), \t1
+	.endm
+#endif
+
+	.text
+	.section .fixup, "ax"
+
+	/* get_user() fixups: return to fault_ip + 4 (skip_1) or + 8 (skip_2) with -EFAULT in r8 and 0 in r9 */
+	.export fixup_get_user_skip_1
+fixup_get_user_skip_1:
+	get_fault_ip %r1,%r8		/* r1 = fault IP; r8 is only scratch here */
+	ldo 4(%r1), %r1			/* resume address = fault IP + 4 */
+	ldi -EFAULT, %r8
+	bv %r0(%r1)
+	copy %r0, %r9			/* placed after the branch (delay slot): r9 = 0 */
+
+	.export fixup_get_user_skip_2
+fixup_get_user_skip_2:
+	get_fault_ip %r1,%r8		/* r1 = fault IP; r8 is only scratch here */
+	ldo 8(%r1), %r1			/* resume address = fault IP + 8 */
+	ldi -EFAULT, %r8
+	bv %r0(%r1)
+	copy %r0, %r9			/* placed after the branch (delay slot): r9 = 0 */
+
+	/* put_user() fixups: same resume addresses, -EFAULT in r8, r9 left untouched */
+	.export fixup_put_user_skip_1
+fixup_put_user_skip_1:
+	get_fault_ip %r1,%r8		/* r1 = fault IP; r8 is only scratch here */
+	ldo 4(%r1), %r1			/* resume address = fault IP + 4 */
+	bv %r0(%r1)
+	ldi -EFAULT, %r8		/* placed after the branch (delay slot): r8 = -EFAULT */
+
+	.export fixup_put_user_skip_2
+fixup_put_user_skip_2:
+	get_fault_ip %r1,%r8		/* r1 = fault IP; r8 is only scratch here */
+	ldo 8(%r1), %r1			/* resume address = fault IP + 8 */
+	bv %r0(%r1)
+	ldi -EFAULT, %r8		/* placed after the branch (delay slot): r8 = -EFAULT */
diff --git a/arch/parisc/lib/io.c b/arch/parisc/lib/io.c
new file mode 100644
index 00000000000..7c1406ff825
--- /dev/null
+++ b/arch/parisc/lib/io.c
@@ -0,0 +1,488 @@
+/*
+ * arch/parisc/lib/io.c
+ *
+ * Copyright (c) Matthew Wilcox 2001 for Hewlett-Packard
+ * Copyright (c) Randolph Chung 2001 <tausq@debian.org>
+ *
+ * IO accessing functions which shouldn't be inlined because they're too big
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/io.h>
+
+/* Copies a block of memory to a device in an efficient manner.
+ * Assumes the device can cope with 32-bit transfers.  If it can't,
+ * don't use this function.  A count <= 0 writes nothing.
+ */
+void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
+{
+	if (((unsigned long)dst & 3) != ((unsigned long)src & 3))
+		goto bytecopy;
+	while (count > 0 && ((unsigned long)dst & 3)) {	/* align dst, but never write past count */
+		writeb(*(char *)src, dst++);
+		src++;
+		count--;
+	}
+	while (count > 3) {
+		__raw_writel(*(u32 *)src, dst);
+		src += 4;
+		dst += 4;
+		count -= 4;
+	}
+ bytecopy:
+	while (count-- > 0) {
+		writeb(*(char *)src, dst++);
+		src++;
+	}
+}
+
+/*
+** Copies a block of memory from a device in an efficient manner.
+** Assumes the device can cope with 32-bit transfers.  If it can't,
+** don't use this function.
+**
+** CR16 counts on C3000 reading 256 bytes from Symbios 896 RAM:
+**	27341/64    = 427 cyc per int
+**	61311/128   = 478 cyc per short
+**	122637/256  = 479 cyc per byte
+** Ergo bus latencies dominant (not transfer size).
+**      Minimize total number of transfers at cost of CPU cycles.
+**	TODO: only look at src alignment and adjust the stores to dest.
+*/
+void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
+{
+	/* src and dst differ in bit 0: nothing wider than a byte can be aligned on both */
+	if ( (((unsigned long)dst ^ (unsigned long)src) & 1) || (count < 2) )
+		goto bytecopy;
+
+	/* Same parity: make both 16-bit aligned before any 16-bit access (shortcopy included) */
+	if ((unsigned long)src & 1) {
+		*(u8 *)dst = readb(src);
+		src++;
+		dst++;
+		count--;
+		if (count < 2) goto bytecopy;
+	}
+
+	if ( (((unsigned long)dst ^ (unsigned long)src) & 2) || (count < 4) )
+		goto shortcopy;
+
+	if ((unsigned long)src & 2) {
+		*(u16 *)dst = __raw_readw(src);
+		src += 2;
+		dst += 2;
+		count -= 2;
+	}
+
+	while (count > 3) {
+		*(u32 *)dst = __raw_readl(src);
+		dst += 4;
+		src += 4;
+		count -= 4;
+	}
+
+ shortcopy:
+	while (count > 1) {
+		*(u16 *)dst = __raw_readw(src);
+		src += 2;
+		dst += 2;
+		count -= 2;
+	}
+
+ bytecopy:
+	while (count-- > 0) {	/* "> 0" so that a count <= 0 copies nothing */
+		*(char *)dst = readb(src);
+		src++;
+		dst++;
+	}
+}
+
+/* Sets a block of memory on a device to a given value.
+ * Assumes the device can cope with 32-bit transfers.  If it can't,
+ * don't use this function.  A count <= 0 writes nothing.
+ */
+void memset_io(volatile void __iomem *addr, unsigned char val, int count)
+{
+	u32 val32 = val * 0x01010101U;	/* val in all four bytes, computed without a signed shift */
+	while (count > 0 && ((unsigned long)addr & 3)) {	/* align addr, but never write past count */
+		writeb(val, addr++);
+		count--;
+	}
+	while (count > 3) {
+		__raw_writel(val32, addr);
+		addr += 4;
+		count -= 4;
+	}
+	while (count-- > 0) {
+		writeb(val, addr++);
+	}
+}
+
+/*
+ * Read COUNT 8-bit bytes from port PORT into memory starting at
+ * DST.  DST may have any alignment: bytes are stored singly until it
+ * is 32-bit aligned, then four port reads are combined per 32-bit
+ * store, and any remainder is stored singly again.
+ */
+void insb (unsigned long port, void *dst, unsigned long count)
+{
+	unsigned char *out;
+	out = (unsigned char *)dst;
+
+	/* single bytes until the destination is 32-bit aligned */
+	for (; ((unsigned long)out) & 0x3; out++) {
+		if (!count)
+			return;
+		count--;
+		*out = inb(port);
+	}
+
+	/* four reads per store; the first byte read lands in the top 8 bits */
+	for (; count >= 4; count -= 4, out += 4) {
+		unsigned int w = inb(port) << 24;
+
+		w |= inb(port) << 16;
+		w |= inb(port) << 8;
+		w |= inb(port);
+		*(unsigned int *) out = w;
+	}
+
+	/* fewer than four bytes remain */
+	for (; count; count--) {
+		*out = inb(port);
+		out++;
+	}
+}
+
+
+/*
+ * Read COUNT 16-bit words from port PORT into memory starting at
+ * DST.  DST may have any alignment (odd addresses are handled with
+ * byte-shifted 16-bit stores).  This is used by the IDE driver to
+ * read disk sectors.  Performance is important, but the interface
+ * seems to be slow: just using the inlined version of inw() breaks things.
+ */
+void insw (unsigned long port, void *dst, unsigned long count)
+{
+	unsigned int l = 0, l2;
+	unsigned char *p;
+
+	p = (unsigned char *)dst;
+	
+	if (!count)
+		return;
+	
+	switch (((unsigned long)p) & 0x3)
+	{
+	 case 0x00:			/* Buffer 32-bit aligned */
+		while (count>=2) {
+			/* two port reads fill one 32-bit store, first read in the upper half */
+			count -= 2;
+			l = cpu_to_le16(inw(port)) << 16;
+			l |= cpu_to_le16(inw(port));
+			*(unsigned int *)p = l;
+			p += 4;
+		}
+		if (count) {	/* odd word count: one last 16-bit store */
+			*(unsigned short *)p = cpu_to_le16(inw(port));
+		}
+		break;
+	
+	 case 0x02:			/* Buffer 16-bit aligned */
+		*(unsigned short *)p = cpu_to_le16(inw(port));	/* one word first; p is then 32-bit aligned */
+		p += 2;
+		count--;
+		while (count>=2) {
+			/* two port reads fill one 32-bit store, first read in the upper half */
+			count -= 2;
+			l = cpu_to_le16(inw(port)) << 16;
+			l |= cpu_to_le16(inw(port));
+			*(unsigned int *)p = l;
+			p += 4;
+		}
+		if (count) {	/* one word left over */
+			*(unsigned short *)p = cpu_to_le16(inw(port));
+		}
+		break;
+		
+	 case 0x01:			/* Buffer 8-bit aligned */
+	 case 0x03:
+		/* I don't bother with 32bit transfers
+		 * in this case, 16bit will have to do -- DE */
+		--count;
+		/* store the upper byte of the first word alone; p is then 16-bit aligned */
+		l = cpu_to_le16(inw(port));
+		*p = l >> 8;
+		p++;
+		while (count--)
+		{
+			l2 = cpu_to_le16(inw(port));
+			*(unsigned short *)p = (l & 0xff) << 8 | (l2 >> 8);	/* low byte of previous word + high byte of this one */
+			p += 2;
+			l = l2;
+		}
+		*p = l & 0xff;	/* low byte of the last word */
+		break;
+	}
+}
+
+
+
+/*
+ * Read COUNT 32-bit words from port PORT into memory starting at
+ * DST. Works with any alignment of DST. Performance is important,
+ * but the interface seems to be slow: just using the inlined version
+ * of inl() breaks things.
+ */
+void insl (unsigned long port, void *dst, unsigned long count)
+{
+	unsigned int l = 0, l2;
+	unsigned char *p;
+
+	p = (unsigned char *)dst;
+	
+	if (!count)
+		return;
+	
+	switch (((unsigned long) dst) & 0x3)
+	{
+	 case 0x00:			/* Buffer 32-bit aligned */
+		while (count--)
+		{
+			*(unsigned int *)p = cpu_to_le32(inl(port));
+			p += 4;
+		}
+		break;
+	
+	 case 0x02:			/* Buffer 16-bit aligned */
+		--count;
+		/* upper half of the first word alone; p is then 32-bit aligned */
+		l = cpu_to_le32(inl(port));
+		*(unsigned short *)p = l >> 16;
+		p += 2;
+		
+		while (count--)
+		{
+			l2 = cpu_to_le32(inl(port));
+			*(unsigned int *)p = (l & 0xffff) << 16 | (l2 >> 16);	/* low half of previous + high half of this word */
+			p += 4;
+			l = l2;
+		}
+		*(unsigned short *)p = l & 0xffff;	/* low half of the last word */
+		break;
+	 case 0x01:			/* Buffer 8-bit aligned */
+		--count;
+		/* first word: top byte, then the middle 16 bits; p is then 32-bit aligned */
+		l = cpu_to_le32(inl(port));
+		*(unsigned char *)p = l >> 24;
+		p++;
+		*(unsigned short *)p = (l >> 8) & 0xffff;
+		p += 2;
+		while (count--)
+		{
+			l2 = cpu_to_le32(inl(port));
+			*(unsigned int *)p = (l & 0xff) << 24 | (l2 >> 8);	/* low byte of previous + top 3 bytes of this word */
+			p += 4;
+			l = l2;
+		}
+		*p = l & 0xff;	/* low byte of the last word */
+		break;
+	 case 0x03:			/* Buffer 8-bit aligned */
+		--count;
+		/* first word: top byte only; p is then 32-bit aligned */
+		l = cpu_to_le32(inl(port));
+		*p = l >> 24;
+		p++;
+		while (count--)
+		{
+			l2 = cpu_to_le32(inl(port));
+			*(unsigned int *)p = (l & 0xffffff) << 8 | l2 >> 24;	/* low 3 bytes of previous + top byte of this word */
+			p += 4;
+			l = l2;
+		}
+		*(unsigned short *)p = (l >> 8) & 0xffff;	/* last word: bits 8-23 ... */
+		p += 2;
+		*p = l & 0xff;	/* ... then bits 0-7 */
+		break;
+	}
+}
+
+
+/*
+ * Write COUNT bytes from memory at SRC to port PORT, one outb()
+ * per byte, in increasing address order.  Unlike insb there is no
+ * attempt at aligned 32-bit memory accesses: reading memory a byte
+ * at a time is cheap compared with writing it that way (no
+ * read-modify-write cycle).
+ */
+void outsb(unsigned long port, const void * src, unsigned long count)
+{
+	const unsigned char *in;
+
+	in = (const unsigned char *)src;
+	for (; count != 0; count--) {
+		outb(*in, port);
+		in++;
+	}
+}
+
+/*
+ * Like insw but in the opposite direction.  This is used by the IDE
+ * driver to write disk sectors.  SRC may have any alignment.
+ * Performance is important, but the interface seems to be slow:
+ * just using the inlined version of outw() breaks things.
+ */
+void outsw (unsigned long port, const void *src, unsigned long count)
+{
+	unsigned int l = 0, l2;
+	const unsigned char *p;
+
+	p = (const unsigned char *)src;
+
+	if (!count)
+		return;
+
+	switch (((unsigned long)p) & 0x3)
+	{
+	 case 0x00:			/* Buffer 32-bit aligned */
+		while (count>=2) {
+			count -= 2;
+			l = *(unsigned int *)p;
+			p += 4;
+			outw(le16_to_cpu(l >> 16), port);
+			outw(le16_to_cpu(l & 0xffff), port);
+		}
+		if (count) {
+			outw(le16_to_cpu(*(unsigned short*)p), port);
+		}
+		break;
+
+	 case 0x02:			/* Buffer 16-bit aligned */
+		/* one 16-bit word first, which leaves p 32-bit aligned */
+		outw(le16_to_cpu(*(unsigned short*)p), port);
+		p += 2;
+		count--;
+		while (count>=2) {
+			count -= 2;
+			l = *(unsigned int *)p;
+			p += 4;
+			outw(le16_to_cpu(l >> 16), port);
+			outw(le16_to_cpu(l & 0xffff), port);
+		}
+		if (count) {
+			outw(le16_to_cpu(*(unsigned short *)p), port);
+		}
+		break;
+
+	 case 0x01:			/* Buffer 8-bit aligned */
+	 case 0x03:			/* odd address either way: same path, as in insw */
+		/* I don't bother with 32bit transfers
+		 * in this case, 16bit will have to do -- DE */
+		/* l carries, in bits 8-15, the first byte of the next port word */
+		l  = *p << 8;
+		p++;
+		count--;
+		while (count)
+		{
+			count--;
+			l2 = *(unsigned short *)p;
+			p += 2;
+			outw(le16_to_cpu(l | l2 >> 8), port);
+			l = (l2 & 0xff) << 8;
+		}
+		/* second byte of the last word: a single byte, so it is OR-ed in unshifted */
+		l2 = *(unsigned char *)p;
+		outw (le16_to_cpu(l | l2), port);
+		break;
+	}
+}
+
+
+/*
+ * Like insl but in the opposite direction.  This is used by the IDE
+ * driver to write disk sectors.  Works with any alignment in SRC.
+ * Performance is important, but the interface seems to be slow:
+ * just using the inlined version of outl() breaks things.
+ */
+void outsl (unsigned long port, const void *src, unsigned long count)
+{
+	unsigned int l = 0, l2;
+	const unsigned char *p;
+
+	p = (const unsigned char *)src;
+
+	if (!count)
+		return;
+
+	switch (((unsigned long)p) & 0x3)
+	{
+	 case 0x00:			/* Buffer 32-bit aligned */
+		while (count--)
+		{
+			outl(le32_to_cpu(*(unsigned int *)p), port);
+			p += 4;
+		}
+		break;
+
+	 case 0x02:			/* Buffer 16-bit aligned */
+		--count;
+		/* l holds the 16 bits that form the upper half of the next port word */
+		l = *(unsigned short *)p;
+		p += 2;
+
+		while (count--)
+		{
+			l2 = *(unsigned int *)p;
+			p += 4;
+			outl (le32_to_cpu(l << 16 | l2 >> 16), port);
+			l = l2;
+		}
+		l2 = *(unsigned short *)p;
+		outl (le32_to_cpu(l << 16 | l2), port);
+		break;
+	 case 0x01:			/* Buffer 8-bit aligned */
+		--count;
+		/* l holds the top three bytes of the next port word (unsigned shift) */
+		l = (unsigned int)*p << 24;
+		p++;
+		l |= *(unsigned short *)p << 8;
+		p += 2;
+
+		while (count--)
+		{
+			l2 = *(unsigned int *)p;
+			p += 4;
+			outl (le32_to_cpu(l | l2 >> 24), port);
+			l = l2 << 8;
+		}
+		l2 = *p;
+		outl (le32_to_cpu(l | l2), port);
+		break;
+	 case 0x03:			/* Buffer 8-bit aligned */
+		--count;
+		/* l holds the top byte of the next port word (unsigned shift) */
+		l = (unsigned int)*p << 24;
+		p++;
+
+		while (count--)
+		{
+			l2 = *(unsigned int *)p;
+			p += 4;
+			outl (le32_to_cpu(l | l2 >> 8), port);
+			l = l2 << 24;
+		}
+		l2 = *(unsigned short *)p << 8;	/* bits 8-23, below the byte carried in l (mirrors insl) */
+		p += 2;
+		l2 |= *p;
+		outl (le32_to_cpu(l | l2), port);
+		break;
+	}
+}
+
+EXPORT_SYMBOL(insb);
+EXPORT_SYMBOL(insw);
+EXPORT_SYMBOL(insl);
+EXPORT_SYMBOL(outsb);
+EXPORT_SYMBOL(outsw);
+EXPORT_SYMBOL(outsl);
diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c
new file mode 100644
index 00000000000..290a62e7120
--- /dev/null
+++ b/arch/parisc/lib/iomap.c
@@ -0,0 +1,422 @@
+/*
+ * iomap.c - Implement iomap interface for PA-RISC
+ * Copyright (c) 2004 Matthew Wilcox
+ */
+
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <asm/io.h>
+
+/*
+ * The iomap space on 32-bit PA-RISC is intended to look like this:
+ * 00000000-7fffffff virtual mapped IO
+ * 80000000-8fffffff ISA/EISA port space that can't be virtually mapped
+ * 90000000-9fffffff Dino port space
+ * a0000000-afffffff Astro port space
+ * b0000000-bfffffff PAT port space
+ * c0000000-cfffffff non-swapped memory IO
+ * f0000000-ffffffff legacy IO memory pointers
+ *
+ * For the moment, here's what it looks like:
+ * 80000000-8fffffff All ISA/EISA port space
+ * f0000000-ffffffff legacy IO memory pointers
+ *
+ * On 64-bit, everything is extended, so:
+ * 8000000000000000-8fffffffffffffff All ISA/EISA port space
+ * f000000000000000-ffffffffffffffff legacy IO memory pointers
+ */
+
+/*
+ * Technically, this should be 'if (VMALLOC_START < addr < VMALLOC_END)',
+ * but that's slow and we know it'll be within the first 2GB.
+ */
+#ifdef CONFIG_64BIT
+#define INDIRECT_ADDR(addr)	(((unsigned long)(addr) & 1UL<<63) != 0)
+#define ADDR_TO_REGION(addr)    (((unsigned long)(addr) >> 60) & 7)
+#define IOPORT_MAP_BASE		(8UL << 60)	/* top bit only */
+#else
+#define INDIRECT_ADDR(addr)     (((unsigned long)(addr) & 1UL<<31) != 0)
+#define ADDR_TO_REGION(addr)    (((unsigned long)(addr) >> 28) & 7)
+#define IOPORT_MAP_BASE		(8UL << 28)	/* top bit only */
+#endif
+
+struct iomap_ops {	/* accessor table; one pointer per iomap_ops[] slot */
+	unsigned int (*read8)(void __iomem *);		/* single reads */
+	unsigned int (*read16)(void __iomem *);
+	unsigned int (*read32)(void __iomem *);
+	void (*write8)(u8, void __iomem *);		/* single writes */
+	void (*write16)(u16, void __iomem *);
+	void (*write32)(u32, void __iomem *);
+	void (*read8r)(void __iomem *, void *, unsigned long);	/* repeated reads */
+	void (*read16r)(void __iomem *, void *, unsigned long);
+	void (*read32r)(void __iomem *, void *, unsigned long);
+	void (*write8r)(void __iomem *, const void *, unsigned long); /* repeated writes */
+	void (*write16r)(void __iomem *, const void *, unsigned long);
+	void (*write32r)(void __iomem *, const void *, unsigned long);
+};
+
+/* Generic ioport ops.  To be replaced later by specific dino/elroy/wax code */
+/* ADDR2PORT keeps the low 24 bits of an iomap cookie as the port number. */
+#define ADDR2PORT(addr) ((unsigned long __force)(addr) & 0xffffff)
+
+static unsigned int ioport_read8(void __iomem *addr)
+{
+	return inb(ADDR2PORT(addr));	/* inb() on the port encoded in addr */
+}
+
+static unsigned int ioport_read16(void __iomem *addr)
+{
+	return inw(ADDR2PORT(addr));	/* inw() on the port encoded in addr */
+}
+
+static unsigned int ioport_read32(void __iomem *addr)
+{
+	return inl(ADDR2PORT(addr));	/* inl() on the port encoded in addr */
+}
+
+static void ioport_write8(u8 datum, void __iomem *addr)
+{
+	outb(datum, ADDR2PORT(addr));	/* outb() to the port encoded in addr */
+}
+
+static void ioport_write16(u16 datum, void __iomem *addr)
+{
+	outw(datum, ADDR2PORT(addr));	/* outw() to the port encoded in addr */
+}
+
+static void ioport_write32(u32 datum, void __iomem *addr)
+{
+	outl(datum, ADDR2PORT(addr));	/* outl() to the port encoded in addr */
+}
+
+static void ioport_read8r(void __iomem *addr, void *dst, unsigned long count)
+{
+	insb(ADDR2PORT(addr), dst, count);	/* whole transfer done by insb() */
+}
+
+static void ioport_read16r(void __iomem *addr, void *dst, unsigned long count)
+{
+	insw(ADDR2PORT(addr), dst, count);	/* whole transfer done by insw() */
+}
+
+static void ioport_read32r(void __iomem *addr, void *dst, unsigned long count)
+{
+	insl(ADDR2PORT(addr), dst, count);	/* whole transfer done by insl() */
+}
+
+static void ioport_write8r(void __iomem *addr, const void *s, unsigned long n)
+{
+	outsb(ADDR2PORT(addr), s, n);	/* whole transfer done by outsb() */
+}
+
+static void ioport_write16r(void __iomem *addr, const void *s, unsigned long n)
+{
+	outsw(ADDR2PORT(addr), s, n);	/* whole transfer done by outsw() */
+}
+
+static void ioport_write32r(void __iomem *addr, const void *s, unsigned long n)
+{
+	outsl(ADDR2PORT(addr), s, n);	/* whole transfer done by outsl() */
+}
+
+static const struct iomap_ops ioport_ops = {	/* port-space accessors */
+	.read8		= ioport_read8,
+	.read16		= ioport_read16,
+	.read32		= ioport_read32,
+	.write8		= ioport_write8,
+	.write16	= ioport_write16,
+	.write32	= ioport_write32,
+	.read8r		= ioport_read8r,
+	.read16r	= ioport_read16r,
+	.read32r	= ioport_read32r,
+	.write8r	= ioport_write8r,
+	.write16r	= ioport_write16r,
+	.write32r	= ioport_write32r,
+};
+
+/* Legacy I/O memory ops */
+
+static unsigned int iomem_read8(void __iomem *addr)
+{
+	return readb(addr);	/* wrapper so readb() fits the ops table */
+}
+
+static unsigned int iomem_read16(void __iomem *addr)
+{
+	return readw(addr);	/* wrapper so readw() fits the ops table */
+}
+
+static unsigned int iomem_read32(void __iomem *addr)
+{
+	return readl(addr);	/* wrapper so readl() fits the ops table */
+}
+
+static void iomem_write8(u8 datum, void __iomem *addr)
+{
+	writeb(datum, addr);	/* wrapper so writeb() fits the ops table */
+}
+
+static void iomem_write16(u16 datum, void __iomem *addr)
+{
+	writew(datum, addr);	/* wrapper so writew() fits the ops table */
+}
+
+static void iomem_write32(u32 datum, void __iomem *addr)
+{
+	writel(datum, addr);	/* wrapper so writel() fits the ops table */
+}
+
+static void iomem_read8r(void __iomem *addr, void *dst, unsigned long count)
+{
+	u8 *out = dst;		/* typed cursor over the destination */
+
+	for (; count; count--, out++)	/* each pass re-reads the same addr */
+		*out = __raw_readb(addr);
+}
+
+static void iomem_read16r(void __iomem *addr, void *dst, unsigned long count)
+{
+	u16 *out = dst;		/* typed cursor over the destination */
+
+	for (; count; count--, out++)	/* each pass re-reads the same addr */
+		*out = __raw_readw(addr);
+}
+
+static void iomem_read32r(void __iomem *addr, void *dst, unsigned long count)
+{
+	u32 *out = dst;		/* typed cursor over the destination */
+
+	for (; count; count--, out++)	/* each pass re-reads the same addr */
+		*out = __raw_readl(addr);
+}
+
+static void iomem_write8r(void __iomem *addr, const void *s, unsigned long n)
+{
+	const u8 *in = s;	/* typed cursor over the source */
+
+	for (; n; n--, in++)	/* each pass writes to the same addr */
+		__raw_writeb(*in, addr);
+}
+
+static void iomem_write16r(void __iomem *addr, const void *s, unsigned long n)
+{
+	const u16 *in = s;	/* typed cursor over the source */
+
+	for (; n; n--, in++)	/* each pass writes to the same addr */
+		__raw_writew(*in, addr);
+}
+
+static void iomem_write32r(void __iomem *addr, const void *s, unsigned long n)
+{
+	const u32 *in = s;	/* typed cursor over the source */
+
+	for (; n; n--, in++)	/* each pass writes to the same addr */
+		__raw_writel(*in, addr);
+}
+
+static const struct iomap_ops iomem_ops = {	/* legacy I/O memory accessors */
+	.read8		= iomem_read8,
+	.read16		= iomem_read16,
+	.read32		= iomem_read32,
+	.write8		= iomem_write8,
+	.write16	= iomem_write16,
+	.write32	= iomem_write32,
+	.read8r		= iomem_read8r,
+	.read16r	= iomem_read16r,
+	.read32r	= iomem_read32r,
+	.write8r	= iomem_write8r,
+	.write16r	= iomem_write16r,
+	.write32r	= iomem_write32r,
+};
+
+const struct iomap_ops *iomap_ops[8] = {	/* indexed by ADDR_TO_REGION() */
+	[0] = &ioport_ops,	/* slots not listed here stay NULL */
+#ifdef CONFIG_DEBUG_IOREMAP
+	[6] = &iomem_ops,
+#else
+	[7] = &iomem_ops
+#endif
+};
+
+
+unsigned int ioread8(void __iomem *addr)
+{
+	return unlikely(INDIRECT_ADDR(addr))	/* top address bit set? */
+		? iomap_ops[ADDR_TO_REGION(addr)]->read8(addr)
+		: *((u8 *)addr);		/* direct cookie: plain load */
+}
+
+unsigned int ioread16(void __iomem *addr)
+{
+	return unlikely(INDIRECT_ADDR(addr))	/* top address bit set? */
+		? iomap_ops[ADDR_TO_REGION(addr)]->read16(addr)
+		: le16_to_cpup((u16 *)addr);	/* direct: little-endian load */
+}
+
+unsigned int ioread32(void __iomem *addr)
+{
+	return unlikely(INDIRECT_ADDR(addr))	/* top address bit set? */
+		? iomap_ops[ADDR_TO_REGION(addr)]->read32(addr)
+		: le32_to_cpup((u32 *)addr);	/* direct: little-endian load */
+}
+
+void iowrite8(u8 datum, void __iomem *addr)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {	/* dispatch by region */
+		iomap_ops[ADDR_TO_REGION(addr)]->write8(datum, addr);
+		return;
+	}
+	*((u8 *)addr) = datum;			/* direct cookie: plain store */
+}
+
+void iowrite16(u16 datum, void __iomem *addr)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {	/* dispatch by region */
+		iomap_ops[ADDR_TO_REGION(addr)]->write16(datum, addr);
+		return;
+	}
+	*((u16 *)addr) = cpu_to_le16(datum);	/* direct: little-endian store */
+}
+
+void iowrite32(u32 datum, void __iomem *addr)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {	/* dispatch by region */
+		iomap_ops[ADDR_TO_REGION(addr)]->write32(datum, addr);
+		return;
+	}
+	*((u32 *)addr) = cpu_to_le32(datum);	/* direct: little-endian store */
+}
+
+/* Repeating interfaces */
+
+/* Read count bytes from the single location addr into dst. */
+void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {
+		iomap_ops[ADDR_TO_REGION(addr)]->read8r(addr, dst, count);
+		return;
+	}
+	/* volatile: the device location must be re-read on every pass */
+	for (; count; count--, dst++)
+		*(u8 *)dst = *(volatile u8 *)addr;
+}
+
+/* Read count 16-bit items from the single location addr into dst, unswapped. */
+void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {
+		iomap_ops[ADDR_TO_REGION(addr)]->read16r(addr, dst, count);
+		return;
+	}
+	/* volatile: the device location must be re-read on every pass */
+	for (; count; count--, dst += 2)
+		*(u16 *)dst = *(volatile u16 *)addr;
+}
+
+/* Read count 32-bit items from the single location addr into dst, unswapped. */
+void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {
+		iomap_ops[ADDR_TO_REGION(addr)]->read32r(addr, dst, count);
+		return;
+	}
+	/* volatile: the device location must be re-read on every pass */
+	for (; count; count--, dst += 4)
+		*(u32 *)dst = *(volatile u32 *)addr;
+}
+
+/* Write count bytes from src to the single location addr. */
+void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {
+		iomap_ops[ADDR_TO_REGION(addr)]->write8r(addr, src, count);
+		return;
+	}
+	/* volatile: every store to the device location must be performed */
+	for (; count; count--, src++)
+		*(volatile u8 *)addr = *(u8 *)src;
+}
+
+/* Write count 16-bit items from src to the single location addr, unswapped. */
+void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {
+		iomap_ops[ADDR_TO_REGION(addr)]->write16r(addr, src, count);
+		return;
+	}
+	/* volatile: every store to the device location must be performed */
+	for (; count; count--, src += 2)
+		*(volatile u16 *)addr = *(u16 *)src;
+}
+
+/* Write count 32-bit items from src to the single location addr, unswapped. */
+void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	if (unlikely(INDIRECT_ADDR(addr))) {
+		iomap_ops[ADDR_TO_REGION(addr)]->write32r(addr, src, count);
+		return;
+	}
+	/* volatile: every store to the device location must be performed */
+	for (; count; count--, src += 4)
+		*(volatile u32 *)addr = *(u32 *)src;
+}
+
+/* Mapping interfaces */
+
+void __iomem *ioport_map(unsigned long port, unsigned int nr)
+{
+	return (void __iomem *)(IOPORT_MAP_BASE | port);	/* nr is unused */
+}
+
+void ioport_unmap(void __iomem *addr)
+{
+	if (INDIRECT_ADDR(addr))
+		return;		/* indirect cookie: no iounmap() call */
+	iounmap(addr);
+}
+
+/*
+ * Create a virtual mapping cookie for a PCI BAR (memory or IO); NULL if the
+ * BAR is empty, unassigned, or neither kind.  Non-zero maxlen caps the length.
+ */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	unsigned long base = pci_resource_start(dev, bar);
+	unsigned long size = pci_resource_len(dev, bar);
+	unsigned long type = pci_resource_flags(dev, bar);
+
+	if (!size || !base)
+		return NULL;
+	if (maxlen && size > maxlen)
+		size = maxlen;
+	if (type & IORESOURCE_IO)
+		return ioport_map(base, size);
+	if (!(type & IORESOURCE_MEM))
+		return NULL;		/* What? */
+	return (type & IORESOURCE_CACHEABLE) ?
+		ioremap(base, size) : ioremap_nocache(base, size);
+}
+
+void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
+{
+	if (INDIRECT_ADDR(addr))
+		return;		/* indirect cookie: no iounmap() call */
+	iounmap(addr);
+}
+
+EXPORT_SYMBOL(ioread8);
+EXPORT_SYMBOL(ioread16);
+EXPORT_SYMBOL(ioread32);
+EXPORT_SYMBOL(iowrite8);
+EXPORT_SYMBOL(iowrite16);
+EXPORT_SYMBOL(iowrite32);
+EXPORT_SYMBOL(ioread8_rep);
+EXPORT_SYMBOL(ioread16_rep);
+EXPORT_SYMBOL(ioread32_rep);
+EXPORT_SYMBOL(iowrite8_rep);
+EXPORT_SYMBOL(iowrite16_rep);
+EXPORT_SYMBOL(iowrite32_rep);
+EXPORT_SYMBOL(ioport_map);
+EXPORT_SYMBOL(ioport_unmap);
+EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iounmap);
diff --git a/arch/parisc/lib/lusercopy.S b/arch/parisc/lib/lusercopy.S
new file mode 100644
index 00000000000..a0509855c9a
--- /dev/null
+++ b/arch/parisc/lib/lusercopy.S
@@ -0,0 +1,193 @@
+/*
+ *    User Space Access Routines
+ *
+ *    Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
+ *    Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
+ *    Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
+ *    Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2, or (at your option)
+ *    any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * These routines still have plenty of room for optimization
+ * (word & doubleword load/store, dual issue, store hints, etc.).
+ */
+
+/*
+ * The following routines assume that space register 3 (sr3) contains
+ * the space id associated with the current users address space.
+ */
+
+
+	.text
+	
+#include <asm/assembly.h>
+#include <asm/errno.h>
+
+	/*
+	 * get_sr gets the appropriate space value into
+	 * sr1 for kernel/user space access, depending
+	 * on the flag stored in the task structure.
+	 */
+
+	.macro  get_sr
+	mfctl       %cr30,%r1		/* r1 = cr30, base for TI_SEGMENT */
+	ldw         TI_SEGMENT(%r1),%r22	/* r22 = segment flag */
+	mfsp        %sr3,%r1		/* r1 = sr3 */
+	or,<>       %r22,%r0,%r0	/* flag != 0: nullify the copy */
+	copy        %r0,%r1		/* flag == 0: r1 = 0 instead */
+	mtsp        %r1,%sr1		/* sr1 = selected space id */
+	.endm
+
+	.macro fixup_branch lbl
+	ldil	    L%\lbl, %r1		/* r1 = left part of lbl's address */
+	ldo	    R%\lbl(%r1), %r1	/* add the right part */
+	bv          %r0(%r1)		/* jump; caller's next insn is the delay slot */
+	.endm
+
+	/*
+	 * long lstrncpy_from_user(char *dst, const char *src, long n)
+	 *
+	 * Returns -EFAULT if exception before terminator,
+	 *         N if the entire buffer filled,
+	 *         otherwise strlen (i.e. excludes zero byte)
+	 */
+
+	.export lstrncpy_from_user,code
+lstrncpy_from_user:
+	.proc
+	.callinfo NO_CALLS
+	.entry
+	comib,=     0,%r24,$lsfu_done	/* n == 0: nothing to copy */
+	copy        %r24,%r23		/* delay slot: r23 = original n */
+	get_sr
+1:      ldbs,ma     1(%sr1,%r25),%r1	/* r1 = next source byte, src++ */
+$lsfu_loop:
+	stbs,ma     %r1,1(%r26)		/* store it, dst++ */
+	comib,=,n   0,%r1,$lsfu_done	/* stop once the zero byte is stored */
+	addib,<>,n  -1,%r24,$lsfu_loop	/* --n; loop while n != 0 */
+2:      ldbs,ma     1(%sr1,%r25),%r1	/* delay slot: next source byte */
+$lsfu_done:
+	sub         %r23,%r24,%r28	/* result = original n - remaining n */
+$lsfu_exit:
+	bv          %r0(%r2)		/* return through r2 */
+	nop
+	.exit
+
+	.section .fixup,"ax"
+3:      fixup_branch $lsfu_exit	/* target of the 1b/2b table entries */
+	ldi         -EFAULT,%r28	/* delay slot: result = -EFAULT */
+	.previous
+
+	.section __ex_table,"aw"
+#ifdef __LP64__
+	.dword      1b,3b
+	.dword      2b,3b
+#else
+	.word       1b,3b
+	.word       2b,3b
+#endif
+	.previous
+
+	.procend
+
+	/*
+	 * unsigned long lclear_user(void *to, unsigned long n)
+	 *
+	 * Returns 0 for success.
+	 * otherwise, returns number of bytes not transferred.
+	 */
+
+	.export lclear_user,code
+lclear_user:
+	.proc
+	.callinfo NO_CALLS
+	.entry
+	comib,=,n   0,%r25,$lclu_done	/* n == 0: return at once */
+	get_sr
+$lclu_loop:
+	addib,<>    -1,%r25,$lclu_loop	/* --n; loop while n != 0 */
+1:      stbs,ma     %r0,1(%sr1,%r26)	/* delay slot: store a zero byte, to++ */
+
+$lclu_done:
+	bv          %r0(%r2)		/* return through r2 */
+	copy        %r25,%r28		/* delay slot: result = remaining n */
+	.exit
+
+	.section .fixup,"ax"
+2:      fixup_branch $lclu_done	/* target of the 1b table entry */
+	ldo        1(%r25),%r25	/* delay slot: n + 1; presumably re-counts the faulting byte */
+	.previous
+
+	.section __ex_table,"aw"
+#ifdef __LP64__
+	.dword      1b,2b
+#else
+	.word       1b,2b
+#endif
+	.previous
+
+	.procend
+
+	/*
+	 * long lstrnlen_user(char *s, long n)
+	 *
+	 * Returns 0 if exception before zero byte or reaching N,
+	 *         N+1 if N would be exceeded,
+	 *         else strlen + 1 (i.e. includes zero byte).
+	 */
+
+	.export lstrnlen_user,code
+lstrnlen_user:
+	.proc
+	.callinfo NO_CALLS
+	.entry
+	comib,=     0,%r25,$lslen_nzero	/* n == 0 is handled separately */
+	copy	    %r26,%r24		/* delay slot: r24 = start of s */
+	get_sr
+1:      ldbs,ma     1(%sr1,%r26),%r1	/* r1 = next byte, s++ */
+$lslen_loop:
+	comib,=,n   0,%r1,$lslen_done	/* zero byte found */
+	addib,<>    -1,%r25,$lslen_loop	/* --n; loop while n != 0 */
+2:      ldbs,ma     1(%sr1,%r26),%r1	/* delay slot: next byte, s++ */
+$lslen_done:
+	bv          %r0(%r2)		/* return through r2 */
+	sub	    %r26,%r24,%r28	/* delay slot: result = s - start */
+	.exit
+
+$lslen_nzero:
+	b           $lslen_done
+	ldo         1(%r26),%r26 /* special case for N == 0 */
+
+	.section .fixup,"ax"
+3:      fixup_branch $lslen_done	/* target of the 1b/2b table entries */
+	copy        %r24,%r26    /* reset r26 so 0 is returned on fault */
+	.previous
+
+	.section __ex_table,"aw"
+#ifdef __LP64__
+	.dword      1b,3b
+	.dword      2b,3b
+#else
+	.word       1b,3b
+	.word       2b,3b
+#endif
+	.previous
+
+	.procend
+
+	.end
diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c
new file mode 100644
index 00000000000..feb1b9f42c2
--- /dev/null
+++ b/arch/parisc/lib/memcpy.c
@@ -0,0 +1,522 @@
+/*
+ *    Optimized memory copy routines.
+ *
+ *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2, or (at your option)
+ *    any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *    Portions derived from the GNU C Library
+ *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
+ *
+ * Several strategies are used to try to get the best performance for various
+ * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using 
+ * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using
+ * general registers.  Unaligned copies are handled either by aligning the 
+ * destination and then using shift-and-write method, or in a few cases by 
+ * falling back to a byte-at-a-time copy.
+ *
+ * I chose to implement this in C because it is easier to maintain and debug,
+ * and in my experiments it appears that the C code generated by gcc (3.3/3.4
+ * at the time of writing) is fairly optimal. Unfortunately some of the 
+ * semantics of the copy routine (exception handling) are difficult to express
+ * in C, so we have to play some tricks to get it to work.
+ *
+ * All the loads and stores are done via explicit asm() code in order to use
+ * the right space registers. 
+ * 
+ * Testing with various alignments and buffer sizes shows that this code is 
+ * often >10x faster than a simple byte-at-a-time copy, even for strangely
+ * aligned operands. It is interesting to note that the glibc version
+ * of memcpy (written in C) is actually quite fast already. This routine is 
+ * able to beat it by 30-40% for aligned copies because of the loop unrolling, 
+ * but in some cases the glibc version is still slightly faster. This lends 
+ * more credibility that gcc can generate very good code as long as we are 
+ * careful.
+ *
+ * TODO:
+ * - cache prefetching needs more experimentation to get optimal settings
+ * - try not to use the post-increment address modifiers; they create additional
+ *   interlocks
+ * - replace byte-copy loops with stbys sequences
+ */
+
+#ifdef __KERNEL__
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <asm/uaccess.h>
+#define s_space "%%sr1"
+#define d_space "%%sr2"
+#else
+#include "memcpy.h"
+#define s_space "%%sr0"
+#define d_space "%%sr0"
+#define pa_memcpy new2_copy
+#endif
+
+DECLARE_PER_CPU(struct exception_data, exception_data);
+
+#define preserve_branch(label)	do {					\
+	volatile int dummy;	/* volatile so the test is not folded */ \
+	/* The following branch is never taken, it's just here to  */	\
+	/* prevent gcc from optimizing away our exception code. */ 	\
+	if (unlikely(dummy != dummy))					\
+		goto label;						\
+} while (0)
+
+#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3)) /* 0 under KERNEL_DS, else sr3 */
+#define get_kernel_space() (0)	/* kernel accesses use space id 0 */
+
+#define MERGE(w0, sh_1, w1, sh_2)  ({					\
+	unsigned int _r;	/* sh_1 is accepted but never used */	\
+	asm volatile (							\
+	"mtsar %3\n"		/* load sh_2 into the SAR */		\
+	"shrpw %1, %2, %%sar, %0\n" /* _r = w0:w1 pair shifted by SAR */ \
+	: "=r"(_r)							\
+	: "r"(w0), "r"(w1), "r"(sh_2)					\
+	);								\
+	_r;								\
+})
+#define THRESHOLD	16	/* shorter copies go straight to byte_copy */
+
+#ifdef DEBUG_MEMCPY	/* DPRINTF: location prefix, then the message, both KERN_DEBUG */
+#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
+#else			/* without DEBUG_MEMCPY it expands to nothing */
+#define DPRINTF(fmt, args...)
+#endif
+
+#ifndef __LP64__	/* directive used for each __ex_table entry below */
+#define EXC_WORD ".word"
+#else
+#define EXC_WORD ".dword"
+#endif
+
+#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
+	__asm__ __volatile__ (	/* load _t via (_s,_a), ",ma" by _sz */ \
+	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n" 	\
+	"\t.section __ex_table,\"aw\"\n"		\
+	"\t" EXC_WORD "\t1b\n"	/* entry: the insn at 1b ... */	\
+	"\t" EXC_WORD "\t" #_e "\n" /* ... paired with label _e */ \
+	"\t.previous\n"					\
+	: _tt(_t), "+r"(_a)	/* _a is read and updated */	\
+	: 						\
+	: "r8")
+
+#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) 	\
+	__asm__ __volatile__ (	/* store _t via (_s,_a), ",ma" by _sz */ \
+	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n" 	\
+	"\t.section __ex_table,\"aw\"\n"		\
+	"\t" EXC_WORD "\t1b\n"	/* entry: the insn at 1b ... */	\
+	"\t" EXC_WORD "\t" #_e "\n" /* ... paired with label _e */ \
+	"\t.previous\n"					\
+	: "+r"(_a) 		/* _a is read and updated */	\
+	: _tt(_t)					\
+	: "r8")
+
+#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e) /* 1-byte load */
+#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e) /* 1-byte store */
+#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e) /* 4-byte load */
+#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e) /* 4-byte store */
+#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e) /* 8-byte fp load */
+#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e) /* 8-byte fp store */
+
+#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) 	\
+	__asm__ __volatile__ (	/* load _t from _o(_s,_a); _a unchanged */ \
+	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
+	"\t.section __ex_table,\"aw\"\n"		\
+	"\t" EXC_WORD "\t1b\n"	/* entry: the insn at 1b ... */	\
+	"\t" EXC_WORD "\t" #_e "\n" /* ... paired with label _e */ \
+	"\t.previous\n"					\
+	: _tt(_t) 					\
+	: "r"(_a)					\
+	: "r8")
+
+#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) 	\
+	__asm__ __volatile__ (	/* store _t to _o(_s,_a); _a unchanged */ \
+	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n" 	\
+	"\t.section __ex_table,\"aw\"\n"		\
+	"\t" EXC_WORD "\t1b\n"	/* entry: the insn at 1b ... */	\
+	"\t" EXC_WORD "\t" #_e "\n" /* ... paired with label _e */ \
+	"\t.previous\n"					\
+	: 						\
+	: _tt(_t), "r"(_a)				\
+	: "r8")
+
+#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)	/* word load at offset _o */
+#define stw(_s,_t,_o,_a,_e) 	def_store_insn(stw,"r",_s,_t,_o,_a,_e)	/* word store at offset _o */
+
+#ifdef  CONFIG_PREFETCH
+extern inline void prefetch_src(const void *addr)
+{
+	/* word load from the source space with %r0 as the target register */
+	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
+}
+extern inline void prefetch_dst(const void *addr)
+{
+	/* doubleword load from the destination space with %r0 as the target */
+	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
+}
+#else	/* without CONFIG_PREFETCH both expand to nothing */
+#define prefetch_src(addr)
+#define prefetch_dst(addr)
+#endif
+
+/* Copy len words from a not-aligned src to an aligned dst, using shifts.
+ * Handles 4 words per loop.  Derived from glibc.  Returns 0, or on a fault
+ * the number of bytes of the o_len-byte request (o_src -> o_dst) not copied. */
+static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len, unsigned long o_dst, unsigned long o_src, unsigned long o_len)
+{
+	/* gcc complains that a2 and a3 may be uninitialized, but actually
+	 * they cannot be.  Initialize a2/a3 to shut gcc up.
+	 */
+	register unsigned int a0, a1, a2 = 0, a3 = 0;
+	int sh_1, sh_2;
+	struct exception_data *d;
+
+	/* prefetch_src((const void *)src); */
+
+	/* Calculate how to shift a word read at the memory operation
+	   aligned srcp to make it aligned for copy.  */
+	sh_1 = 8 * (src % sizeof(unsigned int));
+	sh_2 = 8 * sizeof(unsigned int) - sh_1;
+
+	/* Make src aligned by rounding it down.  */
+	src &= -sizeof(unsigned int);
+
+	switch (len % 4)
+	{
+		case 2:
+			/* a1 = ((unsigned int *) src)[0];
+			   a2 = ((unsigned int *) src)[1]; */
+			ldw(s_space, 0, src, a1, cda_ldw_exc);
+			ldw(s_space, 4, src, a2, cda_ldw_exc);
+			src -= 1 * sizeof(unsigned int);
+			dst -= 3 * sizeof(unsigned int);
+			len += 2;
+			goto do1;
+		case 3:
+			/* a0 = ((unsigned int *) src)[0];
+			   a1 = ((unsigned int *) src)[1]; */
+			ldw(s_space, 0, src, a0, cda_ldw_exc);
+			ldw(s_space, 4, src, a1, cda_ldw_exc);
+			src -= 0 * sizeof(unsigned int);
+			dst -= 2 * sizeof(unsigned int);
+			len += 1;
+			goto do2;
+		case 0:
+			if (len == 0)
+				return 0;
+			/* a3 = ((unsigned int *) src)[0];
+			   a0 = ((unsigned int *) src)[1]; */
+			ldw(s_space, 0, src, a3, cda_ldw_exc);
+			ldw(s_space, 4, src, a0, cda_ldw_exc);
+			src -=-1 * sizeof(unsigned int);
+			dst -= 1 * sizeof(unsigned int);
+			len += 0;
+			goto do3;
+		case 1:
+			/* a2 = ((unsigned int *) src)[0];
+			   a3 = ((unsigned int *) src)[1]; */
+			ldw(s_space, 0, src, a2, cda_ldw_exc);
+			ldw(s_space, 4, src, a3, cda_ldw_exc);
+			src -=-2 * sizeof(unsigned int);
+			dst -= 0 * sizeof(unsigned int);
+			len -= 1;
+			if (len == 0)
+				goto do0;
+			goto do4;			/* No-op.  */
+	}
+
+	do
+	{
+		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
+do4:
+		/* a0 = ((unsigned int *) src)[0]; */
+		ldw(s_space, 0, src, a0, cda_ldw_exc);
+		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
+		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
+do3:
+		/* a1 = ((unsigned int *) src)[1]; */
+		ldw(s_space, 4, src, a1, cda_ldw_exc);
+		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
+		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
+do2:
+		/* a2 = ((unsigned int *) src)[2]; */
+		ldw(s_space, 8, src, a2, cda_ldw_exc);
+		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
+		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
+do1:
+		/* a3 = ((unsigned int *) src)[3]; */
+		ldw(s_space, 12, src, a3, cda_ldw_exc);
+		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
+		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);
+
+		src += 4 * sizeof(unsigned int);
+		dst += 4 * sizeof(unsigned int);
+		len -= 4;
+	}
+	while (len != 0);
+
+do0:
+	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
+	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
+
+	preserve_branch(handle_load_error);
+	preserve_branch(handle_store_error);
+
+	return 0;
+
+handle_load_error:
+	__asm__ __volatile__ ("cda_ldw_exc:\n");
+	d = &__get_cpu_var(exception_data);
+	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
+		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
+	return o_len - d->fault_addr + o_src;	/* o_len is already in bytes */
+
+handle_store_error:
+	__asm__ __volatile__ ("cda_stw_exc:\n");
+	d = &__get_cpu_var(exception_data);
+	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
+		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
+	return o_len - d->fault_addr + o_dst;	/* o_len is already in bytes */
+}
+
+
+/* Copy len bytes; returns 0 for success, else the number of bytes not copied. */
+unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
+{
+	register unsigned long src, dst, t1, t2, t3;
+	register unsigned char *pcs, *pcd;
+	register unsigned int *pws, *pwd;
+	register double *pds, *pdd;
+	unsigned long ret = 0;
+	unsigned long o_dst, o_src, o_len;
+	struct exception_data *d;
+
+	src = (unsigned long)srcp;
+	dst = (unsigned long)dstp;
+	pcs = (unsigned char *)srcp;
+	pcd = (unsigned char *)dstp;
+
+	o_dst = dst; o_src = src; o_len = len;	/* kept for the fault handlers */
+
+	/* prefetch_src((const void *)srcp); */
+
+	if (len < THRESHOLD)
+		goto byte_copy;
+
+	/* Check alignment */
+	t1 = (src ^ dst);
+	if (unlikely(t1 & (sizeof(double)-1)))
+		goto unaligned_copy;
+
+	/* src and dst have same alignment. */
+
+	/* Copy bytes till we are double-aligned. */
+	t2 = src & (sizeof(double) - 1);
+	if (unlikely(t2 != 0)) {
+		t2 = sizeof(double) - t2;
+		while (t2 && len) {
+			/* *pcd++ = *pcs++; */
+			ldbma(s_space, pcs, t3, pmc_load_exc);
+			len--;
+			stbma(d_space, t3, pcd, pmc_store_exc);
+			t2--;
+		}
+	}
+
+	pds = (double *)pcs;
+	pdd = (double *)pcd;
+
+	/* Copy 8 doubles at a time */
+	while (len >= 8*sizeof(double)) {
+		register double r1, r2, r3, r4, r5, r6, r7, r8;
+		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
+		flddma(s_space, pds, r1, pmc_load_exc);
+		flddma(s_space, pds, r2, pmc_load_exc);
+		flddma(s_space, pds, r3, pmc_load_exc);
+		flddma(s_space, pds, r4, pmc_load_exc);
+		fstdma(d_space, r1, pdd, pmc_store_exc);
+		fstdma(d_space, r2, pdd, pmc_store_exc);
+		fstdma(d_space, r3, pdd, pmc_store_exc);
+		fstdma(d_space, r4, pdd, pmc_store_exc);
+
+#if 0
+		if (L1_CACHE_BYTES <= 32)
+			prefetch_src((char *)pds + L1_CACHE_BYTES);
+#endif
+		flddma(s_space, pds, r5, pmc_load_exc);
+		flddma(s_space, pds, r6, pmc_load_exc);
+		flddma(s_space, pds, r7, pmc_load_exc);
+		flddma(s_space, pds, r8, pmc_load_exc);
+		fstdma(d_space, r5, pdd, pmc_store_exc);
+		fstdma(d_space, r6, pdd, pmc_store_exc);
+		fstdma(d_space, r7, pdd, pmc_store_exc);
+		fstdma(d_space, r8, pdd, pmc_store_exc);
+		len -= 8*sizeof(double);
+	}
+
+	pws = (unsigned int *)pds;
+	pwd = (unsigned int *)pdd;
+
+word_copy:
+	while (len >= 8*sizeof(unsigned int)) {
+		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
+		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
+		ldwma(s_space, pws, r1, pmc_load_exc);
+		ldwma(s_space, pws, r2, pmc_load_exc);
+		ldwma(s_space, pws, r3, pmc_load_exc);
+		ldwma(s_space, pws, r4, pmc_load_exc);
+		stwma(d_space, r1, pwd, pmc_store_exc);
+		stwma(d_space, r2, pwd, pmc_store_exc);
+		stwma(d_space, r3, pwd, pmc_store_exc);
+		stwma(d_space, r4, pwd, pmc_store_exc);
+
+		ldwma(s_space, pws, r5, pmc_load_exc);
+		ldwma(s_space, pws, r6, pmc_load_exc);
+		ldwma(s_space, pws, r7, pmc_load_exc);
+		ldwma(s_space, pws, r8, pmc_load_exc);
+		stwma(d_space, r5, pwd, pmc_store_exc);
+		stwma(d_space, r6, pwd, pmc_store_exc);
+		stwma(d_space, r7, pwd, pmc_store_exc);
+		stwma(d_space, r8, pwd, pmc_store_exc);
+		len -= 8*sizeof(unsigned int);
+	}
+
+	while (len >= 4*sizeof(unsigned int)) {
+		register unsigned int r1,r2,r3,r4;
+		ldwma(s_space, pws, r1, pmc_load_exc);
+		ldwma(s_space, pws, r2, pmc_load_exc);
+		ldwma(s_space, pws, r3, pmc_load_exc);
+		ldwma(s_space, pws, r4, pmc_load_exc);
+		stwma(d_space, r1, pwd, pmc_store_exc);
+		stwma(d_space, r2, pwd, pmc_store_exc);
+		stwma(d_space, r3, pwd, pmc_store_exc);
+		stwma(d_space, r4, pwd, pmc_store_exc);
+		len -= 4*sizeof(unsigned int);
+	}
+
+	pcs = (unsigned char *)pws;
+	pcd = (unsigned char *)pwd;
+
+byte_copy:
+	while (len) {
+		/* *pcd++ = *pcs++; */
+		ldbma(s_space, pcs, t3, pmc_load_exc);
+		stbma(d_space, t3, pcd, pmc_store_exc);
+		len--;
+	}
+
+	return 0;
+
+unaligned_copy:
+	/* possibly we are aligned on a word, but not on a double... */
+	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
+		t2 = src & (sizeof(unsigned int) - 1);
+
+		if (unlikely(t2 != 0)) {
+			t2 = sizeof(unsigned int) - t2;
+			while (t2) {
+				/* *pcd++ = *pcs++; */
+				ldbma(s_space, pcs, t3, pmc_load_exc);
+				stbma(d_space, t3, pcd, pmc_store_exc);
+				len--;
+				t2--;
+			}
+		}
+
+		pws = (unsigned int *)pcs;
+		pwd = (unsigned int *)pcd;
+		goto word_copy;
+	}
+
+	/* Align the destination.  */
+	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
+		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
+		while (t2) {
+			/* *pcd++ = *pcs++; */
+			ldbma(s_space, pcs, t3, pmc_load_exc);
+			stbma(d_space, t3, pcd, pmc_store_exc);
+			len--;
+			t2--;
+		}
+		dst = (unsigned long)pcd;
+		src = (unsigned long)pcs;
+	}
+
+	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int), 
+		o_dst, o_src, o_len);
+	if (ret)
+		return ret;
+
+	pcs += (len & -sizeof(unsigned int));
+	pcd += (len & -sizeof(unsigned int));
+	len %= sizeof(unsigned int);
+
+	preserve_branch(handle_load_error);
+	preserve_branch(handle_store_error);
+
+	goto byte_copy;
+
+handle_load_error:
+	__asm__ __volatile__ ("pmc_load_exc:\n");
+	d = &__get_cpu_var(exception_data);
+	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
+		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
+	return o_len - d->fault_addr + o_src;
+
+handle_store_error:
+	__asm__ __volatile__ ("pmc_store_exc:\n");
+	d = &__get_cpu_var(exception_data);
+	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
+		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
+	return o_len - d->fault_addr + o_dst;
+}
+
+#ifdef __KERNEL__
+unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
+{
+	mtsp(get_kernel_space(), 1);	/* sr1 (s_space): kernel source */
+	mtsp(get_user_space(), 2);	/* sr2 (d_space): user destination */
+	return pa_memcpy((void __force *)dst, src, len);
+}
+
+unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
+{
+	mtsp(get_user_space(), 1);	/* sr1 (s_space): user source */
+	mtsp(get_kernel_space(), 2);	/* sr2 (d_space): kernel destination */
+	return pa_memcpy(dst, (void __force *)src, len);
+}
+
+unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
+{
+	mtsp(get_user_space(), 1);	/* sr1 (s_space): user source */
+	mtsp(get_user_space(), 2);	/* sr2 (d_space): user destination */
+	return pa_memcpy((void __force *)dst, (void __force *)src, len);
+}
+
+
+void * memcpy(void * dst,const void *src, size_t count)
+{
+	mtsp(get_kernel_space(), 1);	/* sr1 (s_space): kernel source */
+	mtsp(get_kernel_space(), 2);	/* sr2 (d_space): kernel destination */
+	pa_memcpy(dst, src, count);	/* its return value is ignored here */
+	return dst;
+}
+
+EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(copy_in_user);
+EXPORT_SYMBOL(memcpy);
+#endif
diff --git a/arch/parisc/lib/memset.c b/arch/parisc/lib/memset.c
new file mode 100644
index 00000000000..1d7929bd764
--- /dev/null
+++ b/arch/parisc/lib/memset.c
@@ -0,0 +1,91 @@
+/* Copyright (C) 1991, 1997 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* Slight modifications for PA-RISC Linux - Paul Bame <bame@debian.org> */
+
+#include <linux/types.h>
+#include <asm/string.h>
+
+#define OPSIZ (BITS_PER_LONG/8)	/* bytes per op_t: BITS_PER_LONG bits / 8 */
+typedef unsigned long op_t;	/* word type memset() uses for its wide stores */
+
+/* Set LEN bytes starting at DSTPP to the low byte of SC and return
+   DSTPP.  Requests of 8 bytes or more are written mostly as aligned
+   op_t words, with any unaligned head and short tail written one
+   byte at a time.  */
+void *
+memset (void *dstpp, int sc, size_t len)
+{
+  const unsigned int fill = sc;
+  long int cursor = (long int) dstpp;
+
+  /* Shorter requests go straight to the trailing byte loop.  */
+  if (len >= 8)
+    {
+      size_t todo;
+      op_t pattern = (unsigned char) fill;
+
+      /* Replicate the fill byte across every byte of an op_t.  */
+      pattern |= pattern << 8;
+      pattern |= pattern << 16;
+      if (OPSIZ > 4)
+	/* Shift in two 16-bit steps so a 32-bit long draws no warning.  */
+	pattern |= (pattern << 16) << 16;
+
+      /* Store single bytes until CURSOR is a multiple of OPSIZ.  LEN is
+	 at least 8 here, so this loop carries no LEN == 0 test.  */
+      for (; cursor % OPSIZ != 0; cursor += 1, len -= 1)
+	((unsigned char *) cursor)[0] = fill;
+
+      /* Unrolled pass: eight op_t stores per iteration for as long as
+	 eight whole op_t still fit in LEN.  */
+      for (todo = len / (OPSIZ * 8); todo > 0; todo -= 1)
+	{
+	  op_t *out = (op_t *) cursor;
+
+	  out[0] = pattern;
+	  out[1] = pattern;
+	  out[2] = pattern;
+	  out[3] = pattern;
+	  out[4] = pattern;
+	  out[5] = pattern;
+	  out[6] = pattern;
+	  out[7] = pattern;
+	  cursor += 8 * OPSIZ;
+	}
+      len %= OPSIZ * 8;
+
+      /* One op_t store per iteration until fewer than OPSIZ bytes of
+	 LEN are left.  */
+      for (todo = len / OPSIZ; todo > 0; todo -= 1)
+	{
+	  *(op_t *) cursor = pattern;
+	  cursor += OPSIZ;
+	}
+      len %= OPSIZ;
+    }
+
+  /* Finish (or, for short requests, do all of) the fill bytewise.  */
+  for (; len > 0; len -= 1)
+    {
+      ((unsigned char *) cursor)[0] = fill;
+      cursor += 1;
+    }
+
+  return dstpp;
+}