13 files changed, 412 insertions, 342 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 3d72dc3fc8f..694c9af520b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -27,5 +27,12 @@ config KPROBES
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
 
+config KRETPROBES
+	def_bool y
+	depends on KPROBES && HAVE_KRETPROBES
+
 config HAVE_KPROBES
 	def_bool n
+
+config HAVE_KRETPROBES
+	def_bool n
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 26d3789dfdd..be6fa105cd3 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -31,7 +31,6 @@
 #endif
 
 #define DEBUG_NODIRECT 0
-#define DEBUG_FORCEDAC 0
 
 #define ISA_DMA_MASK		0x00ffffff
 
@@ -126,39 +125,67 @@ iommu_arena_new(struct pci_controller *hose, dma_addr_t base,
 	return iommu_arena_new_node(0, hose, base, window_size, align);
 }
 
+static inline int is_span_boundary(unsigned int index, unsigned int nr,
+				   unsigned long shift,
+				   unsigned long boundary_size)
+{
+	shift = (shift + index) & (boundary_size - 1); /* pos mod boundary */
+	return shift + nr > boundary_size; /* nonzero if run crosses it */
+}
+
 /* Must be called with the arena lock held */
 static long
-iommu_arena_find_pages(struct pci_iommu_arena *arena, long n, long mask)
+iommu_arena_find_pages(struct device *dev, struct pci_iommu_arena *arena,
+		       long n, long mask)
 {
 	unsigned long *ptes;
 	long i, p, nent;
+	int pass = 0;
+	unsigned long base;
+	unsigned long boundary_size;
+
+	BUG_ON(arena->dma_base & ~PAGE_MASK);
+	base = arena->dma_base >> PAGE_SHIFT;
+	/* Use the segment boundary mask, not the max segment size. */
+	if (dev)
+		boundary_size = dma_get_seg_boundary(dev) + 1;
+	else
+		boundary_size = 1UL << 32;
+	BUG_ON(!is_power_of_2(boundary_size));
+	boundary_size >>= PAGE_SHIFT;
 
 	/* Search forward for the first mask-aligned sequence of N free ptes */
 	ptes = arena->ptes;
 	nent = arena->size >> PAGE_SHIFT;
-	p = (arena->next_entry + mask) & ~mask;
+	p = ALIGN(arena->next_entry, mask + 1);
 	i = 0;
+
+again:
 	while (i < n && p+i < nent) {
+		if (!i && is_span_boundary(p, n, base, boundary_size)) {
+			p = ALIGN(p + 1, mask + 1);
+			goto again;
+		}
+
 		if (ptes[p+i])
-			p = (p + i + 1 + mask) & ~mask, i = 0;
+			p = ALIGN(p + i + 1, mask + 1), i = 0;
 		else
 			i = i + 1;
 	}
 
 	if (i < n) {
-                /* Reached the end.  Flush the TLB and restart the
-                   search from the beginning.  */
-		alpha_mv.mv_pci_tbi(arena->hose, 0, -1);
-
-		p = 0, i = 0;
-		while (i < n && p+i < nent) {
-			if (ptes[p+i])
-				p = (p + i + 1 + mask) & ~mask, i = 0;
-			else
-				i = i + 1;
-		}
-
-		if (i < n)
+		if (pass < 1) {
+			/*
+			 * Reached the end.  Flush the TLB and restart
+			 * the search from the beginning.
+			*/
+			alpha_mv.mv_pci_tbi(arena->hose, 0, -1);
+
+			pass++;
+			p = 0;
+			i = 0;
+			goto again;
+		} else
 			return -1;
 	}
 
@@ -168,7 +195,8 @@ iommu_arena_find_pages(struct pci_iommu_arena *arena, long n, long mask)
 }
 
 static long
-iommu_arena_alloc(struct pci_iommu_arena *arena, long n, unsigned int align)
+iommu_arena_alloc(struct device *dev, struct pci_iommu_arena *arena, long n,
+		  unsigned int align)
 {
 	unsigned long flags;
 	unsigned long *ptes;
@@ -179,7 +207,7 @@ iommu_arena_alloc(struct pci_iommu_arena *arena, long n, unsigned int align)
 	/* Search for N empty ptes */
 	ptes = arena->ptes;
 	mask = max(align, arena->align_entry) - 1;
-	p = iommu_arena_find_pages(arena, n, mask);
+	p = iommu_arena_find_pages(dev, arena, n, mask);
 	if (p < 0) {
 		spin_unlock_irqrestore(&arena->lock, flags);
 		return -1;
@@ -229,6 +257,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
 	unsigned long paddr;
 	dma_addr_t ret;
 	unsigned int align = 0;
+	struct device *dev = pdev ? &pdev->dev : NULL;
 
 	paddr = __pa(cpu_addr);
 
@@ -276,7 +305,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
 	/* Force allocation to 64KB boundary for ISA bridges. */
 	if (pdev && pdev == isa_bridge)
 		align = 8;
-	dma_ofs = iommu_arena_alloc(arena, npages, align);
+	dma_ofs = iommu_arena_alloc(dev, arena, npages, align);
 	if (dma_ofs < 0) {
 		printk(KERN_WARNING "pci_map_single failed: "
 		       "could not allocate dma page tables\n");
@@ -563,7 +592,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
 
 	paddr &= ~PAGE_MASK;
 	npages = calc_npages(paddr + size);
-	dma_ofs = iommu_arena_alloc(arena, npages, 0);
+	dma_ofs = iommu_arena_alloc(dev, arena, npages, 0);
 	if (dma_ofs < 0) {
 		/* If we attempted a direct map above but failed, die.  */
 		if (leader->dma_address == 0)
@@ -830,7 +859,7 @@ iommu_reserve(struct pci_iommu_arena *arena, long pg_count, long align_mask)
 
 	/* Search for N empty ptes.  */
 	ptes = arena->ptes;
-	p = iommu_arena_find_pages(arena, pg_count, align_mask);
+	p = iommu_arena_find_pages(NULL, arena, pg_count, align_mask);
 	if (p < 0) {
 		spin_unlock_irqrestore(&arena->lock, flags);
 		return -1;
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 16b82e1272b..955fc53c1c0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -12,6 +12,7 @@ config ARM
 	select SYS_SUPPORTS_APM_EMULATION
 	select HAVE_OPROFILE
 	select HAVE_KPROBES if (!XIP_KERNEL)
+	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c
index 9310a7b476e..525483f0ddf 100644
--- a/arch/cris/arch-v10/kernel/time.c
+++ b/arch/cris/arch-v10/kernel/time.c
@@ -13,7 +13,7 @@
 #include <linux/swap.h>
 #include <linux/sched.h>
 #include <linux/init.h>
-#include <linux/vmstat.h>
+#include <linux/mm.h>
 #include <asm/arch/svinto.h>
 #include <asm/types.h>
 #include <asm/signal.h>
diff --git a/arch/cris/arch-v10/lib/string.c b/arch/cris/arch-v10/lib/string.c
index 7161a2bef4f..c7bd6ebdc93 100644
--- a/arch/cris/arch-v10/lib/string.c
+++ b/arch/cris/arch-v10/lib/string.c
@@ -1,55 +1,59 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# FUNCTION NAME: memcpy()                                                 */
-/*#                                                                         */
-/*# PARAMETERS:  void* dst;   Destination address.                          */
-/*#              void* src;   Source address.                               */
-/*#              int   len;   Number of bytes to copy.                      */
-/*#                                                                         */
-/*# RETURNS:     dst.                                                       */
-/*#                                                                         */
-/*# DESCRIPTION: Copies len bytes of memory from src to dst.  No guarantees */
-/*#              about copying of overlapping memory areas. This routine is */
-/*#              very sensitive to compiler changes in register allocation. */
-/*#              Should really be rewritten to avoid this problem.          */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# HISTORY                                                                 */
-/*#                                                                         */
-/*# DATE      NAME            CHANGES                                       */
-/*# ----      ----            -------                                       */
-/*# 941007    Kenny R         Creation                                      */
-/*# 941011    Kenny R         Lots of optimizations and inlining.           */
-/*# 941129    Ulf A           Adapted for use in libc.                      */
-/*# 950216    HP              N==0 forgotten if non-aligned src/dst.        */
-/*#                           Added some optimizations.                     */
-/*# 001025    HP              Make src and dst char *.  Align dst to	    */
-/*#			      dword, not just word-if-both-src-and-dst-	    */
-/*#			      are-misaligned.				    */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-void *memcpy(void *pdst,
-             const void *psrc,
-             size_t pn)
+/* A memcpy for CRIS.
+   Copyright (C) 1994-2005 Axis Communications.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Neither the name of Axis Communications nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.  */
+
+/* FIXME: This file should really only be used for reference, as the
+   result depends somewhat on gcc generating what we expect rather
+   than what we describe.  An assembly file should be used instead.  */
+
+#include <stddef.h>
+
+/* Break even between movem and move16 is really at 38.7 * 2, but
+   modulo 44, so up to the next multiple of 44, we use ordinary code.  */
+#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)
+
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+
+void *
+memcpy(void *pdst, const void *psrc, size_t pn)
 {
-  /* Ok.  Now we want the parameters put in special registers.
+  /* Now we want the parameters put in special registers.
      Make sure the compiler is able to make something useful of this.
-      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
+     As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
 
-     If gcc was alright, it really would need no temporaries, and no
-     stack space to save stuff on. */
+     If gcc was all right, it really would need no temporaries, and no
+     stack space to save stuff on.  */
 
   register void *return_dst __asm__ ("r10") = pdst;
-  register char *dst __asm__ ("r13") = pdst;
-  register const char *src __asm__ ("r11") = psrc;
+  register unsigned char *dst __asm__ ("r13") = pdst;
+  register unsigned const char *src __asm__ ("r11") = psrc;
   register int n __asm__ ("r12") = pn;
-  
- 
+
   /* When src is aligned but not dst, this makes a few extra needless
      cycles.  I believe it would take as many to check that the
      re-alignment was unnecessary.  */
@@ -59,167 +63,174 @@ void *memcpy(void *pdst,
       && n >= 3)
   {
     if ((unsigned long) dst & 1)
-    {
-      n--;
-      *(char*)dst = *(char*)src;
-      src++;
-      dst++;
-    }
+      {
+	n--;
+	*dst = *src;
+	src++;
+	dst++;
+      }
 
     if ((unsigned long) dst & 2)
-    {
-      n -= 2;
-      *(short*)dst = *(short*)src;
-      src += 2;
-      dst += 2;
-    }
+      {
+	n -= 2;
+	*(short *) dst = *(short *) src;
+	src += 2;
+	dst += 2;
+      }
   }
 
-  /* Decide which copying method to use. */
-  if (n >= 44*2)                /* Break even between movem and
-                                   move16 is at 38.7*2, but modulo 44. */
-  {
-    /* For large copies we use 'movem' */
-
-  /* It is not optimal to tell the compiler about clobbering any
-     registers; that will move the saving/restoring of those registers
-     to the function prologue/epilogue, and make non-movem sizes
-     suboptimal.
-
-      This method is not foolproof; it assumes that the "asm reg"
-     declarations at the beginning of the function really are used
-     here (beware: they may be moved to temporary registers).
-      This way, we do not have to save/move the registers around into
-     temporaries; we can safely use them straight away.
-
-      If you want to check that the allocation was right; then
-      check the equalities in the first comment.  It should say
-      "r13=r13, r11=r11, r12=r12" */
-    __asm__ volatile ("\n\
-	;; Check that the following is true (same register names on	\n\
-	;; both sides of equal sign, as in r8=r8):			\n\
-	;; %0=r13, %1=r11, %2=r12					\n\
-	;;								\n\
-	;; Save the registers we'll use in the movem process		\n\
-	;; on the stack.						\n\
-	subq	11*4,$sp						\n\
-	movem	$r10,[$sp]						\n\
+  /* Decide which copying method to use.  */
+  if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
+    {
+      /* It is not optimal to tell the compiler about clobbering any
+	 registers; that will move the saving/restoring of those registers
+	 to the function prologue/epilogue, and make non-movem sizes
+	 suboptimal.  */
+      __asm__ volatile
+	("\
+	 ;; GCC does promise correct register allocations, but let's	\n\
+	 ;; make sure it keeps its promises.				\n\
+	 .ifnc %0-%1-%2,$r13-$r11-$r12					\n\
+	 .error \"GCC reg alloc bug: %0-%1-%2 != $r13-$r11-$r12\"	\n\
+	 .endif								\n\
+									\n\
+	 ;; Save the registers we'll use in the movem process		\n\
+	 ;; on the stack.						\n\
+	 subq	11*4,sp							\n\
+	 movem	r10,[sp]						\n\
 									\n\
-	;; Now we've got this:						\n\
-	;; r11 - src							\n\
-	;; r13 - dst							\n\
-	;; r12 - n							\n\
+	 ;; Now we've got this:						\n\
+	 ;; r11 - src							\n\
+	 ;; r13 - dst							\n\
+	 ;; r12 - n							\n\
 									\n\
-	;; Update n for the first loop					\n\
-	subq	44,$r12							\n\
+	 ;; Update n for the first loop.				\n\
+	 subq	 44,r12							\n\
 0:									\n\
-	movem	[$r11+],$r10						\n\
-	subq	44,$r12							\n\
-	bge	0b							\n\
-	movem	$r10,[$r13+]						\n\
+"
+#ifdef __arch_common_v10_v32
+	 /* Cater to branch offset difference between v32 and v10.  We
+	    assume the branch below has an 8-bit offset.  */
+"	 setf\n"
+#endif
+"	 movem	[r11+],r10						\n\
+	 subq	44,r12							\n\
+	 bge	 0b							\n\
+	 movem	r10,[r13+]						\n\
 									\n\
-	addq	44,$r12 ;; compensate for last loop underflowing n	\n\
+	 ;; Compensate for last loop underflowing n.			\n\
+	 addq	44,r12							\n\
 									\n\
-	;; Restore registers from stack					\n\
-	movem	[$sp+],$r10"
+	 ;; Restore registers from stack.				\n\
+	 movem [sp+],r10"
 
-     /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) 
-     /* Inputs */ : "0" (dst), "1" (src), "2" (n));
-    
-  }
+	 /* Outputs.  */
+	 : "=r" (dst), "=r" (src), "=r" (n)
 
-  /* Either we directly starts copying, using dword copying
-     in a loop, or we copy as much as possible with 'movem' 
-     and then the last block (<44 bytes) is copied here.
-     This will work since 'movem' will have updated src,dst,n. */
+	 /* Inputs.  */
+	 : "0" (dst), "1" (src), "2" (n));
+    }
 
-  while ( n >= 16 )
-  {
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    n -= 16;
-  }
+  while (n >= 16)
+    {
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+
+      n -= 16;
+    }
 
-  /* A switch() is definitely the fastest although it takes a LOT of code.
-   * Particularly if you inline code this.
-   */
   switch (n)
-  {
+    {
     case 0:
       break;
+
     case 1:
-      *(char*)dst = *(char*)src;
+      *dst = *src;
       break;
+
     case 2:
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
       break;
+
     case 3:
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 4:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
       break;
+
     case 5:
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 6:
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 7:
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 8:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src;
       break;
+
     case 9:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 10:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 11:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 12:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src;
       break;
+
     case 13:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 14:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 15:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
-  }
+    }
 
-  return return_dst; /* destination pointer. */
-} /* memcpy() */
+  return return_dst;
+}
diff --git a/arch/cris/arch-v10/lib/usercopy.c b/arch/cris/arch-v10/lib/usercopy.c
index b8e6c0430e5..b0a608da7bd 100644
--- a/arch/cris/arch-v10/lib/usercopy.c
+++ b/arch/cris/arch-v10/lib/usercopy.c
@@ -193,7 +193,7 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn)
    inaccessible.  */
 
 unsigned long
-__copy_user_zeroing (void __user *pdst, const void *psrc, unsigned long pn)
+__copy_user_zeroing(void *pdst, const void __user *psrc, unsigned long pn)
 {
   /* We want the parameters put in special registers.
      Make sure the compiler is able to make something useful of this.
diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c
index 6740b2cebae..c7bd6ebdc93 100644
--- a/arch/cris/arch-v32/lib/string.c
+++ b/arch/cris/arch-v32/lib/string.c
@@ -1,55 +1,59 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# FUNCTION NAME: memcpy()                                                 */
-/*#                                                                         */
-/*# PARAMETERS:  void* dst;   Destination address.                          */
-/*#              void* src;   Source address.                               */
-/*#              int   len;   Number of bytes to copy.                      */
-/*#                                                                         */
-/*# RETURNS:     dst.                                                       */
-/*#                                                                         */
-/*# DESCRIPTION: Copies len bytes of memory from src to dst.  No guarantees */
-/*#              about copying of overlapping memory areas. This routine is */
-/*#              very sensitive to compiler changes in register allocation. */
-/*#              Should really be rewritten to avoid this problem.          */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# HISTORY                                                                 */
-/*#                                                                         */
-/*# DATE      NAME            CHANGES                                       */
-/*# ----      ----            -------                                       */
-/*# 941007    Kenny R         Creation                                      */
-/*# 941011    Kenny R         Lots of optimizations and inlining.           */
-/*# 941129    Ulf A           Adapted for use in libc.                      */
-/*# 950216    HP              N==0 forgotten if non-aligned src/dst.        */
-/*#                           Added some optimizations.                     */
-/*# 001025    HP              Make src and dst char *.  Align dst to	    */
-/*#			      dword, not just word-if-both-src-and-dst-	    */
-/*#			      are-misaligned.				    */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-void *memcpy(void *pdst,
-             const void *psrc,
-             size_t pn)
+/* A memcpy for CRIS.
+   Copyright (C) 1994-2005 Axis Communications.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Neither the name of Axis Communications nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.  */
+
+/* FIXME: This file should really only be used for reference, as the
+   result depends somewhat on gcc generating what we expect rather
+   than what we describe.  An assembly file should be used instead.  */
+
+#include <stddef.h>
+
+/* Break even between movem and move16 is really at 38.7 * 2, but
+   modulo 44, so up to the next multiple of 44, we use ordinary code.  */
+#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)
+
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+
+void *
+memcpy(void *pdst, const void *psrc, size_t pn)
 {
-  /* Ok.  Now we want the parameters put in special registers.
+  /* Now we want the parameters put in special registers.
      Make sure the compiler is able to make something useful of this.
-      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
+     As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
 
-     If gcc was alright, it really would need no temporaries, and no
-     stack space to save stuff on. */
+     If gcc was all right, it really would need no temporaries, and no
+     stack space to save stuff on.  */
 
   register void *return_dst __asm__ ("r10") = pdst;
-  register char *dst __asm__ ("r13") = pdst;
-  register const char *src __asm__ ("r11") = psrc;
+  register unsigned char *dst __asm__ ("r13") = pdst;
+  register unsigned const char *src __asm__ ("r11") = psrc;
   register int n __asm__ ("r12") = pn;
 
-
   /* When src is aligned but not dst, this makes a few extra needless
      cycles.  I believe it would take as many to check that the
      re-alignment was unnecessary.  */
@@ -59,161 +63,174 @@ void *memcpy(void *pdst,
       && n >= 3)
   {
     if ((unsigned long) dst & 1)
-    {
-      n--;
-      *(char*)dst = *(char*)src;
-      src++;
-      dst++;
-    }
+      {
+	n--;
+	*dst = *src;
+	src++;
+	dst++;
+      }
 
     if ((unsigned long) dst & 2)
-    {
-      n -= 2;
-      *(short*)dst = *(short*)src;
-      src += 2;
-      dst += 2;
-    }
+      {
+	n -= 2;
+	*(short *) dst = *(short *) src;
+	src += 2;
+	dst += 2;
+      }
   }
 
-  /* Decide which copying method to use.  Movem is dirt cheap, so the
-     overheap is low enough to always use the minimum block size as the
-     threshold.  */
-  if (n >= 44)
-  {
-    /* For large copies we use 'movem' */
-
-  /* It is not optimal to tell the compiler about clobbering any
-     registers; that will move the saving/restoring of those registers
-     to the function prologue/epilogue, and make non-movem sizes
-     suboptimal.  */
-    __asm__ volatile ("							\n\
-        ;; Check that the register asm declaration got right.		\n\
-        ;; The GCC manual explicitly says TRT will happen.		\n\
-	.ifnc %0-%1-%2,$r13-$r11-$r12					\n\
-	.err								\n\
-	.endif								\n\
-									\n\
-	;; Save the registers we'll use in the movem process		\n\
+  /* Decide which copying method to use.  */
+  if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
+    {
+      /* It is not optimal to tell the compiler about clobbering any
+	 registers; that will move the saving/restoring of those registers
+	 to the function prologue/epilogue, and make non-movem sizes
+	 suboptimal.  */
+      __asm__ volatile
+	("\
+	 ;; GCC does promise correct register allocations, but let's	\n\
+	 ;; make sure it keeps its promises.				\n\
+	 .ifnc %0-%1-%2,$r13-$r11-$r12					\n\
+	 .error \"GCC reg alloc bug: %0-%1-%2 != $r13-$r11-$r12\"	\n\
+	 .endif								\n\
 									\n\
-	;; on the stack.						\n\
-	subq 	11*4,$sp						\n\
-	movem	$r10,[$sp]						\n\
+	 ;; Save the registers we'll use in the movem process		\n\
+	 ;; on the stack.						\n\
+	 subq	11*4,sp							\n\
+	 movem	r10,[sp]						\n\
 									\n\
-        ;; Now we've got this:						\n\
-	;; r11 - src							\n\
-	;; r13 - dst							\n\
-	;; r12 - n							\n\
+	 ;; Now we've got this:						\n\
+	 ;; r11 - src							\n\
+	 ;; r13 - dst							\n\
+	 ;; r12 - n							\n\
 									\n\
-        ;; Update n for the first loop					\n\
-        subq    44,$r12							\n\
+	 ;; Update n for the first loop.				\n\
+	 subq	 44,r12							\n\
 0:									\n\
-	movem	[$r11+],$r10						\n\
-        subq   44,$r12							\n\
-        bge     0b							\n\
-	movem	$r10,[$r13+]						\n\
+"
+#ifdef __arch_common_v10_v32
+	 /* Cater to branch offset difference between v32 and v10.  We
+	    assume the branch below has an 8-bit offset.  */
+"	 setf\n"
+#endif
+"	 movem	[r11+],r10						\n\
+	 subq	44,r12							\n\
+	 bge	 0b							\n\
+	 movem	r10,[r13+]						\n\
 									\n\
-        addq   44,$r12  ;; compensate for last loop underflowing n	\n\
+	 ;; Compensate for last loop underflowing n.			\n\
+	 addq	44,r12							\n\
 									\n\
-	;; Restore registers from stack					\n\
-        movem [$sp+],$r10"
+	 ;; Restore registers from stack.				\n\
+	 movem [sp+],r10"
 
-     /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n)
-     /* Inputs */ : "0" (dst), "1" (src), "2" (n));
+	 /* Outputs.  */
+	 : "=r" (dst), "=r" (src), "=r" (n)
 
-  }
+	 /* Inputs.  */
+	 : "0" (dst), "1" (src), "2" (n));
+    }
 
-  /* Either we directly starts copying, using dword copying
-     in a loop, or we copy as much as possible with 'movem'
-     and then the last block (<44 bytes) is copied here.
-     This will work since 'movem' will have updated src,dst,n. */
+  while (n >= 16)
+    {
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
 
-  while ( n >= 16 )
-  {
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    n -= 16;
-  }
+      n -= 16;
+    }
 
-  /* A switch() is definitely the fastest although it takes a LOT of code.
-   * Particularly if you inline code this.
-   */
   switch (n)
-  {
+    {
     case 0:
       break;
+
     case 1:
-      *(char*)dst = *(char*)src;
+      *dst = *src;
       break;
+
     case 2:
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
       break;
+
     case 3:
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 4:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
       break;
+
     case 5:
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 6:
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 7:
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 8:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src;
       break;
+
     case 9:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 10:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 11:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 12:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src;
       break;
+
     case 13:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 14:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 15:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
-  }
+    }
 
-  return return_dst; /* destination pointer. */
-} /* memcpy() */
+  return return_dst;
+}
diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c
index 04d0cf35a27..0b5b70d5f58 100644
--- a/arch/cris/arch-v32/lib/usercopy.c
+++ b/arch/cris/arch-v32/lib/usercopy.c
@@ -161,7 +161,7 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn)
    inaccessible.  */
 
 unsigned long
-__copy_user_zeroing (void __user *pdst, const void *psrc, unsigned long pn)
+__copy_user_zeroing(void *pdst, const void __user *psrc, unsigned long pn)
 {
   /* We want the parameters put in special registers.
      Make sure the compiler is able to make something useful of this.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index dff9edfc746..56762d3c2a6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -18,6 +18,7 @@ config IA64
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5b8d8382b76..1189d8d6170 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -90,6 +90,7 @@ config PPC
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b21444b681b..9892827b617 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -61,6 +61,7 @@ config S390
 	def_bool y
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 
 source "init/Kconfig"
 
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 3af378ddb6a..463d1be32c9 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -10,6 +10,7 @@ config SPARC
 	default y
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 
 config SPARC64
 	bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53800b80a20..f41c9538ca3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,6 +21,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)