Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--   arch/powerpc/lib/Makefile                 |   7
-rw-r--r--   arch/powerpc/lib/checksum_64.S            | 482
-rw-r--r--   arch/powerpc/lib/checksum_wrappers_64.c   | 102
-rw-r--r--   arch/powerpc/lib/copy_32.S                |   2
-rw-r--r--   arch/powerpc/lib/ldstfp.S                 |  36
-rw-r--r--   arch/powerpc/lib/locks.c                  |   4
-rw-r--r--   arch/powerpc/lib/sstep.c                  |   8
7 files changed, 491 insertions, 150 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 5bb89c82807..889f2bc106d 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -4,9 +4,7 @@
 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
-ifeq ($(CONFIG_PPC64),y)
-EXTRA_CFLAGS	+= -mno-minimal-toc
-endif
+ccflags-$(CONFIG_PPC64)	:= -mno-minimal-toc
 
 CFLAGS_REMOVE_code-patching.o = -pg
 CFLAGS_REMOVE_feature-fixups.o = -pg
@@ -17,7 +15,8 @@ obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
 obj-$(CONFIG_HAS_IOMEM)	+= devres.o
 
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
-			   memcpy_64.o usercopy_64.o mem_64.o string.o
+			   memcpy_64.o usercopy_64.o mem_64.o string.o \
+			   checksum_wrappers_64.o
 
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index ef96c6c58ef..18245af38ae 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -65,165 +65,393 @@ _GLOBAL(csum_tcpudp_magic)
 	srwi	r3,r3,16
 	blr
 
+#define STACKFRAMESIZE 256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
 /*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
- * This code assumes at least halfword alignment, though the length
- * can be any number of bytes.  The sum is accumulated in r5.
- *
  * csum_partial(r3=buff, r4=len, r5=sum)
  */
 _GLOBAL(csum_partial)
-	subi	r3,r3,8		/* we'll offset by 8 for the loads */
-	srdi.	r6,r4,3		/* divide by 8 for doubleword count */
-	addic	r5,r5,0		/* clear carry */
-	beq	3f		/* if we're doing < 8 bytes */
-	andi.	r0,r3,2		/* aligned on a word boundary already? */
-	beq+	1f
-	lhz	r6,8(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
-	subi	r4,r4,2
-	addc	r5,r5,r6
-	srdi.	r6,r4,3		/* recompute number of doublewords */
-	beq	3f		/* any left? */
-1:	mtctr	r6
-2:	ldu	r6,8(r3)	/* main sum loop */
-	adde	r5,r5,r6
-	bdnz	2b
-	andi.	r4,r4,7		/* compute bytes left to sum after doublewords */
-3:	cmpwi	0,r4,4		/* is at least a full word left? */
-	blt	4f
-	lwz	r6,8(r3)	/* sum this word */
+	addic	r0,r5,0			/* clear carry */
+
+	srdi.	r6,r4,3			/* less than 8 bytes? */
+	beq	.Lcsum_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcsum_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+	lhz	r6,0(r3)		/* align to doubleword */
+	subi	r4,r4,2
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	bdnz	1b
+
+.Lcsum_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r4,7
+	beq	.Lcsum_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r4,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+
+	adde	r0,r0,r11
+
+	adde	r0,r0,r12
+
+	adde	r0,r0,r14
+
+	adde	r0,r0,r15
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+	bdnz	2b
+
+
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+	adde	r0,r0,r11
+	adde	r0,r0,r12
+	adde	r0,r0,r14
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r4,r4,63
+
+.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r4,3
+	beq	.Lcsum_tail_word
+
+	mtctr	r6
+3:
+	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+	bdnz	3b
+
+	andi.	r4,r4,7
+
+.Lcsum_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r4,2
+	beq	.Lcsum_tail_halfword
+
+	lwz	r6,0(r3)
 	addi	r3,r3,4
+	adde	r0,r0,r6
 	subi	r4,r4,4
-	adde	r5,r5,r6
-4:	cmpwi	0,r4,2		/* is at least a halfword left? */
-	blt+	5f
-	lhz	r6,8(r3)	/* sum this halfword */
-	addi	r3,r3,2
-	subi	r4,r4,2
-	adde	r5,r5,r6
-5:	cmpwi	0,r4,1		/* is at least a byte left? */
-	bne+	6f
-	lbz	r6,8(r3)	/* sum this byte */
-	slwi	r6,r6,8		/* this byte is assumed to be the upper byte of a halfword */
-	adde	r5,r5,r6
-6:	addze	r5,r5		/* add in final carry */
-	rldicl	r4,r5,32,0	/* fold two 32-bit halves together */
-	add	r3,r4,r5
-	srdi	r3,r3,32
-	blr
+
+.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r4,1
+	beq	.Lcsum_tail_byte
+
+	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	subi	r4,r4,2
+
+.Lcsum_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r4,1
+	beq	.Lcsum_finish
+
+	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+
+.Lcsum_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
+
+
+	.macro source
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Lsrc_error
+	.previous
+	.endm
+
+	.macro dest
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldest_error
+	.previous
+	.endm
 
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
  * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
- * src) zeroes the rest of dst.
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky.  For now we'll leave it until we instrument it somehow.
+ * to *src_err or *dst_err respectively. The caller must take any action
+ * required in this case (zeroing memory, recalculating partial checksum etc).
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-	addic	r0,r6,0
-	subi	r3,r3,4
-	subi	r4,r4,4
-	srwi.	r6,r5,2
-	beq	3f		/* if we're doing < 4 bytes */
-	andi.	r9,r4,2		/* Align dst to longword boundary */
-	beq+	1f
-81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
+	addic	r0,r6,0			/* clear carry */
+
+	srdi.	r6,r5,3			/* less than 8 bytes? */
+	beq	.Lcopy_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 *
+	 * If the source and destination are relatively unaligned we only
+	 * align the source. This keeps things simple.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcopy_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+source;	lhz	r6,0(r3)		/* align to doubleword */
 	subi	r5,r5,2
-91:	sth	r6,4(r4)
-	addi	r4,r4,2
-	addc	r0,r0,r6
-	srwi.	r6,r5,2		/* # words to do */
-	beq	3f
-1:	mtctr	r6
-82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
-92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
-	adde	r0,r0,r6
-	bdnz	82b
-	andi.	r5,r5,3
-3:	cmpwi	0,r5,2
-	blt+	4f
-83:	lhz	r6,4(r3)
 	addi	r3,r3,2
-	subi	r5,r5,2
-93:	sth	r6,4(r4)
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
 	addi	r4,r4,2
+	bdnz	1b
+
+.Lcopy_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r5,7
+	beq	.Lcopy_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r5,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
 	adde	r0,r0,r6
-4:	cmpwi	0,r5,1
-	bne+	5f
-84:	lbz	r6,4(r3)
-94:	stb	r6,4(r4)
-	slwi	r6,r6,8		/* Upper byte of word */
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+	bdnz	2b
+
 	adde	r0,r0,r6
-5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
-	rldicl	r4,r3,32,0	/* fold 64 bit value */
-	add	r3,r4,r3
-	srdi	r3,r3,32
-	blr
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
-/* These shouldn't go in the fixup section, since that would
-   cause the ex_table addresses to get out of order. */
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r5,r5,63
+
+.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r5,3
+	beq	.Lcopy_tail_word
 
-	.globl src_error_1
-src_error_1:
-	li	r6,0
-	subi	r5,r5,2
-95:	sth	r6,4(r4)
-	addi	r4,r4,2
-	srwi.	r6,r5,2
-	beq	3f
 	mtctr	r6
-	.globl src_error_2
-src_error_2:
-	li	r6,0
-96:	stwu	r6,4(r4)
-	bdnz	96b
-3:	andi.	r5,r5,3
-	beq	src_error
-	.globl src_error_3
-src_error_3:
-	li	r6,0
-	mtctr	r5
-	addi	r4,r4,3
-97:	stbu	r6,1(r4)
-	bdnz	97b
-	.globl src_error
-src_error:
+3:
+source;	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+dest;	std	r6,0(r4)
+	addi	r4,r4,8
+	bdnz	3b
+
+	andi.	r5,r5,7
+
+.Lcopy_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r5,2
+	beq	.Lcopy_tail_halfword
+
+source;	lwz	r6,0(r3)
+	addi	r3,r3,4
+	adde	r0,r0,r6
+dest;	stw	r6,0(r4)
+	addi	r4,r4,4
+	subi	r5,r5,4
+
+.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r5,1
+	beq	.Lcopy_tail_byte
+
+source;	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
+	addi	r4,r4,2
+	subi	r5,r5,2
+
+.Lcopy_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r5,1
+	beq	.Lcopy_finish
+
+source;	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+dest;	stb	r6,0(r4)
+
+.Lcopy_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
+
+.Lsrc_error:
 	cmpdi	0,r7,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r7)
-1:	addze	r3,r0
 	blr
 
-	.globl dst_error
-dst_error:
+.Ldest_error:
 	cmpdi	0,r8,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r8)
-1:	addze	r3,r0
 	blr
-
-.section __ex_table,"a"
-	.align	3
-	.llong	81b,src_error_1
-	.llong	91b,dst_error
-	.llong	82b,src_error_2
-	.llong	92b,dst_error
-	.llong	83b,src_error_3
-	.llong	93b,dst_error
-	.llong	84b,src_error_3
-	.llong	94b,dst_error
-	.llong	95b,dst_error
-	.llong	96b,dst_error
-	.llong	97b,dst_error
diff --git a/arch/powerpc/lib/checksum_wrappers_64.c b/arch/powerpc/lib/checksum_wrappers_64.c
new file mode 100644
index 00000000000..769b817fbb3
--- /dev/null
+++ b/arch/powerpc/lib/checksum_wrappers_64.c
@@ -0,0 +1,102 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2010
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/checksum.h>
+#include <asm/uaccess.h>
+
+__wsum csum_and_copy_from_user(const void __user *src, void *dst,
+			       int len, __wsum sum, int *err_ptr)
+{
+	unsigned int csum;
+
+	might_sleep();
+
+	*err_ptr = 0;
+
+	if (!len) {
+		csum = 0;
+		goto out;
+	}
+
+	if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) {
+		*err_ptr = -EFAULT;
+		csum = (__force unsigned int)sum;
+		goto out;
+	}
+
+	csum = csum_partial_copy_generic((void __force *)src, dst,
+					 len, sum, err_ptr, NULL);
+
+	if (unlikely(*err_ptr)) {
+		int missing = __copy_from_user(dst, src, len);
+
+		if (missing) {
+			memset(dst + len - missing, 0, missing);
+			*err_ptr = -EFAULT;
+		} else {
+			*err_ptr = 0;
+		}
+
+		csum = csum_partial(dst, len, sum);
+	}
+
+out:
+	return (__force __wsum)csum;
+}
+EXPORT_SYMBOL(csum_and_copy_from_user);
+
+__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
+			     __wsum sum, int *err_ptr)
+{
+	unsigned int csum;
+
+	might_sleep();
+
+	*err_ptr = 0;
+
+	if (!len) {
+		csum = 0;
+		goto out;
+	}
+
+	if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) {
+		*err_ptr = -EFAULT;
+		csum = -1; /* invalid checksum */
+		goto out;
+	}
+
+	csum = csum_partial_copy_generic(src, (void __force *)dst,
+					 len, sum, NULL, err_ptr);
+
+	if (unlikely(*err_ptr)) {
+		csum = csum_partial(src, len, sum);
+
+		if (copy_to_user(dst, src, len)) {
+			*err_ptr = -EFAULT;
+			csum = -1; /* invalid checksum */
+		}
+	}
+
+out:
+	return (__force __wsum)csum;
+}
+EXPORT_SYMBOL(csum_and_copy_to_user);
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index 74a7f4130b4..55f19f9fd70 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -62,7 +62,7 @@
 	.text
 	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
-	.stabs	"copy32.S",N_SO,0,0,0f
+	.stabs	"copy_32.S",N_SO,0,0,0f
 0:
 
 CACHELINE_BYTES = L1_CACHE_BYTES
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index f6448636baf..6a85380520b 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -17,6 +17,8 @@
 #include <asm/asm-offsets.h>
 #include <linux/errno.h>
 
+#ifdef CONFIG_PPC_FPU
+
 #define STKFRM	(PPC_MIN_STKFRM + 16)
 
 .macro	extab	instr,handler
@@ -81,7 +83,7 @@ _GLOBAL(do_lfs)
 	mfmsr	r6
 	ori	r7,r6,MSR_FP
 	cmpwi	cr7,r3,0
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	stfd	fr0,STKFRM-16(r1)
@@ -93,7 +95,7 @@ _GLOBAL(do_lfs)
 	lfd	fr0,STKFRM-16(r1)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -108,7 +110,7 @@ _GLOBAL(do_lfd)
 	mfmsr	r6
 	ori	r7,r6,MSR_FP
 	cmpwi	cr7,r3,0
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	stfd	fr0,STKFRM-16(r1)
@@ -120,7 +122,7 @@ _GLOBAL(do_lfd)
 	lfd	fr0,STKFRM-16(r1)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -135,7 +137,7 @@ _GLOBAL(do_stfs)
 	mfmsr	r6
 	ori	r7,r6,MSR_FP
 	cmpwi	cr7,r3,0
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	stfd	fr0,STKFRM-16(r1)
@@ -147,7 +149,7 @@ _GLOBAL(do_stfs)
 	lfd	fr0,STKFRM-16(r1)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -162,7 +164,7 @@ _GLOBAL(do_stfd)
 	mfmsr	r6
 	ori	r7,r6,MSR_FP
 	cmpwi	cr7,r3,0
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	stfd	fr0,STKFRM-16(r1)
@@ -174,7 +176,7 @@ _GLOBAL(do_stfd)
 	lfd	fr0,STKFRM-16(r1)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -229,7 +231,7 @@ _GLOBAL(do_lvx)
 	oris	r7,r6,MSR_VEC@h
 	cmpwi	cr7,r3,0
 	li	r8,STKFRM-16
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	stvx	vr0,r1,r8
@@ -241,7 +243,7 @@ _GLOBAL(do_lvx)
 	lvx	vr0,r1,r8
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -257,7 +259,7 @@ _GLOBAL(do_stvx)
 	oris	r7,r6,MSR_VEC@h
 	cmpwi	cr7,r3,0
 	li	r8,STKFRM-16
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	stvx	vr0,r1,r8
@@ -269,7 +271,7 @@ _GLOBAL(do_stvx)
 	lvx	vr0,r1,r8
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -325,7 +327,7 @@ _GLOBAL(do_lxvd2x)
 	oris	r7,r6,MSR_VSX@h
 	cmpwi	cr7,r3,0
 	li	r8,STKFRM-16
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	STXVD2X(0,r1,r8)
@@ -337,7 +339,7 @@ _GLOBAL(do_lxvd2x)
 	LXVD2X(0,r1,r8)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -353,7 +355,7 @@ _GLOBAL(do_stxvd2x)
 	oris	r7,r6,MSR_VSX@h
 	cmpwi	cr7,r3,0
 	li	r8,STKFRM-16
-	mtmsrd	r7
+	MTMSRD(r7)
 	isync
 	beq	cr7,1f
 	STXVD2X(0,r1,r8)
@@ -365,7 +367,7 @@ _GLOBAL(do_stxvd2x)
 	LXVD2X(0,r1,r8)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
-	mtmsrd	r6
+	MTMSRD(r6)
 	isync
 	mr	r3,r9
 	addi	r1,r1,STKFRM
@@ -373,3 +375,5 @@ _GLOBAL(do_stxvd2x)
 	extab	2b,3b
 
 #endif /* CONFIG_VSX */
+
+#endif /* CONFIG_PPC_FPU */
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 58e14fba11b..9b8182e8216 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -34,7 +34,7 @@ void __spin_yield(arch_spinlock_t *lock)
 		return;
 	holder_cpu = lock_value & 0xffff;
 	BUG_ON(holder_cpu >= NR_CPUS);
-	yield_count = lppaca[holder_cpu].yield_count;
+	yield_count = lppaca_of(holder_cpu).yield_count;
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
@@ -65,7 +65,7 @@ void __rw_yield(arch_rwlock_t *rw)
 		return;		/* no write lock at present */
 	holder_cpu = lock_value & 0xffff;
 	BUG_ON(holder_cpu >= NR_CPUS);
-	yield_count = lppaca[holder_cpu].yield_count;
+	yield_count = lppaca_of(holder_cpu).yield_count;
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index e0a9858d537..ae5189ab004 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -30,6 +30,7 @@ extern char system_call_common[];
 #define XER_OV		0x40000000U
 #define XER_CA		0x20000000U
 
+#ifdef CONFIG_PPC_FPU
 /*
  * Functions in ldstfp.S
  */
@@ -41,6 +42,7 @@ extern int do_lvx(int rn, unsigned long ea);
 extern int do_stvx(int rn, unsigned long ea);
 extern int do_lxvd2x(int rn, unsigned long ea);
 extern int do_stxvd2x(int rn, unsigned long ea);
+#endif
 
 /*
  * Determine whether a conditional branch instruction would branch.
 */
@@ -290,6 +292,7 @@ static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
 	return write_mem_unaligned(val, ea, nb, regs);
 }
 
+#ifdef CONFIG_PPC_FPU
 /*
  * Check the address and alignment, and call func to do the actual
  * load or store.
@@ -351,6 +354,7 @@ static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
 	}
 	return err;
 }
+#endif
 
 #ifdef CONFIG_ALTIVEC
 /* For Altivec/VMX, no need to worry about alignment */
@@ -1393,6 +1397,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		regs->gpr[rd] = byterev_4(val);
 		goto ldst_done;
 
+#ifdef CONFIG_PPC_CPU
 	case 535:	/* lfsx */
 	case 567:	/* lfsux */
 		if (!(regs->msr & MSR_FP))
@@ -1424,6 +1429,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		ea = xform_ea(instr, regs, u);
 		err = do_fp_store(rd, do_stfd, ea, 8, regs);
 		goto ldst_done;
+#endif
 
 #ifdef __powerpc64__
 	case 660:	/* stdbrx */
@@ -1534,6 +1540,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		} while (++rd < 32);
 		goto instr_done;
 
+#ifdef CONFIG_PPC_FPU
 	case 48:	/* lfs */
 	case 49:	/* lfsu */
 		if (!(regs->msr & MSR_FP))
@@ -1565,6 +1572,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		ea = dform_ea(instr, regs);
 		err = do_fp_store(rd, do_stfd, ea, 8, regs);
 		goto ldst_done;
+#endif
 
 #ifdef __powerpc64__
 	case 58:	/* ld[u], lwa */
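
Both new tails (.Lcsum_finish and .Lcopy_finish) reduce the 64-bit running sum the same way: addze folds in the last carry, rldicl r4,r0,32,0 rotates the accumulator by 32 bits, add r3,r4,r0 adds the two 32-bit halves (a carry between them lands in the high word), and srdi r3,r3,32 extracts that high word. A minimal standalone C sketch of the same fold; the helper name fold_csum64 is illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Illustration of the .Lcsum_finish / .Lcopy_finish fold (hypothetical helper). */
static uint32_t fold_csum64(uint64_t sum)
{
	/* rldicl r4,r0,32,0: rotate the 64-bit sum by 32 bits */
	uint64_t rotated = (sum << 32) | (sum >> 32);

	/* add r3,r4,r0; srdi r3,r3,32: the high word now holds lo + hi + carry */
	return (uint32_t)((rotated + sum) >> 32);
}

int main(void)
{
	/* high half 0x1 plus low half 0xfffffffe gives 0xffffffff */
	printf("%08x\n", (unsigned)fold_csum64(0x1fffffffeULL));
	return 0;
}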
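
The new checksum_wrappers_64.c gives the rest of the kernel one copy-and-checksum entry point with the fault handling kept inside the wrapper. A hypothetical caller sketch, assuming kernel context; copy_and_check is an invented name and the csum_fold call is shown only for illustration:

#include <asm/checksum.h>
#include <asm/uaccess.h>

/* Hypothetical caller, not part of the patch. */
static int copy_and_check(const void __user *uptr, void *kbuf, int len)
{
	int err = 0;
	__wsum sum;

	sum = csum_and_copy_from_user(uptr, kbuf, len, 0, &err);
	if (err)
		return err;	/* -EFAULT reported via the error pointer */

	return csum_fold(sum);	/* fold the 32-bit accumulator to the final 16 bits */
}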