8 files changed, 340 insertions, 256 deletions
diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile
index d97c803719e..0e48bc61c27 100644
--- a/arch/sh/kernel/cpu/Makefile
+++ b/arch/sh/kernel/cpu/Makefile
@@ -17,5 +17,7 @@ obj-$(CONFIG_ARCH_SHMOBILE)	+= shmobile/
 
 obj-$(CONFIG_SH_ADC)		+= adc.o
 obj-$(CONFIG_SH_CLK_CPG)	+= clock-cpg.o
+obj-$(CONFIG_SH_FPU)		+= fpu.o
+obj-$(CONFIG_SH_FPU_EMU)	+= fpu.o
 
 obj-y	+= irq/ init.o clock.o hwblk.o
diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c
new file mode 100644
index 00000000000..c23e6727002
--- /dev/null
+++ b/arch/sh/kernel/cpu/fpu.c
@@ -0,0 +1,82 @@
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/fpu.h>
+
+int init_fpu(struct task_struct *tsk)
+{
+	if (tsk_used_math(tsk)) {	/* nothing to set up again */
+		if ((boot_cpu_data.flags & CPU_HAS_FPU) && tsk == current)
+			unlazy_fpu(tsk, task_pt_regs(tsk)); /* presumably saves live regs - confirm */
+		return 0;
+	}
+
+	/*
+	 * Allocate tsk's xstate buffer on first use; -ENOMEM if that fails.
+	 */
+	if (!tsk->thread.xstate) {
+		tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
+						      GFP_KERNEL);
+		if (!tsk->thread.xstate)
+			return -ENOMEM;
+	}
+
+	if (boot_cpu_data.flags & CPU_HAS_FPU) {	/* hardware FPU layout */
+		struct sh_fpu_hard_struct *fp = &tsk->thread.xstate->hardfpu;
+		memset(fp, 0, xstate_size);
+		fp->fpscr = FPSCR_INIT;
+	} else {					/* softfpu layout */
+		struct sh_fpu_soft_struct *fp = &tsk->thread.xstate->softfpu;
+		memset(fp, 0, xstate_size);
+		fp->fpscr = FPSCR_INIT;
+	}
+
+	set_stopped_child_used_math(tsk);	/* later calls take the early return */
+	return 0;
+}
+
+#ifdef CONFIG_SH_FPU
+void __fpu_state_restore(void)
+{
+	struct task_struct *tsk = current;
+
+	restore_fpu(tsk);	/* reload FPU registers from tsk->thread.xstate */
+
+	task_thread_info(tsk)->status |= TS_USEDFPU;	/* flag set only after the reload */
+	tsk->fpu_counter++;	/* one more restore for this task */
+}
+
+void fpu_state_restore(struct pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+
+	if (unlikely(!user_mode(regs))) {	/* trap did not come from user mode */
+		printk(KERN_ERR "BUG: FPU is used in kernel mode.\n");
+		BUG();
+		return;
+	}
+
+	if (!tsk_used_math(tsk)) {
+		/*
+		 * First FP use: init_fpu() may sleep in its GFP_KERNEL alloc
+		 */
+		if (init_fpu(tsk)) {
+			/*
+			 * Out of memory: give up and SIGKILL the thread group
+			 */
+			do_group_exit(SIGKILL);
+			return;
+		}
+	}
+
+	grab_fpu(regs);	/* NOTE(review): presumably enables FPU access for this context - confirm */
+
+	__fpu_state_restore();
+}
+
+BUILD_TRAP_HANDLER(fpu_state_restore)
+{
+	TRAP_HANDLER_DECL;	/* presumably provides `regs`; macro defined elsewhere - confirm */
+
+	fpu_state_restore(regs);	/* hand off to the C handler above */
+}
+#endif /* CONFIG_SH_FPU */
diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c
index 89b4b76c0d7..a5bb0550bbf 100644
--- a/arch/sh/kernel/cpu/init.c
+++ b/arch/sh/kernel/cpu/init.c
@@ -24,22 +24,31 @@
 #include <asm/elf.h>
 #include <asm/io.h>
 #include <asm/smp.h>
-#ifdef CONFIG_SUPERH32
-#include <asm/ubc.h>
+
+#ifdef CONFIG_SH_FPU
+#define cpu_has_fpu	1
+#else
+#define cpu_has_fpu	0
+#endif
+
+#ifdef CONFIG_SH_DSP
+#define cpu_has_dsp	1
+#else
+#define cpu_has_dsp	0
 #endif
 
 /*
  * Generic wrapper for command line arguments to disable on-chip
  * peripherals (nofpu, nodsp, and so forth).
  */
-#define onchip_setup(x)				\
-static int x##_disabled __initdata = 0;		\
-						\
-static int __init x##_setup(char *opts)		\
-{						\
-	x##_disabled = 1;			\
-	return 1;				\
-}						\
+#define onchip_setup(x)					\
+static int x##_disabled __initdata = !cpu_has_##x;	\
+							\
+static int __init x##_setup(char *opts)			\
+{							\
+	x##_disabled = 1;				\
+	return 1;					\
+}							\
 __setup("no" __stringify(x), x##_setup);
 
 onchip_setup(fpu);
@@ -207,6 +216,18 @@ static void detect_cache_shape(void)
 		l2_cache_shape = -1; /* No S-cache */
 }
 
+static void __init fpu_init(void)
+{
+	/* Hide a detected FPU when fpu_disabled is set (e.g. by "nofpu") */
+	if (fpu_disabled && (current_cpu_data.flags & CPU_HAS_FPU)) {
+		printk("FPU Disabled\n");
+		current_cpu_data.flags &= ~CPU_HAS_FPU;
+	}
+
+	disable_fpu();		/* runs whether or not CPU_HAS_FPU is set */
+	clear_used_math();
+}
+
 #ifdef CONFIG_SH_DSP
 static void __init release_dsp(void)
 {
@@ -244,28 +265,35 @@ static void __init dsp_init(void)
 	if (sr & SR_DSP)
 		current_cpu_data.flags |= CPU_HAS_DSP;
 
+	/* Disable the DSP */
+	if (dsp_disabled && (current_cpu_data.flags & CPU_HAS_DSP)) {
+		printk("DSP Disabled\n");
+		current_cpu_data.flags &= ~CPU_HAS_DSP;
+	}
+
 	/* Now that we've determined the DSP status, clear the DSP bit. */
 	release_dsp();
 }
+#else
+static inline void __init dsp_init(void) { }
 #endif /* CONFIG_SH_DSP */
 
 /**
  * sh_cpu_init
  *
- * This is our initial entry point for each CPU, and is invoked on the boot
- * CPU prior to calling start_kernel(). For SMP, a combination of this and
- * start_secondary() will bring up each processor to a ready state prior
- * to hand forking the idle loop.
+ * This is our initial entry point for each CPU, and is invoked on the
+ * boot CPU prior to calling start_kernel(). For SMP, a combination of
+ * this and start_secondary() will bring up each processor to a ready
+ * state prior to hand forking the idle loop.
  *
- * We do all of the basic processor init here, including setting up the
- * caches, FPU, DSP, kicking the UBC, etc. By the time start_kernel() is
- * hit (and subsequently platform_setup()) things like determining the
- * CPU subtype and initial configuration will all be done.
+ * We do all of the basic processor init here, including setting up
+ * the caches, FPU, DSP, etc. By the time start_kernel() is hit (and
+ * subsequently platform_setup()) things like determining the CPU
+ * subtype and initial configuration will all be done.
  *
  * Each processor family is still responsible for doing its own probing
  * and cache configuration in detect_cpu_and_cache_system().
  */
-
 asmlinkage void __init sh_cpu_init(void)
 {
 	current_thread_info()->cpu = hard_smp_processor_id();
@@ -302,18 +330,8 @@ asmlinkage void __init sh_cpu_init(void)
 		detect_cache_shape();
 	}
 
-	/* Disable the FPU */
-	if (fpu_disabled) {
-		printk("FPU Disabled\n");
-		current_cpu_data.flags &= ~CPU_HAS_FPU;
-	}
-
-	/* FPU initialization */
-	disable_fpu();
-	if ((current_cpu_data.flags & CPU_HAS_FPU)) {
-		current_thread_info()->status &= ~TS_USEDFPU;
-		clear_used_math();
-	}
+	fpu_init();
+	dsp_init();
 
 	/*
 	 * Initialize the per-CPU ASID cache very early, since the
@@ -321,18 +339,12 @@ asmlinkage void __init sh_cpu_init(void)
 	 */
 	current_cpu_data.asid_cache = NO_CONTEXT;
 
-#ifdef CONFIG_SH_DSP
-	/* Probe for DSP */
-	dsp_init();
-
-	/* Disable the DSP */
-	if (dsp_disabled) {
-		printk("DSP Disabled\n");
-		current_cpu_data.flags &= ~CPU_HAS_DSP;
-		release_dsp();
-	}
-#endif
-
 	speculative_execution_init();
 	expmask_init();
+
+	/*
+	 * The boot processor sets up the FP and extended state context info.
+	 */
+	if (raw_smp_processor_id() == 0)
+		init_thread_xstate();
 }
diff --git a/arch/sh/kernel/cpu/sh2a/fpu.c b/arch/sh/kernel/cpu/sh2a/fpu.c
index d395ce5740e..488d24e0cdf 100644
--- a/arch/sh/kernel/cpu/sh2a/fpu.c
+++ b/arch/sh/kernel/cpu/sh2a/fpu.c
@@ -26,8 +26,7 @@
 /*
  * Save FPU registers onto task structure.
  */
-void
-save_fpu(struct task_struct *tsk)
+void save_fpu(struct task_struct *tsk)
 {
 	unsigned long dummy;
 
@@ -52,7 +51,7 @@ save_fpu(struct task_struct *tsk)
 		     "fmov.s	fr0, @-%0\n\t"
 		     "lds	%3, fpscr\n\t"
 		     : "=r" (dummy)
-		     : "0" ((char *)(&tsk->thread.fpu.hard.status)),
+		     : "0" ((char *)(&tsk->thread.xstate->hardfpu.status)),
 		       "r" (FPSCR_RCHG),
 		       "r" (FPSCR_INIT)
 		     : "memory");
@@ -60,8 +59,7 @@ save_fpu(struct task_struct *tsk)
 	disable_fpu();
 }
 
-static void
-restore_fpu(struct task_struct *tsk)
+void restore_fpu(struct task_struct *tsk)
 {
 	unsigned long dummy;
 
@@ -85,45 +83,12 @@ restore_fpu(struct task_struct *tsk)
 		     "lds.l	@%0+, fpscr\n\t"
 		     "lds.l	@%0+, fpul\n\t"
 		     : "=r" (dummy)
-		     : "0" (&tsk->thread.fpu), "r" (FPSCR_RCHG)
+		     : "0" (tsk->thread.xstate), "r" (FPSCR_RCHG)
 		     : "memory");
 	disable_fpu();
 }
 
 /*
- * Load the FPU with signalling NANS.  This bit pattern we're using
- * has the property that no matter wether considered as single or as
- * double precission represents signaling NANS.
- */
-
-static void
-fpu_init(void)
-{
-	enable_fpu();
-	asm volatile("lds	%0, fpul\n\t"
-		     "fsts	fpul, fr0\n\t"
-		     "fsts	fpul, fr1\n\t"
-		     "fsts	fpul, fr2\n\t"
-		     "fsts	fpul, fr3\n\t"
-		     "fsts	fpul, fr4\n\t"
-		     "fsts	fpul, fr5\n\t"
-		     "fsts	fpul, fr6\n\t"
-		     "fsts	fpul, fr7\n\t"
-		     "fsts	fpul, fr8\n\t"
-		     "fsts	fpul, fr9\n\t"
-		     "fsts	fpul, fr10\n\t"
-		     "fsts	fpul, fr11\n\t"
-		     "fsts	fpul, fr12\n\t"
-		     "fsts	fpul, fr13\n\t"
-		     "fsts	fpul, fr14\n\t"
-		     "fsts	fpul, fr15\n\t"
-		     "lds	%2, fpscr\n\t"
-		     : /* no output */
-		     : "r" (0), "r" (FPSCR_RCHG), "r" (FPSCR_INIT));
-	disable_fpu();
-}
-
-/*
  *	Emulate arithmetic ops on denormalized number for some FPU insns.
  */
 
@@ -490,9 +455,9 @@ ieee_fpe_handler (struct pt_regs *regs)
 	if ((finsn & 0xf1ff) == 0xf0ad) { /* fcnvsd */
 		struct task_struct *tsk = current;
 
-		if ((tsk->thread.fpu.hard.fpscr & FPSCR_FPU_ERROR)) {
+		if ((tsk->thread.xstate->hardfpu.fpscr & FPSCR_FPU_ERROR)) {
 			/* FPU error */
-			denormal_to_double (&tsk->thread.fpu.hard,
+			denormal_to_double (&tsk->thread.xstate->hardfpu,
 					    (finsn >> 8) & 0xf);
 		} else
 			return 0;
@@ -507,9 +472,9 @@ ieee_fpe_handler (struct pt_regs *regs)
 
 		n = (finsn >> 8) & 0xf;
 		m = (finsn >> 4) & 0xf;
-		hx = tsk->thread.fpu.hard.fp_regs[n];
-		hy = tsk->thread.fpu.hard.fp_regs[m];
-		fpscr = tsk->thread.fpu.hard.fpscr;
+		hx = tsk->thread.xstate->hardfpu.fp_regs[n];
+		hy = tsk->thread.xstate->hardfpu.fp_regs[m];
+		fpscr = tsk->thread.xstate->hardfpu.fpscr;
 		prec = fpscr & (1 << 19);
 
 		if ((fpscr & FPSCR_FPU_ERROR)
@@ -519,15 +484,15 @@ ieee_fpe_handler (struct pt_regs *regs)
 
 			/* FPU error because of denormal */
 			llx = ((long long) hx << 32)
-			       | tsk->thread.fpu.hard.fp_regs[n+1];
+			       | tsk->thread.xstate->hardfpu.fp_regs[n+1];
 			lly = ((long long) hy << 32)
-			       | tsk->thread.fpu.hard.fp_regs[m+1];
+			       | tsk->thread.xstate->hardfpu.fp_regs[m+1];
 			if ((hx & 0x7fffffff) >= 0x00100000)
 				llx = denormal_muld(lly, llx);
 			else
 				llx = denormal_muld(llx, lly);
-			tsk->thread.fpu.hard.fp_regs[n] = llx >> 32;
-			tsk->thread.fpu.hard.fp_regs[n+1] = llx & 0xffffffff;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = llx >> 32;
+			tsk->thread.xstate->hardfpu.fp_regs[n+1] = llx & 0xffffffff;
 		} else if ((fpscr & FPSCR_FPU_ERROR)
 		     && (!prec && ((hx & 0x7fffffff) < 0x00800000
 				   || (hy & 0x7fffffff) < 0x00800000))) {
@@ -536,7 +501,7 @@ ieee_fpe_handler (struct pt_regs *regs)
 				hx = denormal_mulf(hy, hx);
 			else
 				hx = denormal_mulf(hx, hy);
-			tsk->thread.fpu.hard.fp_regs[n] = hx;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = hx;
 		} else
 			return 0;
 
@@ -550,9 +515,9 @@ ieee_fpe_handler (struct pt_regs *regs)
 
 		n = (finsn >> 8) & 0xf;
 		m = (finsn >> 4) & 0xf;
-		hx = tsk->thread.fpu.hard.fp_regs[n];
-		hy = tsk->thread.fpu.hard.fp_regs[m];
-		fpscr = tsk->thread.fpu.hard.fpscr;
+		hx = tsk->thread.xstate->hardfpu.fp_regs[n];
+		hy = tsk->thread.xstate->hardfpu.fp_regs[m];
+		fpscr = tsk->thread.xstate->hardfpu.fpscr;
 		prec = fpscr & (1 << 19);
 
 		if ((fpscr & FPSCR_FPU_ERROR)
@@ -562,15 +527,15 @@ ieee_fpe_handler (struct pt_regs *regs)
 
 			/* FPU error because of denormal */
 			llx = ((long long) hx << 32)
-			       | tsk->thread.fpu.hard.fp_regs[n+1];
+			       | tsk->thread.xstate->hardfpu.fp_regs[n+1];
 			lly = ((long long) hy << 32)
-			       | tsk->thread.fpu.hard.fp_regs[m+1];
+			       | tsk->thread.xstate->hardfpu.fp_regs[m+1];
 			if ((finsn & 0xf00f) == 0xf000)
 				llx = denormal_addd(llx, lly);
 			else
 				llx = denormal_addd(llx, lly ^ (1LL << 63));
-			tsk->thread.fpu.hard.fp_regs[n] = llx >> 32;
-			tsk->thread.fpu.hard.fp_regs[n+1] = llx & 0xffffffff;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = llx >> 32;
+			tsk->thread.xstate->hardfpu.fp_regs[n+1] = llx & 0xffffffff;
 		} else if ((fpscr & FPSCR_FPU_ERROR)
 		     && (!prec && ((hx & 0x7fffffff) < 0x00800000
 				   || (hy & 0x7fffffff) < 0x00800000))) {
@@ -579,7 +544,7 @@ ieee_fpe_handler (struct pt_regs *regs)
 				hx = denormal_addf(hx, hy);
 			else
 				hx = denormal_addf(hx, hy ^ 0x80000000);
-			tsk->thread.fpu.hard.fp_regs[n] = hx;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = hx;
 		} else
 			return 0;
 
@@ -597,7 +562,7 @@ BUILD_TRAP_HANDLER(fpu_error)
 
 	__unlazy_fpu(tsk, regs);
 	if (ieee_fpe_handler(regs)) {
-		tsk->thread.fpu.hard.fpscr &=
+		tsk->thread.xstate->hardfpu.fpscr &=
 			~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK);
 		grab_fpu(regs);
 		restore_fpu(tsk);
@@ -607,33 +572,3 @@ BUILD_TRAP_HANDLER(fpu_error)
 
 	force_sig(SIGFPE, tsk);
 }
-
-void fpu_state_restore(struct pt_regs *regs)
-{
-	struct task_struct *tsk = current;
-
-	grab_fpu(regs);
-	if (unlikely(!user_mode(regs))) {
-		printk(KERN_ERR "BUG: FPU is used in kernel mode.\n");
-		BUG();
-		return;
-	}
-
-	if (likely(used_math())) {
-		/* Using the FPU again.  */
-		restore_fpu(tsk);
-	} else	{
-		/* First time FPU user.  */
-		fpu_init();
-		set_used_math();
-	}
-	task_thread_info(tsk)->status |= TS_USEDFPU;
-	tsk->fpu_counter++;
-}
-
-BUILD_TRAP_HANDLER(fpu_state_restore)
-{
-	TRAP_HANDLER_DECL;
-
-	fpu_state_restore(regs);
-}
diff --git a/arch/sh/kernel/cpu/sh3/ex.S b/arch/sh/kernel/cpu/sh3/ex.S
index 46610c35c23..99b4d020179 100644
--- a/arch/sh/kernel/cpu/sh3/ex.S
+++ b/arch/sh/kernel/cpu/sh3/ex.S
@@ -49,7 +49,7 @@ ENTRY(exception_handling_table)
 	.long	exception_error	! reserved_instruction (filled by trap_init) /* 180 */
 	.long	exception_error	! illegal_slot_instruction (filled by trap_init) /*1A0*/
 	.long	nmi_trap_handler	/* 1C0 */	! Allow trap to debugger
-	.long	break_point_trap	/* 1E0 */
+	.long	breakpoint_trap_handler	/* 1E0 */
 
 	/*
 	 * Pad the remainder of the table out, exceptions residing in far
diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c
index e97857aec8a..447482d7f65 100644
--- a/arch/sh/kernel/cpu/sh4/fpu.c
+++ b/arch/sh/kernel/cpu/sh4/fpu.c
@@ -85,14 +85,14 @@ void save_fpu(struct task_struct *tsk)
 		      "fmov.s	fr1, @-%0\n\t"
 		      "fmov.s	fr0, @-%0\n\t"
 		      "lds	%3, fpscr\n\t":"=r" (dummy)
-		      :"0"((char *)(&tsk->thread.fpu.hard.status)),
+		      :"0"((char *)(&tsk->thread.xstate->hardfpu.status)),
 		      "r"(FPSCR_RCHG), "r"(FPSCR_INIT)
 		      :"memory");
 
 	disable_fpu();
 }
 
-static void restore_fpu(struct task_struct *tsk)
+void restore_fpu(struct task_struct *tsk)
 {
 	unsigned long dummy;
 
@@ -135,62 +135,11 @@ static void restore_fpu(struct task_struct *tsk)
 		      "lds.l	@%0+, fpscr\n\t"
 		      "lds.l	@%0+, fpul\n\t"
 		      :"=r" (dummy)
-		      :"0"(&tsk->thread.fpu), "r"(FPSCR_RCHG)
+		      :"0" (tsk->thread.xstate), "r" (FPSCR_RCHG)
 		      :"memory");
 	disable_fpu();
 }
 
-/*
- * Load the FPU with signalling NANS.  This bit pattern we're using
- * has the property that no matter wether considered as single or as
- * double precision represents signaling NANS.
- */
-
-static void fpu_init(void)
-{
-	enable_fpu();
-	asm volatile (	"lds	%0, fpul\n\t"
-			"lds	%1, fpscr\n\t"
-			"fsts	fpul, fr0\n\t"
-			"fsts	fpul, fr1\n\t"
-			"fsts	fpul, fr2\n\t"
-			"fsts	fpul, fr3\n\t"
-			"fsts	fpul, fr4\n\t"
-			"fsts	fpul, fr5\n\t"
-			"fsts	fpul, fr6\n\t"
-			"fsts	fpul, fr7\n\t"
-			"fsts	fpul, fr8\n\t"
-			"fsts	fpul, fr9\n\t"
-			"fsts	fpul, fr10\n\t"
-			"fsts	fpul, fr11\n\t"
-			"fsts	fpul, fr12\n\t"
-			"fsts	fpul, fr13\n\t"
-			"fsts	fpul, fr14\n\t"
-			"fsts	fpul, fr15\n\t"
-			"frchg\n\t"
-			"fsts	fpul, fr0\n\t"
-			"fsts	fpul, fr1\n\t"
-			"fsts	fpul, fr2\n\t"
-			"fsts	fpul, fr3\n\t"
-			"fsts	fpul, fr4\n\t"
-			"fsts	fpul, fr5\n\t"
-			"fsts	fpul, fr6\n\t"
-			"fsts	fpul, fr7\n\t"
-			"fsts	fpul, fr8\n\t"
-			"fsts	fpul, fr9\n\t"
-			"fsts	fpul, fr10\n\t"
-			"fsts	fpul, fr11\n\t"
-			"fsts	fpul, fr12\n\t"
-			"fsts	fpul, fr13\n\t"
-			"fsts	fpul, fr14\n\t"
-			"fsts	fpul, fr15\n\t"
-			"frchg\n\t"
-			"lds	%2, fpscr\n\t"
-			:	/* no output */
-			:"r" (0), "r"(FPSCR_RCHG), "r"(FPSCR_INIT));
-	disable_fpu();
-}
-
 /**
  *      denormal_to_double - Given denormalized float number,
  *                           store double float
@@ -282,9 +231,9 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 		/* fcnvsd */
 		struct task_struct *tsk = current;
 
-		if ((tsk->thread.fpu.hard.fpscr & FPSCR_CAUSE_ERROR))
+		if ((tsk->thread.xstate->hardfpu.fpscr & FPSCR_CAUSE_ERROR))
 			/* FPU error */
-			denormal_to_double(&tsk->thread.fpu.hard,
+			denormal_to_double(&tsk->thread.xstate->hardfpu,
 					   (finsn >> 8) & 0xf);
 		else
 			return 0;
@@ -300,9 +249,9 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 
 		n = (finsn >> 8) & 0xf;
 		m = (finsn >> 4) & 0xf;
-		hx = tsk->thread.fpu.hard.fp_regs[n];
-		hy = tsk->thread.fpu.hard.fp_regs[m];
-		fpscr = tsk->thread.fpu.hard.fpscr;
+		hx = tsk->thread.xstate->hardfpu.fp_regs[n];
+		hy = tsk->thread.xstate->hardfpu.fp_regs[m];
+		fpscr = tsk->thread.xstate->hardfpu.fpscr;
 		prec = fpscr & FPSCR_DBL_PRECISION;
 
 		if ((fpscr & FPSCR_CAUSE_ERROR)
@@ -312,18 +261,18 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 
 			/* FPU error because of denormal (doubles) */
 			llx = ((long long)hx << 32)
-			    | tsk->thread.fpu.hard.fp_regs[n + 1];
+			    | tsk->thread.xstate->hardfpu.fp_regs[n + 1];
 			lly = ((long long)hy << 32)
-			    | tsk->thread.fpu.hard.fp_regs[m + 1];
+			    | tsk->thread.xstate->hardfpu.fp_regs[m + 1];
 			llx = float64_mul(llx, lly);
-			tsk->thread.fpu.hard.fp_regs[n] = llx >> 32;
-			tsk->thread.fpu.hard.fp_regs[n + 1] = llx & 0xffffffff;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = llx >> 32;
+			tsk->thread.xstate->hardfpu.fp_regs[n + 1] = llx & 0xffffffff;
 		} else if ((fpscr & FPSCR_CAUSE_ERROR)
 			   && (!prec && ((hx & 0x7fffffff) < 0x00800000
 					 || (hy & 0x7fffffff) < 0x00800000))) {
 			/* FPU error because of denormal (floats) */
 			hx = float32_mul(hx, hy);
-			tsk->thread.fpu.hard.fp_regs[n] = hx;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = hx;
 		} else
 			return 0;
 
@@ -338,9 +287,9 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 
 		n = (finsn >> 8) & 0xf;
 		m = (finsn >> 4) & 0xf;
-		hx = tsk->thread.fpu.hard.fp_regs[n];
-		hy = tsk->thread.fpu.hard.fp_regs[m];
-		fpscr = tsk->thread.fpu.hard.fpscr;
+		hx = tsk->thread.xstate->hardfpu.fp_regs[n];
+		hy = tsk->thread.xstate->hardfpu.fp_regs[m];
+		fpscr = tsk->thread.xstate->hardfpu.fpscr;
 		prec = fpscr & FPSCR_DBL_PRECISION;
 
 		if ((fpscr & FPSCR_CAUSE_ERROR)
@@ -350,15 +299,15 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 
 			/* FPU error because of denormal (doubles) */
 			llx = ((long long)hx << 32)
-			    | tsk->thread.fpu.hard.fp_regs[n + 1];
+			    | tsk->thread.xstate->hardfpu.fp_regs[n + 1];
 			lly = ((long long)hy << 32)
-			    | tsk->thread.fpu.hard.fp_regs[m + 1];
+			    | tsk->thread.xstate->hardfpu.fp_regs[m + 1];
 			if ((finsn & 0xf00f) == 0xf000)
 				llx = float64_add(llx, lly);
 			else
 				llx = float64_sub(llx, lly);
-			tsk->thread.fpu.hard.fp_regs[n] = llx >> 32;
-			tsk->thread.fpu.hard.fp_regs[n + 1] = llx & 0xffffffff;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = llx >> 32;
+			tsk->thread.xstate->hardfpu.fp_regs[n + 1] = llx & 0xffffffff;
 		} else if ((fpscr & FPSCR_CAUSE_ERROR)
 			   && (!prec && ((hx & 0x7fffffff) < 0x00800000
 					 || (hy & 0x7fffffff) < 0x00800000))) {
@@ -367,7 +316,7 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 				hx = float32_add(hx, hy);
 			else
 				hx = float32_sub(hx, hy);
-			tsk->thread.fpu.hard.fp_regs[n] = hx;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = hx;
 		} else
 			return 0;
 
@@ -382,9 +331,9 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 
 		n = (finsn >> 8) & 0xf;
 		m = (finsn >> 4) & 0xf;
-		hx = tsk->thread.fpu.hard.fp_regs[n];
-		hy = tsk->thread.fpu.hard.fp_regs[m];
-		fpscr = tsk->thread.fpu.hard.fpscr;
+		hx = tsk->thread.xstate->hardfpu.fp_regs[n];
+		hy = tsk->thread.xstate->hardfpu.fp_regs[m];
+		fpscr = tsk->thread.xstate->hardfpu.fpscr;
 		prec = fpscr & FPSCR_DBL_PRECISION;
 
 		if ((fpscr & FPSCR_CAUSE_ERROR)
@@ -394,20 +343,20 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 
 			/* FPU error because of denormal (doubles) */
 			llx = ((long long)hx << 32)
-			    | tsk->thread.fpu.hard.fp_regs[n + 1];
+			    | tsk->thread.xstate->hardfpu.fp_regs[n + 1];
 			lly = ((long long)hy << 32)
-			    | tsk->thread.fpu.hard.fp_regs[m + 1];
+			    | tsk->thread.xstate->hardfpu.fp_regs[m + 1];
 
 			llx = float64_div(llx, lly);
 
-			tsk->thread.fpu.hard.fp_regs[n] = llx >> 32;
-			tsk->thread.fpu.hard.fp_regs[n + 1] = llx & 0xffffffff;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = llx >> 32;
+			tsk->thread.xstate->hardfpu.fp_regs[n + 1] = llx & 0xffffffff;
 		} else if ((fpscr & FPSCR_CAUSE_ERROR)
 			   && (!prec && ((hx & 0x7fffffff) < 0x00800000
 					 || (hy & 0x7fffffff) < 0x00800000))) {
 			/* FPU error because of denormal (floats) */
 			hx = float32_div(hx, hy);
-			tsk->thread.fpu.hard.fp_regs[n] = hx;
+			tsk->thread.xstate->hardfpu.fp_regs[n] = hx;
 		} else
 			return 0;
 
@@ -420,17 +369,17 @@ static int ieee_fpe_handler(struct pt_regs *regs)
 		unsigned int hx;
 
 		m = (finsn >> 8) & 0x7;
-		hx = tsk->thread.fpu.hard.fp_regs[m];
+		hx = tsk->thread.xstate->hardfpu.fp_regs[m];
 
-		if ((tsk->thread.fpu.hard.fpscr & FPSCR_CAUSE_ERROR)
+		if ((tsk->thread.xstate->hardfpu.fpscr & FPSCR_CAUSE_ERROR)
 			&& ((hx & 0x7fffffff) < 0x00100000)) {
 			/* subnormal double to float conversion */
 			long long llx;
 
-			llx = ((long long)tsk->thread.fpu.hard.fp_regs[m] << 32)
-			    | tsk->thread.fpu.hard.fp_regs[m + 1];
+			llx = ((long long)tsk->thread.xstate->hardfpu.fp_regs[m] << 32)
+			    | tsk->thread.xstate->hardfpu.fp_regs[m + 1];
 
-			tsk->thread.fpu.hard.fpul = float64_to_float32(llx);
+			tsk->thread.xstate->hardfpu.fpul = float64_to_float32(llx);
 		} else
 			return 0;
 
@@ -449,7 +398,7 @@ void float_raise(unsigned int flags)
 int float_rounding_mode(void)
 {
 	struct task_struct *tsk = current;
-	int roundingMode = FPSCR_ROUNDING_MODE(tsk->thread.fpu.hard.fpscr);
+	int roundingMode = FPSCR_ROUNDING_MODE(tsk->thread.xstate->hardfpu.fpscr);
 	return roundingMode;
 }
 
@@ -461,16 +410,16 @@ BUILD_TRAP_HANDLER(fpu_error)
 	__unlazy_fpu(tsk, regs);
 	fpu_exception_flags = 0;
 	if (ieee_fpe_handler(regs)) {
-		tsk->thread.fpu.hard.fpscr &=
+		tsk->thread.xstate->hardfpu.fpscr &=
 		    ~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK);
-		tsk->thread.fpu.hard.fpscr |= fpu_exception_flags;
+		tsk->thread.xstate->hardfpu.fpscr |= fpu_exception_flags;
 		/* Set the FPSCR flag as well as cause bits - simply
 		 * replicate the cause */
-		tsk->thread.fpu.hard.fpscr |= (fpu_exception_flags >> 10);
+		tsk->thread.xstate->hardfpu.fpscr |= (fpu_exception_flags >> 10);
 		grab_fpu(regs);
 		restore_fpu(tsk);
 		task_thread_info(tsk)->status |= TS_USEDFPU;
-		if ((((tsk->thread.fpu.hard.fpscr & FPSCR_ENABLE_MASK) >> 7) &
+		if ((((tsk->thread.xstate->hardfpu.fpscr & FPSCR_ENABLE_MASK) >> 7) &
 		     (fpu_exception_flags >> 2)) == 0) {
 			return;
 		}
@@ -478,33 +427,3 @@ BUILD_TRAP_HANDLER(fpu_error)
 
 	force_sig(SIGFPE, tsk);
 }
-
-void fpu_state_restore(struct pt_regs *regs)
-{
-	struct task_struct *tsk = current;
-
-	grab_fpu(regs);
-	if (unlikely(!user_mode(regs))) {
-		printk(KERN_ERR "BUG: FPU is used in kernel mode.\n");
-		BUG();
-		return;
-	}
-
-	if (likely(used_math())) {
-		/* Using the FPU again.  */
-		restore_fpu(tsk);
-	} else {
-		/* First time FPU user.  */
-		fpu_init();
-		set_used_math();
-	}
-	task_thread_info(tsk)->status |= TS_USEDFPU;
-	tsk->fpu_counter++;
-}
-
-BUILD_TRAP_HANDLER(fpu_state_restore)
-{
-	TRAP_HANDLER_DECL;
-
-	fpu_state_restore(regs);
-}
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 33bab477d2e..b144e8af89d 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -41,7 +41,8 @@ pinmux-$(CONFIG_CPU_SUBTYPE_SH7757)	:= pinmux-sh7757.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7785)	:= pinmux-sh7785.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7786)	:= pinmux-sh7786.o
 
-obj-y				+= $(clock-y)
-obj-$(CONFIG_SMP)		+= $(smp-y)
-obj-$(CONFIG_GENERIC_GPIO)	+= $(pinmux-y)
-obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o
+obj-y					+= $(clock-y)
+obj-$(CONFIG_SMP)			+= $(smp-y)
+obj-$(CONFIG_GENERIC_GPIO)		+= $(pinmux-y)
+obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= ubc.o
diff --git a/arch/sh/kernel/cpu/sh4a/ubc.c b/arch/sh/kernel/cpu/sh4a/ubc.c
new file mode 100644
index 00000000000..efb2745bcb3
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/ubc.c
@@ -0,0 +1,133 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/ubc.c
+ *
+ * On-chip UBC support for SH-4A CPUs.
+ *
+ * Copyright (C) 2009 - 2010  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <asm/hw_breakpoint.h>
+
+#define UBC_CBR(idx)	(0xff200000 + (0x20 * (idx)))	/* per-channel, 0x20 apart */
+#define UBC_CRR(idx)	(0xff200004 + (0x20 * (idx)))
+#define UBC_CAR(idx)	(0xff200008 + (0x20 * (idx)))
+#define UBC_CAMR(idx)	(0xff20000c + (0x20 * (idx)))
+
+#define UBC_CCMFR	0xff200600	/* single register, not per-channel */
+#define UBC_CBCR	0xff200620	/* single register, not per-channel */
+
+/* CRR bits */
+#define UBC_CRR_PCB	(1 << 1)
+#define UBC_CRR_BIE	(1 << 0)
+
+/* CBR bits */
+#define UBC_CBR_CE	(1 << 0)
+
+static struct sh_ubc sh4a_ubc;	/* forward declaration; initialised further down */
+
+static void sh4a_ubc_enable(struct arch_hw_breakpoint *info, int idx)
+{
+	__raw_writel(UBC_CBR_CE | info->len | info->type, UBC_CBR(idx)); /* CE is set here... */
+	__raw_writel(info->address, UBC_CAR(idx)); /* NOTE(review): ...before the address is loaded - confirm order */
+}
+
+static void sh4a_ubc_disable(struct arch_hw_breakpoint *info, int idx)
+{
+	__raw_writel(0, UBC_CBR(idx));	/* clears CE along with every other CBR bit */
+	__raw_writel(0, UBC_CAR(idx));	/* info is not consulted in this function */
+}
+
+static void sh4a_ubc_enable_all(unsigned long mask)
+{
+	int ch;
+
+	/* Set CE on every channel whose bit is present in mask. */
+	for (ch = 0; ch < sh4a_ubc.num_events; ch++)
+		if (mask & (1 << ch))
+			__raw_writel(UBC_CBR_CE | __raw_readl(UBC_CBR(ch)), UBC_CBR(ch));
+}
+
+static void sh4a_ubc_disable_all(void)
+{
+	int ch;
+
+	/* Clear CE on every channel, leaving the other CBR bits as read. */
+	for (ch = 0; ch < sh4a_ubc.num_events; ch++)
+		__raw_writel(~UBC_CBR_CE & __raw_readl(UBC_CBR(ch)), UBC_CBR(ch));
+}
+
+static unsigned long sh4a_ubc_active_mask(void)
+{
+	unsigned long enabled = 0;
+	int ch;
+
+	/* Bit N of the result mirrors the CE bit of channel N's CBR. */
+	for (ch = 0; ch < sh4a_ubc.num_events; ch++)
+		enabled |= !!(__raw_readl(UBC_CBR(ch)) & UBC_CBR_CE) << ch;
+
+	return enabled;
+}
+
+static unsigned long sh4a_ubc_triggered_mask(void)
+{
+	return __raw_readl(UBC_CCMFR);	/* raw CCMFR value, returned unmodified */
+}
+
+static void sh4a_ubc_clear_triggered_mask(unsigned long mask)
+{
+	__raw_writel(__raw_readl(UBC_CCMFR) & ~mask, UBC_CCMFR);	/* read-modify-write: clears only the bits in mask */
+}
+
+static struct sh_ubc sh4a_ubc = {
+	.name			= "SH-4A",
+	.num_events		= 2,	/* loop bound used by the per-channel helpers */
+	.trap_nr		= 0x1e0,	/* NOTE(review): presumably the breakpoint exception code - confirm */
+	.enable			= sh4a_ubc_enable,
+	.disable		= sh4a_ubc_disable,
+	.enable_all		= sh4a_ubc_enable_all,
+	.disable_all		= sh4a_ubc_disable_all,
+	.active_mask		= sh4a_ubc_active_mask,
+	.triggered_mask		= sh4a_ubc_triggered_mask,
+	.clear_triggered_mask	= sh4a_ubc_clear_triggered_mask,
+};	/* .clk is filled in by sh4a_ubc_init() */
+
+static int __init sh4a_ubc_init(void)
+{
+	struct clk *ubc_iclk = clk_get(NULL, "ubc0");
+	int i;
+
+	/*
+	 * The "ubc0" clock (UBC MSTP bit) is optional: not every platform
+	 * has one, so a failed lookup is replaced by a NULL clock.
+	 */
+	if (IS_ERR(ubc_iclk))
+		ubc_iclk = NULL;
+
+	clk_enable(ubc_iclk);	/* NOTE(review): may be NULL here - confirm the clk API tolerates it */
+
+	__raw_writel(0, UBC_CBCR);
+
+	for (i = 0; i < sh4a_ubc.num_events; i++) {
+		__raw_writel(0, UBC_CAMR(i));
+		__raw_writel(0, UBC_CBR(i));	/* channel starts with CE clear */
+
+		__raw_writel(UBC_CRR_BIE | UBC_CRR_PCB, UBC_CRR(i));
+
+		/* dummy read-back of CRR to flush the posted write */
+		(void)__raw_readl(UBC_CRR(i));
+	}
+
+	clk_disable(ubc_iclk);	/* clock is only held for the register setup above */
+
+	sh4a_ubc.clk = ubc_iclk;	/* NULL when no "ubc0" clock was found */
+
+	return register_sh_ubc(&sh4a_ubc);
+}
+arch_initcall(sh4a_ubc_init);