From 813f90728e7d74e9b753e6ef6c6915cd2a047adb Mon Sep 17 00:00:00 2001
From: Christian Krafft <krafft@de.ibm.com>
Date: Fri, 20 Jul 2007 21:39:18 +0200
Subject: [CELL] pmi: remove support for multiple devices.

The pmi driver got simplified by removing support for multiple devices.
As there is no more than one pmi device per machine, there is no need to
specify the device for listening and sending messages.

This way the caller (cbe_cpufreq) doesn't need to scan the device tree.
When registering the handler on a board without a pmi
interface, pmi.c will just return -ENODEV.

The patch that fixed the breakage of cell_defconfig has been
broken out of the earlier version of this patch. So this is
the version that applies cleanly on top of it.

Signed-off-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/cbe_cpufreq.c | 36 +++++++++-------------
 arch/powerpc/sysdev/pmi.c                 | 51 +++++++++++++------------------
 include/asm-powerpc/pmi.h                 |  8 ++---
 3 files changed, 40 insertions(+), 55 deletions(-)

diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index ab511d5b65a..3586f529049 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -68,11 +68,12 @@ static u64 MIC_Slow_Next_Timer_table[] = {
 };
 
 static unsigned int pmi_frequency_limit = 0;
+
 /*
  * hardware specific functions
  */
 
-static struct of_device *pmi_dev;
+static bool cbe_cpufreq_has_pmi;
 
 #ifdef CONFIG_PPC_PMI
 static int set_pmode_pmi(int cpu, unsigned int pmode)
@@ -91,7 +92,7 @@ static int set_pmode_pmi(int cpu, unsigned int pmode)
 	time = (u64) get_cycles();
 #endif
 
-	pmi_send_message(pmi_dev, pmi_msg);
+	pmi_send_message(pmi_msg);
 	ret = pmi_msg.data2;
 
 	pr_debug("PMI returned slow mode %d\n", ret);
@@ -157,16 +158,16 @@ static int set_pmode_reg(int cpu, unsigned int pmode)
 	return 0;
 }
 
-static int set_pmode(int cpu, unsigned int slow_mode) {
+static int set_pmode(int cpu, unsigned int slow_mode)
+{
 #ifdef CONFIG_PPC_PMI
-	if (pmi_dev)
+	if (cbe_cpufreq_has_pmi)
 		return set_pmode_pmi(cpu, slow_mode);
-	else
 #endif
-		return set_pmode_reg(cpu, slow_mode);
+	return set_pmode_reg(cpu, slow_mode);
 }
 
-static void cbe_cpufreq_handle_pmi(struct of_device *dev, pmi_message_t pmi_msg)
+static void cbe_cpufreq_handle_pmi(pmi_message_t pmi_msg)
 {
 	u8 cpu;
 	u8 cbe_pmode_new;
@@ -253,7 +254,7 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	cpufreq_frequency_table_get_attr(cbe_freqs, policy->cpu);
 
-	if (pmi_dev) {
+	if (cbe_cpufreq_has_pmi) {
 		/* frequency might get limited later, initialize limit with max_freq */
 		pmi_frequency_limit = max_freq;
 		cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
@@ -265,7 +266,7 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 static int cbe_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
-	if (pmi_dev)
+	if (cbe_cpufreq_has_pmi)
 		cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
 
 	cpufreq_frequency_table_put_attr(policy->cpu);
@@ -326,29 +327,20 @@ static struct cpufreq_driver cbe_cpufreq_driver = {
 
 static int __init cbe_cpufreq_init(void)
 {
-#ifdef CONFIG_PPC_PMI
-	struct device_node *np;
-#endif
 	if (!machine_is(cell))
 		return -ENODEV;
-#ifdef CONFIG_PPC_PMI
-	np = of_find_node_by_type(NULL, "ibm,pmi");
 
-	pmi_dev = of_find_device_by_node(np);
+	cbe_cpufreq_has_pmi = pmi_register_handler(&cbe_pmi_handler) == 0;
 
-	if (pmi_dev)
-		pmi_register_handler(pmi_dev, &cbe_pmi_handler);
-#endif
 	return cpufreq_register_driver(&cbe_cpufreq_driver);
 }
 
 static void __exit cbe_cpufreq_exit(void)
 {
-#ifdef CONFIG_PPC_PMI
-	if (pmi_dev)
-		pmi_unregister_handler(pmi_dev, &cbe_pmi_handler);
-#endif
 	cpufreq_unregister_driver(&cbe_cpufreq_driver);
+
+	if (cbe_cpufreq_has_pmi)
+		pmi_unregister_handler(&cbe_pmi_handler);
 }
 
 module_init(cbe_cpufreq_init);
diff --git a/arch/powerpc/sysdev/pmi.c b/arch/powerpc/sysdev/pmi.c
index 85a7c99c100..2f91b55b775 100644
--- a/arch/powerpc/sysdev/pmi.c
+++ b/arch/powerpc/sysdev/pmi.c
@@ -48,15 +48,13 @@ struct pmi_data {
 	struct work_struct	work;
 };
 
+static struct pmi_data *data;
 
 static int pmi_irq_handler(int irq, void *dev_id)
 {
-	struct pmi_data *data;
 	u8 type;
 	int rc;
 
-	data = dev_id;
-
 	spin_lock(&data->pmi_spinlock);
 
 	type = ioread8(data->pmi_reg + PMI_READ_TYPE);
@@ -111,16 +109,13 @@ MODULE_DEVICE_TABLE(of, pmi_match);
 
 static void pmi_notify_handlers(struct work_struct *work)
 {
-	struct pmi_data *data;
 	struct pmi_handler *handler;
 
-	data = container_of(work, struct pmi_data, work);
-
 	spin_lock(&data->handler_spinlock);
 	list_for_each_entry(handler, &data->handler, node) {
 		pr_debug(KERN_INFO "pmi: notifying handler %p\n", handler);
 		if (handler->type == data->msg.type)
-			handler->handle_pmi_message(data->dev, data->msg);
+			handler->handle_pmi_message(data->msg);
 	}
 	spin_unlock(&data->handler_spinlock);
 }
@@ -129,9 +124,14 @@ static int pmi_of_probe(struct of_device *dev,
 			const struct of_device_id *match)
 {
 	struct device_node *np = dev->node;
-	struct pmi_data *data;
 	int rc;
 
+	if (data) {
+		printk(KERN_ERR "pmi: driver has already been initialized.\n");
+		rc = -EBUSY;
+		goto out;
+	}
+
 	data = kzalloc(sizeof(struct pmi_data), GFP_KERNEL);
 	if (!data) {
 		printk(KERN_ERR "pmi: could not allocate memory.\n");
@@ -154,7 +154,6 @@ static int pmi_of_probe(struct of_device *dev,
 
 	INIT_WORK(&data->work, pmi_notify_handlers);
 
-	dev->dev.driver_data = data;
 	data->dev = dev;
 
 	data->irq = irq_of_parse_and_map(np, 0);
@@ -164,7 +163,7 @@ static int pmi_of_probe(struct of_device *dev,
 		goto error_cleanup_iomap;
 	}
 
-	rc = request_irq(data->irq, pmi_irq_handler, 0, "pmi", data);
+	rc = request_irq(data->irq, pmi_irq_handler, 0, "pmi", NULL);
 	if (rc) {
 		printk(KERN_ERR "pmi: can't request IRQ %d: returned %d\n",
 				data->irq, rc);
@@ -187,12 +186,9 @@ out:
 
 static int pmi_of_remove(struct of_device *dev)
 {
-	struct pmi_data *data;
 	struct pmi_handler *handler, *tmp;
 
-	data = dev->dev.driver_data;
-
-	free_irq(data->irq, data);
+	free_irq(data->irq, NULL);
 	iounmap(data->pmi_reg);
 
 	spin_lock(&data->handler_spinlock);
@@ -202,7 +198,8 @@ static int pmi_of_remove(struct of_device *dev)
 
 	spin_unlock(&data->handler_spinlock);
 
-	kfree(dev->dev.driver_data);
+	kfree(data);
+	data = NULL;
 
 	return 0;
 }
@@ -226,13 +223,13 @@ static void __exit pmi_module_exit(void)
 }
 module_exit(pmi_module_exit);
 
-void pmi_send_message(struct of_device *device, pmi_message_t msg)
+int pmi_send_message(pmi_message_t msg)
 {
-	struct pmi_data *data;
 	unsigned long flags;
 	DECLARE_COMPLETION_ONSTACK(completion);
 
-	data = device->dev.driver_data;
+	if (!data)
+		return -ENODEV;
 
 	mutex_lock(&data->msg_mutex);
 
@@ -256,30 +253,26 @@ void pmi_send_message(struct of_device *device, pmi_message_t msg)
 	data->completion = NULL;
 
 	mutex_unlock(&data->msg_mutex);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pmi_send_message);
 
-void pmi_register_handler(struct of_device *device,
-			  struct pmi_handler *handler)
+int pmi_register_handler(struct pmi_handler *handler)
 {
-	struct pmi_data *data;
-	data = device->dev.driver_data;
-
 	if (!data)
-		return;
+		return -ENODEV;
 
 	spin_lock(&data->handler_spinlock);
 	list_add_tail(&handler->node, &data->handler);
 	spin_unlock(&data->handler_spinlock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pmi_register_handler);
 
-void pmi_unregister_handler(struct of_device *device,
-			    struct pmi_handler *handler)
+void pmi_unregister_handler(struct pmi_handler *handler)
 {
-	struct pmi_data *data;
-	data = device->dev.driver_data;
-
 	if (!data)
 		return;
 
diff --git a/include/asm-powerpc/pmi.h b/include/asm-powerpc/pmi.h
index cb0f8aa4308..2259d4ce384 100644
--- a/include/asm-powerpc/pmi.h
+++ b/include/asm-powerpc/pmi.h
@@ -55,13 +55,13 @@ typedef struct {
 struct pmi_handler {
 	struct list_head node;
 	u8 type;
-	void (*handle_pmi_message) (struct of_device *, pmi_message_t);
+	void (*handle_pmi_message) (pmi_message_t);
 };
 
-void pmi_register_handler(struct of_device *, struct pmi_handler *);
-void pmi_unregister_handler(struct of_device *, struct pmi_handler *);
+int pmi_register_handler(struct pmi_handler *);
+void pmi_unregister_handler(struct pmi_handler *);
 
-void pmi_send_message(struct of_device *, pmi_message_t);
+int pmi_send_message(pmi_message_t);
 
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PMI_H */
-- 
cgit v1.2.3-70-g09d2


From a964b9be3e475f30aee334654b4ff200bcdc0092 Mon Sep 17 00:00:00 2001
From: Christian Krafft <krafft@de.ibm.com>
Date: Fri, 20 Jul 2007 21:39:19 +0200
Subject: [CELL] cbe_cpufreq: fix latency measurement

This patch fixes the debug code that calculates the transition time when
changing the slow modes on a Cell BE cpu.

Signed-off-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/cbe_cpufreq.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index 3586f529049..5820fb9a452 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -81,7 +81,7 @@ static int set_pmode_pmi(int cpu, unsigned int pmode)
 	int ret;
 	pmi_message_t pmi_msg;
 #ifdef DEBUG
-	u64 time;
+	long time;
 #endif
 
 	pmi_msg.type = PMI_TYPE_FREQ_CHANGE;
@@ -89,7 +89,7 @@ static int set_pmode_pmi(int cpu, unsigned int pmode)
 	pmi_msg.data2 = pmode;
 
 #ifdef DEBUG
-	time = (u64) get_cycles();
+	time = jiffies;
 #endif
 
 	pmi_send_message(pmi_msg);
@@ -98,9 +98,9 @@ static int set_pmode_pmi(int cpu, unsigned int pmode)
 	pr_debug("PMI returned slow mode %d\n", ret);
 
 #ifdef DEBUG
-	time = (u64) get_cycles() - time; /* actual cycles (not cpu cycles!) */
-	time = 1000000000 * time / CLOCK_TICK_RATE; /* time in ns (10^-9) */
-	pr_debug("had to wait %lu ns for a transition\n", time);
+	time = jiffies - time; /* actual cycles (not cpu cycles!) */
+	time = jiffies_to_msecs(time);
+	pr_debug("had to wait %lu ms for a transition using PMI.\n", time);
 #endif
 	return ret;
 }
@@ -123,15 +123,18 @@ static int set_pmode_reg(int cpu, unsigned int pmode)
 	struct cbe_mic_tm_regs __iomem *mic_tm_regs;
 	u64 flags;
 	u64 value;
+#ifdef DEBUG
+	long time;
+#endif
 
 	local_irq_save(flags);
 
 	mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu);
 	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
 
-	pr_debug("pm register is mapped at %p\n", &pmd_regs->pmcr);
-	pr_debug("mic register is mapped at %p\n", &mic_tm_regs->slow_fast_timer_0);
-
+#ifdef DEBUG
+	time = jiffies;
+#endif
 	out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]);
 	out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]);
 
@@ -146,6 +149,7 @@ static int set_pmode_reg(int cpu, unsigned int pmode)
 
 	out_be64(&pmd_regs->pmcr, value);
 
+#ifdef DEBUG
 	/* wait until new pmode appears in status register */
 	value = in_be64(&pmd_regs->pmsr) & 0x07;
 	while(value != pmode) {
@@ -153,6 +157,11 @@ static int set_pmode_reg(int cpu, unsigned int pmode)
 		value = in_be64(&pmd_regs->pmsr) & 0x07;
 	}
 
+	time = jiffies - time;
+	time = jiffies_to_msecs(time);
+	pr_debug("had to wait %lu ms for a transition using " \
+		 "the pervasive unit.\n", time);
+#endif
 	local_irq_restore(flags);
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From e5ecc8719212e2566440818491ec5741689f3743 Mon Sep 17 00:00:00 2001
From: Christian Krafft <krafft@de.ibm.com>
Date: Fri, 20 Jul 2007 21:39:20 +0200
Subject: [CELL] cbe_cpufreq: fix initialization

This patch fixes the initialization of the cbe_cpufreq driver.
The code that initializes the PMI related functions was called per cpu:
* registering cpufreq notifier block
* registering a pmi handler

This results in a bug where the notifier block gets called in an endless loop.
This patch moves the initialization code to the module init code path,
so that it only gets called once.

Signed-off-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/cbe_cpufreq.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index 5820fb9a452..3a26b3c115b 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -196,10 +196,9 @@ static int pmi_notifier(struct notifier_block *nb,
 {
 	struct cpufreq_policy *policy = data;
 
-	if (event != CPUFREQ_INCOMPATIBLE)
-		return 0;
+	if (pmi_frequency_limit)
+		cpufreq_verify_within_limits(policy, 0, pmi_frequency_limit);
 
-	cpufreq_verify_within_limits(policy, 0, pmi_frequency_limit);
 	return 0;
 }
 
@@ -263,11 +262,6 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	cpufreq_frequency_table_get_attr(cbe_freqs, policy->cpu);
 
-	if (cbe_cpufreq_has_pmi) {
-		/* frequency might get limited later, initialize limit with max_freq */
-		pmi_frequency_limit = max_freq;
-		cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
-	}
 
 	/* this ensures that policy->cpuinfo_min and policy->cpuinfo_max are set correctly */
 	return cpufreq_frequency_table_cpuinfo(policy, cbe_freqs);
@@ -275,9 +269,6 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 static int cbe_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
-	if (cbe_cpufreq_has_pmi)
-		cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
-
 	cpufreq_frequency_table_put_attr(policy->cpu);
 	return 0;
 }
@@ -341,6 +332,9 @@ static int __init cbe_cpufreq_init(void)
 
 	cbe_cpufreq_has_pmi = pmi_register_handler(&cbe_pmi_handler) == 0;
 
+	if (cbe_cpufreq_has_pmi)
+		cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
+
 	return cpufreq_register_driver(&cbe_cpufreq_driver);
 }
 
@@ -348,8 +342,10 @@ static void __exit cbe_cpufreq_exit(void)
 {
 	cpufreq_unregister_driver(&cbe_cpufreq_driver);
 
-	if (cbe_cpufreq_has_pmi)
+	if (cbe_cpufreq_has_pmi) {
+		cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
 		pmi_unregister_handler(&cbe_pmi_handler);
+	}
 }
 
 module_init(cbe_cpufreq_init);
-- 
cgit v1.2.3-70-g09d2


From 1e21fd5af3797a2c322e1723c69732e77aa74f48 Mon Sep 17 00:00:00 2001
From: Christian Krafft <krafft@de.ibm.com>
Date: Fri, 20 Jul 2007 21:39:21 +0200
Subject: [CELL] cbe_cpufreq: fix minor issues

Minor issues have been fixed:
* added a missing call to of_node_put()
* signedness of a function parameter
* added some line breaks
* changed global pmi_frequency_limit to a
  per node pmi_slow_mode_limit array

Signed-off-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/cbe_cpufreq.c | 58 +++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index 3a26b3c115b..a62562ee146 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -67,7 +67,7 @@ static u64 MIC_Slow_Next_Timer_table[] = {
 	0x00003FC000000000ull,
 };
 
-static unsigned int pmi_frequency_limit = 0;
+static u8 pmi_slow_mode_limit[MAX_BE];
 
 /*
  * hardware specific functions
@@ -169,35 +169,50 @@ static int set_pmode_reg(int cpu, unsigned int pmode)
 
 static int set_pmode(int cpu, unsigned int slow_mode)
 {
+	int rc;
 #ifdef CONFIG_PPC_PMI
 	if (cbe_cpufreq_has_pmi)
-		return set_pmode_pmi(cpu, slow_mode);
+		rc = set_pmode_pmi(cpu, slow_mode);
+	else
 #endif
-	return set_pmode_reg(cpu, slow_mode);
+		rc = set_pmode_reg(cpu, slow_mode);
+
+	pr_debug("register contains slow mode %d\n", get_pmode(cpu));
+
+	return rc;
 }
 
 static void cbe_cpufreq_handle_pmi(pmi_message_t pmi_msg)
 {
-	u8 cpu;
-	u8 cbe_pmode_new;
+	u8 node; slow_mode;
 
 	BUG_ON(pmi_msg.type != PMI_TYPE_FREQ_CHANGE);
 
-	cpu = cbe_node_to_cpu(pmi_msg.data1);
-	cbe_pmode_new = pmi_msg.data2;
+	node = pmi_msg.data1;
+	slow_mode = pmi_msg.data2;
 
-	pmi_frequency_limit = cbe_freqs[cbe_pmode_new].frequency;
+	pmi_slow_mode_limit[node] = slow_mode;
 
-	pr_debug("cbe_handle_pmi: max freq=%d\n", pmi_frequency_limit);
+	pr_debug("cbe_handle_pmi: node: %d, max slow_mode=%d\n", slow_mode);
 }
 
 static int pmi_notifier(struct notifier_block *nb,
 				       unsigned long event, void *data)
 {
 	struct cpufreq_policy *policy = data;
+	u8 node;
+
+	node = cbe_cpu_to_node(policy->cpu);
+
+	pr_debug("got notified, event=%lu, node=%u\n", event, node);
 
-	if (pmi_frequency_limit)
-		cpufreq_verify_within_limits(policy, 0, pmi_frequency_limit);
+	if (pmi_slow_mode_limit[node] != 0) {
+		pr_debug("limiting node %d to slow mode %d\n",
+			 node, pmi_slow_mode_limit[node]);
+
+		cpufreq_verify_within_limits(policy, 0,
+			 cbe_freqs[pmi_slow_mode_limit[node]].frequency);
+	}
 
 	return 0;
 }
@@ -232,6 +247,8 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	max_freqp = of_get_property(cpu, "clock-frequency", NULL);
 
+	of_node_put(cpu);
+
 	if (!max_freqp)
 		return -EINVAL;
 
@@ -248,7 +265,9 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	}
 
 	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
-	/* if DEBUG is enabled set_pmode() measures the correct latency of a transition */
+
+	/* if DEBUG is enabled set_pmode() measures the latency
+	 * of a transition */
 	policy->cpuinfo.transition_latency = 25000;
 
 	cur_pmode = get_pmode(policy->cpu);
@@ -262,8 +281,8 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	cpufreq_frequency_table_get_attr(cbe_freqs, policy->cpu);
 
-
-	/* this ensures that policy->cpuinfo_min and policy->cpuinfo_max are set correctly */
+	/* this ensures that policy->cpuinfo_min
+	 * and policy->cpuinfo_max are set correctly */
 	return cpufreq_frequency_table_cpuinfo(policy, cbe_freqs);
 }
 
@@ -279,12 +298,13 @@ static int cbe_cpufreq_verify(struct cpufreq_policy *policy)
 }
 
 
-static int cbe_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq,
-			    unsigned int relation)
+static int cbe_cpufreq_target(struct cpufreq_policy *policy,
+			      unsigned int target_freq,
+			      unsigned int relation)
 {
 	int rc;
 	struct cpufreq_freqs freqs;
-	int cbe_pmode_new;
+	unsigned int cbe_pmode_new;
 
 	cpufreq_frequency_table_target(policy,
 				       cbe_freqs,
@@ -299,12 +319,14 @@ static int cbe_cpufreq_target(struct cpufreq_policy *policy, unsigned int target
 	mutex_lock(&cbe_switch_mutex);
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
-	pr_debug("setting frequency for cpu %d to %d kHz, 1/%d of max frequency\n",
+	pr_debug("setting frequency for cpu %d to %d kHz, " \
+		 "1/%d of max frequency\n",
 		 policy->cpu,
 		 cbe_freqs[cbe_pmode_new].frequency,
 		 cbe_freqs[cbe_pmode_new].index);
 
 	rc = set_pmode(policy->cpu, cbe_pmode_new);
+
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	mutex_unlock(&cbe_switch_mutex);
 
-- 
cgit v1.2.3-70-g09d2


From 74889e41d9a1f80928130a02af9b010673bc5ba7 Mon Sep 17 00:00:00 2001
From: Christian Krafft <krafft@de.ibm.com>
Date: Fri, 20 Jul 2007 21:39:22 +0200
Subject: [CELL] cbe_cpufreq: reorganize code

This patch reorganizes the code of the driver into three files.
Two of them, cbe_cpufreq_pmi.c and cbe_cpufreq_pervasive.c, handle the hardware.
cbe_cpufreq.c contains the logic.
The behaviour is unchanged, except that the PMI related function
is now located in a separate module, cbe_cpufreq_pmi. This module
will be required by cbe_cpufreq if CONFIG_CBE_CPUFREQ_PMI has been set.

Signed-off-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/Kconfig                |  10 ++
 arch/powerpc/platforms/cell/Makefile               |   4 +-
 arch/powerpc/platforms/cell/cbe_cpufreq.c          | 196 +--------------------
 arch/powerpc/platforms/cell/cbe_cpufreq.h          |  24 +++
 .../powerpc/platforms/cell/cbe_cpufreq_pervasive.c | 115 ++++++++++++
 arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c      | 148 ++++++++++++++++
 6 files changed, 309 insertions(+), 188 deletions(-)
 create mode 100644 arch/powerpc/platforms/cell/cbe_cpufreq.h
 create mode 100644 arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c
 create mode 100644 arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c

diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
index 9b2b386ccf4..ac8032034fb 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -73,4 +73,14 @@ config CBE_CPUFREQ
 	  For details, take a look at <file:Documentation/cpu-freq/>.
 	  If you don't have such processor, say N
 
+config CBE_CPUFREQ_PMI
+	tristate "CBE frequency scaling using PMI interface"
+	depends on CBE_CPUFREQ && PPC_PMI && EXPERIMENTAL
+	default n
+	help
+	  Select this, if you want to use the PMI interface
+	  to switch frequencies. Using PMI, the
+	  processor will not only be able to run at lower speed,
+	  but also at lower core voltage.
+
 endmenu
diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index 869af89df6f..be059718bec 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -4,7 +4,9 @@ obj-$(CONFIG_PPC_CELL_NATIVE)		+= interrupt.o iommu.o setup.o \
 obj-$(CONFIG_CBE_RAS)			+= ras.o
 
 obj-$(CONFIG_CBE_THERM)			+= cbe_thermal.o
-obj-$(CONFIG_CBE_CPUFREQ)		+= cbe_cpufreq.o
+obj-$(CONFIG_CBE_CPUFREQ_PMI)		+= cbe_cpufreq_pmi.o
+obj-$(CONFIG_CBE_CPUFREQ)		+= cbe-cpufreq.o
+cbe-cpufreq-y				+= cbe_cpufreq_pervasive.o cbe_cpufreq.o
 
 ifeq ($(CONFIG_SMP),y)
 obj-$(CONFIG_PPC_CELL_NATIVE)		+= smp.o
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index a62562ee146..9a9932624c0 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -1,7 +1,7 @@
 /*
  * cpufreq driver for the cell processor
  *
- * (C) Copyright IBM Deutschland Entwicklung GmbH 2005
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
  *
  * Author: Christian Krafft <krafft@de.ibm.com>
  *
@@ -21,18 +21,10 @@
  */
 
 #include <linux/cpufreq.h>
-#include <linux/timer.h>
-
-#include <asm/hw_irq.h>
-#include <asm/io.h>
 #include <asm/machdep.h>
-#include <asm/processor.h>
-#include <asm/prom.h>
-#include <asm/time.h>
-#include <asm/pmi.h>
 #include <asm/of_platform.h>
-
-#include "cbe_regs.h"
+#include <asm/prom.h>
+#include "cbe_cpufreq.h"
 
 static DEFINE_MUTEX(cbe_switch_mutex);
 
@@ -50,183 +42,24 @@ static struct cpufreq_frequency_table cbe_freqs[] = {
 	{0,	CPUFREQ_TABLE_END},
 };
 
-/* to write to MIC register */
-static u64 MIC_Slow_Fast_Timer_table[] = {
-	[0 ... 7] = 0x007fc00000000000ull,
-};
-
-/* more values for the MIC */
-static u64 MIC_Slow_Next_Timer_table[] = {
-	0x0000240000000000ull,
-	0x0000268000000000ull,
-	0x000029C000000000ull,
-	0x00002D0000000000ull,
-	0x0000300000000000ull,
-	0x0000334000000000ull,
-	0x000039C000000000ull,
-	0x00003FC000000000ull,
-};
-
-static u8 pmi_slow_mode_limit[MAX_BE];
-
 /*
  * hardware specific functions
  */
 
-static bool cbe_cpufreq_has_pmi;
-
-#ifdef CONFIG_PPC_PMI
-static int set_pmode_pmi(int cpu, unsigned int pmode)
-{
-	int ret;
-	pmi_message_t pmi_msg;
-#ifdef DEBUG
-	long time;
-#endif
-
-	pmi_msg.type = PMI_TYPE_FREQ_CHANGE;
-	pmi_msg.data1 =	cbe_cpu_to_node(cpu);
-	pmi_msg.data2 = pmode;
-
-#ifdef DEBUG
-	time = jiffies;
-#endif
-
-	pmi_send_message(pmi_msg);
-	ret = pmi_msg.data2;
-
-	pr_debug("PMI returned slow mode %d\n", ret);
-
-#ifdef DEBUG
-	time = jiffies - time; /* actual cycles (not cpu cycles!) */
-	time = jiffies_to_msecs(time);
-	pr_debug("had to wait %lu ms for a transition using PMI.\n", time);
-#endif
-	return ret;
-}
-#endif
-
-static int get_pmode(int cpu)
-{
-	int ret;
-	struct cbe_pmd_regs __iomem *pmd_regs;
-
-	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
-	ret = in_be64(&pmd_regs->pmsr) & 0x07;
-
-	return ret;
-}
-
-static int set_pmode_reg(int cpu, unsigned int pmode)
-{
-	struct cbe_pmd_regs __iomem *pmd_regs;
-	struct cbe_mic_tm_regs __iomem *mic_tm_regs;
-	u64 flags;
-	u64 value;
-#ifdef DEBUG
-	long time;
-#endif
-
-	local_irq_save(flags);
-
-	mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu);
-	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
-
-#ifdef DEBUG
-	time = jiffies;
-#endif
-	out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]);
-	out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]);
-
-	out_be64(&mic_tm_regs->slow_next_timer_0, MIC_Slow_Next_Timer_table[pmode]);
-	out_be64(&mic_tm_regs->slow_next_timer_1, MIC_Slow_Next_Timer_table[pmode]);
-
-	value = in_be64(&pmd_regs->pmcr);
-	/* set bits to zero */
-	value &= 0xFFFFFFFFFFFFFFF8ull;
-	/* set bits to next pmode */
-	value |= pmode;
-
-	out_be64(&pmd_regs->pmcr, value);
-
-#ifdef DEBUG
-	/* wait until new pmode appears in status register */
-	value = in_be64(&pmd_regs->pmsr) & 0x07;
-	while(value != pmode) {
-		cpu_relax();
-		value = in_be64(&pmd_regs->pmsr) & 0x07;
-	}
-
-	time = jiffies - time;
-	time = jiffies_to_msecs(time);
-	pr_debug("had to wait %lu ms for a transition using " \
-		 "the pervasive unit.\n", time);
-#endif
-	local_irq_restore(flags);
-
-	return 0;
-}
-
-static int set_pmode(int cpu, unsigned int slow_mode)
+static int set_pmode(unsigned int cpu, unsigned int slow_mode)
 {
 	int rc;
-#ifdef CONFIG_PPC_PMI
+
 	if (cbe_cpufreq_has_pmi)
-		rc = set_pmode_pmi(cpu, slow_mode);
+		rc = cbe_cpufreq_set_pmode_pmi(cpu, slow_mode);
 	else
-#endif
-		rc = set_pmode_reg(cpu, slow_mode);
+		rc = cbe_cpufreq_set_pmode(cpu, slow_mode);
 
-	pr_debug("register contains slow mode %d\n", get_pmode(cpu));
+	pr_debug("register contains slow mode %d\n", cbe_cpufreq_get_pmode(cpu));
 
 	return rc;
 }
 
-static void cbe_cpufreq_handle_pmi(pmi_message_t pmi_msg)
-{
-	u8 node; slow_mode;
-
-	BUG_ON(pmi_msg.type != PMI_TYPE_FREQ_CHANGE);
-
-	node = pmi_msg.data1;
-	slow_mode = pmi_msg.data2;
-
-	pmi_slow_mode_limit[node] = slow_mode;
-
-	pr_debug("cbe_handle_pmi: node: %d, max slow_mode=%d\n", slow_mode);
-}
-
-static int pmi_notifier(struct notifier_block *nb,
-				       unsigned long event, void *data)
-{
-	struct cpufreq_policy *policy = data;
-	u8 node;
-
-	node = cbe_cpu_to_node(policy->cpu);
-
-	pr_debug("got notified, event=%lu, node=%u\n", event, node);
-
-	if (pmi_slow_mode_limit[node] != 0) {
-		pr_debug("limiting node %d to slow mode %d\n",
-			 node, pmi_slow_mode_limit[node]);
-
-		cpufreq_verify_within_limits(policy, 0,
-			 cbe_freqs[pmi_slow_mode_limit[node]].frequency);
-	}
-
-	return 0;
-}
-
-static struct notifier_block pmi_notifier_block = {
-	.notifier_call = pmi_notifier,
-};
-
-static struct pmi_handler cbe_pmi_handler = {
-	.type			= PMI_TYPE_FREQ_CHANGE,
-	.handle_pmi_message	= cbe_cpufreq_handle_pmi,
-};
-
-
 /*
  * cpufreq functions
  */
@@ -270,7 +103,7 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	 * of a transition */
 	policy->cpuinfo.transition_latency = 25000;
 
-	cur_pmode = get_pmode(policy->cpu);
+	cur_pmode = cbe_cpufreq_get_pmode(policy->cpu);
 	pr_debug("current pmode is at %d\n",cur_pmode);
 
 	policy->cur = cbe_freqs[cur_pmode].frequency;
@@ -297,7 +130,6 @@ static int cbe_cpufreq_verify(struct cpufreq_policy *policy)
 	return cpufreq_frequency_table_verify(policy, cbe_freqs);
 }
 
-
 static int cbe_cpufreq_target(struct cpufreq_policy *policy,
 			      unsigned int target_freq,
 			      unsigned int relation)
@@ -352,22 +184,12 @@ static int __init cbe_cpufreq_init(void)
 	if (!machine_is(cell))
 		return -ENODEV;
 
-	cbe_cpufreq_has_pmi = pmi_register_handler(&cbe_pmi_handler) == 0;
-
-	if (cbe_cpufreq_has_pmi)
-		cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
-
 	return cpufreq_register_driver(&cbe_cpufreq_driver);
 }
 
 static void __exit cbe_cpufreq_exit(void)
 {
 	cpufreq_unregister_driver(&cbe_cpufreq_driver);
-
-	if (cbe_cpufreq_has_pmi) {
-		cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
-		pmi_unregister_handler(&cbe_pmi_handler);
-	}
 }
 
 module_init(cbe_cpufreq_init);
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.h b/arch/powerpc/platforms/cell/cbe_cpufreq.h
new file mode 100644
index 00000000000..c1d86bfa92f
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.h
@@ -0,0 +1,24 @@
+/*
+ * cbe_cpufreq.h
+ *
+ * This file contains the definitions used by the cbe_cpufreq driver.
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/types.h>
+
+int cbe_cpufreq_set_pmode(int cpu, unsigned int pmode);
+int cbe_cpufreq_get_pmode(int cpu);
+
+int cbe_cpufreq_set_pmode_pmi(int cpu, unsigned int pmode);
+
+#if defined(CONFIG_CBE_CPUFREQ_PMI) || defined(CONFIG_CBE_CPUFREQ_PMI_MODULE)
+extern bool cbe_cpufreq_has_pmi;
+#else
+#define cbe_cpufreq_has_pmi (0)
+#endif
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c b/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c
new file mode 100644
index 00000000000..163263b3e1c
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq_pervasive.c
@@ -0,0 +1,115 @@
+/*
+ * pervasive backend for the cbe_cpufreq driver
+ *
+ * This driver makes use of the pervasive unit to
+ * engage the desired frequency.
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <asm/machdep.h>
+#include <asm/hw_irq.h>
+
+#include "cbe_regs.h"
+#include "cbe_cpufreq.h"
+
+/* to write to MIC register */
+static u64 MIC_Slow_Fast_Timer_table[] = {
+	[0 ... 7] = 0x007fc00000000000ull,
+};
+
+/* more values for the MIC */
+static u64 MIC_Slow_Next_Timer_table[] = {
+	0x0000240000000000ull,
+	0x0000268000000000ull,
+	0x000029C000000000ull,
+	0x00002D0000000000ull,
+	0x0000300000000000ull,
+	0x0000334000000000ull,
+	0x000039C000000000ull,
+	0x00003FC000000000ull,
+};
+
+
+int cbe_cpufreq_set_pmode(int cpu, unsigned int pmode)
+{
+	struct cbe_pmd_regs __iomem *pmd_regs;
+	struct cbe_mic_tm_regs __iomem *mic_tm_regs;
+	u64 flags;
+	u64 value;
+#ifdef DEBUG
+	long time;
+#endif
+
+	local_irq_save(flags);
+
+	mic_tm_regs = cbe_get_cpu_mic_tm_regs(cpu);
+	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
+
+#ifdef DEBUG
+	time = jiffies;
+#endif
+
+	out_be64(&mic_tm_regs->slow_fast_timer_0, MIC_Slow_Fast_Timer_table[pmode]);
+	out_be64(&mic_tm_regs->slow_fast_timer_1, MIC_Slow_Fast_Timer_table[pmode]);
+
+	out_be64(&mic_tm_regs->slow_next_timer_0, MIC_Slow_Next_Timer_table[pmode]);
+	out_be64(&mic_tm_regs->slow_next_timer_1, MIC_Slow_Next_Timer_table[pmode]);
+
+	value = in_be64(&pmd_regs->pmcr);
+	/* set bits to zero */
+	value &= 0xFFFFFFFFFFFFFFF8ull;
+	/* set bits to next pmode */
+	value |= pmode;
+
+	out_be64(&pmd_regs->pmcr, value);
+
+#ifdef DEBUG
+	/* wait until new pmode appears in status register */
+	value = in_be64(&pmd_regs->pmsr) & 0x07;
+	while (value != pmode) {
+		cpu_relax();
+		value = in_be64(&pmd_regs->pmsr) & 0x07;
+	}
+
+	time = jiffies  - time;
+	time = jiffies_to_msecs(time);
+	pr_debug("had to wait %lu ms for a transition using " \
+		 "pervasive unit\n", time);
+#endif
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+
+int cbe_cpufreq_get_pmode(int cpu)
+{
+	int ret;
+	struct cbe_pmd_regs __iomem *pmd_regs;
+
+	pmd_regs = cbe_get_cpu_pmd_regs(cpu);
+	ret = in_be64(&pmd_regs->pmsr) & 0x07;
+
+	return ret;
+}
+
diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c
new file mode 100644
index 00000000000..fc6f38982ff
--- /dev/null
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq_pmi.c
@@ -0,0 +1,148 @@
+/*
+ * pmi backend for the cbe_cpufreq driver
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2005-2007
+ *
+ * Author: Christian Krafft <krafft@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <asm/of_platform.h>
+#include <asm/processor.h>
+#include <asm/prom.h>
+#include <asm/pmi.h>
+
+#ifdef DEBUG
+#include <asm/time.h>
+#endif
+
+#include "cbe_regs.h"
+#include "cbe_cpufreq.h"
+
+static u8 pmi_slow_mode_limit[MAX_CBE];	/* per node, set by PMI; 0 = no limit */
+
+bool cbe_cpufreq_has_pmi = false;	/* set by cbe_cpufreq_pmi_init() */
+EXPORT_SYMBOL_GPL(cbe_cpufreq_has_pmi);
+
+/*
+ * hardware specific functions
+ */
+
+int cbe_cpufreq_set_pmode_pmi(int cpu, unsigned int pmode)
+{
+	int ret;
+	pmi_message_t pmi_msg = { .type = PMI_TYPE_FREQ_CHANGE };
+#ifdef DEBUG
+	long time;
+#endif
+	/* every other field starts out zeroed; fill in target node and mode */
+	pmi_msg.data1 =	cbe_cpu_to_node(cpu);
+	pmi_msg.data2 = pmode;
+
+#ifdef DEBUG
+	time = jiffies;
+#endif
+	pmi_send_message(pmi_msg);
+
+#ifdef DEBUG
+	time = jiffies  - time;
+	time = jiffies_to_msecs(time);
+	pr_debug("had to wait %lu ms for a transition using " \
+		 "PMI\n", time);
+#endif
+	ret = pmi_msg.data2;	/* NOTE(review): message was passed by value - confirm */
+	pr_debug("PMI returned slow mode %d\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cbe_cpufreq_set_pmode_pmi);
+
+/* PMI callback: remember the slow mode limit announced for a node. */
+static void cbe_cpufreq_handle_pmi(pmi_message_t pmi_msg)
+{
+	u8 node = pmi_msg.data1, slow_mode = pmi_msg.data2;
+
+	BUG_ON(pmi_msg.type != PMI_TYPE_FREQ_CHANGE);
+
+	/* data1 comes from the message; never index past the table */
+	if (WARN_ON(node >= MAX_CBE))
+		return;
+
+	pmi_slow_mode_limit[node] = slow_mode;
+	pr_debug("cbe_handle_pmi: node: %d max_freq: %d\n", node, slow_mode);
+}
+
+static int pmi_notifier(struct notifier_block *nb,
+				       unsigned long event, void *data)
+{
+	struct cpufreq_policy *policy = data;
+	struct cpufreq_frequency_table *cbe_freqs;
+	u8 node;
+
+	cbe_freqs = cpufreq_frequency_get_table(policy->cpu);
+	node = cbe_cpu_to_node(policy->cpu);
+
+	pr_debug("got notified, event=%lu, node=%u\n", event, node);
+
+	/* without a frequency table there is nothing to clamp against */
+	if (cbe_freqs && pmi_slow_mode_limit[node] != 0) {
+		pr_debug("limiting node %d to slow mode %d\n",
+			 node, pmi_slow_mode_limit[node]);
+
+		cpufreq_verify_within_limits(policy, 0,
+			cbe_freqs[pmi_slow_mode_limit[node]].frequency);
+	}
+
+	return 0;
+}
+
+/* cpufreq policy notifier, registered in cbe_cpufreq_pmi_init() */
+static struct notifier_block pmi_notifier_block = {
+	.notifier_call = pmi_notifier,
+};
+
+/* PMI handler for PMI_TYPE_FREQ_CHANGE messages */
+static struct pmi_handler cbe_pmi_handler = {
+	.type			= PMI_TYPE_FREQ_CHANGE,
+	.handle_pmi_message	= cbe_cpufreq_handle_pmi,
+};
+
+static int __init cbe_cpufreq_pmi_init(void)
+{
+	int err = pmi_register_handler(&cbe_pmi_handler);
+
+	cbe_cpufreq_has_pmi = !err;	/* flag is exported for other users */
+	if (err)
+		return -ENODEV;
+
+	cpufreq_register_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
+	return 0;
+}
+
+/* Module unload: drop the cpufreq notifier and the PMI handler added by init. */
+static void __exit cbe_cpufreq_pmi_exit(void)
+{
+	cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER);
+	pmi_unregister_handler(&cbe_pmi_handler);
+}
+module_init(cbe_cpufreq_pmi_init);
+module_exit(cbe_cpufreq_pmi_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");
-- 
cgit v1.2.3-70-g09d2


From 64bafa9db7e92d5a46402613188b71800924ca1f Mon Sep 17 00:00:00 2001
From: Jean-Christophe DUBOIS <jcd@tribudubois.net>
Date: Fri, 20 Jul 2007 21:39:23 +0200
Subject: [CELL] fix cbe_cpufreq for legacy SLOF tree.

Previous patch changed based on Christian Krafft's comment.

On some legacy SLOF trees the generic code is unable to ioremap some Cell BE
registers. The "generic" functions therefore return a NULL pointer,
triggering a crash on such platforms.

Let's handle this more gracefully.

Signed-off-by: Jean-Christophe DUBOIS <jcd@tribudubois.net>
Acked-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/cbe_cpufreq.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c
index 9a9932624c0..0b6e8ee85ab 100644
--- a/arch/powerpc/platforms/cell/cbe_cpufreq.c
+++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c
@@ -24,6 +24,7 @@
 #include <asm/machdep.h>
 #include <asm/of_platform.h>
 #include <asm/prom.h>
+#include "cbe_regs.h"
 #include "cbe_cpufreq.h"
 
 static DEFINE_MUTEX(cbe_switch_mutex);
@@ -78,6 +79,15 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	pr_debug("init cpufreq on CPU %d\n", policy->cpu);
 
+	/*
+	 * Let's check we can actually get to the CELL regs
+	 */
+	if (!cbe_get_cpu_pmd_regs(policy->cpu) ||
+	    !cbe_get_cpu_mic_tm_regs(policy->cpu)) {
+		pr_info("invalid CBE regs pointers for cpufreq\n");
+		return -EINVAL;
+	}
+
 	max_freqp = of_get_property(cpu, "clock-frequency", NULL);
 
 	of_node_put(cpu);
-- 
cgit v1.2.3-70-g09d2


From 827e3648dc2c31e01db7cd2e4498061cf78a97a9 Mon Sep 17 00:00:00 2001
From: Jean-Christophe DUBOIS <jcd@tribudubois.net>
Date: Fri, 20 Jul 2007 21:39:24 +0200
Subject: [CELL] fix cbe_thermal for legacy SLOF tree.

Previous patch changed based on Christian Krafft's comment.

On some legacy SLOF trees the generic code is unable to ioremap some Cell BE
registers. The "generic" functions therefore return a NULL pointer,
triggering a crash on such platforms.

Let's handle this more gracefully.

Signed-off-by: Jean-Christophe DUBOIS <jcd@tribudubois.net>
Acked-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/cbe_thermal.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c
index f370f0fa6f4..e4132f8f51b 100644
--- a/arch/powerpc/platforms/cell/cbe_thermal.c
+++ b/arch/powerpc/platforms/cell/cbe_thermal.c
@@ -292,7 +292,7 @@ static struct attribute_group ppe_attribute_group = {
 /*
  * initialize throttling with default values
  */
-static void __init init_default_values(void)
+static int __init init_default_values(void)
 {
 	int cpu;
 	struct cbe_pmd_regs __iomem *pmd_regs;
@@ -339,25 +339,40 @@ static void __init init_default_values(void)
 	for_each_possible_cpu (cpu) {
 		pr_debug("processing cpu %d\n", cpu);
 		sysdev = get_cpu_sysdev(cpu);
+
+		if (!sysdev) {
+			pr_info("invalid sysdev pointer for cbe_thermal\n");
+			return -EINVAL;
+		}
+
 		pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
 
+		if (!pmd_regs) {
+			pr_info("invalid CBE regs pointer for cbe_thermal\n");
+			return -EINVAL;
+		}
+
 		out_be64(&pmd_regs->tm_str2, str2);
 		out_be64(&pmd_regs->tm_str1.val, str1.val);
 		out_be64(&pmd_regs->tm_tpr.val, tpr.val);
 		out_be64(&pmd_regs->tm_cr1.val, cr1.val);
 		out_be64(&pmd_regs->tm_cr2, cr2);
 	}
+
+	return 0;
 }
 
 
 static int __init thermal_init(void)
 {
-	init_default_values();
+	int rc = init_default_values();
 
-	spu_add_sysdev_attr_group(&spu_attribute_group);
-	cpu_add_sysdev_attr_group(&ppe_attribute_group);
+	if (rc == 0) {
+		spu_add_sysdev_attr_group(&spu_attribute_group);
+		cpu_add_sysdev_attr_group(&ppe_attribute_group);
+	}
 
-	return 0;
+	return rc;
 }
 module_init(thermal_init);
 
-- 
cgit v1.2.3-70-g09d2


From b86ce01c7700cfc74665799355a46dcadf920ebd Mon Sep 17 00:00:00 2001
From: Jean-Christophe DUBOIS <jcd@tribudubois.net>
Date: Fri, 20 Jul 2007 21:39:25 +0200
Subject: [CELL] allow linux to map Cell regs on legacy SLOF tree.

The platforms missing the "cpus" property in the "be" node are mono-Cell
platforms such as CAB or Getaway.

Therefore it is possible to assume that if there is no "cpus" property
under the "be" node then we can safely return the "device node" without
more checking. This is a bit hacky but ... it allows it to work on
these platforms.

Signed-off-by: Jean-Christophe DUBOIS <jcd@tribudubois.net>
Acked-by: Christian Krafft <krafft@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/cbe_regs.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c
index 12c9674b4b1..c8f7f000742 100644
--- a/arch/powerpc/platforms/cell/cbe_regs.c
+++ b/arch/powerpc/platforms/cell/cbe_regs.c
@@ -174,6 +174,13 @@ static struct device_node *cbe_get_be_node(int cpu_id)
 
 		cpu_handle = of_get_property(np, "cpus", &len);
 
+		/*
+		 * the CAB SLOF tree is non compliant, so we just assume
+		 * there is only one node
+		 */
+		if (WARN_ON_ONCE(!cpu_handle))
+			return np;
+
 		for (i=0; i<len; i++)
 			if (of_find_node_by_phandle(cpu_handle[i]) == of_get_cpu_node(cpu_id, NULL))
 				return np;
-- 
cgit v1.2.3-70-g09d2


From dbdf04c40161f81d74e27f04e201acb3a5dfad69 Mon Sep 17 00:00:00 2001
From: Maxim Shchetynin <maxim@de.ibm.com>
Date: Fri, 20 Jul 2007 21:39:26 +0200
Subject: [CELL] driver for DDR2 memory on AXON

The Axon bridge chip used on new Cell/B.E. based blade servers
comes with a DDR2 memory controller that can be used to
attach cheap memory modules, as opposed to the high-speed
XDR memory that is used by the CPU itself.

Since the memory controller does not participate in the
cache coherency protocol, we cannot use the memory directly
for Linux applications, but by providing a block device
it can be used for swap space, temporary file storage and
through the use of the direct_access block device operation
for mapping into user addresses, when it is mounted with
an appropriate file system.

Signed-off-by: Maxim Shchetynin <maxim@de.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/Kconfig |  10 ++
 arch/powerpc/sysdev/Makefile   |   1 +
 arch/powerpc/sysdev/axonram.c  | 381 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 392 insertions(+)
 create mode 100644 arch/powerpc/sysdev/axonram.c

diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 33545d352e9..932538a93c2 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -272,4 +272,14 @@ config CPM2
 	  you wish to build a kernel for a machine with a CPM2 coprocessor
 	  on it (826x, 827x, 8560).
 
+config AXON_RAM
+	tristate "Axon DDR2 memory device driver"
+	depends on PPC_IBM_CELL_BLADE
+	default m
+	help
+	  It registers one block device per Axon's DDR2 memory bank found
+	  on a system. Block devices are called axonram?, their major and
+	  minor numbers are available in /proc/devices, /proc/partitions or
+	  in /sys/block/axonram?/dev.
+
 endmenu
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index f65078c3d3b..484eb4e0e9d 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_QUICC_ENGINE)	+= qe_lib/
 mv64x60-$(CONFIG_PCI)		+= mv64x60_pci.o
 obj-$(CONFIG_MV64X60)		+= $(mv64x60-y) mv64x60_pic.o mv64x60_dev.o
 obj-$(CONFIG_RTC_DRV_CMOS)	+= rtc_cmos_setup.o
+obj-$(CONFIG_AXON_RAM)		+= axonram.o
 
 # contains only the suspend handler for time
 ifeq ($(CONFIG_RTC_CLASS),)
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
new file mode 100644
index 00000000000..2326d5dc575
--- /dev/null
+++ b/arch/powerpc/sysdev/axonram.c
@@ -0,0 +1,381 @@
+/*
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2006
+ *
+ * Author: Maxim Shchetynin <maxim@de.ibm.com>
+ *
+ * Axon DDR2 device driver.
+ * It registers one block device per Axon's DDR2 memory bank found on a system.
+ * Block devices are called axonram?, their major and minor numbers are
+ * available in /proc/devices, /proc/partitions or in /sys/block/axonram?/dev.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/irqreturn.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <asm/of_device.h>
+#include <asm/of_platform.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+
+#define AXON_RAM_MODULE_NAME		"axonram"	/* of_platform driver name */
+#define AXON_RAM_DEVICE_NAME		"axonram"	/* disk name prefix */
+#define AXON_RAM_MINORS_PER_DISK	16
+#define AXON_RAM_BLOCK_SHIFT		PAGE_SHIFT
+#define AXON_RAM_BLOCK_SIZE		(1 << AXON_RAM_BLOCK_SHIFT)
+#define AXON_RAM_SECTOR_SHIFT		9	/* 512-byte sectors */
+#define AXON_RAM_SECTOR_SIZE		(1 << AXON_RAM_SECTOR_SHIFT)
+#define AXON_RAM_IRQ_FLAGS		(IRQF_SHARED | IRQF_TRIGGER_RISING)
+
+struct axon_ram_bank {
+	struct of_device	*device;	/* device this bank was probed from */
+	struct gendisk		*disk;		/* block device exposing the bank */
+	unsigned int		irq_correctable;	/* ECC irq, device-tree index 0 */
+	unsigned int		irq_uncorrectable;	/* ECC irq, device-tree index 1 */
+	unsigned long		ph_addr;	/* physical start (resource.start) */
+	unsigned long		io_addr;	/* ioremap_flags() mapping of ph_addr */
+	unsigned long		size;		/* bank size in bytes */
+	unsigned long		ecc_counter;	/* correctable errors seen so far */
+};
+
+static ssize_t
+axon_ram_sysfs_ecc(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct of_device *device = to_of_device(dev);
+	struct axon_ram_bank *bank = device->dev.platform_data;
+
+	BUG_ON(!bank);
+	/* sysfs "ecc" show: ecc_counter is unsigned long, so print it as one */
+	return sprintf(buf, "%lu\n", bank->ecc_counter);
+}
+
+static DEVICE_ATTR(ecc, S_IRUGO, axon_ram_sysfs_ecc, NULL);
+
+/**
+ * axon_ram_irq_handler - interrupt handler for Axon RAM ECC
+ * @irq: interrupt number, matched against the bank's two ECC interrupts
+ * @dev: the of_device that was passed to request_irq() as dev_id
+ */
+static irqreturn_t
+axon_ram_irq_handler(int irq, void *dev)
+{
+	struct of_device *device = dev;
+	struct axon_ram_bank *bank = device->dev.platform_data;
+
+	BUG_ON(!bank);
+
+	if (irq == bank->irq_correctable) {
+		dev_err(&device->dev, "Correctable memory error occured\n");
+		bank->ecc_counter++;	/* reported through the "ecc" sysfs file */
+		return IRQ_HANDLED;
+	} else if (irq == bank->irq_uncorrectable) {
+		dev_err(&device->dev, "Uncorrectable memory error occured\n");
+		panic("Critical ECC error on %s", device->node->full_name);
+	}
+
+	return IRQ_NONE;	/* not one of this bank's ECC interrupts */
+}
+
+/**
+ * axon_ram_make_request - make_request() method for block device
+ * @queue: request queue the bio arrived on (not used by this function)
+ * @bio: bio to serve by memcpy() to or from the ioremapped bank
+ */
+static int
+axon_ram_make_request(struct request_queue *queue, struct bio *bio)
+{
+	struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data;
+	unsigned long phys_mem, phys_end;
+	void *user_mem;
+	struct bio_vec *vec;
+	unsigned int transfered;
+	unsigned short idx;
+
+	phys_mem = bank->io_addr + (bio->bi_sector << AXON_RAM_SECTOR_SHIFT);
+	phys_end = bank->io_addr + bank->size;
+	transfered = 0;
+	bio_for_each_segment(vec, bio, idx) {
+		if (unlikely(phys_mem + vec->bv_len > phys_end)) {
+			/* fail the bio exactly once; 0 = do not resubmit */
+			bio_io_error(bio, bio->bi_size);
+			return 0;
+		}
+
+		user_mem = page_address(vec->bv_page) + vec->bv_offset;
+		if (bio_data_dir(bio) == READ)
+			memcpy(user_mem, (void *) phys_mem, vec->bv_len);
+		else
+			memcpy((void *) phys_mem, user_mem, vec->bv_len);
+
+		phys_mem += vec->bv_len;
+		transfered += vec->bv_len;
+	}
+	bio_endio(bio, transfered, 0);
+
+	return 0;	/* the bio is always completed here, never remapped */
+}
+
+/**
+ * axon_ram_direct_access - direct_access() method for block device
+ * @device, @sector, @data: see block_device_operations method
+ */
+static int
+axon_ram_direct_access(struct block_device *device, sector_t sector,
+		       unsigned long *data)
+{
+	struct axon_ram_bank *bank = device->bd_disk->private_data;
+	loff_t offset;
+
+	offset = sector << AXON_RAM_SECTOR_SHIFT;	/* sector number -> byte offset */
+	if (offset >= bank->size) {
+		dev_err(&bank->device->dev, "Access outside of address space\n");
+		return -ERANGE;
+	}
+
+	*data = bank->ph_addr + offset;	/* physical address, not the ioremap()ed one */
+
+	return 0;
+}
+
+static struct block_device_operations axon_ram_devops = {
+	.owner		= THIS_MODULE,
+	.direct_access	= axon_ram_direct_access	/* the only method set here */
+};
+
+/**
+ * axon_ram_probe - probe() method for platform driver
+ * @device, @device_id: see of_platform_driver method
+ */
+static int
+axon_ram_probe(struct of_device *device, const struct of_device_id *device_id)
+{
+	static int axon_ram_bank_id = -1;
+	struct axon_ram_bank *bank;
+	struct resource resource;
+	int rc = 0;
+
+	axon_ram_bank_id++;
+	dev_info(&device->dev, "Found memory controller on %s\n",
+			device->node->full_name);
+
+	bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL);
+	if (bank == NULL) {
+		dev_err(&device->dev, "Out of memory\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	device->dev.platform_data = bank;
+	bank->device = device;
+
+	if (of_address_to_resource(device->node, 0, &resource) != 0) {
+		dev_err(&device->dev, "Cannot access device tree\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->size = resource.end - resource.start + 1;
+	if (bank->size == 0) {
+		dev_err(&device->dev, "No DDR2 memory found for %s%d\n",
+				AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
+		rc = -ENODEV;
+		goto failed;
+	}
+
+	dev_info(&device->dev, "Register DDR2 memory device %s%d with %luMB\n",
+			AXON_RAM_DEVICE_NAME, axon_ram_bank_id, bank->size >> 20);
+	bank->ph_addr = resource.start;
+	bank->io_addr = (unsigned long) ioremap_flags(
+			bank->ph_addr, bank->size, _PAGE_NO_CACHE);
+	if (bank->io_addr == 0) {
+		dev_err(&device->dev, "ioremap() failed\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->disk = alloc_disk(AXON_RAM_MINORS_PER_DISK);
+	if (bank->disk == NULL) {
+		dev_err(&device->dev, "Cannot register disk\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->disk->first_minor = 0;
+	bank->disk->fops = &axon_ram_devops;
+	bank->disk->private_data = bank;
+	bank->disk->driverfs_dev = &device->dev;
+
+	sprintf(bank->disk->disk_name, "%s%d",
+			AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
+	bank->disk->major = register_blkdev(0, bank->disk->disk_name);
+	if (bank->disk->major < 0) {
+		dev_err(&device->dev, "Cannot register block device\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	bank->disk->queue = blk_alloc_queue(GFP_KERNEL);
+	if (bank->disk->queue == NULL) {
+		dev_err(&device->dev, "Cannot register disk queue\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
+	blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
+	blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
+	add_disk(bank->disk);
+
+	bank->irq_correctable = irq_of_parse_and_map(device->node, 0);
+	bank->irq_uncorrectable = irq_of_parse_and_map(device->node, 1);
+	if ((bank->irq_correctable <= 0) || (bank->irq_uncorrectable <= 0)) {
+		dev_err(&device->dev, "Cannot access ECC interrupt ID\n");
+		/* nothing requested yet, so the error path must not free_irq() */
+		bank->irq_correctable = bank->irq_uncorrectable = 0;
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	rc = request_irq(bank->irq_correctable, axon_ram_irq_handler,
+			AXON_RAM_IRQ_FLAGS, bank->disk->disk_name, device);
+	if (rc != 0) {
+		dev_err(&device->dev, "Cannot register ECC interrupt handler\n");
+		bank->irq_correctable = bank->irq_uncorrectable = 0;
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	rc = request_irq(bank->irq_uncorrectable, axon_ram_irq_handler,
+			AXON_RAM_IRQ_FLAGS, bank->disk->disk_name, device);
+	if (rc != 0) {
+		dev_err(&device->dev, "Cannot register ECC interrupt handler\n");
+		bank->irq_uncorrectable = 0;
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	rc = device_create_file(&device->dev, &dev_attr_ecc);
+	if (rc != 0) {
+		dev_err(&device->dev, "Cannot create sysfs file\n");
+		rc = -EFAULT;
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	if (bank != NULL) {
+		if (bank->irq_uncorrectable > 0)
+			free_irq(bank->irq_uncorrectable, device);
+		if (bank->irq_correctable > 0)
+			free_irq(bank->irq_correctable, device);
+		if (bank->disk != NULL) {
+			if (bank->disk->flags & GENHD_FL_UP)	/* set by add_disk() */
+				del_gendisk(bank->disk);
+			if (bank->disk->queue != NULL)
+				blk_cleanup_queue(bank->disk->queue);
+			if (bank->disk->major > 0)
+				unregister_blkdev(bank->disk->major,
+						bank->disk->disk_name);
+			put_disk(bank->disk);	/* drop the alloc_disk() reference */
+		}
+		device->dev.platform_data = NULL;
+		if (bank->io_addr != 0)
+			iounmap((void __iomem *) bank->io_addr);
+		kfree(bank);
+	}
+
+	return rc;
+}
+
+/**
+ * axon_ram_remove - remove() method for platform driver
+ * @device: see of_platform_driver method
+ */
+static int
+axon_ram_remove(struct of_device *device)
+{
+	struct axon_ram_bank *bank = device->dev.platform_data;
+
+	BUG_ON(!bank || !bank->disk);
+
+	device_remove_file(&device->dev, &dev_attr_ecc);
+	free_irq(bank->irq_uncorrectable, device);
+	free_irq(bank->irq_correctable, device);
+	del_gendisk(bank->disk);	/* take the disk down before its queue goes away */
+	blk_cleanup_queue(bank->disk->queue);
+	unregister_blkdev(bank->disk->major, bank->disk->disk_name);
+	put_disk(bank->disk);	/* release the reference taken by alloc_disk() */
+	iounmap((void __iomem *) bank->io_addr);
+	kfree(bank);
+	return 0;
+}
+
+static struct of_device_id axon_ram_device_id[] = {
+	{
+		.type	= "dma-memory"
+	},
+	{}	/* empty terminating entry */
+};
+
+static struct of_platform_driver axon_ram_driver = {
+	.owner		= THIS_MODULE,
+	.name		= AXON_RAM_MODULE_NAME,
+	.match_table	= axon_ram_device_id,	/* matches on type "dma-memory" */
+	.probe		= axon_ram_probe,
+	.remove		= axon_ram_remove
+};
+
+/**
+ * axon_ram_init - register the Axon RAM of_platform driver
+ */
+static int __init
+axon_ram_init(void)
+{
+	return of_register_platform_driver(&axon_ram_driver);
+}
+
+/**
+ * axon_ram_exit - unregister the Axon RAM of_platform driver
+ */
+static void __exit
+axon_ram_exit(void)
+{
+	of_unregister_platform_driver(&axon_ram_driver);
+}
+
+module_init(axon_ram_init);
+module_exit(axon_ram_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Maxim Shchetynin <maxim@de.ibm.com>");
+MODULE_DESCRIPTION("Axon DDR2 RAM device driver for IBM Cell BE");
-- 
cgit v1.2.3-70-g09d2


From 8d2655e621bfc3c3f925016f881a36739d479f69 Mon Sep 17 00:00:00 2001
From: Andre Detsch <adetsch@br.ibm.com>
Date: Fri, 20 Jul 2007 21:39:27 +0200
Subject: [CELL] saving spus information for kexec crash

This patch adds support for investigating spus information after a
kernel crash event, through kdump vmcore file.
Implementation is based on xmon code, but the new functionality was
kept independent from xmon.

Signed-off-by: Lucio Jose Herculano Correia <luciojhc@br.ibm.com>
Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/kernel/crash.c            | 67 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/cell/spu_base.c |  2 +-
 include/asm-powerpc/spu.h              |  8 ++++
 3 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index d3f2080d2ee..37658ea417f 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -219,6 +219,72 @@ void crash_kexec_secondary(struct pt_regs *regs)
 	cpus_in_sr = CPU_MASK_NONE;
 }
 #endif
+#ifdef CONFIG_SPU_BASE
+
+#include <asm/spu.h>
+#include <asm/spu_priv1.h>
+
+struct crash_spu_info {
+	struct spu *spu;		/* NULL until crash_register_spus() fills the slot */
+	u32 saved_spu_runcntl_RW;	/* saved_* fields: written by crash_kexec_stop_spus() */
+	u32 saved_spu_status_R;
+	u32 saved_spu_npc_RW;
+	u64 saved_mfc_sr1_RW;
+	u64 saved_mfc_dar;
+	u64 saved_mfc_dsisr;
+};
+
+#define CRASH_NUM_SPUS	16	/* Enough for current hardware */
+static struct crash_spu_info crash_spu_info[CRASH_NUM_SPUS];	/* indexed by spu->number */
+
+static void crash_kexec_stop_spus(void)
+{
+	struct spu *spu;
+	int i;
+	u64 tmp;
+
+	for (i = 0; i < CRASH_NUM_SPUS; i++) {
+		if (!crash_spu_info[i].spu)	/* slot was never registered */
+			continue;
+
+		spu = crash_spu_info[i].spu;
+
+		crash_spu_info[i].saved_spu_runcntl_RW =
+			in_be32(&spu->problem->spu_runcntl_RW);
+		crash_spu_info[i].saved_spu_status_R =
+			in_be32(&spu->problem->spu_status_R);
+		crash_spu_info[i].saved_spu_npc_RW =
+			in_be32(&spu->problem->spu_npc_RW);
+
+		crash_spu_info[i].saved_mfc_dar    = spu_mfc_dar_get(spu);
+		crash_spu_info[i].saved_mfc_dsisr  = spu_mfc_dsisr_get(spu);
+		tmp = spu_mfc_sr1_get(spu);
+		crash_spu_info[i].saved_mfc_sr1_RW = tmp;
+
+		tmp &= ~MFC_STATE1_MASTER_RUN_CONTROL_MASK;	/* saved above, now clear run control */
+		spu_mfc_sr1_set(spu, tmp);
+
+		__delay(200);	/* NOTE(review): reason for this delay is not visible here */
+	}
+}
+
+void crash_register_spus(struct list_head *list)
+{
+	struct spu *spu;
+
+	list_for_each_entry(spu, list, full_list) {
+		if (WARN_ON(spu->number >= CRASH_NUM_SPUS))	/* no slot for this SPU */
+			continue;
+
+		crash_spu_info[spu->number].spu = spu;	/* read back by crash_kexec_stop_spus() */
+	}
+}
+
+#else
+static inline void crash_kexec_stop_spus(void)
+{
+}
+#endif /* CONFIG_SPU_BASE */
 
 void default_machine_crash_shutdown(struct pt_regs *regs)
 {
@@ -254,6 +320,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	crash_save_cpu(regs, crashing_cpu);
 	crash_kexec_prepare_cpus(crashing_cpu);
 	cpu_set(crashing_cpu, cpus_in_crash);
+	crash_kexec_stop_spus();
 	if (ppc_md.kexec_cpu_down)
 		ppc_md.kexec_cpu_down(1, 0);
 }
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 96a8f609690..c563066e640 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -676,7 +676,7 @@ static int __init init_spu_base(void)
 	}
 
 	xmon_register_spus(&spu_full_list);
-
+	crash_register_spus(&spu_full_list);
 	spu_add_sysdev_attr(&attr_stat);
 
 	return 0;
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index eedc828cef2..42d88a6d2df 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -188,6 +188,14 @@ int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
 
+#ifdef CONFIG_KEXEC
+void crash_register_spus(struct list_head *list);
+#else
+static inline void crash_register_spus(struct list_head *list)
+{
+}
+#endif
+
 extern void spu_invalidate_slbs(struct spu *spu);
 extern void spu_associate_mm(struct spu *spu, struct mm_struct *mm);
 
-- 
cgit v1.2.3-70-g09d2


From ce21b3c9648ae55181787bf25ee00cf91dfd5c91 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 20 Jul 2007 21:39:28 +0200
Subject: [CELL] add support for MSI on Axon-based Cell systems

This patch adds support for the setup and decoding of MSIs
on Axon-based Cell systems, using the MSIC mechanism.

This involves setting up an area of BE memory which the Axon
then uses as a FIFO for MSI messages. When one or more MSIs
are decoded by the MSIC we receive an interrupt on the MPIC,
and the MSI messages are written into the FIFO. At the moment
we use a 64KB FIFO, one per MSIC/BE.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/Makefile   |   2 +
 arch/powerpc/platforms/cell/axon_msi.c | 445 +++++++++++++++++++++++++++++++++
 2 files changed, 447 insertions(+)
 create mode 100644 arch/powerpc/platforms/cell/axon_msi.c

diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index be059718bec..f88a7c76f29 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -25,3 +25,5 @@ obj-$(CONFIG_SPU_BASE)			+= spu_callbacks.o spu_base.o \
 					   $(spu-priv1-y) \
 					   $(spu-manage-y) \
 					   spufs/
+
+obj-$(CONFIG_PCI_MSI)			+= axon_msi.o
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
new file mode 100644
index 00000000000..4c9ab5b70ba
--- /dev/null
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2007, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/msi.h>
+#include <linux/reboot.h>
+
+#include <asm/dcr.h>
+#include <asm/machdep.h>
+#include <asm/prom.h>
+
+
+/*
+ * MSIC registers, specified as offsets from dcr_base
+ */
+#define MSIC_CTRL_REG	0x0
+
+/* Base Address registers specify FIFO location in BE memory */
+#define MSIC_BASE_ADDR_HI_REG	0x3
+#define MSIC_BASE_ADDR_LO_REG	0x4
+
+/* Hold the read/write offsets into the FIFO */
+#define MSIC_READ_OFFSET_REG	0x5
+#define MSIC_WRITE_OFFSET_REG	0x6
+
+
+/* MSIC control register flags */
+#define MSIC_CTRL_ENABLE		0x0001
+#define MSIC_CTRL_FIFO_FULL_ENABLE	0x0002
+#define MSIC_CTRL_IRQ_ENABLE		0x0008
+#define MSIC_CTRL_FULL_STOP_ENABLE	0x0010
+
+/*
+ * The MSIC can be configured to use a FIFO of 32KB, 64KB, 128KB or 256KB.
+ * Currently we're using a 64KB FIFO size.
+ */
+#define MSIC_FIFO_SIZE_SHIFT	16
+#define MSIC_FIFO_SIZE_BYTES	(1 << MSIC_FIFO_SIZE_SHIFT)
+
+/*
+ * To configure the FIFO size as (1 << n) bytes, we write (n - 15) into bits
+ * 8-9 of the MSIC control reg.
+ */
+#define MSIC_CTRL_FIFO_SIZE	(((MSIC_FIFO_SIZE_SHIFT - 15) << 8) & 0x300)
+
+/*
+ * We need to mask the read/write offsets to make sure they stay within
+ * the bounds of the FIFO. Also they should always be 16-byte aligned.
+ */
+#define MSIC_FIFO_SIZE_MASK	((MSIC_FIFO_SIZE_BYTES - 1) & ~0xFu)
+
+/* Each entry in the FIFO is 16 bytes, the first 4 bytes hold the irq # */
+#define MSIC_FIFO_ENTRY_SIZE	0x10
+
+
+struct axon_msic {
+	struct device_node *dn;
+	struct irq_host *irq_host;
+	__le32 *fifo;
+	dcr_host_t dcr_host;
+	struct list_head list;
+	u32 read_offset;
+	u32 dcr_base;
+};
+
+static LIST_HEAD(axon_msic_list);
+
+static void msic_dcr_write(struct axon_msic *msic, unsigned int dcr_n, u32 val)
+{
+	pr_debug("axon_msi: dcr_write(0x%x, 0x%x)\n", val, dcr_n);
+
+	dcr_write(msic->dcr_host, msic->dcr_base + dcr_n, val);
+}
+
+static u32 msic_dcr_read(struct axon_msic *msic, unsigned int dcr_n)
+{
+	return dcr_read(msic->dcr_host, msic->dcr_base + dcr_n);
+}
+
+static void axon_msi_cascade(unsigned int irq, struct irq_desc *desc)
+{
+	struct axon_msic *msic = get_irq_data(irq);
+	u32 write_offset, msi;
+	int idx;
+
+	write_offset = msic_dcr_read(msic, MSIC_WRITE_OFFSET_REG);
+	pr_debug("axon_msi: original write_offset 0x%x\n", write_offset);
+
+	/* write_offset doesn't wrap properly, so we have to mask it */
+	write_offset &= MSIC_FIFO_SIZE_MASK;
+
+	while (msic->read_offset != write_offset) {
+		idx  = msic->read_offset / sizeof(__le32);
+		msi  = le32_to_cpu(msic->fifo[idx]);
+		msi &= 0xFFFF;
+
+		pr_debug("axon_msi: woff %x roff %x msi %x\n",
+			  write_offset, msic->read_offset, msi);
+
+		msic->read_offset += MSIC_FIFO_ENTRY_SIZE;
+		msic->read_offset &= MSIC_FIFO_SIZE_MASK;
+
+		if (msi < NR_IRQS && irq_map[msi].host == msic->irq_host)
+			generic_handle_irq(msi);
+		else
+			pr_debug("axon_msi: invalid irq 0x%x!\n", msi);
+	}
+
+	desc->chip->eoi(irq);
+}
+
+static struct axon_msic *find_msi_translator(struct pci_dev *dev)
+{
+	struct irq_host *irq_host;
+	struct device_node *dn, *tmp;
+	const phandle *ph;
+	struct axon_msic *msic = NULL;
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn) {
+		dev_dbg(&dev->dev, "axon_msi: no pci_dn found\n");
+		return NULL;
+	}
+
+	for (; dn; tmp = of_get_parent(dn), of_node_put(dn), dn = tmp) {
+		ph = of_get_property(dn, "msi-translator", NULL);
+		if (ph)
+			break;
+	}
+
+	if (!ph) {
+		dev_dbg(&dev->dev,
+			"axon_msi: no msi-translator property found\n");
+		goto out_error;
+	}
+
+	tmp = dn;
+	dn = of_find_node_by_phandle(*ph);
+	if (!dn) {
+		dev_dbg(&dev->dev,
+			"axon_msi: msi-translator doesn't point to a node\n");
+		goto out_error;
+	}
+
+	irq_host = irq_find_host(dn);
+	if (!irq_host) {
+		dev_dbg(&dev->dev, "axon_msi: no irq_host found for node %s\n",
+			dn->full_name);
+		goto out_error;
+	}
+
+	msic = irq_host->host_data;
+
+out_error:
+	of_node_put(dn);
+	of_node_put(tmp);
+
+	return msic;
+}
+
+static int axon_msi_check_device(struct pci_dev *dev, int nvec, int type)
+{
+	if (!find_msi_translator(dev))
+		return -ENODEV;
+
+	return 0;
+}
+
+static int setup_msi_msg_address(struct pci_dev *dev, struct msi_msg *msg)
+{
+	struct device_node *dn, *tmp;
+	struct msi_desc *entry;
+	int len;
+	const u32 *prop;
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn) {
+		dev_dbg(&dev->dev, "axon_msi: no pci_dn found\n");
+		return -ENODEV;
+	}
+
+	entry = list_first_entry(&dev->msi_list, struct msi_desc, list);
+
+	for (; dn; tmp = of_get_parent(dn), of_node_put(dn), dn = tmp) {
+		if (entry->msi_attrib.is_64) {
+			prop = of_get_property(dn, "msi-address-64", &len);
+			if (prop)
+				break;
+		}
+
+		prop = of_get_property(dn, "msi-address-32", &len);
+		if (prop)
+			break;
+	}
+
+	if (!prop) {
+		dev_dbg(&dev->dev,
+			"axon_msi: no msi-address-(32|64) properties found\n");
+		return -ENOENT;
+	}
+
+	switch (len) {
+	case 8:
+		msg->address_hi = prop[0];
+		msg->address_lo = prop[1];
+		break;
+	case 4:
+		msg->address_hi = 0;
+		msg->address_lo = prop[0];
+		break;
+	default:
+		dev_dbg(&dev->dev,
+			"axon_msi: malformed msi-address-(32|64) property\n");
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	of_node_put(dn);
+
+	return 0;
+}
+
+static int axon_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	unsigned int virq, rc;
+	struct msi_desc *entry;
+	struct msi_msg msg;
+	struct axon_msic *msic;
+
+	msic = find_msi_translator(dev);
+	if (!msic)
+		return -ENODEV;
+
+	rc = setup_msi_msg_address(dev, &msg);
+	if (rc)
+		return rc;
+
+	/* We rely on being able to stash a virq in a u16 */
+	BUILD_BUG_ON(NR_IRQS > 65536);
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		virq = irq_create_direct_mapping(msic->irq_host);
+		if (virq == NO_IRQ) {
+			dev_warn(&dev->dev,
+				 "axon_msi: virq allocation failed!\n");
+			return -1;
+		}
+		dev_dbg(&dev->dev, "axon_msi: allocated virq 0x%x\n", virq);
+
+		set_irq_msi(virq, entry);
+		msg.data = virq;
+		write_msi_msg(virq, &msg);
+	}
+
+	return 0;
+}
+
+static void axon_msi_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+
+	dev_dbg(&dev->dev, "axon_msi: tearing down msi irqs\n");
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		if (entry->irq == NO_IRQ)
+			continue;
+
+		set_irq_msi(entry->irq, NULL);
+		irq_dispose_mapping(entry->irq);
+	}
+}
+
+static struct irq_chip msic_irq_chip = {
+	.mask		= mask_msi_irq,
+	.unmask		= unmask_msi_irq,
+	.shutdown	= unmask_msi_irq,
+	.typename	= "AXON-MSI",
+};
+
+static int msic_host_map(struct irq_host *h, unsigned int virq,
+			 irq_hw_number_t hw)
+{
+	set_irq_chip_and_handler(virq, &msic_irq_chip, handle_simple_irq);
+
+	return 0;
+}
+
+static int msic_host_match(struct irq_host *host, struct device_node *dn)
+{
+	struct axon_msic *msic = host->host_data;
+
+	return msic->dn == dn;
+}
+
+static struct irq_host_ops msic_host_ops = {
+	.match	= msic_host_match,
+	.map	= msic_host_map,
+};
+
+static int axon_msi_notify_reboot(struct notifier_block *nb,
+				  unsigned long code, void *data)
+{
+	struct axon_msic *msic;
+	u32 tmp;
+
+	list_for_each_entry(msic, &axon_msic_list, list) {
+		pr_debug("axon_msi: disabling %s\n", msic->dn->full_name);
+		tmp  = msic_dcr_read(msic, MSIC_CTRL_REG);
+		tmp &= ~MSIC_CTRL_ENABLE & ~MSIC_CTRL_IRQ_ENABLE;
+		msic_dcr_write(msic, MSIC_CTRL_REG, tmp);
+	}
+
+	return 0;
+}
+
+static struct notifier_block axon_msi_reboot_notifier = {
+	.notifier_call = axon_msi_notify_reboot
+};
+
+static int axon_msi_setup_one(struct device_node *dn)
+{
+	struct page *page;
+	struct axon_msic *msic;
+	unsigned int virq;
+	int dcr_len;
+
+	pr_debug("axon_msi: setting up dn %s\n", dn->full_name);
+
+	msic = kzalloc(sizeof(struct axon_msic), GFP_KERNEL);
+	if (!msic) {
+		printk(KERN_ERR "axon_msi: couldn't allocate msic for %s\n",
+		       dn->full_name);
+		goto out;
+	}
+
+	msic->dcr_base = dcr_resource_start(dn, 0);
+	dcr_len = dcr_resource_len(dn, 0);
+
+	if (msic->dcr_base == 0 || dcr_len == 0) {
+		printk(KERN_ERR
+		       "axon_msi: couldn't parse dcr properties on %s\n",
+			dn->full_name);
+		goto out;
+	}
+
+	msic->dcr_host = dcr_map(dn, msic->dcr_base, dcr_len);
+	if (!DCR_MAP_OK(msic->dcr_host)) {
+		printk(KERN_ERR "axon_msi: dcr_map failed for %s\n",
+		       dn->full_name);
+		goto out_free_msic;
+	}
+
+	page = alloc_pages_node(of_node_to_nid(dn), GFP_KERNEL,
+				get_order(MSIC_FIFO_SIZE_BYTES));
+	if (!page) {
+		printk(KERN_ERR "axon_msi: couldn't allocate fifo for %s\n",
+		       dn->full_name);
+		goto out_free_msic;
+	}
+
+	msic->fifo = page_address(page);
+
+	msic->irq_host = irq_alloc_host(IRQ_HOST_MAP_NOMAP, NR_IRQS,
+					&msic_host_ops, 0);
+	if (!msic->irq_host) {
+		printk(KERN_ERR "axon_msi: couldn't allocate irq_host for %s\n",
+		       dn->full_name);
+		goto out_free_fifo;
+	}
+
+	msic->irq_host->host_data = msic;
+
+	virq = irq_of_parse_and_map(dn, 0);
+	if (virq == NO_IRQ) {
+		printk(KERN_ERR "axon_msi: irq parse and map failed for %s\n",
+		       dn->full_name);
+		goto out_free_host;
+	}
+
+	msic->dn = of_node_get(dn);
+
+	set_irq_data(virq, msic);
+	set_irq_chained_handler(virq, axon_msi_cascade);
+	pr_debug("axon_msi: irq 0x%x setup for axon_msi\n", virq);
+
+	/* Enable the MSIC hardware */
+	msic_dcr_write(msic, MSIC_BASE_ADDR_HI_REG, (u64)msic->fifo >> 32);
+	msic_dcr_write(msic, MSIC_BASE_ADDR_LO_REG,
+				  (u64)msic->fifo & 0xFFFFFFFF);
+	msic_dcr_write(msic, MSIC_CTRL_REG,
+			MSIC_CTRL_IRQ_ENABLE | MSIC_CTRL_ENABLE |
+			MSIC_CTRL_FIFO_SIZE);
+
+	list_add(&msic->list, &axon_msic_list);
+
+	printk(KERN_DEBUG "axon_msi: setup MSIC on %s\n", dn->full_name);
+
+	return 0;
+
+out_free_host:
+	kfree(msic->irq_host);
+out_free_fifo:
+	__free_pages(virt_to_page(msic->fifo), get_order(MSIC_FIFO_SIZE_BYTES));
+out_free_msic:
+	kfree(msic);
+out:
+
+	return -1;
+}
+
+static int axon_msi_init(void)
+{
+	struct device_node *dn;
+	int found = 0;
+
+	pr_debug("axon_msi: initialising ...\n");
+
+	for_each_compatible_node(dn, NULL, "ibm,axon-msic") {
+		if (axon_msi_setup_one(dn) == 0)
+			found++;
+	}
+
+	if (found) {
+		ppc_md.setup_msi_irqs = axon_msi_setup_msi_irqs;
+		ppc_md.teardown_msi_irqs = axon_msi_teardown_msi_irqs;
+		ppc_md.msi_check_device = axon_msi_check_device;
+
+		register_reboot_notifier(&axon_msi_reboot_notifier);
+
+		pr_debug("axon_msi: registered callbacks!\n");
+	}
+
+	return 0;
+}
+arch_initcall(axon_msi_init);
-- 
cgit v1.2.3-70-g09d2


From d1450317554d52e0e4a454806c4d05bb2a834f00 Mon Sep 17 00:00:00 2001
From: Sebastian Siewior <sebastian@breakpoint.cc>
Date: Fri, 20 Jul 2007 21:39:29 +0200
Subject: [CELL] spufs: remove section mismatch warning

WARNING: arch/powerpc/platforms/cell/spufs/spufs.o(.init.text+0x158): Section
mismatch: reference to .exit.text:.spu_sched_exit (between '.init_module' and
'.spu_sched_init')

was introduced by c99c1994a2bb9493b4ac372b2b6ee2606d291171
This patch removes the warning.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sebastian Siewior <sebastian@breakpoint.cc>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
 arch/powerpc/platforms/cell/spufs/spufs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e5b4dd1db28..9b1706cc126 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -783,7 +783,7 @@ int __init spu_sched_init(void)
 	return err;
 }
 
-void __exit spu_sched_exit(void)
+void spu_sched_exit(void)
 {
 	struct spu *spu, *tmp;
 	int node;
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 08b3530288a..34d5f9f8b4a 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -235,7 +235,7 @@ void spu_set_timeslice(struct spu_context *ctx);
 void spu_update_sched_info(struct spu_context *ctx);
 void __spu_update_sched_info(struct spu_context *ctx);
 int __init spu_sched_init(void);
-void __exit spu_sched_exit(void);
+void spu_sched_exit(void);
 
 extern char *isolated_loader;
 
-- 
cgit v1.2.3-70-g09d2


From 49776d30aea903fb2f9966c8e9b6f23ae5f7c937 Mon Sep 17 00:00:00 2001
From: Kazunori Asayama <asayama@sm.sony.co.jp>
Date: Fri, 20 Jul 2007 21:39:30 +0200
Subject: [CELL] spufs: Avoid unexpectedly restarting MFC during context save

The current SPU context saving procedure in SPUFS unexpectedly
restarts MFC when halting decrementer, because MFC_CNTL[Dh] is set
without MFC_CNTL[Sm]. This bug can cause, for example, broken DMA
queues to be saved. Here is a patch to fix the problem.

Signed-off-by: Kazunori Asayama <asayama@sm.sony.co.jp>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/switch.c | 3 ++-
 include/asm-powerpc/spu.h                  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 9c506ba08cd..827aada391f 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -271,7 +271,8 @@ static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu)
 	 *     Write MFC_CNTL[Dh] set to a '1' to halt
 	 *     the decrementer.
 	 */
-	out_be64(&priv2->mfc_control_RW, MFC_CNTL_DECREMENTER_HALTED);
+	out_be64(&priv2->mfc_control_RW,
+		 MFC_CNTL_DECREMENTER_HALTED | MFC_CNTL_SUSPEND_MASK);
 	eieio();
 }
 
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 42d88a6d2df..a034f03b810 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -411,6 +411,7 @@ struct spu_priv2 {
 #define MFC_CNTL_RESUME_DMA_QUEUE		(0ull << 0)
 #define MFC_CNTL_SUSPEND_DMA_QUEUE		(1ull << 0)
 #define MFC_CNTL_SUSPEND_DMA_QUEUE_MASK		(1ull << 0)
+#define MFC_CNTL_SUSPEND_MASK			(1ull << 4)
 #define MFC_CNTL_NORMAL_DMA_QUEUE_OPERATION	(0ull << 8)
 #define MFC_CNTL_SUSPEND_IN_PROGRESS		(1ull << 8)
 #define MFC_CNTL_SUSPEND_COMPLETE		(3ull << 8)
-- 
cgit v1.2.3-70-g09d2


From d054b36ffd302ec65aabec16a0c60ddd9e6b5a62 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Fri, 20 Jul 2007 21:39:31 +0200
Subject: [CELL] spufs: Make signal-notification files write-only for NOSCHED
 contexts

Reading from the signal{1,2} files requires a spu_acquire_saved, so
make these files write-only for contexts created with
SPU_CREATE_NOSCHED.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/file.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index c2814ea96af..fe164112b3d 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -934,6 +934,13 @@ static const struct file_operations spufs_signal1_fops = {
 	.mmap = spufs_signal1_mmap,
 };
 
+static const struct file_operations spufs_signal1_nosched_fops = {
+	.open = spufs_signal1_open,
+	.release = spufs_signal1_release,
+	.write = spufs_signal1_write,
+	.mmap = spufs_signal1_mmap,
+};
+
 static int spufs_signal2_open(struct inode *inode, struct file *file)
 {
 	struct spufs_inode_info *i = SPUFS_I(inode);
@@ -1062,6 +1069,13 @@ static const struct file_operations spufs_signal2_fops = {
 	.mmap = spufs_signal2_mmap,
 };
 
+static const struct file_operations spufs_signal2_nosched_fops = {
+	.open = spufs_signal2_open,
+	.release = spufs_signal2_release,
+	.write = spufs_signal2_write,
+	.mmap = spufs_signal2_mmap,
+};
+
 static void spufs_signal1_type_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
@@ -2184,8 +2198,8 @@ struct tree_descr spufs_dir_nosched_contents[] = {
 	{ "mbox_stat", &spufs_mbox_stat_fops, 0444, },
 	{ "ibox_stat", &spufs_ibox_stat_fops, 0444, },
 	{ "wbox_stat", &spufs_wbox_stat_fops, 0444, },
-	{ "signal1", &spufs_signal1_fops, 0666, },
-	{ "signal2", &spufs_signal2_fops, 0666, },
+	{ "signal1", &spufs_signal1_nosched_fops, 0222, },
+	{ "signal2", &spufs_signal2_nosched_fops, 0222, },
 	{ "signal1_type", &spufs_signal1_type, 0666, },
 	{ "signal2_type", &spufs_signal2_type, 0666, },
 	{ "mss", &spufs_mss_fops, 0666, },
-- 
cgit v1.2.3-70-g09d2


From e840cfe6814d6f13ecb86cff7097ad7259df502e Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Fri, 20 Jul 2007 21:39:32 +0200
Subject: [CELL] spufs: Remove spurious WARN_ON for spu_deactivate for NOSCHED
 contexts

In 6cbf93960e64f313f6e247cbca7afaa50e3ee2c we added a WARN_ON for
calling spu_deactivate on contexts created with the SPU_CREATE_NOSCHED
flag. However, all NOSCHED contexts will need to be deactivated when
the context is destroyed, so this gives a spurious warning when any
NOSCHED context is closed.

This change removes the WARN_ON.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/sched.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 9b1706cc126..fe789308dd1 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -550,15 +550,6 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
  */
 void spu_deactivate(struct spu_context *ctx)
 {
-	/*
-	 * We must never reach this for a nosched context,
-	 * but handle the case gracefull instead of panicing.
-	 */
-	if (ctx->flags & SPU_CREATE_NOSCHED) {
-		WARN_ON(1);
-		return;
-	}
-
 	__spu_deactivate(ctx, 1, MAX_PRIO);
 	spuctx_switch_state(ctx, SPUCTX_UTIL_USER);
 }
-- 
cgit v1.2.3-70-g09d2


From 27ec41d3a1d4df2b7cd190e93aad22ab86a72aa1 Mon Sep 17 00:00:00 2001
From: Andre Detsch <adetsch@br.ibm.com>
Date: Fri, 20 Jul 2007 21:39:33 +0200
Subject: [CELL] spufs: add spu stats in sysfs and ctx stat file in spufs

This patch exports per-context statistics in spufs as well as spu
statistics in sysfs.

It was formed by merging:
"spufs: add spu stats in sysfs"   From: Christoph Hellwig
"spufs: add stat file to spufs"   From: Christoph Hellwig
"spufs: fix libassist accounting" From: Jeremy Kerr
"spusched: fix spu utilization statistics" From: Luke Browning
And some adjustments by myself, after suggestions on cbe-oss-dev.

Having separate patches was making the review process harder
than it should be, as we ended up integrating spu and ctx statistics
accounting much more tightly than in the first implementation.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c      | 24 +++++++----
 arch/powerpc/platforms/cell/spufs/context.c |  3 +-
 arch/powerpc/platforms/cell/spufs/fault.c   |  8 ++--
 arch/powerpc/platforms/cell/spufs/file.c    | 32 ++++++++++-----
 arch/powerpc/platforms/cell/spufs/run.c     | 10 +++++
 arch/powerpc/platforms/cell/spufs/sched.c   | 22 +++++-----
 arch/powerpc/platforms/cell/spufs/spufs.h   | 63 +++++++++++++----------------
 include/asm-powerpc/spu.h                   | 10 ++---
 8 files changed, 94 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index c563066e640..caaf2bf78ca 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -553,6 +553,7 @@ static int __init create_spu(void *data)
 	int ret;
 	static int number;
 	unsigned long flags;
+	struct timespec ts;
 
 	ret = -ENOMEM;
 	spu = kzalloc(sizeof (*spu), GFP_KERNEL);
@@ -586,8 +587,9 @@ static int __init create_spu(void *data)
 	spin_unlock_irqrestore(&spu_list_lock, flags);
 	mutex_unlock(&spu_mutex);
 
-	spu->stats.utilization_state = SPU_UTIL_IDLE;
-	spu->stats.tstamp = jiffies;
+	spu->stats.util_state = SPU_UTIL_IDLE_LOADED;
+	ktime_get_ts(&ts);
+	spu->stats.tstamp = timespec_to_ns(&ts);
 
 	goto out;
 
@@ -608,12 +610,20 @@ static const char *spu_state_names[] = {
 static unsigned long long spu_acct_time(struct spu *spu,
 		enum spu_utilization_state state)
 {
+	struct timespec ts;
 	unsigned long long time = spu->stats.times[state];
 
-	if (spu->stats.utilization_state == state)
-		time += jiffies - spu->stats.tstamp;
+	/*
+	 * If the spu is idle or the context is stopped, utilization
+	 * statistics are not updated.  Apply the time delta from the
+	 * last recorded state of the spu.
+	 */
+	if (spu->stats.util_state == state) {
+		ktime_get_ts(&ts);
+		time += timespec_to_ns(&ts) - spu->stats.tstamp;
+	}
 
-	return jiffies_to_msecs(time);
+	return time / NSEC_PER_MSEC;
 }
 
 
@@ -623,11 +633,11 @@ static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf)
 
 	return sprintf(buf, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
-		spu_state_names[spu->stats.utilization_state],
+		spu_state_names[spu->stats.util_state],
 		spu_acct_time(spu, SPU_UTIL_USER),
 		spu_acct_time(spu, SPU_UTIL_SYSTEM),
 		spu_acct_time(spu, SPU_UTIL_IOWAIT),
-		spu_acct_time(spu, SPU_UTIL_IDLE),
+		spu_acct_time(spu, SPU_UTIL_IDLE_LOADED),
 		spu->stats.vol_ctx_switch,
 		spu->stats.invol_ctx_switch,
 		spu->stats.slb_flt,
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 6d7bd60f538..0e5e55f53c8 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -59,8 +59,7 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang)
 		spu_gang_add_ctx(gang, ctx);
 	ctx->cpus_allowed = current->cpus_allowed;
 	spu_set_timeslice(ctx);
-	ctx->stats.execution_state = SPUCTX_UTIL_USER;
-	ctx->stats.tstamp = jiffies;
+	ctx->stats.util_state = SPU_UTIL_IDLE_LOADED;
 
 	atomic_inc(&nr_spu_contexts);
 	goto out;
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index f53a0743747..917eab4be48 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -179,16 +179,14 @@ int spufs_handle_class1(struct spu_context *ctx)
 	if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED)))
 		return 0;
 
-	spuctx_switch_state(ctx, SPUCTX_UTIL_IOWAIT);
+	spuctx_switch_state(ctx, SPU_UTIL_IOWAIT);
 
 	pr_debug("ctx %p: ea %016lx, dsisr %016lx state %d\n", ctx, ea,
 		dsisr, ctx->state);
 
 	ctx->stats.hash_flt++;
-	if (ctx->state == SPU_STATE_RUNNABLE) {
+	if (ctx->state == SPU_STATE_RUNNABLE)
 		ctx->spu->stats.hash_flt++;
-		spu_switch_state(ctx->spu, SPU_UTIL_IOWAIT);
-	}
 
 	/* we must not hold the lock when entering spu_handle_mm_fault */
 	spu_release(ctx);
@@ -226,7 +224,7 @@ int spufs_handle_class1(struct spu_context *ctx)
 	} else
 		spufs_handle_dma_error(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE);
 
-	spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM);
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(spufs_handle_class1);
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index fe164112b3d..9351db9472d 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -2079,14 +2079,26 @@ static const char *ctx_state_names[] = {
 };
 
 static unsigned long long spufs_acct_time(struct spu_context *ctx,
-		enum spuctx_execution_state state)
+		enum spu_utilization_state state)
 {
-	unsigned long time = ctx->stats.times[state];
+	struct timespec ts;
+	unsigned long long time = ctx->stats.times[state];
 
-	if (ctx->stats.execution_state == state)
-		time += jiffies - ctx->stats.tstamp;
+	/*
+	 * In general, utilization statistics are updated by the controlling
+	 * thread as the spu context moves through various well defined
+	 * state transitions, but if the context is lazily loaded its
+	 * utilization statistics are not updated as the controlling thread
+	 * is not tightly coupled with the execution of the spu context.  We
+	 * calculate and apply the time delta from the last recorded state
+	 * of the spu context.
+	 */
+	if (ctx->spu && ctx->stats.util_state == state) {
+		ktime_get_ts(&ts);
+		time += timespec_to_ns(&ts) - ctx->stats.tstamp;
+	}
 
-	return jiffies_to_msecs(time);
+	return time / NSEC_PER_MSEC;
 }
 
 static unsigned long long spufs_slb_flts(struct spu_context *ctx)
@@ -2121,11 +2133,11 @@ static int spufs_show_stat(struct seq_file *s, void *private)
 	spu_acquire(ctx);
 	seq_printf(s, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
-		ctx_state_names[ctx->stats.execution_state],
-		spufs_acct_time(ctx, SPUCTX_UTIL_USER),
-		spufs_acct_time(ctx, SPUCTX_UTIL_SYSTEM),
-		spufs_acct_time(ctx, SPUCTX_UTIL_IOWAIT),
-		spufs_acct_time(ctx, SPUCTX_UTIL_LOADED),
+		ctx_state_names[ctx->stats.util_state],
+		spufs_acct_time(ctx, SPU_UTIL_USER),
+		spufs_acct_time(ctx, SPU_UTIL_SYSTEM),
+		spufs_acct_time(ctx, SPU_UTIL_IOWAIT),
+		spufs_acct_time(ctx, SPU_UTIL_IDLE_LOADED),
 		ctx->stats.vol_ctx_switch,
 		ctx->stats.invol_ctx_switch,
 		spufs_slb_flts(ctx),
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 58ae13b7de8..8c91b3f9315 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -126,6 +126,8 @@ out:
 
 static int spu_run_init(struct spu_context *ctx, u32 * npc)
 {
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
 	if (ctx->flags & SPU_CREATE_ISOLATE) {
 		unsigned long runcntl;
 
@@ -151,6 +153,8 @@ static int spu_run_init(struct spu_context *ctx, u32 * npc)
 		ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
 	}
 
+	spuctx_switch_state(ctx, SPU_UTIL_USER);
+
 	return 0;
 }
 
@@ -161,6 +165,8 @@ static int spu_run_fini(struct spu_context *ctx, u32 * npc,
 
 	*status = ctx->ops->status_read(ctx);
 	*npc = ctx->ops->npc_read(ctx);
+
+	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
 	spu_release(ctx);
 
 	if (signal_pending(current))
@@ -328,6 +334,9 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
 		ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, &status));
 		if (unlikely(ret))
 			break;
+
+		spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+
 		if ((status & SPU_STATUS_STOPPED_BY_STOP) &&
 		    (status >> SPU_STOP_STATUS_SHIFT == 0x2104)) {
 			ret = spu_process_callback(ctx);
@@ -356,6 +365,7 @@ long spufs_run_spu(struct file *file, struct spu_context *ctx,
 	    (ctx->state == SPU_STATE_RUNNABLE))
 		ctx->stats.libassist++;
 
+
 	ctx->ops->master_stop(ctx);
 	ret = spu_run_fini(ctx, npc, &status);
 	spu_yield(ctx);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index fe789308dd1..ecd9e95116a 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -229,6 +229,7 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 {
 	pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
 		 spu->number, spu->node);
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
 	ctx->stats.slb_flt_base = spu->stats.slb_flt;
 	ctx->stats.class2_intr_base = spu->stats.class2_intr;
@@ -251,7 +252,8 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	spu_cpu_affinity_set(spu, raw_smp_processor_id());
 	spu_switch_notify(spu, ctx);
 	ctx->state = SPU_STATE_RUNNABLE;
-	spu_switch_state(spu, SPU_UTIL_SYSTEM);
+
+	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
 }
 
 /**
@@ -263,8 +265,7 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 {
 	pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
 		 spu->pid, spu->number, spu->node);
-
-	spu_switch_state(spu, SPU_UTIL_IDLE);
+	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
 	spu_switch_notify(spu, NULL);
 	spu_unmap_mappings(ctx);
@@ -279,7 +280,6 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 	spu_associate_mm(spu, NULL);
 	spu->pid = 0;
 	ctx->ops = &spu_backing_ops;
-	ctx->spu = NULL;
 	spu->flags = 0;
 	spu->ctx = NULL;
 
@@ -287,6 +287,10 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 		(spu->stats.slb_flt - ctx->stats.slb_flt_base);
 	ctx->stats.class2_intr +=
 		(spu->stats.class2_intr - ctx->stats.class2_intr_base);
+
+	/* This maps the underlying spu state to idle */
+	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
+	ctx->spu = NULL;
 }
 
 /**
@@ -455,8 +459,6 @@ static struct spu *find_victim(struct spu_context *ctx)
  */
 int spu_activate(struct spu_context *ctx, unsigned long flags)
 {
-	spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM);
-
 	do {
 		struct spu *spu;
 
@@ -551,7 +553,6 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
 void spu_deactivate(struct spu_context *ctx)
 {
 	__spu_deactivate(ctx, 1, MAX_PRIO);
-	spuctx_switch_state(ctx, SPUCTX_UTIL_USER);
 }
 
 /**
@@ -566,12 +567,7 @@ void spu_yield(struct spu_context *ctx)
 {
 	if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
 		mutex_lock(&ctx->state_mutex);
-		if (__spu_deactivate(ctx, 0, MAX_PRIO))
-			spuctx_switch_state(ctx, SPUCTX_UTIL_USER);
-		else {
-			spuctx_switch_state(ctx, SPUCTX_UTIL_LOADED);
-			spu_switch_state(ctx->spu, SPU_UTIL_USER);
-		}
+		__spu_deactivate(ctx, 0, MAX_PRIO);
 		mutex_unlock(&ctx->state_mutex);
 	}
 }
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 34d5f9f8b4a..fdace928437 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -40,19 +40,6 @@ enum {
 struct spu_context_ops;
 struct spu_gang;
 
-/*
- * This is the state for spu utilization reporting to userspace.
- * Because this state is visible to userspace it must never change and needs
- * to be kept strictly separate from any internal state kept by the kernel.
- */
-enum spuctx_execution_state {
-	SPUCTX_UTIL_USER = 0,
-	SPUCTX_UTIL_SYSTEM,
-	SPUCTX_UTIL_IOWAIT,
-	SPUCTX_UTIL_LOADED,
-	SPUCTX_UTIL_MAX
-};
-
 struct spu_context {
 	struct spu *spu;		  /* pointer to a physical SPU */
 	struct spu_state csa;		  /* SPU context save area. */
@@ -104,9 +91,9 @@ struct spu_context {
 	/* statistics */
 	struct {
 		/* updates protected by ctx->state_mutex */
-		enum spuctx_execution_state execution_state;
-		unsigned long tstamp;		/* time of last ctx switch */
-		unsigned long times[SPUCTX_UTIL_MAX];
+		enum spu_utilization_state util_state;
+		unsigned long long tstamp;	/* time of last state switch */
+		unsigned long long times[SPU_UTIL_MAX];
 		unsigned long long vol_ctx_switch;
 		unsigned long long invol_ctx_switch;
 		unsigned long long min_flt;
@@ -293,30 +280,34 @@ extern int spufs_coredump_num_notes;
  * line.
  */
 static inline void spuctx_switch_state(struct spu_context *ctx,
-		enum spuctx_execution_state new_state)
+		enum spu_utilization_state new_state)
 {
-	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
+	unsigned long long curtime;
+	signed long long delta;
+	struct timespec ts;
+	struct spu *spu;
+	enum spu_utilization_state old_state;
 
-	if (ctx->stats.execution_state != new_state) {
-		unsigned long curtime = jiffies;
+	ktime_get_ts(&ts);
+	curtime = timespec_to_ns(&ts);
+	delta = curtime - ctx->stats.tstamp;
 
-		ctx->stats.times[ctx->stats.execution_state] +=
-				 curtime - ctx->stats.tstamp;
-		ctx->stats.tstamp = curtime;
-		ctx->stats.execution_state = new_state;
-	}
-}
-
-static inline void spu_switch_state(struct spu *spu,
-		enum spuctx_execution_state new_state)
-{
-	if (spu->stats.utilization_state != new_state) {
-		unsigned long curtime = jiffies;
-
-		spu->stats.times[spu->stats.utilization_state] +=
-				 curtime - spu->stats.tstamp;
+	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
+	WARN_ON(delta < 0);
+
+	spu = ctx->spu;
+	old_state = ctx->stats.util_state;
+	ctx->stats.util_state = new_state;
+	ctx->stats.tstamp = curtime;
+
+	/*
+	 * Update the physical SPU utilization statistics.
+	 */
+	if (spu) {
+		ctx->stats.times[old_state] += delta;
+		spu->stats.times[old_state] += delta;
+		spu->stats.util_state = new_state;
 		spu->stats.tstamp = curtime;
-		spu->stats.utilization_state = new_state;
 	}
 }
 
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index a034f03b810..12442acdc76 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -107,10 +107,10 @@ struct spu_runqueue;
 struct device_node;
 
 enum spu_utilization_state {
-	SPU_UTIL_SYSTEM,
 	SPU_UTIL_USER,
+	SPU_UTIL_SYSTEM,
 	SPU_UTIL_IOWAIT,
-	SPU_UTIL_IDLE,
+	SPU_UTIL_IDLE_LOADED,
 	SPU_UTIL_MAX
 };
 
@@ -167,9 +167,9 @@ struct spu {
 
 	struct {
 		/* protected by interrupt reentrancy */
-		enum spu_utilization_state utilization_state;
-		unsigned long tstamp;		/* time of last ctx switch */
-		unsigned long times[SPU_UTIL_MAX];
+		enum spu_utilization_state util_state;
+		unsigned long long tstamp;
+		unsigned long long times[SPU_UTIL_MAX];
 		unsigned long long vol_ctx_switch;
 		unsigned long long invol_ctx_switch;
 		unsigned long long min_flt;
-- 
cgit v1.2.3-70-g09d2


From 27b1ea091f0c088ecad0d492f37fbe7b8d54d7dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 20 Jul 2007 21:39:34 +0200
Subject: [CELL] spufs: make sure context are scheduled again after
 spu_acquire_saved

Currently a process is removed from the physical spu when spu_acquire_saved
is called but never put back.  This patch adds a new spu_release_saved
that is to be paired with spu_acquire_saved and puts the process back if
it has been in RUNNABLE state before.

Neither Jeremy nor I am entirely happy about this exact patch because
it adds another spu_activate call outside of the owner thread, but I
feel this is the best short-term fix we can come up with.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/context.c  | 18 +++++++++++-
 arch/powerpc/platforms/cell/spufs/coredump.c |  2 +-
 arch/powerpc/platforms/cell/spufs/file.c     | 42 ++++++++++++++--------------
 arch/powerpc/platforms/cell/spufs/spufs.h    |  5 ++++
 4 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 0e5e55f53c8..6b091ea1d19 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -165,6 +165,22 @@ int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags)
 void spu_acquire_saved(struct spu_context *ctx)
 {
 	spu_acquire(ctx);
-	if (ctx->state != SPU_STATE_SAVED)
+	if (ctx->state != SPU_STATE_SAVED) {
+		set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
 		spu_deactivate(ctx);
+	}
+}
+
+/**
+ * spu_release_saved - unlock spu context and return it to the runqueue
+ * @ctx:	context to unlock
+ */
+void spu_release_saved(struct spu_context *ctx)
+{
+	BUG_ON(ctx->state != SPU_STATE_SAVED);
+
+	if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags))
+		spu_activate(ctx, 0);
+
+	spu_release(ctx);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 5d9ad5a0307..5e31799b1e3 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -226,7 +226,7 @@ static void spufs_arch_write_notes(struct file *file)
 		spu_acquire_saved(ctx_info->ctx);
 		for (j = 0; j < spufs_coredump_num_notes; j++)
 			spufs_arch_write_note(ctx_info, j, file);
-		spu_release(ctx_info->ctx);
+		spu_release_saved(ctx_info->ctx);
 		list_del(&ctx_info->list);
 		kfree(ctx_info);
 	}
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 9351db9472d..88da996f6d2 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -370,7 +370,7 @@ spufs_regs_read(struct file *file, char __user *buffer,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_regs_read(ctx, buffer, size, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -392,7 +392,7 @@ spufs_regs_write(struct file *file, const char __user *buffer,
 	ret = copy_from_user(lscsa->gprs + *pos - size,
 			     buffer, size) ? -EFAULT : size;
 
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -421,7 +421,7 @@ spufs_fpcr_read(struct file *file, char __user * buffer,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_fpcr_read(ctx, buffer, size, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -443,7 +443,7 @@ spufs_fpcr_write(struct file *file, const char __user * buffer,
 	ret = copy_from_user((char *)&lscsa->fpcr + *pos - size,
 			     buffer, size) ? -EFAULT : size;
 
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 
@@ -868,7 +868,7 @@ static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_signal1_read(ctx, buf, len, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -999,7 +999,7 @@ static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_signal2_read(ctx, buf, len, pos);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1626,7 +1626,7 @@ static void spufs_decr_set(void *data, u64 val)
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
 	lscsa->decr.slot[0] = (u32) val;
-	spu_release(ctx);
+	spu_release_saved(ctx);
 }
 
 static u64 __spufs_decr_get(void *data)
@@ -1642,7 +1642,7 @@ static u64 spufs_decr_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = __spufs_decr_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
@@ -1654,7 +1654,7 @@ static void spufs_decr_status_set(void *data, u64 val)
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
 	lscsa->decr_status.slot[0] = (u32) val;
-	spu_release(ctx);
+	spu_release_saved(ctx);
 }
 
 static u64 __spufs_decr_status_get(void *data)
@@ -1670,7 +1670,7 @@ static u64 spufs_decr_status_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = __spufs_decr_status_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get,
@@ -1682,7 +1682,7 @@ static void spufs_event_mask_set(void *data, u64 val)
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
 	lscsa->event_mask.slot[0] = (u32) val;
-	spu_release(ctx);
+	spu_release_saved(ctx);
 }
 
 static u64 __spufs_event_mask_get(void *data)
@@ -1698,7 +1698,7 @@ static u64 spufs_event_mask_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = __spufs_event_mask_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_event_mask_ops, spufs_event_mask_get,
@@ -1722,7 +1722,7 @@ static u64 spufs_event_status_get(void *data)
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_event_status_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get,
@@ -1734,7 +1734,7 @@ static void spufs_srr0_set(void *data, u64 val)
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
 	lscsa->srr0.slot[0] = (u32) val;
-	spu_release(ctx);
+	spu_release_saved(ctx);
 }
 
 static u64 spufs_srr0_get(void *data)
@@ -1744,7 +1744,7 @@ static u64 spufs_srr0_get(void *data)
 	u64 ret;
 	spu_acquire_saved(ctx);
 	ret = lscsa->srr0.slot[0];
-	spu_release(ctx);
+	spu_release_saved(ctx);
 	return ret;
 }
 DEFINE_SIMPLE_ATTRIBUTE(spufs_srr0_ops, spufs_srr0_get, spufs_srr0_set,
@@ -1800,7 +1800,7 @@ static u64 spufs_lslr_get(void *data)
 
 	spu_acquire_saved(ctx);
 	ret = __spufs_lslr_get(data);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1864,7 +1864,7 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_mbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1902,7 +1902,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_ibox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1943,7 +1943,7 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_wbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -1993,7 +1993,7 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_dma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
@@ -2044,7 +2044,7 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_proxydma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
-	spu_release(ctx);
+	spu_release_saved(ctx);
 
 	return ret;
 }
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index fdace928437..1438aa2c346 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -40,6 +40,10 @@ enum {
 struct spu_context_ops;
 struct spu_gang;
 
+enum {
+	SPU_SCHED_WAS_ACTIVE,	/* was active upon spu_acquire_saved()  */
+};
+
 struct spu_context {
 	struct spu *spu;		  /* pointer to a physical SPU */
 	struct spu_state csa;		  /* SPU context save area. */
@@ -214,6 +218,7 @@ void spu_unmap_mappings(struct spu_context *ctx);
 void spu_forget(struct spu_context *ctx);
 int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags);
 void spu_acquire_saved(struct spu_context *ctx);
+void spu_release_saved(struct spu_context *ctx);
 
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
-- 
cgit v1.2.3-70-g09d2


From daced0f718b92b0bcdb9790622c255d4660f51ce Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Fri, 20 Jul 2007 21:39:35 +0200
Subject: [CELL] spufs: fix array size of channel index

Based on a fix from Masato Noguchi <Masato.Noguchi@jp.sony.com>.

Remove the (incorrect) array size declarations in the spufs channel
arrays, and use ARRAY_SIZE rather than hardcoded values.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/switch.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 827aada391f..861336e9944 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -616,7 +616,7 @@ static inline void save_ppuint_mb(struct spu_state *csa, struct spu *spu)
 static inline void save_ch_part1(struct spu_state *csa, struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
-	u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+	u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
 	int i;
 
 	/* Save, Step 42:
@@ -627,7 +627,7 @@ static inline void save_ch_part1(struct spu_state *csa, struct spu *spu)
 	csa->spu_chnldata_RW[1] = in_be64(&priv2->spu_chnldata_RW);
 
 	/* Save the following CH: [0,3,4,24,25,27] */
-	for (i = 0; i < 7; i++) {
+	for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
 		idx = ch_indices[i];
 		out_be64(&priv2->spu_chnlcntptr_RW, idx);
 		eieio();
@@ -1091,7 +1091,7 @@ static inline void clear_spu_status(struct spu_state *csa, struct spu *spu)
 static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
-	u64 ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+	u64 ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
 	u64 idx;
 	int i;
 
@@ -1103,7 +1103,7 @@ static inline void reset_ch_part1(struct spu_state *csa, struct spu *spu)
 	out_be64(&priv2->spu_chnldata_RW, 0UL);
 
 	/* Reset the following CH: [0,3,4,24,25,27] */
-	for (i = 0; i < 7; i++) {
+	for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
 		idx = ch_indices[i];
 		out_be64(&priv2->spu_chnlcntptr_RW, idx);
 		eieio();
@@ -1563,7 +1563,7 @@ static inline void restore_decr_wrapped(struct spu_state *csa, struct spu *spu)
 static inline void restore_ch_part1(struct spu_state *csa, struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
-	u64 idx, ch_indices[7] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
+	u64 idx, ch_indices[] = { 0UL, 3UL, 4UL, 24UL, 25UL, 27UL };
 	int i;
 
 	/* Restore, Step 59:
@@ -1574,7 +1574,7 @@ static inline void restore_ch_part1(struct spu_state *csa, struct spu *spu)
 	out_be64(&priv2->spu_chnldata_RW, csa->spu_chnldata_RW[1]);
 
 	/* Restore the following CH: [0,3,4,24,25,27] */
-	for (i = 0; i < 7; i++) {
+	for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
 		idx = ch_indices[i];
 		out_be64(&priv2->spu_chnlcntptr_RW, idx);
 		eieio();
-- 
cgit v1.2.3-70-g09d2


From cfd529b25d9b1d48423b85d76066348e2459e646 Mon Sep 17 00:00:00 2001
From: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Date: Fri, 20 Jul 2007 21:39:36 +0200
Subject: [CELL] spufs: remove needless context save/restore code

The following steps are not needed in the SPE context save/restore
paths:

Save Step 12: save_mfc_decr()
  save suspend_time to CSA (It will be done by step 14)
  save ch 7 (decrementer value will be saved in LSCSA by spe-side step 10)

Restore Step 59: restore_ch_part1()
  restore ch 1 (it will be done by spe-side step 15)

This change removes the unnecessary steps.

Signed-off-by: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/switch.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 861336e9944..a08fe93817f 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -253,11 +253,6 @@ static inline void save_mfc_decr(struct spu_state *csa, struct spu *spu)
 	 */
 	if (in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING) {
 		csa->priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
-		csa->suspend_time = get_cycles();
-		out_be64(&priv2->spu_chnlcntptr_RW, 7ULL);
-		eieio();
-		csa->spu_chnldata_RW[7] = in_be64(&priv2->spu_chnldata_RW);
-		eieio();
 	} else {
 		csa->priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
 	}
@@ -1567,13 +1562,8 @@ static inline void restore_ch_part1(struct spu_state *csa, struct spu *spu)
 	int i;
 
 	/* Restore, Step 59:
+	 *	Restore the following CH: [0,3,4,24,25,27]
 	 */
-
-	/* Restore CH 1 without count */
-	out_be64(&priv2->spu_chnlcntptr_RW, 1);
-	out_be64(&priv2->spu_chnldata_RW, csa->spu_chnldata_RW[1]);
-
-	/* Restore the following CH: [0,3,4,24,25,27] */
 	for (i = 0; i < ARRAY_SIZE(ch_indices); i++) {
 		idx = ch_indices[i];
 		out_be64(&priv2->spu_chnlcntptr_RW, idx);
-- 
cgit v1.2.3-70-g09d2


From 1cfc0f86eb0348dd04ace8c2171642ebe9cd87bb Mon Sep 17 00:00:00 2001
From: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Date: Fri, 20 Jul 2007 21:39:37 +0200
Subject: [CELL] spufs: fix decr_status meanings

The decr_status in the LSCSA is confusingly used with two meanings:
 * SPU decrementer was running
 * SPU decrementer was wrapped as a result of the adjustment
and the code to set decr_status is missing.

This patch fixes these problems by using the decr_status argument as a
set of flags. This requires a rebuild of the shipped spu_restore code.

Signed-off-by: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/spu_restore.c    |   2 +-
 .../cell/spufs/spu_restore_dump.h_shipped          | 470 +++++++++++----------
 arch/powerpc/platforms/cell/spufs/switch.c         |  12 +-
 include/asm-powerpc/spu_csa.h                      |   8 +-
 4 files changed, 269 insertions(+), 223 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore.c b/arch/powerpc/platforms/cell/spufs/spu_restore.c
index 4e19ed7a075..7114e033460 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore.c
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore.c
@@ -90,7 +90,7 @@ static inline void restore_decr(void)
 	 *    decrementer value from LSCSA.
 	 */
 	offset = LSCSA_QW_OFFSET(decr_status);
-	decr_running = regs_spill[offset].slot[0];
+	decr_running = regs_spill[offset].slot[0] & SPU_DECR_STATUS_RUNNING;
 	if (decr_running) {
 		offset = LSCSA_QW_OFFSET(decr);
 		decr = regs_spill[offset].slot[0];
diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
index 15183d209b5..799815e2237 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
@@ -10,7 +10,7 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x24fd8081,
 0x1cd80081,
 0x33001180,
-0x42030003,
+0x42034003,
 0x33800284,
 0x1c010204,
 0x40200000,
@@ -24,22 +24,22 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x23fffd84,
 0x1c100183,
 0x217ffa85,
-0x3080a000,
-0x3080a201,
-0x3080a402,
-0x3080a603,
-0x3080a804,
-0x3080aa05,
-0x3080ac06,
-0x3080ae07,
-0x3080b008,
-0x3080b209,
-0x3080b40a,
-0x3080b60b,
-0x3080b80c,
-0x3080ba0d,
-0x3080bc0e,
-0x3080be0f,
+0x3080b000,
+0x3080b201,
+0x3080b402,
+0x3080b603,
+0x3080b804,
+0x3080ba05,
+0x3080bc06,
+0x3080be07,
+0x3080c008,
+0x3080c209,
+0x3080c40a,
+0x3080c60b,
+0x3080c80c,
+0x3080ca0d,
+0x3080cc0e,
+0x3080ce0f,
 0x00003ffc,
 0x00000000,
 0x00000000,
@@ -48,19 +48,18 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x3ec00083,
 0xb0a14103,
 0x01a00204,
-0x3ec10082,
-0x4202800e,
-0x04000703,
-0xb0a14202,
-0x21a00803,
-0x3fbf028d,
-0x3f20068d,
-0x3fbe0682,
+0x3ec10083,
+0x4202c002,
+0xb0a14203,
+0x21a00802,
+0x3fbf028a,
+0x3f20050a,
+0x3fbe0502,
 0x3fe30102,
 0x21a00882,
-0x3f82028f,
-0x3fe3078f,
-0x3fbf0784,
+0x3f82028b,
+0x3fe3058b,
+0x3fbf0584,
 0x3f200204,
 0x3fbe0204,
 0x3fe30204,
@@ -75,52 +74,46 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x21a00083,
 0x40800082,
 0x21a00b02,
-0x10002818,
-0x42a00002,
-0x32800007,
-0x4207000c,
-0x18008208,
-0x40a0000b,
-0x4080020a,
-0x40800709,
-0x00200000,
-0x42070002,
-0x3ac30384,
+0x10002612,
+0x42a00003,
+0x42074006,
+0x1800c204,
+0x40a00008,
+0x40800789,
+0x1c010305,
+0x34000302,
 0x1cffc489,
-0x00200000,
-0x18008383,
-0x38830382,
-0x4cffc486,
-0x3ac28185,
-0xb0408584,
-0x28830382,
-0x1c020387,
-0x38828182,
-0xb0408405,
-0x1802c408,
-0x28828182,
-0x217ff886,
-0x04000583,
+0x3ec00303,
+0x3ec00287,
+0xb0408403,
+0x24000302,
+0x34000282,
+0x1c020306,
+0xb0408207,
+0x18020204,
+0x24000282,
+0x217ffa09,
+0x04000403,
 0x21a00803,
-0x3fbe0682,
+0x3fbe0502,
 0x3fe30102,
-0x04000106,
-0x21a00886,
-0x04000603,
-0x21a00903,
-0x40803c02,
-0x21a00982,
-0x40800003,
-0x04000184,
+0x04000105,
+0x21a00885,
+0x42074002,
+0x21a00902,
+0x40803c03,
+0x21a00983,
+0x04000484,
 0x21a00a04,
 0x40802202,
 0x21a00a82,
-0x42028005,
-0x34208702,
-0x21002282,
+0x30809c03,
+0x34000182,
+0x14004102,
+0x21002782,
 0x21a00804,
-0x21a00886,
-0x3fbf0782,
+0x21a00885,
+0x3fbf0582,
 0x3f200102,
 0x3fbe0102,
 0x3fe30102,
@@ -133,194 +126,233 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x40800083,
 0x21a00b83,
 0x01a00c02,
-0x01a00d83,
-0x3420c282,
+0x01a00d84,
+0x3080a003,
+0x34000182,
 0x21a00e02,
-0x34210283,
-0x21a00f03,
-0x34200284,
-0x77400200,
-0x3421c282,
+0x3080a203,
+0x34000182,
+0x21a00f02,
+0x3080a403,
+0x34000182,
+0x77400100,
+0x3080a603,
+0x34000182,
 0x21a00702,
-0x34218283,
-0x21a00083,
-0x34214282,
+0x3080a803,
+0x34000182,
+0x21a00082,
+0x3080aa03,
+0x34000182,
 0x21a00b02,
-0x4200480c,
-0x00200000,
-0x1c010286,
-0x34220284,
-0x34220302,
-0x0f608203,
-0x5c024204,
-0x3b81810b,
-0x42013c02,
-0x00200000,
-0x18008185,
-0x38808183,
-0x3b814182,
-0x21004e84,
+0x3080ae02,
+0x3080ac04,
+0x42004805,
+0x34000103,
+0x34000202,
+0x1cffc183,
+0x3b810106,
+0x0f608184,
+0x42013802,
+0x5c020183,
+0x38810102,
+0x3b810102,
+0x21000e83,
 0x4020007f,
 0x35000100,
-0x000004e0,
-0x000002a0,
-0x000002e8,
-0x00000428,
+0x00000470,
+0x000002f8,
+0x00000430,
 0x00000360,
-0x000002e8,
-0x000004a0,
-0x00000468,
+0x000002f8,
 0x000003c8,
+0x000004a8,
+0x00000298,
 0x00000360,
+0x00200000,
 0x409ffe02,
 0x30801203,
-0x40800204,
-0x3ec40085,
-0x10009c09,
-0x3ac10606,
-0xb060c105,
-0x4020007f,
-0x4020007f,
+0x40800208,
+0x3ec40084,
+0x40800407,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
-0x38810602,
-0xb0408586,
-0x28810602,
-0x32004180,
-0x34204702,
+0x38820282,
+0x41004003,
+0xb0408189,
+0x28820282,
+0x3881c282,
+0xb0408304,
+0x2881c282,
+0x00400000,
+0x40800003,
+0x35000000,
+0x30809e03,
+0x34000182,
 0x21a00382,
 0x4020007f,
-0x327fdc80,
+0x327fd700,
 0x409ffe02,
 0x30801203,
-0x40800204,
-0x3ec40087,
-0x40800405,
-0x00200000,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800206,
+0x3ec40084,
+0x40800407,
+0x40800608,
+0x3ac1828a,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
+0x38818282,
 0x41004003,
-0x38810602,
-0x4020007f,
-0xb0408188,
-0x4020007f,
-0x28810602,
-0x41201002,
-0x38814603,
-0x10009c09,
-0xb060c109,
-0x4020007f,
-0x28814603,
+0xb040818a,
+0x10005b0b,
+0x41201003,
+0x28818282,
+0x3881c282,
+0xb0408184,
 0x41193f83,
-0x38818602,
 0x60ffc003,
-0xb040818a,
-0x28818602,
-0x32003080,
+0x2881c282,
+0x38820282,
+0xb0408189,
+0x28820282,
+0x327fef80,
 0x409ffe02,
 0x30801203,
-0x40800204,
-0x3ec40087,
-0x41201008,
-0x10009c14,
-0x40800405,
-0x3ac10609,
-0x40800606,
-0x3ac1460a,
-0xb060c107,
-0x3ac1860b,
+0x40800207,
+0x3ec40086,
+0x4120100b,
+0x10005b14,
+0x40800404,
+0x3ac1c289,
+0x40800608,
+0xb060c106,
+0x3ac10286,
+0x3ac2028a,
 0x20801203,
-0x38810602,
-0xb0408409,
-0x28810602,
-0x38814603,
-0xb060c40a,
-0x4020007f,
-0x28814603,
+0x3881c282,
 0x41193f83,
-0x38818602,
 0x60ffc003,
-0xb040818b,
-0x28818602,
-0x32002380,
-0x409ffe02,
-0x30801204,
-0x40800205,
-0x3ec40083,
-0x40800406,
-0x3ac14607,
-0x3ac18608,
-0xb0810103,
-0x41004002,
-0x20801204,
-0x4020007f,
-0x38814603,
-0x10009c0b,
-0xb060c107,
-0x4020007f,
-0x4020007f,
-0x28814603,
-0x38818602,
-0x4020007f,
+0xb0408589,
+0x2881c282,
+0x38810282,
+0xb0408586,
+0x28810282,
+0x38820282,
+0xb040818a,
+0x28820282,
 0x4020007f,
-0xb0408588,
-0x28818602,
+0x327fe280,
+0x409ffe02,
+0x30801203,
+0x40800207,
+0x3ec40084,
+0x40800408,
+0x10005b14,
+0x40800609,
+0x3ac1c28a,
+0x3ac2028b,
+0xb060c104,
+0x3ac24284,
+0x20801203,
+0x41201003,
+0x3881c282,
+0xb040830a,
+0x2881c282,
+0x38820282,
+0xb040818b,
+0x41193f83,
+0x60ffc003,
+0x28820282,
+0x38824282,
+0xb0408184,
+0x28824282,
 0x4020007f,
-0x32001780,
+0x327fd580,
 0x409ffe02,
-0x1000640e,
-0x40800204,
+0x1000658e,
+0x40800206,
 0x30801203,
-0x40800405,
-0x3ec40087,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800407,
+0x3ec40084,
+0x40800608,
+0x3ac1828a,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
 0x413d8003,
-0x38810602,
+0x38818282,
+0x4020007f,
+0x327fd800,
+0x409ffe03,
+0x30801202,
+0x40800207,
+0x3ec40084,
+0x10005b09,
+0x3ac1c288,
+0xb0408184,
 0x4020007f,
-0x327fd780,
-0x409ffe02,
-0x10007f0c,
-0x40800205,
-0x30801204,
-0x40800406,
-0x3ec40083,
-0x3ac14607,
-0x3ac18608,
-0xb0810103,
-0x413d8002,
-0x20801204,
-0x38814603,
 0x4020007f,
-0x327feb80,
+0x20801202,
+0x3881c282,
+0xb0408308,
+0x2881c282,
+0x327fc680,
 0x409ffe02,
+0x1000588b,
+0x40800208,
 0x30801203,
-0x40800204,
-0x3ec40087,
-0x40800405,
-0x1000650a,
-0x40800606,
-0x3ac10608,
-0x3ac14609,
-0x3ac1860a,
-0xb060c107,
+0x40800407,
+0x3ec40084,
+0x3ac20289,
+0xb060c104,
+0x3ac1c284,
 0x20801203,
-0x38810602,
-0xb0408588,
-0x4020007f,
-0x327fc980,
-0x00400000,
-0x40800003,
-0x4020007f,
-0x35000000,
+0x413d8003,
+0x38820282,
+0x327fbd80,
+0x00200000,
+0x00000da0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000d90,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000db0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000dc0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000d80,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000df0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000de0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000dd0,
+0x00000000,
+0x00000000,
+0x00000000,
+0x00000e04,
+0x00000000,
+0x00000000,
 0x00000000,
+0x00000e00,
 0x00000000,
 0x00000000,
 0x00000000,
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index a08fe93817f..d4dea187484 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -1285,7 +1285,15 @@ static inline void setup_decr(struct spu_state *csa, struct spu *spu)
 		cycles_t resume_time = get_cycles();
 		cycles_t delta_time = resume_time - csa->suspend_time;
 
+		csa->lscsa->decr_status.slot[0] = SPU_DECR_STATUS_RUNNING;
+		if (csa->lscsa->decr.slot[0] < delta_time) {
+			csa->lscsa->decr_status.slot[0] |=
+				 SPU_DECR_STATUS_WRAPPED;
+		}
+
 		csa->lscsa->decr.slot[0] -= delta_time;
+	} else {
+		csa->lscsa->decr_status.slot[0] = 0;
 	}
 }
 
@@ -1544,10 +1552,10 @@ static inline void restore_decr_wrapped(struct spu_state *csa, struct spu *spu)
 	 *     "wrapped" flag is set, OR in a '1' to
 	 *     CSA.SPU_Event_Status[Tm].
 	 */
-	if (csa->lscsa->decr_status.slot[0] == 1) {
+	if (csa->lscsa->decr_status.slot[0] & SPU_DECR_STATUS_WRAPPED) {
 		csa->spu_chnldata_RW[0] |= 0x20;
 	}
-	if ((csa->lscsa->decr_status.slot[0] == 1) &&
+	if ((csa->lscsa->decr_status.slot[0] & SPU_DECR_STATUS_WRAPPED) &&
 	    (csa->spu_chnlcnt_RW[0] == 0 &&
 	     ((csa->spu_chnldata_RW[2] & 0x20) == 0x0) &&
 	     ((csa->spu_chnldata_RW[0] & 0x20) != 0x1))) {
diff --git a/include/asm-powerpc/spu_csa.h b/include/asm-powerpc/spu_csa.h
index c48ae185c87..e87794d5d4e 100644
--- a/include/asm-powerpc/spu_csa.h
+++ b/include/asm-powerpc/spu_csa.h
@@ -50,6 +50,12 @@
 #define SPU_STOPPED_STATUS_P_I  8
 #define SPU_STOPPED_STATUS_R    9
 
+/*
+ * Definitions for software decrementer status flag.
+ */
+#define SPU_DECR_STATUS_RUNNING 0x1
+#define SPU_DECR_STATUS_WRAPPED 0x2
+
 #ifndef  __ASSEMBLY__
 /**
  * spu_reg128 - generic 128-bit register definition.
@@ -63,7 +69,7 @@ struct spu_reg128 {
  * @gprs: Array of saved registers.
  * @fpcr: Saved floating point status control register.
  * @decr: Saved decrementer value.
- * @decr_status: Indicates decrementer run status.
+ * @decr_status: Indicates software decrementer status flags.
  * @ppu_mb: Saved PPU mailbox data.
  * @ppuint_mb: Saved PPU interrupting mailbox data.
  * @tag_mask: Saved tag group mask.
-- 
cgit v1.2.3-70-g09d2


From d40a01d4f4f205d0645beb371edc153d9ec8fb9f Mon Sep 17 00:00:00 2001
From: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Date: Fri, 20 Jul 2007 21:39:38 +0200
Subject: [CELL] spufs: fix read and write for decr_status file

The decr_status in the LSCSA is valid only during the context restore
sequence. Thus, it makes no sense to read and/or write it through spufs.

This patch changes the decr_status node to access MFC_CNTL[Ds] in the CSA.

Signed-off-by: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/file.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 88da996f6d2..7de4e919687 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1651,17 +1651,21 @@ DEFINE_SIMPLE_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
 static void spufs_decr_status_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
 	spu_acquire_saved(ctx);
-	lscsa->decr_status.slot[0] = (u32) val;
+	if (val)
+		ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
+	else
+		ctx->csa.priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
 	spu_release_saved(ctx);
 }
 
 static u64 __spufs_decr_status_get(void *data)
 {
 	struct spu_context *ctx = data;
-	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	return lscsa->decr_status.slot[0];
+	if (ctx->csa.priv2.mfc_control_RW & MFC_CNTL_DECREMENTER_RUNNING)
+		return SPU_DECR_STATUS_RUNNING;
+	else
+		return 0;
 }
 
 static u64 spufs_decr_status_get(void *data)
-- 
cgit v1.2.3-70-g09d2


From a103f347a5ae2735b9bf0a725a36c34be3f24c88 Mon Sep 17 00:00:00 2001
From: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Date: Fri, 20 Jul 2007 21:39:39 +0200
Subject: [CELL] spufs: limit saving MFC_CNTL bits

At save step 8, the mfc control register in the CSA should be written
_only_ with the Sc and Sm bits (at least MFC_CNTL[Dh] should be set to 0).

Signed-off-by: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/switch.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index d4dea187484..c970b14bf7d 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -180,7 +180,7 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
 	case MFC_CNTL_SUSPEND_COMPLETE:
 		if (csa) {
 			csa->priv2.mfc_control_RW =
-				in_be64(&priv2->mfc_control_RW) |
+				MFC_CNTL_SUSPEND_MASK |
 				MFC_CNTL_SUSPEND_DMA_QUEUE;
 		}
 		break;
@@ -190,9 +190,7 @@ static inline void save_mfc_cntl(struct spu_state *csa, struct spu *spu)
 				  MFC_CNTL_SUSPEND_DMA_STATUS_MASK) ==
 				 MFC_CNTL_SUSPEND_COMPLETE);
 		if (csa) {
-			csa->priv2.mfc_control_RW =
-				in_be64(&priv2->mfc_control_RW) &
-				~MFC_CNTL_SUSPEND_DMA_QUEUE;
+			csa->priv2.mfc_control_RW = 0;
 		}
 		break;
 	}
@@ -251,11 +249,8 @@ static inline void save_mfc_decr(struct spu_state *csa, struct spu *spu)
 	 *     Read MFC_CNTL[Ds].  Update saved copy of
 	 *     CSA.MFC_CNTL[Ds].
 	 */
-	if (in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING) {
-		csa->priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
-	} else {
-		csa->priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
-	}
+	csa->priv2.mfc_control_RW |=
+		in_be64(&priv2->mfc_control_RW) & MFC_CNTL_DECREMENTER_RUNNING;
 }
 
 static inline void halt_mfc_decr(struct spu_state *csa, struct spu *spu)
-- 
cgit v1.2.3-70-g09d2


From cf17df223c8cd56a92f34162f2a362eec9f4b157 Mon Sep 17 00:00:00 2001
From: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Date: Fri, 20 Jul 2007 21:39:40 +0200
Subject: [CELL] spufs: dont halt decrementer at restore step 47

No need to halt the SPE decrementer at context restore step 47, it will
be done in step 7.

Signed-off-by: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/switch.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index c970b14bf7d..27ffdae98e5 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -974,13 +974,13 @@ static inline void terminate_spu_app(struct spu_state *csa, struct spu *spu)
 	 */
 }
 
-static inline void suspend_mfc(struct spu_state *csa, struct spu *spu)
+static inline void suspend_mfc_and_halt_decr(struct spu_state *csa,
+		struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
 
 	/* Restore, Step 7:
-	 * Restore, Step 47.
-	 *     Write MFC_Cntl[Dh,Sc]='1','1' to suspend
+	 *     Write MFC_Cntl[Dh,Sc,Sm]='1','1','0' to suspend
 	 *     the queue and halt the decrementer.
 	 */
 	out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE |
@@ -1397,6 +1397,18 @@ static inline void restore_ls_16kb(struct spu_state *csa, struct spu *spu)
 	send_mfc_dma(spu, addr, ls_offset, size, tag, rclass, cmd);
 }
 
+static inline void suspend_mfc(struct spu_state *csa, struct spu *spu)
+{
+	struct spu_priv2 __iomem *priv2 = spu->priv2;
+
+	/* Restore, Step 47.
+	 *     Write MFC_Cntl[Sc,Sm]='1','0' to suspend
+	 *     the queue.
+	 */
+	out_be64(&priv2->mfc_control_RW, MFC_CNTL_SUSPEND_DMA_QUEUE);
+	eieio();
+}
+
 static inline void clear_interrupts(struct spu_state *csa, struct spu *spu)
 {
 	/* Restore, Step 49:
@@ -1926,7 +1938,7 @@ static void harvest(struct spu_state *prev, struct spu *spu)
 	set_switch_pending(prev, spu);	        /* Step 5.  */
 	stop_spu_isolate(spu);			/* NEW.     */
 	remove_other_spu_access(prev, spu);	/* Step 6.  */
-	suspend_mfc(prev, spu);	                /* Step 7.  */
+	suspend_mfc_and_halt_decr(prev, spu);	/* Step 7.  */
 	wait_suspend_mfc_complete(prev, spu);	/* Step 8.  */
 	if (!suspend_spe(prev, spu))	        /* Step 9.  */
 		clear_spu_status(prev, spu);	/* Step 10. */
-- 
cgit v1.2.3-70-g09d2


From ca53da3abb1d49748931ff2acb66d5a6eeeba2a1 Mon Sep 17 00:00:00 2001
From: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Date: Fri, 20 Jul 2007 21:39:41 +0200
Subject: [CELL] spufs: change decrementer restore timing

The SPU decrementer should be restored after the LSCSA DMA has
completed.

Signed-off-by: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/spu_restore.c    |  4 +--
 .../cell/spufs/spu_restore_dump.h_shipped          | 36 +++++++++++-----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore.c b/arch/powerpc/platforms/cell/spufs/spu_restore.c
index 7114e033460..21a9c952d88 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore.c
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore.c
@@ -84,7 +84,7 @@ static inline void restore_decr(void)
 	unsigned int decr_running;
 	unsigned int decr;
 
-	/* Restore, Step 6:
+	/* Restore, Step 6(moved):
 	 *    If the LSCSA "decrementer running" flag is set
 	 *    then write the SPU_WrDec channel with the
 	 *    decrementer value from LSCSA.
@@ -318,10 +318,10 @@ int main()
 	build_dma_list(lscsa_ea);	/* Step 3.  */
 	restore_upper_240kb(lscsa_ea);	/* Step 4.  */
 					/* Step 5: done by 'exit'. */
-	restore_decr();			/* Step 6. */
 	enqueue_putllc(lscsa_ea);	/* Step 7. */
 	set_tag_update();		/* Step 8. */
 	read_tag_status();		/* Step 9. */
+	restore_decr();			/* moved Step 6. */
 	read_llar_status();		/* Step 10. */
 	write_ppu_mb();			/* Step 11. */
 	write_ppuint_mb();		/* Step 12. */
diff --git a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
index 799815e2237..f383b027e8b 100644
--- a/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
+++ b/arch/powerpc/platforms/cell/spufs/spu_restore_dump.h_shipped
@@ -93,26 +93,21 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x18020204,
 0x24000282,
 0x217ffa09,
-0x04000403,
-0x21a00803,
-0x3fbe0502,
-0x3fe30102,
-0x04000105,
-0x21a00885,
+0x04000402,
+0x21a00802,
+0x3fbe0504,
+0x3fe30204,
+0x21a00884,
 0x42074002,
 0x21a00902,
 0x40803c03,
 0x21a00983,
-0x04000484,
-0x21a00a04,
+0x04000485,
+0x21a00a05,
 0x40802202,
 0x21a00a82,
-0x30809c03,
-0x34000182,
-0x14004102,
-0x21002782,
-0x21a00804,
-0x21a00885,
+0x21a00805,
+0x21a00884,
 0x3fbf0582,
 0x3f200102,
 0x3fbe0102,
@@ -120,13 +115,17 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x21a00902,
 0x40804003,
 0x21a00983,
-0x21a00a04,
+0x21a00a05,
 0x40805a02,
 0x21a00a82,
 0x40800083,
 0x21a00b83,
 0x01a00c02,
-0x01a00d84,
+0x30809c03,
+0x34000182,
+0x14004102,
+0x21002082,
+0x01a00d82,
 0x3080a003,
 0x34000182,
 0x21a00e02,
@@ -145,9 +144,10 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x3080aa03,
 0x34000182,
 0x21a00b02,
+0x4020007f,
 0x3080ae02,
-0x3080ac04,
 0x42004805,
+0x3080ac04,
 0x34000103,
 0x34000202,
 0x1cffc183,
@@ -193,7 +193,7 @@ static unsigned int spu_restore_code[]  __attribute__((__aligned__(128))) = {
 0x34000182,
 0x21a00382,
 0x4020007f,
-0x327fd700,
+0x327fde00,
 0x409ffe02,
 0x30801203,
 0x40800206,
-- 
cgit v1.2.3-70-g09d2


From 50af32a94beef566664022254d677504e51b6139 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Fri, 20 Jul 2007 21:39:42 +0200
Subject: [CELL] spufs: remove unused file argument from spufs_run_spu()

From: Sebastian Siewior <cbe-oss-dev@ml.breakpoint.cc>

The 'file' argument is unused in spufs_run_spu(). This change removes
it.

Signed-off-by: Sebastian Siewior <sebastian@breakpoint.cc>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/run.c      | 3 +--
 arch/powerpc/platforms/cell/spufs/spufs.h    | 3 +--
 arch/powerpc/platforms/cell/spufs/syscalls.c | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 8c91b3f9315..c0238ea5b55 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -295,8 +295,7 @@ static inline int spu_process_events(struct spu_context *ctx)
 	return ret;
 }
 
-long spufs_run_spu(struct file *file, struct spu_context *ctx,
-		   u32 *npc, u32 *event)
+long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 {
 	int ret;
 	u32 status;
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 1438aa2c346..03e8315f6f9 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -181,8 +181,7 @@ extern struct tree_descr spufs_dir_contents[];
 extern struct tree_descr spufs_dir_nosched_contents[];
 
 /* system call implementation */
-long spufs_run_spu(struct file *file,
-		   struct spu_context *ctx, u32 *npc, u32 *status);
+long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status);
 long spufs_create(struct nameidata *nd,
 			 unsigned int flags, mode_t mode);
 extern const struct file_operations spufs_context_fops;
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 8e37bdf4dfd..13a383c67ca 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -47,7 +47,7 @@ static long do_spu_run(struct file *filp,
 		goto out;
 
 	i = SPUFS_I(filp->f_path.dentry->d_inode);
-	ret = spufs_run_spu(filp, i->i_ctx, &npc, &status);
+	ret = spufs_run_spu(i->i_ctx, &npc, &status);
 
 	if (put_user(npc, unpc))
 		ret = -EFAULT;
-- 
cgit v1.2.3-70-g09d2


From 7e90b74967ea54dbd6eb539e1cb151ec37f63d7f Mon Sep 17 00:00:00 2001
From: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Date: Fri, 20 Jul 2007 21:39:43 +0200
Subject: [CELL] spufs: use find_first_bit() instead of sched_find_first_bit()

spu_sched->bitmap has MAX_PRIO(=140) width in bits. However, since
ff80a77f20f811c0cc5b251d0f657cbc6f788385, sched_find_first_bit()
only supports 100-bit bitmaps.

Thus, spu_sched->bitmap should be treated by generic find_first_bit().

Signed-off-by: Masato Noguchi <Masato.Noguchi@jp.sony.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index ecd9e95116a..12c09665404 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -502,7 +502,7 @@ static struct spu_context *grab_runnable_context(int prio, int node)
 	int best;
 
 	spin_lock(&spu_prio->runq_lock);
-	best = sched_find_first_bit(spu_prio->bitmap);
+	best = find_first_bit(spu_prio->bitmap, prio);
 	while (best < prio) {
 		struct list_head *rq = &spu_prio->runq[best];
 
@@ -738,7 +738,6 @@ int __init spu_sched_init(void)
 		INIT_LIST_HEAD(&spu_prio->runq[i]);
 		__clear_bit(i, spu_prio->bitmap);
 	}
-	__set_bit(MAX_PRIO, spu_prio->bitmap);
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		mutex_init(&spu_prio->active_mutex[i]);
 		INIT_LIST_HEAD(&spu_prio->active_list[i]);
-- 
cgit v1.2.3-70-g09d2


From aa6d5b20254a21b69092dd839b70ee148303ef25 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:44 +0200
Subject: [CELL] cell: add per BE structure with info about its SPUs

Addition of a spufs-global "cbe_info" array. Each entry contains information
about one Cell/B.E. node, namely:
* list of spus (both free and busy spus are in this list);
* list of free spus (replacing the static spu_list from spu_base.c)
* number of spus;
* number of reserved (non-schedulable) spus.

The SPE affinity implementation actually requires access to only one spu per
BE node (since it implements its own pointer to walk through the other spus
of the ring) and the number of schedulable spus (n_spus - non_sched_spus).
However, having this more general structure can be useful for other
functionality, concentrating per-cbe statistics / data.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c    | 21 ++++++++++++++-------
 arch/powerpc/platforms/cell/spufs/sched.c |  5 +++++
 include/asm-powerpc/spu.h                 | 10 ++++++++++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index caaf2bf78ca..dd632e5feff 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -41,7 +41,6 @@ EXPORT_SYMBOL_GPL(spu_management_ops);
 
 const struct spu_priv1_ops *spu_priv1_ops;
 
-static struct list_head spu_list[MAX_NUMNODES];
 static LIST_HEAD(spu_full_list);
 static DEFINE_MUTEX(spu_mutex);
 static DEFINE_SPINLOCK(spu_list_lock);
@@ -429,8 +428,9 @@ struct spu *spu_alloc_node(int node)
 	struct spu *spu = NULL;
 
 	mutex_lock(&spu_mutex);
-	if (!list_empty(&spu_list[node])) {
-		spu = list_entry(spu_list[node].next, struct spu, list);
+	if (!list_empty(&cbe_spu_info[node].free_spus)) {
+		spu = list_entry(cbe_spu_info[node].free_spus.next, struct spu,
+									list);
 		list_del_init(&spu->list);
 		pr_debug("Got SPU %d %d\n", spu->number, spu->node);
 	}
@@ -459,7 +459,7 @@ struct spu *spu_alloc(void)
 void spu_free(struct spu *spu)
 {
 	mutex_lock(&spu_mutex);
-	list_add_tail(&spu->list, &spu_list[spu->node]);
+	list_add_tail(&spu->list, &cbe_spu_info[spu->node].free_spus);
 	mutex_unlock(&spu_mutex);
 }
 EXPORT_SYMBOL_GPL(spu_free);
@@ -582,7 +582,9 @@ static int __init create_spu(void *data)
 
 	mutex_lock(&spu_mutex);
 	spin_lock_irqsave(&spu_list_lock, flags);
-	list_add(&spu->list, &spu_list[spu->node]);
+	list_add(&spu->list, &cbe_spu_info[spu->node].free_spus);
+	list_add(&spu->cbe_list, &cbe_spu_info[spu->node].spus);
+	cbe_spu_info[spu->node].n_spus++;
 	list_add(&spu->full_list, &spu_full_list);
 	spin_unlock_irqrestore(&spu_list_lock, flags);
 	mutex_unlock(&spu_mutex);
@@ -650,12 +652,17 @@ static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf)
 
 static SYSDEV_ATTR(stat, 0644, spu_stat_show, NULL);
 
+struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
+EXPORT_SYMBOL_GPL(cbe_spu_info);
+
 static int __init init_spu_base(void)
 {
 	int i, ret = 0;
 
-	for (i = 0; i < MAX_NUMNODES; i++)
-		INIT_LIST_HEAD(&spu_list[i]);
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		INIT_LIST_HEAD(&cbe_spu_info[i].spus);
+		INIT_LIST_HEAD(&cbe_spu_info[i].free_spus);
+	}
 
 	if (!spu_management_ops)
 		goto out;
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 12c09665404..6d0ab72cc70 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -231,6 +231,9 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 		 spu->number, spu->node);
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
+	if (ctx->flags & SPU_CREATE_NOSCHED)
+		atomic_inc(&cbe_spu_info[spu->node].reserved_spus);
+
 	ctx->stats.slb_flt_base = spu->stats.slb_flt;
 	ctx->stats.class2_intr_base = spu->stats.class2_intr;
 
@@ -267,6 +270,8 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 		 spu->pid, spu->number, spu->node);
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
+ 	if (spu->ctx->flags & SPU_CREATE_NOSCHED)
+		atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
 	spu_switch_notify(spu, NULL);
 	spu_unmap_mappings(ctx);
 	spu_save(&ctx->csa, spu);
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 12442acdc76..2f2fe9f1c09 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -122,6 +122,7 @@ struct spu {
 	struct spu_problem __iomem *problem;
 	struct spu_priv2 __iomem *priv2;
 	struct list_head list;
+	struct list_head cbe_list;
 	struct list_head sched_list;
 	struct list_head full_list;
 	int number;
@@ -181,6 +182,15 @@ struct spu {
 	} stats;
 };
 
+struct cbe_spu_info {
+	struct list_head spus;
+	struct list_head free_spus;
+	int n_spus;
+	atomic_t reserved_spus;
+};
+
+extern struct cbe_spu_info cbe_spu_info[];
+
 struct spu *spu_alloc(void);
 struct spu *spu_alloc_node(int node);
 void spu_free(struct spu *spu);
-- 
cgit v1.2.3-70-g09d2


From 9d92af621f193c1c889ac8b6fd8c987ccd8aae1f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:45 +0200
Subject: [CELL] cell: add vicinity information on spus

This patch adds affinity data to each spu instance.
A doubly linked list is created, meant to connect the spus
in the physical order they are placed in the BE. SPUs
near to memory should be marked as having memory affinity.
Adjustment of the fields according to FW properties is done
in separate patches, one for CPBW, one for Malta (patch for
Malta under testing).

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c | 2 ++
 include/asm-powerpc/spu.h              | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index dd632e5feff..0fc2e12a3c8 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -593,6 +593,8 @@ static int __init create_spu(void *data)
 	ktime_get_ts(&ts);
 	spu->stats.tstamp = timespec_to_ns(&ts);
 
+	INIT_LIST_HEAD(&spu->aff_list);
+
 	goto out;
 
 out_free_irqs:
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 2f2fe9f1c09..18e558bef98 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -166,6 +166,9 @@ struct spu {
 
 	struct sys_device sysdev;
 
+	int has_mem_affinity;
+	struct list_head aff_list;
+
 	struct {
 		/* protected by interrupt reentrancy */
 		enum spu_utilization_state util_state;
-- 
cgit v1.2.3-70-g09d2


From 3ad216cae837d90415c605e1149e6fd88f51c973 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:46 +0200
Subject: [CELL] cell: add hardcoded spu vicinity information for QS20

This patch allows the use of spu affinity on QS20, whose
original FW does not provide affinity information.
This is done through two hardcoded arrays, and by reading the reg
property from each spu.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c | 55 +++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 0fc2e12a3c8..75b5af0a7e2 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -35,6 +35,8 @@
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
 #include <asm/xmon.h>
+#include <asm/prom.h>
+#include "spu_priv1_mmio.h"
 
 const struct spu_management_ops *spu_management_ops;
 EXPORT_SYMBOL_GPL(spu_management_ops);
@@ -657,6 +659,52 @@ static SYSDEV_ATTR(stat, 0644, spu_stat_show, NULL);
 struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
 EXPORT_SYMBOL_GPL(cbe_spu_info);
 
+/* Hardcoded affinity idxs for QS20 */
+#define SPES_PER_BE 8
+static int QS20_reg_idxs[SPES_PER_BE] =   { 0, 2, 4, 6, 7, 5, 3, 1 };
+static int QS20_reg_memory[SPES_PER_BE] = { 1, 1, 0, 0, 0, 0, 0, 0 };
+
+static struct spu *spu_lookup_reg(int node, u32 reg)
+{
+	struct spu *spu;
+
+	list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+		if (*(u32 *)get_property(spu_devnode(spu), "reg", NULL) == reg)
+			return spu;
+	}
+	return NULL;
+}
+
+static void init_aff_QS20_harcoded(void)
+{
+	int node, i;
+	struct spu *last_spu, *spu;
+	u32 reg;
+
+	for (node = 0; node < MAX_NUMNODES; node++) {
+		last_spu = NULL;
+		for (i = 0; i < SPES_PER_BE; i++) {
+			reg = QS20_reg_idxs[i];
+			spu = spu_lookup_reg(node, reg);
+			if (!spu)
+				continue;
+			spu->has_mem_affinity = QS20_reg_memory[reg];
+			if (last_spu)
+				list_add_tail(&spu->aff_list,
+						&last_spu->aff_list);
+			last_spu = spu;
+		}
+	}
+}
+
+static int of_has_vicinity(void)
+{
+	struct spu* spu;
+
+	spu = list_entry(cbe_spu_info[0].spus.next, struct spu, cbe_list);
+	return of_find_property(spu_devnode(spu), "vicinity", NULL) != NULL;
+}
+
 static int __init init_spu_base(void)
 {
 	int i, ret = 0;
@@ -698,12 +746,17 @@ static int __init init_spu_base(void)
 	crash_register_spus(&spu_full_list);
 	spu_add_sysdev_attr(&attr_stat);
 
+	if (!of_has_vicinity()) {
+		long root = of_get_flat_dt_root();
+		if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0"))
+			init_aff_QS20_harcoded();
+	}
+
 	return 0;
 
  out_unregister_sysdev_class:
 	sysdev_class_unregister(&spu_sysdev_class);
  out:
-
 	return ret;
 }
 module_init(init_spu_base);
-- 
cgit v1.2.3-70-g09d2


From 8e68e2f248332a9c3fd4f08258f488c209bd3e0c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:47 +0200
Subject: [CELL] spufs: extension of spu_create to support affinity definition

This patch adds support for additional flags at spu_create, which relate
to the establishment of affinity between contexts and contexts to memory.
A fourth, optional, parameter is supported. This parameter represents
an affinity neighbor of the context being created, and is used when defining
SPU-SPU affinity.
Affinity is represented as a doubly linked list of spu_contexts.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_syscalls.c   |  17 +++-
 arch/powerpc/platforms/cell/spufs/context.c  |   1 +
 arch/powerpc/platforms/cell/spufs/gang.c     |   4 +
 arch/powerpc/platforms/cell/spufs/inode.c    | 132 +++++++++++++++++++++++++--
 arch/powerpc/platforms/cell/spufs/spufs.h    |  16 +++-
 arch/powerpc/platforms/cell/spufs/syscalls.c |  32 ++++++-
 include/asm-powerpc/spu.h                    |   8 +-
 include/linux/syscalls.h                     |   2 +-
 8 files changed, 195 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index 261b507a901..dd2c6688c8a 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -34,14 +34,27 @@ struct spufs_calls spufs_calls = {
  * this file is not used and the syscalls directly enter the fs code */
 
 asmlinkage long sys_spu_create(const char __user *name,
-		unsigned int flags, mode_t mode)
+		unsigned int flags, mode_t mode, int neighbor_fd)
 {
 	long ret;
 	struct module *owner = spufs_calls.owner;
+	struct file *neighbor;
+	int fput_needed;
 
 	ret = -ENOSYS;
 	if (owner && try_module_get(owner)) {
-		ret = spufs_calls.create_thread(name, flags, mode);
+		if (flags & SPU_CREATE_AFFINITY_SPU) {
+			neighbor = fget_light(neighbor_fd, &fput_needed);
+			if (neighbor) {
+				ret = spufs_calls.create_thread(name, flags,
+								mode, neighbor);
+				fput_light(neighbor, fput_needed);
+			}
+		}
+		else {
+			ret = spufs_calls.create_thread(name, flags,
+							mode, NULL);
+		}
 		module_put(owner);
 	}
 	return ret;
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 6b091ea1d19..a7efb999d65 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -55,6 +55,7 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	ctx->ops = &spu_backing_ops;
 	ctx->owner = get_task_mm(current);
 	INIT_LIST_HEAD(&ctx->rq);
+	INIT_LIST_HEAD(&ctx->aff_list);
 	if (gang)
 		spu_gang_add_ctx(gang, ctx);
 	ctx->cpus_allowed = current->cpus_allowed;
diff --git a/arch/powerpc/platforms/cell/spufs/gang.c b/arch/powerpc/platforms/cell/spufs/gang.c
index 212ea78f905..0a752ce67c8 100644
--- a/arch/powerpc/platforms/cell/spufs/gang.c
+++ b/arch/powerpc/platforms/cell/spufs/gang.c
@@ -35,7 +35,9 @@ struct spu_gang *alloc_spu_gang(void)
 
 	kref_init(&gang->kref);
 	mutex_init(&gang->mutex);
+	mutex_init(&gang->aff_mutex);
 	INIT_LIST_HEAD(&gang->list);
+	INIT_LIST_HEAD(&gang->aff_list_head);
 
 out:
 	return gang;
@@ -73,6 +75,8 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx)
 {
 	mutex_lock(&gang->mutex);
 	WARN_ON(ctx->gang != gang);
+	if (!list_empty(&ctx->aff_list))
+		list_del_init(&ctx->aff_list);
 	list_del_init(&ctx->gang_list);
 	gang->contexts--;
 	mutex_unlock(&gang->mutex);
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7eb4d6cbcb7..b3d0dd118dd 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -316,11 +316,107 @@ out:
 	return ret;
 }
 
-static int spufs_create_context(struct inode *inode,
-			struct dentry *dentry,
-			struct vfsmount *mnt, int flags, int mode)
+static struct spu_context *
+spufs_assert_affinity(unsigned int flags, struct spu_gang *gang,
+						struct file *filp)
+{
+	struct spu_context *tmp, *neighbor;
+	int count, node;
+	int aff_supp;
+
+	aff_supp = !list_empty(&(list_entry(cbe_spu_info[0].spus.next,
+					struct spu, cbe_list))->aff_list);
+
+	if (!aff_supp)
+		return ERR_PTR(-EINVAL);
+
+	if (flags & SPU_CREATE_GANG)
+		return ERR_PTR(-EINVAL);
+
+	if (flags & SPU_CREATE_AFFINITY_MEM &&
+	    gang->aff_ref_ctx &&
+	    gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM)
+		return ERR_PTR(-EEXIST);
+
+	if (gang->aff_flags & AFF_MERGED)
+		return ERR_PTR(-EBUSY);
+
+	neighbor = NULL;
+	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		if (!filp || filp->f_op != &spufs_context_fops)
+			return ERR_PTR(-EINVAL);
+
+		neighbor = get_spu_context(
+				SPUFS_I(filp->f_dentry->d_inode)->i_ctx);
+
+		if (!list_empty(&neighbor->aff_list) && !(neighbor->aff_head) &&
+		    !list_is_last(&neighbor->aff_list, &gang->aff_list_head) &&
+		    !list_entry(neighbor->aff_list.next, struct spu_context,
+		    aff_list)->aff_head)
+			return ERR_PTR(-EEXIST);
+
+		if (gang != neighbor->gang)
+			return ERR_PTR(-EINVAL);
+
+		count = 1;
+		list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
+			count++;
+		if (list_empty(&neighbor->aff_list))
+			count++;
+
+		for (node = 0; node < MAX_NUMNODES; node++) {
+			if ((cbe_spu_info[node].n_spus - atomic_read(
+				&cbe_spu_info[node].reserved_spus)) >= count)
+				break;
+		}
+
+		if (node == MAX_NUMNODES)
+			return ERR_PTR(-EEXIST);
+	}
+
+	return neighbor;
+}
+
+static void
+spufs_set_affinity(unsigned int flags, struct spu_context *ctx,
+					struct spu_context *neighbor)
+{
+	if (flags & SPU_CREATE_AFFINITY_MEM)
+		ctx->gang->aff_ref_ctx = ctx;
+
+	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		if (list_empty(&neighbor->aff_list)) {
+			list_add_tail(&neighbor->aff_list,
+				&ctx->gang->aff_list_head);
+			neighbor->aff_head = 1;
+		}
+
+		if (list_is_last(&neighbor->aff_list, &ctx->gang->aff_list_head)
+		    || list_entry(neighbor->aff_list.next, struct spu_context,
+							aff_list)->aff_head) {
+			list_add(&ctx->aff_list, &neighbor->aff_list);
+		} else  {
+			list_add_tail(&ctx->aff_list, &neighbor->aff_list);
+			if (neighbor->aff_head) {
+				neighbor->aff_head = 0;
+				ctx->aff_head = 1;
+			}
+		}
+
+		if (!ctx->gang->aff_ref_ctx)
+			ctx->gang->aff_ref_ctx = ctx;
+	}
+}
+
+static int
+spufs_create_context(struct inode *inode, struct dentry *dentry,
+			struct vfsmount *mnt, int flags, int mode,
+			struct file *aff_filp)
 {
 	int ret;
+	int affinity;
+	struct spu_gang *gang;
+	struct spu_context *neighbor;
 
 	ret = -EPERM;
 	if ((flags & SPU_CREATE_NOSCHED) &&
@@ -336,9 +432,29 @@ static int spufs_create_context(struct inode *inode,
 	if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader)
 		goto out_unlock;
 
+	gang = NULL;
+	neighbor = NULL;
+	affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU);
+	if (affinity) {
+		gang = SPUFS_I(inode)->i_gang;
+		ret = -EINVAL;
+		if (!gang)
+			goto out_unlock;
+		mutex_lock(&gang->aff_mutex);
+		neighbor = spufs_assert_affinity(flags, gang, aff_filp);
+		if (IS_ERR(neighbor)) {
+			ret = PTR_ERR(neighbor);
+			goto out_aff_unlock;
+		}
+	}
+
 	ret = spufs_mkdir(inode, dentry, flags, mode & S_IRWXUGO);
 	if (ret)
-		goto out_unlock;
+		goto out_aff_unlock;
+
+	if (affinity)
+		spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx,
+								neighbor);
 
 	/*
 	 * get references for dget and mntget, will be released
@@ -352,6 +468,9 @@ static int spufs_create_context(struct inode *inode,
 		goto out;
 	}
 
+out_aff_unlock:
+	if (affinity)
+		mutex_unlock(&gang->aff_mutex);
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
@@ -450,7 +569,8 @@ out:
 
 static struct file_system_type spufs_type;
 
-long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
+long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode,
+							struct file *filp)
 {
 	struct dentry *dentry;
 	int ret;
@@ -487,7 +607,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
 					dentry, nd->mnt, mode);
 	else
 		return spufs_create_context(nd->dentry->d_inode,
-					dentry, nd->mnt, flags, mode);
+					dentry, nd->mnt, flags, mode, filp);
 
 out_dput:
 	dput(dentry);
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 03e8315f6f9..36da17987e9 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -109,6 +109,9 @@ struct spu_context {
 		unsigned long long class2_intr_base; /* # at last ctx switch */
 		unsigned long long libassist;
 	} stats;
+
+	struct list_head aff_list;
+	int aff_head;
 };
 
 struct spu_gang {
@@ -116,8 +119,17 @@ struct spu_gang {
 	struct mutex mutex;
 	struct kref kref;
 	int contexts;
+
+	struct spu_context *aff_ref_ctx;
+	struct list_head aff_list_head;
+	struct mutex aff_mutex;
+	int aff_flags;
 };
 
+/* Flag bits for spu_gang aff_flags */
+#define AFF_OFFSETS_SET		1
+#define AFF_MERGED		2
+
 struct mfc_dma_command {
 	int32_t pad;	/* reserved */
 	uint32_t lsa;	/* local storage address */
@@ -182,8 +194,8 @@ extern struct tree_descr spufs_dir_nosched_contents[];
 
 /* system call implementation */
 long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status);
-long spufs_create(struct nameidata *nd,
-			 unsigned int flags, mode_t mode);
+long spufs_create(struct nameidata *nd, unsigned int flags,
+			mode_t mode, struct file *filp);
 extern const struct file_operations spufs_context_fops;
 
 /* gang management */
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 13a383c67ca..43f0fb88abb 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -76,8 +76,8 @@ asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
 }
 #endif
 
-asmlinkage long sys_spu_create(const char __user *pathname,
-					unsigned int flags, mode_t mode)
+asmlinkage long do_spu_create(const char __user *pathname, unsigned int flags,
+				mode_t mode, struct file *neighbor)
 {
 	char *tmp;
 	int ret;
@@ -90,7 +90,7 @@ asmlinkage long sys_spu_create(const char __user *pathname,
 		ret = path_lookup(tmp, LOOKUP_PARENT|
 				LOOKUP_OPEN|LOOKUP_CREATE, &nd);
 		if (!ret) {
-			ret = spufs_create(&nd, flags, mode);
+			ret = spufs_create(&nd, flags, mode, neighbor);
 			path_release(&nd);
 		}
 		putname(tmp);
@@ -99,8 +99,32 @@ asmlinkage long sys_spu_create(const char __user *pathname,
 	return ret;
 }
 
+#ifndef MODULE
+asmlinkage long sys_spu_create(const char __user *pathname, unsigned int flags,
+				mode_t mode, int neighbor_fd)
+{
+	int fput_needed;
+	struct file *neighbor;
+	long ret;
+
+	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		ret = -EBADF;
+		neighbor = fget_light(neighbor_fd, &fput_needed);
+		if (neighbor) {
+			ret = do_spu_create(pathname, flags, mode, neighbor);
+			fput_light(neighbor, fput_needed);
+		}
+	}
+	else {
+		ret = do_spu_create(pathname, flags, mode, NULL);
+	}
+
+	return ret;
+}
+#endif
+
 struct spufs_calls spufs_calls = {
-	.create_thread = sys_spu_create,
+	.create_thread = do_spu_create,
 	.spu_run = do_spu_run,
 	.owner = THIS_MODULE,
 };
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 18e558bef98..24f352da286 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -196,6 +196,7 @@ extern struct cbe_spu_info cbe_spu_info[];
 
 struct spu *spu_alloc(void);
 struct spu *spu_alloc_node(int node);
+struct spu *spu_alloc_spu(struct spu *spu);
 void spu_free(struct spu *spu);
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
@@ -227,7 +228,8 @@ extern long spu_sys_callback(struct spu_syscall_block *s);
 struct file;
 extern struct spufs_calls {
 	asmlinkage long (*create_thread)(const char __user *name,
-					unsigned int flags, mode_t mode);
+					unsigned int flags, mode_t mode,
+					struct file *neighbor);
 	asmlinkage long (*spu_run)(struct file *filp, __u32 __user *unpc,
 						__u32 __user *ustatus);
 	struct module *owner;
@@ -254,8 +256,10 @@ struct spu_coredump_calls {
 #define SPU_CREATE_GANG			0x0002
 #define SPU_CREATE_NOSCHED		0x0004
 #define SPU_CREATE_ISOLATE		0x0008
+#define SPU_CREATE_AFFINITY_SPU		0x0010
+#define SPU_CREATE_AFFINITY_MEM		0x0020
 
-#define SPU_CREATE_FLAG_ALL		0x000f /* mask of all valid flags */
+#define SPU_CREATE_FLAG_ALL		0x003f /* mask of all valid flags */
 
 
 #ifdef CONFIG_SPU_FS_MODULE
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7a8b1e3322e..61def7c8fbb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -549,7 +549,7 @@ asmlinkage long sys_inotify_rm_watch(int fd, u32 wd);
 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
 				 __u32 __user *ustatus);
 asmlinkage long sys_spu_create(const char __user *name,
-		unsigned int flags, mode_t mode);
+		unsigned int flags, mode_t mode, int fd);
 
 asmlinkage long sys_mknodat(int dfd, const char __user * filename, int mode,
 			    unsigned dev);
-- 
cgit v1.2.3-70-g09d2


From c5fc8d2a92461fcabd00dfd678204cba36b93119 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:48 +0200
Subject: [CELL] cell: add placement computation for scheduling of affinity
 contexts

This patch provides the spu affinity placement logic for the spufs scheduler.
Each time a gang is going to be scheduled, the placement of a reference
context is defined. The placement of all other contexts with affinity from
the gang is defined based on this reference context location and on a
precomputed displacement offset.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spufs/gang.c  |   4 +-
 arch/powerpc/platforms/cell/spufs/sched.c | 142 ++++++++++++++++++++++++++++++
 arch/powerpc/platforms/cell/spufs/spufs.h |   6 ++
 3 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/cell/spufs/gang.c b/arch/powerpc/platforms/cell/spufs/gang.c
index 0a752ce67c8..71a44325302 100644
--- a/arch/powerpc/platforms/cell/spufs/gang.c
+++ b/arch/powerpc/platforms/cell/spufs/gang.c
@@ -75,8 +75,10 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx)
 {
 	mutex_lock(&gang->mutex);
 	WARN_ON(ctx->gang != gang);
-	if (!list_empty(&ctx->aff_list))
+	if (!list_empty(&ctx->aff_list)) {
 		list_del_init(&ctx->aff_list);
+		gang->aff_flags &= ~AFF_OFFSETS_SET;
+	}
 	list_del_init(&ctx->gang_list);
 	gang->contexts--;
 	mutex_unlock(&gang->mutex);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 6d0ab72cc70..a9569de4c14 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -233,6 +233,8 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 
 	if (ctx->flags & SPU_CREATE_NOSCHED)
 		atomic_inc(&cbe_spu_info[spu->node].reserved_spus);
+	if (!list_empty(&ctx->aff_list))
+		atomic_inc(&ctx->gang->aff_sched_count);
 
 	ctx->stats.slb_flt_base = spu->stats.slb_flt;
 	ctx->stats.class2_intr_base = spu->stats.class2_intr;
@@ -259,6 +261,143 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
 }
 
+/*
+ * XXX(hch): needs locking.
+ */
+static inline int sched_spu(struct spu *spu)
+{
+	return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
+}
+
+static void aff_merge_remaining_ctxs(struct spu_gang *gang)
+{
+	struct spu_context *ctx;
+
+	list_for_each_entry(ctx, &gang->aff_list_head, aff_list) {
+		if (list_empty(&ctx->aff_list))
+			list_add(&ctx->aff_list, &gang->aff_list_head);
+	}
+	gang->aff_flags |= AFF_MERGED;
+}
+
+static void aff_set_offsets(struct spu_gang *gang)
+{
+	struct spu_context *ctx;
+	int offset;
+
+	offset = -1;
+	list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
+								aff_list) {
+		if (&ctx->aff_list == &gang->aff_list_head)
+			break;
+		ctx->aff_offset = offset--;
+	}
+
+	offset = 0;
+	list_for_each_entry(ctx, gang->aff_ref_ctx->aff_list.prev, aff_list) {
+		if (&ctx->aff_list == &gang->aff_list_head)
+			break;
+		ctx->aff_offset = offset++;
+	}
+
+	gang->aff_flags |= AFF_OFFSETS_SET;
+}
+
+static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
+		 int group_size, int lowest_offset)
+{
+	struct spu *spu;
+	int node, n;
+
+	/*
+	 * TODO: A better algorithm could be used to find a good spu to be
+	 *       used as reference location for the ctxs chain.
+	 */
+	node = cpu_to_node(raw_smp_processor_id());
+	for (n = 0; n < MAX_NUMNODES; n++, node++) {
+		node = (node < MAX_NUMNODES) ? node : 0;
+		if (!node_allowed(ctx, node))
+			continue;
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if ((!mem_aff || spu->has_mem_affinity) &&
+							sched_spu(spu))
+				return spu;
+		}
+	}
+	return NULL;
+}
+
+static void aff_set_ref_point_location(struct spu_gang *gang)
+{
+	int mem_aff, gs, lowest_offset;
+	struct spu_context *ctx;
+	struct spu *tmp;
+
+	mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM;
+	lowest_offset = 0;
+	gs = 0;
+
+	list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
+		gs++;
+
+	list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
+								aff_list) {
+		if (&ctx->aff_list == &gang->aff_list_head)
+			break;
+		lowest_offset = ctx->aff_offset;
+	}
+
+	gang->aff_ref_spu = aff_ref_location(ctx, mem_aff, gs, lowest_offset);
+}
+
+static struct spu *ctx_location(struct spu *ref, int offset)
+{
+	struct spu *spu;
+
+	spu = NULL;
+	if (offset >= 0) {
+		list_for_each_entry(spu, ref->aff_list.prev, aff_list) {
+			if (offset == 0)
+				break;
+			if (sched_spu(spu))
+				offset--;
+		}
+	} else {
+		list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) {
+			if (offset == 0)
+				break;
+			if (sched_spu(spu))
+				offset++;
+		}
+	}
+	return spu;
+}
+
+/*
+ * affinity_check is called each time a context is going to be scheduled.
+ * It returns the spu ptr on which the context must run.
+ */
+struct spu *affinity_check(struct spu_context *ctx)
+{
+	struct spu_gang *gang;
+
+	if (list_empty(&ctx->aff_list))
+		return NULL;
+	gang = ctx->gang;
+	mutex_lock(&gang->aff_mutex);
+	if (!gang->aff_ref_spu) {
+		if (!(gang->aff_flags & AFF_MERGED))
+			aff_merge_remaining_ctxs(gang);
+		if (!(gang->aff_flags & AFF_OFFSETS_SET))
+			aff_set_offsets(gang);
+		aff_set_ref_point_location(gang);
+	}
+	mutex_unlock(&gang->aff_mutex);
+	if (!gang->aff_ref_spu)
+		return NULL;
+	return ctx_location(gang->aff_ref_spu, ctx->aff_offset);
+}
+
 /**
  * spu_unbind_context - unbind spu context from physical spu
  * @spu:	physical spu to unbind from
@@ -272,6 +411,9 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 
  	if (spu->ctx->flags & SPU_CREATE_NOSCHED)
 		atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
+ 	if (!list_empty(&ctx->aff_list))
+ 		if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
+ 			ctx->gang->aff_ref_spu = NULL;
 	spu_switch_notify(spu, NULL);
 	spu_unmap_mappings(ctx);
 	spu_save(&ctx->csa, spu);
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 36da17987e9..42d8da8f0fb 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -112,6 +112,7 @@ struct spu_context {
 
 	struct list_head aff_list;
 	int aff_head;
+	int aff_offset;
 };
 
 struct spu_gang {
@@ -124,6 +125,8 @@ struct spu_gang {
 	struct list_head aff_list_head;
 	struct mutex aff_mutex;
 	int aff_flags;
+	struct spu *aff_ref_spu;
+	atomic_t aff_sched_count;
 };
 
 /* Flag bits for spu_gang aff_flags */
@@ -208,6 +211,9 @@ void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx);
 /* fault handling */
 int spufs_handle_class1(struct spu_context *ctx);
 
+/* affinity */
+struct spu *affinity_check(struct spu_context *ctx);
+
 /* context management */
 extern atomic_t nr_spu_contexts;
 static inline void spu_acquire(struct spu_context *ctx)
-- 
cgit v1.2.3-70-g09d2


From cbc23d3e7cb3c9fd3c9fce0bc3f44f687a9517c0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:49 +0200
Subject: [CELL] spufs: integration of SPE affinity with the scheduler

This patch makes the scheduler honor affinity information for each
context being scheduled. If the context has no affinity information,
behaviour is unchanged. If there is affinity information, the context is
scheduled to run on the exact spu recommended by the affinity
placement algorithm.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c    | 19 +++++++++++++++++++
 arch/powerpc/platforms/cell/spufs/sched.c |  4 ++++
 2 files changed, 23 insertions(+)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 75b5af0a7e2..5f399313b47 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -425,6 +425,25 @@ static void spu_init_channels(struct spu *spu)
 	}
 }
 
+struct spu *spu_alloc_spu(struct spu *req_spu)
+{
+	struct spu *spu, *ret = NULL;
+
+	mutex_lock(&spu_mutex);
+	list_for_each_entry(spu, &cbe_spu_info[req_spu->node].free_spus, list) {
+		if (spu == req_spu) {
+			list_del_init(&spu->list);
+			pr_debug("Got SPU %d %d\n", spu->number, spu->node);
+			spu_init_channels(spu);
+			ret = spu;
+			break;
+		}
+	}
+	mutex_unlock(&spu_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(spu_alloc_spu);
+
 struct spu *spu_alloc_node(int node)
 {
 	struct spu *spu = NULL;
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index a9569de4c14..49b8f6867a9 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -507,6 +507,10 @@ static struct spu *spu_get_idle(struct spu_context *ctx)
 	int node = cpu_to_node(raw_smp_processor_id());
 	int n;
 
+	spu = affinity_check(ctx);
+	if (spu)
+		return spu_alloc_spu(spu);
+
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
 		node = (node < MAX_NUMNODES) ? node : 0;
 		if (!node_allowed(ctx, node))
-- 
cgit v1.2.3-70-g09d2


From 9e7cbcbb6ede4299d52c839e352aae527c06124a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:50 +0200
Subject: [CELL] cell: indexing of SPUs based on firmware vicinity properties

This patch links spus according to their physical position using
information provided by the firmware through a special vicinity
device-tree property. This property is present in the current version
of the Malta firmware.

Example of vicinity properties for a node in Malta:

Node:        Vicinity property contains phandles of:
spe@0        [ spe@100000 , mic-tm@50a000 ]
spe@100000   [ spe@0      , spe@200000    ]
spe@200000   [ spe@100000 , spe@300000    ]
spe@300000   [ spe@200000 , bif0@512000   ]
spe@80000    [ spe@180000 , mic-tm@50a000 ]
spe@180000   [ spe@80000  , spe@280000    ]
spe@280000   [ spe@180000 , spe@380000    ]
spe@380000   [ spe@280000 , bif0@512000   ]

Only spe@* have a vicinity property (e.g., bif0@512000 and
mic-tm@50a000 do not have it).

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c | 90 +++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 5f399313b47..03b4a8eb904 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -724,6 +724,92 @@ static int of_has_vicinity(void)
 	return of_find_property(spu_devnode(spu), "vicinity", NULL) != NULL;
 }
 
+static struct spu *aff_devnode_spu(int cbe, struct device_node *dn)
+{
+	struct spu *spu;
+
+	list_for_each_entry(spu, &cbe_spu_info[cbe].spus, cbe_list)
+		if (spu_devnode(spu) == dn)
+			return spu;
+	return NULL;
+}
+
+static struct spu *
+aff_node_next_to(int cbe, struct device_node *target, struct device_node *avoid)
+{
+	struct spu *spu;
+	const phandle *vic_handles;
+	int lenp, i;
+
+	list_for_each_entry(spu, &cbe_spu_info[cbe].spus, cbe_list) {
+		if (spu_devnode(spu) == avoid)
+			continue;
+		vic_handles = get_property(spu_devnode(spu), "vicinity", &lenp);
+		for (i=0; i < (lenp / sizeof(phandle)); i++) {
+			if (vic_handles[i] == target->linux_phandle)
+				return spu;
+		}
+	}
+	return NULL;
+}
+
+static void init_aff_fw_vicinity_node(int cbe)
+{
+	struct spu *spu, *last_spu;
+	struct device_node *vic_dn, *last_spu_dn;
+	phandle avoid_ph;
+	const phandle *vic_handles;
+	const char *name;
+	int lenp, i, added, mem_aff;
+
+	last_spu = list_entry(cbe_spu_info[cbe].spus.next, struct spu, cbe_list);
+	avoid_ph = 0;
+	for (added = 1; added < cbe_spu_info[cbe].n_spus; added++) {
+		last_spu_dn = spu_devnode(last_spu);
+		vic_handles = get_property(last_spu_dn, "vicinity", &lenp);
+
+		for (i = 0; i < (lenp / sizeof(phandle)); i++) {
+			if (vic_handles[i] == avoid_ph)
+				continue;
+
+			vic_dn = of_find_node_by_phandle(vic_handles[i]);
+			if (!vic_dn)
+				continue;
+
+			name = get_property(vic_dn, "name", NULL);
+			if (strcmp(name, "spe") == 0) {
+				spu = aff_devnode_spu(cbe, vic_dn);
+				avoid_ph = last_spu_dn->linux_phandle;
+			}
+			else {
+				mem_aff = strcmp(name, "mic-tm") == 0;
+				spu = aff_node_next_to(cbe, vic_dn, last_spu_dn);
+				if (!spu)
+					continue;
+				if (mem_aff) {
+					last_spu->has_mem_affinity = 1;
+					spu->has_mem_affinity = 1;
+				}
+				avoid_ph = vic_dn->linux_phandle;
+			}
+			list_add_tail(&spu->aff_list, &last_spu->aff_list);
+			last_spu = spu;
+			break;
+		}
+	}
+}
+
+static void init_aff_fw_vicinity(void)
+{
+	int cbe;
+
+	/* sets has_mem_affinity for each spu, as long as the
+	 * spu->aff_list list, linking each spu to its neighbors
+	 */
+	for (cbe = 0; cbe < MAX_NUMNODES; cbe++)
+		init_aff_fw_vicinity_node(cbe);
+}
+
 static int __init init_spu_base(void)
 {
 	int i, ret = 0;
@@ -765,7 +851,9 @@ static int __init init_spu_base(void)
 	crash_register_spus(&spu_full_list);
 	spu_add_sysdev_attr(&attr_stat);
 
-	if (!of_has_vicinity()) {
+	if (of_has_vicinity()) {
+		init_aff_fw_vicinity();
+	} else {
 		long root = of_get_flat_dt_root();
 		if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0"))
 			init_aff_QS20_harcoded();
-- 
cgit v1.2.3-70-g09d2


From 2414059420311e5384de646eebfd529c184afd3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 20 Jul 2007 21:39:51 +0200
Subject: [CELL] spu_base: locking cleanup

Sort out the locking mess in spu_base and document the current rules.
As an added benefit spu_alloc* and spu_free don't block anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c | 84 +++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 03b4a8eb904..8617b507af4 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -42,12 +42,30 @@ const struct spu_management_ops *spu_management_ops;
 EXPORT_SYMBOL_GPL(spu_management_ops);
 
 const struct spu_priv1_ops *spu_priv1_ops;
+EXPORT_SYMBOL_GPL(spu_priv1_ops);
 
-static LIST_HEAD(spu_full_list);
-static DEFINE_MUTEX(spu_mutex);
-static DEFINE_SPINLOCK(spu_list_lock);
+struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
+EXPORT_SYMBOL_GPL(cbe_spu_info);
 
-EXPORT_SYMBOL_GPL(spu_priv1_ops);
+/*
+ * Protects cbe_spu_info and spu->number.
+ */
+static DEFINE_SPINLOCK(spu_lock);
+
+/*
+ * List of all spus in the system.
+ *
+ * This list is iterated by callers from irq context and callers that
+ * want to sleep.  Thus modifications need to be done with both
+ * spu_full_list_lock and spu_full_list_mutex held, while iterating
+ * through it requires either of these locks.
+ *
+ * In addition spu_full_list_lock protects all assignmens to
+ * spu->mm.
+ */
+static LIST_HEAD(spu_full_list);
+static DEFINE_SPINLOCK(spu_full_list_lock);
+static DEFINE_MUTEX(spu_full_list_mutex);
 
 void spu_invalidate_slbs(struct spu *spu)
 {
@@ -66,12 +84,12 @@ void spu_flush_all_slbs(struct mm_struct *mm)
 	struct spu *spu;
 	unsigned long flags;
 
-	spin_lock_irqsave(&spu_list_lock, flags);
+	spin_lock_irqsave(&spu_full_list_lock, flags);
 	list_for_each_entry(spu, &spu_full_list, full_list) {
 		if (spu->mm == mm)
 			spu_invalidate_slbs(spu);
 	}
-	spin_unlock_irqrestore(&spu_list_lock, flags);
+	spin_unlock_irqrestore(&spu_full_list_lock, flags);
 }
 
 /* The hack below stinks... try to do something better one of
@@ -89,9 +107,9 @@ void spu_associate_mm(struct spu *spu, struct mm_struct *mm)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&spu_list_lock, flags);
+	spin_lock_irqsave(&spu_full_list_lock, flags);
 	spu->mm = mm;
-	spin_unlock_irqrestore(&spu_list_lock, flags);
+	spin_unlock_irqrestore(&spu_full_list_lock, flags);
 	if (mm)
 		mm_needs_global_tlbie(mm);
 }
@@ -429,7 +447,7 @@ struct spu *spu_alloc_spu(struct spu *req_spu)
 {
 	struct spu *spu, *ret = NULL;
 
-	mutex_lock(&spu_mutex);
+	spin_lock(&spu_lock);
 	list_for_each_entry(spu, &cbe_spu_info[req_spu->node].free_spus, list) {
 		if (spu == req_spu) {
 			list_del_init(&spu->list);
@@ -439,7 +457,7 @@ struct spu *spu_alloc_spu(struct spu *req_spu)
 			break;
 		}
 	}
-	mutex_unlock(&spu_mutex);
+	spin_unlock(&spu_lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(spu_alloc_spu);
@@ -448,14 +466,14 @@ struct spu *spu_alloc_node(int node)
 {
 	struct spu *spu = NULL;
 
-	mutex_lock(&spu_mutex);
+	spin_lock(&spu_lock);
 	if (!list_empty(&cbe_spu_info[node].free_spus)) {
 		spu = list_entry(cbe_spu_info[node].free_spus.next, struct spu,
 									list);
 		list_del_init(&spu->list);
 		pr_debug("Got SPU %d %d\n", spu->number, spu->node);
 	}
-	mutex_unlock(&spu_mutex);
+	spin_unlock(&spu_lock);
 
 	if (spu)
 		spu_init_channels(spu);
@@ -479,9 +497,9 @@ struct spu *spu_alloc(void)
 
 void spu_free(struct spu *spu)
 {
-	mutex_lock(&spu_mutex);
+	spin_lock(&spu_lock);
 	list_add_tail(&spu->list, &cbe_spu_info[spu->node].free_spus);
-	mutex_unlock(&spu_mutex);
+	spin_unlock(&spu_lock);
 }
 EXPORT_SYMBOL_GPL(spu_free);
 
@@ -502,12 +520,12 @@ struct sysdev_class spu_sysdev_class = {
 int spu_add_sysdev_attr(struct sysdev_attribute *attr)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysdev_create_file(&spu->sysdev, attr);
+	mutex_unlock(&spu_full_list_mutex);
 
-	mutex_unlock(&spu_mutex);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
@@ -515,12 +533,12 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
 int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysfs_create_group(&spu->sysdev.kobj, attrs);
+	mutex_unlock(&spu_full_list_mutex);
 
-	mutex_unlock(&spu_mutex);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
@@ -529,24 +547,22 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
 void spu_remove_sysdev_attr(struct sysdev_attribute *attr)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysdev_remove_file(&spu->sysdev, attr);
-
-	mutex_unlock(&spu_mutex);
+	mutex_unlock(&spu_full_list_mutex);
 }
 EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr);
 
 void spu_remove_sysdev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
-	mutex_lock(&spu_mutex);
 
+	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
 		sysfs_remove_group(&spu->sysdev.kobj, attrs);
-
-	mutex_unlock(&spu_mutex);
+	mutex_unlock(&spu_full_list_mutex);
 }
 EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr_group);
 
@@ -582,9 +598,9 @@ static int __init create_spu(void *data)
 		goto out;
 
 	spin_lock_init(&spu->register_lock);
-	mutex_lock(&spu_mutex);
+	spin_lock(&spu_lock);
 	spu->number = number++;
-	mutex_unlock(&spu_mutex);
+	spin_unlock(&spu_lock);
 
 	ret = spu_create_spu(spu, data);
 
@@ -601,14 +617,17 @@ static int __init create_spu(void *data)
 	if (ret)
 		goto out_free_irqs;
 
-	mutex_lock(&spu_mutex);
-	spin_lock_irqsave(&spu_list_lock, flags);
+	spin_lock(&spu_lock);
 	list_add(&spu->list, &cbe_spu_info[spu->node].free_spus);
 	list_add(&spu->cbe_list, &cbe_spu_info[spu->node].spus);
 	cbe_spu_info[spu->node].n_spus++;
+	spin_unlock(&spu_lock);
+
+	mutex_lock(&spu_full_list_mutex);
+	spin_lock_irqsave(&spu_full_list_lock, flags);
 	list_add(&spu->full_list, &spu_full_list);
-	spin_unlock_irqrestore(&spu_list_lock, flags);
-	mutex_unlock(&spu_mutex);
+	spin_unlock_irqrestore(&spu_full_list_lock, flags);
+	mutex_unlock(&spu_full_list_mutex);
 
 	spu->stats.util_state = SPU_UTIL_IDLE_LOADED;
 	ktime_get_ts(&ts);
@@ -675,9 +694,6 @@ static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf)
 
 static SYSDEV_ATTR(stat, 0644, spu_stat_show, NULL);
 
-struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
-EXPORT_SYMBOL_GPL(cbe_spu_info);
-
 /* Hardcoded affinity idxs for QS20 */
 #define SPES_PER_BE 8
 static int QS20_reg_idxs[SPES_PER_BE] =   { 0, 2, 4, 6, 7, 5, 3, 1 };
@@ -847,8 +863,10 @@ static int __init init_spu_base(void)
 		fb_append_extra_logo(&logo_spe_clut224, ret);
 	}
 
+	mutex_lock(&spu_full_list_mutex);
 	xmon_register_spus(&spu_full_list);
 	crash_register_spus(&spu_full_list);
+	mutex_unlock(&spu_full_list_mutex);
 	spu_add_sysdev_attr(&attr_stat);
 
 	if (of_has_vicinity()) {
-- 
cgit v1.2.3-70-g09d2


From 36aaccc1e96481e8310b1d13600096da0f24ff43 Mon Sep 17 00:00:00 2001
From: Bob Nelson <rrnelson@linux.vnet.ibm.com>
Date: Fri, 20 Jul 2007 21:39:52 +0200
Subject: [CELL] oprofile: enable SPU switch notification to detect currently
 active SPU tasks

From: Maynard Johnson <mpjohn@us.ibm.com>

This patch extends spu_switch_event_register so that the caller is
also notified of currently active SPU tasks.
It also exports spu_switch_event_register and spu_switch_event_unregister
so that OProfile can get access to the notifications provided.

Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Bob Nelson <rrnelson@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/platforms/cell/spufs/run.c   | 23 +++++++++++++++------
 arch/powerpc/platforms/cell/spufs/sched.c | 34 +++++++++++++++++++++++++++++--
 arch/powerpc/platforms/cell/spufs/spufs.h |  6 ++++++
 3 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index c0238ea5b55..0b50fa5cb39 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -18,15 +18,17 @@ void spufs_stop_callback(struct spu *spu)
 	wake_up_all(&ctx->stop_wq);
 }
 
-static inline int spu_stopped(struct spu_context *ctx, u32 * stat)
+static inline int spu_stopped(struct spu_context *ctx, u32 *stat)
 {
 	struct spu *spu;
 	u64 pte_fault;
 
 	*stat = ctx->ops->status_read(ctx);
-	if (ctx->state != SPU_STATE_RUNNABLE)
-		return 1;
+
 	spu = ctx->spu;
+	if (ctx->state != SPU_STATE_RUNNABLE ||
+	    test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))
+		return 1;
 	pte_fault = spu->dsisr &
 	    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
 	return (!(*stat & SPU_STATUS_RUNNING) || pte_fault || spu->class_0_pending) ?
@@ -124,7 +126,7 @@ out:
 	return ret;
 }
 
-static int spu_run_init(struct spu_context *ctx, u32 * npc)
+static int spu_run_init(struct spu_context *ctx, u32 *npc)
 {
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
@@ -158,8 +160,8 @@ static int spu_run_init(struct spu_context *ctx, u32 * npc)
 	return 0;
 }
 
-static int spu_run_fini(struct spu_context *ctx, u32 * npc,
-			       u32 * status)
+static int spu_run_fini(struct spu_context *ctx, u32 *npc,
+			       u32 *status)
 {
 	int ret = 0;
 
@@ -298,6 +300,7 @@ static inline int spu_process_events(struct spu_context *ctx)
 long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 {
 	int ret;
+	struct spu *spu;
 	u32 status;
 
 	if (mutex_lock_interruptible(&ctx->run_mutex))
@@ -333,6 +336,14 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 		ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, &status));
 		if (unlikely(ret))
 			break;
+		spu = ctx->spu;
+		if (unlikely(test_and_clear_bit(SPU_SCHED_NOTIFY_ACTIVE,
+						&ctx->sched_flags))) {
+			if (!(status & SPU_STATUS_STOPPED_BY_STOP)) {
+				spu_switch_notify(spu, ctx);
+				continue;
+			}
+		}
 
 		spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 49b8f6867a9..88ec333e90d 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -204,21 +204,51 @@ static void spu_remove_from_active_list(struct spu *spu)
 
 static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);
 
-static void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
+void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
 {
 	blocking_notifier_call_chain(&spu_switch_notifier,
 			    ctx ? ctx->object_id : 0, spu);
 }
 
+static void notify_spus_active(void)
+{
+	int node;
+
+	/*
+	 * Wake up the active spu_contexts.
+	 *
+	 * When the awakened processes see their "notify_active" flag is set,
+	 * they will call spu_switch_notify();
+	 */
+	for_each_online_node(node) {
+		struct spu *spu;
+		mutex_lock(&spu_prio->active_mutex[node]);
+		list_for_each_entry(spu, &spu_prio->active_list[node], list) {
+			struct spu_context *ctx = spu->ctx;
+			set_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags);
+			mb();	/* make sure any tasks woken up below */
+				/* can see the bit(s) set above */
+			wake_up_all(&ctx->stop_wq);
+		}
+		mutex_unlock(&spu_prio->active_mutex[node]);
+	}
+}
+
 int spu_switch_event_register(struct notifier_block * n)
 {
-	return blocking_notifier_chain_register(&spu_switch_notifier, n);
+	int ret;
+	ret = blocking_notifier_chain_register(&spu_switch_notifier, n);
+	if (!ret)
+		notify_spus_active();
+	return ret;
 }
+EXPORT_SYMBOL_GPL(spu_switch_event_register);
 
 int spu_switch_event_unregister(struct notifier_block * n)
 {
 	return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
 }
+EXPORT_SYMBOL_GPL(spu_switch_event_unregister);
 
 /**
  * spu_bind_context - bind spu context to physical spu
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 42d8da8f0fb..692dbd0edc3 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -44,6 +44,11 @@ enum {
 	SPU_SCHED_WAS_ACTIVE,	/* was active upon spu_acquire_saved()  */
 };
 
+/* ctx->sched_flags */
+enum {
+	SPU_SCHED_NOTIFY_ACTIVE,
+};
+
 struct spu_context {
 	struct spu *spu;		  /* pointer to a physical SPU */
 	struct spu_state csa;		  /* SPU context save area. */
@@ -240,6 +245,7 @@ void spu_release_saved(struct spu_context *ctx);
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
+void spu_switch_notify(struct spu *spu, struct spu_context *ctx);
 void spu_set_timeslice(struct spu_context *ctx);
 void spu_update_sched_info(struct spu_context *ctx);
 void __spu_update_sched_info(struct spu_context *ctx);
-- 
cgit v1.2.3-70-g09d2


From 1474855d0878cced6f39f51f3c2bd7428b44cb1e Mon Sep 17 00:00:00 2001
From: Bob Nelson <rrnelson@linux.vnet.ibm.com>
Date: Fri, 20 Jul 2007 21:39:53 +0200
Subject: [CELL] oprofile: add support to OProfile for profiling CELL BE SPUs

From: Maynard Johnson <mpjohn@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling code.
Exports spu_set_profile_private_kref and spu_get_profile_private_kref which
are used by OProfile to store private profile information in spufs data
structures.

Also incorporated several fixes from other patches (rrn).  Check pointer
returned from kzalloc.  Eliminated unnecessary cast.  Better error
handling and cleanup in the related area.  64-bit unsigned long parameter
was being demoted to 32-bit unsigned int and eventually promoted back to
unsigned long.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
Signed-off-by: Bob Nelson <rrnelson@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/configs/cell_defconfig         |   3 +-
 arch/powerpc/kernel/time.c                  |   1 +
 arch/powerpc/oprofile/Kconfig               |   7 +
 arch/powerpc/oprofile/Makefile              |   4 +-
 arch/powerpc/oprofile/cell/pr_util.h        |  97 +++++
 arch/powerpc/oprofile/cell/spu_profiler.c   | 221 ++++++++++
 arch/powerpc/oprofile/cell/spu_task_sync.c  | 484 ++++++++++++++++++++++
 arch/powerpc/oprofile/cell/vma_map.c        | 287 +++++++++++++
 arch/powerpc/oprofile/common.c              |  51 ++-
 arch/powerpc/oprofile/op_model_7450.c       |  14 +-
 arch/powerpc/oprofile/op_model_cell.c       | 607 ++++++++++++++++++++++++----
 arch/powerpc/oprofile/op_model_fsl_booke.c  |  11 +-
 arch/powerpc/oprofile/op_model_pa6t.c       |  12 +-
 arch/powerpc/oprofile/op_model_power4.c     |  11 +-
 arch/powerpc/oprofile/op_model_rs64.c       |  10 +-
 arch/powerpc/platforms/cell/spufs/context.c |  20 +
 arch/powerpc/platforms/cell/spufs/sched.c   |   4 +-
 arch/powerpc/platforms/cell/spufs/spufs.h   |   2 +
 drivers/oprofile/buffer_sync.c              |   3 +-
 drivers/oprofile/event_buffer.h             |  20 +-
 drivers/oprofile/oprof.c                    |  28 ++
 include/asm-powerpc/oprofile_impl.h         |  10 +-
 include/asm-powerpc/spu.h                   |  15 +
 include/linux/dcookies.h                    |   1 +
 include/linux/elf-em.h                      |   3 +-
 include/linux/oprofile.h                    |  35 ++
 26 files changed, 1828 insertions(+), 133 deletions(-)
 create mode 100644 arch/powerpc/oprofile/cell/pr_util.h
 create mode 100644 arch/powerpc/oprofile/cell/spu_profiler.c
 create mode 100644 arch/powerpc/oprofile/cell/spu_task_sync.c
 create mode 100644 arch/powerpc/oprofile/cell/vma_map.c

diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig
index 74f83f4a4e5..d9ac24e8de1 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -1455,7 +1455,8 @@ CONFIG_HAS_DMA=y
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
+CONFIG_OPROFILE_CELL=y
 # CONFIG_KPROBES is not set
 
 #
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e5df167f782..727a6699f2f 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -122,6 +122,7 @@ extern struct timezone sys_tz;
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
diff --git a/arch/powerpc/oprofile/Kconfig b/arch/powerpc/oprofile/Kconfig
index eb2dece76a5..7089e79689b 100644
--- a/arch/powerpc/oprofile/Kconfig
+++ b/arch/powerpc/oprofile/Kconfig
@@ -15,3 +15,10 @@ config OPROFILE
 
 	  If unsure, say N.
 
+config OPROFILE_CELL
+	bool "OProfile for Cell Broadband Engine"
+	depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m)
+	default y
+	help
+	  Profiling of Cell BE SPUs requires special support enabled
+	  by this option.
diff --git a/arch/powerpc/oprofile/Makefile b/arch/powerpc/oprofile/Makefile
index 4b5f9528218..c5f64c3bd66 100644
--- a/arch/powerpc/oprofile/Makefile
+++ b/arch/powerpc/oprofile/Makefile
@@ -11,7 +11,9 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
+		cell/spu_profiler.o cell/vma_map.o \
+		cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o op_model_pa6t.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
new file mode 100644
index 00000000000..e5704f00c8b
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -0,0 +1,97 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+#include "../../platforms/cell/cbe_regs.h"
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct spu_overlay_info {	/* one _ovly_table entry (an overlay section) */
+	unsigned int vma;	/* SPU virtual memory address from elf */
+	unsigned int size;	/* size of section from elf */
+	unsigned int offset;	/* offset of section into elf file */
+	unsigned int buf;	/* 1-based index into _ovly_buf_table */
+};
+
+struct vma_to_fileoffset_map {	/* map of sections within an SPU program */
+	struct vma_to_fileoffset_map *next;	/* list pointer */
+	unsigned int vma;	/* SPU virtual memory address from elf */
+	unsigned int size;	/* size of section from elf */
+	unsigned int offset;	/* offset of section into elf file */
+	unsigned int guard_ptr;	/* local-store offset of guard word; 0 if none */
+	unsigned int guard_val;	/* guard word value when section is loaded */
+	/*
+	 * The guard pointer is an entry in the _ovly_buf_table,
+	 * computed using ovly.buf as the index into the table.  Since
+	 * ovly.buf values begin at '1' to reference the first (or 0th)
+	 * entry in the _ovly_buf_table, the computation subtracts 1
+	 * from ovly.buf.
+	 * The guard value is stored in the _ovly_buf_table entry and
+	 * is an index (starting at 1) back to the _ovly_table entry
+	 * that is pointing at this _ovly_buf_table entry.  So, for
+	 * example, for an overlay scenario with one overlay segment
+	 * and two overlay sections:
+	 *      - Section 1 points to the first entry of the
+	 *        _ovly_buf_table, which contains a guard value
+	 *        of '1', referencing the first (index=0) entry of
+	 *        _ovly_table.
+	 *      - Section 2 points to the second entry of the
+	 *        _ovly_buf_table, which contains a guard value
+	 *        of '2', referencing the second (index=1) entry of
+	 *        _ovly_table.
+	 */
+
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map *create_vma_map(const struct spu *spu,
+					     u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map,
+			    unsigned int vma, const struct spu *aSpu,
+			    int *grd_val);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+int start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int *samples,
+		     int num_samples);
+
+void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif	  /* PR_UTIL_H */
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
new file mode 100644
index 00000000000..380d7e21753
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -0,0 +1,221 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *	    Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14
+
+static u32 *samples;
+
+static int spu_prof_running;
+static unsigned int profiling_interval;
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE	     8
+
+#define SPU_PC_MASK	     0xFFFF
+
+static DEFINE_SPINLOCK(sample_array_lock);
+unsigned long sample_array_lock_flags;
+
+void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long ns_per_cyc;
+
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;	/* no value given: use ppc_proc_freq */
+
+	/* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT, which provides 4 decimal places
+	 * of precision.  This is close enough for the purpose at hand.
+	 *
+	 * The value of the timeout should be small enough that the hw
+	 * trace buffer will not get more than about 1/3 full for the
+	 * maximum user specified (the LFSR value) hw sampling frequency.
+	 * This is to ensure the trace buffer will never fill even if the
+	 * kernel thread scheduling varies under a heavy system load.
+	 */
+
+	ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;	/* scaled by 2^SCALE_SHIFT */
+	profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT;	/* timer period, ns */
+
+}
+
+/* Read one 128-bit trace buffer entry for 'cpu' and scatter its eight
+ * SPU PCs into samples[], one row per SPU, at column 'entry'.
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+	/* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_mask;
+	int spu;
+
+	spu_mask = SPU_PC_MASK;
+
+	/* Each SPU PC is 16 bits; hence, four spus in each of
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.	Process two 64-bit values
+	 * simultaneously, lowest 16 bits (highest SPU) first.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter, hence the shift by 2
+		 */
+		samples[spu * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[0]) << 2;
+		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[1]) << 2;
+
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* Drain the trace buffer into samples[] until it reports empty or
+	 * TRACE_ARRAY_SIZE entries were read; returns the entry count. */
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE)
+			/* the samples array is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+
+	return entry;
+}
+
+
+static enum hrtimer_restart profile_spus(struct hrtimer *timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))	/* skip non-zero hw threads */
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		/* There should only be one kernel thread at a time processing
+		 * the samples.	 In the very unlikely case that the processing
+		 * takes so long that multiple kernel threads are started to
+		 * process the samples, the lock makes sure only one of them
+		 * works on the samples array at a time.  The sample array
+		 * must be loaded and then processed for a given cpu while
+		 * the lock is held, because the sample array is not per cpu.
+		 */
+		spin_lock_irqsave(&sample_array_lock,
+				  sample_array_lock_flags);
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0) {
+			spin_unlock_irqrestore(&sample_array_lock,
+					       sample_array_lock_flags);
+			continue;
+		}
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num,
+					samples + (k * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+
+		spin_unlock_irqrestore(&sample_array_lock,
+				       sample_array_lock_flags);
+
+	}
+	smp_wmb();	/* ensure spu event buffer updates are written */
+			/* don't want events intermingled... */
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the user's SPU_CYCLES count value; it is not read here,
+ * the timer period comes from profiling_interval.
+ * Returns 0 on success or -ENOMEM if the samples array can't be allocated.
+ */
+int start_spu_profiling(unsigned int cycles_reset)
+{
+	ktime_t kt;
+
+	pr_debug("timer resolution: %lu\n", TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+	/* Allocate arrays for collecting SPU PC samples */
+	samples = kzalloc(SPUS_PER_NODE *
+			  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	if (!samples)
+		return -ENOMEM;
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+
+	return 0;
+}
+
+void stop_spu_profiling(void)
+{
+	spu_prof_running = 0;	/* tells profile_spus() not to restart */
+	hrtimer_cancel(&timer);
+	kfree(samples);		/* allocated in start_spu_profiling() */
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
new file mode 100644
index 00000000000..133665754a7
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -0,0 +1,484 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer.
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recording actual SPU program counter samples to the event buffer.
+ */
+#include <linux/dcookies.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/numa.h>
+#include <linux/oprofile.h>
+#include <linux/spinlock.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static DEFINE_SPINLOCK(buffer_lock);
+static DEFINE_SPINLOCK(cache_lock);
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+int last_guard_val[MAX_NUMNODES * 8];
+
+/* Container for caching information about an active SPU task. */
+struct cached_info {
+	struct vma_to_fileoffset_map *map;	/* freed in destroy_cached_info() */
+	struct spu *the_spu;	/* needed to access pointer to local_store */
+	struct kref cache_ref;	/* also handed to SPUFS via the SPU context */
+};
+
+static struct cached_info *spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref *kref)
+{
+	struct cached_info *info;	/* the container embedding 'kref' */
+
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+	module_put(THIS_MODULE);	/* pairs with try_module_get() at creation */
+}
+
+/* Return the cached_info for spu_num (NULL if invalid or none).  If none is
+ * cached and the_spu is set, adopt the one stored in the_spu->ctx (kref_get).
+ * ATTENTION:  Callers must hold cache_lock if needed before calling this.
+ */
+static struct cached_info *get_cached_info(struct spu *the_spu, int spu_num)
+{
+	struct kref *ref;
+	struct cached_info *ret_info;
+
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num);
+		ret_info = NULL;
+		goto out;
+	}
+	if (!spu_info[spu_num] && the_spu) {
+		ref = spu_get_profile_private_kref(the_spu->ctx);
+		if (ref) {
+			spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref);
+			kref_get(&spu_info[spu_num]->cache_ref);
+		}
+	}
+
+	ret_info = spu_info[spu_num];
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -ENOMEM.
+ */
+static int
+prepare_cached_spu_info(struct spu *spu, unsigned long objectId)
+{
+	unsigned long flags;
+	struct vma_to_fileoffset_map *new_map;
+	int retval = 0;
+	struct cached_info *info;
+
+	/* We won't bother getting cache_lock here since we
+	 * don't do anything with the cached_info that's returned.
+	 */
+	info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create cached_info failed\n",
+		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+
+	/* We increment the module refcount here since SPUFS is
+	 * responsible for the final destruction of the cached_info,
+	 * and it must be able to access the destroy_cached_info()
+	 * function defined in the OProfile module.  We decrement
+	 * the module refcount in destroy_cached_info.
+	 */
+	try_module_get(THIS_MODULE);
+	spu_set_profile_private_kref(spu->ctx, &info->cache_ref,
+				destroy_cached_info);
+	spin_unlock_irqrestore(&cache_lock, flags);
+	goto out;
+
+err_alloc:
+	kfree(info);	/* info is NULL when its own allocation failed */
+out:
+	return retval;
+}
+
+/*
+ * Drop the cache's reference on one spu_info[] slot, or on all of them when
+ * spu_index is RELEASE_ALL.  Always returns 0.  Caller must hold cache_lock.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+		if (spu_index >= num_spu_nodes) {
+			printk(KERN_ERR "SPU_PROF: "
+				"%s, line %d: "
+				"Invalid index %d into spu info cache\n",
+				__FUNCTION__, __LINE__, spu_index);
+			goto out;
+		}
+		end = spu_index + 1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref,
+				 destroy_cached_info);
+			spu_info[index] = NULL;	/* slot no longer holds a ref */
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry *dentry,
+					     struct vfsmount *vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)	/* already has one: the dentry address is returned */
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name", and return it
+ * (0 if spu->mm is NULL or no such mapping exists).  Also locate the
+ * mapping containing spu_ref, the SPU ELF object: *offsetp is set to
+ * the object's offset within that mapping and *spu_bin_dcookie to the
+ * dcookie of the backing file.  A non-zero offset implies an SPU ELF
+ * embedded in the executable or in another file (i.e., shared lib);
+ * a zero offset means a separate SPU binary.  If no file-backed
+ * mapping contains spu_ref, *spu_bin_dcookie is left untouched.
+ */
+static unsigned long
+get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
+			    unsigned long *spu_bin_dcookie,
+			    unsigned long spu_ref)
+{
+	unsigned long app_cookie = 0;
+	unsigned int my_offset = 0;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	down_read(&mm->mmap_sem);
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		app_cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		if (!vma->vm_file)
+			goto fail_no_image_cookie;
+
+		pr_debug("Found spu ELF at %X(object-id:%lx) for file %s\n",
+			 my_offset, spu_ref,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		break;
+	}
+	if (!vma)	/* no mapping contains spu_ref: nothing to dereference */
+		goto fail_no_image_cookie;
+
+	*spu_bin_dcookie = fast_get_dcookie(vma->vm_file->f_dentry,
+						 vma->vm_file->f_vfsmnt);
+	pr_debug("got dcookie for %s\n", vma->vm_file->f_dentry->d_name.name);
+
+	up_read(&mm->mmap_sem);
+
+out:
+	return app_cookie;
+
+fail_no_image_cookie:
+	up_read(&mm->mmap_sem);
+
+	printk(KERN_ERR "SPU_PROF: "
+		"%s, line %d: Cannot find dcookie for SPU binary\n",
+		__FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.  Returns 0, or a negative errno if either step fails.
+ */
+static int process_context_switch(struct spu *spu, unsigned long objectId)
+{
+	unsigned long flags;
+	int retval;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie;
+
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval)
+		goto out;
+
+	/* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId);
+	if (!app_dcookie || !spu_cookie) {
+		retval  = -ENOENT;
+		goto out;
+	}
+
+	/* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+	add_event_entry(spu_cookie);
+	add_event_entry(offset);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();	/* ensure spu event buffer updates are written */
+			/* don't want entries intermingled... */
+out:
+	return retval;
+}
+
+/*
+ * This function is invoked on either a bind_context or unbind_context.
+ * If called for an unbind_context, the val arg is 0; otherwise,
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block *self, unsigned long val,
+				void *data)
+{
+	int retval;
+	unsigned long flags;
+	struct spu *the_spu = data;
+
+	pr_debug("SPU event notification arrived\n");
+	if (!val) {	/* unbind: drop the cache entry for this SPU */
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {	/* bind: val is the object-id of the new context */
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {	/* (un)registered by spu_sync_start/stop */
+	.notifier_call = spu_active_notify,
+};
+
+static int number_of_online_nodes(void)
+{
+	u32 cpu;
+	int nodes = 0;	/* ends up as the highest online node id + 1 */
+	for_each_online_cpu(cpu) {
+		int tmp = cbe_cpu_to_node(cpu) + 1;
+		if (tmp > nodes)
+			nodes = tmp;
+	}
+	return nodes;
+}
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.	 A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void)
+{
+	int k;
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;	/* bound for spu_info[] indices */
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+	/* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	for (k = 0; k < (MAX_NUMNODES * 8); k++)
+		last_guard_val[k] = 0;	/* reset remembered overlay guard values */
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int *samples,
+		     int num_samples)
+{
+	unsigned long long file_offset;
+	unsigned long flags;
+	int i;
+	struct vma_to_fileoffset_map *map;
+	struct spu *the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info *c_info;
+
+	/* We need to obtain the cache_lock here because it's
+	 * possible that after getting the cached_info, the SPU job
+	 * corresponding to this cached_info may end, thus resulting
+	 * in the destruction of the cached_info.
+	 */
+	spin_lock_irqsave(&cache_lock, flags);
+	c_info = get_cached_info(NULL, spu_num);
+	if (!c_info) {
+		/* This legitimately happens when the SPU task ends before all
+		 * samples are recorded.
+		 * No big deal -- so we just drop a few samples.
+		 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		goto out;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock(&buffer_lock);
+	for (i = 0; i < num_samples; i++) {
+		unsigned int sample = *(samples+i);
+		int grd_val = 0;
+		file_offset = 0;
+		if (sample == 0)	/* zero samples are not recorded */
+			continue;
+		file_offset = vma_map_lookup( map, sample, the_spu, &grd_val);
+
+		/* If overlays are used by this SPU application, the guard
+		 * value is non-zero, indicating which overlay section is in
+		 * use.	 We need to discard samples taken during the time
+		 * period in which an overlay switch occurs (i.e., the guard
+		 * value changes).
+		 */
+		if (grd_val && grd_val != last_guard_val[spu_num]) {
+			last_guard_val[spu_num] = grd_val;
+			/* Drop the rest of the samples. */
+			break;
+		}
+
+		add_event_entry(file_offset | spu_num_shifted);	/* SPU number in bits 32+ */
+	}
+	spin_unlock(&buffer_lock);
+out:
+	spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {	/* on failure the cache is left untouched */
+		printk(KERN_ERR "SPU_PROF: "
+			"%s, line %d: spu_switch_event_unregister returned %d\n",
+			__FUNCTION__, __LINE__, ret);
+		goto out;
+	}
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);	/* drop every cached entry */
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
diff --git a/arch/powerpc/oprofile/cell/vma_map.c b/arch/powerpc/oprofile/cell/vma_map.c
new file mode 100644
index 00000000000..76ec1d16aef
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/vma_map.c
@@ -0,0 +1,287 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {	/* free every node of the list; NULL map is a no-op */
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu *aSpu, int *grd_val)
+{
+	/*
+	 * Default the offset to the passed vma + a flag value (0x10000000).
+	 * Addresses of dynamically generated code can't be found in the vma
+	 * map.  For those addresses the flagged value will be sent on to
+	 * the user space tools so they can be reported rather than just
+	 * thrown away.  *grd_val is written only when an overlay map matches.
+	 */
+	u32 offset = 0x10000000 + vma;
+	u32 ovly_grd;
+
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {	/* guarded entry: compare the guard word */
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+			*grd_val = ovly_grd;
+		}
+		offset = vma - map->vma + map->offset;
+		break;
+	}
+
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map *new =
+		kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);	/* the whole existing list is freed too */
+		return NULL;
+	}
+
+	new->next = map;	/* new entry becomes the head of the list */
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned, or NULL on any failure.  */
+struct vma_to_fileoffset_map *create_vma_map(const struct spu *aSpu,
+					     unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	int grd_val;
+	struct vma_to_fileoffset_map *map = NULL;
+	struct spu_overlay_info ovly;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	/* Get and validate ELF header.	 */
+
+	if (copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr)))
+		goto fail;
+
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_ident parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_machine != EM_SPU) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_machine parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_type parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		if (copy_from_user(&phdr,
+				   (void *) (phdr_start + i * sizeof(phdr)),
+				   sizeof(phdr)))
+			goto fail;
+
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz,
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			goto fail;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		if (copy_from_user(&shdr,
+				   (void *) (shdr_start + i * sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		if (copy_from_user(&shdr_str,
+				   (void *) (shdr_start + shdr.sh_link *
+					     sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr_str.sh_type != SHT_STRTAB)
+			goto fail;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			if (copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j *
+							   sizeof (sym)),
+					   sizeof (sym)))
+				goto fail;
+
+			if (copy_from_user(name, (void *)
+					   (spu_elf_start + shdr_str.sh_offset +
+					    sym.st_name),
+					   20))
+				goto fail;
+
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		goto out;
+	}
+	pr_debug("SPU_PROF: Overlay table found\n");
+
+	/* The _ovly_table symbol represents a table with one entry
+	 * per overlay section.	 The _ovly_buf_table symbol represents
+	 * a table with one entry per overlay region.  The struct
+	 * spu_overlay_info gives the structure of the _ovly_table entries;
+	 * _ovly_buf_table is simply one u32 word per entry.
+	 * vma_map_lookup() reports a miss as 0x10000000 + vma.
+	 */
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym,
+					    aSpu, &grd_val);
+	if (overlay_tbl_offset > 0x10000000) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+
+	n_ovlys = (ovly_table_end_sym -
+		   ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		if (copy_from_user(&ovly, (void *)
+				   (ovly_table + i * sizeof (ovly)),
+				   sizeof (ovly)))
+			goto fail;
+
+		/* The ovly.vma/size/offset arguments are analogous to the same
+		 * arguments used above for non-overlay maps.  The final two
+		 * args are referred to as the guard pointer and the guard
+		 * value.
+		 * The guard pointer is an entry in the _ovly_buf_table,
+		 * computed using ovly.buf as the index into the table.	 Since
+		 * ovly.buf values begin at '1' to reference the first (or 0th)
+		 * entry in the _ovly_buf_table, the computation subtracts 1
+		 * from ovly.buf.
+		 * The guard value is stored in the _ovly_buf_table entry and
+		 * is an index (starting at 1) back to the _ovly_table entry
+		 * that is pointing at this _ovly_buf_table entry.  So, for
+		 * example, for an overlay scenario with one overlay segment
+		 * and two overlay sections:
+		 *	- Section 1 points to the first entry of the
+		 *	  _ovly_buf_table, which contains a guard value
+		 *	  of '1', referencing the first (index=0) entry of
+		 *	  _ovly_table.
+		 *	- Section 2 points to the second entry of the
+		 *	  _ovly_buf_table, which contains a guard value
+		 *	  of '2', referencing the second (index=1) entry of
+		 *	  _ovly_table.
+		 */
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				  ovly_buf_table_sym + (ovly.buf-1) * 4, i+1);
+		if (!map)
+			goto fail;
+	}
+	goto out;
+
+ fail:
+	vma_map_free(map);	/* no-op if vma_map_add() already freed the list */
+	map = NULL;
+ out:
+	return map;
+}
diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c
index 1a7ef7e246d..a28cce1d6c2 100644
--- a/arch/powerpc/oprofile/common.c
+++ b/arch/powerpc/oprofile/common.c
@@ -29,6 +29,8 @@ static struct op_powerpc_model *model;
 static struct op_counter_config ctr[OP_MAX_COUNTER];
 static struct op_system_config sys;
 
+static int op_per_cpu_rc;
+
 static void op_handle_interrupt(struct pt_regs *regs)
 {
 	model->handle_interrupt(regs, ctr);
@@ -36,25 +38,41 @@ static void op_handle_interrupt(struct pt_regs *regs)
 
 static void op_powerpc_cpu_setup(void *dummy)
 {
-	model->cpu_setup(ctr);
+	int ret;
+
+	ret = model->cpu_setup(ctr);
+
+	if (ret != 0)
+		op_per_cpu_rc = ret;
 }
 
 static int op_powerpc_setup(void)
 {
 	int err;
 
+	op_per_cpu_rc = 0;
+
 	/* Grab the hardware */
 	err = reserve_pmc_hardware(op_handle_interrupt);
 	if (err)
 		return err;
 
 	/* Pre-compute the values to stuff in the hardware registers.  */
-	model->reg_setup(ctr, &sys, model->num_counters);
+	op_per_cpu_rc = model->reg_setup(ctr, &sys, model->num_counters);
 
-	/* Configure the registers on all cpus.  */
+	if (op_per_cpu_rc)
+		goto out;
+
+	/* Configure the registers on all cpus.	 If an error occurs on one
+	 * of the cpus, op_per_cpu_rc will be set to the error */
 	on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1);
 
-	return 0;
+out:	if (op_per_cpu_rc) {
+		/* error on setup release the performance counter hardware */
+		release_pmc_hardware();
+	}
+
+	return op_per_cpu_rc;
 }
 
 static void op_powerpc_shutdown(void)
@@ -64,16 +82,29 @@ static void op_powerpc_shutdown(void)
 
 static void op_powerpc_cpu_start(void *dummy)
 {
-	model->start(ctr);
+	/* If any of the cpus have returned an error, set the
+	 * global flag to the error so it can be returned
+	 * to the generic OProfile caller.
+	 */
+	int ret;
+
+	ret = model->start(ctr);
+	if (ret != 0)
+		op_per_cpu_rc = ret;
 }
 
 static int op_powerpc_start(void)
 {
+	op_per_cpu_rc = 0;
+
 	if (model->global_start)
-		model->global_start(ctr);
-	if (model->start)
+		return model->global_start(ctr);
+	if (model->start) {
 		on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1);
-	return 0;
+		return op_per_cpu_rc;
+	}
+	return -EIO; /* No start function is defined for this
+			power architecture */
 }
 
 static inline void op_powerpc_cpu_stop(void *dummy)
@@ -147,11 +178,13 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 
 	switch (cur_cpu_spec->oprofile_type) {
 #ifdef CONFIG_PPC64
-#ifdef CONFIG_PPC_CELL_NATIVE
+#ifdef CONFIG_OPROFILE_CELL
 		case PPC_OPROFILE_CELL:
 			if (firmware_has_feature(FW_FEATURE_LPAR))
 				return -ENODEV;
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
diff --git a/arch/powerpc/oprofile/op_model_7450.c b/arch/powerpc/oprofile/op_model_7450.c
index 5d1bbaf35cc..cc599eb8768 100644
--- a/arch/powerpc/oprofile/op_model_7450.c
+++ b/arch/powerpc/oprofile/op_model_7450.c
@@ -81,7 +81,7 @@ static void pmc_stop_ctrs(void)
 
 /* Configures the counters on this CPU based on the global
  * settings */
-static void fsl7450_cpu_setup(struct op_counter_config *ctr)
+static int fsl7450_cpu_setup(struct op_counter_config *ctr)
 {
 	/* freeze all counters */
 	pmc_stop_ctrs();
@@ -89,12 +89,14 @@ static void fsl7450_cpu_setup(struct op_counter_config *ctr)
 	mtspr(SPRN_MMCR0, mmcr0_val);
 	mtspr(SPRN_MMCR1, mmcr1_val);
 	mtspr(SPRN_MMCR2, mmcr2_val);
+
+	return 0;
 }
 
 #define NUM_CTRS 6
 
 /* Configures the global settings for the countes on all CPUs. */
-static void fsl7450_reg_setup(struct op_counter_config *ctr,
+static int fsl7450_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -126,10 +128,12 @@ static void fsl7450_reg_setup(struct op_counter_config *ctr,
 		| mmcr1_event6(ctr[5].event);
 
 	mmcr2_val = 0;
+
+	return 0;
 }
 
 /* Sets the counters on this CPU to the chosen values, and starts them */
-static void fsl7450_start(struct op_counter_config *ctr)
+static int fsl7450_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -148,6 +152,8 @@ static void fsl7450_start(struct op_counter_config *ctr)
 	pmc_start_ctrs();
 
 	oprofile_running = 1;
+
+	return 0;
 }
 
 /* Stop the counters on this CPU */
@@ -193,7 +199,7 @@ static void fsl7450_handle_interrupt(struct pt_regs *regs,
 	/* The freeze bit was set by the interrupt. */
 	/* Clear the freeze bit, and reenable the interrupt.
 	 * The counters won't actually start until the rfi clears
-	 * the PMM bit */
+	 * the PM/M bit */
 	pmc_start_ctrs();
 }
 
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index c29293befba..d928b54f3a0 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -5,8 +5,8 @@
  *
  * Author: David Erb (djerb@us.ibm.com)
  * Modifications:
- *         Carl Love <carll@us.ibm.com>
- *         Maynard Johnson <maynardj@us.ibm.com>
+ *	   Carl Love <carll@us.ibm.com>
+ *	   Maynard Johnson <maynardj@us.ibm.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -38,12 +38,25 @@
 
 #include "../platforms/cell/interrupt.h"
 #include "../platforms/cell/cbe_regs.h"
+#include "cell/pr_util.h"
+
+static void cell_global_stop_spu(void);
+
+/*
+ * spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2	/*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
-#define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
-                                 * PPU_CYCLES event
-                                 */
-#define CBE_COUNT_ALL_CYCLES 0x42800000	/* PPU cycle event specifier */
+#define PPU_CYCLES_GRP_NUM   1	/* special group number for identifying
+				 * PPU_CYCLES event
+				 */
+#define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */
 
 #define NUM_THREADS 2         /* number of physical threads in
 			       * physical processor
@@ -51,6 +64,7 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
+#define MAX_SPU_COUNT 0xFFFFFF	/* maximum 24 bit LFSR value */
 
 struct pmc_cntrl_data {
 	unsigned long vcntr;
@@ -62,11 +76,10 @@ struct pmc_cntrl_data {
 /*
  * ibm,cbe-perftools rtas parameters
  */
-
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
-	u16 sub_unit;		/* hw subunit this applies to (if applicable) */
-	short int signal_group;	/* Signal Group to Enable/Disable */
+	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
+	short int signal_group; /* Signal Group to Enable/Disable */
 	u8 bus_word;		/* Enable/Disable on this Trace/Trigger/Event
 				 * Bus Word(s) (bitmask)
 				 */
@@ -112,21 +125,42 @@ static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
 
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 
-/* Interpetation of hdw_thread:
+/*
+ * The CELL profiling code makes rtas calls to setup the debug bus to
+ * route the performance signals.  Additionally, SPU profiling requires
+ * a second rtas call to setup the hardware to capture the SPU PCs.
+ * The EIO error value is returned if the token lookups or the rtas
+ * call fail.  The EIO error number is the best choice of the existing
+ * error numbers.  The probability of rtas related error is very low.  But
+ * by returning EIO and printing additional information to dmesg the user
+ * will know that OProfile did not start and dmesg will tell them why.
+ * OProfile does not support returning errors on Stop.	Not a huge issue
+ * since failure to reset the debug bus or stop the SPU PC collection is
+ * not a fatal issue.  Chances are if the Stop failed, Start doesn't work
+ * either.
+ */
+
+/*
+ * Interpretation of hdw_thread:
  * 0 - even virtual cpus 0, 2, 4,...
  * 1 - odd virtual cpus 1, 3, 5, ...
+ *
+ * FIXME: this is strictly wrong, we need to clean this up in a number
+ * of places. It works for now. -arnd
  */
 static u32 hdw_thread;
 
 static u32 virt_cntr_inter_mask;
 static struct timer_list timer_virt_cntr;
 
-/* pm_signal needs to be global since it is initialized in
+/*
+ * pm_signal needs to be global since it is initialized in
  * cell_reg_setup at the time when the necessary information
  * is available.
  */
 static struct pm_signal pm_signal[NR_PHYS_CTRS];
-static int pm_rtas_token;
+static int pm_rtas_token;    /* token for debug bus setup call */
+static int spu_rtas_token;   /* token for SPU cycle profiling */
 
 static u32 reset_value[NR_PHYS_CTRS];
 static int num_counters;
@@ -147,8 +181,8 @@ rtas_ibm_cbe_perftools(int subfunc, int passthru,
 {
 	u64 paddr = __pa(address);
 
-	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
-			 paddr >> 32, paddr & 0xffffffff, length);
+	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc,
+			 passthru, paddr >> 32, paddr & 0xffffffff, length);
 }
 
 static void pm_rtas_reset_signals(u32 node)
@@ -156,12 +190,13 @@ static void pm_rtas_reset_signals(u32 node)
 	int ret;
 	struct pm_signal pm_signal_local;
 
-	/*  The debug bus is being set to the passthru disable state.
-	 *  However, the FW still expects atleast one legal signal routing
-	 *  entry or it will return an error on the arguments.  If we don't
-	 *  supply a valid entry, we must ignore all return values.  Ignoring
-	 *  all return values means we might miss an error we should be
-	 *  concerned about.
+	/*
+	 * The debug bus is being set to the passthru disable state.
+	 * However, the FW still expects at least one legal signal routing
+	 * entry or it will return an error on the arguments.	If we don't
+	 * supply a valid entry, we must ignore all return values.  Ignoring
+	 * all return values means we might miss an error we should be
+	 * concerned about.
 	 */
 
 	/*  fw expects physical cpu #. */
@@ -175,18 +210,24 @@ static void pm_rtas_reset_signals(u32 node)
 				     &pm_signal_local,
 				     sizeof(struct pm_signal));
 
-	if (ret)
+	if (unlikely(ret))
+		/*
+		 * Not a fatal error. For Oprofile stop, the oprofile
+		 * functions do not support returning an error for
+		 * failure to stop OProfile.
+		 */
 		printk(KERN_WARNING "%s: rtas returned: %d\n",
 		       __FUNCTION__, ret);
 }
 
-static void pm_rtas_activate_signals(u32 node, u32 count)
+static int pm_rtas_activate_signals(u32 node, u32 count)
 {
 	int ret;
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
-	/* There is no debug setup required for the cycles event.
+	/*
+	 * There is no debug setup required for the cycles event.
 	 * Note that only events in the same group can be used.
 	 * Otherwise, there will be conflicts in correctly routing
 	 * the signals on the debug bus.  It is the responsiblity
@@ -213,10 +254,14 @@ static void pm_rtas_activate_signals(u32 node, u32 count)
 					     pm_signal_local,
 					     i * sizeof(struct pm_signal));
 
-		if (ret)
+		if (unlikely(ret)) {
 			printk(KERN_WARNING "%s: rtas returned: %d\n",
 			       __FUNCTION__, ret);
+			return -EIO;
+		}
 	}
+
+	return 0;
 }
 
 /*
@@ -260,11 +305,12 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask)
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
 
-	/* Some of the islands signal selection is based on 64 bit words.
+	/*
+	 * Some of the islands signal selection is based on 64 bit words.
 	 * The debug bus words are 32 bits, the input words to the performance
 	 * counters are defined as 32 bits.  Need to convert the 64 bit island
 	 * specification to the appropriate 32 input bit and bus word for the
-	 * performance counter event selection.  See the CELL Performance
+	 * performance counter event selection.	 See the CELL Performance
 	 * monitoring signals manual and the Perf cntr hardware descriptions
 	 * for the details.
 	 */
@@ -298,6 +344,7 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask)
 					input_bus[j] = i;
 					pm_regs.group_control |=
 					    (i << (31 - i));
+
 					break;
 				}
 			}
@@ -309,7 +356,8 @@ out:
 
 static void write_pm_cntrl(int cpu)
 {
-	/* Oprofile will use 32 bit counters, set bits 7:10 to 0
+	/*
+	 * Oprofile will use 32 bit counters, set bits 7:10 to 0
 	 * pmregs.pm_cntrl is a global
 	 */
 
@@ -326,7 +374,8 @@ static void write_pm_cntrl(int cpu)
 	if (pm_regs.pm_cntrl.freeze == 1)
 		val |= CBE_PM_FREEZE_ALL_CTRS;
 
-	/* Routine set_count_mode must be called previously to set
+	/*
+	 * Routine set_count_mode must be called previously to set
 	 * the count mode based on the user selection of user and kernel.
 	 */
 	val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
@@ -336,7 +385,8 @@ static void write_pm_cntrl(int cpu)
 static inline void
 set_count_mode(u32 kernel, u32 user)
 {
-	/* The user must specify user and kernel if they want them. If
+	/*
+	 * The user must specify user and kernel if they want them. If
 	 *  neither is specified, OProfile will count in hypervisor mode.
 	 *  pm_regs.pm_cntrl is a global
 	 */
@@ -364,7 +414,7 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl)
 
 /*
  * Oprofile is expected to collect data on all CPUs simultaneously.
- * However, there is one set of performance counters per node.  There are
+ * However, there is one set of performance counters per node.	There are
  * two hardware threads or virtual CPUs on each node.  Hence, OProfile must
  * multiplex in time the performance counter collection on the two virtual
  * CPUs.  The multiplexing of the performance counters is done by this
@@ -377,19 +427,19 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl)
  * pair of per-cpu arrays is used for storing the previous and next
  * pmc values for a given node.
  * NOTE: We use the per-cpu variable to improve cache performance.
+ *
+ * This routine will alternate loading the virtual counters for
+ * virtual CPUs
  */
 static void cell_virtual_cntr(unsigned long data)
 {
-	/* This routine will alternate loading the virtual counters for
-	 * virtual CPUs
-	 */
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
 	unsigned long flags;
 
-	/* Make sure that the interrupt_hander and
-	 * the virt counter are not both playing with
-	 * the counters on the same node.
+	/*
+	 * Make sure that the interrupt handler and the virt counter are
+	 * not both playing with the counters on the same node.
 	 */
 
 	spin_lock_irqsave(&virt_cntr_lock, flags);
@@ -400,22 +450,25 @@ static void cell_virtual_cntr(unsigned long data)
 	hdw_thread = 1 ^ hdw_thread;
 	next_hdw_thread = hdw_thread;
 
-	for (i = 0; i < num_counters; i++)
-	/* There are some per thread events.  Must do the
+	/*
+	 * There are some per thread events.  Must do the
 	 * set event, for the thread that is being started
 	 */
+	for (i = 0; i < num_counters; i++)
 		set_pm_event(i,
 			pmc_cntrl[next_hdw_thread][i].evnts,
 			pmc_cntrl[next_hdw_thread][i].masks);
 
-	/* The following is done only once per each node, but
+	/*
+	 * The following is done only once per each node, but
 	 * we need cpu #, not node #, to pass to the cbe_xxx functions.
 	 */
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
 
-		/* stop counters, save counter values, restore counts
+		/*
+		 * stop counters, save counter values, restore counts
 		 * for previous thread
 		 */
 		cbe_disable_pm(cpu);
@@ -428,7 +481,7 @@ static void cell_virtual_cntr(unsigned long data)
 			    == 0xFFFFFFFF)
 				/* If the cntr value is 0xffffffff, we must
 				 * reset that to 0xfffffff0 when the current
-				 * thread is restarted.  This will generate a
+				 * thread is restarted.	 This will generate a
 				 * new interrupt and make sure that we never
 				 * restore the counters to the max value.  If
 				 * the counters were restored to the max value,
@@ -444,13 +497,15 @@ static void cell_virtual_cntr(unsigned long data)
 						      next_hdw_thread)[i]);
 		}
 
-		/* Switch to the other thread. Change the interrupt
+		/*
+		 * Switch to the other thread. Change the interrupt
 		 * and control regs to be scheduled on the CPU
 		 * corresponding to the thread to execute.
 		 */
 		for (i = 0; i < num_counters; i++) {
 			if (pmc_cntrl[next_hdw_thread][i].enabled) {
-				/* There are some per thread events.
+				/*
+				 * There are some per thread events.
 				 * Must do the set event, enable_cntr
 				 * for each cpu.
 				 */
@@ -482,17 +537,42 @@ static void start_virt_cntrs(void)
 }
 
 /* This function is called once for all cpus combined */
-static void
-cell_reg_setup(struct op_counter_config *ctr,
-	       struct op_system_config *sys, int num_ctrs)
+static int cell_reg_setup(struct op_counter_config *ctr,
+			struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
+
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+
+		/*
+		 * Each node will need to make the rtas call to start
+		 * and stop SPU profiling.  Get the token once and store it.
+		 */
+		spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
+
+		if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+			return -EIO;
+		}
+	}
 
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
+
+	/*
+	 * For all events except PPU CYCLEs, each node will need to make
+	 * the rtas cbe-perftools call to setup and reset the debug bus.
+	 * Make the token lookup call once and store it in the global
+	 * variable pm_rtas_token.
+	 */
+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+		printk(KERN_ERR
+		       "%s: rtas token ibm,cbe-perftools unknown\n",
 		       __FUNCTION__);
-		goto out;
+		return -EIO;
 	}
 
 	num_counters = num_ctrs;
@@ -520,7 +600,8 @@ cell_reg_setup(struct op_counter_config *ctr,
 			per_cpu(pmc_values, j)[i] = 0;
 	}
 
-	/* Setup the thread 1 events, map the thread 0 event to the
+	/*
+	 * Setup the thread 1 events, map the thread 0 event to the
 	 * equivalent thread 1 event.
 	 */
 	for (i = 0; i < num_ctrs; ++i) {
@@ -544,9 +625,10 @@ cell_reg_setup(struct op_counter_config *ctr,
 	for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
 		input_bus[i] = 0xff;
 
-	/* Our counters count up, and "count" refers to
+	/*
+	 * Our counters count up, and "count" refers to
 	 * how much before the next interrupt, and we interrupt
-	 * on overflow.  So we calculate the starting value
+	 * on overflow.	 So we calculate the starting value
 	 * which will give us "count" until overflow.
 	 * Then we set the events on the enabled counters.
 	 */
@@ -569,28 +651,27 @@ cell_reg_setup(struct op_counter_config *ctr,
 		for (i = 0; i < num_counters; ++i) {
 			per_cpu(pmc_values, cpu)[i] = reset_value[i];
 		}
-out:
-	;
+
+	return 0;
 }
 
+
+
 /* This function is called once for each cpu */
-static void cell_cpu_setup(struct op_counter_config *cntr)
+static int cell_cpu_setup(struct op_counter_config *cntr)
 {
 	u32 cpu = smp_processor_id();
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return 0;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
 	if (cbe_get_hw_thread_id(cpu))
-		goto out;
-
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
-		       __FUNCTION__);
-		goto out;
-	}
+		return 0;
 
 	/* Stop all counters */
 	cbe_disable_pm(cpu);
@@ -609,16 +690,286 @@ static void cell_cpu_setup(struct op_counter_config *cntr)
 		}
 	}
 
-	pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+	/*
+	 * The pm_rtas_activate_signals will return -EIO if the FW
+	 * call failed.
+	 */
+	return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+}
+
+#define ENTRIES	 303
+#define MAXLFSR	 0xFFFFFF
+
+/* precomputed table of 24 bit LFSR values */
+static int initial_lfsr[] = {
+ 8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
+ 15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
+ 4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
+ 3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
+ 9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026,
+ 2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556,
+ 3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769,
+ 14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893,
+ 11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017,
+ 6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756,
+ 15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558,
+ 7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401,
+ 16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720,
+ 15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042,
+ 15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955,
+ 10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934,
+ 3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783,
+ 3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278,
+ 8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051,
+ 8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741,
+ 4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972,
+ 16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302,
+ 2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384,
+ 14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469,
+ 1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697,
+ 6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398,
+ 10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140,
+ 10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214,
+ 14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386,
+ 7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087,
+ 9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130,
+ 14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300,
+ 13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475,
+ 5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950,
+ 3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
+ 6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
+ 7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
+ 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
+};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.	 An LFSR sequence is like a pseudo random number sequence
+ * where each number occurs once in the sequence but the sequence is not in
+ * numerical order. The SPU PC capture is done when the LFSR sequence reaches
+ * the last value in the sequence.  Hence the user specified value N
+ * corresponds to the LFSR number that is N from the end of the sequence.
+ *
+ * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
+ * LFSR sequence is broken into four ranges.  The spacing of the precomputed
+ * values is adjusted in each range so the error between the user specified
+ * number (N) of events between samples and the actual number of events based
+ * on the precomputed value will be less than about 6.2%.  Note, if the user
+ * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
+ * This is to prevent the loss of samples because the trace buffer is full.
+ *
+ *	   User specified N		     Step between	   Index in
+ *					 precomputed values	 precomputed
+ *								    table
+ * 0		    to	2^16-1			----		      0
+ * 2^16	    to	2^16+2^19-1		2^12		    1 to 128
+ * 2^16+2^19	    to	2^16+2^19+2^22-1	2^15		  129 to 256
+ * 2^16+2^19+2^22  to	2^24-1			2^18		  257 to 302
+ *
+ *
+ * For example, the LFSR values in the second range are computed for 2^16,
+ * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indices
+ * 1, 2,..., 127, 128.
+ *
+ * The 24 bit LFSR value for the nth number in the sequence can be
+ * calculated using the following code:
+ *
+ * #define size 24
+ * int calculate_lfsr(int n)
+ * {
+ *	int i;
+ *	unsigned int newlfsr0;
+ *	unsigned int lfsr = 0xFFFFFF;
+ *	unsigned int howmany = n;
+ *
+ *	for (i = 2; i < howmany + 2; i++) {
+ *		newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+ *		((lfsr >> (size - 1 - 1)) & 1) ^
+ *		(((lfsr >> (size - 1 - 6)) & 1) ^
+ *		((lfsr >> (size - 1 - 23)) & 1)));
+ *
+ *		lfsr >>= 1;
+ *		lfsr = lfsr | (newlfsr0 << (size - 1));
+ *	}
+ *	return lfsr;
+ * }
+ */
+
+#define V2_16  (0x1 << 16)
+#define V2_19  (0x1 << 19)
+#define V2_22  (0x1 << 22)
+
+static int calculate_lfsr(int n)
+{
+	/*
+	 * Pick the precomputed LFSR value for n from initial_lfsr[].  The
+	 * ranges and steps are powers of 2, so shifts replace divides.
+	 */
+	int index;
+
+	if ((n >> 16) == 0)
+		index = 0;
+	else if (((n - V2_16) >> 19) == 0)
+		index = ((n - V2_16) >> 12) + 1;
+	else if (((n - V2_16 - V2_19) >> 22) == 0)
+		index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
+	else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
+		index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
+	else
+		index = ENTRIES-1;
+
+	/* valid slots are 0 .. ENTRIES-1; clamp anything else to the last */
+	if ((index >= ENTRIES) || (index < 0))
+		index = ENTRIES-1;
+
+	return initial_lfsr[index];
+}
+
+static int pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NUM_SPUS_PER_NODE];
+
+	/*
+	 * Route the SPU PCs onto the debug bus: one pm_signal entry per
+	 * SPU of this node.  Returns 0, or -EIO if the rtas call fails.
+	 */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		/* spu i on word (i/2) */
+		pm_signal_local[i].bus_word = 1 << i / 2;
+		/* spu i */
+		pm_signal_local[i].sub_unit = i;
+		pm_signal_local[i].bit = 63;
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE,
+				     PASSTHRU_ENABLE, pm_signal_local,
+				     (NUM_SPUS_PER_NODE
+				      * sizeof(struct pm_signal)));
+
+	if (unlikely(ret)) {
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	int ret = 0;
+	struct cpufreq_freqs *frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_spu_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static int cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret;
+	int rtas_error;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0)
+		/* this is not a fatal error */
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_spu_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		/*
+		 * Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		if (spu_cycle_reset > MAX_SPU_COUNT)
+			/* use largest possible value */
+			lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
+		else
+			lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		/* must use a non zero value. Zero disables data collection. */
+		if (lfsr_value == 0)
+			lfsr_value = calculate_lfsr(1);
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct
+						* register location
+						*/
+
+		/* debug bus setup */
+		ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (unlikely(ret)) {
+			rtas_error = ret;
+			goto out;
+		}
+
+
+		subfunc = 2;	/* 2 - activate SPU tracing, 3 - deactivate */
+
+		/* start profiling */
+		ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
+		  cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (unlikely(ret != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, ret);
+			rtas_error = -EIO;
+			goto out;
+		}
+	}
+
+	rtas_error = start_spu_profiling(spu_cycle_reset);
+	if (rtas_error)
+		goto out_stop;
+
+	oprofile_running = 1;
+	return 0;
+
+out_stop:
+	cell_global_stop_spu();		/* clean up the PMU/debug bus */
 out:
-	;
+	return rtas_error;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+static int cell_global_start_ppu(struct op_counter_config *ctr)
 {
-	u32 cpu;
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -651,19 +1002,79 @@ static void cell_global_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 	smp_wmb();
 
-	/* NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
-	 * executed which manipulates the PMU.  We start the "virtual counter"
+	/*
+	 * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
+	 * executed which manipulates the PMU.	We start the "virtual counter"
 	 * here so that we do not need to synchronize access to the PMU in
 	 * the above for-loop.
 	 */
 	start_virt_cntrs();
+
+	return 0;
 }
 
-static void cell_global_stop(void)
+static int cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset)
+		return cell_global_start_spu(ctr);
+	else
+		return cell_global_start_ppu(ctr);
+}
+
+/*
+ * Note the generic OProfile stop calls do not support returning
+ * an error on stop.  Hence, will not return an error if the FW
+ * calls fail on stop.	Failure to reset the debug bus is not an issue.
+ * Failure to disable the SPU profiling is not an issue.  The FW calls
+ * to enable the performance counters and debug bus will work even if
+ * the hardware was not cleanly reset.
+ */
+static void cell_global_stop_spu(void)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		subfunc = 3;	/*
+				 * 2 - activate SPU tracing,
+				 * 3 - deactivate
+				 */
+		lfsr_value = 0x8f100000;
+
+		rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL,
+				      subfunc, cbe_cpu_to_node(cpu),
+				      lfsr_value);
+
+		if (unlikely(rtn_value != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+		}
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
-	/* This routine will be called once for the system.
+	/*
+	 * This routine will be called once for the system.
 	 * There is one performance monitor per node, so we
 	 * only need to perform this function once per node.
 	 */
@@ -687,8 +1098,16 @@ static void cell_global_stop(void)
 	}
 }
 
-static void
-cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset)
+		cell_global_stop_spu();
+	else
+		cell_global_stop_ppu();
+}
+
+static void cell_handle_interrupt(struct pt_regs *regs,
+				struct op_counter_config *ctr)
 {
 	u32 cpu;
 	u64 pc;
@@ -699,13 +1118,15 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 
 	cpu = smp_processor_id();
 
-	/* Need to make sure the interrupt handler and the virt counter
+	/*
+	 * Need to make sure the interrupt handler and the virt counter
 	 * routine are not running at the same time. See the
 	 * cell_virtual_cntr() routine for additional comments.
 	 */
 	spin_lock_irqsave(&virt_cntr_lock, flags);
 
-	/* Need to disable and reenable the performance counters
+	/*
+	 * Need to disable and reenable the performance counters
 	 * to get the desired behavior from the hardware.  This
 	 * is hardware specific.
 	 */
@@ -714,7 +1135,8 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 
 	interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
 
-	/* If the interrupt mask has been cleared, then the virt cntr
+	/*
+	 * If the interrupt mask has been cleared, then the virt cntr
 	 * has cleared the interrupt.  When the thread that generated
 	 * the interrupt is restored, the data count will be restored to
 	 * 0xffffff0 to cause the interrupt to be regenerated.
@@ -732,18 +1154,20 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 			}
 		}
 
-		/* The counters were frozen by the interrupt.
+		/*
+		 * The counters were frozen by the interrupt.
 		 * Reenable the interrupt and restart the counters.
 		 * If there was a race between the interrupt handler and
-		 * the virtual counter routine.  The virutal counter
+		 * the virtual counter routine.	 The virtual counter
 		 * routine may have cleared the interrupts.  Hence must
 		 * use the virt_cntr_inter_mask to re-enable the interrupts.
 		 */
 		cbe_enable_pm_interrupts(cpu, hdw_thread,
 					 virt_cntr_inter_mask);
 
-		/* The writes to the various performance counters only writes
-		 * to a latch.  The new values (interrupt setting bits, reset
+		/*
+		 * The writes to the various performance counters only writes
+		 * to a latch.	The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
 		 * this to work as desired, the permormance monitor needs to
@@ -755,10 +1179,33 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/*
+ * This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
diff --git a/arch/powerpc/oprofile/op_model_fsl_booke.c b/arch/powerpc/oprofile/op_model_fsl_booke.c
index 2267eb8c661..183a28bb181 100644
--- a/arch/powerpc/oprofile/op_model_fsl_booke.c
+++ b/arch/powerpc/oprofile/op_model_fsl_booke.c
@@ -244,7 +244,7 @@ static void dump_pmcs(void)
 			mfpmr(PMRN_PMLCA3), mfpmr(PMRN_PMLCB3));
 }
 
-static void fsl_booke_cpu_setup(struct op_counter_config *ctr)
+static int fsl_booke_cpu_setup(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -258,9 +258,11 @@ static void fsl_booke_cpu_setup(struct op_counter_config *ctr)
 
 		set_pmc_user_kernel(i, ctr[i].user, ctr[i].kernel);
 	}
+
+	return 0;
 }
 
-static void fsl_booke_reg_setup(struct op_counter_config *ctr,
+static int fsl_booke_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -276,9 +278,10 @@ static void fsl_booke_reg_setup(struct op_counter_config *ctr,
 	for (i = 0; i < num_counters; ++i)
 		reset_value[i] = 0x80000000UL - ctr[i].count;
 
+	return 0;
 }
 
-static void fsl_booke_start(struct op_counter_config *ctr)
+static int fsl_booke_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -308,6 +311,8 @@ static void fsl_booke_start(struct op_counter_config *ctr)
 
 	pr_debug("start on cpu %d, pmgc0 %x\n", smp_processor_id(),
 			mfpmr(PMRN_PMGC0));
+
+	return 0;
 }
 
 static void fsl_booke_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_pa6t.c b/arch/powerpc/oprofile/op_model_pa6t.c
index e8a56b0adad..c40de461fd4 100644
--- a/arch/powerpc/oprofile/op_model_pa6t.c
+++ b/arch/powerpc/oprofile/op_model_pa6t.c
@@ -89,7 +89,7 @@ static inline void ctr_write(unsigned int i, u64 val)
 
 
 /* precompute the values to stuff in the hardware registers */
-static void pa6t_reg_setup(struct op_counter_config *ctr,
+static int pa6t_reg_setup(struct op_counter_config *ctr,
 			   struct op_system_config *sys,
 			   int num_ctrs)
 {
@@ -135,10 +135,12 @@ static void pa6t_reg_setup(struct op_counter_config *ctr,
 		pr_debug("reset_value for pmc%u inited to 0x%lx\n",
 				 pmc, reset_value[pmc]);
 	}
+
+	return 0;
 }
 
 /* configure registers on this cpu */
-static void pa6t_cpu_setup(struct op_counter_config *ctr)
+static int pa6t_cpu_setup(struct op_counter_config *ctr)
 {
 	u64 mmcr0 = mmcr0_val;
 	u64 mmcr1 = mmcr1_val;
@@ -154,9 +156,11 @@ static void pa6t_cpu_setup(struct op_counter_config *ctr)
 		mfspr(SPRN_PA6T_MMCR0));
 	pr_debug("setup on cpu %d, mmcr1 %016lx\n", smp_processor_id(),
 		mfspr(SPRN_PA6T_MMCR1));
+
+	return 0;
 }
 
-static void pa6t_start(struct op_counter_config *ctr)
+static int pa6t_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -174,6 +178,8 @@ static void pa6t_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 
 	pr_debug("start on cpu %d, mmcr0 %lx\n", smp_processor_id(), mmcr0);
+
+	return 0;
 }
 
 static void pa6t_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c
index a7c206b665a..cddc250a6a5 100644
--- a/arch/powerpc/oprofile/op_model_power4.c
+++ b/arch/powerpc/oprofile/op_model_power4.c
@@ -32,7 +32,7 @@ static u32 mmcr0_val;
 static u64 mmcr1_val;
 static u64 mmcra_val;
 
-static void power4_reg_setup(struct op_counter_config *ctr,
+static int power4_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -60,6 +60,8 @@ static void power4_reg_setup(struct op_counter_config *ctr,
 		mmcr0_val &= ~MMCR0_PROBLEM_DISABLE;
 	else
 		mmcr0_val |= MMCR0_PROBLEM_DISABLE;
+
+	return 0;
 }
 
 extern void ppc64_enable_pmcs(void);
@@ -84,7 +86,7 @@ static inline int mmcra_must_set_sample(void)
 	return 0;
 }
 
-static void power4_cpu_setup(struct op_counter_config *ctr)
+static int power4_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0 = mmcr0_val;
 	unsigned long mmcra = mmcra_val;
@@ -111,9 +113,11 @@ static void power4_cpu_setup(struct op_counter_config *ctr)
 	    mfspr(SPRN_MMCR1));
 	dbg("setup on cpu %d, mmcra %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCRA));
+
+	return 0;
 }
 
-static void power4_start(struct op_counter_config *ctr)
+static int power4_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -148,6 +152,7 @@ static void power4_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void power4_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_rs64.c b/arch/powerpc/oprofile/op_model_rs64.c
index c731acbfb2a..a20afe45d93 100644
--- a/arch/powerpc/oprofile/op_model_rs64.c
+++ b/arch/powerpc/oprofile/op_model_rs64.c
@@ -88,7 +88,7 @@ static unsigned long reset_value[OP_MAX_COUNTER];
 
 static int num_counters;
 
-static void rs64_reg_setup(struct op_counter_config *ctr,
+static int rs64_reg_setup(struct op_counter_config *ctr,
 			   struct op_system_config *sys,
 			   int num_ctrs)
 {
@@ -100,9 +100,10 @@ static void rs64_reg_setup(struct op_counter_config *ctr,
 		reset_value[i] = 0x80000000UL - ctr[i].count;
 
 	/* XXX setup user and kernel profiling */
+	return 0;
 }
 
-static void rs64_cpu_setup(struct op_counter_config *ctr)
+static int rs64_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0;
 
@@ -125,9 +126,11 @@ static void rs64_cpu_setup(struct op_counter_config *ctr)
 	    mfspr(SPRN_MMCR0));
 	dbg("setup on cpu %d, mmcr1 %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCR1));
+
+	return 0;
 }
 
-static void rs64_start(struct op_counter_config *ctr)
+static int rs64_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -155,6 +158,7 @@ static void rs64_start(struct op_counter_config *ctr)
 	mtspr(SPRN_MMCR0, mmcr0);
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void rs64_stop(void)
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index a7efb999d65..6694f86d700 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/atomic.h>
 #include <asm/spu.h>
@@ -81,6 +82,8 @@ void destroy_spu_context(struct kref *kref)
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	BUG_ON(!list_empty(&ctx->rq));
 	atomic_dec(&nr_spu_contexts);
 	kfree(ctx);
@@ -185,3 +188,20 @@ void spu_release_saved(struct spu_context *ctx)
 
 	spu_release(ctx);
 }
+
+void spu_set_profile_private_kref(struct spu_context *ctx,
+				  struct kref *prof_info_kref,
+				  void ( * prof_info_release) (struct kref *kref))
+{
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private_kref);
+
+void *spu_get_profile_private_kref(struct spu_context *ctx)
+{
+	return ctx->prof_priv_kref;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private_kref);
+
+
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 88ec333e90d..44e2338a05d 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -274,6 +274,7 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu_associate_mm(spu, ctx->owner);
 	spu->ibox_callback = spufs_ibox_callback;
 	spu->wbox_callback = spufs_wbox_callback;
@@ -456,6 +457,7 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 	spu->dma_callback = NULL;
 	spu_associate_mm(spu, NULL);
 	spu->pid = 0;
+	spu->tgid = 0;
 	ctx->ops = &spu_backing_ops;
 	spu->flags = 0;
 	spu->ctx = NULL;
@@ -737,7 +739,7 @@ void spu_deactivate(struct spu_context *ctx)
 }
 
 /**
- * spu_yield -  yield a physical spu if others are waiting
+ * spu_yield -	yield a physical spu if others are waiting
  * @ctx:	spu context to yield
  *
  * Check if there is a higher priority context waiting and if yes
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 692dbd0edc3..8b20c0c1556 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -85,6 +85,8 @@ struct spu_context {
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	struct kref *prof_priv_kref;
+	void ( * prof_priv_release) (struct kref *kref);
 
 	/* owner thread */
 	pid_t tid;
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index edd6de99572..8134c7e198a 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -26,8 +26,9 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
 #include <linux/sched.h>
- 
+
 #include "oprofile_stats.h"
 #include "event_buffer.h"
 #include "cpu_buffer.h"
diff --git a/drivers/oprofile/event_buffer.h b/drivers/oprofile/event_buffer.h
index 9b6a4ebd03e..5076ed1ebd8 100644
--- a/drivers/oprofile/event_buffer.h
+++ b/drivers/oprofile/event_buffer.h
@@ -19,28 +19,10 @@ void free_event_buffer(void);
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern const struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index e5162a64018..2c645170f06 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -53,9 +53,24 @@ int oprofile_setup(void)
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+		case 0:
+			goto post_sync;
+		case 1:
+			goto do_generic;
+		default:
+			/* hand the failure to out3 via err, like sync_start() */
+			err = sync_ret < 0 ? sync_ret : -1;
+			goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +133,20 @@ out:
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+	if (oprofile_ops.sync_stop) {
+		int sync_ret = oprofile_ops.sync_stop();
+		switch (sync_ret) {
+		case 0:
+			goto post_sync;
+		case 1:
+			goto do_generic;
+		default:
+			goto post_sync;
+		}
+	}
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
diff --git a/include/asm-powerpc/oprofile_impl.h b/include/asm-powerpc/oprofile_impl.h
index 8d6b47f7b30..938fefb4c4b 100644
--- a/include/asm-powerpc/oprofile_impl.h
+++ b/include/asm-powerpc/oprofile_impl.h
@@ -39,14 +39,16 @@ struct op_system_config {
 
 /* Per-arch configuration */
 struct op_powerpc_model {
-	void (*reg_setup) (struct op_counter_config *,
+	int (*reg_setup) (struct op_counter_config *,
 			   struct op_system_config *,
 			   int num_counters);
-	void (*cpu_setup) (struct op_counter_config *);
-	void (*start) (struct op_counter_config *);
-        void (*global_start) (struct op_counter_config *);
+	int  (*cpu_setup) (struct op_counter_config *);
+	int  (*start) (struct op_counter_config *);
+	int  (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 24f352da286..a0f7fc8e23b 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -138,6 +138,7 @@ struct spu {
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int class_0_pending;
 	spinlock_t register_lock;
 
@@ -217,6 +218,20 @@ extern void spu_associate_mm(struct spu *spu, struct mm_struct *mm);
 struct mm_struct;
 extern void spu_flush_all_slbs(struct mm_struct *mm);
 
+/* This interface allows a profiler (e.g., OProfile) to store a ref
+ * to spu context information that it creates.	This caching technique
+ * avoids the need to recreate this information after a save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count on
+ * prof_info_kref; destroy_spu_context then calls kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private_kref(struct spu_context *ctx,
+				  struct kref *prof_info_kref,
+				  void ( * prof_info_release) (struct kref *kref));
+
+void *spu_get_profile_private_kref(struct spu_context *ctx);
+
 /* system callbacks from the SPU */
 struct spu_syscall_block {
 	u64 nr_ret;
diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h
index 0fe7cdf326f..98c69ab80c8 100644
--- a/include/linux/dcookies.h
+++ b/include/linux/dcookies.h
@@ -12,6 +12,7 @@
 
 #ifdef CONFIG_PROFILING
  
+#include <linux/dcache.h>
 #include <linux/types.h>
  
 struct dcookie_user;
diff --git a/include/linux/elf-em.h b/include/linux/elf-em.h
index 0311bad838b..5834e843a94 100644
--- a/include/linux/elf-em.h
+++ b/include/linux/elf-em.h
@@ -20,7 +20,8 @@
 #define EM_PARISC	15	/* HPPA */
 #define EM_SPARC32PLUS	18	/* Sun's "v8plus" */
 #define EM_PPC		20	/* PowerPC */
-#define EM_PPC64	21       /* PowerPC64 */
+#define EM_PPC64	21	 /* PowerPC64 */
+#define EM_SPU		23	/* Cell BE SPU */
 #define EM_SH		42	/* SuperH */
 #define EM_SPARCV9	43	/* SPARC v9 64-bit */
 #define EM_IA_64	50	/* HP/Intel IA-64 */
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 0d514b25245..041bb31100f 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -17,6 +17,26 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.
+ */
+#define ESCAPE_CODE			~0UL
+#define CTX_SWITCH_CODE			1
+#define CPU_SWITCH_CODE			2
+#define COOKIE_SWITCH_CODE		3
+#define KERNEL_ENTER_SWITCH_CODE	4
+#define KERNEL_EXIT_SWITCH_CODE		5
+#define MODULE_LOADED_CODE		6
+#define CTX_TGID_CODE			7
+#define TRACE_BEGIN_CODE		8
+#define TRACE_END_CODE			9
+#define XEN_ENTER_SWITCH_CODE		10
+#define SPU_PROFILING_CODE		11
+#define SPU_CTX_SWITCH_CODE		12
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +55,14 @@ struct oprofile_operations {
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -55,6 +83,13 @@ int oprofile_arch_init(struct oprofile_operations * ops);
  */
 void oprofile_arch_exit(void);
 
+/**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
 /**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
-- 
cgit v1.2.3-70-g09d2


From 486acd4850dde6d2f8c7f431432f3914c4bfb5f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 20 Jul 2007 21:39:54 +0200
Subject: [CELL] spufs: rework list management and associated locking

This sorts out the various lists and related locks in the spu code.

In detail:

 - the per-node free_spus and active_list are gone.  Instead struct spu
   gained an alloc_state member telling whether the spu is free or not
 - the per-node spus array is now locked by a per-node mutex, which
   takes over from the global spu_lock and the per-node active_mutex
 - the spu_alloc* and spu_free functions are gone as the state change is
   now done inline in the spufs code.  This allows some more sharing of
   code for the affinity vs normal case and more efficient locking
 - some little refactoring in the affinity code for this locking scheme

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_base.c    |  72 ++---------
 arch/powerpc/platforms/cell/spufs/sched.c | 198 +++++++++++++++---------------
 include/asm-powerpc/spu.h                 |  11 +-
 3 files changed, 112 insertions(+), 169 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 8617b507af4..90124228b8f 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -409,7 +409,7 @@ static void spu_free_irqs(struct spu *spu)
 		free_irq(spu->irqs[2], spu);
 }
 
-static void spu_init_channels(struct spu *spu)
+void spu_init_channels(struct spu *spu)
 {
 	static const struct {
 		 unsigned channel;
@@ -442,66 +442,7 @@ static void spu_init_channels(struct spu *spu)
 		out_be64(&priv2->spu_chnlcnt_RW, count_list[i].count);
 	}
 }
-
-struct spu *spu_alloc_spu(struct spu *req_spu)
-{
-	struct spu *spu, *ret = NULL;
-
-	spin_lock(&spu_lock);
-	list_for_each_entry(spu, &cbe_spu_info[req_spu->node].free_spus, list) {
-		if (spu == req_spu) {
-			list_del_init(&spu->list);
-			pr_debug("Got SPU %d %d\n", spu->number, spu->node);
-			spu_init_channels(spu);
-			ret = spu;
-			break;
-		}
-	}
-	spin_unlock(&spu_lock);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(spu_alloc_spu);
-
-struct spu *spu_alloc_node(int node)
-{
-	struct spu *spu = NULL;
-
-	spin_lock(&spu_lock);
-	if (!list_empty(&cbe_spu_info[node].free_spus)) {
-		spu = list_entry(cbe_spu_info[node].free_spus.next, struct spu,
-									list);
-		list_del_init(&spu->list);
-		pr_debug("Got SPU %d %d\n", spu->number, spu->node);
-	}
-	spin_unlock(&spu_lock);
-
-	if (spu)
-		spu_init_channels(spu);
-	return spu;
-}
-EXPORT_SYMBOL_GPL(spu_alloc_node);
-
-struct spu *spu_alloc(void)
-{
-	struct spu *spu = NULL;
-	int node;
-
-	for (node = 0; node < MAX_NUMNODES; node++) {
-		spu = spu_alloc_node(node);
-		if (spu)
-			break;
-	}
-
-	return spu;
-}
-
-void spu_free(struct spu *spu)
-{
-	spin_lock(&spu_lock);
-	list_add_tail(&spu->list, &cbe_spu_info[spu->node].free_spus);
-	spin_unlock(&spu_lock);
-}
-EXPORT_SYMBOL_GPL(spu_free);
+EXPORT_SYMBOL_GPL(spu_init_channels);
 
 static int spu_shutdown(struct sys_device *sysdev)
 {
@@ -597,6 +538,8 @@ static int __init create_spu(void *data)
 	if (!spu)
 		goto out;
 
+	spu->alloc_state = SPU_FREE;
+
 	spin_lock_init(&spu->register_lock);
 	spin_lock(&spu_lock);
 	spu->number = number++;
@@ -617,11 +560,10 @@ static int __init create_spu(void *data)
 	if (ret)
 		goto out_free_irqs;
 
-	spin_lock(&spu_lock);
-	list_add(&spu->list, &cbe_spu_info[spu->node].free_spus);
+	mutex_lock(&cbe_spu_info[spu->node].list_mutex);
 	list_add(&spu->cbe_list, &cbe_spu_info[spu->node].spus);
 	cbe_spu_info[spu->node].n_spus++;
-	spin_unlock(&spu_lock);
+	mutex_unlock(&cbe_spu_info[spu->node].list_mutex);
 
 	mutex_lock(&spu_full_list_mutex);
 	spin_lock_irqsave(&spu_full_list_lock, flags);
@@ -831,8 +773,8 @@ static int __init init_spu_base(void)
 	int i, ret = 0;
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
+		mutex_init(&cbe_spu_info[i].list_mutex);
 		INIT_LIST_HEAD(&cbe_spu_info[i].spus);
-		INIT_LIST_HEAD(&cbe_spu_info[i].free_spus);
 	}
 
 	if (!spu_management_ops)
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 44e2338a05d..227968b4779 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -51,9 +51,6 @@ struct spu_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_PRIO);
 	struct list_head runq[MAX_PRIO];
 	spinlock_t runq_lock;
-	struct list_head active_list[MAX_NUMNODES];
-	struct mutex active_mutex[MAX_NUMNODES];
-	int nr_active[MAX_NUMNODES];
 	int nr_waiting;
 };
 
@@ -127,7 +124,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	ctx->policy = current->policy;
 
 	/*
-	 * A lot of places that don't hold active_mutex poke into
+	 * A lot of places that don't hold list_mutex poke into
 	 * cpus_allowed, including grab_runnable_context which
 	 * already holds the runq_lock.  So abuse runq_lock
 	 * to protect this field aswell.
@@ -141,9 +138,9 @@ void spu_update_sched_info(struct spu_context *ctx)
 {
 	int node = ctx->spu->node;
 
-	mutex_lock(&spu_prio->active_mutex[node]);
+	mutex_lock(&cbe_spu_info[node].list_mutex);
 	__spu_update_sched_info(ctx);
-	mutex_unlock(&spu_prio->active_mutex[node]);
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
 }
 
 static int __node_allowed(struct spu_context *ctx, int node)
@@ -169,39 +166,6 @@ static int node_allowed(struct spu_context *ctx, int node)
 	return rval;
 }
 
-/**
- * spu_add_to_active_list - add spu to active list
- * @spu:	spu to add to the active list
- */
-static void spu_add_to_active_list(struct spu *spu)
-{
-	int node = spu->node;
-
-	mutex_lock(&spu_prio->active_mutex[node]);
-	spu_prio->nr_active[node]++;
-	list_add_tail(&spu->list, &spu_prio->active_list[node]);
-	mutex_unlock(&spu_prio->active_mutex[node]);
-}
-
-static void __spu_remove_from_active_list(struct spu *spu)
-{
-	list_del_init(&spu->list);
-	spu_prio->nr_active[spu->node]--;
-}
-
-/**
- * spu_remove_from_active_list - remove spu from active list
- * @spu:       spu to remove from the active list
- */
-static void spu_remove_from_active_list(struct spu *spu)
-{
-	int node = spu->node;
-
-	mutex_lock(&spu_prio->active_mutex[node]);
-	__spu_remove_from_active_list(spu);
-	mutex_unlock(&spu_prio->active_mutex[node]);
-}
-
 static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);
 
 void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
@@ -222,15 +186,18 @@ static void notify_spus_active(void)
 	 */
 	for_each_online_node(node) {
 		struct spu *spu;
-		mutex_lock(&spu_prio->active_mutex[node]);
-		list_for_each_entry(spu, &spu_prio->active_list[node], list) {
-			struct spu_context *ctx = spu->ctx;
-			set_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags);
-			mb();	/* make sure any tasks woken up below */
-				/* can see the bit(s) set above */
-			wake_up_all(&ctx->stop_wq);
+
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if (spu->alloc_state != SPU_FREE) {
+				struct spu_context *ctx = spu->ctx;
+				set_bit(SPU_SCHED_NOTIFY_ACTIVE,
+					&ctx->sched_flags);
+				mb();
+				wake_up_all(&ctx->stop_wq);
+			}
 		}
-		mutex_unlock(&spu_prio->active_mutex[node]);
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
 }
 
@@ -293,10 +260,12 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 }
 
 /*
- * XXX(hch): needs locking.
+ * Must be used with the list_mutex held.
  */
 static inline int sched_spu(struct spu *spu)
 {
+	BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex));
+
 	return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
 }
 
@@ -349,11 +318,15 @@ static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
 		node = (node < MAX_NUMNODES) ? node : 0;
 		if (!node_allowed(ctx, node))
 			continue;
+		mutex_lock(&cbe_spu_info[node].list_mutex);
 		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			if ((!mem_aff || spu->has_mem_affinity) &&
-							sched_spu(spu))
+							sched_spu(spu)) {
+				mutex_unlock(&cbe_spu_info[node].list_mutex);
 				return spu;
+			}
 		}
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
 	return NULL;
 }
@@ -381,13 +354,14 @@ static void aff_set_ref_point_location(struct spu_gang *gang)
 	gang->aff_ref_spu = aff_ref_location(ctx, mem_aff, gs, lowest_offset);
 }
 
-static struct spu *ctx_location(struct spu *ref, int offset)
+static struct spu *ctx_location(struct spu *ref, int offset, int node)
 {
 	struct spu *spu;
 
 	spu = NULL;
 	if (offset >= 0) {
 		list_for_each_entry(spu, ref->aff_list.prev, aff_list) {
+			BUG_ON(spu->node != node);
 			if (offset == 0)
 				break;
 			if (sched_spu(spu))
@@ -395,12 +369,14 @@ static struct spu *ctx_location(struct spu *ref, int offset)
 		}
 	} else {
 		list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) {
+			BUG_ON(spu->node != node);
 			if (offset == 0)
 				break;
 			if (sched_spu(spu))
 				offset++;
 		}
 	}
+
 	return spu;
 }
 
@@ -408,13 +384,13 @@ static struct spu *ctx_location(struct spu *ref, int offset)
  * affinity_check is called each time a context is going to be scheduled.
  * It returns the spu ptr on which the context must run.
  */
-struct spu *affinity_check(struct spu_context *ctx)
+static int has_affinity(struct spu_context *ctx)
 {
-	struct spu_gang *gang;
+	struct spu_gang *gang = ctx->gang;
 
 	if (list_empty(&ctx->aff_list))
-		return NULL;
-	gang = ctx->gang;
+		return 0;
+
 	mutex_lock(&gang->aff_mutex);
 	if (!gang->aff_ref_spu) {
 		if (!(gang->aff_flags & AFF_MERGED))
@@ -424,9 +400,8 @@ struct spu *affinity_check(struct spu_context *ctx)
 		aff_set_ref_point_location(gang);
 	}
 	mutex_unlock(&gang->aff_mutex);
-	if (!gang->aff_ref_spu)
-		return NULL;
-	return ctx_location(gang->aff_ref_spu, ctx->aff_offset);
+
+	return gang->aff_ref_spu != NULL;
 }
 
 /**
@@ -535,22 +510,41 @@ static void spu_prio_wait(struct spu_context *ctx)
 
 static struct spu *spu_get_idle(struct spu_context *ctx)
 {
-	struct spu *spu = NULL;
-	int node = cpu_to_node(raw_smp_processor_id());
-	int n;
+	struct spu *spu;
+	int node, n;
+
+	if (has_affinity(ctx)) {
+		node = ctx->gang->aff_ref_spu->node;
 
-	spu = affinity_check(ctx);
-	if (spu)
-		return spu_alloc_spu(spu);
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		spu = ctx_location(ctx->gang->aff_ref_spu, ctx->aff_offset, node);
+		if (spu && spu->alloc_state == SPU_FREE)
+			goto found;
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
+		return NULL;
+	}
 
+	node = cpu_to_node(raw_smp_processor_id());
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
 		node = (node < MAX_NUMNODES) ? node : 0;
 		if (!node_allowed(ctx, node))
 			continue;
-		spu = spu_alloc_node(node);
-		if (spu)
-			break;
+
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
+			if (spu->alloc_state == SPU_FREE)
+				goto found;
+		}
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
+
+	return NULL;
+
+ found:
+	spu->alloc_state = SPU_USED;
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
+	pr_debug("Got SPU %d %d\n", spu->number, spu->node);
+	spu_init_channels(spu);
 	return spu;
 }
 
@@ -580,15 +574,15 @@ static struct spu *find_victim(struct spu_context *ctx)
 		if (!node_allowed(ctx, node))
 			continue;
 
-		mutex_lock(&spu_prio->active_mutex[node]);
-		list_for_each_entry(spu, &spu_prio->active_list[node], list) {
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			struct spu_context *tmp = spu->ctx;
 
 			if (tmp->prio > ctx->prio &&
 			    (!victim || tmp->prio > victim->prio))
 				victim = spu->ctx;
 		}
-		mutex_unlock(&spu_prio->active_mutex[node]);
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 
 		if (victim) {
 			/*
@@ -613,7 +607,11 @@ static struct spu *find_victim(struct spu_context *ctx)
 				victim = NULL;
 				goto restart;
 			}
-			spu_remove_from_active_list(spu);
+
+			mutex_lock(&cbe_spu_info[node].list_mutex);
+			cbe_spu_info[node].nr_active--;
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
+
 			spu_unbind_context(spu, victim);
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
@@ -662,8 +660,12 @@ int spu_activate(struct spu_context *ctx, unsigned long flags)
 		if (!spu && rt_prio(ctx->prio))
 			spu = find_victim(ctx);
 		if (spu) {
+			int node = spu->node;
+
+			mutex_lock(&cbe_spu_info[node].list_mutex);
 			spu_bind_context(spu, ctx);
-			spu_add_to_active_list(spu);
+			cbe_spu_info[node].nr_active++;
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
 			return 0;
 		}
 
@@ -712,11 +714,17 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
 	if (spu) {
 		new = grab_runnable_context(max_prio, spu->node);
 		if (new || force) {
-			spu_remove_from_active_list(spu);
+			int node = spu->node;
+
+			mutex_lock(&cbe_spu_info[node].list_mutex);
 			spu_unbind_context(spu, ctx);
+			spu->alloc_state = SPU_FREE;
+			cbe_spu_info[node].nr_active--;
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
+
 			ctx->stats.vol_ctx_switch++;
 			spu->stats.vol_ctx_switch++;
-			spu_free(spu);
+
 			if (new)
 				wake_up(&new->stop_wq);
 		}
@@ -755,7 +763,7 @@ void spu_yield(struct spu_context *ctx)
 	}
 }
 
-static void spusched_tick(struct spu_context *ctx)
+static noinline void spusched_tick(struct spu_context *ctx)
 {
 	if (ctx->flags & SPU_CREATE_NOSCHED)
 		return;
@@ -766,7 +774,7 @@ static void spusched_tick(struct spu_context *ctx)
 		return;
 
 	/*
-	 * Unfortunately active_mutex ranks outside of state_mutex, so
+	 * Unfortunately list_mutex ranks outside of state_mutex, so
 	 * we have to trylock here.  If we fail give the context another
 	 * tick and try again.
 	 */
@@ -776,12 +784,11 @@ static void spusched_tick(struct spu_context *ctx)
 
 		new = grab_runnable_context(ctx->prio + 1, spu->node);
 		if (new) {
-
-			__spu_remove_from_active_list(spu);
 			spu_unbind_context(spu, ctx);
 			ctx->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
-			spu_free(spu);
+			spu->alloc_state = SPU_FREE;
+			cbe_spu_info[spu->node].nr_active--;
 			wake_up(&new->stop_wq);
 			/*
 			 * We need to break out of the wait loop in
@@ -802,7 +809,7 @@ static void spusched_tick(struct spu_context *ctx)
  *
  * Return the number of tasks currently running or waiting to run.
  *
- * Note that we don't take runq_lock / active_mutex here.  Reading
+ * Note that we don't take runq_lock / list_mutex here.  Reading
  * a single 32bit value is atomic on powerpc, and we don't care
  * about memory ordering issues here.
  */
@@ -811,7 +818,7 @@ static unsigned long count_active_contexts(void)
 	int nr_active = 0, node;
 
 	for (node = 0; node < MAX_NUMNODES; node++)
-		nr_active += spu_prio->nr_active[node];
+		nr_active += cbe_spu_info[node].nr_active;
 	nr_active += spu_prio->nr_waiting;
 
 	return nr_active;
@@ -851,19 +858,18 @@ static void spusched_wake(unsigned long data)
 
 static int spusched_thread(void *unused)
 {
-	struct spu *spu, *next;
+	struct spu *spu;
 	int node;
 
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
 		for (node = 0; node < MAX_NUMNODES; node++) {
-			mutex_lock(&spu_prio->active_mutex[node]);
-			list_for_each_entry_safe(spu, next,
-						 &spu_prio->active_list[node],
-						 list)
-				spusched_tick(spu->ctx);
-			mutex_unlock(&spu_prio->active_mutex[node]);
+			mutex_lock(&cbe_spu_info[node].list_mutex);
+			list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
+				if (spu->ctx)
+					spusched_tick(spu->ctx);
+			mutex_unlock(&cbe_spu_info[node].list_mutex);
 		}
 	}
 
@@ -922,8 +928,8 @@ int __init spu_sched_init(void)
 		__clear_bit(i, spu_prio->bitmap);
 	}
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		mutex_init(&spu_prio->active_mutex[i]);
-		INIT_LIST_HEAD(&spu_prio->active_list[i]);
+		mutex_init(&cbe_spu_info[i].list_mutex);
+		INIT_LIST_HEAD(&cbe_spu_info[i].spus);
 	}
 	spin_lock_init(&spu_prio->runq_lock);
 
@@ -954,7 +960,7 @@ int __init spu_sched_init(void)
 
 void spu_sched_exit(void)
 {
-	struct spu *spu, *tmp;
+	struct spu *spu;
 	int node;
 
 	remove_proc_entry("spu_loadavg", NULL);
@@ -963,13 +969,11 @@ void spu_sched_exit(void)
 	kthread_stop(spusched_task);
 
 	for (node = 0; node < MAX_NUMNODES; node++) {
-		mutex_lock(&spu_prio->active_mutex[node]);
-		list_for_each_entry_safe(spu, tmp, &spu_prio->active_list[node],
-					 list) {
-			list_del_init(&spu->list);
-			spu_free(spu);
-		}
-		mutex_unlock(&spu_prio->active_mutex[node]);
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
+			if (spu->alloc_state != SPU_FREE)
+				spu->alloc_state = SPU_FREE;
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
 	kfree(spu_prio);
 }
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index a0f7fc8e23b..8836c0f1f2f 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -121,10 +121,9 @@ struct spu {
 	unsigned long problem_phys;
 	struct spu_problem __iomem *problem;
 	struct spu_priv2 __iomem *priv2;
-	struct list_head list;
 	struct list_head cbe_list;
-	struct list_head sched_list;
 	struct list_head full_list;
+	enum { SPU_FREE, SPU_USED } alloc_state;
 	int number;
 	unsigned int irqs[3];
 	u32 node;
@@ -187,18 +186,16 @@ struct spu {
 };
 
 struct cbe_spu_info {
+	struct mutex list_mutex;
 	struct list_head spus;
-	struct list_head free_spus;
 	int n_spus;
+	int nr_active;
 	atomic_t reserved_spus;
 };
 
 extern struct cbe_spu_info cbe_spu_info[];
 
-struct spu *spu_alloc(void);
-struct spu *spu_alloc_node(int node);
-struct spu *spu_alloc_spu(struct spu *spu);
-void spu_free(struct spu *spu);
+void spu_init_channels(struct spu *spu);
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
-- 
cgit v1.2.3-70-g09d2