From 7d26e2d5e2da37e92c6c7644b26b294dedd8c982 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 2 Jul 2009 17:08:30 -0700
Subject: [CPUFREQ] Eliminate the recent lockdep warnings in cpufreq

Commit b14893a62c73af0eca414cfed505b8c09efc613c although it was very
much needed to properly cleanup ondemand timer, opened-up a can of worms
related to locking dependencies in cpufreq.

Patch here defines the need for dbs_mutex and cleans up its usage in
ondemand governor. This also resolves the lockdep warnings reported here

http://lkml.indiana.edu/hypermail/linux/kernel/0906.1/01925.html
http://lkml.indiana.edu/hypermail/linux/kernel/0907.0/00820.html

and few others..

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq.c              |  4 ++--
 drivers/cpufreq/cpufreq_conservative.c | 27 +++++++++++----------------
 drivers/cpufreq/cpufreq_ondemand.c     | 27 +++++++++++----------------
 3 files changed, 24 insertions(+), 34 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6e2ec0b1894..c7fe16e0474 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1070,8 +1070,6 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 #endif
 
-	unlock_policy_rwsem_write(cpu);
-
 	if (cpufreq_driver->target)
 		__cpufreq_governor(data, CPUFREQ_GOV_STOP);
 
@@ -1088,6 +1086,8 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 	if (cpufreq_driver->exit)
 		cpufreq_driver->exit(data);
 
+	unlock_policy_rwsem_write(cpu);
+
 	free_cpumask_var(data->related_cpus);
 	free_cpumask_var(data->cpus);
 	kfree(data);
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 7fc58af748b..58889f26029 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -70,15 +70,10 @@ static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 static unsigned int dbs_enable;	/* number of CPUs using this policy */
 
 /*
- * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
- * lock and dbs_mutex. cpu_hotplug lock should always be held before
- * dbs_mutex. If any function that can potentially take cpu_hotplug lock
- * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
- * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
- * is recursive for the same process. -Venki
- * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
- * would deadlock with cancel_delayed_work_sync(), which is needed for proper
- * raceless workqueue teardown.
+ * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+ * different CPUs. It protects dbs_enable in governor start/stop. It also
+ * serializes governor limit_change with do_dbs_timer. We do not want
+ * do_dbs_timer to run when user is changing the governor or limits.
  */
 static DEFINE_MUTEX(dbs_mutex);
 
@@ -488,18 +483,17 @@ static void do_dbs_timer(struct work_struct *work)
 
 	delay -= jiffies % delay;
 
-	if (lock_policy_rwsem_write(cpu) < 0)
-		return;
+	mutex_lock(&dbs_mutex);
 
 	if (!dbs_info->enable) {
-		unlock_policy_rwsem_write(cpu);
+		mutex_unlock(&dbs_mutex);
 		return;
 	}
 
 	dbs_check_cpu(dbs_info);
 
 	queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay);
-	unlock_policy_rwsem_write(cpu);
+	mutex_unlock(&dbs_mutex);
 }
 
 static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
@@ -590,15 +584,16 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 					&dbs_cpufreq_notifier_block,
 					CPUFREQ_TRANSITION_NOTIFIER);
 		}
-		dbs_timer_init(this_dbs_info);
-
 		mutex_unlock(&dbs_mutex);
 
+		dbs_timer_init(this_dbs_info);
+
 		break;
 
 	case CPUFREQ_GOV_STOP:
-		mutex_lock(&dbs_mutex);
 		dbs_timer_exit(this_dbs_info);
+
+		mutex_lock(&dbs_mutex);
 		sysfs_remove_group(&policy->kobj, &dbs_attr_group);
 		dbs_enable--;
 
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 1911d172935..246ae147df7 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -78,15 +78,10 @@ static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 static unsigned int dbs_enable;	/* number of CPUs using this policy */
 
 /*
- * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
- * lock and dbs_mutex. cpu_hotplug lock should always be held before
- * dbs_mutex. If any function that can potentially take cpu_hotplug lock
- * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
- * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
- * is recursive for the same process. -Venki
- * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
- * would deadlock with cancel_delayed_work_sync(), which is needed for proper
- * raceless workqueue teardown.
+ * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+ * different CPUs. It protects dbs_enable in governor start/stop. It also
+ * serializes governor limit_change with do_dbs_timer. We do not want
+ * do_dbs_timer to run when user is changing the governor or limits.
  */
 static DEFINE_MUTEX(dbs_mutex);
 
@@ -494,11 +489,10 @@ static void do_dbs_timer(struct work_struct *work)
 
 	delay -= jiffies % delay;
 
-	if (lock_policy_rwsem_write(cpu) < 0)
-		return;
+	mutex_lock(&dbs_mutex);
 
 	if (!dbs_info->enable) {
-		unlock_policy_rwsem_write(cpu);
+		mutex_unlock(&dbs_mutex);
 		return;
 	}
 
@@ -517,7 +511,7 @@ static void do_dbs_timer(struct work_struct *work)
 			dbs_info->freq_lo, CPUFREQ_RELATION_H);
 	}
 	queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
-	unlock_policy_rwsem_write(cpu);
+	mutex_unlock(&dbs_mutex);
 }
 
 static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
@@ -598,14 +592,15 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 				max(min_sampling_rate,
 				    latency * LATENCY_MULTIPLIER);
 		}
-		dbs_timer_init(this_dbs_info);
-
 		mutex_unlock(&dbs_mutex);
+
+		dbs_timer_init(this_dbs_info);
 		break;
 
 	case CPUFREQ_GOV_STOP:
-		mutex_lock(&dbs_mutex);
 		dbs_timer_exit(this_dbs_info);
+
+		mutex_lock(&dbs_mutex);
 		sysfs_remove_group(&policy->kobj, &dbs_attr_group);
 		dbs_enable--;
 		mutex_unlock(&dbs_mutex);
-- 
cgit v1.2.3-70-g09d2


From 5a75c82828e7c088ca6e7b4827911dc29cc8e774 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 2 Jul 2009 17:08:32 -0700
Subject: [CPUFREQ] Cleanup locking in ondemand governor

Redesign the locking inside ondemand driver. Make dbs_mutex handle all the
global state changes inside the driver and invent a new percpu mutex
to serialize percpu timer and frequency limit change.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq_ondemand.c | 62 +++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 35 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 246ae147df7..d6ba14276bb 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -70,8 +70,13 @@ struct cpu_dbs_info_s {
 	unsigned int freq_lo_jiffies;
 	unsigned int freq_hi_jiffies;
 	int cpu;
-	unsigned int enable:1,
-		sample_type:1;
+	unsigned int sample_type:1;
+	/*
+	 * percpu mutex that serializes governor limit change with
+	 * do_dbs_timer invocation. We do not want do_dbs_timer to run
+	 * when user is changing the governor or limits.
+	 */
+	struct mutex timer_mutex;
 };
 static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 
@@ -79,9 +84,7 @@ static unsigned int dbs_enable;	/* number of CPUs using this policy */
 
 /*
  * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
- * different CPUs. It protects dbs_enable in governor start/stop. It also
- * serializes governor limit_change with do_dbs_timer. We do not want
- * do_dbs_timer to run when user is changing the governor or limits.
+ * different CPUs. It protects dbs_enable in governor start/stop.
  */
 static DEFINE_MUTEX(dbs_mutex);
 
@@ -187,13 +190,18 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
 	return freq_hi;
 }
 
+static void ondemand_powersave_bias_init_cpu(int cpu)
+{
+	struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
+	dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
+	dbs_info->freq_lo = 0;
+}
+
 static void ondemand_powersave_bias_init(void)
 {
 	int i;
 	for_each_online_cpu(i) {
-		struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i);
-		dbs_info->freq_table = cpufreq_frequency_get_table(i);
-		dbs_info->freq_lo = 0;
+		ondemand_powersave_bias_init_cpu(i);
 	}
 }
 
@@ -235,12 +243,10 @@ static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
 	unsigned int input;
 	int ret;
 	ret = sscanf(buf, "%u", &input);
+	if (ret != 1)
+		return -EINVAL;
 
 	mutex_lock(&dbs_mutex);
-	if (ret != 1) {
-		mutex_unlock(&dbs_mutex);
-		return -EINVAL;
-	}
 	dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
 	mutex_unlock(&dbs_mutex);
 
@@ -254,13 +260,12 @@ static ssize_t store_up_threshold(struct cpufreq_policy *unused,
 	int ret;
 	ret = sscanf(buf, "%u", &input);
 
-	mutex_lock(&dbs_mutex);
 	if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
 			input < MIN_FREQUENCY_UP_THRESHOLD) {
-		mutex_unlock(&dbs_mutex);
 		return -EINVAL;
 	}
 
+	mutex_lock(&dbs_mutex);
 	dbs_tuners_ins.up_threshold = input;
 	mutex_unlock(&dbs_mutex);
 
@@ -358,9 +363,6 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 	struct cpufreq_policy *policy;
 	unsigned int j;
 
-	if (!this_dbs_info->enable)
-		return;
-
 	this_dbs_info->freq_lo = 0;
 	policy = this_dbs_info->cur_policy;
 
@@ -488,13 +490,7 @@ static void do_dbs_timer(struct work_struct *work)
 	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
 
 	delay -= jiffies % delay;
-
-	mutex_lock(&dbs_mutex);
-
-	if (!dbs_info->enable) {
-		mutex_unlock(&dbs_mutex);
-		return;
-	}
+	mutex_lock(&dbs_info->timer_mutex);
 
 	/* Common NORMAL_SAMPLE setup */
 	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
@@ -511,7 +507,7 @@ static void do_dbs_timer(struct work_struct *work)
 			dbs_info->freq_lo, CPUFREQ_RELATION_H);
 	}
 	queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
-	mutex_unlock(&dbs_mutex);
+	mutex_unlock(&dbs_info->timer_mutex);
 }
 
 static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
@@ -520,8 +516,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
 	delay -= jiffies % delay;
 
-	dbs_info->enable = 1;
-	ondemand_powersave_bias_init();
 	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
 	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
 	queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
@@ -530,7 +524,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 
 static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 {
-	dbs_info->enable = 0;
 	cancel_delayed_work_sync(&dbs_info->work);
 }
 
@@ -549,19 +542,15 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		if ((!cpu_online(cpu)) || (!policy->cur))
 			return -EINVAL;
 
-		if (this_dbs_info->enable) /* Already enabled */
-			break;
-
 		mutex_lock(&dbs_mutex);
-		dbs_enable++;
 
 		rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
 		if (rc) {
-			dbs_enable--;
 			mutex_unlock(&dbs_mutex);
 			return rc;
 		}
 
+		dbs_enable++;
 		for_each_cpu(j, policy->cpus) {
 			struct cpu_dbs_info_s *j_dbs_info;
 			j_dbs_info = &per_cpu(cpu_dbs_info, j);
@@ -575,6 +564,8 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 			}
 		}
 		this_dbs_info->cpu = cpu;
+		ondemand_powersave_bias_init_cpu(cpu);
+		mutex_init(&this_dbs_info->timer_mutex);
 		/*
 		 * Start the timerschedule work, when this governor
 		 * is used for first time
@@ -602,20 +593,21 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
 		mutex_lock(&dbs_mutex);
 		sysfs_remove_group(&policy->kobj, &dbs_attr_group);
+		mutex_destroy(&this_dbs_info->timer_mutex);
 		dbs_enable--;
 		mutex_unlock(&dbs_mutex);
 
 		break;
 
 	case CPUFREQ_GOV_LIMITS:
-		mutex_lock(&dbs_mutex);
+		mutex_lock(&this_dbs_info->timer_mutex);
 		if (policy->max < this_dbs_info->cur_policy->cur)
 			__cpufreq_driver_target(this_dbs_info->cur_policy,
 				policy->max, CPUFREQ_RELATION_H);
 		else if (policy->min > this_dbs_info->cur_policy->cur)
 			__cpufreq_driver_target(this_dbs_info->cur_policy,
 				policy->min, CPUFREQ_RELATION_L);
-		mutex_unlock(&dbs_mutex);
+		mutex_unlock(&this_dbs_info->timer_mutex);
 		break;
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From ee88415caf736b89500f16e0a545614541a45005 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 2 Jul 2009 17:08:33 -0700
Subject: [CPUFREQ] Cleanup locking in conservative governor

Redesign the locking inside conservative driver. Make dbs_mutex handle all the
global state changes inside the driver and invent a new percpu mutex
to serialize percpu timer and frequency limit change.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq_conservative.c | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 58889f26029..57490502b21 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -63,7 +63,12 @@ struct cpu_dbs_info_s {
 	unsigned int down_skip;
 	unsigned int requested_freq;
 	int cpu;
-	unsigned int enable:1;
+	/*
+	 * percpu mutex that serializes governor limit change with
+	 * do_dbs_timer invocation. We do not want do_dbs_timer to run
+	 * when user is changing the governor or limits.
+	 */
+	struct mutex timer_mutex;
 };
 static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 
@@ -71,9 +76,7 @@ static unsigned int dbs_enable;	/* number of CPUs using this policy */
 
 /*
  * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
- * different CPUs. It protects dbs_enable in governor start/stop. It also
- * serializes governor limit_change with do_dbs_timer. We do not want
- * do_dbs_timer to run when user is changing the governor or limits.
+ * different CPUs. It protects dbs_enable in governor start/stop.
  */
 static DEFINE_MUTEX(dbs_mutex);
 
@@ -138,9 +141,6 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 
 	struct cpufreq_policy *policy;
 
-	if (!this_dbs_info->enable)
-		return 0;
-
 	policy = this_dbs_info->cur_policy;
 
 	/*
@@ -483,17 +483,12 @@ static void do_dbs_timer(struct work_struct *work)
 
 	delay -= jiffies % delay;
 
-	mutex_lock(&dbs_mutex);
-
-	if (!dbs_info->enable) {
-		mutex_unlock(&dbs_mutex);
-		return;
-	}
+	mutex_lock(&dbs_info->timer_mutex);
 
 	dbs_check_cpu(dbs_info);
 
 	queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay);
-	mutex_unlock(&dbs_mutex);
+	mutex_unlock(&dbs_info->timer_mutex);
 }
 
 static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
@@ -502,7 +497,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
 	delay -= jiffies % delay;
 
-	dbs_info->enable = 1;
 	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
 	queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work,
 				delay);
@@ -510,7 +504,6 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 
 static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 {
-	dbs_info->enable = 0;
 	cancel_delayed_work_sync(&dbs_info->work);
 }
 
@@ -529,9 +522,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		if ((!cpu_online(cpu)) || (!policy->cur))
 			return -EINVAL;
 
-		if (this_dbs_info->enable) /* Already enabled */
-			break;
-
 		mutex_lock(&dbs_mutex);
 
 		rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
@@ -555,6 +545,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		this_dbs_info->down_skip = 0;
 		this_dbs_info->requested_freq = policy->cur;
 
+		mutex_init(&this_dbs_info->timer_mutex);
 		dbs_enable++;
 		/*
 		 * Start the timerschedule work, when this governor
@@ -596,6 +587,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		mutex_lock(&dbs_mutex);
 		sysfs_remove_group(&policy->kobj, &dbs_attr_group);
 		dbs_enable--;
+		mutex_destroy(&this_dbs_info->timer_mutex);
 
 		/*
 		 * Stop the timerschedule work, when this governor
@@ -611,7 +603,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		break;
 
 	case CPUFREQ_GOV_LIMITS:
-		mutex_lock(&dbs_mutex);
+		mutex_lock(&this_dbs_info->timer_mutex);
 		if (policy->max < this_dbs_info->cur_policy->cur)
 			__cpufreq_driver_target(
 					this_dbs_info->cur_policy,
@@ -620,7 +612,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 			__cpufreq_driver_target(
 					this_dbs_info->cur_policy,
 					policy->min, CPUFREQ_RELATION_L);
-		mutex_unlock(&dbs_mutex);
+		mutex_unlock(&this_dbs_info->timer_mutex);
 
 		break;
 	}
-- 
cgit v1.2.3-70-g09d2


From 3f4a782b5ce2698b1870b5a7b573cd721d4fce33 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 3 Jul 2009 11:25:16 -0400
Subject: [CPUFREQ] fix (utter) cpufreq_add_dev mess

OK, I've tried to clean it up the best I could, but please test this with
concurrent cpu hotplug and cpufreq add/remove in loops. I'm sure we will make
other interesting findings.

This is step one of fixing the overall locking dependency mess in cpufreq.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
CC: rjw@sisk.pl
CC: mingo@elte.hu
CC: Shaohua Li <shaohua.li@intel.com>
CC: Pekka Enberg <penberg@cs.helsinki.fi>
CC: Dave Young <hidave.darkstar@gmail.com>
CC: "Rafael J. Wysocki" <rjw@sisk.pl>
CC: Rusty Russell <rusty@rustcorp.com.au>
CC: sven.wegener@stealer.net
CC: cpufreq@vger.kernel.org
CC: Thomas Renninger <trenn@suse.de>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq.c | 65 +++++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 25 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c7fe16e0474..c668ac855f7 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -761,6 +761,10 @@ static struct kobj_type ktype_cpufreq = {
  * cpufreq_add_dev - add a CPU device
  *
  * Adds the cpufreq interface for a CPU device.
+ *
+ * The Oracle says: try running cpufreq registration/unregistration concurrently
+ * with with cpu hotplugging and all hell will break loose. Tried to clean this
+ * mess up, but more thorough testing is needed. - Mathieu
  */
 static int cpufreq_add_dev(struct sys_device *sys_dev)
 {
@@ -804,15 +808,12 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 		goto nomem_out;
 	}
 	if (!alloc_cpumask_var(&policy->cpus, GFP_KERNEL)) {
-		kfree(policy);
 		ret = -ENOMEM;
-		goto nomem_out;
+		goto err_free_policy;
 	}
 	if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) {
-		free_cpumask_var(policy->cpus);
-		kfree(policy);
 		ret = -ENOMEM;
-		goto nomem_out;
+		goto err_free_cpumask;
 	}
 
 	policy->cpu = cpu;
@@ -820,7 +821,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 
 	/* Initially set CPU itself as the policy_cpu */
 	per_cpu(policy_cpu, cpu) = cpu;
-	lock_policy_rwsem_write(cpu);
+	ret = (lock_policy_rwsem_write(cpu) < 0);
+	WARN_ON(ret);
 
 	init_completion(&policy->kobj_unregister);
 	INIT_WORK(&policy->update, handle_update);
@@ -833,7 +835,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 	ret = cpufreq_driver->init(policy);
 	if (ret) {
 		dprintk("initialization failed\n");
-		goto err_out;
+		goto err_unlock_policy;
 	}
 	policy->user_policy.min = policy->min;
 	policy->user_policy.max = policy->max;
@@ -858,15 +860,21 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 		/* Check for existing affected CPUs.
 		 * They may not be aware of it due to CPU Hotplug.
 		 */
-		managed_policy = cpufreq_cpu_get(j);		/* FIXME: Where is this released?  What about error paths? */
+		managed_policy = cpufreq_cpu_get(j);
 		if (unlikely(managed_policy)) {
 
 			/* Set proper policy_cpu */
 			unlock_policy_rwsem_write(cpu);
 			per_cpu(policy_cpu, cpu) = managed_policy->cpu;
 
-			if (lock_policy_rwsem_write(cpu) < 0)
-				goto err_out_driver_exit;
+			if (lock_policy_rwsem_write(cpu) < 0) {
+				/* Should not go through policy unlock path */
+				if (cpufreq_driver->exit)
+					cpufreq_driver->exit(policy);
+				ret = -EBUSY;
+				cpufreq_cpu_put(managed_policy);
+				goto err_free_cpumask;
+			}
 
 			spin_lock_irqsave(&cpufreq_driver_lock, flags);
 			cpumask_copy(managed_policy->cpus, policy->cpus);
@@ -877,12 +885,14 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 			ret = sysfs_create_link(&sys_dev->kobj,
 						&managed_policy->kobj,
 						"cpufreq");
-			if (ret)
-				goto err_out_driver_exit;
-
-			cpufreq_debug_enable_ratelimit();
-			ret = 0;
-			goto err_out_driver_exit; /* call driver->exit() */
+			if (!ret)
+				cpufreq_cpu_put(managed_policy);
+			/*
+			 * Success. We only needed to be added to the mask.
+			 * Call driver->exit() because only the cpu parent of
+			 * the kobj needed to call init().
+			 */
+			goto out_driver_exit; /* call driver->exit() */
 		}
 	}
 #endif
@@ -892,25 +902,25 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj,
 				   "cpufreq");
 	if (ret)
-		goto err_out_driver_exit;
+		goto out_driver_exit;
 
 	/* set up files for this cpu device */
 	drv_attr = cpufreq_driver->attr;
 	while ((drv_attr) && (*drv_attr)) {
 		ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr));
 		if (ret)
-			goto err_out_driver_exit;
+			goto err_out_kobj_put;
 		drv_attr++;
 	}
 	if (cpufreq_driver->get) {
 		ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr);
 		if (ret)
-			goto err_out_driver_exit;
+			goto err_out_kobj_put;
 	}
 	if (cpufreq_driver->target) {
 		ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
 		if (ret)
-			goto err_out_driver_exit;
+			goto err_out_kobj_put;
 	}
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
@@ -928,12 +938,14 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 			continue;
 
 		dprintk("CPU %u already managed, adding link\n", j);
-		cpufreq_cpu_get(cpu);
+		managed_policy = cpufreq_cpu_get(cpu);
 		cpu_sys_dev = get_cpu_sysdev(j);
 		ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj,
 					"cpufreq");
-		if (ret)
+		if (ret) {
+			cpufreq_cpu_put(managed_policy);
 			goto err_out_unregister;
+		}
 	}
 
 	policy->governor = NULL; /* to assure that the starting sequence is
@@ -965,17 +977,20 @@ err_out_unregister:
 		per_cpu(cpufreq_cpu_data, j) = NULL;
 	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
+err_out_kobj_put:
 	kobject_put(&policy->kobj);
 	wait_for_completion(&policy->kobj_unregister);
 
-err_out_driver_exit:
+out_driver_exit:
 	if (cpufreq_driver->exit)
 		cpufreq_driver->exit(policy);
 
-err_out:
+err_unlock_policy:
 	unlock_policy_rwsem_write(cpu);
+err_free_cpumask:
+	free_cpumask_var(policy->cpus);
+err_free_policy:
 	kfree(policy);
-
 nomem_out:
 	module_put(cpufreq_driver->owner);
 module_out:
-- 
cgit v1.2.3-70-g09d2


From 5e1596f75395e7a402e1059c518e633d2732dcf8 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Wed, 8 Jul 2009 16:14:23 -0400
Subject: [CPUFREQ] Fix compile failure in cpufreq.c

managed_policy is out of scope for the non-smp case.
Declare it locally where used (twice)

Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c668ac855f7..b90eda8b344 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -776,9 +776,6 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 	struct sys_device *cpu_sys_dev;
 	unsigned long flags;
 	unsigned int j;
-#ifdef CONFIG_SMP
-	struct cpufreq_policy *managed_policy;
-#endif
 
 	if (cpu_is_offline(cpu))
 		return 0;
@@ -854,6 +851,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 #endif
 
 	for_each_cpu(j, policy->cpus) {
+		struct cpufreq_policy *managed_policy;
+
 		if (cpu == j)
 			continue;
 
@@ -932,6 +931,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 
 	/* symlink affected CPUs */
 	for_each_cpu(j, policy->cpus) {
+		struct cpufreq_policy *managed_policy;
+
 		if (j == cpu)
 			continue;
 		if (!cpu_online(j))
-- 
cgit v1.2.3-70-g09d2


From 26d204afa18f7df177f21bdb3759e0098ca8f7d5 Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Wed, 29 Jul 2009 13:36:10 -0700
Subject: [CPUFREQ] Fix NULL pointer dereference regression in conservative
 governor

Commit ee88415caf736b89500f16e0a545614541a45005
introduced this regression when it removed enable bit in cpu_dbs_info_s.
That added a possibility of dbs_cpufreq_notifier getting called for a
CPU that is not yet managed by conservative governor. That will happen
as the transition notifier is set as soon as one CPU switches to
conservative governor and other CPUs can get a NULL pointer dereference
without the enable bit check. Add the enable bit back again.

Reported-by: Lermytte Christophe <Christophe.Lermytte@thomson.net>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq_conservative.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 57490502b21..bdea7e2f94b 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -63,6 +63,7 @@ struct cpu_dbs_info_s {
 	unsigned int down_skip;
 	unsigned int requested_freq;
 	int cpu;
+	unsigned int enable:1;
 	/*
 	 * percpu mutex that serializes governor limit change with
 	 * do_dbs_timer invocation. We do not want do_dbs_timer to run
@@ -141,6 +142,9 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 
 	struct cpufreq_policy *policy;
 
+	if (!this_dbs_info->enable)
+		return 0;
+
 	policy = this_dbs_info->cur_policy;
 
 	/*
@@ -497,6 +501,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
 	delay -= jiffies % delay;
 
+	dbs_info->enable = 1;
 	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
 	queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work,
 				delay);
@@ -504,6 +509,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
 
 static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 {
+	dbs_info->enable = 0;
 	cancel_delayed_work_sync(&dbs_info->work);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 42c74b84c64633dd3badbfc2abd2ef1728b64b30 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Mon, 3 Aug 2009 10:58:11 -0400
Subject: [CPUFREQ] Do not set policy for offline cpus

Suspend/Resume fails on multi socket, multi core systems because the cpufreq
code erroneously sets the per_cpu policy_cpu value when a logical cpu is
offline.

This most notably results in missing sysfs files that are used to set the
cpu frequencies of the various cpus.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b90eda8b344..120d236c0ff 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -924,6 +924,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
 	for_each_cpu(j, policy->cpus) {
+		if (!cpu_online(j))
+			continue;
 		per_cpu(cpufreq_cpu_data, j) = policy;
 		per_cpu(policy_cpu, j) = policy->cpu;
 	}
-- 
cgit v1.2.3-70-g09d2


From d5194decd0a6f792b2789eebd4ddf022a248f655 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Wed, 29 Jul 2009 11:26:20 +0200
Subject: [CPUFREQ] Fix a kobject reference bug related to managed CPUs

The first offline/online cycle is successful, the second not.
Doing:
echo 0 >cpu1/online
echo 1 >cpu1/online
echo 0 >cpu1/online

The last command will trigger:
Jul 22 14:39:50 linux kernel: [  593.210125] ------------[ cut here ]------------
Jul 22 14:39:50 linux kernel: [  593.210139] WARNING: at lib/kref.c:43 kref_get+0x23/0x2b()
Jul 22 14:39:50 linux kernel: [  593.210144] Hardware name: To Be Filled By O.E.M.
Jul 22 14:39:50 linux kernel: [  593.210148] Modules linked in: powernow_k8
Jul 22 14:39:50 linux kernel: [  593.210158] Pid: 378, comm: kondemand/2 Tainted: G        W  2.6.31-rc2 #38
Jul 22 14:39:50 linux kernel: [  593.210163] Call Trace:
Jul 22 14:39:50 linux kernel: [  593.210171]  [<ffffffff812008e8>] ? kref_get+0x23/0x2b
Jul 22 14:39:50 linux kernel: [  593.210181]  [<ffffffff81041926>] warn_slowpath_common+0x77/0xa4
Jul 22 14:39:50 linux kernel: [  593.210190]  [<ffffffff81041962>] warn_slowpath_null+0xf/0x11
Jul 22 14:39:50 linux kernel: [  593.210198]  [<ffffffff812008e8>] kref_get+0x23/0x2b
Jul 22 14:39:50 linux kernel: [  593.210206]  [<ffffffff811ffa19>] kobject_get+0x1a/0x22
Jul 22 14:39:50 linux kernel: [  593.210214]  [<ffffffff813e815d>] cpufreq_cpu_get+0x8a/0xcb
Jul 22 14:39:50 linux kernel: [  593.210222]  [<ffffffff813e87d1>] __cpufreq_driver_getavg+0x1d/0x67
Jul 22 14:39:50 linux kernel: [  593.210231]  [<ffffffff813ea18f>] do_dbs_timer+0x158/0x27f
Jul 22 14:39:50 linux kernel: [  593.210240]  [<ffffffff810529ea>] worker_thread+0x200/0x313
...

The output continues on every do_dbs_timer ondemand freq checking poll.
This regression was introduced by git commit:
3f4a782b5ce2698b1870b5a7b573cd721d4fce33

The policy is released when the cpufreq device is removed in:
__cpufreq_remove_dev():
	/* if this isn't the CPU which is the parent of the kobj, we
	 * only need to unlink, put and exit
	 */

Not creating the symlink is not sever at all.
As long as:
sysfs_remove_link(&sys_dev->kobj, "cpufreq");
handles it gracefully that the symlink did not exist.
Possibly no error should be returned at all, because ondemand
governor would still provide the same functionality.
Userspace in userspace gov case might be confused if the link
is missing.

Resolves http://bugzilla.kernel.org/show_bug.cgi?id=13903

CC: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 120d236c0ff..bd74a0b1217 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -858,6 +858,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 
 		/* Check for existing affected CPUs.
 		 * They may not be aware of it due to CPU Hotplug.
+		 * cpufreq_cpu_put is called when the device is removed
+		 * in __cpufreq_remove_dev()
 		 */
 		managed_policy = cpufreq_cpu_get(j);
 		if (unlikely(managed_policy)) {
@@ -884,7 +886,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 			ret = sysfs_create_link(&sys_dev->kobj,
 						&managed_policy->kobj,
 						"cpufreq");
-			if (!ret)
+			if (ret)
 				cpufreq_cpu_put(managed_policy);
 			/*
 			 * Success. We only needed to be added to the mask.
-- 
cgit v1.2.3-70-g09d2


From 4bc5d34135039566b8d6efa2de7515b2be505da8 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Tue, 4 Aug 2009 14:03:25 -0400
Subject: [CPUFREQ] Make cpufreq suspend code conditional on powerpc.

The suspend code runs with interrupts disabled, and the powerpc workaround we
do in the cpufreq suspend hook calls the drivers ->get method.

powernow-k8's ->get does an smp_call_function_single
which needs interrupts enabled

cpufreq's suspend/resume code was added in 42d4dc3f4e1e to work around
a hardware problem on ppc powerbooks.  If we make all this code
conditional on powerpc, we avoid the issue above.

Signed-off-by: Dave Jones <davej@redhat.com>
---
 drivers/cpufreq/cpufreq.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'drivers/cpufreq')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index bd74a0b1217..fd69086d08d 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1248,13 +1248,22 @@ EXPORT_SYMBOL(cpufreq_get);
 
 static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg)
 {
-	int cpu = sysdev->id;
 	int ret = 0;
+
+#ifdef __powerpc__
+	int cpu = sysdev->id;
 	unsigned int cur_freq = 0;
 	struct cpufreq_policy *cpu_policy;
 
 	dprintk("suspending cpu %u\n", cpu);
 
+	/*
+	 * This whole bogosity is here because Powerbooks are made of fail.
+	 * No sane platform should need any of the code below to be run.
+	 * (it's entirely the wrong thing to do, as driver->get may
+	 *  reenable interrupts on some architectures).
+	 */
+
 	if (!cpu_online(cpu))
 		return 0;
 
@@ -1313,6 +1322,7 @@ static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg)
 
 out:
 	cpufreq_cpu_put(cpu_policy);
+#endif	/* __powerpc__ */
 	return ret;
 }
 
@@ -1326,12 +1336,18 @@ out:
  */
 static int cpufreq_resume(struct sys_device *sysdev)
 {
-	int cpu = sysdev->id;
 	int ret = 0;
+
+#ifdef __powerpc__
+	int cpu = sysdev->id;
 	struct cpufreq_policy *cpu_policy;
 
 	dprintk("resuming cpu %u\n", cpu);
 
+	/* As with the ->suspend method, all the code below is
+	 * only necessary because Powerbooks suck.
+	 * See commit 42d4dc3f4e1e for jokes. */
+
 	if (!cpu_online(cpu))
 		return 0;
 
@@ -1395,6 +1411,7 @@ out:
 	schedule_work(&cpu_policy->update);
 fail:
 	cpufreq_cpu_put(cpu_policy);
+#endif	/* __powerpc__ */
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2