From a953e4597abd51b74c99e0e3b7074532a60fd031 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: sched: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c

  * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c,
    where appropriate.  This saves some allocated space as well as many
    wasted cycles going through node entries that are non-existent.

For inclusion into sched-devel/latest tree.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a9153..1ed8011db82 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6879,9 +6879,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 
 	min_val = INT_MAX;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
+		n = (node + i) % nr_node_ids;
 
 		if (!nr_cpus_node(n))
 			continue;
@@ -7075,7 +7075,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 		if (!sched_group_nodes)
 			continue;
 
-		for (i = 0; i < MAX_NUMNODES; i++) {
+		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
 			*nodemask = node_to_cpumask(i);
@@ -7263,7 +7263,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
 				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7407,7 +7407,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7431,7 +7431,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 					send_covered, tmpmask);
 	}
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7470,9 +7470,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
 
-		for (j = 0; j < MAX_NUMNODES; j++) {
+		for (j = 0; j < nr_node_ids; j++) {
 			SCHED_CPUMASK_VAR(notcovered, allmasks);
-			int n = (i + j) % MAX_NUMNODES;
+			int n = (i + j) % nr_node_ids;
 			node_to_cpumask_ptr(pnodemask, n);
 
 			cpus_complement(*notcovered, *covered);
@@ -7525,7 +7525,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++)
+	for (i = 0; i < nr_node_ids; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sd_allnodes) {
-- 
cgit v1.2.3-70-g09d2


From 363ab6f1424cdea63e5d182312d60e19077b892a Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:13 +0200
Subject: core: use performance variant for_each_cpu_mask_nr

Change references from for_each_cpu_mask to for_each_cpu_mask_nr
where appropriate

Reviewed-by: Paul Jackson <pj@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu.c        |  2 +-
 kernel/rcuclassic.c |  2 +-
 kernel/rcupreempt.c | 10 +++++-----
 kernel/sched.c      | 36 ++++++++++++++++++------------------
 kernel/sched_fair.c |  2 +-
 kernel/sched_rt.c   |  6 +++---
 kernel/taskstats.c  |  4 ++--
 kernel/workqueue.c  |  6 +++---
 8 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c72..50ae922c602 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -390,7 +390,7 @@ void __ref enable_nonboot_cpus(void)
 		goto out;
 
 	printk("Enabling non-boot CPUs ...\n");
-	for_each_cpu_mask(cpu, frozen_cpus) {
+	for_each_cpu_mask_nr(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306..251358de70b 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -92,7 +92,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 */
 		cpumask = rcp->cpumask;
 		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask(cpu, cpumask)
+		for_each_cpu_mask_nr(cpu, cpumask)
 			smp_send_reschedule(cpu);
 	}
 }
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a51..18af270125c 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -657,7 +657,7 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -675,7 +675,7 @@ rcu_try_flip_waitack(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		if (rcu_try_flip_waitack_needed(cpu) &&
 		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -707,7 +707,7 @@ rcu_try_flip_waitzero(void)
 	/* Check to see if the sum of the "last" counters is zero. */
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 	if (sum != 0) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -722,7 +722,7 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -742,7 +742,7 @@ rcu_try_flip_waitmb(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		if (rcu_try_flip_waitmb_needed(cpu) &&
 		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ed8011db82..814d6e17f1e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2271,7 +2271,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 
-		for_each_cpu_mask(i, group->cpumask) {
+		for_each_cpu_mask_nr(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
@@ -2313,7 +2313,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
 	/* Traverse only the allowed CPUs */
 	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
 
-	for_each_cpu_mask(i, *tmp) {
+	for_each_cpu_mask_nr(i, *tmp) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -3296,7 +3296,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
-		for_each_cpu_mask(i, group->cpumask) {
+		for_each_cpu_mask_nr(i, group->cpumask) {
 			struct rq *rq;
 
 			if (!cpu_isset(i, *cpus))
@@ -3560,7 +3560,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	unsigned long max_load = 0;
 	int i;
 
-	for_each_cpu_mask(i, group->cpumask) {
+	for_each_cpu_mask_nr(i, group->cpumask) {
 		unsigned long wl;
 
 		if (!cpu_isset(i, *cpus))
@@ -4100,7 +4100,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		int balance_cpu;
 
 		cpu_clear(this_cpu, cpus);
-		for_each_cpu_mask(balance_cpu, cpus) {
+		for_each_cpu_mask_nr(balance_cpu, cpus) {
 			/*
 			 * If this cpu gets work to do, stop the load balancing
 			 * work being done for other cpus. Next load
@@ -6832,7 +6832,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 
 	cpus_clear(*covered);
 
-	for_each_cpu_mask(i, *span) {
+	for_each_cpu_mask_nr(i, *span) {
 		struct sched_group *sg;
 		int group = group_fn(i, cpu_map, &sg, tmpmask);
 		int j;
@@ -6843,7 +6843,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 		cpus_clear(sg->cpumask);
 		sg->__cpu_power = 0;
 
-		for_each_cpu_mask(j, *span) {
+		for_each_cpu_mask_nr(j, *span) {
 			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
 				continue;
 
@@ -7043,7 +7043,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 	if (!sg)
 		return;
 	do {
-		for_each_cpu_mask(j, sg->cpumask) {
+		for_each_cpu_mask_nr(j, sg->cpumask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j);
@@ -7068,7 +7068,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
 	int cpu, i;
 
-	for_each_cpu_mask(cpu, *cpu_map) {
+	for_each_cpu_mask_nr(cpu, *cpu_map) {
 		struct sched_group **sched_group_nodes
 			= sched_group_nodes_bycpu[cpu];
 
@@ -7302,7 +7302,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 
@@ -7374,7 +7374,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7391,7 +7391,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		SCHED_CPUMASK_VAR(this_core_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7458,7 +7458,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			goto error;
 		}
 		sched_group_nodes[i] = sg;
-		for_each_cpu_mask(j, *nodemask) {
+		for_each_cpu_mask_nr(j, *nodemask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(node_domains, j);
@@ -7504,21 +7504,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(cpu_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(core_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(phys_domains, i);
 
 		init_sched_groups_power(i, sd);
@@ -7538,7 +7538,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Attach the domains */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
@@ -7621,7 +7621,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 
 	unregister_sched_domain_sysctl();
 
-	for_each_cpu_mask(i, *cpu_map)
+	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
 	arch_destroy_sched_domains(cpu_map, &tmpmask);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b..e398318f101 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1022,7 +1022,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
-			for_each_cpu_mask(i, tmp) {
+			for_each_cpu_mask_nr(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
 						schedstat_inc(p,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1..d73386c6e36 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -231,7 +231,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		return 1;
 
 	span = sched_rt_period_mask();
-	for_each_cpu_mask(i, span) {
+	for_each_cpu_mask_nr(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 		struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -272,7 +272,7 @@ static int balance_runtime(struct rt_rq *rt_rq)
 
 	spin_lock(&rt_b->rt_runtime_lock);
 	rt_period = ktime_to_ns(rt_b->rt_period);
-	for_each_cpu_mask(i, rd->span) {
+	for_each_cpu_mask_nr(i, rd->span) {
 		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 		s64 diff;
 
@@ -1000,7 +1000,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 	next = pick_next_task_rt(this_rq);
 
-	for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
+	for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a23517169a..06b17547f4e 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 		return -EINVAL;
 
 	if (isadd == REGISTER) {
-		for_each_cpu_mask(cpu, mask) {
+		for_each_cpu_mask_nr(cpu, mask) {
 			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
 					 cpu_to_node(cpu));
 			if (!s)
@@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 
 	/* Deregister or cleanup */
 cleanup:
-	for_each_cpu_mask(cpu, mask) {
+	for_each_cpu_mask_nr(cpu, mask) {
 		listeners = &per_cpu(listener_array, cpu);
 		down_write(&listeners->sem);
 		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29fc39f1029..28c2b2c96ac 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -397,7 +397,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 	lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
 	lock_release(&wq->lockdep_map, 1, _THIS_IP_);
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -477,7 +477,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -813,7 +813,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
 	put_online_cpus();
 
-- 
cgit v1.2.3-70-g09d2


From 13b40c1e40f3261e83ee514a08b77dbecb93021b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 1 Jul 2008 10:32:50 -0700
Subject: sched: reduce stack size in isolated_cpu_setup()

  * Remove 16k stack requirements in isolated_cpu_setup when NR_CPUS=4096.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850eca..1ee18dbb451 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6768,7 +6768,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-	int ints[NR_CPUS], i;
+	static int __initdata ints[NR_CPUS];
+	int i;
 
 	str = get_options(str, ARRAY_SIZE(ints), ints);
 	cpus_clear(cpu_isolated_map);
-- 
cgit v1.2.3-70-g09d2


From e761b7725234276a802322549cee5255305a0930 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Tue, 15 Jul 2008 04:43:49 -0700
Subject: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain
 managment (take 2)

This is based on Linus' idea of creating cpu_active_map that prevents
scheduler load balancer from migrating tasks to the cpu that is going
down.

It allows us to simplify domain management code and avoid unecessary
domain rebuilds during cpu hotplug event handling.

Please ignore the cpusets part for now. It needs some more work in order
to avoid crazy lock nesting. Although I did simplfy and unify domain
reinitialization logic. We now simply call partition_sched_domains() in
all the cases. This means that we're using exact same code paths as in
cpusets case and hence the test below cover cpusets too.
Cpuset changes to make rebuild_sched_domains() callable from various
contexts are in the separate patch (right next after this one).

This not only boots but also easily handles
	while true; do make clean; make -j 8; done
and
	while true; do on-off-cpu 1; done
at the same time.
(on-off-cpu 1 simple does echo 0/1 > /sys/.../cpu1/online thing).

Suprisingly the box (dual-core Core2) is quite usable. In fact I'm typing
this on right now in gnome-terminal and things are moving just fine.

Also this is running with most of the debug features enabled (lockdep,
mutex, etc) no BUG_ONs or lockdep complaints so far.

I believe I addressed all of the Dmitry's comments for original Linus'
version. I changed both fair and rt balancer to mask out non-active cpus.
And replaced cpu_is_offline() with !cpu_active() in the main scheduler
code where it made sense (to me).

Signed-off-by: Max Krasnyanskiy <maxk@qualcomm.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: dmitry.adamushko@gmail.com
Cc: pj@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpumask.h |   6 ++-
 include/linux/cpuset.h  |   7 ++++
 init/main.c             |   7 ++++
 kernel/cpu.c            |  30 +++++++++++---
 kernel/cpuset.c         |   2 +-
 kernel/sched.c          | 108 ++++++++++++++++++++----------------------------
 kernel/sched_fair.c     |   3 ++
 kernel/sched_rt.c       |   7 ++++
 8 files changed, 99 insertions(+), 71 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c24875bd9c5..d614d247279 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -359,13 +359,14 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 
 /*
  * The following particular system cpumasks and operations manage
- * possible, present and online cpus.  Each of them is a fixed size
+ * possible, present, active and online cpus.  Each of them is a fixed size
  * bitmap of size NR_CPUS.
  *
  *  #ifdef CONFIG_HOTPLUG_CPU
  *     cpu_possible_map - has bit 'cpu' set iff cpu is populatable
  *     cpu_present_map  - has bit 'cpu' set iff cpu is populated
  *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
+ *     cpu_active_map   - has bit 'cpu' set iff cpu available to migration
  *  #else
  *     cpu_possible_map - has bit 'cpu' set iff cpu is populated
  *     cpu_present_map  - copy of cpu_possible_map
@@ -416,6 +417,7 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 extern cpumask_t cpu_possible_map;
 extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_present_map;
+extern cpumask_t cpu_active_map;
 
 #if NR_CPUS > 1
 #define num_online_cpus()	cpus_weight(cpu_online_map)
@@ -424,6 +426,7 @@ extern cpumask_t cpu_present_map;
 #define cpu_online(cpu)		cpu_isset((cpu), cpu_online_map)
 #define cpu_possible(cpu)	cpu_isset((cpu), cpu_possible_map)
 #define cpu_present(cpu)	cpu_isset((cpu), cpu_present_map)
+#define cpu_active(cpu)		cpu_isset((cpu), cpu_active_map)
 #else
 #define num_online_cpus()	1
 #define num_possible_cpus()	1
@@ -431,6 +434,7 @@ extern cpumask_t cpu_present_map;
 #define cpu_online(cpu)		((cpu) == 0)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
+#define cpu_active(cpu)		((cpu) == 0)
 #endif
 
 #define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 038578362b4..e8f450c499b 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -78,6 +78,8 @@ extern void cpuset_track_online_nodes(void);
 
 extern int current_cpuset_is_being_rebound(void);
 
+extern void rebuild_sched_domains(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -156,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void)
 	return 0;
 }
 
+static inline void rebuild_sched_domains(void)
+{
+	partition_sched_domains(0, NULL, NULL);
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/init/main.c b/init/main.c
index edeace036fd..dd25259530e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -415,6 +415,13 @@ static void __init smp_init(void)
 {
 	unsigned int cpu;
 
+	/*
+	 * Set up the current CPU as possible to migrate to.
+	 * The other ones will be done by cpu_up/cpu_down()
+	 */
+	cpu = smp_processor_id();
+	cpu_set(cpu, cpu_active_map);
+
 	/* FIXME: This should be done in userspace --RR */
 	for_each_present_cpu(cpu) {
 		if (num_online_cpus() >= setup_max_cpus)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cfb1d43ab80..a1ac7ea245d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
+cpumask_t cpu_active_map;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu)
 	int err = 0;
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_down(cpu, 0);
+		goto out;
+	}
+
+	cpu_clear(cpu, cpu_active_map);
+
+	err = _cpu_down(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
 
+out:
 	cpu_maps_update_done();
 	return err;
 }
@@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu)
 	}
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_up(cpu, 0);
+		goto out;
+	}
 
+	err = _cpu_up(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
+
+out:
 	cpu_maps_update_done();
 	return err;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a..3c3ef02f65f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  *	partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
 	struct kfifo *q;	/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ee18dbb451..c237624a8a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 
 	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
-	    || unlikely(cpu_is_offline(dest_cpu)))
+	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
 	/* force the process onto the specified CPU */
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
 		/*
 		 * If we are going offline and still the leader, give up!
 		 */
-		if (cpu_is_offline(cpu) &&
+		if (!cpu_active(cpu) &&
 		    atomic_read(&nohz.load_balancer) == cpu) {
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	struct rq *rq_dest, *rq_src;
 	int ret = 0, on_rq;
 
-	if (unlikely(cpu_is_offline(dest_cpu)))
+	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
 	rq_src = cpu_rq(src_cpu);
@@ -7553,18 +7553,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
-/*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-	ndoms_cur = 0;
-	if (doms_cur != &fallback_doms)
-		kfree(doms_cur);
-	doms_cur = &fallback_doms;
-}
-
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * Call with hotplug lock held
  */
@@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL) {
-		ndoms_new = 1;
-		doms_new = &fallback_doms;
-		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
-	}
+	if (doms_new == NULL)
+		ndoms_new = 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
@@ -7677,6 +7661,14 @@ match1:
 		;
 	}
 
+	if (doms_new == NULL) {
+		ndoms_cur = 0;
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+		dattr_new = NULL;
+	}
+
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
@@ -7707,17 +7699,10 @@ match2:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-	int err;
-
 	get_online_cpus();
-	mutex_lock(&sched_domains_mutex);
-	detach_destroy_domains(&cpu_online_map);
-	free_sched_domains();
-	err = arch_init_sched_domains(&cpu_online_map);
-	mutex_unlock(&sched_domains_mutex);
+	rebuild_sched_domains();
 	put_online_cpus();
-
-	return err;
+	return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7783,14 +7768,30 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
  */
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		partition_sched_domains(0, NULL, NULL);
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
 {
 	int cpu = (int)(long)hcpu;
 
@@ -7798,44 +7799,18 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		disable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		detach_destroy_domains(&cpu_online_map);
-		free_sched_domains();
 		return NOTIFY_OK;
 
-
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		enable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		/*
-		 * Fall through and re-initialise the domains.
-		 */
-		break;
+		return NOTIFY_OK;
+
 	default:
 		return NOTIFY_DONE;
 	}
-
-#ifndef CONFIG_CPUSETS
-	/*
-	 * Create default domain partitioning if cpusets are disabled.
-	 * Otherwise we let cpusets rebuild the domains based on the
-	 * current setup.
-	 */
-
-	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains(&cpu_online_map);
-#endif
-
-	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7855,8 +7830,15 @@ void __init sched_init_smp(void)
 		cpu_set(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+	/* RT runtime code needs to handle some hotplug events */
+	hotcpu_notifier(update_runtime, 0);
+
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d..d924c679dfa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available.  The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
+			cpus_and(tmp, tmp, cpu_active_map);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d3d1cccb3d7..50735bb9614 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -933,6 +933,13 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
+	/*
+	 * Only consider CPUs that are usable for migration.
+	 * I guess we might want to change cpupri_find() to ignore those
+	 * in the first place.
+	 */
+	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system.  Now we want to elect
-- 
cgit v1.2.3-70-g09d2


From 31656519e132f6612584815f128c83976a9aaaef Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 18 Jul 2008 18:01:23 +0200
Subject: sched, x86: clean up hrtick implementation

random uvesafb failures were reported against Gentoo:

  http://bugs.gentoo.org/show_bug.cgi?id=222799

and Mihai Moldovan bisected it back to:

> 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit
> commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f
> Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date:   Fri Jan 25 21:08:29 2008 +0100
>
>    sched: high-res preemption tick

Linus suspected it to be hrtick + vm86 interaction and observed:

> Btw, Peter, Ingo: I think that commit is doing bad things. They aren't
> _incorrect_ per se, but they are definitely bad.
>
> Why?
>
> Using random _TIF_WORK_MASK flags is really impolite for doing
> "scheduling" work. There's a reason that arch/x86/kernel/entry_32.S
> special-cases the _TIF_NEED_RESCHED flag: we don't want to exit out of
> vm86 mode unnecessarily.
>
> See the "work_notifysig_v86" label, and how it does that
> "save_v86_state()" thing etc etc.

Right, I never liked having to fiddle with those TIF flags. Initially I
needed it because the hrtimer base lock could not nest in the rq lock.
That however is fixed these days.

Currently the only reason left to fiddle with the TIF flags is remote
wakeups. We cannot program a remote cpu's hrtimer. I've been thinking
about using the new and improved IPI function call stuff to implement
hrtimer_start_on().

However that does require that smp_call_function_single(.wait=0) works
from interrupt context - /me looks at the latest series from Jens - Yes
that does seem to be supported, good.

Here's a stab at cleaning this stuff up ...

Mihai reported test success as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Mihai Moldovan <ionic@ionic.de>
Cc: Michal Januszewski <spock@gentoo.org>
Cc: Antonino Daplas <adaplas@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/signal_32.c   |   3 -
 arch/x86/kernel/signal_64.c   |   3 -
 include/asm-x86/thread_info.h |   4 +-
 kernel/Kconfig.hz             |   2 +-
 kernel/sched.c                | 202 +++++++++++++-----------------------------
 kernel/sched_fair.c           |   5 +-
 6 files changed, 64 insertions(+), 155 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d9237363096..e1fc7bd57bf 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -667,8 +667,5 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
-
 	clear_thread_flag(TIF_IRET);
 }
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e..88023fcc704 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -502,9 +502,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
-
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h
index 895339d2bc0..d7012634ace 100644
--- a/include/asm-x86/thread_info.h
+++ b/include/asm-x86/thread_info.h
@@ -81,7 +81,6 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
-#define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -108,7 +107,6 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
-#define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -132,7 +130,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
+	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a2e62..2a202a84675 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && X86
+	def_bool HIGH_RES_TIMERS
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ee18dbb451..c13c75e9f9f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -571,8 +571,10 @@ struct rq {
 #endif
 
 #ifdef CONFIG_SCHED_HRTICK
-	unsigned long hrtick_flags;
-	ktime_t hrtick_expire;
+#ifdef CONFIG_SMP
+	int hrtick_csd_pending;
+	struct call_single_data hrtick_csd;
+#endif
 	struct hrtimer hrtick_timer;
 #endif
 
@@ -983,13 +985,6 @@ static struct rq *this_rq_lock(void)
 	return rq;
 }
 
-static void __resched_task(struct task_struct *p, int tif_bit);
-
-static inline void resched_task(struct task_struct *p)
-{
-	__resched_task(p, TIF_NEED_RESCHED);
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
@@ -1001,25 +996,6 @@ static inline void resched_task(struct task_struct *p)
  * When we get rescheduled we reprogram the hrtick_timer outside of the
  * rq->lock.
  */
-static inline void resched_hrt(struct task_struct *p)
-{
-	__resched_task(p, TIF_HRTICK_RESCHED);
-}
-
-static inline void resched_rq(struct rq *rq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	resched_task(rq->curr);
-	spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-enum {
-	HRTICK_SET,		/* re-programm hrtick_timer */
-	HRTICK_RESET,		/* not a new slice */
-	HRTICK_BLOCK,		/* stop hrtick operations */
-};
 
 /*
  * Use hrtick when:
@@ -1030,72 +1006,17 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
-	if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+	if (!cpu_online(cpu_of(rq)))
 		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
 
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay, int reset)
-{
-	assert_spin_locked(&rq->lock);
-
-	/*
-	 * preempt at: now + delay
-	 */
-	rq->hrtick_expire =
-		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
-	/*
-	 * indicate we need to program the timer
-	 */
-	__set_bit(HRTICK_SET, &rq->hrtick_flags);
-	if (reset)
-		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
-
-	/*
-	 * New slices are called from the schedule path and don't need a
-	 * forced reschedule.
-	 */
-	if (reset)
-		resched_hrt(rq->curr);
-}
-
 static void hrtick_clear(struct rq *rq)
 {
 	if (hrtimer_active(&rq->hrtick_timer))
 		hrtimer_cancel(&rq->hrtick_timer);
 }
 
-/*
- * Update the timer from the possible pending state.
- */
-static void hrtick_set(struct rq *rq)
-{
-	ktime_t time;
-	int set, reset;
-	unsigned long flags;
-
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-	spin_lock_irqsave(&rq->lock, flags);
-	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
-	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
-	time = rq->hrtick_expire;
-	clear_thread_flag(TIF_HRTICK_RESCHED);
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (set) {
-		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
-		if (reset && !hrtimer_active(&rq->hrtick_timer))
-			resched_rq(rq);
-	} else
-		hrtick_clear(rq);
-}
-
 /*
  * High-resolution timer tick.
  * Runs from hardirq context with interrupts disabled.
@@ -1115,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 }
 
 #ifdef CONFIG_SMP
-static void hotplug_hrtick_disable(int cpu)
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	rq->hrtick_flags = 0;
-	__set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	struct rq *rq = arg;
 
-	hrtick_clear(rq);
+	spin_lock(&rq->lock);
+	hrtimer_restart(&rq->hrtick_timer);
+	rq->hrtick_csd_pending = 0;
+	spin_unlock(&rq->lock);
 }
 
-static void hotplug_hrtick_enable(int cpu)
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
+	struct hrtimer *timer = &rq->hrtick_timer;
+	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-	spin_lock_irqsave(&rq->lock, flags);
-	__clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	timer->expires = time;
+
+	if (rq == this_rq()) {
+		hrtimer_restart(timer);
+	} else if (!rq->hrtick_csd_pending) {
+		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+		rq->hrtick_csd_pending = 1;
+	}
 }
 
 static int
@@ -1150,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		hotplug_hrtick_disable(cpu);
-		return NOTIFY_OK;
-
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		hotplug_hrtick_enable(cpu);
+		hrtick_clear(cpu_rq(cpu));
 		return NOTIFY_OK;
 	}
 
@@ -1170,46 +1092,45 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
-#endif /* CONFIG_SMP */
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+}
 
-static void init_rq_hrtick(struct rq *rq)
+static void init_hrtick(void)
 {
-	rq->hrtick_flags = 0;
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
+#endif /* CONFIG_SMP */
 
-void hrtick_resched(void)
+static void init_rq_hrtick(struct rq *rq)
 {
-	struct rq *rq;
-	unsigned long flags;
+#ifdef CONFIG_SMP
+	rq->hrtick_csd_pending = 0;
 
-	if (!test_thread_flag(TIF_HRTICK_RESCHED))
-		return;
+	rq->hrtick_csd.flags = 0;
+	rq->hrtick_csd.func = __hrtick_start;
+	rq->hrtick_csd.info = rq;
+#endif
 
-	local_irq_save(flags);
-	rq = cpu_rq(smp_processor_id());
-	hrtick_set(rq);
-	local_irq_restore(flags);
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
 {
 }
 
-static inline void hrtick_set(struct rq *rq)
-{
-}
-
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
 
-void hrtick_resched(void)
-{
-}
-
 static inline void init_hrtick(void)
 {
 }
@@ -1228,16 +1149,16 @@ static inline void init_hrtick(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 		return;
 
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1303,10 +1224,10 @@ void wake_up_idle_cpu(int cpu)
 #endif /* CONFIG_NO_HZ */
 
 #else /* !CONFIG_SMP */
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_need_resched(p);
 }
 #endif /* CONFIG_SMP */
 
@@ -4395,7 +4316,7 @@ asmlinkage void __sched schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
-	int cpu, hrtick = sched_feat(HRTICK);
+	int cpu;
 
 need_resched:
 	preempt_disable();
@@ -4410,7 +4331,7 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
-	if (hrtick)
+	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
 
 	/*
@@ -4457,9 +4378,6 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	if (hrtick)
-		hrtick_set(rq);
-
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d..6893b3ed65f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -878,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
-	int requeue = rq->curr == p;
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
@@ -899,10 +898,10 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		 * Don't schedule slices shorter than 10000ns, that just
 		 * doesn't make sense. Rely on vruntime for fairness.
 		 */
-		if (!requeue)
+		if (rq->curr != p)
 			delta = max(10000LL, delta);
 
-		hrtick_start(rq, delta, requeue);
+		hrtick_start(rq, delta);
 	}
 }
 #else /* !CONFIG_SCHED_HRTICK */
-- 
cgit v1.2.3-70-g09d2


From ba42059fbd0aa1ac91b582412b5fedb1258f241f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 20 Jul 2008 11:02:06 +0200
Subject: sched: hrtick_enabled() should use cpu_active()

Peter pointed out that hrtick_enabled() should use cpu_active().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 85cf246cfdf..62b1b8ecb5c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1006,7 +1006,7 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
-	if (!cpu_online(cpu_of(rq)))
+	if (!cpu_active(cpu_of(rq)))
 		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
-- 
cgit v1.2.3-70-g09d2


From 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 1 Jul 2008 18:48:41 +0200
Subject: sysdev: Pass the attribute to the low level sysdev show/store
 function

This allow to dynamically generate attributes and share show/store
functions between attributes. Right now most attributes are generated
by special macros and lots of duplicated code. With the attribute
passed it's instead possible to attach some data to the attribute
and then use that in shared low level functions to do different things.

I need this for the dynamically generated bank attributes in the x86
machine check code, but it'll allow some further cleanups.

I converted all users in tree to the new show/store prototype. It's a single
huge patch to avoid unbisectable sections.

Runtime tested: x86-32, x86-64
Compiled only: ia64, powerpc
Not compile tested/only grep converted: sh, arm, avr32

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/arm/kernel/time.c                    |  4 ++-
 arch/avr32/kernel/cpu.c                   | 38 +++++++++++++++++---------
 arch/ia64/kernel/err_inject.c             | 22 ++++++++++-----
 arch/powerpc/kernel/sysfs.c               | 15 ++++++++---
 arch/powerpc/platforms/cell/cbe_thermal.c | 45 ++++++++++++++++++++-----------
 arch/powerpc/platforms/cell/spu_base.c    |  3 ++-
 arch/s390/kernel/smp.c                    | 36 ++++++++++++++++---------
 arch/s390/kernel/time.c                   | 35 ++++++++++++++++--------
 arch/sh/drivers/dma/dma-sysfs.c           | 15 ++++++++---
 arch/sparc64/kernel/sysfs.c               | 16 +++++++----
 arch/x86/kernel/cpu/mcheck/mce_64.c       | 14 +++++++---
 arch/x86/kernel/cpu/mcheck/therm_throt.c  |  1 +
 arch/x86/kernel/microcode.c               | 10 ++++---
 drivers/base/cpu.c                        | 10 ++++---
 drivers/base/memory.c                     | 12 ++++++---
 drivers/base/node.c                       | 15 +++++++----
 drivers/base/sys.c                        |  4 +--
 drivers/base/topology.c                   | 17 ++++++++----
 drivers/cpuidle/sysfs.c                   | 10 ++++---
 drivers/xen/balloon.c                     |  1 +
 include/linux/sysdev.h                    |  5 ++--
 kernel/rtmutex-tester.c                   |  7 ++---
 kernel/sched.c                            |  8 ++++--
 kernel/time/clocksource.c                 |  8 ++++--
 24 files changed, 239 insertions(+), 112 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index cc5145b28e7..368d171754c 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -130,7 +130,9 @@ static const struct leds_evt_name evt_names[] = {
 	{ "red",   led_red_on,   led_red_off   },
 };
 
-static ssize_t leds_store(struct sys_device *dev, const char *buf, size_t size)
+static ssize_t leds_store(struct sys_device *dev,
+			struct sysdev_attribute *attr,
+			const char *buf, size_t size)
 {
 	int ret = -EINVAL, len = strcspn(buf, " ");
 
diff --git a/arch/avr32/kernel/cpu.c b/arch/avr32/kernel/cpu.c
index b8409caeb23..e84faffbbec 100644
--- a/arch/avr32/kernel/cpu.c
+++ b/arch/avr32/kernel/cpu.c
@@ -26,14 +26,16 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * XXX: If/when a SMP-capable implementation of AVR32 will ever be
  * made, we must make sure that the code executes on the correct CPU.
  */
-static ssize_t show_pc0event(struct sys_device *dev, char *buf)
+static ssize_t show_pc0event(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 12) & 0x3f);
 }
-static ssize_t store_pc0event(struct sys_device *dev, const char *buf,
+static ssize_t store_pc0event(struct sys_device *dev,
+			struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -46,15 +48,17 @@ static ssize_t store_pc0event(struct sys_device *dev, const char *buf,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc0count(struct sys_device *dev, char *buf)
+static ssize_t show_pc0count(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pcnt0;
 
 	pcnt0 = sysreg_read(PCNT0);
 	return sprintf(buf, "%lu\n", pcnt0);
 }
-static ssize_t store_pc0count(struct sys_device *dev, const char *buf,
-			      size_t count)
+static ssize_t store_pc0count(struct sys_device *dev,
+				struct sysdev_attribute *attr,
+				const char *buf, size_t count)
 {
 	unsigned long val;
 	char *endp;
@@ -67,14 +71,16 @@ static ssize_t store_pc0count(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_pc1event(struct sys_device *dev, char *buf)
+static ssize_t show_pc1event(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 18) & 0x3f);
 }
-static ssize_t store_pc1event(struct sys_device *dev, const char *buf,
+static ssize_t store_pc1event(struct sys_device *dev,
+			      struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -87,14 +93,16 @@ static ssize_t store_pc1event(struct sys_device *dev, const char *buf,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc1count(struct sys_device *dev, char *buf)
+static ssize_t show_pc1count(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pcnt1;
 
 	pcnt1 = sysreg_read(PCNT1);
 	return sprintf(buf, "%lu\n", pcnt1);
 }
-static ssize_t store_pc1count(struct sys_device *dev, const char *buf,
+static ssize_t store_pc1count(struct sys_device *dev,
+				struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -108,14 +116,16 @@ static ssize_t store_pc1count(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_pccycles(struct sys_device *dev, char *buf)
+static ssize_t show_pccycles(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccnt;
 
 	pccnt = sysreg_read(PCCNT);
 	return sprintf(buf, "%lu\n", pccnt);
 }
-static ssize_t store_pccycles(struct sys_device *dev, const char *buf,
+static ssize_t store_pccycles(struct sys_device *dev,
+				struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -129,14 +139,16 @@ static ssize_t store_pccycles(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_pcenable(struct sys_device *dev, char *buf)
+static ssize_t show_pcenable(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "%c\n", (pccr & 1)?'1':'0');
 }
-static ssize_t store_pcenable(struct sys_device *dev, const char *buf,
+static ssize_t store_pcenable(struct sys_device *dev,
+			      struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long pccr, val;
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index b642648cc2a..c539c689493 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -55,7 +55,8 @@ static u64 resources[NR_CPUS];
 
 #define show(name) 							\
 static ssize_t 								\
-show_##name(struct sys_device *dev, char *buf)				\
+show_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+		char *buf)						\
 {									\
 	u32 cpu=dev->id;						\
 	return sprintf(buf, "%lx\n", name[cpu]);			\
@@ -63,7 +64,8 @@ show_##name(struct sys_device *dev, char *buf)				\
 
 #define store(name)							\
 static ssize_t 								\
-store_##name(struct sys_device *dev, const char *buf, size_t size)	\
+store_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+					const char *buf, size_t size)	\
 {									\
 	unsigned int cpu=dev->id;					\
 	name[cpu] = simple_strtoull(buf, NULL, 16);			\
@@ -76,7 +78,8 @@ show(call_start)
  * processor. The cpu number in driver is only used for storing data.
  */
 static ssize_t
-store_call_start(struct sys_device *dev, const char *buf, size_t size)
+store_call_start(struct sys_device *dev, struct sysdev_attribute *attr,
+		const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
 	unsigned long call_start = simple_strtoull(buf, NULL, 16);
@@ -124,14 +127,16 @@ show(err_type_info)
 store(err_type_info)
 
 static ssize_t
-show_virtual_to_phys(struct sys_device *dev, char *buf)
+show_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+			char *buf)
 {
 	unsigned int cpu=dev->id;
 	return sprintf(buf, "%lx\n", phys_addr[cpu]);
 }
 
 static ssize_t
-store_virtual_to_phys(struct sys_device *dev, const char *buf, size_t size)
+store_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
 	u64 virt_addr=simple_strtoull(buf, NULL, 16);
@@ -154,7 +159,8 @@ show(err_struct_info)
 store(err_struct_info)
 
 static ssize_t
-show_err_data_buffer(struct sys_device *dev, char *buf)
+show_err_data_buffer(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned int cpu=dev->id;
 
@@ -165,7 +171,9 @@ show_err_data_buffer(struct sys_device *dev, char *buf)
 }
 
 static ssize_t
-store_err_data_buffer(struct sys_device *dev, const char *buf, size_t size)
+store_err_data_buffer(struct sys_device *dev,
+			struct sysdev_attribute *attr,
+			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
 	int ret;
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c8127f832df..aba0ba95f06 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -28,7 +28,9 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
 /* Time in microseconds we delay before sleeping in the idle loop */
 DEFINE_PER_CPU(unsigned long, smt_snooze_delay) = { 100 };
 
-static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
+static ssize_t store_smt_snooze_delay(struct sys_device *dev,
+				      struct sysdev_attribute *attr,
+				      const char *buf,
 				      size_t count)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -44,7 +46,9 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_smt_snooze_delay(struct sys_device *dev, char *buf)
+static ssize_t show_smt_snooze_delay(struct sys_device *dev,
+				     struct sysdev_attribute *attr,
+				     char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
@@ -152,14 +156,17 @@ static unsigned long write_##NAME(unsigned long val) \
 	mtspr(ADDRESS, val); \
 	return 0; \
 } \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+			struct sysdev_attribute *attr, \
+			char *buf) \
 { \
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
 	unsigned long val = run_on_cpu(cpu->sysdev.id, read_##NAME, 0); \
 	return sprintf(buf, "%lx\n", val); \
 } \
 static ssize_t __used \
-	store_##NAME(struct sys_device *dev, const char *buf, size_t count) \
+	store_##NAME(struct sys_device *dev, struct sysdev_attribute *attr, \
+			const char *buf, size_t count) \
 { \
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
 	unsigned long val; \
diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c
index 4852bf312d8..4d4c8c16912 100644
--- a/arch/powerpc/platforms/cell/cbe_thermal.c
+++ b/arch/powerpc/platforms/cell/cbe_thermal.c
@@ -97,7 +97,8 @@ static u8 spu_read_register_value(struct sys_device *sysdev, union spe_reg __iom
 	return value.spe[spu->spe_id];
 }
 
-static ssize_t spu_show_temp(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_temp(struct sys_device *sysdev, struct sysdev_attribute *attr,
+			char *buf)
 {
 	u8 value;
 	struct cbe_pmd_regs __iomem *pmd_regs;
@@ -146,32 +147,38 @@ static ssize_t store_throttle(struct cbe_pmd_regs __iomem *pmd_regs, const char
 	return size;
 }
 
-static ssize_t spu_show_throttle_end(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(get_pmd_regs(sysdev), buf, 0);
 }
 
-static ssize_t spu_show_throttle_begin(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(get_pmd_regs(sysdev), buf, 8);
 }
 
-static ssize_t spu_show_throttle_full_stop(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(get_pmd_regs(sysdev), buf, 16);
 }
 
-static ssize_t spu_store_throttle_end(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t spu_store_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(get_pmd_regs(sysdev), buf, size, 0);
 }
 
-static ssize_t spu_store_throttle_begin(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t spu_store_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(get_pmd_regs(sysdev), buf, size, 8);
 }
 
-static ssize_t spu_store_throttle_full_stop(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t spu_store_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(get_pmd_regs(sysdev), buf, size, 16);
 }
@@ -192,43 +199,51 @@ static ssize_t ppe_show_temp(struct sys_device *sysdev, char *buf, int pos)
 
 /* shows the temperature of the DTS on the PPE,
  * located near the linear thermal sensor */
-static ssize_t ppe_show_temp0(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_temp0(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return ppe_show_temp(sysdev, buf, 32);
 }
 
 /* shows the temperature of the second DTS on the PPE */
-static ssize_t ppe_show_temp1(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_temp1(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return ppe_show_temp(sysdev, buf, 0);
 }
 
-static ssize_t ppe_show_throttle_end(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 32);
 }
 
-static ssize_t ppe_show_throttle_begin(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 40);
 }
 
-static ssize_t ppe_show_throttle_full_stop(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 48);
 }
 
-static ssize_t ppe_store_throttle_end(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 32);
 }
 
-static ssize_t ppe_store_throttle_begin(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 40);
 }
 
-static ssize_t ppe_store_throttle_full_stop(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 48);
 }
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 78f905bc6a4..a5bdb89a17c 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -703,7 +703,8 @@ static unsigned long long spu_acct_time(struct spu *spu,
 }
 
 
-static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf)
+static ssize_t spu_stat_show(struct sys_device *sysdev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct spu *spu = container_of(sysdev, struct spu, sysdev);
 
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b6781030cfb..b795b3e24af 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -864,7 +864,8 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t cpu_configure_show(struct sys_device *dev, char *buf)
+static ssize_t cpu_configure_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t count;
 
@@ -874,8 +875,9 @@ static ssize_t cpu_configure_show(struct sys_device *dev, char *buf)
 	return count;
 }
 
-static ssize_t cpu_configure_store(struct sys_device *dev, const char *buf,
-				   size_t count)
+static ssize_t cpu_configure_store(struct sys_device *dev,
+				  struct sysdev_attribute *attr,
+				  const char *buf, size_t count)
 {
 	int cpu = dev->id;
 	int val, rc;
@@ -922,7 +924,8 @@ out:
 static SYSDEV_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t cpu_polarization_show(struct sys_device *dev, char *buf)
+static ssize_t cpu_polarization_show(struct sys_device *dev,
+				     struct sysdev_attribute *attr, char *buf)
 {
 	int cpu = dev->id;
 	ssize_t count;
@@ -950,7 +953,8 @@ static ssize_t cpu_polarization_show(struct sys_device *dev, char *buf)
 }
 static SYSDEV_ATTR(polarization, 0444, cpu_polarization_show, NULL);
 
-static ssize_t show_cpu_address(struct sys_device *dev, char *buf)
+static ssize_t show_cpu_address(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", __cpu_logical_map[dev->id]);
 }
@@ -970,7 +974,8 @@ static struct attribute_group cpu_common_attr_group = {
 	.attrs = cpu_common_attrs,
 };
 
-static ssize_t show_capability(struct sys_device *dev, char *buf)
+static ssize_t show_capability(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned int capability;
 	int rc;
@@ -982,7 +987,8 @@ static ssize_t show_capability(struct sys_device *dev, char *buf)
 }
 static SYSDEV_ATTR(capability, 0444, show_capability, NULL);
 
-static ssize_t show_idle_count(struct sys_device *dev, char *buf)
+static ssize_t show_idle_count(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long idle_count;
@@ -995,7 +1001,8 @@ static ssize_t show_idle_count(struct sys_device *dev, char *buf)
 }
 static SYSDEV_ATTR(idle_count, 0444, show_idle_count, NULL);
 
-static ssize_t show_idle_time(struct sys_device *dev, char *buf)
+static ssize_t show_idle_time(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long new_time;
@@ -1112,7 +1119,9 @@ out:
 	return rc;
 }
 
-static ssize_t __ref rescan_store(struct sys_device *dev, const char *buf,
+static ssize_t __ref rescan_store(struct sys_device *dev,
+				  struct sysdev_attribute *attr,
+				  const char *buf,
 				  size_t count)
 {
 	int rc;
@@ -1123,7 +1132,9 @@ static ssize_t __ref rescan_store(struct sys_device *dev, const char *buf,
 static SYSDEV_ATTR(rescan, 0200, NULL, rescan_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t dispatching_show(struct sys_device *dev, char *buf)
+static ssize_t dispatching_show(struct sys_device *dev,
+				struct sysdev_attribute *attr,
+				char *buf)
 {
 	ssize_t count;
 
@@ -1133,8 +1144,9 @@ static ssize_t dispatching_show(struct sys_device *dev, char *buf)
 	return count;
 }
 
-static ssize_t dispatching_store(struct sys_device *dev, const char *buf,
-				 size_t count)
+static ssize_t dispatching_store(struct sys_device *dev,
+				 struct sysdev_attribute *attr,
+				 const char *buf, size_t count)
 {
 	int val, rc;
 	char delim;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index f2cede3947b..ab70d9bd926 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -1100,7 +1100,9 @@ static inline struct etr_aib *etr_aib_from_dev(struct sys_device *dev)
 		return etr_port1_online ? &etr_port1 : NULL;
 }
 
-static ssize_t etr_online_show(struct sys_device *dev, char *buf)
+static ssize_t etr_online_show(struct sys_device *dev,
+				struct sysdev_attribute *attr,
+				char *buf)
 {
 	unsigned int online;
 
@@ -1109,7 +1111,8 @@ static ssize_t etr_online_show(struct sys_device *dev, char *buf)
 }
 
 static ssize_t etr_online_store(struct sys_device *dev,
-			      const char *buf, size_t count)
+				struct sysdev_attribute *attr,
+				const char *buf, size_t count)
 {
 	unsigned int value;
 
@@ -1136,7 +1139,9 @@ static ssize_t etr_online_store(struct sys_device *dev,
 
 static SYSDEV_ATTR(online, 0600, etr_online_show, etr_online_store);
 
-static ssize_t etr_stepping_control_show(struct sys_device *dev, char *buf)
+static ssize_t etr_stepping_control_show(struct sys_device *dev,
+					struct sysdev_attribute *attr,
+					char *buf)
 {
 	return sprintf(buf, "%i\n", (dev == &etr_port0_dev) ?
 		       etr_eacr.e0 : etr_eacr.e1);
@@ -1144,7 +1149,8 @@ static ssize_t etr_stepping_control_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(stepping_control, 0400, etr_stepping_control_show, NULL);
 
-static ssize_t etr_mode_code_show(struct sys_device *dev, char *buf)
+static ssize_t etr_mode_code_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	if (!etr_port0_online && !etr_port1_online)
 		/* Status word is not uptodate if both ports are offline. */
@@ -1155,7 +1161,8 @@ static ssize_t etr_mode_code_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(state_code, 0400, etr_mode_code_show, NULL);
 
-static ssize_t etr_untuned_show(struct sys_device *dev, char *buf)
+static ssize_t etr_untuned_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1166,7 +1173,8 @@ static ssize_t etr_untuned_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(untuned, 0400, etr_untuned_show, NULL);
 
-static ssize_t etr_network_id_show(struct sys_device *dev, char *buf)
+static ssize_t etr_network_id_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1177,7 +1185,8 @@ static ssize_t etr_network_id_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(network, 0400, etr_network_id_show, NULL);
 
-static ssize_t etr_id_show(struct sys_device *dev, char *buf)
+static ssize_t etr_id_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1188,7 +1197,8 @@ static ssize_t etr_id_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(id, 0400, etr_id_show, NULL);
 
-static ssize_t etr_port_number_show(struct sys_device *dev, char *buf)
+static ssize_t etr_port_number_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1199,7 +1209,8 @@ static ssize_t etr_port_number_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(port, 0400, etr_port_number_show, NULL);
 
-static ssize_t etr_coupled_show(struct sys_device *dev, char *buf)
+static ssize_t etr_coupled_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1210,7 +1221,8 @@ static ssize_t etr_coupled_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(coupled, 0400, etr_coupled_show, NULL);
 
-static ssize_t etr_local_time_show(struct sys_device *dev, char *buf)
+static ssize_t etr_local_time_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1221,7 +1233,8 @@ static ssize_t etr_local_time_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(local_time, 0400, etr_local_time_show, NULL);
 
-static ssize_t etr_utc_offset_show(struct sys_device *dev, char *buf)
+static ssize_t etr_utc_offset_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
diff --git a/arch/sh/drivers/dma/dma-sysfs.c b/arch/sh/drivers/dma/dma-sysfs.c
index 51b57c0d1a3..347ee11351e 100644
--- a/arch/sh/drivers/dma/dma-sysfs.c
+++ b/arch/sh/drivers/dma/dma-sysfs.c
@@ -23,7 +23,8 @@ static struct sysdev_class dma_sysclass = {
 };
 EXPORT_SYMBOL(dma_sysclass);
 
-static ssize_t dma_show_devices(struct sys_device *dev, char *buf)
+static ssize_t dma_show_devices(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t len = 0;
 	int i;
@@ -57,13 +58,15 @@ static int __init dma_sysclass_init(void)
 }
 postcore_initcall(dma_sysclass_init);
 
-static ssize_t dma_show_dev_id(struct sys_device *dev, char *buf)
+static ssize_t dma_show_dev_id(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
 	return sprintf(buf, "%s\n", channel->dev_id);
 }
 
 static ssize_t dma_store_dev_id(struct sys_device *dev,
+				struct sysdev_attribute *attr,
 				const char *buf, size_t count)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
@@ -74,6 +77,7 @@ static ssize_t dma_store_dev_id(struct sys_device *dev,
 static SYSDEV_ATTR(dev_id, S_IRUGO | S_IWUSR, dma_show_dev_id, dma_store_dev_id);
 
 static ssize_t dma_store_config(struct sys_device *dev,
+				struct sysdev_attribute *attr,
 				const char *buf, size_t count)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
@@ -87,13 +91,15 @@ static ssize_t dma_store_config(struct sys_device *dev,
 
 static SYSDEV_ATTR(config, S_IWUSR, NULL, dma_store_config);
 
-static ssize_t dma_show_mode(struct sys_device *dev, char *buf)
+static ssize_t dma_show_mode(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
 	return sprintf(buf, "0x%08x\n", channel->mode);
 }
 
 static ssize_t dma_store_mode(struct sys_device *dev,
+			      struct sysdev_attribute *attr,
 			      const char *buf, size_t count)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
@@ -104,7 +110,8 @@ static ssize_t dma_store_mode(struct sys_device *dev,
 static SYSDEV_ATTR(mode, S_IRUGO | S_IWUSR, dma_show_mode, dma_store_mode);
 
 #define dma_ro_attr(field, fmt)						\
-static ssize_t dma_show_##field(struct sys_device *dev, char *buf)	\
+static ssize_t dma_show_##field(struct sys_device *dev, 		\
+				struct sysdev_attribute *attr, char *buf)\
 {									\
 	struct dma_channel *channel = to_dma_channel(dev);		\
 	return sprintf(buf, fmt, channel->field);			\
diff --git a/arch/sparc64/kernel/sysfs.c b/arch/sparc64/kernel/sysfs.c
index e885034a6b7..84e5ce14671 100644
--- a/arch/sparc64/kernel/sysfs.c
+++ b/arch/sparc64/kernel/sysfs.c
@@ -14,7 +14,8 @@
 static DEFINE_PER_CPU(struct hv_mmu_statistics, mmu_stats) __attribute__((aligned(64)));
 
 #define SHOW_MMUSTAT_ULONG(NAME) \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+			struct sysdev_attribute *attr, char *buf) \
 { \
 	struct hv_mmu_statistics *p = &per_cpu(mmu_stats, dev->id); \
 	return sprintf(buf, "%lu\n", p->NAME); \
@@ -135,13 +136,16 @@ static unsigned long write_mmustat_enable(unsigned long val)
 	return sun4v_mmustat_conf(ra, &orig_ra);
 }
 
-static ssize_t show_mmustat_enable(struct sys_device *s, char *buf)
+static ssize_t show_mmustat_enable(struct sys_device *s,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
 	return sprintf(buf, "%lx\n", val);
 }
 
-static ssize_t store_mmustat_enable(struct sys_device *s, const char *buf, size_t count)
+static ssize_t store_mmustat_enable(struct sys_device *s,
+			struct sysdev_attribute *attr, const char *buf,
+			size_t count)
 {
 	unsigned long val, err;
 	int ret = sscanf(buf, "%ld", &val);
@@ -179,14 +183,16 @@ static void unregister_mmu_stats(struct sys_device *s)
 #endif
 
 #define SHOW_CPUDATA_ULONG_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+		struct sysdev_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%lu\n", c->MEMBER); \
 }
 
 #define SHOW_CPUDATA_UINT_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+		struct sysdev_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%u\n", c->MEMBER); \
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394..e6a4d5f6764 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
 
 /* Why are there no generic functions for this? */
 #define ACCESSOR(name, var, start) \
-	static ssize_t show_ ## name(struct sys_device *s, char *buf) {	\
+	static ssize_t show_ ## name(struct sys_device *s,		\
+				     struct sysdev_attribute *attr,	\
+				     char *buf) {			\
 		return sprintf(buf, "%lx\n", (unsigned long)var);	\
 	}								\
-	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
+	static ssize_t set_ ## name(struct sys_device *s,		\
+				    struct sysdev_attribute *attr,	\
+				    const char *buf, size_t siz) {	\
 		char *end;						\
 		unsigned long new = simple_strtoul(buf, &end, 0);	\
 		if (end == buf) return -EINVAL;				\
@@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
 ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(bank5ctl,bank[5],mce_restart())
 
-static ssize_t show_trigger(struct sys_device *s, char *buf)
+static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+				char *buf)
 {
 	strcpy(buf, trigger);
 	strcat(buf, "\n");
 	return strlen(trigger) + 1;
 }
 
-static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+				const char *buf,size_t siz)
 {
 	char *p;
 	int len;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c..d5ae2243f0b 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
 
 #define define_therm_throt_sysdev_show_func(name)                            \
 static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,        \
+					struct sysdev_attribute *attr,	     \
                                               char *buf)                     \
 {                                                                            \
 	unsigned int cpu = dev->id;                                          \
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 56b933119a0..fc4790638b6 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu)
 	mutex_unlock(&microcode_mutex);
 }
 
-static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
+static ssize_t reload_store(struct sys_device *dev,
+			    struct sysdev_attribute *attr,
+			    const char *buf, size_t sz)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 	char *end;
@@ -674,14 +676,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
 	return sz;
 }
 
-static ssize_t version_show(struct sys_device *dev, char *buf)
+static ssize_t version_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->rev);
 }
 
-static ssize_t pf_show(struct sys_device *dev, char *buf)
+static ssize_t pf_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index e38dfed41d8..20537d50790 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -21,15 +21,16 @@ EXPORT_SYMBOL(cpu_sysdev_class);
 static DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t show_online(struct sys_device *dev, char *buf)
+static ssize_t show_online(struct sys_device *dev, struct sysdev_attribute *attr,
+			   char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
 	return sprintf(buf, "%u\n", !!cpu_online(cpu->sysdev.id));
 }
 
-static ssize_t __ref store_online(struct sys_device *dev, const char *buf,
-			    size_t count)
+static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribute *attr,
+				 const char *buf, size_t count)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 	ssize_t ret;
@@ -80,7 +81,8 @@ static inline void register_cpu_control(struct cpu *cpu)
 #ifdef CONFIG_KEXEC
 #include <linux/kexec.h>
 
-static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute *attr,
+				char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 	ssize_t rc;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 937e8258981..4d4e0e7b6e9 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -92,7 +92,8 @@ unregister_memory(struct memory_block *memory, struct mem_section *section)
  * uses.
  */
 
-static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
+static ssize_t show_mem_phys_index(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
@@ -102,7 +103,8 @@ static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
 /*
  * online, offline, going offline, etc.
  */
-static ssize_t show_mem_state(struct sys_device *dev, char *buf)
+static ssize_t show_mem_state(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
@@ -217,7 +219,8 @@ out:
 }
 
 static ssize_t
-store_mem_state(struct sys_device *dev, const char *buf, size_t count)
+store_mem_state(struct sys_device *dev,
+		struct sysdev_attribute *attr, const char *buf, size_t count)
 {
 	struct memory_block *mem;
 	unsigned int phys_section_nr;
@@ -248,7 +251,8 @@ out:
  * s.t. if I offline all of these sections I can then
  * remove the physical device?
  */
-static ssize_t show_phys_device(struct sys_device *dev, char *buf)
+static ssize_t show_phys_device(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0f867a08333..5116b78c632 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -36,11 +36,13 @@ static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf)
 	return len;
 }
 
-static inline ssize_t node_read_cpumask(struct sys_device *dev, char *buf)
+static inline ssize_t node_read_cpumask(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	return node_read_cpumap(dev, 0, buf);
 }
-static inline ssize_t node_read_cpulist(struct sys_device *dev, char *buf)
+static inline ssize_t node_read_cpulist(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	return node_read_cpumap(dev, 1, buf);
 }
@@ -49,7 +51,8 @@ static SYSDEV_ATTR(cpumap,  S_IRUGO, node_read_cpumask, NULL);
 static SYSDEV_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
-static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
+static ssize_t node_read_meminfo(struct sys_device * dev,
+			struct sysdev_attribute *attr, char * buf)
 {
 	int n;
 	int nid = dev->id;
@@ -112,7 +115,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
 #undef K
 static SYSDEV_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL);
 
-static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
+static ssize_t node_read_numastat(struct sys_device * dev,
+				struct sysdev_attribute *attr, char * buf)
 {
 	return sprintf(buf,
 		       "numa_hit %lu\n"
@@ -130,7 +134,8 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
 }
 static SYSDEV_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
-static ssize_t node_read_distance(struct sys_device * dev, char * buf)
+static ssize_t node_read_distance(struct sys_device * dev,
+			struct sysdev_attribute *attr, char * buf)
 {
 	int nid = dev->id;
 	int len = 0;
diff --git a/drivers/base/sys.c b/drivers/base/sys.c
index 50690d9df24..dc7dace14e1 100644
--- a/drivers/base/sys.c
+++ b/drivers/base/sys.c
@@ -36,7 +36,7 @@ sysdev_show(struct kobject * kobj, struct attribute * attr, char * buffer)
 	struct sysdev_attribute * sysdev_attr = to_sysdev_attr(attr);
 
 	if (sysdev_attr->show)
-		return sysdev_attr->show(sysdev, buffer);
+		return sysdev_attr->show(sysdev, sysdev_attr, buffer);
 	return -EIO;
 }
 
@@ -49,7 +49,7 @@ sysdev_store(struct kobject * kobj, struct attribute * attr,
 	struct sysdev_attribute * sysdev_attr = to_sysdev_attr(attr);
 
 	if (sysdev_attr->store)
-		return sysdev_attr->store(sysdev, buffer, count);
+		return sysdev_attr->store(sysdev, sysdev_attr, buffer, count);
 	return -EIO;
 }
 
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 3f6d9b0a6ab..199cd97e32e 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -34,7 +34,8 @@
 static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
 
 #define define_id_show_func(name)				\
-static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev,		\
+		struct sysdev_attribute *attr, char *buf)	\
 {								\
 	unsigned int cpu = dev->id;				\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
@@ -59,14 +60,17 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 
 #ifdef arch_provides_topology_pointers
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev,			\
+			   struct sysdev_attribute *attr, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(0, &(topology_##name(cpu)), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+static ssize_t show_##name##_list(struct sys_device *dev,		\
+				  struct sysdev_attribute *attr,	\
+				  char *buf)				\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(1, &(topology_##name(cpu)), buf);		\
@@ -74,7 +78,8 @@ static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
 
 #else
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev,			\
+			   struct sysdev_attribute *attr, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	cpumask_t mask = topology_##name(cpu);				\
@@ -82,7 +87,9 @@ static ssize_t show_##name(struct sys_device *dev, char *buf)	\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+static ssize_t show_##name##_list(struct sys_device *dev,		\
+				  struct sysdev_attribute *attr,	\
+				  char *buf)				\
 {									\
 	unsigned int cpu = dev->id;					\
 	cpumask_t mask = topology_##name(cpu);				\
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index e949618b9be..31a0e0b455b 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -21,7 +21,8 @@ static int __init cpuidle_sysfs_setup(char *unused)
 }
 __setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup);
 
-static ssize_t show_available_governors(struct sys_device *dev, char *buf)
+static ssize_t show_available_governors(struct sys_device *dev,
+		struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t i = 0;
 	struct cpuidle_governor *tmp;
@@ -39,7 +40,8 @@ out:
 	return i;
 }
 
-static ssize_t show_current_driver(struct sys_device *dev, char *buf)
+static ssize_t show_current_driver(struct sys_device *dev,
+		struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t ret;
 
@@ -53,7 +55,8 @@ static ssize_t show_current_driver(struct sys_device *dev, char *buf)
 	return ret;
 }
 
-static ssize_t show_current_governor(struct sys_device *dev, char *buf)
+static ssize_t show_current_governor(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t ret;
 
@@ -68,6 +71,7 @@ static ssize_t show_current_governor(struct sys_device *dev, char *buf)
 }
 
 static ssize_t store_current_governor(struct sys_device *dev,
+	struct sysdev_attribute *attr,
 	const char *buf, size_t count)
 {
 	char gov_name[CPUIDLE_NAME_LEN];
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 591bc29b55f..d4427cb8697 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -610,6 +610,7 @@ static ssize_t show_target_kb(struct sys_device *dev, char *buf)
 }
 
 static ssize_t store_target_kb(struct sys_device *dev,
+			       struct sysdev_attribute *attr,
 			       const char *buf,
 			       size_t count)
 {
diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h
index f2767bc6b73..8dcf3162b21 100644
--- a/include/linux/sysdev.h
+++ b/include/linux/sysdev.h
@@ -99,8 +99,9 @@ extern void sysdev_unregister(struct sys_device *);
 
 struct sysdev_attribute { 
 	struct attribute	attr;
-	ssize_t (*show)(struct sys_device *, char *);
-	ssize_t (*store)(struct sys_device *, const char *, size_t);
+	ssize_t (*show)(struct sys_device *, struct sysdev_attribute *, char *);
+	ssize_t (*store)(struct sys_device *, struct sysdev_attribute *,
+			 const char *, size_t);
 };
 
 
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 092e4c620af..a56f629b057 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -297,8 +297,8 @@ static int test_func(void *data)
  *
  * opcode:data
  */
-static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
-				  size_t count)
+static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
+				  const char *buf, size_t count)
 {
 	struct sched_param schedpar;
 	struct test_thread_data *td;
@@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
  * @dev:	thread to query
  * @buf:	char buffer to be filled with thread status info
  */
-static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+				 char *buf)
 {
 	struct test_thread_data *td;
 	struct task_struct *tsk;
diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850eca..b1104ea5d25 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7737,11 +7737,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *page)
 {
 	return sprintf(page, "%u\n", sched_mc_power_savings);
 }
 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+					    struct sysdev_attribute *attr,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
@@ -7751,11 +7753,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *page)
 {
 	return sprintf(page, "%u\n", sched_smt_power_savings);
 }
 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+					     struct sysdev_attribute *attr,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f3..b1c2da81b05 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs)
  * Provides sysfs interface for listing current clocksource.
  */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_current_clocksources(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t count = 0;
 
@@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
  * clocksource selction.
  */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+					  struct sysdev_attribute *attr,
 					  const char *buf, size_t count)
 {
 	struct clocksource *ovr = NULL;
@@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
  * Provides sysfs interface for listing registered clocksources
  */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_available_clocksources(struct sys_device *dev,
+				  struct sysdev_attribute *attr,
+				  char *buf)
 {
 	struct clocksource *src;
 	ssize_t count = 0;
-- 
cgit v1.2.3-70-g09d2


From 49b5cf34727a6c1be1568ab28e89a2d9a6bf51e0 Mon Sep 17 00:00:00 2001
From: Jonathan Lim <jlim@sgi.com>
Date: Fri, 25 Jul 2008 01:48:40 -0700
Subject: accounting: account for user time when updating memory integrals

Adapt acct_update_integrals() to include user time when calculating the time
difference.  The units of acct_rss_mem1 and acct_vm_mem1 are also changed from
pages-jiffies to pages-usecs to avoid calling jiffies_to_usecs() in
xacct_add_tsk() which might overflow.

Signed-off-by: Jonathan Lim <jlim@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  2 +-
 kernel/sched.c        |  2 ++
 kernel/tsacct.c       | 21 ++++++++++++++-------
 3 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 354ef478a80..af780f299c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1257,7 +1257,7 @@ struct task_struct {
 #if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
-	cputime_t acct_stimexpd;/* stime since last update */
+	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
 	nodemask_t mems_allowed;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6acf749d333..0047bd9b96a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+	/* Account for user time used */
+	acct_update_integrals(p);
 }
 
 /*
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961..1da6990af8e 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
 	struct mm_struct *mm;
 
-	/* convert pages-jiffies to Mbyte-usec */
-	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
-	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+	/* convert pages-usec to Mbyte-usec */
+	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = cputime_to_jiffies(
-			cputime_sub(tsk->stime, tsk->acct_stimexpd));
+		cputime_t time, dtime;
+		struct timeval value;
+		u64 delta;
+
+		time = tsk->stime + tsk->utime;
+		dtime = cputime_sub(time, tsk->acct_timexpd);
+		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+		delta = value.tv_sec;
+		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
 			return;
-		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	tsk->acct_stimexpd = 0;
+	tsk->acct_timexpd = 0;
 	tsk->acct_rss_mem1 = 0;
 	tsk->acct_vm_mem1 = 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Fri, 25 Jul 2008 19:45:11 -0700
Subject: Full conversion to early_initcall() interface, remove old interface

A previous patch added the early_initcall(), to allow a cleaner hooking of
pre-SMP initcalls.  Now we remove the older interface, converting all
existing users to the new one.

[akpm@linux-foundation.org: cleanups]
[akpm@linux-foundation.org: build fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  9 ---------
 include/linux/smp.h   |  5 -----
 init/main.c           | 23 +----------------------
 kernel/sched.c        |  5 ++++-
 kernel/smp.c          |  4 +++-
 kernel/softirq.c      |  3 ++-
 kernel/softlockup.c   | 25 ++++++++++++++++++++++---
 7 files changed, 32 insertions(+), 42 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3260a5c42b9..adb8077dc46 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -292,7 +292,6 @@ extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
-extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
@@ -2222,14 +2221,6 @@ static inline void inc_syscw(struct task_struct *tsk)
 }
 #endif
 
-#ifdef CONFIG_SMP
-void migration_init(void);
-#else
-static inline void migration_init(void)
-{
-}
-#endif
-
 #ifndef TASK_SIZE_OF
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 48262f86c96..66484d4a845 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -74,15 +74,10 @@ void __smp_call_function_single(int cpuid, struct call_single_data *data);
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void generic_smp_call_function_single_interrupt(void);
 void generic_smp_call_function_interrupt(void);
-void init_call_single_data(void);
 void ipi_call_lock(void);
 void ipi_call_unlock(void);
 void ipi_call_lock_irq(void);
 void ipi_call_unlock_irq(void);
-#else
-static inline void init_call_single_data(void)
-{
-}
 #endif
 
 /*
diff --git a/init/main.c b/init/main.c
index b6fec08dbbe..20fdc9884b7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -774,16 +774,7 @@ static void __init do_basic_setup(void)
 	do_initcalls();
 }
 
-static int __initdata nosoftlockup;
-
-static int __init nosoftlockup_setup(char *str)
-{
-	nosoftlockup = 1;
-	return 1;
-}
-__setup("nosoftlockup", nosoftlockup_setup);
-
-static void __init __do_pre_smp_initcalls(void)
+static void __init do_pre_smp_initcalls(void)
 {
 	initcall_t *call;
 
@@ -791,17 +782,6 @@ static void __init __do_pre_smp_initcalls(void)
 		do_one_initcall(*call);
 }
 
-static void __init do_pre_smp_initcalls(void)
-{
-	extern int spawn_ksoftirqd(void);
-
-	init_call_single_data();
-	migration_init();
-	spawn_ksoftirqd();
-	if (!nosoftlockup)
-		spawn_softlockup_task();
-}
-
 static void run_init_process(char *init_filename)
 {
 	argv_init[0] = init_filename;
@@ -873,7 +853,6 @@ static int __init kernel_init(void * unused)
 
 	smp_prepare_cpus(setup_max_cpus);
 
-	__do_pre_smp_initcalls();
 	do_pre_smp_initcalls();
 
 	smp_init();
diff --git a/kernel/sched.c b/kernel/sched.c
index 0047bd9b96a..fde1a102635 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-void __init migration_init(void)
+static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -6399,7 +6399,10 @@ void __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
+
+	return err;
 }
+early_initcall(migration_init);
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1e..96fc7c0edc5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
 	spinlock_t lock;
 };
 
-void __cpuinit init_call_single_data(void)
+static int __cpuinit init_call_single_data(void)
 {
 	int i;
 
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
 		spin_lock_init(&q->lock);
 		INIT_LIST_HEAD(&q->list);
 	}
+	return 0;
 }
+early_initcall(init_call_single_data);
 
 static void csd_flag_wait(struct call_single_data *data)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f6b03d56c2b..c506f266a6b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+early_initcall(spawn_ksoftirqd);
 
 #ifdef CONFIG_SMP
 /*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7bd8d1aadd5..b75b492fbfc 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+
+static int __init nosoftlockup_setup(char *str)
+{
+	nosoftlockup = 1;
+	return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+static int __init spawn_softlockup_task(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	int err;
 
-	BUG_ON(err == NOTIFY_BAD);
+	if (nosoftlockup)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	if (err == NOTIFY_BAD) {
+		BUG();
+		return 1;
+	}
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+	return 0;
 }
+early_initcall(spawn_softlockup_task);
-- 
cgit v1.2.3-70-g09d2


From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:58 -0700
Subject: tracehook: wait_task_inactive

This extends wait_task_inactive() with a new argument so it can be used in
a "soft" mode where it will check for the task changing state unexpectedly
and back off.  There is no change to existing callers.  This lays the
groundwork to allow robust, noninvasive tracing that can try to sample a
blocked thread but back off safely if it wakes up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 ++--
 include/linux/sched.h      |  8 ++++++--
 kernel/kthread.c           |  2 +-
 kernel/ptrace.c            |  2 +-
 kernel/sched.c             | 29 +++++++++++++++++++++++++++--
 5 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 19d4493c619..fc8f3509df2 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2626,7 +2626,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
 	/*
 	 * make sure the task is off any CPU
 	 */
-	wait_task_inactive(task);
+	wait_task_inactive(task, 0);
 
 	/* more to come... */
 
@@ -4774,7 +4774,7 @@ recheck:
 
 		UNPROTECT_CTX(ctx, flags);
 
-		wait_task_inactive(task);
+		wait_task_inactive(task, 0);
 
 		PROTECT_CTX(ctx, flags);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a95d84d0da9..f59318a0099 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1882,9 +1882,13 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_inactive(struct task_struct * p);
+extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-#define wait_task_inactive(p)	do { } while (0)
+static inline unsigned long wait_task_inactive(struct task_struct *p,
+					       long match_state)
+{
+	return 1;
+}
 #endif
 
 #define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b..96cff2f8710 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 		return;
 	}
 	/* Must have done schedule() in kthread() before we set_task_cpu */
-	wait_task_inactive(k);
+	wait_task_inactive(k, 0);
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da645..082b3fcb32a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	read_unlock(&tasklist_lock);
 
 	if (!ret && !kill)
-		wait_task_inactive(child);
+		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
 
 	/* All systems go.. */
 	return ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index fde1a102635..0236958addc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
 	int running, on_rq;
+	unsigned long ncsw;
 	struct rq *rq;
 
 	for (;;) {
@@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
 		 * return false if the runqueue has changed and p
 		 * is actually now running somewhere else!
 		 */
-		while (task_running(rq, p))
+		while (task_running(rq, p)) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
 			cpu_relax();
+		}
 
 		/*
 		 * Ok, time to look more closely! We need the rq
@@ -1910,8 +1921,20 @@ void wait_task_inactive(struct task_struct *p)
 		rq = task_rq_lock(p, &flags);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
+		ncsw = 0;
+		if (!match_state || p->state == match_state) {
+			ncsw = p->nivcsw + p->nvcsw;
+			if (unlikely(!ncsw))
+				ncsw = 1;
+		}
 		task_rq_unlock(rq, &flags);
 
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
 		/*
 		 * Was it really running after all now that we
 		 * checked with the proper locks actually held?
@@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
 		 */
 		break;
 	}
+
+	return ncsw;
 }
 
 /***
-- 
cgit v1.2.3-70-g09d2


From e26873bb10f722f10b1af9de05119a3d7cbc07b2 Mon Sep 17 00:00:00 2001
From: roel kluin <roel.kluin@gmail.com>
Date: Tue, 22 Jul 2008 16:51:15 -0400
Subject: sched: test runtime rather than period in global_rt_runtime()

Test runtime rather than period

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addc..0d1717b0022 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -834,7 +834,7 @@ static inline u64 global_rt_period(void)
 
 static inline u64 global_rt_runtime(void)
 {
-	if (sysctl_sched_rt_period < 0)
+	if (sysctl_sched_rt_runtime < 0)
 		return RUNTIME_INF;
 
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
-- 
cgit v1.2.3-70-g09d2


From f718cd4add5aea9d379faff92f162571e356cc5f Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 29 Jul 2008 22:33:52 -0700
Subject: sched: make scheduler sysfs attributes sysdev class devices

They are really class devices, but were incorrectly declared.  This
leads to crashes with the recent changes that makes non normal sysdevs
use a different prototype.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pierre Ossman <drzeus-list@drzeus.cx>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addc..21f7da94662 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7671,34 +7671,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *page)
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+					   char *page)
 {
 	return sprintf(page, "%u\n", sched_mc_power_savings);
 }
-static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
-					    struct sysdev_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
 }
-static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
-		   sched_mc_power_savings_store);
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
+			 sched_mc_power_savings_show,
+			 sched_mc_power_savings_store);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *page)
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+					    char *page)
 {
 	return sprintf(page, "%u\n", sched_smt_power_savings);
 }
-static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
-					     struct sysdev_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
 }
-static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+		   sched_smt_power_savings_show,
 		   sched_smt_power_savings_store);
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 5e710e37bde120bb069f691bee68e69ef4393173 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Jul 2008 13:26:57 +0200
Subject: lockdep: change scheduler annotation

While thinking about David's graph walk lockdep patch it _finally_
dawned on me that there is no reason we have a lock class per cpu ...

Sorry for being dense :-/

The below changes the annotation from a lock class per cpu, to a single
nested lock, as the scheduler never holds more that 2 rq locks at a time
anyway.

If there was code requiring holding all rq locks this would not work and
the original annotation would be the only option, but that not being the
case, this is a much lighter one.

Compiles and boots on a 2-way x86_64.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addc..655f1db26b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -600,7 +600,6 @@ struct rq {
 	/* BKL stats */
 	unsigned int bkl_count;
 #endif
-	struct lock_class_key rq_lock_key;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	} else {
 		if (rq1 < rq2) {
 			spin_lock(&rq1->lock);
-			spin_lock(&rq2->lock);
+			spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
 		} else {
 			spin_lock(&rq2->lock);
-			spin_lock(&rq1->lock);
+			spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
 		}
 	}
 	update_rq_clock(rq1);
@@ -2805,10 +2804,10 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
-			spin_lock(&this_rq->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
 			ret = 1;
 		} else
-			spin_lock(&busiest->lock);
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
 	}
 	return ret;
 }
@@ -7998,7 +7997,6 @@ void __init sched_init(void)
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
-		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
-- 
cgit v1.2.3-70-g09d2


From 725aad24c3ba96a7c06448c14c265a466cdbd663 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Sun, 3 Aug 2008 09:33:03 -0700
Subject: __sched_setscheduler: don't do any policy checks when not "user"

The "user" parameter to __sched_setscheduler indicates whether the
change is being done on behalf of a user process or not.  If not, we
shouldn't apply any permissions checks, so don't call
security_task_setscheduler().

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 21f7da94662..04160d277e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5004,19 +5004,21 @@ recheck:
 			return -EPERM;
 	}
 
+	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
-	/*
-	 * Do not allow realtime tasks into groups that have no runtime
-	 * assigned.
-	 */
-	if (user
-	    && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
-		return -EPERM;
+		/*
+		 * Do not allow realtime tasks into groups that have no runtime
+		 * assigned.
+		 */
+		if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+			return -EPERM;
 #endif
 
-	retval = security_task_setscheduler(p, policy, param);
-	if (retval)
-		return retval;
+		retval = security_task_setscheduler(p, policy, param);
+		if (retval)
+			return retval;
+	}
+
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
-- 
cgit v1.2.3-70-g09d2


From 1b12bbc747560ea68bcc132c3d05699e52271da0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 09:30:22 +0200
Subject: lockdep: re-annotate scheduler runqueues

Instead of using a per-rq lock class, use the regular nesting operations.

However, take extra care with double_lock_balance() as it can release the
already held rq->lock (and therefore change its nesting class).

So what can happen is:

 spin_lock(rq->lock);	// this rq subclass 0

 double_lock_balance(rq, other_rq);
   // release rq
   // acquire other_rq->lock subclass 0
   // acquire rq->lock subclass 1

 spin_unlock(other_rq->lock);

leaving you with rq->lock in subclass 1

So a subsequent double_lock_balance() call can try to nest a subclass 1
lock while already holding a subclass 1 lock.

Fix this by introducing double_unlock_balance() which releases the other
rq's lock, but also re-sets the subclass for this rq's lock to 0.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 11 +++++++++--
 kernel/sched_rt.c |  8 +++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 655f1db26b1..9b2b6a85577 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2812,6 +2812,13 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -3636,7 +3643,7 @@ redo:
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
 					imbalance, sd, CPU_NEWLY_IDLE,
 					&all_pinned);
-		spin_unlock(&busiest->lock);
+		double_unlock_balance(this_rq, busiest);
 
 		if (unlikely(all_pinned)) {
 			cpu_clear(cpu_of(busiest), *cpus);
@@ -3751,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 		else
 			schedstat_inc(sd, alb_failed);
 	}
-	spin_unlock(&target_rq->lock);
+	double_unlock_balance(busiest_rq, target_rq);
 }
 
 #ifdef CONFIG_NO_HZ
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 908c04f9dad..6163e4cf885 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -861,6 +861,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 #define RT_MAX_TRIES 3
 
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
+
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1022,7 +1024,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			break;
 
 		/* try again */
-		spin_unlock(&lowest_rq->lock);
+		double_unlock_balance(rq, lowest_rq);
 		lowest_rq = NULL;
 	}
 
@@ -1091,7 +1093,7 @@ static int push_rt_task(struct rq *rq)
 
 	resched_task(lowest_rq->curr);
 
-	spin_unlock(&lowest_rq->lock);
+	double_unlock_balance(rq, lowest_rq);
 
 	ret = 1;
 out:
@@ -1197,7 +1199,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 		}
  skip:
-		spin_unlock(&src_rq->lock);
+		double_unlock_balance(this_rq, src_rq);
 	}
 
 	return ret;
-- 
cgit v1.2.3-70-g09d2