From f29c9b1ccb52904ee442a933cf3dee628f9f4e62 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 6 Nov 2008 09:45:16 +0800 Subject: sched: fix a bug in sched domain degenerate Impact: re-add incorrectly eliminated sched domain layers (1) on i386 with SCHED_SMT and SCHED_MC enabled # mount -t cgroup -o cpuset xxx /mnt # echo 0 > /mnt/cpuset.sched_load_balance # mkdir /mnt/0 # echo 0 > /mnt/0/cpuset.cpus # dmesg CPU0 attaching sched-domain: domain 0: span 0 level CPU groups: 0 (2) on i386 with SCHED_MC enabled but SCHED_SMT disabled # same with (1) # dmesg CPU0 attaching NULL sched-domain. The bug is that some sched domains may be skipped unintentionally when degenerating (optimizing) sched domains. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 82cc839c921..4c7e2bcdfa8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6877,15 +6877,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; tmp = tmp->parent) { + for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; if (!parent) break; + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; - } + } else + tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { -- cgit v1.2.3-70-g09d2 From ca3273f9646694e0419cfb9d6c12deb1c9aff27c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Nov 2008 14:47:21 +0800 Subject: sched: fix memory leak in a failure path Impact: fix rare memory leak in the sched-domains manual reconfiguration code In the failure path, rd is not attached to a sched domain, so it causes a leak. 
Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4c7e2bcdfa8..57c933ffbee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7676,6 +7676,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, error: free_sched_groups(cpu_map, tmpmask); SCHED_CPUMASK_FREE((void *)allmasks); + kfree(rd); return -ENOMEM; #endif } -- cgit v1.2.3-70-g09d2 From 5ac5c4d604bf894ef672a7971d03fefdc7ea7e49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Nov 2008 10:46:32 +0100 Subject: sched: clean up debug info Impact: clean up and fix debug info printout While looking over the sched_debug code I noticed that we printed the rq schedstats for every cfs_rq, amend this. Also change nr_spread_over into an int, and fix a little buglet in min_vruntime printing. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_debug.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 57c933ffbee..f3149244e32 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,7 +399,7 @@ struct cfs_rq { */ struct sched_entity *curr, *next, *last; - unsigned long nr_spread_over; + unsigned int nr_spread_over; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae17762ec3..48ecc51e770 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; - min_vruntime = rq->cfs.min_vruntime; + min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", @@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - - P(yld_exp_empty); - P(yld_act_empty); - P(yld_both_empty); - P(yld_count); - P(sched_switch); - P(sched_count); - P(sched_goidle); - - P(ttwu_count); - P(ttwu_local); - - P(bkl_count); - -#undef P -#endif - SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP @@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #undef PN +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + P(yld_exp_empty); + P(yld_act_empty); + P(yld_both_empty); + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); + + P(ttwu_count); + P(ttwu_local); + + P(bkl_count); + +#undef P +#endif print_cfs_stats(m, cpu); print_rt_stats(m, cpu); -- cgit v1.2.3-70-g09d2 From ad474caca3e2a0550b7ce0706527ad5ab389a4d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 10 Nov 2008 15:39:30 +0100 Subject: fix for account_group_exec_runtime(), make sure ->signal can't be freed under rq->lock Impact: fix hang/crash on ia64 under high load This is ugly, but the simplest patch by far. Unlike other similar routines, account_group_exec_runtime() could be called "implicitly" from within scheduler after exit_notify(). This means we can race with the parent doing release_task(), we can't just check ->signal != NULL. Change __exit_signal() to do spin_unlock_wait(&task_rq(tsk)->lock) before __cleanup_signal() to make sure ->signal can't be freed under task_rq(tsk)->lock. 
Note that task_rq_unlock_wait() doesn't care about the case when tsk changes cpu/rq under us, this should be OK. Thanks to Ingo who nacked my previous buggy patch. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar Reported-by: Doug Chapman --- include/linux/sched.h | 1 + kernel/exit.c | 5 +++++ kernel/sched.c | 8 ++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel/sched.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 295b7c756ca..644ffbda17c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -247,6 +247,7 @@ extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); extern int runqueue_is_locked(void); +extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) diff --git a/kernel/exit.c b/kernel/exit.c index 80137a5d946..ae2b92be5fa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -141,6 +141,11 @@ static void __exit_signal(struct task_struct *tsk) if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); __cleanup_signal(sig); } } diff --git a/kernel/sched.c b/kernel/sched.c index f3149244e32..50a21f96467 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -969,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void task_rq_unlock_wait(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */ + spin_unlock_wait(&rq->lock); +} + static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { -- cgit v1.2.3-70-g09d2