From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Jan 2009 11:08:40 -0800 Subject: softlockup: decouple hung tasks check from softlockup detection Decoupling allows: * hung tasks check to happen at very low priority * hung tasks check and softlockup to be enabled/disabled independently at compile and/or run-time * individual panic settings to be enabled disabled independently at compile and/or run-time * softlockup threshold to be reduced without increasing hung tasks poll frequency (hung task check is expensive relative to softlock watchdog) * hung task check to be zero over-head when disabled at run-time Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 kernel/hung_task.c (limited to 'kernel/hung_task.c') diff --git a/kernel/hung_task.c b/kernel/hung_task.c new file mode 100644 index 00000000000..ba5a77cad3b --- /dev/null +++ b/kernel/hung_task.c @@ -0,0 +1,198 @@ +/* + * Detect Hung Task + * + * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Have a reasonable limit on the number of tasks checked: + */ +unsigned long __read_mostly sysctl_hung_task_check_count = 1024; + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +static unsigned long __read_mostly hung_task_poll_jiffies; + +unsigned long __read_mostly sysctl_hung_task_warnings = 10; + +static int __read_mostly did_panic; + +static struct task_struct *watchdog_task; + +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * hung task is detected: + */ +unsigned int __read_mostly sysctl_hung_task_panic = + CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + +static int __init hung_task_panic_setup(char *str) +{ + sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("hung_task_panic=", hung_task_panic_setup); + +static int +hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = hung_task_panic, +}; + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(void) +{ + int this_cpu = raw_smp_processor_id(); + + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static void check_hung_task(struct task_struct *t, unsigned long now) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + if (t->flags & PF_FROZEN) + return; + + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + t->last_switch_count = switch_count; + t->last_switch_timestamp = now; + return; + } + if ((long)(now - t->last_switch_timestamp) < + sysctl_hung_task_timeout_secs) + return; + if (!sysctl_hung_task_warnings) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than 2 minutes, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%ld seconds.\n", t->comm, t->pid, + sysctl_hung_task_timeout_secs); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + __debug_show_held_locks(t); + + t->last_switch_timestamp = now; + touch_nmi_watchdog(); + + if (sysctl_hung_task_panic) + panic("hung_task: blocked tasks"); +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (120 seconds). If that happens, print out + * a warning. + */ +static void check_hung_uninterruptible_tasks(void) +{ + int max_count = sysctl_hung_task_check_count; + unsigned long now = get_timestamp(); + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if (test_taint(TAINT_DIE) || did_panic) + return; + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (!--max_count) + goto unlock; + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) + check_hung_task(t, now); + } while_each_thread(g, t); + unlock: + read_unlock(&tasklist_lock); +} + +static void update_poll_jiffies(void) +{ + /* timeout of 0 will disable the watchdog */ + if (sysctl_hung_task_timeout_secs == 0) + hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; + else + hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; +} + +/* + * Process updating of timeout sysctl + */ +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + goto out; + + update_poll_jiffies(); + + wake_up_process(watchdog_task); + + out: + return ret; +} + +/* + * kthread which checks for tasks stuck in D state + */ +static int watchdog(void *dummy) +{ + set_user_nice(current, 0); + update_poll_jiffies(); + + for ( ; ; ) { + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + check_hung_uninterruptible_tasks(); + } + + return 0; +} + +static int __init hung_task_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + + return 0; +} + +module_init(hung_task_init); -- cgit v1.2.3-70-g09d2 From 603a148f434742fff08273207ffa44176cad13a1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sat, 17 Jan 2009 10:31:48 -0800 Subject: softlockup: fix potential race in hung_task when resetting timeout Impact: fix potential false panic A potential race exists if sysctl_hung_task_timeout_secs is reset to 0 while inside check_hung_uniterruptible_tasks(). If check_task() is entered, a comparison with 0 will result in a false hung_task being detected. If sysctl_hung_task_panic is set, the system will panic. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel/hung_task.c') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba5a77cad3b..ba8ccd43296 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,8 @@ static unsigned long get_timestamp(void) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static void check_hung_task(struct task_struct *t, unsigned long now) +static void check_hung_task(struct task_struct *t, unsigned long now, + unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -84,8 +85,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) + if ((long)(now - t->last_switch_timestamp) < timeout) return; if (!sysctl_hung_task_warnings) return; @@ -96,8 +96,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * complain: */ printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); + "%ld seconds.\n", t->comm, t->pid, timeout); printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); @@ -115,7 +114,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * a really long time (120 seconds). If that happens, print out * a warning. */ -static void check_hung_uninterruptible_tasks(void) +static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long now = get_timestamp(); @@ -134,7 +133,7 @@ static void check_hung_uninterruptible_tasks(void) goto unlock; /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); + check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: read_unlock(&tasklist_lock); @@ -180,8 +179,17 @@ static int watchdog(void *dummy) update_poll_jiffies(); for ( ; ; ) { + unsigned long timeout; + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); - check_hung_uninterruptible_tasks(); + + /* + * Need to cache timeout here to avoid timeout being set + * to 0 via sysctl while inside check_hung_*_tasks(). + */ + timeout = sysctl_hung_task_timeout_secs; + if (timeout) + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3-70-g09d2 From ce9dbe244bf2063c41792e40dae7745957b118e0 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 4 Feb 2009 20:35:48 -0800 Subject: softlockup: check all tasks in hung_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: extend the scope of hung-task checks Changed the default value of hung_task_check_count to PID_MAX_LIMIT. hung_task_batch_count added to put an upper bound on the critical section. Every hung_task_batch_count checks, the rcu lock is never held for a too long time. Keeping the critical section small minimizes time preemption is disabled and keeps rcu grace periods small. To prevent following a stale pointer, get_task_struct is called on g and t. To verify that g and t have not been unhashed while outside the critical section, the task states are checked. The design was proposed by Frédéric Weisbecker. Signed-off-by: Mandeep Singh Baines Suggested-by: Frédéric Weisbecker Acked-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel/hung_task.c') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba8ccd43296..481ca8b5c2b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -17,9 +17,18 @@ #include /* - * Have a reasonable limit on the number of tasks checked: + * The number of tasks checked: */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; +unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; + +/* + * Limit number of tasks checked in a batch. + * + * This value controls the preemptibility of khungtaskd since preemption + * is disabled during the critical section. It also controls the size of + * the RCU grace period. So it needs to be upper-bound. + */ +#define HUNG_TASK_BATCHING 1024 /* * Zero means infinite timeout - no checking done: @@ -109,6 +118,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now, panic("hung_task: blocked tasks"); } +/* + * To avoid extending the RCU grace period for an unbounded amount of time, + * periodically exit the critical section and enter a new one. + * + * For preemptible RCU it is sufficient to call rcu_read_unlock in order + * exit the grace period. For classic RCU, a reschedule is required. + */ +static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +{ + get_task_struct(g); + get_task_struct(t); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + put_task_struct(t); + put_task_struct(g); +} + /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for * a really long time (120 seconds). If that happens, print out @@ -117,6 +144,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now, static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; + int batch_count = HUNG_TASK_BATCHING; unsigned long now = get_timestamp(); struct task_struct *g, *t; @@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) do_each_thread(g, t) { if (!--max_count) goto unlock; + if (!--batch_count) { + batch_count = HUNG_TASK_BATCHING; + rcu_lock_break(g, t); + /* Exit if t or g was unhashed during refresh. */ + if (t->state == TASK_DEAD || g->state == TASK_DEAD) + goto unlock; + } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now, timeout); -- cgit v1.2.3-70-g09d2 From 94be52dc075a32af4aa73d7e10f68734d62d6af2 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 5 Feb 2009 09:56:08 -0800 Subject: softlockup: convert read_lock in hung_task to rcu_read_lock Since the tasklist is protected by rcu list operations, it is safe to convert the read_lock()s to rcu_read_lock(). Suggested-by: Peter Zijlstra Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/hung_task.c') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 481ca8b5c2b..3951a80e7cb 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -155,7 +155,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, t) { if (!--max_count) goto unlock; @@ -171,7 +171,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); } static void update_poll_jiffies(void) -- cgit v1.2.3-70-g09d2 From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 6 Feb 2009 15:37:47 -0800 Subject: softlockup: remove timestamp checking from hung_task Impact: saves sizeof(long) bytes per task_struct By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between tasklist scans we can avoid using timestamps. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/fork.c | 8 +++----- kernel/hung_task.c | 48 +++++++++--------------------------------------- 3 files changed, 12 insertions(+), 45 deletions(-) (limited to 'kernel/hung_task.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a2811c6239..e0d723fea9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,7 +1241,6 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ - unsigned long last_switch_timestamp; unsigned long last_switch_count; #endif /* CPU-specific state of this task */ diff --git a/kernel/fork.c b/kernel/fork.c index fb944428283..bf582f75014 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; +#ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +#endif tsk->mm = NULL; tsk->active_mm = NULL; @@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_HUNG_TASK - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3951a80e7cb..0c924de58cb 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; -static unsigned long __read_mostly hung_task_poll_jiffies; unsigned long __read_mostly sysctl_hung_task_warnings = 10; @@ -69,33 +68,17 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(void) -{ - int this_cpu = raw_smp_processor_id(); - - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void check_hung_task(struct task_struct *t, unsigned long now, - unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; if (t->flags & PF_FROZEN) return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; - t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < timeout) - return; if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now, sched_show_task(t); __debug_show_held_locks(t); - t->last_switch_timestamp = now; touch_nmi_watchdog(); if (sysctl_hung_task_panic) @@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; int batch_count = HUNG_TASK_BATCHING; - unsigned long now = get_timestamp(); struct task_struct *g, *t; /* @@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now, timeout); + check_hung_task(t, timeout); } while_each_thread(g, t); unlock: rcu_read_unlock(); } -static void update_poll_jiffies(void) +static unsigned long timeout_jiffies(unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - if (sysctl_hung_task_timeout_secs == 0) - hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; - else - hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; + return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; } /* @@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, if (ret || !write) goto out; - update_poll_jiffies(); - wake_up_process(watchdog_task); out: @@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, static int watchdog(void *dummy) { set_user_nice(current, 0); - update_poll_jiffies(); for ( ; ; ) { - unsigned long timeout; + unsigned long timeout = sysctl_hung_task_timeout_secs; - while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; - /* - * Need to cache timeout here to avoid timeout being set - * to 0 via sysctl while inside check_hung_*_tasks(). - */ - timeout = sysctl_hung_task_timeout_secs; - if (timeout) - check_hung_uninterruptible_tasks(timeout); + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3-70-g09d2 From cf2592f59c0e8ed4308adbdb2e0a88655379d579 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 16:52:37 +0100 Subject: softlockup: ensure the task has been switched out once When we check if a task has been switched out since the last scan, we might have a race condition on the following scenario: - the task is freshly created and scheduled - it puts its state to TASK_UNINTERRUPTIBLE and is not yet switched out - check_hung_task() scans this task and will report a false positive because t->nvcsw + t->nivcsw == t->last_switch_count == 0 Add a check for such cases. Signed-off-by: Frederic Weisbecker Acked-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/hung_task.c') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c924de58cb..022a4927b78 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,13 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; - if (t->flags & PF_FROZEN) + /* + * Ensure the task is not frozen. + * Also, when a freshly created task is scheduled once, changes + * its state to TASK_UNINTERRUPTIBLE without having ever been + * switched out once, it musn't be checked. + */ + if (unlikely(t->flags & PF_FROZEN || !switch_count)) return; if (switch_count != t->last_switch_count) { -- cgit v1.2.3-70-g09d2