From a8dd2176a8e988e3744e863ac39647a6f59fa900 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jan 2013 20:54:17 -0500 Subject: tracing: Fix regression of trace_options file setting The latest change to allow trace options to be set on the command line also broke the trace_options file. The zeroing of the last byte of the option name that is echoed into the trace_option file was removed with the consolidation of some of the code. The compare between the option and what was written to the trace_options file fails because the string holding the data written doesn't terminate with a null character. A zero needs to be added to the end of the string copied from user space. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5125677efa..1bbfa044650 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2899,6 +2899,8 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = 0; + trace_set_options(buf); *ppos += cnt; -- cgit v1.2.3-70-g09d2 From bfbbd96c51b441b7a9a08762aa9ab832f6655b2c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 9 Jan 2013 17:12:45 -0800 Subject: audit: fix auditfilter.c kernel-doc warnings Fix new kernel-doc warning in auditfilter.c: Warning(kernel/auditfilter.c:1157): Excess function parameter 'uid' description in 'audit_receive_filter' Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Eric Paris Cc: linux-audit@redhat.com (subscribers-only) Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7f19f23d38a..f9fc54bbe06 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, * audit_receive_filter - 
apply all rules to the specified message type * @type: audit message type * @pid: target pid for netlink audit messages - * @uid: target uid for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data -- cgit v1.2.3-70-g09d2 From 2df8f8a6a897ebf4c5613b5be6103d33b2a21520 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Jan 2013 16:14:10 -0500 Subject: tracing: Fix regression with irqsoff tracer and tracing_on file Commit 02404baf1b47 "tracing: Remove deprecated tracing_enabled file" removed the tracing_enabled file as it never worked properly and the tracing_on file should be used instead. But the tracing_on file didn't call into the tracers start/stop routines like the tracing_enabled file did. This caused trace-cmd to break when it enabled the irqsoff tracer. If you just did "echo irqsoff > current_tracer" then it would work properly. But the tool trace-cmd disables tracing first by writing "0" into the tracing_on file. Then it writes "irqsoff" into current_tracer and then writes "1" into tracing_on. Unfortunately, the above commit changed the irqsoff tracer to check the tracing_on status instead of the tracing_enabled status. If it's disabled then it does not start the tracer internals. The problem is that writing "1" into tracing_on does not call the tracers "start" routine like writing "1" into tracing_enabled did. This makes the irqsoff tracer not start when using the trace-cmd tool, and is a regression for userspace. Simple fix is to have the tracing_on file call the tracers start() method when being enabled (and the stop() method when disabled). 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1bbfa044650..f3ec1cfb0de 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4817,10 +4817,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - if (val) + mutex_lock(&trace_types_lock); + if (val) { ring_buffer_record_on(buffer); - else + if (current_trace->start) + current_trace->start(tr); + } else { ring_buffer_record_off(buffer); + if (current_trace->stop) + current_trace->stop(tr); + } + mutex_unlock(&trace_types_lock); } (*ppos)++; -- cgit v1.2.3-70-g09d2 From 1b963c81b14509e330e0fe3218b645ece2738dc5 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 11 Jan 2013 14:31:56 -0800 Subject: lockdep, rwsem: provide down_write_nest_lock() down_write_nest_lock() provides a means to annotate a locking scenario where an outer lock is guaranteed to serialize the order nested locks are being acquired. This is analogous to the already existing mutex_lock_nest_lock() and spin_lock_nest_lock(). 
Signed-off-by: Jiri Kosina Cc: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Mel Gorman Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 3 +++ include/linux/rwsem.h | 9 +++++++++ kernel/rwsem.c | 10 ++++++++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 00e46376e28..2bca44b0893 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -524,14 +524,17 @@ static inline void print_irqtrace_events(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_PROVE_LOCKING # define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) +# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) # define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i) # else # define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) +# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) # define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i) # endif # define rwsem_release(l, n, i) lock_release(l, n, i) #else # define rwsem_acquire(l, s, t, i) do { } while (0) +# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0) # define rwsem_acquire_read(l, s, t, i) do { } while (0) # define rwsem_release(l, n, i) do { } while (0) #endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 54bd7cd7ecb..413cc11e414 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -125,8 +125,17 @@ extern void downgrade_write(struct rw_semaphore *sem); */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); + +# define down_write_nest_lock(sem, nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + 
_down_write_nest_lock(sem, &(nest_lock)->dep_map); \ +} while (0); + #else # define down_read_nested(sem, subclass) down_read(sem) +# define down_write_nest_lock(sem, nest_lock) down_read(sem) # define down_write_nested(sem, subclass) down_write(sem) #endif diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 6850f53e02d..b3c6c3fcd84 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ + might_sleep(); + rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); -- cgit v1.2.3-70-g09d2 From 7b9205bd775afc4439ed86d617f9042ee9e76a71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Jan 2013 14:32:05 -0800 Subject: audit: create explicit AUDIT_SECCOMP event type The seccomp path was using AUDIT_ANOM_ABEND from when seccomp mode 1 could only kill a process. While we still want to make sure an audit record is forced on a kill, this should use a separate record type since seccomp mode 2 introduces other behaviors. In the case of "handled" behaviors (process wasn't killed), only emit a record if the process is under inspection. This change also fixes userspace examination of seccomp audit events, since it was considered malformed due to missing fields of the AUDIT_ANOM_ABEND event type. Signed-off-by: Kees Cook Cc: Al Viro Cc: Eric Paris Cc: Jeff Layton Cc: "Eric W. 
Biederman" Cc: Julien Tinnes Acked-by: Will Drewry Acked-by: Steve Grubb Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/audit.h | 3 ++- include/uapi/linux/audit.h | 1 + kernel/auditsc.c | 14 +++++++++++--- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index bce729afbcf..9d5104d7aba 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -157,7 +157,8 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { - if (unlikely(!audit_dummy_context())) + /* Force a record to be reported if a signal was delivered. */ + if (signr || unlikely(!audit_dummy_context())) __audit_seccomp(syscall, signr, code); } diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 76352ac45f2..09a2d94ab11 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -106,6 +106,7 @@ #define AUDIT_MMAP 1323 /* Record showing descriptor and flags in mmap */ #define AUDIT_NETFILTER_PKT 1324 /* Packets traversing netfilter chains */ #define AUDIT_NETFILTER_CFG 1325 /* Netfilter chain modifications */ +#define AUDIT_SECCOMP 1326 /* Secure Computing event */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e37e6a12c5e..3e46d1dec61 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; @@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); 
audit_log_untrustedstring(ab, current->comm); +} + +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + audit_log_task(ab); audit_log_format(ab, " reason="); audit_log_string(ab, reason); audit_log_format(ab, " sig=%ld", signr); @@ -2723,8 +2728,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", signr); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + if (unlikely(!ab)) + return; + audit_log_task(ab); + audit_log_format(ab, " sig=%ld", signr); audit_log_format(ab, " syscall=%ld", syscall); audit_log_format(ab, " compat=%d", is_compat_task()); audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); -- cgit v1.2.3-70-g09d2 From 0644ec0cc8a33fb654e348897ad7684e22a4b5d8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Jan 2013 14:32:07 -0800 Subject: audit: catch possible NULL audit buffers It's possible for audit_log_start() to return NULL. Handle it in the various callers. Signed-off-by: Kees Cook Cc: Al Viro Cc: Eric Paris Cc: Jeff Layton Cc: "Eric W. 
Biederman" Cc: Julien Tinnes Cc: Will Drewry Cc: Steve Grubb Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 4 ++++ kernel/audit_tree.c | 26 +++++++++++++++++--------- kernel/audit_watch.c | 2 ++ kernel/auditsc.c | 6 ++++-- 4 files changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 40414e9143d..a219998aecc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old, int rc = 0; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return rc; audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, old, from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { @@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + if (unlikely(!*ab)) + return rc; audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", task_tgid_vnr(current), from_kuid(&init_user_ns, current_uid()), diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e81175ef25f..642a89c4f3d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -449,11 +449,26 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } +static void audit_log_remove_rule(struct audit_krule *rule) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); + audit_log_untrustedstring(ab, rule->tree->pathname); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + static void kill_rules(struct audit_tree *tree) { struct audit_krule *rule, *next; struct audit_entry *entry; - struct audit_buffer *ab; list_for_each_entry_safe(rule, 
next, &tree->rules, rlist) { entry = container_of(rule, struct audit_entry, rule); @@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op="); - audit_log_string(ab, "remove rule"); - audit_log_format(ab, " dir="); - audit_log_untrustedstring(ab, rule->tree->pathname); - audit_log_key(ab, rule->filterkey); - audit_log_format(ab, " list=%d res=1", rule->listnr); - audit_log_end(ab); + audit_log_remove_rule(rule); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 4a599f699ad..22831c4d369 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (audit_enabled) { struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; audit_log_format(ab, "auid=%u ses=%u op=", from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3e46d1dec61..a371f857a0a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1464,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); + if (unlikely(!ab)) + return; audit_log_format(ab, "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, context->ipc.perm_mode); - if (!ab) - return; } break; } case AUDIT_MQ_OPEN: { @@ -2720,6 +2720,8 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + if (unlikely(!ab)) + return; audit_log_abend(ab, "memory violation", signr); audit_log_end(ab); } -- cgit v1.2.3-70-g09d2 From 
829199197a430dade2519d54f5545c4a094393b8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 11 Jan 2013 14:32:11 -0800 Subject: kernel/audit.c: avoid negative sleep durations audit_log_start() performs the same jiffies comparison in two places. If sufficient time has elapsed between the two comparisons, the second one produces a negative sleep duration: schedule_timeout: wrong timeout value fffffffffffffff0 Pid: 6606, comm: trinity-child1 Not tainted 3.8.0-rc1+ #43 Call Trace: schedule_timeout+0x305/0x340 audit_log_start+0x311/0x470 audit_log_exit+0x4b/0xfb0 __audit_syscall_exit+0x25f/0x2c0 sysret_audit+0x17/0x21 Fix it by performing the comparison a single time. Reported-by: Dave Jones Cc: Al Viro Cc: Eric Paris Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a219998aecc..d596e5355f1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx, } } +/* + * Wait for auditd to drain the queue a little + */ +static void wait_for_auditd(unsigned long sleep_time) +{ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&audit_backlog_wait, &wait); + + if (audit_backlog_limit && + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + schedule_timeout(sleep_time); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&audit_backlog_wait, &wait); +} + /* Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. 
If the tsk is a task that is currently in a @@ -1146,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time - && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { - - /* Wait for auditd to drain the queue a little */ - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&audit_backlog_wait, &wait); - - if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) - schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); + if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + unsigned long sleep_time; - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); + sleep_time = timeout_start + audit_backlog_wait_time - + jiffies; + if ((long)sleep_time > 0) + wait_for_auditd(sleep_time); continue; } if (audit_rate_check() && printk_ratelimit()) -- cgit v1.2.3-70-g09d2 From 250bfd3d8e7e19cb649dd94689f0af2ce3474060 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Jan 2013 10:54:11 +0800 Subject: tracing: Fix regression of trace_pipe Commit 0fb9656d "tracing: Make tracing_enabled be equal to tracing_on" changes the behaviour of trace_pipe, ie. it makes trace_pipe return if we've read something and tracing is enabled, and this means that we have to 'cat trace_pipe' again and again while running tests. IMO the right way is if tracing is enabled, we always block and wait for ring buffer, or we may lose what we want since ring buffer's size is limited. 
Link: http://lkml.kernel.org/r/1358132051-5410-1-git-send-email-bo.li.liu@oracle.com Signed-off-by: Liu Bo Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f3ec1cfb0de..3c13e46d7d2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3454,7 +3454,7 @@ static int tracing_wait_pipe(struct file *filp) return -EINTR; /* - * We block until we read something and tracing is enabled. + * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, @@ -3462,7 +3462,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (tracing_is_enabled() && iter->pos) + if (!tracing_is_enabled() && iter->pos) break; } -- cgit v1.2.3-70-g09d2 From 774a1221e862b343388347bac9b318767336b20b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 Jan 2013 18:52:51 -0800 Subject: module, async: async_synchronize_full() on module init iff async is used If the default iosched is built as a module, the kernel may deadlock while trying to load the iosched module on device probe if the probing was running off async. This is because async_synchronize_full() at the end of module init ends up waiting for the async job which initiated the module loading. async A modprobe 1. finds a device 2. registers the block device 3. request_module(default iosched) 4. modprobe in userland 5. load and init module 6. async_synchronize_full() Async A waits for modprobe to finish in request_module() and modprobe waits for async A to finish in async_synchronize_full(). Because there's no easy way to track dependencies once control goes out to userland, implementing properly nested flushing is difficult. 
For now, make module init perform async_synchronize_full() iff module init has queued async jobs as suggested by Linus. This avoids the described deadlock because iosched module doesn't use async and thus wouldn't invoke async_synchronize_full(). This is hacky and incomplete. It will deadlock if async module loading nests; however, this works around the known problem case and seems to be the best of bad options. For more details, please refer to the following thread. http://thread.gmane.org/gmane.linux.kernel/1420814 Signed-off-by: Tejun Heo Reported-by: Alex Riesen Tested-by: Ming Lei Tested-by: Alex Riesen Cc: Arjan van de Ven Cc: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/async.c | 3 +++ kernel/module.c | 27 +++++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 206bb089c06..6fc8f45de4e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1810,6 +1810,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ diff --git a/kernel/async.c b/kernel/async.c index 9d311838485..a1d585c351d 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -196,6 +196,9 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); + /* mark that this task has queued an async job, used by module init */ + 
current->flags |= PF_USED_ASYNC; + /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57..b10b048367e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3013,6 +3013,12 @@ static int do_init_module(struct module *mod) { int ret = 0; + /* + * We want to find out whether @mod uses async during init. Clear + * PF_USED_ASYNC. async_schedule*() will set it. + */ + current->flags &= ~PF_USED_ASYNC; + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -3058,8 +3064,25 @@ static int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock. For example, a newly + * detected block device can trigger request_module() of the + * default iosched from async probing task. Once userland helper + * reaches here, async_synchronize_full() will wait on the async + * task waiting on request_module() and deadlock. + * + * This deadlock is avoided by performing async_synchronize_full() + * iff module init queued any async jobs. This isn't a full + * solution as it will deadlock the same if module loading from + * async jobs nests more than once; however, due to the various + * constraints, this hack seems to be the best option for now. + * Please refer to the following thread for details. + * + * http://thread.gmane.org/gmane.linux.kernel/1420814 + */ + if (current->flags & PF_USED_ASYNC) + async_synchronize_full(); mutex_lock(&module_mutex); /* Drop initial reference. */ -- cgit v1.2.3-70-g09d2