From db05021d49a994ee40a9735d9c3cb0060c9babb8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Feb 2013 21:48:09 -0500 Subject: ftrace: Update the kconfig for DYNAMIC_FTRACE The prompt to enable DYNAMIC_FTRACE (the ability to nop and enable function tracing at run time) had a confusing statement: "enable/disable ftrace tracepoints dynamically" This was written before tracepoints were added to the kernel, but now that tracepoints have been added, this is very confusing and has confused people enough to give wrong information during presentations. Not only that, I looked at the help text, and it still references that dreaded daemon that use to wake up once a second to update the nop locations and brick NICs, that hasn't been around for over five years. Time to bring the text up to the current decade. Cc: stable@vger.kernel.org Reported-by: Ezequiel Garcia Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 36567564e22..b516a8e19d5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -429,24 +429,28 @@ config PROBE_EVENTS def_bool n config DYNAMIC_FTRACE - bool "enable/disable ftrace tracepoints dynamically" + bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replace them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. + + See the files in /sys/kernel/debug/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. - config DYNAMIC_FTRACE_WITH_REGS def_bool y depends on DYNAMIC_FTRACE -- cgit v1.2.3-70-g09d2 From d8741e2e88ac9a458765a0c7b4e6542d7c038334 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Mar 2013 10:25:16 -0500 Subject: tracing: Add help of snapshot feature when snapshot is empty When cat'ing the snapshot file, instead of showing an empty trace header like the trace file does, show how to use the snapshot feature. Also, this is a good place to show if the snapshot has been allocated or not. Users may want to "pre allocate" the snapshot to have a fast "swap" of the current buffer. Otherwise, a swap would be slow and might fail as it would need to allocate the snapshot buffer, and that might fail under tight memory constraints. Here's what it looked like before: # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | Here's what it looks like now: # tracer: nop # # # * Snapshot is freed * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') Acked-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c2e2c231037..9e3120b8a2a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2400,6 +2400,27 @@ static void test_ftrace_alive(struct seq_file *m) seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); } +#ifdef CONFIG_TRACER_MAX_TRACE +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->trace->allocated_snapshot) + seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_printf(m, "# Snapshot commands:\n"); + seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer.\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2411,7 +2432,9 @@ static int s_show(struct seq_file *m, void *v) seq_puts(m, "#\n"); test_ftrace_alive(m); } - if (iter->trace && iter->trace->print_header) + if (iter->snapshot && trace_empty(iter)) + print_snapshot_help(m, iter); + else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); -- cgit v1.2.3-70-g09d2 From c9960e48543799f168c4c9486f9790fb686ce5a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Mar 2013 10:53:02 -0500 Subject: tracing: Do not return EINVAL in snapshot when not allocated To use the tracing snapshot feature, writing a '1' into the snapshot file causes the snapshot buffer to be allocated if it has not already been allocated and dose a 'swap' with the main buffer, so that the snapshot now contains what was in the main buffer, and the main buffer now writes to what was the snapshot buffer. To free the snapshot buffer, a '0' is written into the snapshot file. To clear the snapshot buffer, any number but a '0' or '1' is written into the snapshot file. But if the file is not allocated it returns -EINVAL error code. This is rather pointless. It is better just to do nothing and return success. Acked-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e3120b8a2a..1f835a83cb2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4167,8 +4167,6 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, default: if (current_trace->allocated_snapshot) tracing_reset_online_cpus(&max_tr); - else - ret = -EINVAL; break; } -- cgit v1.2.3-70-g09d2 From a7dc19b8652c862d5b7c4d2339bd3c428bd29c4a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 7 Mar 2013 15:09:24 +0000 Subject: clockevents: Don't allow dummy broadcast timers Currently tick_check_broadcast_device doesn't reject clock_event_devices with CLOCK_EVT_FEAT_DUMMY, and may select them in preference to real hardware if they have a higher rating value. In this situation, the dummy timer is responsible for broadcasting to itself, and the core clockevents code may attempt to call non-existent callbacks for programming the dummy, eventually leading to a panic. This patch makes tick_check_broadcast_device always reject dummy timers, preventing this problem. Signed-off-by: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: Jon Medhurst (Tixy) Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8..7f32fe0e52c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if ((tick_broadcast_device.evtdev && + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || + (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; -- cgit v1.2.3-70-g09d2 From eb2834285cf172856cd12f66892fc7467935ebed Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2013 15:18:28 -0800 Subject: workqueue: fix possible pool stall bug in wq_unbind_fn() Since multiple pools per cpu have been introduced, wq_unbind_fn() has a subtle bug which may theoretically stall work item processing. The problem is two-fold. * wq_unbind_fn() depends on the worker executing wq_unbind_fn() itself to start unbound chain execution, which works fine when there was only single pool. With multiple pools, only the pool which is running wq_unbind_fn() - the highpri one - is guaranteed to have such kick-off. The other pool could stall when its busy workers block. * The current code is setting WORKER_UNBIND / POOL_DISASSOCIATED of the two pools in succession without initiating work execution inbetween. Because setting the flags requires grabbing assoc_mutex which is held while new workers are created, this could lead to stalls if a pool's manager is waiting for the previous pool's work items to release memory. This is almost purely theoretical tho. Update wq_unbind_fn() such that it sets WORKER_UNBIND / POOL_DISASSOCIATED, goes over schedule() and explicitly kicks off execution for a pool and then moves on to the next one. tj: Updated comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811e..604801b91cb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3446,28 +3446,34 @@ static void wq_unbind_fn(struct work_struct *work) spin_unlock_irq(&pool->lock); mutex_unlock(&pool->assoc_mutex); - } - /* - * Call schedule() so that we cross rq->lock and thus can guarantee - * sched callbacks see the %WORKER_UNBOUND flag. This is necessary - * as scheduler callbacks may be invoked from other cpus. - */ - schedule(); + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the %WORKER_UNBOUND flag. + * This is necessary as scheduler callbacks may be invoked + * from other cpus. + */ + schedule(); - /* - * Sched callbacks are disabled now. Zap nr_running. After this, - * nr_running stays zero and need_more_worker() and keep_working() - * are always true as long as the worklist is not empty. Pools on - * @cpu now behave as unbound (in terms of concurrency management) - * pools which are served by workers tied to the CPU. - * - * On return from this function, the current worker would trigger - * unbound chain execution of pending work items if other workers - * didn't already. - */ - for_each_std_worker_pool(pool, cpu) + /* + * Sched callbacks are disabled now. Zap nr_running. + * After this, nr_running stays zero and need_more_worker() + * and keep_working() are always true as long as the + * worklist is not empty. This pool now behaves as an + * unbound (in terms of concurrency management) pool which + * are served by workers tied to the pool. + */ atomic_set(&pool->nr_running, 0); + + /* + * With concurrency management just turned off, a busy + * worker blocking could lead to lengthy stalls. Kick off + * unbound chain execution of currently pending work items. + */ + spin_lock_irq(&pool->lock); + wake_up_worker(pool); + spin_unlock_irq(&pool->lock); + } } /* -- cgit v1.2.3-70-g09d2 From 2721e72dd10f71a3ba90f59781becf02638aa0d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 12 Mar 2013 11:32:32 -0400 Subject: tracing: Fix race in snapshot swapping Although the swap is wrapped with a spin_lock, the assignment of the temp buffer used to swap is not within that lock. It needs to be moved into that lock, otherwise two swaps happening on two different CPUs, can end up using the wrong temp buffer to assign in the swap. Luckily, all current callers of the swap function appear to have their own locks. But in case something is added that allows two different callers to call the swap, then there's a chance that this race can trigger and corrupt the buffers. New code is coming soon that will allow for this race to trigger. I've Cc'd stable, so this bug will not show up if someone backports one of the changes that can trigger this bug. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1f835a83cb2..53df2839bb9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -704,7 +704,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf = tr->buffer; + struct ring_buffer *buf; if (trace_stop_count) return; @@ -719,6 +719,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); + buf = tr->buffer; tr->buffer = max_tr.buffer; max_tr.buffer = buf; -- cgit v1.2.3-70-g09d2 From 20f22ab42e9c832bde6e9a7ed04cdc73ec737e5b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Mar 2013 14:32:59 -0800 Subject: signals: fix new kernel-doc warnings Fix new kernel-doc warnings in kernel/signal.c: Warning(kernel/signal.c:2689): No description found for parameter 'uset' Warning(kernel/signal.c:2689): Excess function parameter 'set' description in 'sys_rt_sigpending' Signed-off-by: Randy Dunlap Cc: Alexander Viro Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2ec870a4c3c..d63c79e7e41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2682,7 +2682,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize) /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked - * @set: stores pending signals + * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) -- cgit v1.2.3-70-g09d2 From 6c23cbbd5056b155401b0a2b5567d530e6c750c4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Mar 2013 10:00:24 -0800 Subject: futex: fix kernel-doc notation and spello Fix kernel-doc warning in futex.c and convert 'Returns' to the new Return: kernel-doc notation format. Warning(kernel/futex.c:2286): Excess function parameter 'clockrt' description in 'futex_wait_requeue_pi' Fix one spello. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/futex.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f0090a993da..b26dcfc02c9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key) * @rw: mapping needs to be read/write (values: VERIFY_READ, * VERIFY_WRITE) * - * Returns a negative error code or 0 + * Return: a negative error code or 0 + * * The key words are stored in *key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), @@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * be "current" except in the case of requeue pi. * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * - * Returns: - * 0 - ready to wait - * 1 - acquired the lock + * Return: + * 0 - ready to wait; + * 1 - acquired the lock; * <0 - error * * The hb->lock and futex_key refs shall be held by the caller. @@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * - * Returns: - * 0 - failed to acquire the lock atomicly - * 1 - acquired the lock + * Return: + * 0 - failed to acquire the lock atomically; + * 1 - acquired the lock; * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. * - * Returns: - * >=0 - on success, the number of tasks requeued or woken + * Return: + * >=0 - on success, the number of tasks requeued or woken; * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, @@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must * be paired with exactly one earlier call to queue_me(). * - * Returns: - * 1 - if the futex_q was still queued (and we removed unqueued it) + * Return: + * 1 - if the futex_q was still queued (and we removed unqueued it); * 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) @@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart); * the pi_state owner as well as handle race conditions that may allow us to * acquire the lock. Must be called with the hb lock held. * - * Returns: - * 1 - success, lock taken - * 0 - success, lock not taken + * Return: + * 1 - success, lock taken; + * 0 - success, lock not taken; * <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) @@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * Return with the hb lock held and a q.key reference on success, and unlocked * with no q.key reference on failure. * - * Returns: - * 0 - uaddr contains val and hb has been locked + * Return: + * 0 - uaddr contains val and hb has been locked; * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, @@ -2203,9 +2204,9 @@ pi_faulted: * the wakeup and return the appropriate error code to the caller. Must be * called with the hb lock held. * - * Returns - * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR + * Return: + * 0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to @@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: + * via the following-- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() * 2) wakeup on uaddr2 after a requeue * 3) signal @@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * - * Returns: - * 0 - On success + * Return: + * 0 - On success; * <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, -- cgit v1.2.3-70-g09d2 From 740466bc89ad8bd5afcc8de220f715f62b21e365 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Mar 2013 11:15:19 -0400 Subject: tracing: Fix free of probe entry by calling call_rcu_sched() Because function tracing is very invasive, and can even trace calls to rcu_read_lock(), RCU access in function tracing is done with preempt_disable_notrace(). This requires a synchronize_sched() for updates and not a synchronize_rcu(). Function probes (traceon, traceoff, etc) must be freed after a synchronize_sched() after its entry has been removed from the hash. But call_rcu() is used. Fix this by using call_rcu_sched(). Also fix the usage to use hlist_del_rcu() instead of hlist_del(). Cc: stable@vger.kernel.org Cc: Paul McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98ca94a4181..e6effd0c40a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3108,8 +3108,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } - hlist_del(&entry->node); - call_rcu(&entry->rcu, ftrace_free_entry_rcu); + hlist_del_rcu(&entry->node); + call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); } } __disable_ftrace_function_probe(); -- cgit v1.2.3-70-g09d2 From e66eded8309ebf679d3d3c1f5820d1f2ca332c71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 Mar 2013 11:51:49 -0700 Subject: userns: Don't allow CLONE_NEWUSER | CLONE_FS Don't allowing sharing the root directory with processes in a different user namespace. There doesn't seem to be any point, and to allow it would require the overhead of putting a user namespace reference in fs_struct (for permission checks) and incrementing that reference count on practically every call to fork. So just perform the inexpensive test of forbidding sharing fs_struct acrosss processes in different user namespaces. We already disallow other forms of threading when unsharing a user namespace so this should be no real burden in practice. This updates setns, clone, and unshare to disallow multiple user namespaces sharing an fs_struct. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++++- kernel/user_namespace.c | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8d932b1c905..1766d324d5e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * If unsharing a user namespace must also unshare the thread. */ if (unshare_flags & CLONE_NEWUSER) - unshare_flags |= CLONE_THREAD; + unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing a pid namespace must also unshare the thread. */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8b650837083..b14f4d34204 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct kmem_cache *user_ns_cachep __read_mostly; @@ -837,6 +838,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) if (atomic_read(¤t->mm->mm_users) > 1) return -EINVAL; + if (current->fs->users != 1) + return -EINVAL; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3-70-g09d2 From 2ca39528c01a933f6689cd6505ce65bd6d68a530 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 Mar 2013 14:59:33 -0700 Subject: signal: always clear sa_restorer on execve When the new signal handlers are set up, the location of sa_restorer is not cleared, leaking a parent process's address space location to children. This allows for a potential bypass of the parent's ASLR by examining the sa_restorer value returned when calling sigaction(). Based on what should be considered "secret" about addresses, it only matters across the exec not the fork (since the VMAs haven't changed until the exec). But since exec sets SIG_DFL and keeps sa_restorer, this is where it should be fixed. Given the few uses of sa_restorer, a "set" function was not written since this would be the only use. Instead, we use __ARCH_HAS_SA_RESTORER, as already done in other places. Example of the leak before applying this patch: $ cat /proc/$$/maps ... 7fb9f3083000-7fb9f3238000 r-xp 00000000 fd:01 404469 .../libc-2.15.so ... $ ./leak ... 7f278bc74000-7f278be29000 r-xp 00000000 fd:01 404469 .../libc-2.15.so ... 1 0 (nil) 0x7fb9f30b94a0 2 4000000 (nil) 0x7f278bcaa4a0 3 4000000 (nil) 0x7f278bcaa4a0 4 0 (nil) 0x7fb9f30b94a0 ... [akpm@linux-foundation.org: use SA_RESTORER for backportability] Signed-off-by: Kees Cook Reported-by: Emese Revfy Cc: Emese Revfy Cc: PaX Team Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Julien Tinnes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d63c79e7e41..43b0d4a1b7b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default) if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; +#ifdef SA_RESTORER + ka->sa.sa_restorer = NULL; +#endif sigemptyset(&ka->sa.sa_mask); ka++; } -- cgit v1.2.3-70-g09d2 From 522cff142d7d2f9230839c9e1f21a4d8bcc22a4a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 13 Mar 2013 14:59:34 -0700 Subject: kernel/signal.c: use __ARCH_HAS_SA_RESTORER instead of SA_RESTORER __ARCH_HAS_SA_RESTORER is the preferred conditional for use in 3.9 and later kernels, per Kees. Cc: Emese Revfy Cc: Emese Revfy Cc: PaX Team Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 43b0d4a1b7b..dd72567767d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,7 +485,7 @@ flush_signal_handlers(struct task_struct *t, int force_default) if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; -#ifdef SA_RESTORER +#ifdef __ARCH_HAS_SA_RESTORER ka->sa.sa_restorer = NULL; #endif sigemptyset(&ka->sa.sa_mask); -- cgit v1.2.3-70-g09d2 From e68035fb65dec05718d765fbea14d2e527214ff6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 14:59:38 -0700 Subject: workqueue: convert to idr_alloc() idr_get_new*() and friends are about to be deprecated. Convert to the new idr_alloc() interface. Signed-off-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811e..55fac5b991b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool) int ret; mutex_lock(&worker_pool_idr_mutex); - idr_pre_get(&worker_pool_idr, GFP_KERNEL); - ret = idr_get_new(&worker_pool_idr, pool, &pool->id); + ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + if (ret >= 0) + pool->id = ret; mutex_unlock(&worker_pool_idr_mutex); - return ret; + return ret < 0 ? ret : 0; } /* -- cgit v1.2.3-70-g09d2 From 69d34da2984c95b33ea21518227e1f9470f11d95 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 13:50:56 -0400 Subject: tracing: Protect tracer flags with trace_types_lock Seems that the tracer flags have never been protected from synchronous writes. Luckily, admins don't usually modify the tracing flags via two different tasks. But if scripts were to be used to modify them, then they could get corrupted. Move the trace_types_lock that protects against tracers changing to also protect the flags being set. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 53df2839bb9..00daf5f8c50 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2916,6 +2916,8 @@ static int trace_set_options(char *option) cmp += 2; } + mutex_lock(&trace_types_lock); + for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); @@ -2924,11 +2926,10 @@ static int trace_set_options(char *option) } /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); + if (!trace_options[i]) ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - } + + mutex_unlock(&trace_types_lock); return ret; } @@ -4781,7 +4782,10 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + + mutex_lock(&trace_types_lock); set_tracer_flags(1 << index, val); + mutex_unlock(&trace_types_lock); *ppos += cnt; -- cgit v1.2.3-70-g09d2 From 80902822658aab18330569587cdb69ac1dfdcea8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 14:20:54 -0400 Subject: tracing: Keep overwrite in sync between regular and snapshot buffers Changing the overwrite mode for the ring buffer via the trace option only sets the normal buffer. But the snapshot buffer could swap with it, and then the snapshot would be in non overwrite mode and the normal buffer would be in overwrite mode, even though the option flag states otherwise. Keep the two buffers overwrite modes in sync. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00daf5f8c50..02debabe9ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2895,8 +2895,12 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); - if (mask == TRACE_ITER_OVERWRITE) + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(global_trace.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(max_tr.buffer, enabled); +#endif + } if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); -- cgit v1.2.3-70-g09d2 From 613f04a0f51e6e68ac6fe571ab79da3c0a5eb4da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 15:03:53 -0400 Subject: tracing: Prevent buffer overwrite disabled for latency tracers The latency tracers require the buffers to be in overwrite mode, otherwise they get screwed up. Force the buffers to stay in overwrite mode when latency tracers are enabled. Added a flag_changed() method to the tracer structure to allow the tracers to see what flags are being changed, and also be able to prevent the change from happing. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++------ kernel/trace/trace.h | 6 ++++++ kernel/trace/trace_irqsoff.c | 19 ++++++++++++++----- kernel/trace/trace_sched_wakeup.c | 18 +++++++++++++----- 4 files changed, 65 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02debabe9ed..4f1dade5698 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2881,11 +2881,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return -EINVAL; } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) - return; + return 0; + + /* Give the tracer a chance to approve the change */ + if (current_trace->flag_changed) + if (current_trace->flag_changed(current_trace, mask, !!enabled)) + return -EINVAL; if (enabled) trace_flags |= mask; @@ -2904,13 +2918,15 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); + + return 0; } static int trace_set_options(char *option) { char *cmp; int neg = 0; - int ret = 0; + int ret = -ENODEV; int i; cmp = strstrip(option); @@ -2924,7 +2940,7 @@ static int trace_set_options(char *option) for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); + ret = set_tracer_flag(1 << i, !neg); break; } } @@ -2943,6 +2959,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2952,7 +2969,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - trace_set_options(buf); + ret = trace_set_options(buf); + if (ret < 0) + return ret; *ppos += cnt; @@ -3256,6 +3275,9 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); + + current_trace->enabled = false; + if (current_trace->reset) current_trace->reset(tr); @@ -3300,6 +3322,7 @@ static int tracing_set_tracer(const char *buf) } current_trace = t; + current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -4788,9 +4811,12 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; mutex_lock(&trace_types_lock); - set_tracer_flags(1 << index, val); + ret = set_tracer_flag(1 << index, val); mutex_unlock(&trace_types_lock); + if (ret < 0) + return ret; + *ppos += cnt; return cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d5..2081971367e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -283,11 +283,15 @@ struct tracer { enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct tracer *tracer, + u32 mask, int set); struct tracer *next; struct tracer_flags *flags; bool print_max; bool use_max_tr; bool allocated_snapshot; + bool enabled; }; @@ -943,6 +947,8 @@ extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac488..443b25b43b4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,7 +32,7 @@ enum { static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -558,8 +558,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) static void __irqsoff_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; irqsoff_trace = tr; @@ -573,10 +576,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr) static void irqsoff_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_irqsoff_tracer(tr, is_graph()); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void irqsoff_tracer_start(struct trace_array *tr) @@ -609,6 +615,7 @@ static struct tracer irqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif @@ -642,6 +649,7 @@ static struct tracer preemptoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif @@ -677,6 +685,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a..fde652c9a51 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr); static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; #define TRACE_DISPLAY_GRAPH 1 @@ -540,8 +540,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int __wakeup_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; wakeup_trace = tr; @@ -563,12 +566,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void wakeup_tracer_start(struct trace_array *tr) @@ -594,6 +600,7 @@ static struct tracer wakeup_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif @@ -615,6 +622,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif -- cgit v1.2.3-70-g09d2 From 778141e3cf0bf29f91cd3cb5c314ea477b9402a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2013 11:41:46 +0900 Subject: perf: Reset hwc->last_period on sw clock events When cpu/task clock events are initialized, their sampling frequencies are converted to have a fixed value. However it missed to update the hwc->last_period which was set to 1 for initial sampling frequency calibration. Because this hwc->last_period value is used as a period in perf_swevent_ hrtime(), every recorded sample will have an incorrected period of 1. $ perf record -e task-clock noploop 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.158 MB perf.data (~6919 samples) ] $ perf report -n --show-total-period --stdio # Samples: 4K of event 'task-clock' # Event count (approx.): 4000 # # Overhead Samples Period Command Shared Object Symbol # ........ ............ ............ ....... ............. .................. # 99.95% 3998 3998 noploop noploop [.] main 0.03% 1 1 noploop libc-2.15.so [.] init_cacheinfo 0.03% 1 1 noploop ld-2.15.so [.] open_verify Note that it doesn't affect the non-sampling event so that the perf stat still gets correct value with or without this patch. $ perf stat -e task-clock noploop 1 Performance counter stats for 'noploop 1': 1000.272525 task-clock # 1.000 CPUs utilized 1.000560605 seconds time elapsed Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1363574507-18808-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c3..fa79c377d65 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5647,6 +5647,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } -- cgit v1.2.3-70-g09d2 From d610d98b5de6860feb21539726e9af7c9094151c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 15 Mar 2013 16:27:13 +0900 Subject: perf: Generate EXIT event only once per task context perf_event_task_event() iterates pmu list and generate events for each eligible pmu context. But if task_event has task_ctx like in EXIT it'll generate events even though the pmu doesn't have an eligible one. Fix it by moving the code to proper places. Before this patch: $ perf record -n true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.006 MB perf.data (~248 samples) ] $ perf report -D | tail Aggregated stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 cycles stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 After this patch: $ perf report -D | tail Aggregated stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 cycles stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1363332433-7637-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fa79c377d65..59412d037ee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event) if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); } - if (ctx) - perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } + if (task_event->task_ctx) + perf_event_task_ctx(task_event->task_ctx, task_event); + rcu_read_unlock(); } -- cgit v1.2.3-70-g09d2 From 383efcd00053ec40023010ce5034bd702e7ab373 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 18 Mar 2013 12:22:34 -0700 Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s try_to_wake_up_local() should only be invoked to wake up another task in the same runqueue and BUG_ON()s are used to enforce the rule. Missing try_to_wake_up_local() can stall workqueue execution but such stalls are likely to be finite either by another work item being queued or the one blocked getting unblocked. There's no reason to trigger BUG while holding rq lock crashing the whole system. Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s. Signed-off-by: Tejun Heo Acked-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7b03cd2d4c..306943f531a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { -- cgit v1.2.3-70-g09d2 From dd9c086d9f507d02d5ba4d7c5eef4bb9518088b8 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 18 Mar 2013 14:33:28 +0100 Subject: perf: Fix ring_buffer perf_output_space() boundary calculation This patch fixes a flaw in perf_output_space(). In case the size of the space needed is bigger than the actual buffer size, there may be situations where the function would return true (i.e., there is space) when it should not. head > offset due to rounding of the masking logic. The problem can be tested by activating BTS on Intel processors. A BTS record can be as big as 16 pages. The following command fails: $ perf record -m 4 -c 1 -e branches:u my_test_program You will get a buffer corruption with this. Perf report won't be able to parse the perf.data. The fix is to first check that the requested space is smaller than the buffer size. If so, then the masking logic will work fine. If not, then there is no chance the record can be saved and it will be gracefully handled by upper code layers. [ In v2, we also make the logic for the writable more explicit by renaming it to rb->overwrite because it tells whether or not the buffer can overwrite its tail (suggested by PeterZ). ] Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: peterz@infradead.org Cc: jolsa@redhat.com Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/20130318133327.GA3056@quad Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 2 +- kernel/events/ring_buffer.c | 22 ++++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8..eb675c4d59d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -16,7 +16,7 @@ struct ring_buffer { int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ + int overwrite; /* can overwrite itself */ atomic_t poll; /* POLL_ for wakeups */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff397..97fddb09762 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -18,12 +18,24 @@ static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, unsigned long offset, unsigned long head) { - unsigned long mask; + unsigned long sz = perf_data_size(rb); + unsigned long mask = sz - 1; - if (!rb->writable) + /* + * check if user-writable + * overwrite : over-write its own tail + * !overwrite: buffer possibly drops events. + */ + if (rb->overwrite) return true; - mask = perf_data_size(rb) - 1; + /* + * verify that payload is not bigger than buffer + * otherwise masking logic may fail to detect + * the "not enough space" condition + */ + if ((head - offset) > sz) + return false; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) - rb->writable = 1; + rb->overwrite = 0; + else + rb->overwrite = 1; atomic_set(&rb->refcount, 1); -- cgit v1.2.3-70-g09d2 From dc72c32e1fd872a9a4fdfe645283c9dcd68e556d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 22 Mar 2013 15:04:39 -0700 Subject: printk: Provide a wake_up_klogd() off-case wake_up_klogd() is useless when CONFIG_PRINTK=n because neither printk() nor printk_sched() are in use and there are actually no waiter on log_wait waitqueue. It should be a stub in this case for users like bust_spinlocks(). Otherwise this results in this warning when CONFIG_PRINTK=n and CONFIG_IRQ_WORK=n: kernel/built-in.o In function `wake_up_klogd': (.text.wake_up_klogd+0xb4): undefined reference to `irq_work_queue' To fix this, provide an off-case for wake_up_klogd() when CONFIG_PRINTK=n. There is much more from console_unlock() and other console related code in printk.c that should be moved under CONFIG_PRINTK. But for now, focus on a minimal fix as we passed the merged window already. [akpm@linux-foundation.org: include printk.h in bust_spinlocks.c] Signed-off-by: Frederic Weisbecker Reported-by: James Hogan Cc: James Hogan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 - include/linux/printk.h | 6 ++++ kernel/printk.c | 80 ++++++++++++++++++++++++-------------------------- lib/bust_spinlocks.c | 3 +- 4 files changed, 46 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80d36874689..79fdd80a42d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -390,7 +390,6 @@ extern struct pid *session_of_pgrp(struct pid *pgrp); unsigned long int_sqrt(unsigned long); extern void bust_spinlocks(int yes); -extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; diff --git a/include/linux/printk.h b/include/linux/printk.h index 1249a54d17e..822171fcb1c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -134,6 +134,8 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; +extern void wake_up_klogd(void); + void log_buf_kexec_setup(void); void __init setup_log_buf(int early); #else @@ -162,6 +164,10 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } +static inline void wake_up_klogd(void) +{ +} + static inline void log_buf_kexec_setup(void) { } diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335..abbdd9e2ac8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); - int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ @@ -224,6 +222,7 @@ struct log { static DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -1957,45 +1956,6 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ - int pending = __this_cpu_xchg(printk_pending, 0); - - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ - preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - } - preempt_enable(); -} - static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2458,6 +2418,44 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} int printk_sched(const char *fmt, ...) { diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 9681d54b95d..f8e0e536739 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -28,5 +29,3 @@ void __attribute__((weak)) bust_spinlocks(int yes) wake_up_klogd(); } } - - -- cgit v1.2.3-70-g09d2 From 2ca067efd82939dfd87827d29d36a265823a4c2f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 22 Mar 2013 15:04:41 -0700 Subject: poweroff: change orderly_poweroff() to use schedule_work() David said: Commit 6c0c0d4d1080 ("poweroff: fix bug in orderly_poweroff()") apparently fixes one bug in orderly_poweroff(), but introduces another. The comments on orderly_poweroff() claim it can be called from any context - and indeed we call it from interrupt context in arch/powerpc/platforms/pseries/ras.c for example. But since that commit this is no longer safe, since call_usermodehelper_fns() is not safe in interrupt context without the UMH_NO_WAIT option. orderly_poweroff() can be used from any context but UMH_WAIT_EXEC is sleepable. Move the "force" logic into __orderly_poweroff() and change orderly_poweroff() to use the global poweroff_work which simply calls __orderly_poweroff(). While at it, remove the unneeded "int argc" and change argv_split() to use GFP_KERNEL. We use the global "bool poweroff_force" to pass the argument, this can obviously affect the previous request if it is pending/running. So we only allow the "false => true" transition assuming that the pending "true" should succeed anyway. If schedule_work() fails after that we know that work->func() was not called yet, it must see the new value. This means that orderly_poweroff() becomes async even if we do not run the command and always succeeds, schedule_work() can only fail if the work is already pending. We can export __orderly_poweroff() and change the non-atomic callers which want the old semantics. Signed-off-by: Oleg Nesterov Reported-by: Benjamin Herrenschmidt Reported-by: David Gibson Cc: Lucas De Marchi Cc: Feng Hong Cc: Kees Cook Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 57 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba..39c9c4a2949 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2185,9 +2185,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static int __orderly_poweroff(void) +static int __orderly_poweroff(bool force) { - int argc; char **argv; static char *envp[] = { "HOME=/", @@ -2196,20 +2195,40 @@ static int __orderly_poweroff(void) }; int ret; - argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - if (argv == NULL) { + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - return -ENOMEM; + __func__, poweroff_cmd); + ret = -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, - NULL, NULL, NULL); - argv_free(argv); + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } return ret; } +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + /** * orderly_poweroff - Trigger an orderly system poweroff * @force: force poweroff if command execution fails @@ -2219,21 +2238,9 @@ static int __orderly_poweroff(void) */ int orderly_poweroff(bool force) { - int ret = __orderly_poweroff(); - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); -- cgit v1.2.3-70-g09d2 From 751c644b95bb48aaa8825f0c66abbcc184d92051 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Mar 2013 02:27:11 -0700 Subject: pid: Handle the exit of a multi-threaded init. When a multi-threaded init exits and the initial thread is not the last thread to exit the initial thread hangs around as a zombie until the last thread exits. In that case zap_pid_ns_processes needs to wait until there are only 2 hashed pids in the pid namespace not one. v2. Replace thread_pid_vnr(me) == 1 with the test thread_group_leader(me) as suggested by Oleg. Cc: stable@vger.kernel.org Cc: Oleg Nesterov Reported-by: Caj Larsson Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c602..bea15bdf82b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int nr; int rc; struct task_struct *task, *me = current; + int init_pids = thread_group_leader(me) ? 1 : 2; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (pid_ns->nr_hashed == 1) + if (pid_ns->nr_hashed == init_pids) break; schedule(); } -- cgit v1.2.3-70-g09d2 From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be access that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be access can not be violated by changing the root directory. Anyone who runs a processes in a chroot and would like to use user namespace can setup the same view of filesystems with a mount namespace instead. With this result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb4..a3035223d42 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24..2b93a9a5a1e 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d34204..0f1e4288457 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. -- cgit v1.2.3-70-g09d2 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae0..d581e45c0a9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26ba..9c7fab1d23f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec73..afd83273e6c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce00932493..b6b215f13b4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03..8e635a18ab5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e4288457..a54f26f82eb 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3-70-g09d2 From dbf520a9d7d4d5ba28d2947be11e34099a5e3e20 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Sun, 31 Mar 2013 00:04:40 +0000 Subject: Revert "lockdep: check that no locks held at freeze time" This reverts commit 6aa9707099c4b25700940eb3d016f16c4434360d. Commit 6aa9707099c4 ("lockdep: check that no locks held at freeze time") causes problems with NFS root filesystems. The failures were noticed on OMAP2 and 3 boards during kernel init: [ BUG: swapper/0/1 still has locks held! ] 3.9.0-rc3-00344-ga937536 #1 Not tainted ------------------------------------- 1 lock held by swapper/0/1: #0: (&type->s_umount_key#13/1){+.+.+.}, at: [] sget+0x248/0x574 stack backtrace: rpc_wait_bit_killable __wait_on_bit out_of_line_wait_on_bit __rpc_execute rpc_run_task rpc_call_sync nfs_proc_get_root nfs_get_root nfs_fs_mount_common nfs_try_mount nfs_fs_mount mount_fs vfs_kern_mount do_mount sys_mount do_mount_root mount_root prepare_namespace kernel_init_freeable kernel_init Although the rootfs mounts, the system is unstable. Here's a transcript from a PM test: http://www.pwsan.com/omap/testlogs/test_v3.9-rc3/20130317194234/pm/37xxevm/37xxevm_log.txt Here's what the test log should look like: http://www.pwsan.com/omap/testlogs/test_v3.8/20130218214403/pm/37xxevm/37xxevm_log.txt Mailing list discussion is here: http://lkml.org/lkml/2013/3/4/221 Deal with this for v3.9 by reverting the problem commit, until folks can figure out the right long-term course of action. Signed-off-by: Paul Walmsley Cc: Mandeep Singh Baines Cc: Jeff Layton Cc: Shawn Guo Cc: Cc: Fengguang Wu Cc: Trond Myklebust Cc: Ingo Molnar Cc: Ben Chan Cc: Oleg Nesterov Cc: Tejun Heo Cc: Rafael J. Wysocki Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debug_locks.h | 4 ++-- include/linux/freezer.h | 3 --- kernel/exit.c | 2 +- kernel/lockdep.c | 17 +++++++++-------- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index a975de1ff59..3bd46f76675 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -51,7 +51,7 @@ struct task_struct; extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void debug_check_no_locks_held(void); +extern void debug_check_no_locks_held(struct task_struct *task); #else static inline void debug_show_all_locks(void) { @@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len) } static inline void -debug_check_no_locks_held(void) +debug_check_no_locks_held(struct task_struct *task) { } #endif diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 043a5cf8b5b..e70df40d84f 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -3,7 +3,6 @@ #ifndef FREEZER_H_INCLUDED #define FREEZER_H_INCLUDED -#include #include #include #include @@ -49,8 +48,6 @@ extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { - if (!(current->flags & PF_NOFREEZE)) - debug_check_no_locks_held(); might_sleep(); if (likely(!freezing(current))) return false; diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca993..60bc027c61c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(); + debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d..8a0efac4f99 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr) { if (!debug_locks_off()) return; @@ -4097,21 +4097,22 @@ static void print_held_locks_bug(void) printk("\n"); printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", - current->comm, task_pid_nr(current)); + printk("[ BUG: lock held at task exit time! ]\n"); print_kernel_ident(); printk("-------------------------------------\n"); - lockdep_print_held_locks(current); + printk("%s/%d is exiting with locks still held!\n", + curr->comm, task_pid_nr(curr)); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task) { - if (unlikely(current->lockdep_depth > 0)) - print_held_locks_bug(); + if (unlikely(task->lockdep_depth > 0)) + print_held_locks_bug(task); } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3-70-g09d2 From a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Apr 2013 10:10:27 +0200 Subject: sched_clock: Prevent 64bit inatomicity on 32bit systems The sched_clock_remote() implementation has the following inatomicity problem on 32bit systems when accessing the remote scd->clock, which is a 64bit value. CPU0 CPU1 sched_clock_local() sched_clock_remote(CPU0) ... remote_clock = scd[CPU0]->clock read_low32bit(scd[CPU0]->clock) cmpxchg64(scd->clock,...) read_high32bit(scd[CPU0]->clock) While the update of scd->clock is using an atomic64 mechanism, the readout on the remote cpu is not, which can cause completely bogus readouts. It is a quite rare problem, because it requires the update to hit the narrow race window between the low/high readout and the update must go across the 32bit boundary. The resulting misbehaviour is, that CPU1 will see the sched_clock on CPU1 ~4 seconds ahead of it's own and update CPU1s sched_clock value to this bogus timestamp. This stays that way due to the clamping implementation for about 4 seconds until the synchronization with CLOCK_MONOTONIC undoes the problem. The issue is hard to observe, because it might only result in a less accurate SCHED_OTHER timeslicing behaviour. To create observable damage on realtime scheduling classes, it is necessary that the bogus update of CPU1 sched_clock happens in the context of an realtime thread, which then gets charged 4 seconds of RT runtime, which results in the RT throttler mechanism to trigger and prevent scheduling of RT tasks for a little less than 4 seconds. So this is quite unlikely as well. The issue was quite hard to decode as the reproduction time is between 2 days and 3 weeks and intrusive tracing makes it less likely, but the following trace recorded with trace_clock=global, which uses sched_clock_local(), gave the final hint: -0 0d..30 400269.477150: hrtimer_cancel: hrtimer=0xf7061e80 -0 0d..30 400269.477151: hrtimer_start: hrtimer=0xf7061e80 ... irq/20-S-587 1d..32 400273.772118: sched_wakeup: comm= ... target_cpu=0 -0 0dN.30 400273.772118: hrtimer_cancel: hrtimer=0xf7061e80 What happens is that CPU0 goes idle and invokes sched_clock_idle_sleep_event() which invokes sched_clock_local() and CPU1 runs a remote wakeup for CPU0 at the same time, which invokes sched_remote_clock(). The time jump gets propagated to CPU0 via sched_remote_clock() and stays stale on both cores for ~4 seconds. There are only two other possibilities, which could cause a stale sched clock: 1) ktime_get() which reads out CLOCK_MONOTONIC returns a sporadic wrong value. 2) sched_clock() which reads the TSC returns a sporadic wrong value. #1 can be excluded because sched_clock would continue to increase for one jiffy and then go stale. #2 can be excluded because it would not make the clock jump forward. It would just result in a stale sched_clock for one jiffy. After quite some brain twisting and finding the same pattern on other traces, sched_clock_remote() remained the only place which could cause such a problem and as explained above it's indeed racy on 32bit systems. So while on 64bit systems the readout is atomic, we need to verify the remote readout on 32bit machines. We need to protect the local->clock readout in sched_clock_remote() on 32bit as well because an NMI could hit between the low and the high readout, call sched_clock_local() and modify local->clock. Thanks to Siegfried Wulsch for bearing with my debug requests and going through the tedious tasks of running a bunch of reproducer systems to generate the debug information which let me decode the issue. Reported-by: Siegfried Wulsch Acked-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304051544160.21884@ionos Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/sched/clock.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492d..c3ae1446461 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks -- cgit v1.2.3-70-g09d2 From fd9b86d37a600488dbd80fe60cca46b822bff1cd Mon Sep 17 00:00:00 2001 From: libin Date: Mon, 8 Apr 2013 14:39:12 +0800 Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on sysctl") was an incomplete bug fix. This patch fixes sd->*_idx limit range to [0 ~ CPU_LOAD_IDX_MAX-1] avoiding array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX on sysctl. Signed-off-by: Libin Cc: Cc: Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 306943f531a..fa077929e31 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4933,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, -- cgit v1.2.3-70-g09d2 From c97847d2f0eb77c806e650e04d9bbcf79fa05730 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Apr 2013 11:48:27 +0800 Subject: perf: Fix strncpy() use, always make sure it's NUL terminated For NUL terminated string, always make sure that there's '\0' at the end. In our case we need a return value, so still use strncpy() and fix up the tail explicitly. (strlcpy() returns the size, not the pointer) Signed-off-by: Chen Gang Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/51623E0B.7070101@asianux.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 59412d037ee..7f0d67ea3f4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4737,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); + sizeof(tmp) - 1); + tmp[sizeof(tmp) - 1] = '\0'; goto got_name; } -- cgit v1.2.3-70-g09d2 From 67012ab1d2ce871afea4ee55408f233f97d09d07 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Apr 2013 12:06:44 +0800 Subject: perf: Fix strncpy() use, use strlcpy() instead of strncpy() For NUL terminated string we always need to set '\0' at the end. Signed-off-by: Chen Gang Cc: rostedt@goodmis.org Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/51624254.30301@asianux.com Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade5698..3f5046a4d81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ static char *default_bootup_tracer; static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; @@ -162,7 +162,7 @@ static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { - strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); trace_boot_options = trace_boot_options_buf; return 0; } -- cgit v1.2.3-70-g09d2 From 75761cc15877c155b3849b4e0e0cb3f897faf471 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Apr 2013 12:12:39 +0800 Subject: ftrace: Fix strncpy() use, use strlcpy() instead of strncpy() For NUL terminated string we always need to set '\0' at the end. Signed-off-by: Chen Gang Cc: rostedt@goodmis.org Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/516243B7.9020405@asianux.com Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6893d5a2bf0..db143742704 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3441,14 +3441,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { - strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { - strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); -- cgit v1.2.3-70-g09d2 From e614b3332a4f3f264a26da28e5a1f4cc3aea3974 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 4 Apr 2013 10:57:48 +0200 Subject: sched/cputime: Fix accounting on multi-threaded processes Recent commit 6fac4829 ("cputime: Use accessors to read task cputime stats") introduced a bug, where we account many times the cputime of the first thread, instead of cputimes of all the different threads. Signed-off-by: Stanislaw Gruszka Acked-by: Frederic Weisbecker Cc: Oleg Nesterov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130404085740.GA2495@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f..e93cca92f38 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - task_cputime(tsk, &utime, &stime); + task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); -- cgit v1.2.3-70-g09d2 From 2930e04d00e113ae24bb2b7c2b58de7b648a62c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 26 Mar 2013 17:33:00 -0400 Subject: tracing: Fix race with update_max_tr_single and changing tracers The commit 34600f0e9 "tracing: Fix race with max_tr and changing tracers" fixed the updating of the main buffers with the race of changing tracers, but left out the fix to the updating of just a per cpu buffer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade5698..7ba7fc76f9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -744,8 +744,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) + if (!current_trace->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(current_trace != &nop_trace); return; + } arch_spin_lock(&ftrace_max_lock); -- cgit v1.2.3-70-g09d2 From 5000c418840b309251c5887f0b56503aae30f84c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 26 Mar 2013 17:53:03 +0100 Subject: ftrace: Consistently restore trace function on sysctl enabling If we reenable ftrace via syctl, we currently set ftrace_trace_function based on the previous simplistic algorithm. This is inconsistent with what update_ftrace_function does. So better call that helper instead. Link: http://lkml.kernel.org/r/5151D26F.1070702@siemens.com Cc: stable@vger.kernel.org Signed-off-by: Jan Kiszka Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6893d5a2bf0..cc4943c7ce6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4555,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) { - if (ftrace_ops_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_ops_list->func; - else - ftrace_trace_function = ftrace_ops_list_func; - } + if (ftrace_ops_list != &ftrace_list_end) + update_ftrace_function(); } else { /* stopping ftrace calls (just send to ftrace_stub) */ -- cgit v1.2.3-70-g09d2 From 395b97a3aeff0b8d949ee3e67bf8c11c5ffd6861 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 27 Mar 2013 09:31:28 -0400 Subject: ftrace: Do not call stub functions in control loop The function tracing control loop used by perf spits out a warning if the called function is not a control function. This is because the control function references a per cpu allocated data structure on struct ftrace_ops that is not allocated for other types of functions. commit 0a016409e42 "ftrace: Optimize the function tracer list loop" Had an optimization done to all function tracing loops to optimize for a single registered ops. Unfortunately, this allows for a slight race when tracing starts or ends, where the stub function might be called after the current registered ops is removed. In this case we get the following dump: root# perf stat -e ftrace:function sleep 1 [ 74.339105] WARNING: at include/linux/ftrace.h:209 ftrace_ops_control_func+0xde/0xf0() [ 74.349522] Hardware name: PRIMERGY RX200 S6 [ 74.357149] Modules linked in: sg igb iTCO_wdt ptp pps_core iTCO_vendor_support i7core_edac dca lpc_ich i2c_i801 coretemp edac_core crc32c_intel mfd_core ghash_clmulni_intel dm_multipath acpi_power_meter pcspk r microcode vhost_net tun macvtap macvlan nfsd kvm_intel kvm auth_rpcgss nfs_acl lockd sunrpc uinput xfs libcrc32c sd_mod crc_t10dif sr_mod cdrom mgag200 i2c_algo_bit drm_kms_helper ttm qla2xxx mptsas ahci drm li bahci scsi_transport_sas mptscsih libata scsi_transport_fc i2c_core mptbase scsi_tgt dm_mirror dm_region_hash dm_log dm_mod [ 74.446233] Pid: 1377, comm: perf Tainted: G W 3.9.0-rc1 #1 [ 74.453458] Call Trace: [ 74.456233] [] warn_slowpath_common+0x7f/0xc0 [ 74.462997] [] ? rcu_note_context_switch+0xa0/0xa0 [ 74.470272] [] ? __unregister_ftrace_function+0xa2/0x1a0 [ 74.478117] [] warn_slowpath_null+0x1a/0x20 [ 74.484681] [] ftrace_ops_control_func+0xde/0xf0 [ 74.491760] [] ftrace_call+0x5/0x2f [ 74.497511] [] ? ftrace_call+0x5/0x2f [ 74.503486] [] ? ftrace_call+0x5/0x2f [ 74.509500] [] ? synchronize_sched+0x5/0x50 [ 74.516088] [] ? _cond_resched+0x5/0x40 [ 74.522268] [] ? synchronize_sched+0x5/0x50 [ 74.528837] [] ? __unregister_ftrace_function+0xa2/0x1a0 [ 74.536696] [] ? _cond_resched+0x5/0x40 [ 74.542878] [] ? mutex_lock+0x1d/0x50 [ 74.548869] [] unregister_ftrace_function+0x27/0x50 [ 74.556243] [] perf_ftrace_event_register+0x9f/0x140 [ 74.563709] [] ? _cond_resched+0x5/0x40 [ 74.569887] [] ? mutex_lock+0x1d/0x50 [ 74.575898] [] perf_trace_destroy+0x2e/0x50 [ 74.582505] [] tp_perf_event_destroy+0x9/0x10 [ 74.589298] [] free_event+0x70/0x1a0 [ 74.595208] [] perf_event_release_kernel+0x69/0xa0 [ 74.602460] [] ? _cond_resched+0x5/0x40 [ 74.608667] [] put_event+0x90/0xc0 [ 74.614373] [] perf_release+0x10/0x20 [ 74.620367] [] __fput+0xf4/0x280 [ 74.625894] [] ____fput+0xe/0x10 [ 74.631387] [] task_work_run+0xa7/0xe0 [ 74.637452] [] do_notify_resume+0x71/0xb0 [ 74.643843] [] int_signal+0x12/0x17 To fix this a new ftrace_ops flag is added that denotes the ftrace_list_end ftrace_ops stub as just that, a stub. This flag is now checked in the control loop and the function is not called if the flag is set. Thanks to Jovi for not just reporting the bug, but also pointing out where the bug was in the code. Link: http://lkml.kernel.org/r/514A8855.7090402@redhat.com Link: http://lkml.kernel.org/r/1364377499-1900-15-git-send-email-jovi.zhangwei@huawei.com Tested-by: WANG Chao Reported-by: WANG Chao Reported-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e5ca8ef50e9..167abf90780 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -89,6 +89,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * that the call back has its own recursion protection. If it does * not set this, then the ftrace infrastructure will add recursion * protection for the caller. + * STUB - The ftrace_ops is just a place holder. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -98,6 +99,7 @@ enum { FTRACE_OPS_FL_SAVE_REGS = 1 << 4, FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, + FTRACE_OPS_FL_STUB = 1 << 7, }; struct ftrace_ops { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cc4943c7ce6..7e897106b7e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,7 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -4131,7 +4131,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); do_for_each_ftrace_op(op, ftrace_control_list) { - if (!ftrace_function_local_disabled(op) && + if (!(op->flags & FTRACE_OPS_FL_STUB) && + !ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); -- cgit v1.2.3-70-g09d2 From 6f389a8f1dd22a24f3d9afc2812b30d639e94625 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 7 Apr 2013 02:14:14 +0000 Subject: PM / reboot: call syscore_shutdown() after disable_nonboot_cpus() As commit 40dc166c (PM / Core: Introduce struct syscore_ops for core subsystems PM) say, syscore_ops operations should be carried with one CPU on-line and interrupts disabled. However, after commit f96972f2d (kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()), syscore_shutdown() is called before disable_nonboot_cpus(), so break the rules. We have a MIPS machine with a 8259A PIC, and there is an external timer (HPET) linked at 8259A. Since 8259A has been shutdown too early (by syscore_shutdown()), disable_nonboot_cpus() runs without timer interrupt, so it hangs and reboot fails. This patch call syscore_shutdown() a little later (after disable_nonboot_cpus()) to avoid reboot failure, this is the same way as poweroff does. For consistency, add disable_nonboot_cpus() to kernel_halt(). Signed-off-by: Huacai Chen Cc: Signed-off-by: Rafael J. Wysocki --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 39c9c4a2949..0da73cf73e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd) system_state = SYSTEM_RESTART; usermodehelper_disable(); device_shutdown(); - syscore_shutdown(); } /** @@ -370,6 +369,7 @@ void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); disable_nonboot_cpus(); + syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else @@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); + disable_nonboot_cpus(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); -- cgit v1.2.3-70-g09d2 From 83e03b3fe4daffdebbb42151d5410d730ae50bd1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 1 Apr 2013 21:46:23 +0900 Subject: tracing: Fix double free when function profile init failed On the failure path, stat->start and stat->pages will refer same page. So it'll attempt to free the same page again and get kernel panic. Link: http://lkml.kernel.org/r/1364820385-32027-1-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e897106b7e..926ebfb7493 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) free_page(tmp); } - free_page((unsigned long)stat->pages); stat->pages = NULL; stat->start = NULL; -- cgit v1.2.3-70-g09d2 From c481420248c6730246d2a1b1773d5d7007ae0835 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 12 Apr 2013 11:05:54 +0800 Subject: perf: Fix error return code Fix to return -ENOMEM in the allocation error case instead of 0 (if pmu_bus_running == 1), as done elsewhere in this function. Signed-off-by: Wei Yongjun Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/CAPgLHd8j_fWcgqe%3DKLWjpBj%2B%3Do0Pw6Z-SEq%3DNTPU08c2w1tngQ@mail.gmail.com [ Tweaked the error code setting placement and the changelog. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f0d67ea3f4..7e0962ed7f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5987,6 +5987,7 @@ skip_type: if (pmu->pmu_cpu_context) goto got_cpu_context; + ret = -ENOMEM; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_dev; -- cgit v1.2.3-70-g09d2 From 6a76f8c0ab19f215af2a3442870eeb5f0e81998d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Apr 2013 15:55:01 +0900 Subject: tracing: Fix possible NULL pointer dereferences Currently set_ftrace_pid and set_graph_function files use seq_lseek for their fops. However seq_open() is called only for FMODE_READ in the fops->open() so that if an user tries to seek one of those file when she open it for writing, it sees NULL seq_file and then panic. It can be easily reproduced with following command: $ cd /sys/kernel/debug/tracing $ echo 1234 | sudo tee -a set_ftrace_pid In this example, GNU coreutils' tee opens the file with fopen(, "a") and then the fopen() internally calls lseek(). Link: http://lkml.kernel.org/r/1365663302-2170-1-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 10 +++++----- kernel/trace/trace_stack.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 167abf90780..eb3ce327b97 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -396,7 +396,7 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence); +loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 926ebfb7493..affc35d829c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2697,7 +2697,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int whence) +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; @@ -3570,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3578,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3783,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, + .llseek = ftrace_filter_lseek, .release = ftrace_graph_release, - .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4439,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_pid_release, }; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc70..83a8b5b7bd3 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -322,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; -- cgit v1.2.3-70-g09d2 From 7f49ef69db6bbf756c0abca7e9b65b32e999eec8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Apr 2013 16:40:13 -0400 Subject: ftrace: Move ftrace_filter_lseek out of CONFIG_DYNAMIC_FTRACE section As ftrace_filter_lseek is now used with ftrace_pid_fops, it needs to be moved out of the #ifdef CONFIG_DYNAMIC_FTRACE section as the ftrace_pid_fops is defined when DYNAMIC_FTRACE is not. Cc: stable@vger.kernel.org Cc: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 ++- kernel/trace/ftrace.c | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index eb3ce327b97..52da2a25079 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -396,7 +396,6 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init @@ -569,6 +568,8 @@ static inline int ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } #endif /* CONFIG_DYNAMIC_FTRACE */ +loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); + /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index affc35d829c..2461ede45a8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1052,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; +loff_t +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, whence); + else + file->f_pos = ret = 1; + + return ret; +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -2612,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * ftrace_filter_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -2696,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, whence); - else - file->f_pos = ret = 1; - - return ret; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; -- cgit v1.2.3-70-g09d2 From 935d8aabd4331f47a89c3e1daa5779d23cf244ee Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Apr 2013 10:06:31 -0700 Subject: Add file_ns_capable() helper function for open-time capability checking Nothing is using it yet, but this will allow us to delay the open-time checks to use time, without breaking the normal UNIX permission semantics where permissions are determined by the opener (and the file descriptor can then be passed to a different process, or the process can drop capabilities). Signed-off-by: Linus Torvalds --- include/linux/capability.h | 2 ++ kernel/capability.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 98503b79236..d9a4f7f40f3 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -35,6 +35,7 @@ struct cpu_vfs_cap_data { #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) +struct file; struct inode; struct dentry; struct user_namespace; @@ -211,6 +212,7 @@ extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool nsown_capable(int cap); extern bool inode_capable(const struct inode *inode, int cap); +extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/kernel/capability.c b/kernel/capability.c index 493d9725948..f6c2ce5701e 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -392,6 +392,30 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); +/** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file: The file we want to check + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. + */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ + if (WARN_ON_ONCE(!cap_valid(cap))) + return false; + + if (security_capable(file->f_cred, ns, cap) == 0) + return true; + + return false; +} +EXPORT_SYMBOL(file_ns_capable); + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3-70-g09d2