From db05021d49a994ee40a9735d9c3cb0060c9babb8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 27 Feb 2013 21:48:09 -0500
Subject: ftrace: Update the kconfig for DYNAMIC_FTRACE

The prompt to enable DYNAMIC_FTRACE (the ability to nop and
enable function tracing at run time) had a confusing statement:

 "enable/disable ftrace tracepoints dynamically"

This was written before tracepoints were added to the kernel,
but now that tracepoints have been added, this is very confusing
and has confused people enough to give wrong information during
presentations.

Not only that, I looked at the help text, and it still references
that dreaded daemon that use to wake up once a second to update
the nop locations and brick NICs, that hasn't been around for over
five years.

Time to bring the text up to the current decade.

Cc: stable@vger.kernel.org
Reported-by: Ezequiel Garcia <elezegarcia@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 36567564e22..b516a8e19d5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -429,24 +429,28 @@ config PROBE_EVENTS
 	def_bool n
 
 config DYNAMIC_FTRACE
-	bool "enable/disable ftrace tracepoints dynamically"
+	bool "enable/disable function tracing dynamically"
 	depends on FUNCTION_TRACER
 	depends on HAVE_DYNAMIC_FTRACE
 	default y
 	help
-          This option will modify all the calls to ftrace dynamically
-	  (will patch them out of the binary image and replace them
-	  with a No-Op instruction) as they are called. A table is
-	  created to dynamically enable them again.
+	  This option will modify all the calls to function tracing
+	  dynamically (will patch them out of the binary image and
+	  replace them with a No-Op instruction) on boot up. During
+	  compile time, a table is made of all the locations that ftrace
+	  can function trace, and this table is linked into the kernel
+	  image. When this is enabled, functions can be individually
+	  enabled, and the functions not enabled will not affect
+	  performance of the system.
+
+	  See the files in /sys/kernel/debug/tracing:
+	    available_filter_functions
+	    set_ftrace_filter
+	    set_ftrace_notrace
 
 	  This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
 	  otherwise has native performance as long as no tracing is active.
 
-	  The changes to the code are done by a kernel thread that
-	  wakes up once a second and checks to see if any ftrace calls
-	  were made. If so, it runs stop_machine (stops all CPUS)
-	  and modifies the code to jump over the call to ftrace.
-
 config DYNAMIC_FTRACE_WITH_REGS
 	def_bool y
 	depends on DYNAMIC_FTRACE
-- 
cgit v1.2.3-70-g09d2


From d8741e2e88ac9a458765a0c7b4e6542d7c038334 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 10:25:16 -0500
Subject: tracing: Add help of snapshot feature when snapshot is empty

When cat'ing the snapshot file, instead of showing an empty trace
header like the trace file does, show how to use the snapshot
feature.

Also, this is a good place to show if the snapshot has been allocated
or not. Users may want to "pre allocate" the snapshot to have a fast
"swap" of the current buffer. Otherwise, a swap would be slow and might
fail as it would need to allocate the snapshot buffer, and that might
fail under tight memory constraints.

Here's what it looked like before:

 # tracer: nop
 #
 # entries-in-buffer/entries-written: 0/0   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |

Here's what it looks like now:

 # tracer: nop
 #
 #
 # * Snapshot is freed *
 #
 # Snapshot commands:
 # echo 0 > snapshot : Clears and frees snapshot buffer
 # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.
 #                      Takes a snapshot of the main buffer.
 # echo 2 > snapshot : Clears snapshot buffer (but does not allocate)
 #                      (Doesn't have to be '2' works with any number that
 #                       is not a '0' or '1')

Acked-by: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c2e2c231037..9e3120b8a2a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2400,6 +2400,27 @@ static void test_ftrace_alive(struct seq_file *m)
 	seq_printf(m, "#          MAY BE MISSING FUNCTION EVENTS\n");
 }
 
+#ifdef CONFIG_TRACER_MAX_TRACE
+static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+	if (iter->trace->allocated_snapshot)
+		seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
+	else
+		seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
+
+	seq_printf(m, "# Snapshot commands:\n");
+	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
+	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
+	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n");
+	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
+	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n");
+	seq_printf(m, "#                       is not a '0' or '1')\n");
+}
+#else
+/* Should never be called */
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
+#endif
+
 static int s_show(struct seq_file *m, void *v)
 {
 	struct trace_iterator *iter = v;
@@ -2411,7 +2432,9 @@ static int s_show(struct seq_file *m, void *v)
 			seq_puts(m, "#\n");
 			test_ftrace_alive(m);
 		}
-		if (iter->trace && iter->trace->print_header)
+		if (iter->snapshot && trace_empty(iter))
+			print_snapshot_help(m, iter);
+		else if (iter->trace && iter->trace->print_header)
 			iter->trace->print_header(m);
 		else
 			trace_default_header(m);
-- 
cgit v1.2.3-70-g09d2


From c9960e48543799f168c4c9486f9790fb686ce5a8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com>
Date: Tue, 5 Mar 2013 10:53:02 -0500
Subject: tracing: Do not return EINVAL in snapshot when not allocated

To use the tracing snapshot feature, writing a '1' into the snapshot
file causes the snapshot buffer to be allocated if it has not already
been allocated and dose a 'swap' with the main buffer, so that the
snapshot now contains what was in the main buffer, and the main buffer
now writes to what was the snapshot buffer.

To free the snapshot buffer, a '0' is written into the snapshot file.

To clear the snapshot buffer, any number but a '0' or '1' is written
into the snapshot file. But if the file is not allocated it returns
-EINVAL error code. This is rather pointless. It is better just to
do nothing and return success.

Acked-by: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9e3120b8a2a..1f835a83cb2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4167,8 +4167,6 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	default:
 		if (current_trace->allocated_snapshot)
 			tracing_reset_online_cpus(&max_tr);
-		else
-			ret = -EINVAL;
 		break;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From a7dc19b8652c862d5b7c4d2339bd3c428bd29c4a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 7 Mar 2013 15:09:24 +0000
Subject: clockevents: Don't allow dummy broadcast timers

Currently tick_check_broadcast_device doesn't reject clock_event_devices
with CLOCK_EVT_FEAT_DUMMY, and may select them in preference to real
hardware if they have a higher rating value. In this situation, the
dummy timer is responsible for broadcasting to itself, and the core
clockevents code may attempt to call non-existent callbacks for
programming the dummy, eventually leading to a panic.

This patch makes tick_check_broadcast_device always reject dummy timers,
preventing this problem.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Jon Medhurst (Tixy) <tixy@linaro.org>
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fb8cb88df8..7f32fe0e52c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
  */
 int tick_check_broadcast_device(struct clock_event_device *dev)
 {
-	if ((tick_broadcast_device.evtdev &&
+	if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+	    (tick_broadcast_device.evtdev &&
 	     tick_broadcast_device.evtdev->rating >= dev->rating) ||
 	     (dev->features & CLOCK_EVT_FEAT_C3STOP))
 		return 0;
-- 
cgit v1.2.3-70-g09d2


From eb2834285cf172856cd12f66892fc7467935ebed Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 8 Mar 2013 15:18:28 -0800
Subject: workqueue: fix possible pool stall bug in wq_unbind_fn()

Since multiple pools per cpu have been introduced, wq_unbind_fn() has
a subtle bug which may theoretically stall work item processing.  The
problem is two-fold.

* wq_unbind_fn() depends on the worker executing wq_unbind_fn() itself
  to start unbound chain execution, which works fine when there was
  only single pool.  With multiple pools, only the pool which is
  running wq_unbind_fn() - the highpri one - is guaranteed to have
  such kick-off.  The other pool could stall when its busy workers
  block.

* The current code is setting WORKER_UNBIND / POOL_DISASSOCIATED of
  the two pools in succession without initiating work execution
  inbetween.  Because setting the flags requires grabbing assoc_mutex
  which is held while new workers are created, this could lead to
  stalls if a pool's manager is waiting for the previous pool's work
  items to release memory.  This is almost purely theoretical tho.

Update wq_unbind_fn() such that it sets WORKER_UNBIND /
POOL_DISASSOCIATED, goes over schedule() and explicitly kicks off
execution for a pool and then moves on to the next one.

tj: Updated comments and description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/workqueue.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 81f2457811e..604801b91cb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3446,28 +3446,34 @@ static void wq_unbind_fn(struct work_struct *work)
 
 		spin_unlock_irq(&pool->lock);
 		mutex_unlock(&pool->assoc_mutex);
-	}
 
-	/*
-	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
-	 * as scheduler callbacks may be invoked from other cpus.
-	 */
-	schedule();
+		/*
+		 * Call schedule() so that we cross rq->lock and thus can
+		 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
+		 * This is necessary as scheduler callbacks may be invoked
+		 * from other cpus.
+		 */
+		schedule();
 
-	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
-	 * nr_running stays zero and need_more_worker() and keep_working()
-	 * are always true as long as the worklist is not empty.  Pools on
-	 * @cpu now behave as unbound (in terms of concurrency management)
-	 * pools which are served by workers tied to the CPU.
-	 *
-	 * On return from this function, the current worker would trigger
-	 * unbound chain execution of pending work items if other workers
-	 * didn't already.
-	 */
-	for_each_std_worker_pool(pool, cpu)
+		/*
+		 * Sched callbacks are disabled now.  Zap nr_running.
+		 * After this, nr_running stays zero and need_more_worker()
+		 * and keep_working() are always true as long as the
+		 * worklist is not empty.  This pool now behaves as an
+		 * unbound (in terms of concurrency management) pool which
+		 * are served by workers tied to the pool.
+		 */
 		atomic_set(&pool->nr_running, 0);
+
+		/*
+		 * With concurrency management just turned off, a busy
+		 * worker blocking could lead to lengthy stalls.  Kick off
+		 * unbound chain execution of currently pending work items.
+		 */
+		spin_lock_irq(&pool->lock);
+		wake_up_worker(pool);
+		spin_unlock_irq(&pool->lock);
+	}
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 2721e72dd10f71a3ba90f59781becf02638aa0d9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 12 Mar 2013 11:32:32 -0400
Subject: tracing: Fix race in snapshot swapping

Although the swap is wrapped with a spin_lock, the assignment
of the temp buffer used to swap is not within that lock.
It needs to be moved into that lock, otherwise two swaps
happening on two different CPUs, can end up using the wrong
temp buffer to assign in the swap.

Luckily, all current callers of the swap function appear to have
their own locks. But in case something is added that allows two
different callers to call the swap, then there's a chance that
this race can trigger and corrupt the buffers.

New code is coming soon that will allow for this race to trigger.

I've Cc'd stable, so this bug will not show up if someone backports
one of the changes that can trigger this bug.

Cc: stable@vger.kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1f835a83cb2..53df2839bb9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -704,7 +704,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 void
 update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
-	struct ring_buffer *buf = tr->buffer;
+	struct ring_buffer *buf;
 
 	if (trace_stop_count)
 		return;
@@ -719,6 +719,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	arch_spin_lock(&ftrace_max_lock);
 
+	buf = tr->buffer;
 	tr->buffer = max_tr.buffer;
 	max_tr.buffer = buf;
 
-- 
cgit v1.2.3-70-g09d2


From 20f22ab42e9c832bde6e9a7ed04cdc73ec737e5b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 4 Mar 2013 14:32:59 -0800
Subject: signals: fix new kernel-doc warnings

Fix new kernel-doc warnings in kernel/signal.c:

  Warning(kernel/signal.c:2689): No description found for parameter 'uset'
  Warning(kernel/signal.c:2689): Excess function parameter 'set' description in 'sys_rt_sigpending'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 2ec870a4c3c..d63c79e7e41 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2682,7 +2682,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
 /**
  *  sys_rt_sigpending - examine a pending signal that has been raised
  *			while blocked
- *  @set: stores pending signals
+ *  @uset: stores pending signals
  *  @sigsetsize: size of sigset_t type or larger
  */
 SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
-- 
cgit v1.2.3-70-g09d2


From 6c23cbbd5056b155401b0a2b5567d530e6c750c4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 5 Mar 2013 10:00:24 -0800
Subject: futex: fix kernel-doc notation and spello

Fix kernel-doc warning in futex.c and convert 'Returns' to the new Return:
kernel-doc notation format.

  Warning(kernel/futex.c:2286): Excess function parameter 'clockrt' description in 'futex_wait_requeue_pi'

Fix one spello.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/futex.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index f0090a993da..b26dcfc02c9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key)
  * @rw:		mapping needs to be read/write (values: VERIFY_READ,
  *              VERIFY_WRITE)
  *
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
  * The key words are stored in *key on success.
  *
  * For shared mappings, it's (page->index, file_inode(vma->vm_file),
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
  *			be "current" except in the case of requeue pi.
  * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
  *
- * Returns:
- *  0 - ready to wait
- *  1 - acquired the lock
+ * Return:
+ *  0 - ready to wait;
+ *  1 - acquired the lock;
  * <0 - error
  *
  * The hb->lock and futex_key refs shall be held by the caller.
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
  * hb1 and hb2 must be held by the caller.
  *
- * Returns:
- *  0 - failed to acquire the lock atomicly
- *  1 - acquired the lock
+ * Return:
+ *  0 - failed to acquire the lock atomically;
+ *  1 - acquired the lock;
  * <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
  * uaddr2 atomically on behalf of the top waiter.
  *
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
  *  <0 - on error
  */
 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
  * be paired with exactly one earlier call to queue_me().
  *
- * Returns:
- *   1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ *   1 - if the futex_q was still queued (and we removed unqueued it);
  *   0 - if the futex_q was already removed by the waking thread
  */
 static int unqueue_me(struct futex_q *q)
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
  * the pi_state owner as well as handle race conditions that may allow us to
  * acquire the lock. Must be called with the hb lock held.
  *
- * Returns:
- *  1 - success, lock taken
- *  0 - success, lock not taken
+ * Return:
+ *  1 - success, lock taken;
+ *  0 - success, lock not taken;
  * <0 - on error (-EFAULT)
  */
 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
  * Return with the hb lock held and a q.key reference on success, and unlocked
  * with no q.key reference on failure.
  *
- * Returns:
- *  0 - uaddr contains val and hb has been locked
+ * Return:
+ *  0 - uaddr contains val and hb has been locked;
  * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
  */
 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2203,9 +2204,9 @@ pi_faulted:
  * the wakeup and return the appropriate error code to the caller.  Must be
  * called with the hb lock held.
  *
- * Returns
- *  0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ *  0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
  */
 static inline
 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * @val:	the expected value of uaddr
  * @abs_time:	absolute timeout
  * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
  * @uaddr2:	the pi futex we will take prior to returning to user-space
  *
  * The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * there was a need to.
  *
  * We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
  * 2) wakeup on uaddr2 after a requeue
  * 3) signal
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  *
  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
  *
- * Returns:
- *  0 - On success
+ * Return:
+ *  0 - On success;
  * <0 - On error
  */
 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
-- 
cgit v1.2.3-70-g09d2


From 740466bc89ad8bd5afcc8de220f715f62b21e365 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 Mar 2013 11:15:19 -0400
Subject: tracing: Fix free of probe entry by calling call_rcu_sched()

Because function tracing is very invasive, and can even trace
calls to rcu_read_lock(), RCU access in function tracing is done
with preempt_disable_notrace(). This requires a synchronize_sched()
for updates and not a synchronize_rcu().

Function probes (traceon, traceoff, etc) must be freed after
a synchronize_sched() after its entry has been removed from the
hash. But call_rcu() is used. Fix this by using call_rcu_sched().

Also fix the usage to use hlist_del_rcu() instead of hlist_del().

Cc: stable@vger.kernel.org
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 98ca94a4181..e6effd0c40a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3108,8 +3108,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 					continue;
 			}
 
-			hlist_del(&entry->node);
-			call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+			hlist_del_rcu(&entry->node);
+			call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu);
 		}
 	}
 	__disable_ftrace_function_probe();
-- 
cgit v1.2.3-70-g09d2


From e66eded8309ebf679d3d3c1f5820d1f2ca332c71 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 13 Mar 2013 11:51:49 -0700
Subject: userns: Don't allow CLONE_NEWUSER | CLONE_FS

Don't allowing sharing the root directory with processes in a
different user namespace.  There doesn't seem to be any point, and to
allow it would require the overhead of putting a user namespace
reference in fs_struct (for permission checks) and incrementing that
reference count on practically every call to fork.

So just perform the inexpensive test of forbidding sharing fs_struct
acrosss processes in different user namespaces.  We already disallow
other forms of threading when unsharing a user namespace so this
should be no real burden in practice.

This updates setns, clone, and unshare to disallow multiple user
namespaces sharing an fs_struct.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c           | 5 ++++-
 kernel/user_namespace.c | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8d932b1c905..1766d324d5e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
+	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+		return ERR_PTR(-EINVAL);
+
 	/*
 	 * Thread groups must share signals as well, and detached threads
 	 * can only be started up within the thread group.
@@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	 * If unsharing a user namespace must also unshare the thread.
 	 */
 	if (unshare_flags & CLONE_NEWUSER)
-		unshare_flags |= CLONE_THREAD;
+		unshare_flags |= CLONE_THREAD | CLONE_FS;
 	/*
 	 * If unsharing a pid namespace must also unshare the thread.
 	 */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8b650837083..b14f4d34204 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/projid.h>
+#include <linux/fs_struct.h>
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 
@@ -837,6 +838,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
 	if (atomic_read(&current->mm->mm_users) > 1)
 		return -EINVAL;
 
+	if (current->fs->users != 1)
+		return -EINVAL;
+
 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
-- 
cgit v1.2.3-70-g09d2


From 2ca39528c01a933f6689cd6505ce65bd6d68a530 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 13 Mar 2013 14:59:33 -0700
Subject: signal: always clear sa_restorer on execve

When the new signal handlers are set up, the location of sa_restorer is
not cleared, leaking a parent process's address space location to
children.  This allows for a potential bypass of the parent's ASLR by
examining the sa_restorer value returned when calling sigaction().

Based on what should be considered "secret" about addresses, it only
matters across the exec not the fork (since the VMAs haven't changed
until the exec).  But since exec sets SIG_DFL and keeps sa_restorer,
this is where it should be fixed.

Given the few uses of sa_restorer, a "set" function was not written
since this would be the only use.  Instead, we use
__ARCH_HAS_SA_RESTORER, as already done in other places.

Example of the leak before applying this patch:

  $ cat /proc/$$/maps
  ...
  7fb9f3083000-7fb9f3238000 r-xp 00000000 fd:01 404469 .../libc-2.15.so
  ...
  $ ./leak
  ...
  7f278bc74000-7f278be29000 r-xp 00000000 fd:01 404469 .../libc-2.15.so
  ...
  1 0 (nil) 0x7fb9f30b94a0
  2 4000000 (nil) 0x7f278bcaa4a0
  3 4000000 (nil) 0x7f278bcaa4a0
  4 0 (nil) 0x7fb9f30b94a0
  ...

[akpm@linux-foundation.org: use SA_RESTORER for backportability]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Emese Revfy <re.emese@gmail.com>
Cc: Emese Revfy <re.emese@gmail.com>
Cc: PaX Team <pageexec@freemail.hu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Julien Tinnes <jln@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index d63c79e7e41..43b0d4a1b7b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 		if (force_default || ka->sa.sa_handler != SIG_IGN)
 			ka->sa.sa_handler = SIG_DFL;
 		ka->sa.sa_flags = 0;
+#ifdef SA_RESTORER
+		ka->sa.sa_restorer = NULL;
+#endif
 		sigemptyset(&ka->sa.sa_mask);
 		ka++;
 	}
-- 
cgit v1.2.3-70-g09d2


From 522cff142d7d2f9230839c9e1f21a4d8bcc22a4a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 13 Mar 2013 14:59:34 -0700
Subject: kernel/signal.c: use __ARCH_HAS_SA_RESTORER instead of SA_RESTORER

__ARCH_HAS_SA_RESTORER is the preferred conditional for use in 3.9 and
later kernels, per Kees.

Cc: Emese Revfy <re.emese@gmail.com>
Cc: Emese Revfy <re.emese@gmail.com>
Cc: PaX Team <pageexec@freemail.hu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Julien Tinnes <jln@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 43b0d4a1b7b..dd72567767d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -485,7 +485,7 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 		if (force_default || ka->sa.sa_handler != SIG_IGN)
 			ka->sa.sa_handler = SIG_DFL;
 		ka->sa.sa_flags = 0;
-#ifdef SA_RESTORER
+#ifdef __ARCH_HAS_SA_RESTORER
 		ka->sa.sa_restorer = NULL;
 #endif
 		sigemptyset(&ka->sa.sa_mask);
-- 
cgit v1.2.3-70-g09d2


From e68035fb65dec05718d765fbea14d2e527214ff6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 13 Mar 2013 14:59:38 -0700
Subject: workqueue: convert to idr_alloc()

idr_get_new*() and friends are about to be deprecated.  Convert to the
new idr_alloc() interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 81f2457811e..55fac5b991b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool)
 	int ret;
 
 	mutex_lock(&worker_pool_idr_mutex);
-	idr_pre_get(&worker_pool_idr, GFP_KERNEL);
-	ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
+	ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
+	if (ret >= 0)
+		pool->id = ret;
 	mutex_unlock(&worker_pool_idr_mutex);
 
-	return ret;
+	return ret < 0 ? ret : 0;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 69d34da2984c95b33ea21518227e1f9470f11d95 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 14 Mar 2013 13:50:56 -0400
Subject: tracing: Protect tracer flags with trace_types_lock

Seems that the tracer flags have never been protected from
synchronous writes. Luckily, admins don't usually modify the
tracing flags via two different tasks. But if scripts were to
be used to modify them, then they could get corrupted.

Move the trace_types_lock that protects against tracers changing
to also protect the flags being set.

Cc: stable@vger.kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 53df2839bb9..00daf5f8c50 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2916,6 +2916,8 @@ static int trace_set_options(char *option)
 		cmp += 2;
 	}
 
+	mutex_lock(&trace_types_lock);
+
 	for (i = 0; trace_options[i]; i++) {
 		if (strcmp(cmp, trace_options[i]) == 0) {
 			set_tracer_flags(1 << i, !neg);
@@ -2924,11 +2926,10 @@ static int trace_set_options(char *option)
 	}
 
 	/* If no option could be set, test the specific tracer options */
-	if (!trace_options[i]) {
-		mutex_lock(&trace_types_lock);
+	if (!trace_options[i])
 		ret = set_tracer_option(current_trace, cmp, neg);
-		mutex_unlock(&trace_types_lock);
-	}
+
+	mutex_unlock(&trace_types_lock);
 
 	return ret;
 }
@@ -4781,7 +4782,10 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (val != 0 && val != 1)
 		return -EINVAL;
+
+	mutex_lock(&trace_types_lock);
 	set_tracer_flags(1 << index, val);
+	mutex_unlock(&trace_types_lock);
 
 	*ppos += cnt;
 
-- 
cgit v1.2.3-70-g09d2


From 80902822658aab18330569587cdb69ac1dfdcea8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 14 Mar 2013 14:20:54 -0400
Subject: tracing: Keep overwrite in sync between regular and snapshot buffers

Changing the overwrite mode for the ring buffer via the trace
option only sets the normal buffer. But the snapshot buffer could
swap with it, and then the snapshot would be in non overwrite mode
and the normal buffer would be in overwrite mode, even though the
option flag states otherwise.

Keep the two buffers overwrite modes in sync.

Cc: stable@vger.kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 00daf5f8c50..02debabe9ed 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2895,8 +2895,12 @@ static void set_tracer_flags(unsigned int mask, int enabled)
 	if (mask == TRACE_ITER_RECORD_CMD)
 		trace_event_enable_cmd_record(enabled);
 
-	if (mask == TRACE_ITER_OVERWRITE)
+	if (mask == TRACE_ITER_OVERWRITE) {
 		ring_buffer_change_overwrite(global_trace.buffer, enabled);
+#ifdef CONFIG_TRACER_MAX_TRACE
+		ring_buffer_change_overwrite(max_tr.buffer, enabled);
+#endif
+	}
 
 	if (mask == TRACE_ITER_PRINTK)
 		trace_printk_start_stop_comm(enabled);
-- 
cgit v1.2.3-70-g09d2


From 613f04a0f51e6e68ac6fe571ab79da3c0a5eb4da Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 14 Mar 2013 15:03:53 -0400
Subject: tracing: Prevent buffer overwrite disabled for latency tracers

The latency tracers require the buffers to be in overwrite mode,
otherwise they get screwed up. Force the buffers to stay in overwrite
mode when latency tracers are enabled.

Added a flag_changed() method to the tracer structure to allow
the tracers to see what flags are being changed, and also be able
to prevent the change from happing.

Cc: stable@vger.kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c              | 38 ++++++++++++++++++++++++++++++++------
 kernel/trace/trace.h              |  6 ++++++
 kernel/trace/trace_irqsoff.c      | 19 ++++++++++++++-----
 kernel/trace/trace_sched_wakeup.c | 18 +++++++++++++-----
 4 files changed, 65 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 02debabe9ed..4f1dade5698 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2881,11 +2881,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
 	return -EINVAL;
 }
 
-static void set_tracer_flags(unsigned int mask, int enabled)
+/* Some tracers require overwrite to stay enabled */
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+{
+	if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+		return -1;
+
+	return 0;
+}
+
+int set_tracer_flag(unsigned int mask, int enabled)
 {
 	/* do nothing if flag is already set */
 	if (!!(trace_flags & mask) == !!enabled)
-		return;
+		return 0;
+
+	/* Give the tracer a chance to approve the change */
+	if (current_trace->flag_changed)
+		if (current_trace->flag_changed(current_trace, mask, !!enabled))
+			return -EINVAL;
 
 	if (enabled)
 		trace_flags |= mask;
@@ -2904,13 +2918,15 @@ static void set_tracer_flags(unsigned int mask, int enabled)
 
 	if (mask == TRACE_ITER_PRINTK)
 		trace_printk_start_stop_comm(enabled);
+
+	return 0;
 }
 
 static int trace_set_options(char *option)
 {
 	char *cmp;
 	int neg = 0;
-	int ret = 0;
+	int ret = -ENODEV;
 	int i;
 
 	cmp = strstrip(option);
@@ -2924,7 +2940,7 @@ static int trace_set_options(char *option)
 
 	for (i = 0; trace_options[i]; i++) {
 		if (strcmp(cmp, trace_options[i]) == 0) {
-			set_tracer_flags(1 << i, !neg);
+			ret = set_tracer_flag(1 << i, !neg);
 			break;
 		}
 	}
@@ -2943,6 +2959,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			size_t cnt, loff_t *ppos)
 {
 	char buf[64];
+	int ret;
 
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
@@ -2952,7 +2969,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 
 	buf[cnt] = 0;
 
-	trace_set_options(buf);
+	ret = trace_set_options(buf);
+	if (ret < 0)
+		return ret;
 
 	*ppos += cnt;
 
@@ -3256,6 +3275,9 @@ static int tracing_set_tracer(const char *buf)
 		goto out;
 
 	trace_branch_disable();
+
+	current_trace->enabled = false;
+
 	if (current_trace->reset)
 		current_trace->reset(tr);
 
@@ -3300,6 +3322,7 @@ static int tracing_set_tracer(const char *buf)
 	}
 
 	current_trace = t;
+	current_trace->enabled = true;
 	trace_branch_enable(tr);
  out:
 	mutex_unlock(&trace_types_lock);
@@ -4788,9 +4811,12 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return -EINVAL;
 
 	mutex_lock(&trace_types_lock);
-	set_tracer_flags(1 << index, val);
+	ret = set_tracer_flag(1 << index, val);
 	mutex_unlock(&trace_types_lock);
 
+	if (ret < 0)
+		return ret;
+
 	*ppos += cnt;
 
 	return cnt;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 57d7e5397d5..2081971367e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -283,11 +283,15 @@ struct tracer {
 	enum print_line_t	(*print_line)(struct trace_iterator *iter);
 	/* If you handled the flag setting, return 0 */
 	int			(*set_flag)(u32 old_flags, u32 bit, int set);
+	/* Return 0 if OK with change, else return non-zero */
+	int			(*flag_changed)(struct tracer *tracer,
+						u32 mask, int set);
 	struct tracer		*next;
 	struct tracer_flags	*flags;
 	bool			print_max;
 	bool			use_max_tr;
 	bool			allocated_snapshot;
+	bool			enabled;
 };
 
 
@@ -943,6 +947,8 @@ extern const char *__stop___trace_bprintk_fmt[];
 
 void trace_printk_init_buffers(void);
 void trace_printk_start_comm(void);
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
+int set_tracer_flag(unsigned int mask, int enabled);
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 713a2cac488..443b25b43b4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,7 +32,7 @@ enum {
 
 static int trace_type __read_mostly;
 
-static int save_lat_flag;
+static int save_flags;
 
 static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
 static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -558,8 +558,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
 {
-	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
-	trace_flags |= TRACE_ITER_LATENCY_FMT;
+	save_flags = trace_flags;
+
+	/* non overwrite screws up the latency tracers */
+	set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
+	set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
 
 	tracing_max_latency = 0;
 	irqsoff_trace = tr;
@@ -573,10 +576,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
 
 static void irqsoff_tracer_reset(struct trace_array *tr)
 {
+	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
 	stop_irqsoff_tracer(tr, is_graph());
 
-	if (!save_lat_flag)
-		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+	set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
+	set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
 }
 
 static void irqsoff_tracer_start(struct trace_array *tr)
@@ -609,6 +615,7 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.print_line     = irqsoff_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= irqsoff_set_flag,
+	.flag_changed	= trace_keep_overwrite,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_irqsoff,
 #endif
@@ -642,6 +649,7 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.print_line     = irqsoff_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= irqsoff_set_flag,
+	.flag_changed	= trace_keep_overwrite,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptoff,
 #endif
@@ -677,6 +685,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.print_line     = irqsoff_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= irqsoff_set_flag,
+	.flag_changed	= trace_keep_overwrite,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptirqsoff,
 #endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 75aa97fbe1a..fde652c9a51 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr);
 static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
 static void wakeup_graph_return(struct ftrace_graph_ret *trace);
 
-static int save_lat_flag;
+static int save_flags;
 
 #define TRACE_DISPLAY_GRAPH     1
 
@@ -540,8 +540,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 
 static int __wakeup_tracer_init(struct trace_array *tr)
 {
-	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
-	trace_flags |= TRACE_ITER_LATENCY_FMT;
+	save_flags = trace_flags;
+
+	/* non overwrite screws up the latency tracers */
+	set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
+	set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
 
 	tracing_max_latency = 0;
 	wakeup_trace = tr;
@@ -563,12 +566,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
 
 static void wakeup_tracer_reset(struct trace_array *tr)
 {
+	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
 	stop_wakeup_tracer(tr);
 	/* make sure we put back any tasks we are tracing */
 	wakeup_reset(tr);
 
-	if (!save_lat_flag)
-		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+	set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
+	set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
 }
 
 static void wakeup_tracer_start(struct trace_array *tr)
@@ -594,6 +600,7 @@ static struct tracer wakeup_tracer __read_mostly =
 	.print_line	= wakeup_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= wakeup_set_flag,
+	.flag_changed	= trace_keep_overwrite,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
@@ -615,6 +622,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 	.print_line	= wakeup_print_line,
 	.flags		= &tracer_flags,
 	.set_flag	= wakeup_set_flag,
+	.flag_changed	= trace_keep_overwrite,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
 #endif
-- 
cgit v1.2.3-70-g09d2


From 778141e3cf0bf29f91cd3cb5c314ea477b9402a7 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Mon, 18 Mar 2013 11:41:46 +0900
Subject: perf: Reset hwc->last_period on sw clock events

When cpu/task clock events are initialized, their sampling
frequencies are converted to have a fixed value.  However it
missed to update the hwc->last_period which was set to 1 for
initial sampling frequency calibration.

Because this hwc->last_period value is used as a period in
perf_swevent_ hrtime(), every recorded sample will have an
incorrected period of 1.

  $ perf record -e task-clock noploop 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.158 MB perf.data (~6919 samples) ]

  $ perf report -n --show-total-period  --stdio
  # Samples: 4K of event 'task-clock'
  # Event count (approx.): 4000
  #
  # Overhead       Samples        Period  Command  Shared Object              Symbol
  # ........  ............  ............  .......  .............  ..................
  #
      99.95%          3998          3998  noploop  noploop        [.] main
       0.03%             1             1  noploop  libc-2.15.so   [.] init_cacheinfo
       0.03%             1             1  noploop  ld-2.15.so     [.] open_verify

Note that it doesn't affect the non-sampling event so that the
perf stat still gets correct value with or without this patch.

  $ perf stat -e task-clock noploop 1

   Performance counter stats for 'noploop 1':

         1000.272525 task-clock                #    1.000 CPUs utilized

         1.000560605 seconds time elapsed

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1363574507-18808-1-git-send-email-namhyung@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0cd86501c3..fa79c377d65 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5647,6 +5647,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
 		event->attr.sample_period = NSEC_PER_SEC / freq;
 		hwc->sample_period = event->attr.sample_period;
 		local64_set(&hwc->period_left, hwc->sample_period);
+		hwc->last_period = hwc->sample_period;
 		event->attr.freq = 0;
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From d610d98b5de6860feb21539726e9af7c9094151c Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Fri, 15 Mar 2013 16:27:13 +0900
Subject: perf: Generate EXIT event only once per task context

perf_event_task_event() iterates pmu list and generate events
for each eligible pmu context.  But if task_event has task_ctx
like in EXIT it'll generate events even though the pmu doesn't
have an eligible one. Fix it by moving the code to proper
places.

Before this patch:

  $ perf record -n true
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.006 MB perf.data (~248 samples) ]

  $ perf report -D | tail
  Aggregated stats:
             TOTAL events:         73
              MMAP events:         67
              COMM events:          2
              EXIT events:          4
  cycles stats:
             TOTAL events:         73
              MMAP events:         67
              COMM events:          2
              EXIT events:          4

After this patch:

  $ perf report -D | tail
  Aggregated stats:
             TOTAL events:         70
              MMAP events:         67
              COMM events:          2
              EXIT events:          1
  cycles stats:
             TOTAL events:         70
              MMAP events:         67
              COMM events:          2
              EXIT events:          1

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1363332433-7637-1-git-send-email-namhyung@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa79c377d65..59412d037ee 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 			if (ctxn < 0)
 				goto next;
 			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+			if (ctx)
+				perf_event_task_ctx(ctx, task_event);
 		}
-		if (ctx)
-			perf_event_task_ctx(ctx, task_event);
 next:
 		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	if (task_event->task_ctx)
+		perf_event_task_ctx(task_event->task_ctx, task_event);
+
 	rcu_read_unlock();
 }
 
-- 
cgit v1.2.3-70-g09d2


From 383efcd00053ec40023010ce5034bd702e7ab373 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 18 Mar 2013 12:22:34 -0700
Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s

try_to_wake_up_local() should only be invoked to wake up another
task in the same runqueue and BUG_ON()s are used to enforce the
rule. Missing try_to_wake_up_local() can stall workqueue
execution but such stalls are likely to be finite either by
another work item being queued or the one blocked getting
unblocked.  There's no reason to trigger BUG while holding rq
lock crashing the whole system.

Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7b03cd2d4c..306943f531a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
 
-	BUG_ON(rq != this_rq());
-	BUG_ON(p == current);
+	if (WARN_ON_ONCE(rq != this_rq()) ||
+	    WARN_ON_ONCE(p == current))
+		return;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
-- 
cgit v1.2.3-70-g09d2


From dd9c086d9f507d02d5ba4d7c5eef4bb9518088b8 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 18 Mar 2013 14:33:28 +0100
Subject: perf: Fix ring_buffer perf_output_space() boundary calculation

This patch fixes a flaw in perf_output_space(). In case the size
of the space needed is bigger than the actual buffer size, there
may be situations where the function would return true (i.e.,
there is space) when it should not. head > offset due to
rounding of the masking logic.

The problem can be tested by activating BTS on Intel processors.
A BTS record can be as big as 16 pages. The following command
fails:

  $ perf record -m 4 -c 1 -e branches:u my_test_program

You will get a buffer corruption with this. Perf report won't be
able to parse the perf.data.

The fix is to first check that the requested space is smaller
than the buffer size. If so, then the masking logic will work
fine. If not, then there is no chance the record can be saved
and it will be gracefully handled by upper code layers.

[ In v2, we also make the logic for the writable more explicit by
  renaming it to rb->overwrite because it tells whether or not the
  buffer can overwrite its tail (suggested by PeterZ). ]

Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: peterz@infradead.org
Cc: jolsa@redhat.com
Cc: fweisbec@gmail.com
Link: http://lkml.kernel.org/r/20130318133327.GA3056@quad
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/internal.h    |  2 +-
 kernel/events/ring_buffer.c | 22 ++++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8..eb675c4d59d 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
 	int				page_order;	/* allocation order  */
 #endif
 	int				nr_pages;	/* nr of data pages  */
-	int				writable;	/* are we writable   */
+	int				overwrite;	/* can overwrite itself */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff397..97fddb09762 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
 static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
 			      unsigned long offset, unsigned long head)
 {
-	unsigned long mask;
+	unsigned long sz = perf_data_size(rb);
+	unsigned long mask = sz - 1;
 
-	if (!rb->writable)
+	/*
+	 * check if user-writable
+	 * overwrite : over-write its own tail
+	 * !overwrite: buffer possibly drops events.
+	 */
+	if (rb->overwrite)
 		return true;
 
-	mask = perf_data_size(rb) - 1;
+	/*
+	 * verify that payload is not bigger than buffer
+	 * otherwise masking logic may fail to detect
+	 * the "not enough space" condition
+	 */
+	if ((head - offset) > sz)
+		return false;
 
 	offset = (offset - tail) & mask;
 	head   = (head   - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 		rb->watermark = max_size / 2;
 
 	if (flags & RING_BUFFER_WRITABLE)
-		rb->writable = 1;
+		rb->overwrite = 0;
+	else
+		rb->overwrite = 1;
 
 	atomic_set(&rb->refcount, 1);
 
-- 
cgit v1.2.3-70-g09d2


From dc72c32e1fd872a9a4fdfe645283c9dcd68e556d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 22 Mar 2013 15:04:39 -0700
Subject: printk: Provide a wake_up_klogd() off-case

wake_up_klogd() is useless when CONFIG_PRINTK=n because neither printk()
nor printk_sched() are in use and there are actually no waiter on
log_wait waitqueue.  It should be a stub in this case for users like
bust_spinlocks().

Otherwise this results in this warning when CONFIG_PRINTK=n and
CONFIG_IRQ_WORK=n:

	kernel/built-in.o In function `wake_up_klogd':
	(.text.wake_up_klogd+0xb4): undefined reference to `irq_work_queue'

To fix this, provide an off-case for wake_up_klogd() when
CONFIG_PRINTK=n.

There is much more from console_unlock() and other console related code
in printk.c that should be moved under CONFIG_PRINTK.  But for now,
focus on a minimal fix as we passed the merged window already.

[akpm@linux-foundation.org: include printk.h in bust_spinlocks.c]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reported-by: James Hogan <james.hogan@imgtec.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h |  1 -
 include/linux/printk.h |  6 ++++
 kernel/printk.c        | 80 ++++++++++++++++++++++++--------------------------
 lib/bust_spinlocks.c   |  3 +-
 4 files changed, 46 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 80d36874689..79fdd80a42d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -390,7 +390,6 @@ extern struct pid *session_of_pgrp(struct pid *pgrp);
 unsigned long int_sqrt(unsigned long);
 
 extern void bust_spinlocks(int yes);
-extern void wake_up_klogd(void);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_timeout;
 extern int panic_on_oops;
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 1249a54d17e..822171fcb1c 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -134,6 +134,8 @@ extern int printk_delay_msec;
 extern int dmesg_restrict;
 extern int kptr_restrict;
 
+extern void wake_up_klogd(void);
+
 void log_buf_kexec_setup(void);
 void __init setup_log_buf(int early);
 #else
@@ -162,6 +164,10 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 	return false;
 }
 
+static inline void wake_up_klogd(void)
+{
+}
+
 static inline void log_buf_kexec_setup(void)
 {
 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 0b31715f335..abbdd9e2ac8 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
 #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
 #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
 
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-
 int console_printk[4] = {
 	DEFAULT_CONSOLE_LOGLEVEL,	/* console_loglevel */
 	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */
@@ -224,6 +222,7 @@ struct log {
 static DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #ifdef CONFIG_PRINTK
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
 static u64 syslog_seq;
 static u32 syslog_idx;
@@ -1957,45 +1956,6 @@ int is_console_locked(void)
 	return console_locked;
 }
 
-/*
- * Delayed printk version, for scheduler-internal messages:
- */
-#define PRINTK_BUF_SIZE		512
-
-#define PRINTK_PENDING_WAKEUP	0x01
-#define PRINTK_PENDING_SCHED	0x02
-
-static DEFINE_PER_CPU(int, printk_pending);
-static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
-
-static void wake_up_klogd_work_func(struct irq_work *irq_work)
-{
-	int pending = __this_cpu_xchg(printk_pending, 0);
-
-	if (pending & PRINTK_PENDING_SCHED) {
-		char *buf = __get_cpu_var(printk_sched_buf);
-		printk(KERN_WARNING "[sched_delayed] %s", buf);
-	}
-
-	if (pending & PRINTK_PENDING_WAKEUP)
-		wake_up_interruptible(&log_wait);
-}
-
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
-	.func = wake_up_klogd_work_func,
-	.flags = IRQ_WORK_LAZY,
-};
-
-void wake_up_klogd(void)
-{
-	preempt_disable();
-	if (waitqueue_active(&log_wait)) {
-		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
-		irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
-	}
-	preempt_enable();
-}
-
 static void console_cont_flush(char *text, size_t size)
 {
 	unsigned long flags;
@@ -2458,6 +2418,44 @@ static int __init printk_late_init(void)
 late_initcall(printk_late_init);
 
 #if defined CONFIG_PRINTK
+/*
+ * Delayed printk version, for scheduler-internal messages:
+ */
+#define PRINTK_BUF_SIZE		512
+
+#define PRINTK_PENDING_WAKEUP	0x01
+#define PRINTK_PENDING_SCHED	0x02
+
+static DEFINE_PER_CPU(int, printk_pending);
+static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
+
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
+{
+	int pending = __this_cpu_xchg(printk_pending, 0);
+
+	if (pending & PRINTK_PENDING_SCHED) {
+		char *buf = __get_cpu_var(printk_sched_buf);
+		printk(KERN_WARNING "[sched_delayed] %s", buf);
+	}
+
+	if (pending & PRINTK_PENDING_WAKEUP)
+		wake_up_interruptible(&log_wait);
+}
+
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+	.func = wake_up_klogd_work_func,
+	.flags = IRQ_WORK_LAZY,
+};
+
+void wake_up_klogd(void)
+{
+	preempt_disable();
+	if (waitqueue_active(&log_wait)) {
+		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+		irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+	}
+	preempt_enable();
+}
 
 int printk_sched(const char *fmt, ...)
 {
diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c
index 9681d54b95d..f8e0e536739 100644
--- a/lib/bust_spinlocks.c
+++ b/lib/bust_spinlocks.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/printk.h>
 #include <linux/spinlock.h>
 #include <linux/tty.h>
 #include <linux/wait.h>
@@ -28,5 +29,3 @@ void __attribute__((weak)) bust_spinlocks(int yes)
 			wake_up_klogd();
 	}
 }
-
-
-- 
cgit v1.2.3-70-g09d2


From 2ca067efd82939dfd87827d29d36a265823a4c2f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 22 Mar 2013 15:04:41 -0700
Subject: poweroff: change orderly_poweroff() to use schedule_work()

David said:

    Commit 6c0c0d4d1080 ("poweroff: fix bug in orderly_poweroff()")
    apparently fixes one bug in orderly_poweroff(), but introduces
    another.  The comments on orderly_poweroff() claim it can be called
    from any context - and indeed we call it from interrupt context in
    arch/powerpc/platforms/pseries/ras.c for example.  But since that
    commit this is no longer safe, since call_usermodehelper_fns() is not
    safe in interrupt context without the UMH_NO_WAIT option.

orderly_poweroff() can be used from any context but UMH_WAIT_EXEC is
sleepable.  Move the "force" logic into __orderly_poweroff() and change
orderly_poweroff() to use the global poweroff_work which simply calls
__orderly_poweroff().

While at it, remove the unneeded "int argc" and change argv_split() to
use GFP_KERNEL.

We use the global "bool poweroff_force" to pass the argument, this can
obviously affect the previous request if it is pending/running.  So we
only allow the "false => true" transition assuming that the pending
"true" should succeed anyway.  If schedule_work() fails after that we
know that work->func() was not called yet, it must see the new value.

This means that orderly_poweroff() becomes async even if we do not run
the command and always succeeds, schedule_work() can only fail if the
work is already pending.  We can export __orderly_poweroff() and change
the non-atomic callers which want the old semantics.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Reported-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Cc: Feng Hong <hongfeng@marvell.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 57 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 81f56445fba..39c9c4a2949 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2185,9 +2185,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
 
 char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
 
-static int __orderly_poweroff(void)
+static int __orderly_poweroff(bool force)
 {
-	int argc;
 	char **argv;
 	static char *envp[] = {
 		"HOME=/",
@@ -2196,20 +2195,40 @@ static int __orderly_poweroff(void)
 	};
 	int ret;
 
-	argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
-	if (argv == NULL) {
+	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+	if (argv) {
+		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		argv_free(argv);
+	} else {
 		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
-		       __func__, poweroff_cmd);
-		return -ENOMEM;
+					 __func__, poweroff_cmd);
+		ret = -ENOMEM;
 	}
 
-	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
-				      NULL, NULL, NULL);
-	argv_free(argv);
+	if (ret && force) {
+		printk(KERN_WARNING "Failed to start orderly shutdown: "
+					"forcing the issue\n");
+		/*
+		 * I guess this should try to kick off some daemon to sync and
+		 * poweroff asap.  Or not even bother syncing if we're doing an
+		 * emergency shutdown?
+		 */
+		emergency_sync();
+		kernel_power_off();
+	}
 
 	return ret;
 }
 
+static bool poweroff_force;
+
+static void poweroff_work_func(struct work_struct *work)
+{
+	__orderly_poweroff(poweroff_force);
+}
+
+static DECLARE_WORK(poweroff_work, poweroff_work_func);
+
 /**
  * orderly_poweroff - Trigger an orderly system poweroff
  * @force: force poweroff if command execution fails
@@ -2219,21 +2238,9 @@ static int __orderly_poweroff(void)
  */
 int orderly_poweroff(bool force)
 {
-	int ret = __orderly_poweroff();
-
-	if (ret && force) {
-		printk(KERN_WARNING "Failed to start orderly shutdown: "
-		       "forcing the issue\n");
-
-		/*
-		 * I guess this should try to kick off some daemon to sync and
-		 * poweroff asap.  Or not even bother syncing if we're doing an
-		 * emergency shutdown?
-		 */
-		emergency_sync();
-		kernel_power_off();
-	}
-
-	return ret;
+	if (force) /* do not override the pending "true" */
+		poweroff_force = true;
+	schedule_work(&poweroff_work);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(orderly_poweroff);
-- 
cgit v1.2.3-70-g09d2


From 751c644b95bb48aaa8825f0c66abbcc184d92051 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Mar 2013 02:27:11 -0700
Subject: pid: Handle the exit of a multi-threaded init.

When a multi-threaded init exits and the initial thread is not the
last thread to exit the initial thread hangs around as a zombie
until the last thread exits.  In that case zap_pid_ns_processes
needs to wait until there are only 2 hashed pids in the pid
namespace not one.

v2. Replace thread_pid_vnr(me) == 1 with the test thread_group_leader(me)
    as suggested by Oleg.

Cc: stable@vger.kernel.org
Cc: Oleg Nesterov <oleg@redhat.com>
Reported-by: Caj Larsson <caj@omnicloud.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid_namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c602..bea15bdf82b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	int nr;
 	int rc;
 	struct task_struct *task, *me = current;
+	int init_pids = thread_group_leader(me) ? 1 : 2;
 
 	/* Don't allow any more processes into the pid namespace */
 	disable_pid_allocation(pid_ns);
@@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 */
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (pid_ns->nr_hashed == 1)
+		if (pid_ns->nr_hashed == init_pids)
 			break;
 		schedule();
 	}
-- 
cgit v1.2.3-70-g09d2


From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Mar 2013 01:45:51 -0700
Subject: userns:  Don't allow creation if the user is chrooted

Guarantee that the policy of which files may be access that is
established by setting the root directory will not be violated
by user namespaces by verifying that the root directory points
to the root of the mount namespace at the time of user namespace
creation.

Changing the root is a privileged operation, and as a matter of policy
it serves to limit unprivileged processes to files below the current
root directory.

For reasons of simplicity and comprehensibility the privilege to
change the root directory is gated solely on the CAP_SYS_CHROOT
capability in the user namespace.  Therefore when creating a user
namespace we must ensure that the policy of which files may be access
can not be violated by changing the root directory.

Anyone who runs a processes in a chroot and would like to use user
namespace can setup the same view of filesystems with a mount
namespace instead.  With this result that this is not a practical
limitation for using user namespaces.

Cc: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c            | 24 ++++++++++++++++++++++++
 include/linux/fs_struct.h |  2 ++
 kernel/user_namespace.c   |  9 +++++++++
 3 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/fs/namespace.c b/fs/namespace.c
index 50ca17d3cb4..a3035223d42 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt)
 	return check_mnt(real_mount(mnt));
 }
 
+bool current_chrooted(void)
+{
+	/* Does the current process have a non-standard root */
+	struct path ns_root;
+	struct path fs_root;
+	bool chrooted;
+
+	/* Find the namespace root */
+	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
+	ns_root.dentry = ns_root.mnt->mnt_root;
+	path_get(&ns_root);
+	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
+		;
+
+	get_fs_root(current->fs, &fs_root);
+
+	chrooted = !path_equal(&fs_root, &ns_root);
+
+	path_put(&fs_root);
+	path_put(&ns_root);
+
+	return chrooted;
+}
+
 static void *mntns_get(struct task_struct *task)
 {
 	struct mnt_namespace *ns = NULL;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 729eded4b24..2b93a9a5a1e 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root,
 	spin_unlock(&fs->lock);
 }
 
+extern bool current_chrooted(void);
+
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b14f4d34204..0f1e4288457 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -61,6 +61,15 @@ int create_user_ns(struct cred *new)
 	kgid_t group = new->egid;
 	int ret;
 
+	/*
+	 * Verify that we can not violate the policy of which files
+	 * may be accessed that is specified by the root directory,
+	 * by verifing that the root directory is at the root of the
+	 * mount namespace which allows all files to be accessed.
+	 */
+	if (current_chrooted())
+		return -EPERM;
+
 	/* The creator needs a mapping in the parent user namespace
 	 * or else we won't be able to reasonably tell userspace who
 	 * created a user_namespace.
-- 
cgit v1.2.3-70-g09d2


From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 24 Mar 2013 14:28:27 -0700
Subject: userns: Restrict when proc and sysfs can be mounted

Only allow unprivileged mounts of proc and sysfs if they are already
mounted when the user namespace is created.

proc and sysfs are interesting because they have content that is
per namespace, and so fresh mounts are needed when new namespaces
are created while at the same time proc and sysfs have content that
is shared between every instance.

Respect the policy of who may see the shared content of proc and sysfs
by only allowing new mounts if there was an existing mount at the time
the user namespace was created.

In practice there are only two interesting cases: proc and sysfs are
mounted at their usual places, proc and sysfs are not mounted at all
(some form of mount namespace jail).

Cc: stable@vger.kernel.org
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c                 | 21 +++++++++++++++++++++
 fs/proc/root.c                 |  4 ++++
 fs/sysfs/mount.c               |  4 ++++
 include/linux/user_namespace.h |  4 ++++
 kernel/user.c                  |  2 ++
 kernel/user_namespace.c        |  2 ++
 6 files changed, 37 insertions(+)

(limited to 'kernel')

diff --git a/fs/namespace.c b/fs/namespace.c
index 968d4c5eae0..d581e45c0a9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2763,6 +2763,27 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
+void update_mnt_policy(struct user_namespace *userns)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct mount *mnt;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		switch (mnt->mnt.mnt_sb->s_magic) {
+		case SYSFS_MAGIC:
+			userns->may_mount_sysfs = true;
+			break;
+		case PROC_SUPER_MAGIC:
+			userns->may_mount_proc = true;
+			break;
+		}
+		if (userns->may_mount_sysfs && userns->may_mount_proc)
+			break;
+	}
+	up_read(&namespace_sem);
+}
+
 static void *mntns_get(struct task_struct *task)
 {
 	struct mnt_namespace *ns = NULL;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c6e9fac26ba..9c7fab1d23f 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
+#include <linux/user_namespace.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 #include <linux/parser.h>
@@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	} else {
 		ns = task_active_pid_ns(current);
 		options = data;
+
+		if (!current_user_ns()->may_mount_proc)
+			return ERR_PTR(-EPERM);
 	}
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8d924b5ec73..afd83273e6c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 #include "sysfs.h"
 
@@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	int error;
 
+	if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs)
+		return ERR_PTR(-EPERM);
+
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4ce00932493..b6b215f13b4 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -26,6 +26,8 @@ struct user_namespace {
 	kuid_t			owner;
 	kgid_t			group;
 	unsigned int		proc_inum;
+	bool			may_mount_sysfs;
+	bool			may_mount_proc;
 };
 
 extern struct user_namespace init_user_ns;
@@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns)
 
 #endif
 
+void update_mnt_policy(struct user_namespace *userns);
+
 #endif /* _LINUX_USER_H */
diff --git a/kernel/user.c b/kernel/user.c
index e81978e8c03..8e635a18ab5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
 	.proc_inum = PROC_USER_INIT_INO,
+	.may_mount_sysfs = true,
+	.may_mount_proc = true,
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0f1e4288457..a54f26f82eb 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -96,6 +96,8 @@ int create_user_ns(struct cred *new)
 
 	set_cred_user_ns(new, ns);
 
+	update_mnt_policy(ns);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From dbf520a9d7d4d5ba28d2947be11e34099a5e3e20 Mon Sep 17 00:00:00 2001
From: Paul Walmsley <paul@pwsan.com>
Date: Sun, 31 Mar 2013 00:04:40 +0000
Subject: Revert "lockdep: check that no locks held at freeze time"

This reverts commit 6aa9707099c4b25700940eb3d016f16c4434360d.

Commit 6aa9707099c4 ("lockdep: check that no locks held at freeze time")
causes problems with NFS root filesystems.  The failures were noticed on
OMAP2 and 3 boards during kernel init:

  [ BUG: swapper/0/1 still has locks held! ]
  3.9.0-rc3-00344-ga937536 #1 Not tainted
  -------------------------------------
  1 lock held by swapper/0/1:
   #0:  (&type->s_umount_key#13/1){+.+.+.}, at: [<c011e84c>] sget+0x248/0x574

  stack backtrace:
    rpc_wait_bit_killable
    __wait_on_bit
    out_of_line_wait_on_bit
    __rpc_execute
    rpc_run_task
    rpc_call_sync
    nfs_proc_get_root
    nfs_get_root
    nfs_fs_mount_common
    nfs_try_mount
    nfs_fs_mount
    mount_fs
    vfs_kern_mount
    do_mount
    sys_mount
    do_mount_root
    mount_root
    prepare_namespace
    kernel_init_freeable
    kernel_init

Although the rootfs mounts, the system is unstable.  Here's a transcript
from a PM test:

  http://www.pwsan.com/omap/testlogs/test_v3.9-rc3/20130317194234/pm/37xxevm/37xxevm_log.txt

Here's what the test log should look like:

  http://www.pwsan.com/omap/testlogs/test_v3.8/20130218214403/pm/37xxevm/37xxevm_log.txt

Mailing list discussion is here:

  http://lkml.org/lkml/2013/3/4/221

Deal with this for v3.9 by reverting the problem commit, until folks can
figure out the right long-term course of action.

Signed-off-by: Paul Walmsley <paul@pwsan.com>
Cc: Mandeep Singh Baines <msb@chromium.org>
Cc: Jeff Layton <jlayton@redhat.com>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: <maciej.rutecki@gmail.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ben Chan <benchan@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debug_locks.h |  4 ++--
 include/linux/freezer.h     |  3 ---
 kernel/exit.c               |  2 +-
 kernel/lockdep.c            | 17 +++++++++--------
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index a975de1ff59..3bd46f76675 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -51,7 +51,7 @@ struct task_struct;
 extern void debug_show_all_locks(void);
 extern void debug_show_held_locks(struct task_struct *task);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
-extern void debug_check_no_locks_held(void);
+extern void debug_check_no_locks_held(struct task_struct *task);
 #else
 static inline void debug_show_all_locks(void)
 {
@@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len)
 }
 
 static inline void
-debug_check_no_locks_held(void)
+debug_check_no_locks_held(struct task_struct *task)
 {
 }
 #endif
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 043a5cf8b5b..e70df40d84f 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -3,7 +3,6 @@
 #ifndef FREEZER_H_INCLUDED
 #define FREEZER_H_INCLUDED
 
-#include <linux/debug_locks.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
@@ -49,8 +48,6 @@ extern void thaw_kernel_threads(void);
 
 static inline bool try_to_freeze(void)
 {
-	if (!(current->flags & PF_NOFREEZE))
-		debug_check_no_locks_held();
 	might_sleep();
 	if (likely(!freezing(current)))
 		return false;
diff --git a/kernel/exit.c b/kernel/exit.c
index 51e485ca993..60bc027c61c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -835,7 +835,7 @@ void do_exit(long code)
 	/*
 	 * Make sure we are holding no locks:
 	 */
-	debug_check_no_locks_held();
+	debug_check_no_locks_held(tsk);
 	/*
 	 * We can do this unlocked here. The futex code uses this flag
 	 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 259db207b5d..8a0efac4f99 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 }
 EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
 
-static void print_held_locks_bug(void)
+static void print_held_locks_bug(struct task_struct *curr)
 {
 	if (!debug_locks_off())
 		return;
@@ -4097,21 +4097,22 @@ static void print_held_locks_bug(void)
 
 	printk("\n");
 	printk("=====================================\n");
-	printk("[ BUG: %s/%d still has locks held! ]\n",
-	       current->comm, task_pid_nr(current));
+	printk("[ BUG: lock held at task exit time! ]\n");
 	print_kernel_ident();
 	printk("-------------------------------------\n");
-	lockdep_print_held_locks(current);
+	printk("%s/%d is exiting with locks still held!\n",
+		curr->comm, task_pid_nr(curr));
+	lockdep_print_held_locks(curr);
+
 	printk("\nstack backtrace:\n");
 	dump_stack();
 }
 
-void debug_check_no_locks_held(void)
+void debug_check_no_locks_held(struct task_struct *task)
 {
-	if (unlikely(current->lockdep_depth > 0))
-		print_held_locks_bug();
+	if (unlikely(task->lockdep_depth > 0))
+		print_held_locks_bug(task);
 }
-EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
 
 void debug_show_all_locks(void)
 {
-- 
cgit v1.2.3-70-g09d2


From a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 6 Apr 2013 10:10:27 +0200
Subject: sched_clock: Prevent 64bit inatomicity on 32bit systems

The sched_clock_remote() implementation has the following inatomicity
problem on 32bit systems when accessing the remote scd->clock, which
is a 64bit value.

CPU0			CPU1

sched_clock_local()	sched_clock_remote(CPU0)
...
			remote_clock = scd[CPU0]->clock
			    read_low32bit(scd[CPU0]->clock)
cmpxchg64(scd->clock,...)
			    read_high32bit(scd[CPU0]->clock)

While the update of scd->clock is using an atomic64 mechanism, the
readout on the remote cpu is not, which can cause completely bogus
readouts.

It is a quite rare problem, because it requires the update to hit the
narrow race window between the low/high readout and the update must go
across the 32bit boundary.

The resulting misbehaviour is, that CPU1 will see the sched_clock on
CPU1 ~4 seconds ahead of it's own and update CPU1s sched_clock value
to this bogus timestamp. This stays that way due to the clamping
implementation for about 4 seconds until the synchronization with
CLOCK_MONOTONIC undoes the problem.

The issue is hard to observe, because it might only result in a less
accurate SCHED_OTHER timeslicing behaviour. To create observable
damage on realtime scheduling classes, it is necessary that the bogus
update of CPU1 sched_clock happens in the context of an realtime
thread, which then gets charged 4 seconds of RT runtime, which results
in the RT throttler mechanism to trigger and prevent scheduling of RT
tasks for a little less than 4 seconds. So this is quite unlikely as
well.

The issue was quite hard to decode as the reproduction time is between
2 days and 3 weeks and intrusive tracing makes it less likely, but the
following trace recorded with trace_clock=global, which uses
sched_clock_local(), gave the final hint:

  <idle>-0   0d..30 400269.477150: hrtimer_cancel: hrtimer=0xf7061e80
  <idle>-0   0d..30 400269.477151: hrtimer_start:  hrtimer=0xf7061e80 ...
irq/20-S-587 1d..32 400273.772118: sched_wakeup:   comm= ... target_cpu=0
  <idle>-0   0dN.30 400273.772118: hrtimer_cancel: hrtimer=0xf7061e80

What happens is that CPU0 goes idle and invokes
sched_clock_idle_sleep_event() which invokes sched_clock_local() and
CPU1 runs a remote wakeup for CPU0 at the same time, which invokes
sched_remote_clock(). The time jump gets propagated to CPU0 via
sched_remote_clock() and stays stale on both cores for ~4 seconds.

There are only two other possibilities, which could cause a stale
sched clock:

1) ktime_get() which reads out CLOCK_MONOTONIC returns a sporadic
   wrong value.

2) sched_clock() which reads the TSC returns a sporadic wrong value.

#1 can be excluded because sched_clock would continue to increase for
   one jiffy and then go stale.

#2 can be excluded because it would not make the clock jump
   forward. It would just result in a stale sched_clock for one jiffy.

After quite some brain twisting and finding the same pattern on other
traces, sched_clock_remote() remained the only place which could cause
such a problem and as explained above it's indeed racy on 32bit
systems.

So while on 64bit systems the readout is atomic, we need to verify the
remote readout on 32bit machines. We need to protect the local->clock
readout in sched_clock_remote() on 32bit as well because an NMI could
hit between the low and the high readout, call sched_clock_local() and
modify local->clock.

Thanks to Siegfried Wulsch for bearing with my debug requests and
going through the tedious tasks of running a bunch of reproducer
systems to generate the debug information which let me decode the
issue.

Reported-by: Siegfried Wulsch <Siegfried.Wulsch@rovema.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304051544160.21884@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
---
 kernel/sched/clock.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492d..c3ae1446461 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
 	u64 this_clock, remote_clock;
 	u64 *ptr, old_val, val;
 
+#if BITS_PER_LONG != 64
+again:
+	/*
+	 * Careful here: The local and the remote clock values need to
+	 * be read out atomic as we need to compare the values and
+	 * then update either the local or the remote side. So the
+	 * cmpxchg64 below only protects one readout.
+	 *
+	 * We must reread via sched_clock_local() in the retry case on
+	 * 32bit as an NMI could use sched_clock_local() via the
+	 * tracer and hit between the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	this_clock = sched_clock_local(my_scd);
+	/*
+	 * We must enforce atomic readout on 32bit, otherwise the
+	 * update on the remote cpu can hit inbetween the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+	/*
+	 * On 64bit the read of [my]scd->clock is atomic versus the
+	 * update, so we can avoid the above 32bit dance.
+	 */
 	sched_clock_local(my_scd);
 again:
 	this_clock = my_scd->clock;
 	remote_clock = scd->clock;
+#endif
 
 	/*
 	 * Use the opportunity that we have both locks
-- 
cgit v1.2.3-70-g09d2


From fd9b86d37a600488dbd80fe60cca46b822bff1cd Mon Sep 17 00:00:00 2001
From: libin <huawei.libin@huawei.com>
Date: Mon, 8 Apr 2013 14:39:12 +0800
Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow

Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on
sysctl") was an incomplete bug fix.

This patch fixes sd->*_idx limit range to [0 ~ CPU_LOAD_IDX_MAX-1]
avoiding array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX
on sysctl.

Signed-off-by: Libin <huawei.libin@huawei.com>
Cc: <jiang.liu@huawei.com>
Cc: <guohanjun@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 306943f531a..fa077929e31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4933,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 }
 
 static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
 
 static void
 set_table_entry(struct ctl_table *entry,
-- 
cgit v1.2.3-70-g09d2


From c97847d2f0eb77c806e650e04d9bbcf79fa05730 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 11:48:27 +0800
Subject: perf: Fix strncpy() use, always make sure it's NUL terminated

For NUL terminated string, always make sure that there's '\0' at the end.

In our case we need a return value, so still use strncpy() and
fix up the tail explicitly.

(strlcpy() returns the size, not the pointer)

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: a.p.zijlstra@chello.nl <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org <paulus@samba.org>
Cc: acme@ghostprotocols.net <acme@ghostprotocols.net>
Link: http://lkml.kernel.org/r/51623E0B.7070101@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 59412d037ee..7f0d67ea3f4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4737,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	} else {
 		if (arch_vma_name(mmap_event->vma)) {
 			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp));
+				       sizeof(tmp) - 1);
+			tmp[sizeof(tmp) - 1] = '\0';
 			goto got_name;
 		}
 
-- 
cgit v1.2.3-70-g09d2


From 67012ab1d2ce871afea4ee55408f233f97d09d07 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 12:06:44 +0800
Subject: perf: Fix strncpy() use, use strlcpy() instead of strncpy()

For NUL terminated string we always need to set '\0' at the end.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: rostedt@goodmis.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/51624254.30301@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade5698..3f5046a4d81 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -132,7 +132,7 @@ static char *default_bootup_tracer;
 
 static int __init set_cmdline_ftrace(char *str)
 {
-	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
 	/* We are using ftrace early, expand it */
 	ring_buffer_expanded = 1;
@@ -162,7 +162,7 @@ static char *trace_boot_options __initdata;
 
 static int __init set_trace_boot_options(char *str)
 {
-	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
 	trace_boot_options = trace_boot_options_buf;
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 75761cc15877c155b3849b4e0e0cb3f897faf471 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Mon, 8 Apr 2013 12:12:39 +0800
Subject: ftrace: Fix strncpy() use, use strlcpy() instead of strncpy()

For NUL terminated string we always need to set '\0' at the end.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: rostedt@goodmis.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/516243B7.9020405@asianux.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6893d5a2bf0..db143742704 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3441,14 +3441,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
-	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_notrace=", set_ftrace_notrace);
 
 static int __init set_ftrace_filter(char *str)
 {
-	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_filter=", set_ftrace_filter);
-- 
cgit v1.2.3-70-g09d2


From e614b3332a4f3f264a26da28e5a1f4cc3aea3974 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Thu, 4 Apr 2013 10:57:48 +0200
Subject: sched/cputime: Fix accounting on multi-threaded processes

Recent commit 6fac4829 ("cputime: Use accessors to read task
cputime stats") introduced a bug, where we account many times
the cputime of the first thread, instead of cputimes of all
the different threads.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130404085740.GA2495@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f..e93cca92f38 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
 	t = tsk;
 	do {
-		task_cputime(tsk, &utime, &stime);
+		task_cputime(t, &utime, &stime);
 		times->utime += utime;
 		times->stime += stime;
 		times->sum_exec_runtime += task_sched_runtime(t);
-- 
cgit v1.2.3-70-g09d2


From 2930e04d00e113ae24bb2b7c2b58de7b648a62c7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 26 Mar 2013 17:33:00 -0400
Subject: tracing: Fix race with update_max_tr_single and changing tracers

The commit 34600f0e9 "tracing: Fix race with max_tr and changing tracers"
fixed the updating of the main buffers with the race of changing
tracers, but left out the fix to the updating of just a per cpu buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade5698..7ba7fc76f9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -744,8 +744,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
+	if (!current_trace->allocated_snapshot) {
+		/* Only the nop tracer should hit this when disabling */
+		WARN_ON_ONCE(current_trace != &nop_trace);
 		return;
+	}
 
 	arch_spin_lock(&ftrace_max_lock);
 
-- 
cgit v1.2.3-70-g09d2


From 5000c418840b309251c5887f0b56503aae30f84c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 26 Mar 2013 17:53:03 +0100
Subject: ftrace: Consistently restore trace function on sysctl enabling

If we reenable ftrace via syctl, we currently set ftrace_trace_function
based on the previous simplistic algorithm. This is inconsistent with
what update_ftrace_function does. So better call that helper instead.

Link: http://lkml.kernel.org/r/5151D26F.1070702@siemens.com

Cc: stable@vger.kernel.org
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6893d5a2bf0..cc4943c7ce6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4555,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 		ftrace_startup_sysctl();
 
 		/* we are starting ftrace again */
-		if (ftrace_ops_list != &ftrace_list_end) {
-			if (ftrace_ops_list->next == &ftrace_list_end)
-				ftrace_trace_function = ftrace_ops_list->func;
-			else
-				ftrace_trace_function = ftrace_ops_list_func;
-		}
+		if (ftrace_ops_list != &ftrace_list_end)
+			update_ftrace_function();
 
 	} else {
 		/* stopping ftrace calls (just send to ftrace_stub) */
-- 
cgit v1.2.3-70-g09d2


From 395b97a3aeff0b8d949ee3e67bf8c11c5ffd6861 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Mar 2013 09:31:28 -0400
Subject: ftrace: Do not call stub functions in control loop

The function tracing control loop used by perf spits out a warning
if the called function is not a control function. This is because
the control function references a per cpu allocated data structure
on struct ftrace_ops that is not allocated for other types of
functions.

commit 0a016409e42 "ftrace: Optimize the function tracer list loop"

Had an optimization done to all function tracing loops to optimize
for a single registered ops. Unfortunately, this allows for a slight
race when tracing starts or ends, where the stub function might be
called after the current registered ops is removed. In this case we
get the following dump:

root# perf stat -e ftrace:function sleep 1
[   74.339105] WARNING: at include/linux/ftrace.h:209 ftrace_ops_control_func+0xde/0xf0()
[   74.349522] Hardware name: PRIMERGY RX200 S6
[   74.357149] Modules linked in: sg igb iTCO_wdt ptp pps_core iTCO_vendor_support i7core_edac dca lpc_ich i2c_i801 coretemp edac_core crc32c_intel mfd_core ghash_clmulni_intel dm_multipath acpi_power_meter pcspk
r microcode vhost_net tun macvtap macvlan nfsd kvm_intel kvm auth_rpcgss nfs_acl lockd sunrpc uinput xfs libcrc32c sd_mod crc_t10dif sr_mod cdrom mgag200 i2c_algo_bit drm_kms_helper ttm qla2xxx mptsas ahci drm li
bahci scsi_transport_sas mptscsih libata scsi_transport_fc i2c_core mptbase scsi_tgt dm_mirror dm_region_hash dm_log dm_mod
[   74.446233] Pid: 1377, comm: perf Tainted: G        W    3.9.0-rc1 #1
[   74.453458] Call Trace:
[   74.456233]  [<ffffffff81062e3f>] warn_slowpath_common+0x7f/0xc0
[   74.462997]  [<ffffffff810fbc60>] ? rcu_note_context_switch+0xa0/0xa0
[   74.470272]  [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0
[   74.478117]  [<ffffffff81062e9a>] warn_slowpath_null+0x1a/0x20
[   74.484681]  [<ffffffff81102ede>] ftrace_ops_control_func+0xde/0xf0
[   74.491760]  [<ffffffff8162f400>] ftrace_call+0x5/0x2f
[   74.497511]  [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f
[   74.503486]  [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f
[   74.509500]  [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50
[   74.516088]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.522268]  [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50
[   74.528837]  [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0
[   74.536696]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.542878]  [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50
[   74.548869]  [<ffffffff81105c67>] unregister_ftrace_function+0x27/0x50
[   74.556243]  [<ffffffff8111eadf>] perf_ftrace_event_register+0x9f/0x140
[   74.563709]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.569887]  [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50
[   74.575898]  [<ffffffff8111e94e>] perf_trace_destroy+0x2e/0x50
[   74.582505]  [<ffffffff81127ba9>] tp_perf_event_destroy+0x9/0x10
[   74.589298]  [<ffffffff811295d0>] free_event+0x70/0x1a0
[   74.595208]  [<ffffffff8112a579>] perf_event_release_kernel+0x69/0xa0
[   74.602460]  [<ffffffff816254d5>] ? _cond_resched+0x5/0x40
[   74.608667]  [<ffffffff8112a640>] put_event+0x90/0xc0
[   74.614373]  [<ffffffff8112a740>] perf_release+0x10/0x20
[   74.620367]  [<ffffffff811a3044>] __fput+0xf4/0x280
[   74.625894]  [<ffffffff811a31de>] ____fput+0xe/0x10
[   74.631387]  [<ffffffff81083697>] task_work_run+0xa7/0xe0
[   74.637452]  [<ffffffff81014981>] do_notify_resume+0x71/0xb0
[   74.643843]  [<ffffffff8162fa92>] int_signal+0x12/0x17

To fix this a new ftrace_ops flag is added that denotes the ftrace_list_end
ftrace_ops stub as just that, a stub. This flag is now checked in the
control loop and the function is not called if the flag is set.

Thanks to Jovi for not just reporting the bug, but also pointing out
where the bug was in the code.

Link: http://lkml.kernel.org/r/514A8855.7090402@redhat.com
Link: http://lkml.kernel.org/r/1364377499-1900-15-git-send-email-jovi.zhangwei@huawei.com

Tested-by: WANG Chao <chaowang@redhat.com>
Reported-by: WANG Chao <chaowang@redhat.com>
Reported-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 2 ++
 kernel/trace/ftrace.c  | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e5ca8ef50e9..167abf90780 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -89,6 +89,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
  *            that the call back has its own recursion protection. If it does
  *            not set this, then the ftrace infrastructure will add recursion
  *            protection for the caller.
+ * STUB   - The ftrace_ops is just a place holder.
  */
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
@@ -98,6 +99,7 @@ enum {
 	FTRACE_OPS_FL_SAVE_REGS			= 1 << 4,
 	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 5,
 	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 6,
+	FTRACE_OPS_FL_STUB			= 1 << 7,
 };
 
 struct ftrace_ops {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc4943c7ce6..7e897106b7e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
 
 static struct ftrace_ops ftrace_list_end __read_mostly = {
 	.func		= ftrace_stub,
-	.flags		= FTRACE_OPS_FL_RECURSION_SAFE,
+	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
 };
 
 /* ftrace_enabled is a method to turn ftrace on or off */
@@ -4131,7 +4131,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
 	preempt_disable_notrace();
 	trace_recursion_set(TRACE_CONTROL_BIT);
 	do_for_each_ftrace_op(op, ftrace_control_list) {
-		if (!ftrace_function_local_disabled(op) &&
+		if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+		    !ftrace_function_local_disabled(op) &&
 		    ftrace_ops_test(op, ip))
 			op->func(ip, parent_ip, op, regs);
 	} while_for_each_ftrace_op(op);
-- 
cgit v1.2.3-70-g09d2


From 6f389a8f1dd22a24f3d9afc2812b30d639e94625 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhc@lemote.com>
Date: Sun, 7 Apr 2013 02:14:14 +0000
Subject: PM / reboot: call syscore_shutdown() after disable_nonboot_cpus()

As commit 40dc166c (PM / Core: Introduce struct syscore_ops for core
subsystems PM) say, syscore_ops operations should be carried with one
CPU on-line and interrupts disabled. However, after commit f96972f2d
(kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()),
syscore_shutdown() is called before disable_nonboot_cpus(), so break
the rules. We have a MIPS machine with a 8259A PIC, and there is an
external timer (HPET) linked at 8259A. Since 8259A has been shutdown
too early (by syscore_shutdown()), disable_nonboot_cpus() runs without
timer interrupt, so it hangs and reboot fails. This patch call
syscore_shutdown() a little later (after disable_nonboot_cpus()) to
avoid reboot failure, this is the same way as poweroff does.

For consistency, add disable_nonboot_cpus() to kernel_halt().

Signed-off-by: Huacai Chen <chenhc@lemote.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 39c9c4a2949..0da73cf73e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
 	system_state = SYSTEM_RESTART;
 	usermodehelper_disable();
 	device_shutdown();
-	syscore_shutdown();
 }
 
 /**
@@ -370,6 +369,7 @@ void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
 	disable_nonboot_cpus();
+	syscore_shutdown();
 	if (!cmd)
 		printk(KERN_EMERG "Restarting system.\n");
 	else
@@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
 void kernel_halt(void)
 {
 	kernel_shutdown_prepare(SYSTEM_HALT);
+	disable_nonboot_cpus();
 	syscore_shutdown();
 	printk(KERN_EMERG "System halted.\n");
 	kmsg_dump(KMSG_DUMP_HALT);
-- 
cgit v1.2.3-70-g09d2


From 83e03b3fe4daffdebbb42151d5410d730ae50bd1 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Mon, 1 Apr 2013 21:46:23 +0900
Subject: tracing: Fix double free when function profile init failed

On the failure path, stat->start and stat->pages will refer same page.
So it'll attempt to free the same page again and get kernel panic.

Link: http://lkml.kernel.org/r/1364820385-32027-1-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7e897106b7e..926ebfb7493 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 		free_page(tmp);
 	}
 
-	free_page((unsigned long)stat->pages);
 	stat->pages = NULL;
 	stat->start = NULL;
 
-- 
cgit v1.2.3-70-g09d2


From c481420248c6730246d2a1b1773d5d7007ae0835 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Fri, 12 Apr 2013 11:05:54 +0800
Subject: perf: Fix error return code

Fix to return -ENOMEM in the allocation error case instead of 0
(if pmu_bus_running == 1), as done elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Cc: a.p.zijlstra@chello.nl
Cc: paulus@samba.org
Cc: acme@ghostprotocols.net
Link: http://lkml.kernel.org/r/CAPgLHd8j_fWcgqe%3DKLWjpBj%2B%3Do0Pw6Z-SEq%3DNTPU08c2w1tngQ@mail.gmail.com
[ Tweaked the error code setting placement and the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f0d67ea3f4..7e0962ed7f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5987,6 +5987,7 @@ skip_type:
 	if (pmu->pmu_cpu_context)
 		goto got_cpu_context;
 
+	ret = -ENOMEM;
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
 		goto free_dev;
-- 
cgit v1.2.3-70-g09d2


From 6a76f8c0ab19f215af2a3442870eeb5f0e81998d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung.kim@lge.com>
Date: Thu, 11 Apr 2013 15:55:01 +0900
Subject: tracing: Fix possible NULL pointer dereferences

Currently set_ftrace_pid and set_graph_function files use seq_lseek
for their fops.  However seq_open() is called only for FMODE_READ in
the fops->open() so that if an user tries to seek one of those file
when she open it for writing, it sees NULL seq_file and then panic.

It can be easily reproduced with following command:

  $ cd /sys/kernel/debug/tracing
  $ echo 1234 | sudo tee -a set_ftrace_pid

In this example, GNU coreutils' tee opens the file with fopen(, "a")
and then the fopen() internally calls lseek().

Link: http://lkml.kernel.org/r/1365663302-2170-1-git-send-email-namhyung@kernel.org

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: stable@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h     |  2 +-
 kernel/trace/ftrace.c      | 10 +++++-----
 kernel/trace/trace_stack.c |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 167abf90780..eb3ce327b97 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -396,7 +396,7 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
 			    size_t cnt, loff_t *ppos);
 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
 			     size_t cnt, loff_t *ppos);
-loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence);
+loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence);
 int ftrace_regex_release(struct inode *inode, struct file *file);
 
 void __init
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 926ebfb7493..affc35d829c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2697,7 +2697,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 }
 
 loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t ret;
 
@@ -3570,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3578,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
 	.read = seq_read,
 	.write = ftrace_notrace_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3783,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = {
 	.open		= ftrace_graph_open,
 	.read		= seq_read,
 	.write		= ftrace_graph_write,
+	.llseek		= ftrace_filter_lseek,
 	.release	= ftrace_graph_release,
-	.llseek		= seq_lseek,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
@@ -4439,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = {
 	.open		= ftrace_pid_open,
 	.write		= ftrace_pid_write,
 	.read		= seq_read,
-	.llseek		= seq_lseek,
+	.llseek		= ftrace_filter_lseek,
 	.release	= ftrace_pid_release,
 };
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc70..83a8b5b7bd3 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -322,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = {
 	.open = stack_trace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 7f49ef69db6bbf756c0abca7e9b65b32e999eec8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 12 Apr 2013 16:40:13 -0400
Subject: ftrace: Move ftrace_filter_lseek out of CONFIG_DYNAMIC_FTRACE section

As ftrace_filter_lseek is now used with ftrace_pid_fops, it needs to
be moved out of the #ifdef CONFIG_DYNAMIC_FTRACE section as the
ftrace_pid_fops is defined when DYNAMIC_FTRACE is not.

Cc: stable@vger.kernel.org
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  3 ++-
 kernel/trace/ftrace.c  | 28 ++++++++++++++--------------
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index eb3ce327b97..52da2a25079 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -396,7 +396,6 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
 			    size_t cnt, loff_t *ppos);
 ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
 			     size_t cnt, loff_t *ppos);
-loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence);
 int ftrace_regex_release(struct inode *inode, struct file *file);
 
 void __init
@@ -569,6 +568,8 @@ static inline int
 ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence);
+
 /* totally disable ftrace - can not re-enable after this */
 void ftrace_kill(void);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index affc35d829c..2461ede45a8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1052,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
+loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, whence);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -2612,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
  * routine, you can use ftrace_filter_write() for the write
  * routine if @flag has FTRACE_ITER_FILTER set, or
  * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * ftrace_filter_lseek() should be used as the lseek routine, and
  * release must call ftrace_regex_release().
  */
 int
@@ -2696,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 				 inode, file);
 }
 
-loff_t
-ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
-{
-	loff_t ret;
-
-	if (file->f_mode & FMODE_READ)
-		ret = seq_lseek(file, offset, whence);
-	else
-		file->f_pos = ret = 1;
-
-	return ret;
-}
-
 static int ftrace_match(char *str, char *regex, int len, int type)
 {
 	int matched = 0;
-- 
cgit v1.2.3-70-g09d2


From 935d8aabd4331f47a89c3e1daa5779d23cf244ee Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 14 Apr 2013 10:06:31 -0700
Subject: Add file_ns_capable() helper function for open-time capability
 checking

Nothing is using it yet, but this will allow us to delay the open-time
checks to use time, without breaking the normal UNIX permission
semantics where permissions are determined by the opener (and the file
descriptor can then be passed to a different process, or the process can
drop capabilities).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/capability.h |  2 ++
 kernel/capability.c        | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 98503b79236..d9a4f7f40f3 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -35,6 +35,7 @@ struct cpu_vfs_cap_data {
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
 
+struct file;
 struct inode;
 struct dentry;
 struct user_namespace;
@@ -211,6 +212,7 @@ extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
 extern bool nsown_capable(int cap);
 extern bool inode_capable(const struct inode *inode, int cap);
+extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d9725948..f6c2ce5701e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -392,6 +392,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable);
 
+/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file:  The file we want to check
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+	if (WARN_ON_ONCE(!cap_valid(cap)))
+		return false;
+
+	if (security_capable(file->f_cred, ns, cap) == 0)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
 /**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
-- 
cgit v1.2.3-70-g09d2