12 files changed, 104 insertions, 98 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c2893602d0..47845c57eb1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -643,13 +643,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 		if ((task_active_pid_ns(current) != &init_pid_ns))
 			return -EPERM;
 
-		if (!capable(CAP_AUDIT_CONTROL))
+		if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
 	case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
-		if (!capable(CAP_AUDIT_WRITE))
+		if (!netlink_capable(skb, CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
 	default:  /* bad msg */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6..3f1ca934a23 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -348,7 +348,7 @@ struct cgrp_cset_link {
  * reference-counted, to improve performance when child cgroups
  * haven't been created.
  */
-static struct css_set init_css_set = {
+struct css_set init_css_set = {
 	.refcount		= ATOMIC_INIT(1),
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
@@ -1495,7 +1495,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	 */
 	if (!use_task_css_set_links)
 		cgroup_enable_task_cg_lists();
-retry:
+
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 
@@ -1503,7 +1503,7 @@ retry:
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret)
 		goto out_unlock;
-
+retry:
 	/* look for a matching existing root */
 	if (!opts.subsys_mask && !opts.none && !opts.name) {
 		cgrp_dfl_root_visible = true;
@@ -1562,9 +1562,9 @@ retry:
 		if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
 			mutex_unlock(&cgroup_mutex);
 			mutex_unlock(&cgroup_tree_mutex);
-			kfree(opts.release_agent);
-			kfree(opts.name);
 			msleep(10);
+			mutex_lock(&cgroup_tree_mutex);
+			mutex_lock(&cgroup_mutex);
 			goto retry;
 		}
 
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a225644..345628c78b5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
+#include <linux/mutex.h>
 
 /*
  * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
 struct freezer {
 	struct cgroup_subsys_state	css;
 	unsigned int			state;
-	spinlock_t			lock;
 };
 
+static DEFINE_MUTEX(freezer_mutex);
+
 static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct freezer, css) : NULL;
@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (!freezer)
 		return ERR_PTR(-ENOMEM);
 
-	spin_lock_init(&freezer->lock);
 	return &freezer->css;
 }
 
@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
 	struct freezer *freezer = css_freezer(css);
 	struct freezer *parent = parent_freezer(freezer);
 
-	/*
-	 * The following double locking and freezing state inheritance
-	 * guarantee that @cgroup can never escape ancestors' freezing
-	 * states.  See css_for_each_descendant_pre() for details.
-	 */
-	if (parent)
-		spin_lock_irq(&parent->lock);
-	spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
+	mutex_lock(&freezer_mutex);
 
 	freezer->state |= CGROUP_FREEZER_ONLINE;
 
@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
 		atomic_inc(&system_freezing_cnt);
 	}
 
-	spin_unlock(&freezer->lock);
-	if (parent)
-		spin_unlock_irq(&parent->lock);
-
+	mutex_unlock(&freezer_mutex);
 	return 0;
 }
 
@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
 {
 	struct freezer *freezer = css_freezer(css);
 
-	spin_lock_irq(&freezer->lock);
+	mutex_lock(&freezer_mutex);
 
 	if (freezer->state & CGROUP_FREEZING)
 		atomic_dec(&system_freezing_cnt);
 
 	freezer->state = 0;
 
-	spin_unlock_irq(&freezer->lock);
+	mutex_unlock(&freezer_mutex);
 }
 
 static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
 	struct task_struct *task;
 	bool clear_frozen = false;
 
-	spin_lock_irq(&freezer->lock);
+	mutex_lock(&freezer_mutex);
 
 	/*
 	 * Make the new tasks conform to the current state of @new_css.
@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
 		}
 	}
 
-	spin_unlock_irq(&freezer->lock);
-
-	/*
-	 * Propagate FROZEN clearing upwards.  We may race with
-	 * update_if_frozen(), but as long as both work bottom-up, either
-	 * update_if_frozen() sees child's FROZEN cleared or we clear the
-	 * parent's FROZEN later.  No parent w/ !FROZEN children can be
-	 * left FROZEN.
-	 */
+	/* propagate FROZEN clearing upwards */
 	while (clear_frozen && (freezer = parent_freezer(freezer))) {
-		spin_lock_irq(&freezer->lock);
 		freezer->state &= ~CGROUP_FROZEN;
 		clear_frozen = freezer->state & CGROUP_FREEZING;
-		spin_unlock_irq(&freezer->lock);
 	}
+
+	mutex_unlock(&freezer_mutex);
 }
 
 /**
@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task)
 {
 	struct freezer *freezer;
 
-	rcu_read_lock();
-	freezer = task_freezer(task);
-
 	/*
 	 * The root cgroup is non-freezable, so we can skip locking the
 	 * freezer.  This is safe regardless of race with task migration.
@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task)
 	 * to do.  If we lost and root is the new cgroup, noop is still the
 	 * right thing to do.
 	 */
-	if (!parent_freezer(freezer))
-		goto out;
+	if (task_css_is_root(task, freezer_cgrp_id))
+		return;
 
-	/*
-	 * Grab @freezer->lock and freeze @task after verifying @task still
-	 * belongs to @freezer and it's freezing.  The former is for the
-	 * case where we have raced against task migration and lost and
-	 * @task is already in a different cgroup which may not be frozen.
-	 * This isn't strictly necessary as freeze_task() is allowed to be
-	 * called spuriously but let's do it anyway for, if nothing else,
-	 * documentation.
-	 */
-	spin_lock_irq(&freezer->lock);
-	if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
+	mutex_lock(&freezer_mutex);
+	rcu_read_lock();
+
+	freezer = task_freezer(task);
+	if (freezer->state & CGROUP_FREEZING)
 		freeze_task(task);
-	spin_unlock_irq(&freezer->lock);
-out:
+
 	rcu_read_unlock();
+	mutex_unlock(&freezer_mutex);
 }
 
 /**
@@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	spin_lock_irq(&freezer->lock);
+	lockdep_assert_held(&freezer_mutex);
 
 	if (!(freezer->state & CGROUP_FREEZING) ||
 	    (freezer->state & CGROUP_FROZEN))
-		goto out_unlock;
+		return;
 
 	/* are all (live) children frozen? */
+	rcu_read_lock();
 	css_for_each_child(pos, css) {
 		struct freezer *child = css_freezer(pos);
 
 		if ((child->state & CGROUP_FREEZER_ONLINE) &&
-		    !(child->state & CGROUP_FROZEN))
-			goto out_unlock;
+		    !(child->state & CGROUP_FROZEN)) {
+			rcu_read_unlock();
+			return;
+		}
 	}
+	rcu_read_unlock();
 
 	/* are all tasks frozen? */
 	css_task_iter_start(css, &it);
@@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
 	freezer->state |= CGROUP_FROZEN;
 out_iter_end:
 	css_task_iter_end(&it);
-out_unlock:
-	spin_unlock_irq(&freezer->lock);
 }
 
 static int freezer_read(struct seq_file *m, void *v)
 {
 	struct cgroup_subsys_state *css = seq_css(m), *pos;
 
+	mutex_lock(&freezer_mutex);
 	rcu_read_lock();
 
 	/* update states bottom-up */
-	css_for_each_descendant_post(pos, css)
+	css_for_each_descendant_post(pos, css) {
+		if (!css_tryget(pos))
+			continue;
+		rcu_read_unlock();
+
 		update_if_frozen(pos);
 
+		rcu_read_lock();
+		css_put(pos);
+	}
+
 	rcu_read_unlock();
+	mutex_unlock(&freezer_mutex);
 
 	seq_puts(m, freezer_state_strs(css_freezer(css)->state));
 	seq_putc(m, '\n');
@@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
 				unsigned int state)
 {
 	/* also synchronizes against task migration, see freezer_attach() */
-	lockdep_assert_held(&freezer->lock);
+	lockdep_assert_held(&freezer_mutex);
 
 	if (!(freezer->state & CGROUP_FREEZER_ONLINE))
 		return;
@@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
 	 * descendant will try to inherit its parent's FREEZING state as
 	 * CGROUP_FREEZING_PARENT.
 	 */
+	mutex_lock(&freezer_mutex);
 	rcu_read_lock();
 	css_for_each_descendant_pre(pos, &freezer->css) {
 		struct freezer *pos_f = css_freezer(pos);
 		struct freezer *parent = parent_freezer(pos_f);
 
-		spin_lock_irq(&pos_f->lock);
+		if (!css_tryget(pos))
+			continue;
+		rcu_read_unlock();
 
-		if (pos_f == freezer) {
+		if (pos_f == freezer)
 			freezer_apply_state(pos_f, freeze,
 					    CGROUP_FREEZING_SELF);
-		} else {
-			/*
-			 * Our update to @parent->state is already visible
-			 * which is all we need.  No need to lock @parent.
-			 * For more info on synchronization, see
-			 * freezer_post_create().
-			 */
+		else
 			freezer_apply_state(pos_f,
 					    parent->state & CGROUP_FREEZING,
 					    CGROUP_FREEZING_PARENT);
-		}
 
-		spin_unlock_irq(&pos_f->lock);
+		rcu_read_lock();
+		css_put(pos);
 	}
 	rcu_read_unlock();
+	mutex_unlock(&freezer_mutex);
 }
 
 static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6cb20d2e7ee..019d4500844 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-asmlinkage void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 {
 	enum ctx_state prev_ctx;
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6b715c0af1b..e0501fe7140 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	/* Remove an active timer from the queue: */
 	ret = remove_hrtimer(timer, base);
 
-	/* Switch the timer base, if necessary: */
-	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-
 	if (mode & HRTIMER_MODE_REL) {
-		tim = ktime_add_safe(tim, new_base->get_time());
+		tim = ktime_add_safe(tim, base->get_time());
 		/*
 		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
 		 * to signal that they simply return xtime in
@@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
+	/* Switch the timer base, if necessary: */
+	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
 	timer_stats_hrtimer_set_start_info(timer);
 
 	leftmost = enqueue_hrtimer(timer, new_base);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b0e9467922e..d24e4339b46 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
 }
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 
-asmlinkage void lockdep_sys_exit(void)
+asmlinkage __visible void lockdep_sys_exit(void)
 {
 	struct task_struct *curr = current;
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 18fb7a2fb14..1ea328aafdc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 	return -ENOMEM;
 }
 
-asmlinkage int swsusp_save(void)
+asmlinkage __visible int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a45b5096229..7228258b85e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1674,7 +1674,7 @@ EXPORT_SYMBOL(printk_emit);
  *
  * See the vsnprintf() documentation for format string extensions over C99.
  */
-asmlinkage int printk(const char *fmt, ...)
+asmlinkage __visible int printk(const char *fmt, ...)
 {
 	va_list args;
 	int r;
@@ -1737,7 +1737,7 @@ void early_vprintk(const char *fmt, va_list ap)
 	}
 }
 
-asmlinkage void early_printk(const char *fmt, ...)
+asmlinkage __visible void early_printk(const char *fmt, ...)
 {
 	va_list ap;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45ea238..d9d8ece46a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2192,7 +2192,7 @@ static inline void post_schedule(struct rq *rq)
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
  */
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
@@ -2741,7 +2741,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		blk_schedule_flush_plug(tsk);
 }
 
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
 {
 	struct task_struct *tsk = current;
 
@@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void)
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
 {
 	/*
 	 * If we come here after a random call to set_need_resched(),
@@ -2783,7 +2783,7 @@ void __sched schedule_preempt_disabled(void)
  * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2813,7 +2813,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
  */
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 33e4648ae0e..92f24f5e8d5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
 static inline void lockdep_softirq_end(bool in_hardirq) { }
 #endif
 
-asmlinkage void __do_softirq(void)
+asmlinkage __visible void __do_softirq(void)
 {
 	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
 	unsigned long old_flags = current->flags;
@@ -299,7 +299,7 @@ restart:
 	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
 }
 
-asmlinkage void do_softirq(void)
+asmlinkage __visible void do_softirq(void)
 {
 	__u32 pending;
 	unsigned long flags;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ac5b23cf721..6620e5837ce 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp,
 		WARN_ON_ONCE(1);
 		return PTR_ERR(old);
 	}
-	release_probes(old);
 
 	/*
 	 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
 	rcu_assign_pointer(tp->funcs, tp_funcs);
 	if (!static_key_enabled(&tp->key))
 		static_key_slow_inc(&tp->key);
+	release_probes(old);
 	return 0;
 }
 
@@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 		WARN_ON_ONCE(1);
 		return PTR_ERR(old);
 	}
-	release_probes(old);
 
 	if (!tp_funcs) {
 		/* Removed last function */
@@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 			static_key_slow_dec(&tp->key);
 	}
 	rcu_assign_pointer(tp->funcs, tp_funcs);
+	release_probes(old);
 	return 0;
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd..8edc8718542 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1916,6 +1916,12 @@ static void send_mayday(struct work_struct *work)
 
 	/* mayday mayday mayday */
 	if (list_empty(&pwq->mayday_node)) {
+		/*
+		 * If @pwq is for an unbound wq, its base ref may be put at
+		 * any time due to an attribute change.  Pin @pwq until the
+		 * rescuer is done with it.
+		 */
+		get_pwq(pwq);
 		list_add_tail(&pwq->mayday_node, &wq->maydays);
 		wake_up_process(wq->rescuer->task);
 	}
@@ -2398,6 +2404,7 @@ static int rescuer_thread(void *__rescuer)
 	struct worker *rescuer = __rescuer;
 	struct workqueue_struct *wq = rescuer->rescue_wq;
 	struct list_head *scheduled = &rescuer->scheduled;
+	bool should_stop;
 
 	set_user_nice(current, RESCUER_NICE_LEVEL);
 
@@ -2409,11 +2416,15 @@ static int rescuer_thread(void *__rescuer)
 repeat:
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	if (kthread_should_stop()) {
-		__set_current_state(TASK_RUNNING);
-		rescuer->task->flags &= ~PF_WQ_WORKER;
-		return 0;
-	}
+	/*
+	 * By the time the rescuer is requested to stop, the workqueue
+	 * shouldn't have any work pending, but @wq->maydays may still have
+	 * pwq(s) queued.  This can happen by non-rescuer workers consuming
+	 * all the work items before the rescuer got to them.  Go through
+	 * @wq->maydays processing before acting on should_stop so that the
+	 * list is always empty on exit.
+	 */
+	should_stop = kthread_should_stop();
 
 	/* see whether any pwq is asking for help */
 	spin_lock_irq(&wq_mayday_lock);
@@ -2445,6 +2456,12 @@ repeat:
 		process_scheduled_works(rescuer);
 
 		/*
+		 * Put the reference grabbed by send_mayday().  @pool won't
+		 * go away while we're holding its lock.
+		 */
+		put_pwq(pwq);
+
+		/*
 		 * Leave this pool.  If keep_working() is %true, notify a
 		 * regular worker; otherwise, we end up with 0 concurrency
 		 * and stalling the execution.
@@ -2459,6 +2476,12 @@ repeat:
 
 	spin_unlock_irq(&wq_mayday_lock);
 
+	if (should_stop) {
+		__set_current_state(TASK_RUNNING);
+		rescuer->task->flags &= ~PF_WQ_WORKER;
+		return 0;
+	}
+
 	/* rescuers should never participate in concurrency management */
 	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
 	schedule();
@@ -4100,7 +4123,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	if (!pwq) {
 		pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
 			   wq->name);
-		goto out_unlock;
+		mutex_lock(&wq->mutex);
+		goto use_dfl_pwq;
 	}
 
 	/*