17 files changed, 181 insertions, 93 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 06b24b3aa37..993369ee94d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
-#include <linux/signalfd.h>
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/kthread.h>
@@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 
-	/*
-	 * Notify that this sighand has been detached. This must
-	 * be called with the tsk->sighand lock held. Also, this
-	 * access tsk->sighand internally, so it must be called
-	 * before tsk->sighand is reset.
-	 */
-	signalfd_detach_locked(tsk);
-
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7332e236d36..33f12f48684 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
 	struct sighand_struct *sighand = data;
 
 	spin_lock_init(&sighand->siglock);
-	INIT_LIST_HEAD(&sighand->signalfd_list);
+	init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
 void __init proc_caches_init(void)
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f7921360efa..7e52eb051f2 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -61,10 +61,10 @@ void compat_exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&upending, &pending,
 			       &head->list_op_pending, &pip))
 		return;
-	if (upending)
+	if (pending)
 		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
-	while (compat_ptr(uentry) != &head->list) {
+	while (entry != (struct robust_list __user *) &head->list) {
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9809cc1f33d..c6a4f8aebeb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -505,7 +505,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 	if (ret < 0)
 		goto out;
 
-	return call_usermodehelper_exec(sub_info, 1);
+	return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 
   out:
 	call_usermodehelper_freeinfo(sub_info);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 82a558b655d..3eca7a55f2e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
+	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
 	write_lock_irq(&tasklist_lock);
 	/* protect against de_thread()->release_task() */
diff --git a/kernel/sched.c b/kernel/sched.c
index b533d6db78a..6107a0cd632 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -668,7 +668,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
 /*
  * Shift right and round:
  */
-#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
@@ -684,10 +684,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	 * Check whether we'd overflow the 64-bit multiplication:
 	 */
 	if (unlikely(tmp > WMULT_CONST))
-		tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
 			WMULT_SHIFT/2);
 	else
-		tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT);
+		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
 
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
@@ -858,7 +858,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
 	p->se.wait_runtime = 0;
 
 	if (task_has_rt_policy(p)) {
@@ -1683,6 +1682,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
 	p->prio = effective_prio(p);
 
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
+
 	if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
 			(clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
 			!current->se.on_rq) {
@@ -2512,7 +2516,7 @@ group_next:
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
+	if (*imbalance < busiest_load_per_task) {
 		unsigned long tmp, pwr_now, pwr_move;
 		unsigned int imbn;
 
@@ -2564,10 +2568,8 @@ small_imbalance:
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
-		if (pwr_move <= pwr_now)
-			goto out_balanced;
-
-		*imbalance = busiest_load_per_task;
+		if (pwr_move > pwr_now)
+			*imbalance = busiest_load_per_task;
 	}
 
 	return busiest;
@@ -4553,10 +4555,7 @@ asmlinkage long sys_sched_yield(void)
 	struct rq *rq = this_rq_lock();
 
 	schedstat_inc(rq, yld_cnt);
-	if (unlikely(rq->nr_running == 1))
-		schedstat_inc(rq, yld_act_empty);
-	else
-		current->sched_class->yield_task(rq, current);
+	current->sched_class->yield_task(rq, current);
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ab18f45f2ab..c3ee38bd342 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -283,4 +283,5 @@ void proc_sched_set_task(struct task_struct *p)
 	p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
 #endif
 	p->se.sum_exec_runtime = 0;
+	p->se.prev_sum_exec_runtime	= 0;
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ce39282d9c0..c9fbe8e73a4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -43,6 +43,14 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
 unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
 
 /*
+ * sys_sched_yield() compat mode
+ *
+ * This option switches the agressive yield implementation of the
+ * old scheduler back on.
+ */
+unsigned int __read_mostly sysctl_sched_compat_yield;
+
+/*
  * SCHED_BATCH wake-up granularity.
  * (default: 25 msec, units: nanoseconds)
  *
@@ -194,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_add(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
+
+	schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
 
 static inline void
@@ -205,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
+
+	schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
 }
 
 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -291,7 +303,7 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity)
 	/*
 	 * It will always fit into 'long':
 	 */
-	return (long) (tmp >> WMULT_SHIFT);
+	return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
 }
 
 static inline void
@@ -574,7 +586,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	prev_runtime = se->wait_runtime;
 	__add_wait_runtime(cfs_rq, se, delta_fair);
-	schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 	delta_fair = se->wait_runtime - prev_runtime;
 
 	/*
@@ -662,7 +673,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 			if (tsk->state & TASK_UNINTERRUPTIBLE)
 				se->block_start = rq_of(cfs_rq)->clock;
 		}
-		cfs_rq->wait_runtime -= se->wait_runtime;
 #endif
 	}
 	__dequeue_entity(cfs_rq, se);
@@ -671,22 +681,39 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static int
+static void
 __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			  struct sched_entity *curr, unsigned long granularity)
 {
 	s64 __delta = curr->fair_key - se->fair_key;
+	unsigned long ideal_runtime, delta_exec;
+
+	/*
+	 * ideal_runtime is compared against sum_exec_runtime, which is
+	 * walltime, hence do not scale.
+	 */
+	ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
+			(unsigned long)sysctl_sched_min_granularity);
+
+	/*
+	 * If we executed more than what the latency constraint suggests,
+	 * reduce the rescheduling granularity. This way the total latency
+	 * of how much a task is not scheduled converges to
+	 * sysctl_sched_latency:
+	 */
+	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+	if (delta_exec > ideal_runtime)
+		granularity = 0;
 
 	/*
 	 * Take scheduling granularity into account - do not
 	 * preempt the current task unless the best task has
 	 * a larger than sched_granularity fairness advantage:
+	 *
+	 * scale granularity as key space is in fair_clock.
 	 */
-	if (__delta > niced_granularity(curr, granularity)) {
+	if (__delta > niced_granularity(curr, granularity))
 		resched_task(rq_of(cfs_rq)->curr);
-		return 1;
-	}
-	return 0;
 }
 
 static inline void
@@ -702,6 +729,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_stats_wait_end(cfs_rq, se);
 	update_stats_curr_start(cfs_rq, se);
 	set_cfs_rq_curr(cfs_rq, se);
+	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
@@ -731,7 +759,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 
 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	unsigned long gran, ideal_runtime, delta_exec;
 	struct sched_entity *next;
 
 	/*
@@ -748,22 +775,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	if (next == curr)
 		return;
 
-	gran = sched_granularity(cfs_rq);
-	ideal_runtime = niced_granularity(curr,
-		max(sysctl_sched_latency / cfs_rq->nr_running,
-		    (unsigned long)sysctl_sched_min_granularity));
-	/*
-	 * If we executed more than what the latency constraint suggests,
-	 * reduce the rescheduling granularity. This way the total latency
-	 * of how much a task is not scheduled converges to
-	 * sysctl_sched_latency:
-	 */
-	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	if (delta_exec > ideal_runtime)
-		gran = 0;
-
-	if (__check_preempt_curr_fair(cfs_rq, next, curr, gran))
-		curr->prev_sum_exec_runtime = curr->sum_exec_runtime;
+	__check_preempt_curr_fair(cfs_rq, next, curr,
+			sched_granularity(cfs_rq));
 }
 
 /**************************************************
@@ -892,19 +905,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 }
 
 /*
- * sched_yield() support is very simple - we dequeue and enqueue
+ * sched_yield() support is very simple - we dequeue and enqueue.
+ *
+ * If compat_yield is turned on then we requeue to the end of the tree.
  */
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+	struct sched_entity *rightmost, *se = &p->se;
+	struct rb_node *parent;
 
-	__update_rq_clock(rq);
 	/*
-	 * Dequeue and enqueue the task to update its
-	 * position within the tree:
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(cfs_rq->nr_running == 1))
+		return;
+
+	if (likely(!sysctl_sched_compat_yield)) {
+		__update_rq_clock(rq);
+		/*
+		 * Dequeue and enqueue the task to update its
+		 * position within the tree:
+		 */
+		dequeue_entity(cfs_rq, &p->se, 0);
+		enqueue_entity(cfs_rq, &p->se, 0);
+
+		return;
+	}
+	/*
+	 * Find the rightmost entry in the rbtree:
+	 */
+	do {
+		parent = *link;
+		link = &parent->rb_right;
+	} while (*link);
+
+	rightmost = rb_entry(parent, struct sched_entity, run_node);
+	/*
+	 * Already in the rightmost position?
+	 */
+	if (unlikely(rightmost == se))
+		return;
+
+	/*
+	 * Minimally necessary key value to be last in the tree:
+	 */
+	se->fair_key = rightmost->fair_key + 1;
+
+	if (cfs_rq->rb_leftmost == &se->run_node)
+		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	/*
+	 * Relink the task to the rightmost position:
 	 */
-	dequeue_entity(cfs_rq, &p->se, 0);
-	enqueue_entity(cfs_rq, &p->se, 0);
+	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+	rb_link_node(&se->run_node, parent, link);
+	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
 /*
@@ -1121,10 +1177,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	 * The statistical average of wait_runtime is about
 	 * -granularity/2, so initialize the task with that:
 	 */
-	if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) {
+	if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
 		se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
-		schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
-	}
 
 	__enqueue_entity(cfs_rq, se);
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 3169bed0b4d..9fb91a32edd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	/* We only dequeue private signals from ourselves, we don't let
 	 * signalfd steal them
 	 */
-	if (likely(tsk == current))
-		signr = __dequeue_signal(&tsk->pending, mask, info);
+	signr = __dequeue_signal(&tsk->pending, mask, info);
 	if (!signr) {
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info);
@@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 			}
 		}
 	}
-	if (likely(tsk == current))
-		recalc_sigpending();
+	recalc_sigpending();
 	if (signr && unlikely(sig_kernel_stop(signr))) {
 		/*
 		 * Set a marker that we have dequeued a stop signal.  Our
@@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
 			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
 	}
-	if (signr && likely(tsk == current) &&
+	if (signr &&
 	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
 	     info->si_sys_private){
 		/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6ace893c17c..53a456ebf6d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -303,6 +303,14 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_compat_yield",
+		.data		= &sysctl_sched_compat_yield,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index cd91237dbfe..de6a2d6b3eb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -226,7 +226,7 @@ static void sync_cmos_clock(unsigned long dummy)
 
 static void notify_cmos_timer(void)
 {
-	if (no_sync_cmos_clock)
+	if (!no_sync_cmos_clock)
 		mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index db8e0f3d409..0962e057766 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -383,11 +383,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
 int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
 	clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
-
-	if(!cpus_empty(tick_broadcast_oneshot_mask))
-		tick_broadcast_set_event(ktime_get(), 1);
-
-	return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask);
+	return 0;
 }
 
 /*
@@ -549,20 +545,17 @@ void tick_broadcast_switch_to_oneshot(void)
  */
 void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 {
-	struct clock_event_device *bc;
 	unsigned long flags;
 	unsigned int cpu = *cpup;
 
 	spin_lock_irqsave(&tick_broadcast_lock, flags);
 
-	bc = tick_broadcast_device.evtdev;
+	/*
+	 * Clear the broadcast mask flag for the dead cpu, but do not
+	 * stop the broadcast device!
+	 */
 	cpu_clear(cpu, tick_broadcast_oneshot_mask);
 
-	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
-		if (bc && cpus_empty(tick_broadcast_oneshot_mask))
-			clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
-	}
-
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b416995b975..8c3fef1db09 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void)
 	cpu = smp_processor_id();
 	ts = &per_cpu(tick_cpu_sched, cpu);
 
+	/*
+	 * If this cpu is offline and it is the one which updates
+	 * jiffies, then give up the assignment and let it be taken by
+	 * the cpu which runs the tick timer next. If we don't drop
+	 * this here the jiffies might be stale and do_timer() never
+	 * invoked.
+	 */
+	if (unlikely(!cpu_online(cpu))) {
+		if (cpu == tick_do_timer_cpu)
+			tick_do_timer_cpu = -1;
+	}
+
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		goto end;
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index acc417b5a9b..4ad79f6bdec 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -217,6 +217,7 @@ static void change_clocksource(void)
 }
 #else
 static inline void change_clocksource(void) { }
+static inline s64 __get_nsec_offset(void) { return 0; }
 #endif
 
 /**
@@ -280,6 +281,8 @@ void __init timekeeping_init(void)
 static int timekeeping_suspended;
 /* time in seconds when suspend began */
 static unsigned long timekeeping_suspend_time;
+/* xtime offset when we went into suspend */
+static s64 timekeeping_suspend_nsecs;
 
 /**
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -305,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev)
 		wall_to_monotonic.tv_sec -= sleep_length;
 		total_sleep_time += sleep_length;
 	}
+	/* Make sure that we have the correct xtime reference */
+	timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
 	/* re-base the last cycle value */
 	clock->cycle_last = clocksource_read(clock);
 	clock->error = 0;
@@ -325,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
 {
 	unsigned long flags;
 
+	timekeeping_suspend_time = read_persistent_clock();
+
 	write_seqlock_irqsave(&xtime_lock, flags);
+	/* Get the current xtime offset */
+	timekeeping_suspend_nsecs = __get_nsec_offset();
 	timekeeping_suspended = 1;
-	timekeeping_suspend_time = read_persistent_clock();
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
 	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
diff --git a/kernel/user.c b/kernel/user.c
index e7d11cef699..9ca2848fc35 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,25 +55,22 @@ struct user_struct root_user = {
 /*
  * These routines must be called with the uidhash spinlock held!
  */
-static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
+static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
 {
-	list_add(&up->uidhash_list, hashent);
+	hlist_add_head(&up->uidhash_node, hashent);
 }
 
 static inline void uid_hash_remove(struct user_struct *up)
 {
-	list_del(&up->uidhash_list);
+	hlist_del_init(&up->uidhash_node);
 }
 
-static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent)
+static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 {
-	struct list_head *up;
-
-	list_for_each(up, hashent) {
-		struct user_struct *user;
-
-		user = list_entry(up, struct user_struct, uidhash_list);
+	struct user_struct *user;
+	struct hlist_node *h;
 
+	hlist_for_each_entry(user, h, hashent, uidhash_node) {
 		if(user->uid == uid) {
 			atomic_inc(&user->__count);
 			return user;
@@ -122,7 +119,7 @@ void free_uid(struct user_struct *up)
 
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 {
-	struct list_head *hashent = uidhashentry(ns, uid);
+	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up;
 
 	spin_lock_irq(&uidhash_lock);
@@ -202,6 +199,30 @@ void switch_uid(struct user_struct *new_user)
 	suid_keys(current);
 }
 
+void release_uids(struct user_namespace *ns)
+{
+	int i;
+	unsigned long flags;
+	struct hlist_head *head;
+	struct hlist_node *nd;
+
+	spin_lock_irqsave(&uidhash_lock, flags);
+	/*
+	 * collapse the chains so that the user_struct-s will
+	 * be still alive, but not in hashes. subsequent free_uid()
+	 * will free them.
+	 */
+	for (i = 0; i < UIDHASH_SZ; i++) {
+		head = ns->uidhash_table + i;
+		while (!hlist_empty(head)) {
+			nd = head->first;
+			hlist_del_init(nd);
+		}
+	}
+	spin_unlock_irqrestore(&uidhash_lock, flags);
+
+	free_uid(ns->root_user);
+}
 
 static int __init uid_cache_init(void)
 {
@@ -211,7 +232,7 @@ static int __init uid_cache_init(void)
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
-		INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
+		INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 85af9422ea6..7af90fc4f0f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 	kref_init(&ns->kref);
 
 	for (n = 0; n < UIDHASH_SZ; ++n)
-		INIT_LIST_HEAD(ns->uidhash_table + n);
+		INIT_HLIST_HEAD(ns->uidhash_table + n);
 
 	/* Insert new root user.  */
 	ns->root_user = alloc_uid(ns, 0);
@@ -81,7 +81,7 @@ void free_user_ns(struct kref *kref)
 	struct user_namespace *ns;
 
 	ns = container_of(kref, struct user_namespace, kref);
-	free_uid(ns->root_user);
+	release_uids(ns);
 	kfree(ns);
 }
 
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 9d8180a0f0d..816d7b24fa0 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 	if (!ns)
 		return ERR_PTR(-ENOMEM);
 
+	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+	up_read(&uts_sem);
 	kref_init(&ns->kref);
 	return ns;
 }