From 038e7332b8d4c0629a2965e3ede1a92e8e427bd6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 14 Jun 2012 02:31:10 -0700
Subject: userns: make each net (net_ns) belong to a user_ns

The user namespace which creates a new network namespace owns that
namespace and all resources created in it.  This way we can target
capability checks for privileged operations against network resources to
the user_ns which created the network namespace in which the resource
lives.  Privilege to the user namespace which owns the network
namespace, or any parent user namespace thereof, provides the same
privilege to the network resource.

This patch is reworked from a version originally by
Serge E. Hallyn <serge.hallyn@canonical.com>

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/nsproxy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b576f7f14bc..7e1c3de1ce4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -90,7 +90,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_pid;
 	}
 
-	new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
+	new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns);
 	if (IS_ERR(new_nsp->net_ns)) {
 		err = PTR_ERR(new_nsp->net_ns);
 		goto out_net;
-- 
cgit v1.2.3-70-g09d2


From 49f4d8b93ccf9454284b6f524b96c66d8d7fbccc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 2 Aug 2012 04:25:10 -0700
Subject: pidns: Capture the user namespace and filter ns_last_pid

- Capture the the user namespace that creates the pid namespace
- Use that user namespace to test if it is ok to write to
  /proc/sys/kernel/ns_last_pid.

Zhao Hongjiang <zhaohongjiang@huawei.com> noticed I was missing a put_user_ns
in when destroying a pid_ns.  I have foloded his patch into this one
so that bisects will work properly.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/pid_namespace.h |  8 +++++---
 kernel/nsproxy.c              |  2 +-
 kernel/pid.c                  |  1 +
 kernel/pid_namespace.c        | 17 ++++++++++++-----
 4 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 65e3e87eacc..c89c9cfcd24 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -31,6 +31,7 @@ struct pid_namespace {
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
 #endif
+	struct user_namespace *user_ns;
 	kgid_t pid_gid;
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */
@@ -46,7 +47,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
 	return ns;
 }
 
-extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns);
+extern struct pid_namespace *copy_pid_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct pid_namespace *ns);
 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
 extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
 extern void put_pid_ns(struct pid_namespace *ns);
@@ -59,8 +61,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
 	return ns;
 }
 
-static inline struct pid_namespace *
-copy_pid_ns(unsigned long flags, struct pid_namespace *ns)
+static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct pid_namespace *ns)
 {
 	if (flags & CLONE_NEWPID)
 		ns = ERR_PTR(-EINVAL);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 7e1c3de1ce4..ca27d2c5264 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -84,7 +84,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_ipc;
 	}
 
-	new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
+	new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), task_active_pid_ns(tsk));
 	if (IS_ERR(new_nsp->pid_ns)) {
 		err = PTR_ERR(new_nsp->pid_ns);
 		goto out_pid;
diff --git a/kernel/pid.c b/kernel/pid.c
index aebd4f5aaf4..2a624f1486e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -78,6 +78,7 @@ struct pid_namespace init_pid_ns = {
 	.last_pid = 0,
 	.level = 0,
 	.child_reaper = &init_task,
+	.user_ns = &init_user_ns,
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb7..b2604950aa5 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
 
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
@@ -74,7 +75,8 @@ err_alloc:
 /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
 #define MAX_PID_NS_LEVEL 32
 
-static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
+static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
+	struct pid_namespace *parent_pid_ns)
 {
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
@@ -102,6 +104,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 	kref_init(&ns->kref);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
+	ns->user_ns = get_user_ns(user_ns);
 
 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -117,6 +120,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 
 out_put_parent_pid_ns:
 	put_pid_ns(parent_pid_ns);
+	put_user_ns(user_ns);
 out_free_map:
 	kfree(ns->pidmap[0].page);
 out_free:
@@ -131,16 +135,18 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
+	put_user_ns(ns->user_ns);
 	kmem_cache_free(pid_ns_cachep, ns);
 }
 
-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct pid_namespace *old_ns)
 {
 	if (!(flags & CLONE_NEWPID))
 		return get_pid_ns(old_ns);
 	if (flags & (CLONE_THREAD|CLONE_PARENT))
 		return ERR_PTR(-EINVAL);
-	return create_pid_namespace(old_ns);
+	return create_pid_namespace(user_ns, old_ns);
 }
 
 static void free_pid_ns(struct kref *kref)
@@ -239,9 +245,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
 	struct ctl_table tmp = *table;
 
-	if (write && !capable(CAP_SYS_ADMIN))
+	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
 	/*
@@ -250,7 +257,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 * it should synchronize its usage with external means.
 	 */
 
-	tmp.data = &current->nsproxy->pid_ns->last_pid;
+	tmp.data = &pid_ns->last_pid;
 	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 2 Mar 2010 14:51:53 -0800
Subject: pidns: Use task_active_pid_ns where appropriate

The expressions tsk->nsproxy->pid_ns and task_active_pid_ns
aka ns_of_pid(task_pid(tsk)) should have the same number of
cache line misses with the practical difference that
ns_of_pid(task_pid(tsk)) is released later in a processes life.

Furthermore by using task_active_pid_ns it becomes trivial
to write an unshare implementation for the the pid namespace.

So I have used task_active_pid_ns everywhere I can.

In fork since the pid has not yet been attached to the
process I use ns_of_pid, to achieve the same effect.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
 arch/um/drivers/mconsole_kern.c           | 2 +-
 drivers/staging/android/binder.c          | 3 ++-
 fs/hppfs/hppfs.c                          | 2 +-
 fs/proc/root.c                            | 2 +-
 kernel/cgroup.c                           | 2 +-
 kernel/events/core.c                      | 2 +-
 kernel/fork.c                             | 2 +-
 kernel/nsproxy.c                          | 2 +-
 kernel/pid.c                              | 8 ++++----
 kernel/signal.c                           | 2 +-
 kernel/sysctl_binary.c                    | 2 +-
 12 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 965d381abd7..25db92a8e1c 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1094,7 +1094,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 		LOAD_INT(c), LOAD_FRAC(c),
 		count_active_contexts(),
 		atomic_read(&nr_spu_contexts),
-		current->nsproxy->pid_ns->last_pid);
+		task_active_pid_ns(current)->last_pid);
 	return 0;
 }
 
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 79ccfe6c707..7fc71c62826 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req)
 
 void mconsole_proc(struct mc_request *req)
 {
-	struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
+	struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
 	char *buf;
 	int len;
 	struct file *file;
diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index 5d4610babd8..a97bbcd1c9e 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -33,6 +33,7 @@
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/pid_namespace.h>
 
 #include "binder.h"
 
@@ -2344,7 +2345,7 @@ retry:
 		if (t->from) {
 			struct task_struct *sender = t->from->proc->tsk;
 			tr.sender_pid = task_tgid_nr_ns(sender,
-							current->nsproxy->pid_ns);
+							task_active_pid_ns(current));
 		} else {
 			tr.sender_pid = 0;
 		}
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 78f21f8dc2e..43b315f2002 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	struct vfsmount *proc_mnt;
 	int err = -ENOENT;
 
-	proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
+	proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
 	if (IS_ERR(proc_mnt))
 		goto out;
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 13ef6247e7a..fc1609321a7 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -106,7 +106,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		ns = (struct pid_namespace *)data;
 		options = NULL;
 	} else {
-		ns = current->nsproxy->pid_ns;
+		ns = task_active_pid_ns(current);
 		options = data;
 	}
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620d..0dbfba2efa7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3390,7 +3390,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 {
 	struct cgroup_pidlist *l;
 	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = current->nsproxy->pid_ns;
+	struct pid_namespace *ns = task_active_pid_ns(current);
 
 	/*
 	 * We can't drop the pidlist_mutex before taking the l->mutex in case
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83c134..738f3564e83 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	event->parent		= parent_event;
 
-	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
+	event->ns		= get_pid_ns(task_active_pid_ns(current));
 	event->id		= atomic64_inc_return(&perf_event_id);
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa..7798c247f4b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1442,7 +1442,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 		if (thread_group_leader(p)) {
 			if (is_child_reaper(pid))
-				p->nsproxy->pid_ns->child_reaper = p;
+				ns_of_pid(pid)->child_reaper = p;
 
 			p->signal->leader_pid = pid;
 			p->signal->tty = tty_kref_get(current->signal->tty);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ca27d2c5264..acc92680381 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -84,7 +84,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_ipc;
 	}
 
-	new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), task_active_pid_ns(tsk));
+	new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns);
 	if (IS_ERR(new_nsp->pid_ns)) {
 		err = PTR_ERR(new_nsp->pid_ns);
 		goto out_pid;
diff --git a/kernel/pid.c b/kernel/pid.c
index 2a624f1486e..3a5f238c1ca 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
 
 struct pid *find_vpid(int nr)
 {
-	return find_pid_ns(nr, current->nsproxy->pid_ns);
+	return find_pid_ns(nr, task_active_pid_ns(current));
 }
 EXPORT_SYMBOL_GPL(find_vpid);
 
@@ -429,7 +429,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 
 struct task_struct *find_task_by_vpid(pid_t vnr)
 {
-	return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
 }
 
 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -484,7 +484,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
 
 pid_t pid_vnr(struct pid *pid)
 {
-	return pid_nr_ns(pid, current->nsproxy->pid_ns);
+	return pid_nr_ns(pid, task_active_pid_ns(current));
 }
 EXPORT_SYMBOL_GPL(pid_vnr);
 
@@ -495,7 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 
 	rcu_read_lock();
 	if (!ns)
-		ns = current->nsproxy->pid_ns;
+		ns = task_active_pid_ns(current);
 	if (likely(pid_alive(task))) {
 		if (type != PIDTYPE_PID)
 			task = task->group_leader;
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d..b2445d86f22 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1752,7 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	 * see comment in do_notify_parent() about the following 4 lines
 	 */
 	rcu_read_lock();
-	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+	info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
 	info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
 	rcu_read_unlock();
 
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4..5a638445050 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 		goto out_putname;
 	}
 
-	mnt = current->nsproxy->pid_ns->proc_mnt;
+	mnt = task_active_pid_ns(current)->proc_mnt;
 	file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
 	result = PTR_ERR(file);
 	if (IS_ERR(file))
-- 
cgit v1.2.3-70-g09d2


From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Aug 2012 10:33:47 -0700
Subject: pidns: Make the pidns proc mount/umount logic obvious.

Track the number of pids in the proc hash table.  When the number of
pids goes to 0 schedule work to unmount the kernel mount of proc.

Move the mount of proc into alloc_pid when we allocate the pid for
init.

Remove the surprising calls of pid_ns_release proc in fork and
proc_flush_task.  Those code paths really shouldn't know about proc
namespace implementation details and people have demonstrated several
times that finding and understanding those code paths is difficult and
non-obvious.

Because of the call path detach pid is alwasy called with the
rtnl_lock held free_pid is not allowed to sleep, so the work to
unmounting proc is moved to a work queue.  This has the side benefit
of not blocking the entire world waiting for the unnecessary
rcu_barrier in deactivate_locked_super.

In the process of making the code clear and obvious this fixes a bug
reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a
mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns
succeeded and copy_net_ns failed.

Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/base.c                |  4 ----
 fs/proc/root.c                |  5 -----
 include/linux/pid_namespace.h |  2 ++
 kernel/fork.c                 |  2 --
 kernel/pid.c                  | 21 +++++++++++++++++----
 kernel/pid_namespace.c        | 14 +++++++-------
 6 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6177fc238fd..7621dc51cff 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task)
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index fc1609321a7..f2f251158d3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -155,11 +155,6 @@ void __init proc_root_init(void)
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
-	err = pid_ns_prepare_proc(&init_pid_ns);
-	if (err) {
-		unregister_filesystem(&proc_fs_type);
-		return;
-	}
 
 	proc_self_init();
 	proc_symlink("mounts", NULL, "self/mounts");
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c89c9cfcd24..4c96acdb248 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -21,6 +21,7 @@ struct pid_namespace {
 	struct kref kref;
 	struct pidmap pidmap[PIDMAP_ENTRIES];
 	int last_pid;
+	int nr_hashed;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
@@ -32,6 +33,7 @@ struct pid_namespace {
 	struct bsd_acct_struct *bacct;
 #endif
 	struct user_namespace *user_ns;
+	struct work_struct proc_work;
 	kgid_t pid_gid;
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7798c247f4b..666dc8b0660 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1476,8 +1476,6 @@ bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
 bad_fork_cleanup_namespaces:
-	if (unlikely(clone_flags & CLONE_NEWPID))
-		pid_ns_release_proc(p->nsproxy->pid_ns);
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
diff --git a/kernel/pid.c b/kernel/pid.c
index 3a5f238c1ca..e957f8b0913 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
+#include <linux/proc_fs.h>
 
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -270,8 +271,12 @@ void free_pid(struct pid *pid)
 	unsigned long flags;
 
 	spin_lock_irqsave(&pidmap_lock, flags);
-	for (i = 0; i <= pid->level; i++)
-		hlist_del_rcu(&pid->numbers[i].pid_chain);
+	for (i = 0; i <= pid->level; i++) {
+		struct upid *upid = pid->numbers + i;
+		hlist_del_rcu(&upid->pid_chain);
+		if (--upid->ns->nr_hashed == 0)
+			schedule_work(&upid->ns->proc_work);
+	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
 	for (i = 0; i <= pid->level; i++)
@@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 		goto out;
 
 	tmp = ns;
+	pid->level = ns->level;
 	for (i = ns->level; i >= 0; i--) {
 		nr = alloc_pidmap(tmp);
 		if (nr < 0)
@@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 		tmp = tmp->parent;
 	}
 
+	if (unlikely(is_child_reaper(pid))) {
+		if (pid_ns_prepare_proc(ns))
+			goto out_free;
+	}
+
 	get_pid_ns(ns);
-	pid->level = ns->level;
 	atomic_set(&pid->count, 1);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
-	for ( ; upid >= pid->numbers; --upid)
+	for ( ; upid >= pid->numbers; --upid) {
 		hlist_add_head_rcu(&upid->pid_chain,
 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+		upid->ns->nr_hashed++;
+	}
 	spin_unlock_irq(&pidmap_lock);
 
 out:
@@ -570,6 +582,7 @@ void __init pidmap_init(void)
 	/* Reserve PID 0. We never call free_pidmap(0) */
 	set_bit(0, init_pid_ns.pidmap[0].page);
 	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+	init_pid_ns.nr_hashed = 1;
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b2604950aa5..84591cfeefc 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -72,6 +72,12 @@ err_alloc:
 	return NULL;
 }
 
+static void proc_cleanup_work(struct work_struct *work)
+{
+	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
+	pid_ns_release_proc(ns);
+}
+
 /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
 #define MAX_PID_NS_LEVEL 32
 
@@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
+	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	for (i = 1; i < PIDMAP_ENTRIES; i++)
 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 
-	err = pid_ns_prepare_proc(ns);
-	if (err)
-		goto out_put_parent_pid_ns;
-
 	return ns;
 
-out_put_parent_pid_ns:
-	put_pid_ns(parent_pid_ns);
-	put_user_ns(user_ns);
 out_free_map:
 	kfree(ns->pidmap[0].page);
 out_free:
-- 
cgit v1.2.3-70-g09d2


From 5e1182deb81ae8c68494017c4a8a71811659c870 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 12 Jul 2010 18:50:25 -0700
Subject: pidns: Don't allow new processes in a dead pid namespace.

Set nr_hashed to -1 just before we schedule the work to cleanup proc.
Test nr_hashed just before we hash a new pid and if nr_hashed is < 0
fail.

This guaranteees that processes never enter a pid namespaces after we
have cleaned up the state to support processes in a pid namespace.

Currently sending SIGKILL to all of the process in a pid namespace as
init exists gives us this guarantee but we need something a little
stronger to support unsharing and joining a pid namespace.

Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/pid.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index e957f8b0913..9c219117af3 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -274,8 +274,10 @@ void free_pid(struct pid *pid)
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		hlist_del_rcu(&upid->pid_chain);
-		if (--upid->ns->nr_hashed == 0)
+		if (--upid->ns->nr_hashed == 0) {
+			upid->ns->nr_hashed = -1;
 			schedule_work(&upid->ns->proc_work);
+		}
 	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
@@ -321,6 +323,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
+	if (ns->nr_hashed < 0)
+		goto out_unlock;
 	for ( ; upid >= pid->numbers; --upid) {
 		hlist_add_head_rcu(&upid->pid_chain,
 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
@@ -331,6 +335,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 out:
 	return pid;
 
+out_unlock:
+	spin_unlock(&pidmap_lock);
 out_free:
 	while (++i <= ns->level)
 		free_pidmap(pid->numbers + i);
-- 
cgit v1.2.3-70-g09d2


From af4b8a83add95ef40716401395b44a1b579965f4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Aug 2012 15:03:42 -0700
Subject: pidns: Wait in zap_pid_ns_processes until pid_ns->nr_hashed == 1

Looking at pid_ns->nr_hashed is a bit simpler and it works for
disjoint process trees that an unshare or a join of a pid_namespace
may create.

Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/exit.c          | 12 ------------
 kernel/pid.c           | 16 +++++++++++++---
 kernel/pid_namespace.c | 15 ++++-----------
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 346616c0092..d7fe58db452 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
-		/*
-		 * If we are the last child process in a pid namespace to be
-		 * reaped, notify the reaper sleeping zap_pid_ns_processes().
-		 */
-		if (IS_ENABLED(CONFIG_PID_NS)) {
-			struct task_struct *parent = p->real_parent;
-
-			if ((task_active_pid_ns(parent)->child_reaper == parent) &&
-			    list_empty(&parent->children) &&
-			    (parent->flags & PF_EXITING))
-				wake_up_process(parent);
-		}
 	}
 	list_del_rcu(&p->thread_group);
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 9c219117af3..6e8da291de4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -273,10 +273,20 @@ void free_pid(struct pid *pid)
 	spin_lock_irqsave(&pidmap_lock, flags);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
+		struct pid_namespace *ns = upid->ns;
 		hlist_del_rcu(&upid->pid_chain);
-		if (--upid->ns->nr_hashed == 0) {
-			upid->ns->nr_hashed = -1;
-			schedule_work(&upid->ns->proc_work);
+		switch(--ns->nr_hashed) {
+		case 1:
+			/* When all that is left in the pid namespace
+			 * is the reaper wake up the reaper.  The reaper
+			 * may be sleeping in zap_pid_ns_processes().
+			 */
+			wake_up_process(ns->child_reaper);
+			break;
+		case 0:
+			ns->nr_hashed = -1;
+			schedule_work(&ns->proc_work);
+			break;
 		}
 	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 84591cfeefc..3cc29b830e9 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -217,22 +217,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 
 	/*
 	 * sys_wait4() above can't reap the TASK_DEAD children.
-	 * Make sure they all go away, see __unhash_process().
+	 * Make sure they all go away, see free_pid().
 	 */
 	for (;;) {
-		bool need_wait = false;
-
-		read_lock(&tasklist_lock);
-		if (!list_empty(&current->children)) {
-			__set_current_state(TASK_UNINTERRUPTIBLE);
-			need_wait = true;
-		}
-		read_unlock(&tasklist_lock);
-
-		if (!need_wait)
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (pid_ns->nr_hashed == 1)
 			break;
 		schedule();
 	}
+	__set_current_state(TASK_RUNNING);
 
 	if (pid_ns->reboot)
 		current->signal->group_exit_code = pid_ns->reboot;
-- 
cgit v1.2.3-70-g09d2


From 225778d68d98e7cfe2579f8d8b2d7b76f8541b8b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 2 Aug 2012 08:35:35 -0700
Subject: pidns: Deny strange cases when creating pid namespaces.

task_active_pid_ns(current) != current->ns_proxy->pid_ns will
soon be allowed to support unshare and setns.

The definition of creating a child pid namespace when
task_active_pid_ns(current) != current->ns_proxy->pid_ns could be that
we create a child pid namespace of current->ns_proxy->pid_ns.  However
that leads to strange cases like trying to have a single process be
init in multiple pid namespaces, which is racy and hard to think
about.

The definition of creating a child pid namespace when
task_active_pid_ns(current) != current->ns_proxy->pid_ns could be that
we create a child pid namespace of task_active_pid_ns(current).  While
that seems less racy it does not provide any utility.

Therefore define the semantics of creating a child pid namespace when
task_active_pid_ns(current) != current->ns_proxy->pid_ns to be that the
pid namespace creation fails.  That is easy to implement and easy
to think about.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid_namespace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 3cc29b830e9..0dbbc66b6ec 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -146,6 +146,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
 		return get_pid_ns(old_ns);
 	if (flags & (CLONE_THREAD|CLONE_PARENT))
 		return ERR_PTR(-EINVAL);
+	if (task_active_pid_ns(current) != old_ns)
+		return ERR_PTR(-EINVAL);
 	return create_pid_namespace(user_ns, old_ns);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 57e8391d327609cbf12d843259c968b9e5c1838f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 7 Mar 2010 18:17:03 -0800
Subject: pidns: Add setns support

- Pid namespaces are designed to be inescapable so verify that the
  passed in pid namespace is a child of the currently active
  pid namespace or the currently active pid namespace itself.

  Allowing the currently active pid namespace is important so
  the effects of an earlier setns can be cancelled.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/namespaces.c    |  3 +++
 include/linux/proc_fs.h |  1 +
 kernel/pid_namespace.c  | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+)

(limited to 'kernel')

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b178ed733c3..85ca047e35f 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -24,6 +24,9 @@ static const struct proc_ns_operations *ns_entries[] = {
 #ifdef CONFIG_IPC_NS
 	&ipcns_operations,
 #endif
+#ifdef CONFIG_PID_NS
+	&pidns_operations,
+#endif
 };
 
 static const struct file_operations ns_file_operations = {
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 3fd2e871ff1..acaafcd40aa 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -251,6 +251,7 @@ struct proc_ns_operations {
 extern const struct proc_ns_operations netns_operations;
 extern const struct proc_ns_operations utsns_operations;
 extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations pidns_operations;
 
 union proc_op {
 	int (*proc_get_link)(struct dentry *, struct path *);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0dbbc66b6ec..f78fc48c86b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -301,6 +301,60 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 	return 0;
 }
 
+static void *pidns_get(struct task_struct *task)
+{
+	struct pid_namespace *ns;
+
+	rcu_read_lock();
+	ns = get_pid_ns(task_active_pid_ns(task));
+	rcu_read_unlock();
+
+	return ns;
+}
+
+static void pidns_put(void *ns)
+{
+	put_pid_ns(ns);
+}
+
+static int pidns_install(struct nsproxy *nsproxy, void *ns)
+{
+	struct pid_namespace *active = task_active_pid_ns(current);
+	struct pid_namespace *ancestor, *new = ns;
+
+	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/*
+	 * Only allow entering the current active pid namespace
+	 * or a child of the current active pid namespace.
+	 *
+	 * This is required for fork to return a usable pid value and
+	 * this maintains the property that processes and their
+	 * children can not escape their current pid namespace.
+	 */
+	if (new->level < active->level)
+		return -EINVAL;
+
+	ancestor = new;
+	while (ancestor->level > active->level)
+		ancestor = ancestor->parent;
+	if (ancestor != active)
+		return -EINVAL;
+
+	put_pid_ns(nsproxy->pid_ns);
+	nsproxy->pid_ns = get_pid_ns(new);
+	return 0;
+}
+
+const struct proc_ns_operations pidns_operations = {
+	.name		= "pid",
+	.type		= CLONE_NEWPID,
+	.get		= pidns_get,
+	.put		= pidns_put,
+	.install	= pidns_install,
+};
+
 static __init int pid_namespaces_init(void)
 {
 	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
-- 
cgit v1.2.3-70-g09d2


From 1c4042c29bd2e85aac4110552ca8ade763762e84 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 12 Jul 2010 17:10:36 -0700
Subject: pidns: Consolidate initialzation of special init task state

Instead of setting child_reaper and SIGNAL_UNKILLABLE one way
for the system init process, and another way for pid namespace
init processes test pid->nr == 1 and use the same code for both.

For the global init this results in SIGNAL_UNKILLABLE being set
much earlier in the initialization process.

This is a small cleanup and it paves the way for allowing unshare and
enter of the pid namespace as that path like our global init also will
not set CLONE_NEWPID.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 init/main.c   | 1 -
 kernel/fork.c | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/init/main.c b/init/main.c
index 9cf77ab138a..317750a18f7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -810,7 +810,6 @@ static int __ref kernel_init(void *unused)
 	system_state = SYSTEM_RUNNING;
 	numa_default_policy();
 
-	current->signal->flags |= SIGNAL_UNKILLABLE;
 	flush_delayed_fput();
 
 	if (ramdisk_execute_command) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 666dc8b0660..0f2bbce311f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1039,8 +1039,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	atomic_set(&sig->live, 1);
 	atomic_set(&sig->sigcnt, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
-	if (clone_flags & CLONE_NEWPID)
-		sig->flags |= SIGNAL_UNKILLABLE;
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
@@ -1441,8 +1439,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
 
 		if (thread_group_leader(p)) {
-			if (is_child_reaper(pid))
+			if (is_child_reaper(pid)) {
 				ns_of_pid(pid)->child_reaper = p;
+				p->signal->flags |= SIGNAL_UNKILLABLE;
+			}
 
 			p->signal->leader_pid = pid;
 			p->signal->tty = tty_kref_get(current->signal->tty);
-- 
cgit v1.2.3-70-g09d2


From 50804fe3737ca6a5942fdc2057a18a8141d00141 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 2 Mar 2010 15:41:50 -0800
Subject: pidns: Support unsharing the pid namespace.

Unsharing of the pid namespace unlike unsharing of other namespaces
does not take affect immediately.  Instead it affects the children
created with fork and clone.  The first of these children becomes the init
process of the new pid namespace, the rest become oddball children
of pid 0.  From the point of view of the new pid namespace the process
that created it is pid 0, as it's pid does not map.

A couple of different semantics were considered but this one was
settled on because it is easy to implement and it is usable from
pam modules.  The core reasons for the existence of unshare.

I took a survey of the callers of pam modules and the following
appears to be a representative sample of their logic.
{
	setup stuff include pam
	child = fork();
	if (!child) {
		setuid()
                exec /bin/bash
        }
        waitpid(child);

        pam and other cleanup
}

As you can see there is a fork to create the unprivileged user
space process.  Which means that the unprivileged user space
process will appear as pid 1 in the new pid namespace.  Further
most login processes do not cope with extraneous children which
means shifting the duty of reaping extraneous child process to
the creator of those extraneous children makes the system more
comprehensible.

The practical reason for this set of pid namespace semantics is
that it is simple to implement and verify they work correctly.
Whereas an implementation that requres changing the struct
pid on a process comes with a lot more races and pain.  Not
the least of which is that glibc caches getpid().

These semantics are implemented by having two notions
of the pid namespace of a proces.  There is task_active_pid_ns
which is the pid namspace the process was created with
and the pid namespace that all pids are presented to
that process in.  The task_active_pid_ns is stored
in the struct pid of the task.

Then there is the pid namespace that will be used for children
that pid namespace is stored in task->nsproxy->pid_ns.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/fork.c          | 32 +++++++++++++++++++++++++-------
 kernel/nsproxy.c       |  2 +-
 kernel/pid_namespace.c |  2 --
 3 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0f2bbce311f..811ffbad788 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1565,9 +1565,11 @@ long do_fork(unsigned long clone_flags,
 	 * Do some preliminary argument and permissions checking before we
 	 * actually start allocating stuff
 	 */
-	if (clone_flags & CLONE_NEWUSER) {
-		if (clone_flags & CLONE_THREAD)
+	if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
+		if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
 			return -EINVAL;
+	}
+	if (clone_flags & CLONE_NEWUSER) {
 		/* hopefully this check will go away when userns support is
 		 * complete
 		 */
@@ -1692,7 +1694,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
 {
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+				CLONE_NEWPID))
 		return -EINVAL;
 	/*
 	 * Not implemented, but pretend it works if there is nothing to
@@ -1763,15 +1766,30 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	int do_sysvsem = 0;
 	int err;
 
-	err = check_unshare_flags(unshare_flags);
-	if (err)
-		goto bad_unshare_out;
-
+	/*
+	 * If unsharing a pid namespace must also unshare the thread.
+	 */
+	if (unshare_flags & CLONE_NEWPID)
+		unshare_flags |= CLONE_THREAD;
+	/*
+	 * If unsharing a thread from a thread group, must also unshare vm.
+	 */
+	if (unshare_flags & CLONE_THREAD)
+		unshare_flags |= CLONE_VM;
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (unshare_flags & CLONE_VM)
+		unshare_flags |= CLONE_SIGHAND;
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
+
+	err = check_unshare_flags(unshare_flags);
+	if (err)
+		goto bad_unshare_out;
 	/*
 	 * CLONE_NEWIPC must also detach from the undolist: after switching
 	 * to a new ipc namespace, the semaphore arrays from the old
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index acc92680381..b8d4d8709d7 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -188,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-			       CLONE_NEWNET)))
+			       CLONE_NEWNET | CLONE_NEWPID)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index f78fc48c86b..68508d33063 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -144,8 +144,6 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
 {
 	if (!(flags & CLONE_NEWPID))
 		return get_pid_ns(old_ns);
-	if (flags & (CLONE_THREAD|CLONE_PARENT))
-		return ERR_PTR(-EINVAL);
 	if (task_active_pid_ns(current) != old_ns)
 		return ERR_PTR(-EINVAL);
 	return create_pid_namespace(user_ns, old_ns);
-- 
cgit v1.2.3-70-g09d2


From 771b1371686e0a63e938ada28de020b9a0040f55 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 21:08:32 -0700
Subject: vfs: Add a user namespace reference from struct mnt_namespace

This will allow for support for unprivileged mounts in a new user namespace.

Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/mount.h                    |  1 +
 fs/namespace.c                | 24 ++++++++++++++++--------
 include/linux/mnt_namespace.h |  3 ++-
 kernel/nsproxy.c              |  2 +-
 4 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/fs/mount.h b/fs/mount.h
index e9c37dd3d00..630fafc616b 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -6,6 +6,7 @@ struct mnt_namespace {
 	atomic_t		count;
 	struct mount *	root;
 	struct list_head	list;
+	struct user_namespace	*user_ns;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	int event;
diff --git a/fs/namespace.c b/fs/namespace.c
index d287e7e7464..207c7ba84ad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/capability.h>
 #include <linux/mnt_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/idr.h>
@@ -2286,6 +2287,12 @@ dput_out:
 	return retval;
 }
 
+static void free_mnt_ns(struct mnt_namespace *ns)
+{
+	put_user_ns(ns->user_ns);
+	kfree(ns);
+}
+
 /*
  * Assign a sequence number so we can detect when we attempt to bind
  * mount a reference to an older mount namespace into the current
@@ -2295,7 +2302,7 @@ dput_out:
  */
 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
 
-static struct mnt_namespace *alloc_mnt_ns(void)
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 {
 	struct mnt_namespace *new_ns;
 
@@ -2308,6 +2315,7 @@ static struct mnt_namespace *alloc_mnt_ns(void)
 	INIT_LIST_HEAD(&new_ns->list);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->event = 0;
+	new_ns->user_ns = get_user_ns(user_ns);
 	return new_ns;
 }
 
@@ -2316,7 +2324,7 @@ static struct mnt_namespace *alloc_mnt_ns(void)
  * copied from the namespace of the passed in task structure.
  */
 static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
-		struct fs_struct *fs)
+		struct user_namespace *user_ns, struct fs_struct *fs)
 {
 	struct mnt_namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
@@ -2324,7 +2332,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	struct mount *old = mnt_ns->root;
 	struct mount *new;
 
-	new_ns = alloc_mnt_ns();
+	new_ns = alloc_mnt_ns(user_ns);
 	if (IS_ERR(new_ns))
 		return new_ns;
 
@@ -2333,7 +2341,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE);
 	if (IS_ERR(new)) {
 		up_write(&namespace_sem);
-		kfree(new_ns);
+		free_mnt_ns(new_ns);
 		return ERR_CAST(new);
 	}
 	new_ns->root = new;
@@ -2374,7 +2382,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 }
 
 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
-		struct fs_struct *new_fs)
+		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
 
@@ -2384,7 +2392,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	if (!(flags & CLONE_NEWNS))
 		return ns;
 
-	new_ns = dup_mnt_ns(ns, new_fs);
+	new_ns = dup_mnt_ns(ns, user_ns, new_fs);
 
 	put_mnt_ns(ns);
 	return new_ns;
@@ -2396,7 +2404,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
  */
 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
 {
-	struct mnt_namespace *new_ns = alloc_mnt_ns();
+	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
 	if (!IS_ERR(new_ns)) {
 		struct mount *mnt = real_mount(m);
 		mnt->mnt_ns = new_ns;
@@ -2682,7 +2690,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
-	kfree(ns);
+	free_mnt_ns(ns);
 }
 
 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 5a8e3903d77..12b2ab51032 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -4,9 +4,10 @@
 
 struct mnt_namespace;
 struct fs_struct;
+struct user_namespace;
 
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
-		struct fs_struct *);
+		struct user_namespace *, struct fs_struct *);
 extern void put_mnt_ns(struct mnt_namespace *ns);
 
 extern const struct file_operations proc_mounts_operations;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b8d4d8709d7..7f8b051fc19 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -66,7 +66,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	if (!new_nsp)
 		return ERR_PTR(-ENOMEM);
 
-	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, task_cred_xxx(tsk, user_ns), new_fs);
 	if (IS_ERR(new_nsp->mnt_ns)) {
 		err = PTR_ERR(new_nsp->mnt_ns);
 		goto out_ns;
-- 
cgit v1.2.3-70-g09d2


From 5eaf563e53294d6696e651466697eb9d491f3946 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 21 Nov 2011 17:22:31 -0800
Subject: userns: Allow unprivileged users to create user namespaces.

Now that we have been through every permission check in the kernel
having uid == 0 and gid == 0 in your local user namespace no
longer adds any special privileges.  Even having a full set
of caps in your local user namespace is safe because capabilies
are relative to your local user namespace, and do not confer
unexpected privileges.

Over the long term this should allow much more of the kernels
functionality to be safely used by non-root users.  Functionality
like unsharing the mount namespace that is only unsafe because
it can fool applications whose privileges are raised when they
are executed.  Since those applications have no privileges in
a user namespaces it becomes safe to spoof and confuse those
applications all you want.

Those capabilities will still need to be enabled carefully because
we may still need things like rlimits on the number of unprivileged
mounts but that is to avoid DOS attacks not to avoid fooling root
owned processes.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/fork.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 811ffbad788..8c29abb1901 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1569,14 +1569,6 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
 			return -EINVAL;
 	}
-	if (clone_flags & CLONE_NEWUSER) {
-		/* hopefully this check will go away when userns support is
-		 * complete
-		 */
-		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
-				!capable(CAP_SETGID))
-			return -EPERM;
-	}
 
 	/*
 	 * Determine whether and which event to report to ptracer.  When
-- 
cgit v1.2.3-70-g09d2


From 37657da3c5d4a3bbbbb9d3b78f53a8134a0abae0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 27 Jul 2012 06:21:27 -0700
Subject: userns: Allow setting a userns mapping to your current uid.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba3..49096d559e0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -709,6 +709,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *new_map)
 {
+	/* Allow mapping to your own filesystem ids */
+	if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+		u32 id = new_map->extent[0].lower_first;
+		if (cap_setid == CAP_SETUID) {
+			kuid_t uid = make_kuid(ns->parent, id);
+			if (uid_eq(uid, current_fsuid()))
+				return true;
+		}
+		else if (cap_setid == CAP_SETGID) {
+			kgid_t gid = make_kgid(ns->parent, id);
+			if (gid_eq(gid, current_fsgid()))
+				return true;
+		}
+	}
+
 	/* Allow anyone to set a mapping that doesn't require privilege */
 	if (!cap_valid(cap_setid))
 		return true;
-- 
cgit v1.2.3-70-g09d2


From b33c77ef23dd3ec5692c9c0cc739a3f5f0f2baae Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 00:50:47 -0700
Subject: userns: Allow unprivileged users to create new namespaces

If an unprivileged user has the appropriate capabilities in their
current user namespace allow the creation of new namespaces.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/nsproxy.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 7f8b051fc19..a214e0e9035 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -122,6 +122,7 @@ out_ns:
 int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 {
 	struct nsproxy *old_ns = tsk->nsproxy;
+	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 	struct nsproxy *new_ns;
 	int err = 0;
 
@@ -134,7 +135,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 				CLONE_NEWPID | CLONE_NEWNET)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
 		err = -EPERM;
 		goto out;
 	}
@@ -191,7 +192,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 			       CLONE_NEWNET | CLONE_NEWPID)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!nsown_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	*new_nsp = create_new_namespaces(unshare_flags, current,
-- 
cgit v1.2.3-70-g09d2


From 142e1d1d5f088e7a38659daca6e84a730967774a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 01:13:20 -0700
Subject: userns: Allow unprivileged use of setns.

- Push the permission check from the core setns syscall into
  the setns install methods where the user namespace of the
  target namespace can be determined, and used in a ns_capable
  call.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 ipc/namespace.c          | 6 +++++-
 kernel/nsproxy.c         | 3 ---
 kernel/utsname.c         | 7 ++++++-
 net/core/net_namespace.c | 7 ++++++-
 4 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/ipc/namespace.c b/ipc/namespace.c
index f362298c5ce..6ed33c05cb6 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -161,8 +161,12 @@ static void ipcns_put(void *ns)
 	return put_ipc_ns(ns);
 }
 
-static int ipcns_install(struct nsproxy *nsproxy, void *ns)
+static int ipcns_install(struct nsproxy *nsproxy, void *new)
 {
+	struct ipc_namespace *ns = new;
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* Ditch state from the old ipc namespace */
 	exit_sem(current);
 	put_ipc_ns(nsproxy->ipc_ns);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a214e0e9035..4357a0a7d17 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -242,9 +242,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	struct file *file;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	file = proc_ns_fget(fd);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3f..4a9362f9325 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -102,8 +102,13 @@ static void utsns_put(void *ns)
 	put_uts_ns(ns);
 }
 
-static int utsns_install(struct nsproxy *nsproxy, void *ns)
+static int utsns_install(struct nsproxy *nsproxy, void *new)
 {
+	struct uts_namespace *ns = new;
+
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
 	get_uts_ns(ns);
 	put_uts_ns(nsproxy->uts_ns);
 	nsproxy->uts_ns = ns;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 6456439cbbd..ec2870b44c1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -630,8 +630,13 @@ static void netns_put(void *ns)
 
 static int netns_install(struct nsproxy *nsproxy, void *ns)
 {
+	struct net *net = ns;
+
+	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
 	put_net(nsproxy->net_ns);
-	nsproxy->net_ns = get_net(ns);
+	nsproxy->net_ns = get_net(net);
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From bcf58e725ddc45d31addbc6627d4f0edccc824c1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 04:02:49 -0700
Subject: userns: Make create_new_namespaces take a user_ns parameter

Modify create_new_namespaces to explicitly take a user namespace
parameter, instead of implicitly through the task_struct.

This allows an implementation of unshare(CLONE_NEWUSER) where
the new user namespace is not stored onto the current task_struct
until after all of the namespaces are created.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/ipc_namespace.h |  7 ++++---
 include/linux/utsname.h       |  6 +++---
 ipc/namespace.c               | 10 ++++------
 kernel/nsproxy.c              | 22 +++++++++++++---------
 kernel/utsname.c              |  9 ++++-----
 5 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 5499c92a915..f03af702a39 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -133,7 +133,8 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
 
 #if defined(CONFIG_IPC_NS)
 extern struct ipc_namespace *copy_ipcs(unsigned long flags,
-				       struct task_struct *tsk);
+	struct user_namespace *user_ns, struct ipc_namespace *ns);
+
 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 {
 	if (ns)
@@ -144,12 +145,12 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 extern void put_ipc_ns(struct ipc_namespace *ns);
 #else
 static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
-					      struct task_struct *tsk)
+	struct user_namespace *user_ns, struct ipc_namespace *ns)
 {
 	if (flags & CLONE_NEWIPC)
 		return ERR_PTR(-EINVAL);
 
-	return tsk->nsproxy->ipc_ns;
+	return ns;
 }
 
 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 2b345206722..221f4a0a750 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -33,7 +33,7 @@ static inline void get_uts_ns(struct uts_namespace *ns)
 }
 
 extern struct uts_namespace *copy_utsname(unsigned long flags,
-					  struct task_struct *tsk);
+	struct user_namespace *user_ns, struct uts_namespace *old_ns);
 extern void free_uts_ns(struct kref *kref);
 
 static inline void put_uts_ns(struct uts_namespace *ns)
@@ -50,12 +50,12 @@ static inline void put_uts_ns(struct uts_namespace *ns)
 }
 
 static inline struct uts_namespace *copy_utsname(unsigned long flags,
-						 struct task_struct *tsk)
+	struct user_namespace *user_ns, struct uts_namespace *old_ns)
 {
 	if (flags & CLONE_NEWUTS)
 		return ERR_PTR(-EINVAL);
 
-	return tsk->nsproxy->uts_ns;
+	return old_ns;
 }
 #endif
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 6ed33c05cb6..72c86827779 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -16,7 +16,7 @@
 
 #include "util.h"
 
-static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk,
+static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 					   struct ipc_namespace *old_ns)
 {
 	struct ipc_namespace *ns;
@@ -46,19 +46,17 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk,
 	ipcns_notify(IPCNS_CREATED);
 	register_ipcns_notifier(ns);
 
-	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
+	ns->user_ns = get_user_ns(user_ns);
 
 	return ns;
 }
 
 struct ipc_namespace *copy_ipcs(unsigned long flags,
-				struct task_struct *tsk)
+	struct user_namespace *user_ns, struct ipc_namespace *ns)
 {
-	struct ipc_namespace *ns = tsk->nsproxy->ipc_ns;
-
 	if (!(flags & CLONE_NEWIPC))
 		return get_ipc_ns(ns);
-	return create_ipc_ns(tsk, ns);
+	return create_ipc_ns(user_ns, ns);
 }
 
 /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 4357a0a7d17..2ddd81657a2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
  * leave it to the caller to do proper locking and attach it to task.
  */
 static struct nsproxy *create_new_namespaces(unsigned long flags,
-			struct task_struct *tsk, struct fs_struct *new_fs)
+	struct task_struct *tsk, struct user_namespace *user_ns,
+	struct fs_struct *new_fs)
 {
 	struct nsproxy *new_nsp;
 	int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	if (!new_nsp)
 		return ERR_PTR(-ENOMEM);
 
-	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, task_cred_xxx(tsk, user_ns), new_fs);
+	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
 	if (IS_ERR(new_nsp->mnt_ns)) {
 		err = PTR_ERR(new_nsp->mnt_ns);
 		goto out_ns;
 	}
 
-	new_nsp->uts_ns = copy_utsname(flags, tsk);
+	new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
 	if (IS_ERR(new_nsp->uts_ns)) {
 		err = PTR_ERR(new_nsp->uts_ns);
 		goto out_uts;
 	}
 
-	new_nsp->ipc_ns = copy_ipcs(flags, tsk);
+	new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
 	if (IS_ERR(new_nsp->ipc_ns)) {
 		err = PTR_ERR(new_nsp->ipc_ns);
 		goto out_ipc;
 	}
 
-	new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns);
+	new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
 	if (IS_ERR(new_nsp->pid_ns)) {
 		err = PTR_ERR(new_nsp->pid_ns);
 		goto out_pid;
 	}
 
-	new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns);
+	new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
 	if (IS_ERR(new_nsp->net_ns)) {
 		err = PTR_ERR(new_nsp->net_ns);
 		goto out_net;
@@ -152,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		goto out;
 	}
 
-	new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+	new_ns = create_new_namespaces(flags, tsk,
+				       task_cred_xxx(tsk, user_ns), tsk->fs);
 	if (IS_ERR(new_ns)) {
 		err = PTR_ERR(new_ns);
 		goto out;
@@ -186,6 +188,7 @@ void free_nsproxy(struct nsproxy *ns)
 int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 		struct nsproxy **new_nsp, struct fs_struct *new_fs)
 {
+	struct user_namespace *user_ns;
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
@@ -195,7 +198,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	if (!nsown_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	*new_nsp = create_new_namespaces(unshare_flags, current,
+	user_ns = current_user_ns();
+	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
 				new_fs ? new_fs : current->fs);
 	if (IS_ERR(*new_nsp)) {
 		err = PTR_ERR(*new_nsp);
@@ -252,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	if (nstype && (ops->type != nstype))
 		goto out;
 
-	new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+	new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
 	if (IS_ERR(new_nsproxy)) {
 		err = PTR_ERR(new_nsproxy);
 		goto out;
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 4a9362f9325..fdc619eb61e 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,7 +32,7 @@ static struct uts_namespace *create_uts_ns(void)
  * @old_ns: namespace to clone
  * Return NULL on error (failure to kmalloc), new ns otherwise
  */
-static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
 					  struct uts_namespace *old_ns)
 {
 	struct uts_namespace *ns;
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
-	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
+	ns->user_ns = get_user_ns(user_ns);
 	up_read(&uts_sem);
 	return ns;
 }
@@ -55,9 +55,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
  * versa.
  */
 struct uts_namespace *copy_utsname(unsigned long flags,
-				   struct task_struct *tsk)
+	struct user_namespace *user_ns, struct uts_namespace *old_ns)
 {
-	struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
 	struct uts_namespace *new_ns;
 
 	BUG_ON(!old_ns);
@@ -66,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
 	if (!(flags & CLONE_NEWUTS))
 		return old_ns;
 
-	new_ns = clone_uts_ns(tsk, old_ns);
+	new_ns = clone_uts_ns(user_ns, old_ns);
 
 	put_uts_ns(old_ns);
 	return new_ns;
-- 
cgit v1.2.3-70-g09d2


From 4c44aaafa8108f584831850ab48a975e971db2de Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 05:05:21 -0700
Subject: userns: Kill task_user_ns

The task_user_ns function hides the fact that it is getting the user
namespace from struct cred on the task.  struct cred may go away as
soon as the rcu lock is released.  This leads to a race where we
can dereference a stale user namespace pointer.

To make it obvious a struct cred is involved kill task_user_ns.

To kill the race modify the users of task_user_ns to only
reference the user namespace while the rcu lock is held.

Cc: Kees Cook <keescook@chromium.org>
Cc: James Morris <james.l.morris@oracle.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/cred.h     |  2 --
 kernel/ptrace.c          | 10 ++++++++--
 kernel/sched/core.c      | 10 ++++++++--
 security/yama/yama_lsm.c | 12 +++++++++---
 4 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index ebbed2ce663..856d2622d83 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -357,10 +357,8 @@ static inline void put_cred(const struct cred *_cred)
 extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
 #define current_user_ns()	(current_cred_xxx(user_ns))
-#define task_user_ns(task)	(task_cred_xxx((task), user_ns))
 #else
 #define current_user_ns()	(&init_user_ns)
-#define task_user_ns(task)	(&init_user_ns)
 #endif
 
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f5e55dda95..7b09b88862c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -215,8 +215,12 @@ ok:
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable  && !ptrace_has_cap(task_user_ns(task), mode))
+	rcu_read_lock();
+	if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();
 
 	return security_ptrace_access_check(task, mode);
 }
@@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	if (seize)
 		flags |= PT_SEIZED;
-	if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
+	rcu_read_lock();
+	if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
 		flags |= PT_PTRACE_CAP;
+	rcu_read_unlock();
 	task->ptrace = flags;
 
 	__ptrace_link(task, current);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..2f5eb1838b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4029,8 +4029,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
-		goto out_unlock;
+	if (!check_same_owner(p)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
+			goto out_unlock;
+		}
+		rcu_read_unlock();
+	}
 
 	retval = security_task_setscheduler(p);
 	if (retval)
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index b4c29848b49..0e72239aeb0 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -262,14 +262,18 @@ int yama_ptrace_access_check(struct task_struct *child,
 			/* No additional restrictions. */
 			break;
 		case YAMA_SCOPE_RELATIONAL:
+			rcu_read_lock();
 			if (!task_is_descendant(current, child) &&
 			    !ptracer_exception_found(current, child) &&
-			    !ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+			    !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
 				rc = -EPERM;
+			rcu_read_unlock();
 			break;
 		case YAMA_SCOPE_CAPABILITY:
-			if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+			rcu_read_lock();
+			if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
 				rc = -EPERM;
+			rcu_read_unlock();
 			break;
 		case YAMA_SCOPE_NO_ATTACH:
 		default:
@@ -307,8 +311,10 @@ int yama_ptrace_traceme(struct task_struct *parent)
 	/* Only disallow PTRACE_TRACEME on more aggressive settings. */
 	switch (ptrace_scope) {
 	case YAMA_SCOPE_CAPABILITY:
-		if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE))
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE))
 			rc = -EPERM;
+		rcu_read_unlock();
 		break;
 	case YAMA_SCOPE_NO_ATTACH:
 		rc = -EPERM;
-- 
cgit v1.2.3-70-g09d2


From cde1975bc242f3e1072bde623ef378e547b73f91 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 06:24:06 -0700
Subject: userns: Implent proc namespace operations

This allows entering a user namespace, and the ability
to store a reference to a user namespace with a bind
mount.

Addition of missing userns_ns_put in userns_install
from Gao feng <gaofeng@cn.fujitsu.com>

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/namespaces.c    |  4 +++
 include/linux/proc_fs.h |  1 +
 kernel/user_namespace.c | 90 +++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 78 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 2a17fd9ae6a..030250c27d7 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -11,6 +11,7 @@
 #include <net/net_namespace.h>
 #include <linux/ipc_namespace.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include "internal.h"
 
 
@@ -26,6 +27,9 @@ static const struct proc_ns_operations *ns_entries[] = {
 #endif
 #ifdef CONFIG_PID_NS
 	&pidns_operations,
+#endif
+#ifdef CONFIG_USER_NS
+	&userns_operations,
 #endif
 	&mntns_operations,
 };
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 9014c041e75..31447819bc5 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -258,6 +258,7 @@ extern const struct proc_ns_operations netns_operations;
 extern const struct proc_ns_operations utsns_operations;
 extern const struct proc_ns_operations ipcns_operations;
 extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations userns_operations;
 extern const struct proc_ns_operations mntns_operations;
 
 union proc_op {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 49096d559e0..a9460774e77 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
 #include <linux/highuid.h>
 #include <linux/cred.h>
 #include <linux/securebits.h>
@@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *map);
 
+static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
+{
+	/* Start with the same capabilities as init but useless for doing
+	 * anything as the capabilities are bound to the new user namespace.
+	 */
+	cred->securebits = SECUREBITS_DEFAULT;
+	cred->cap_inheritable = CAP_EMPTY_SET;
+	cred->cap_permitted = CAP_FULL_SET;
+	cred->cap_effective = CAP_FULL_SET;
+	cred->cap_bset = CAP_FULL_SET;
+#ifdef CONFIG_KEYS
+	key_put(cred->request_key_auth);
+	cred->request_key_auth = NULL;
+#endif
+	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
+	cred->user_ns = user_ns;
+}
+
 /*
  * Create a new user namespace, deriving the creator from the user in the
  * passed credentials, and replacing that user with the new root user for the
@@ -53,27 +72,12 @@ int create_user_ns(struct cred *new)
 		return -ENOMEM;
 
 	kref_init(&ns->kref);
+	/* Leave the new->user_ns reference with the new user namespace. */
 	ns->parent = parent_ns;
 	ns->owner = owner;
 	ns->group = group;
 
-	/* Start with the same capabilities as init but useless for doing
-	 * anything as the capabilities are bound to the new user namespace.
-	 */
-	new->securebits = SECUREBITS_DEFAULT;
-	new->cap_inheritable = CAP_EMPTY_SET;
-	new->cap_permitted = CAP_FULL_SET;
-	new->cap_effective = CAP_FULL_SET;
-	new->cap_bset = CAP_FULL_SET;
-#ifdef CONFIG_KEYS
-	key_put(new->request_key_auth);
-	new->request_key_auth = NULL;
-#endif
-	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
-
-	/* Leave the new->user_ns reference with the new user namespace. */
-	/* Leave the reference to our user_ns with the new cred. */
-	new->user_ns = ns;
+	set_cred_user_ns(new, ns);
 
 	return 0;
 }
@@ -737,6 +741,58 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 	return false;
 }
 
+static void *userns_get(struct task_struct *task)
+{
+	struct user_namespace *user_ns;
+
+	rcu_read_lock();
+	user_ns = get_user_ns(__task_cred(task)->user_ns);
+	rcu_read_unlock();
+
+	return user_ns;
+}
+
+static void userns_put(void *ns)
+{
+	put_user_ns(ns);
+}
+
+static int userns_install(struct nsproxy *nsproxy, void *ns)
+{
+	struct user_namespace *user_ns = ns;
+	struct cred *cred;
+
+	/* Don't allow gaining capabilities by reentering
+	 * the same user namespace.
+	 */
+	if (user_ns == current_user_ns())
+		return -EINVAL;
+
+	/* Threaded many not enter a different user namespace */
+	if (atomic_read(&current->mm->mm_users) > 1)
+		return -EINVAL;
+
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	cred = prepare_creds();
+	if (!cred)
+		return -ENOMEM;
+
+	put_user_ns(cred->user_ns);
+	set_cred_user_ns(cred, get_user_ns(user_ns));
+
+	return commit_creds(cred);
+}
+
+const struct proc_ns_operations userns_operations = {
+	.name		= "user",
+	.type		= CLONE_NEWUSER,
+	.get		= userns_get,
+	.put		= userns_put,
+	.install	= userns_install,
+};
+
 static __init int user_namespaces_init(void)
 {
 	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
-- 
cgit v1.2.3-70-g09d2


From b2e0d98705e60e45bbb3c0032c48824ad7ae0704 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 05:15:35 -0700
Subject: userns: Implement unshare of the user namespace

- Add CLONE_THREAD to the unshare flags if CLONE_NEWUSER is selected
  As changing user namespaces is only valid if all there is only
  a single thread.
- Restore the code to add CLONE_VM if CLONE_THREAD is selected and
  the code to addCLONE_SIGHAND if CLONE_VM is selected.
  Making the constraints in the code clear.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/nsproxy.h        |  2 +-
 include/linux/user_namespace.h |  9 +++++++++
 kernel/fork.c                  | 25 ++++++++++++++++++++++---
 kernel/nsproxy.c               |  8 ++++----
 kernel/user_namespace.c        | 15 +++++++++++++++
 5 files changed, 51 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index cc37a55ad00..10e5947491c 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -67,7 +67,7 @@ void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 void free_nsproxy(struct nsproxy *ns);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
-	struct fs_struct *);
+	struct cred *, struct fs_struct *);
 int __init nsproxy_cache_init(void);
 
 static inline void put_nsproxy(struct nsproxy *ns)
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 95142cae446..17651f08d67 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -39,6 +39,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 }
 
 extern int create_user_ns(struct cred *new);
+extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
 extern void free_user_ns(struct kref *kref);
 
 static inline void put_user_ns(struct user_namespace *ns)
@@ -66,6 +67,14 @@ static inline int create_user_ns(struct cred *new)
 	return -EINVAL;
 }
 
+static inline int unshare_userns(unsigned long unshare_flags,
+				 struct cred **new_cred)
+{
+	if (unshare_flags & CLONE_NEWUSER)
+		return -EINVAL;
+	return 0;
+}
+
 static inline void put_user_ns(struct user_namespace *ns)
 {
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c29abb1901..38e53b87402 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1687,7 +1687,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
 				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
-				CLONE_NEWPID))
+				CLONE_NEWUSER|CLONE_NEWPID))
 		return -EINVAL;
 	/*
 	 * Not implemented, but pretend it works if there is nothing to
@@ -1754,10 +1754,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
 	struct fs_struct *fs, *new_fs = NULL;
 	struct files_struct *fd, *new_fd = NULL;
+	struct cred *new_cred = NULL;
 	struct nsproxy *new_nsproxy = NULL;
 	int do_sysvsem = 0;
 	int err;
 
+	/*
+	 * If unsharing a user namespace must also unshare the thread.
+	 */
+	if (unshare_flags & CLONE_NEWUSER)
+		unshare_flags |= CLONE_THREAD;
 	/*
 	 * If unsharing a pid namespace must also unshare the thread.
 	 */
@@ -1795,11 +1801,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	err = unshare_fd(unshare_flags, &new_fd);
 	if (err)
 		goto bad_unshare_cleanup_fs;
-	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+	err = unshare_userns(unshare_flags, &new_cred);
 	if (err)
 		goto bad_unshare_cleanup_fd;
+	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+					 new_cred, new_fs);
+	if (err)
+		goto bad_unshare_cleanup_cred;
 
-	if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
 			 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1832,11 +1842,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 		}
 
 		task_unlock(current);
+
+		if (new_cred) {
+			/* Install the new user namespace */
+			commit_creds(new_cred);
+			new_cred = NULL;
+		}
 	}
 
 	if (new_nsproxy)
 		put_nsproxy(new_nsproxy);
 
+bad_unshare_cleanup_cred:
+	if (new_cred)
+		put_cred(new_cred);
 bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 2ddd81657a2..78e2ecb2016 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -186,7 +186,7 @@ void free_nsproxy(struct nsproxy *ns)
  * On success, returns the new nsproxy.
  */
 int unshare_nsproxy_namespaces(unsigned long unshare_flags,
-		struct nsproxy **new_nsp, struct fs_struct *new_fs)
+	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
 {
 	struct user_namespace *user_ns;
 	int err = 0;
@@ -195,12 +195,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 			       CLONE_NEWNET | CLONE_NEWPID)))
 		return 0;
 
-	if (!nsown_capable(CAP_SYS_ADMIN))
+	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
-	user_ns = current_user_ns();
 	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
-				new_fs ? new_fs : current->fs);
+					 new_fs ? new_fs : current->fs);
 	if (IS_ERR(*new_nsp)) {
 		err = PTR_ERR(*new_nsp);
 		goto out;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9460774e77..ce92f7e6290 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -82,6 +82,21 @@ int create_user_ns(struct cred *new)
 	return 0;
 }
 
+int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
+{
+	struct cred *cred;
+
+	if (!(unshare_flags & CLONE_NEWUSER))
+		return 0;
+
+	cred = prepare_creds();
+	if (!cred)
+		return -ENOMEM;
+
+	*new_cred = cred;
+	return create_user_ns(cred);
+}
+
 void free_user_ns(struct kref *kref)
 {
 	struct user_namespace *parent, *ns =
-- 
cgit v1.2.3-70-g09d2


From c450f371d48557e3e0fa510a4af27b92f0d8c4cc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 14 Aug 2012 21:25:13 -0700
Subject: userns: For /proc/self/{uid,gid}_map derive the lower userns from the
 struct file

To keep things sane in the context of file descriptor passing derive the
user namespace that uids are mapped into from the opener of the file
instead of from current.

When writing to the maps file the lower user namespace must always
be the parent user namespace, or setting the mapping simply does
not make sense.  Enforce that the opener of the file was in
the parent user namespace or the user namespace whose mapping
is being set.

Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index ce92f7e6290..89f6eaed067 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -391,7 +391,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
 	struct user_namespace *lower_ns;
 	uid_t lower;
 
-	lower_ns = current_user_ns();
+	lower_ns = seq_user_ns(seq);
 	if ((lower_ns == ns) && lower_ns->parent)
 		lower_ns = lower_ns->parent;
 
@@ -412,7 +412,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
 	struct user_namespace *lower_ns;
 	gid_t lower;
 
-	lower_ns = current_user_ns();
+	lower_ns = seq_user_ns(seq);
 	if ((lower_ns == ns) && lower_ns->parent)
 		lower_ns = lower_ns->parent;
 
@@ -688,10 +688,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
 {
 	struct seq_file *seq = file->private_data;
 	struct user_namespace *ns = seq->private;
+	struct user_namespace *seq_ns = seq_user_ns(seq);
 
 	if (!ns->parent)
 		return -EPERM;
 
+	if ((seq_ns != ns) && (seq_ns != ns->parent))
+		return -EPERM;
+
 	return map_write(file, buf, size, ppos, CAP_SETUID,
 			 &ns->uid_map, &ns->parent->uid_map);
 }
@@ -700,10 +704,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
 {
 	struct seq_file *seq = file->private_data;
 	struct user_namespace *ns = seq->private;
+	struct user_namespace *seq_ns = seq_user_ns(seq);
 
 	if (!ns->parent)
 		return -EPERM;
 
+	if ((seq_ns != ns) && (seq_ns != ns->parent))
+		return -EPERM;
+
 	return map_write(file, buf, size, ppos, CAP_SETGID,
 			 &ns->gid_map, &ns->parent->gid_map);
 }
-- 
cgit v1.2.3-70-g09d2


From 98f842e675f96ffac96e6c50315790912b2812be Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 15 Jun 2011 10:21:48 -0700
Subject: proc: Usable inode numbers for the namespace file descriptors.

Assign a unique proc inode to each namespace, and use that
inode number to ensure we only allocate at most one proc
inode for every namespace in proc.

A single proc inode per namespace allows userspace to test
to see if two processes are in the same namespace.

This has been a long requested feature and only blocked because
a naive implementation would put the id in a global space and
would ultimately require having a namespace for the names of
namespaces, making migration and certain virtualization tricks
impossible.

We still don't have per superblock inode numbers for proc, which
appears necessary for application unaware checkpoint/restart and
migrations (if the application is using namespace file descriptors)
but that is now allowd by the design if it becomes important.

I have preallocated the ipc and uts initial proc inode numbers so
their structures can be statically initialized.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/mount.h                     |  1 +
 fs/namespace.c                 | 14 ++++++++++++++
 fs/proc/namespaces.c           | 24 ++++++++++++++----------
 include/linux/ipc_namespace.h  |  2 ++
 include/linux/pid_namespace.h  |  1 +
 include/linux/proc_fs.h        |  7 ++++++-
 include/linux/user_namespace.h |  1 +
 include/linux/utsname.h        |  1 +
 include/net/net_namespace.h    |  2 ++
 init/version.c                 |  2 ++
 ipc/msgutil.c                  |  2 ++
 ipc/namespace.c                | 16 ++++++++++++++++
 kernel/pid.c                   |  1 +
 kernel/pid_namespace.c         | 12 ++++++++++++
 kernel/user.c                  |  2 ++
 kernel/user_namespace.c        | 15 +++++++++++++++
 kernel/utsname.c               | 17 ++++++++++++++++-
 net/core/net_namespace.c       | 24 ++++++++++++++++++++++++
 18 files changed, 132 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/fs/mount.h b/fs/mount.h
index 630fafc616b..cd500798040 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,6 +4,7 @@
 
 struct mnt_namespace {
 	atomic_t		count;
+	unsigned int		proc_inum;
 	struct mount *	root;
 	struct list_head	list;
 	struct user_namespace	*user_ns;
diff --git a/fs/namespace.c b/fs/namespace.c
index cab78a74aca..c1bbe86f492 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2301,6 +2301,7 @@ dput_out:
 
 static void free_mnt_ns(struct mnt_namespace *ns)
 {
+	proc_free_inum(ns->proc_inum);
 	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
@@ -2317,10 +2318,16 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 {
 	struct mnt_namespace *new_ns;
+	int ret;
 
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
 	if (!new_ns)
 		return ERR_PTR(-ENOMEM);
+	ret = proc_alloc_inum(&new_ns->proc_inum);
+	if (ret) {
+		kfree(new_ns);
+		return ERR_PTR(ret);
+	}
 	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
 	atomic_set(&new_ns->count, 1);
 	new_ns->root = NULL;
@@ -2799,10 +2806,17 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns)
 	return 0;
 }
 
+static unsigned int mntns_inum(void *ns)
+{
+	struct mnt_namespace *mnt_ns = ns;
+	return mnt_ns->proc_inum;
+}
+
 const struct proc_ns_operations mntns_operations = {
 	.name		= "mnt",
 	.type		= CLONE_NEWNS,
 	.get		= mntns_get,
 	.put		= mntns_put,
 	.install	= mntns_install,
+	.inum		= mntns_inum,
 };
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 7a6d8d69cdb..b7a47196c8c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -82,7 +82,7 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode = new_inode(sb);
+	inode = iget_locked(sb, ns_ops->inum(ns));
 	if (!inode) {
 		dput(dentry);
 		ns_ops->put(ns);
@@ -90,13 +90,17 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb,
 	}
 
 	ei = PROC_I(inode);
-	inode->i_ino = get_next_ino();
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_op = &ns_inode_operations;
-	inode->i_mode = S_IFREG | S_IRUGO;
-	inode->i_fop = &ns_file_operations;
-	ei->ns_ops = ns_ops;
-	ei->ns = ns;
+	if (inode->i_state & I_NEW) {
+		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		inode->i_op = &ns_inode_operations;
+		inode->i_mode = S_IFREG | S_IRUGO;
+		inode->i_fop = &ns_file_operations;
+		ei->ns_ops = ns_ops;
+		ei->ns = ns;
+		unlock_new_inode(inode);
+	} else {
+		ns_ops->put(ns);
+	}
 
 	d_set_d_op(dentry, &ns_dentry_operations);
 	result = d_instantiate_unique(dentry, inode);
@@ -162,12 +166,12 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	if (!ns)
 		goto out_put_task;
 
-	snprintf(name, sizeof(name), "%s", ns_ops->name);
+	snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
 	len = strlen(name);
 
 	if (len > buflen)
 		len = buflen;
-	if (copy_to_user(buffer, ns_ops->name, len))
+	if (copy_to_user(buffer, name, len))
 		len = -EFAULT;
 
 	ns_ops->put(ns);
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index f03af702a39..fe771978e87 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -67,6 +67,8 @@ struct ipc_namespace {
 
 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
+
+	unsigned int	proc_inum;
 };
 
 extern struct ipc_namespace init_ipc_ns;
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 4c96acdb248..bf285999273 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -37,6 +37,7 @@ struct pid_namespace {
 	kgid_t pid_gid;
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */
+	unsigned int proc_inum;
 };
 
 extern struct pid_namespace init_pid_ns;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index bf1d000fbba..2e24018b7ce 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -28,7 +28,11 @@ struct mm_struct;
  */
 
 enum {
-	PROC_ROOT_INO = 1,
+	PROC_ROOT_INO		= 1,
+	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
+	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
+	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
+	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
 };
 
 /*
@@ -263,6 +267,7 @@ struct proc_ns_operations {
 	void *(*get)(struct task_struct *task);
 	void (*put)(void *ns);
 	int (*install)(struct nsproxy *nsproxy, void *ns);
+	unsigned int (*inum)(void *ns);
 };
 extern const struct proc_ns_operations netns_operations;
 extern const struct proc_ns_operations utsns_operations;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 17651f08d67..b9bd2e6c73c 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -25,6 +25,7 @@ struct user_namespace {
 	struct user_namespace	*parent;
 	kuid_t			owner;
 	kgid_t			group;
+	unsigned int		proc_inum;
 };
 
 extern struct user_namespace init_user_ns;
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 221f4a0a750..239e27733d6 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -23,6 +23,7 @@ struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
 	struct user_namespace *user_ns;
+	unsigned int proc_inum;
 };
 extern struct uts_namespace init_uts_ns;
 
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index c5a43f56b79..de644bcd861 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -56,6 +56,8 @@ struct net {
 
 	struct user_namespace   *user_ns;	/* Owning user namespace */
 
+	unsigned int		proc_inum;
+
 	struct proc_dir_entry 	*proc_net;
 	struct proc_dir_entry 	*proc_net_stat;
 
diff --git a/init/version.c b/init/version.c
index 86fe0ccb997..58170f18912 100644
--- a/init/version.c
+++ b/init/version.c
@@ -12,6 +12,7 @@
 #include <linux/utsname.h>
 #include <generated/utsrelease.h>
 #include <linux/version.h>
+#include <linux/proc_fs.h>
 
 #ifndef CONFIG_KALLSYMS
 #define version(a) Version_ ## a
@@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = {
 		.domainname	= UTS_DOMAINNAME,
 	},
 	.user_ns = &init_user_ns,
+	.proc_inum = PROC_UTS_INIT_INO,
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);
 
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 26143d377c9..6471f1bdae9 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -16,6 +16,7 @@
 #include <linux/msg.h>
 #include <linux/ipc_namespace.h>
 #include <linux/utsname.h>
+#include <linux/proc_fs.h>
 #include <asm/uaccess.h>
 
 #include "util.h"
@@ -30,6 +31,7 @@ DEFINE_SPINLOCK(mq_lock);
 struct ipc_namespace init_ipc_ns = {
 	.count		= ATOMIC_INIT(1),
 	.user_ns = &init_user_ns,
+	.proc_inum = PROC_IPC_INIT_INO,
 };
 
 atomic_t nr_ipc_ns = ATOMIC_INIT(1);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 72c86827779..cf3386a51de 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (ns == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	err = proc_alloc_inum(&ns->proc_inum);
+	if (err) {
+		kfree(ns);
+		return ERR_PTR(err);
+	}
+
 	atomic_set(&ns->count, 1);
 	err = mq_init_ns(ns);
 	if (err) {
+		proc_free_inum(ns->proc_inum);
 		kfree(ns);
 		return ERR_PTR(err);
 	}
@@ -111,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	 */
 	ipcns_notify(IPCNS_REMOVED);
 	put_user_ns(ns->user_ns);
+	proc_free_inum(ns->proc_inum);
 	kfree(ns);
 }
 
@@ -172,10 +180,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *new)
 	return 0;
 }
 
+static unsigned int ipcns_inum(void *vp)
+{
+	struct ipc_namespace *ns = vp;
+
+	return ns->proc_inum;
+}
+
 const struct proc_ns_operations ipcns_operations = {
 	.name		= "ipc",
 	.type		= CLONE_NEWIPC,
 	.get		= ipcns_get,
 	.put		= ipcns_put,
 	.install	= ipcns_install,
+	.inum		= ipcns_inum,
 };
diff --git a/kernel/pid.c b/kernel/pid.c
index 6e8da291de4..3026ddae0a3 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -80,6 +80,7 @@ struct pid_namespace init_pid_ns = {
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
+	.proc_inum = PROC_PID_INIT_INO,
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 68508d33063..560da0dab23 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -107,6 +107,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	if (ns->pid_cachep == NULL)
 		goto out_free_map;
 
+	err = proc_alloc_inum(&ns->proc_inum);
+	if (err)
+		goto out_free_map;
+
 	kref_init(&ns->kref);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
@@ -133,6 +137,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 {
 	int i;
 
+	proc_free_inum(ns->proc_inum);
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
 	put_user_ns(ns->user_ns);
@@ -345,12 +350,19 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
 	return 0;
 }
 
+static unsigned int pidns_inum(void *ns)
+{
+	struct pid_namespace *pid_ns = ns;
+	return pid_ns->proc_inum;
+}
+
 const struct proc_ns_operations pidns_operations = {
 	.name		= "pid",
 	.type		= CLONE_NEWPID,
 	.get		= pidns_get,
 	.put		= pidns_put,
 	.install	= pidns_install,
+	.inum		= pidns_inum,
 };
 
 static __init int pid_namespaces_init(void)
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9e..33acb5e53a5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
 
 /*
  * userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +52,7 @@ struct user_namespace init_user_ns = {
 	},
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
+	.proc_inum = PROC_USER_INIT_INO,
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 89f6eaed067..f5975ccf934 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -58,6 +58,7 @@ int create_user_ns(struct cred *new)
 	struct user_namespace *ns, *parent_ns = new->user_ns;
 	kuid_t owner = new->euid;
 	kgid_t group = new->egid;
+	int ret;
 
 	/* The creator needs a mapping in the parent user namespace
 	 * or else we won't be able to reasonably tell userspace who
@@ -71,6 +72,12 @@ int create_user_ns(struct cred *new)
 	if (!ns)
 		return -ENOMEM;
 
+	ret = proc_alloc_inum(&ns->proc_inum);
+	if (ret) {
+		kmem_cache_free(user_ns_cachep, ns);
+		return ret;
+	}
+
 	kref_init(&ns->kref);
 	/* Leave the new->user_ns reference with the new user namespace. */
 	ns->parent = parent_ns;
@@ -103,6 +110,7 @@ void free_user_ns(struct kref *kref)
 		container_of(kref, struct user_namespace, kref);
 
 	parent = ns->parent;
+	proc_free_inum(ns->proc_inum);
 	kmem_cache_free(user_ns_cachep, ns);
 	put_user_ns(parent);
 }
@@ -808,12 +816,19 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
 	return commit_creds(cred);
 }
 
+static unsigned int userns_inum(void *ns)
+{
+	struct user_namespace *user_ns = ns;
+	return user_ns->proc_inum;
+}
+
 const struct proc_ns_operations userns_operations = {
 	.name		= "user",
 	.type		= CLONE_NEWUSER,
 	.get		= userns_get,
 	.put		= userns_put,
 	.install	= userns_install,
+	.inum		= userns_inum,
 };
 
 static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fdc619eb61e..f6336d51d64 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -36,11 +36,18 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
 					  struct uts_namespace *old_ns)
 {
 	struct uts_namespace *ns;
+	int err;
 
 	ns = create_uts_ns();
 	if (!ns)
 		return ERR_PTR(-ENOMEM);
 
+	err = proc_alloc_inum(&ns->proc_inum);
+	if (err) {
+		kfree(ns);
+		return ERR_PTR(err);
+	}
+
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
 	ns->user_ns = get_user_ns(user_ns);
@@ -77,6 +84,7 @@ void free_uts_ns(struct kref *kref)
 
 	ns = container_of(kref, struct uts_namespace, kref);
 	put_user_ns(ns->user_ns);
+	proc_free_inum(ns->proc_inum);
 	kfree(ns);
 }
 
@@ -114,11 +122,18 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
 	return 0;
 }
 
+static unsigned int utsns_inum(void *vp)
+{
+	struct uts_namespace *ns = vp;
+
+	return ns->proc_inum;
+}
+
 const struct proc_ns_operations utsns_operations = {
 	.name		= "uts",
 	.type		= CLONE_NEWUTS,
 	.get		= utsns_get,
 	.put		= utsns_put,
 	.install	= utsns_install,
+	.inum		= utsns_inum,
 };
-
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index ec2870b44c1..2e9a3132b8d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -381,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid)
 }
 EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
 
+static __net_init int net_ns_net_init(struct net *net)
+{
+	return proc_alloc_inum(&net->proc_inum);
+}
+
+static __net_exit void net_ns_net_exit(struct net *net)
+{
+	proc_free_inum(net->proc_inum);
+}
+
+static struct pernet_operations __net_initdata net_ns_ops = {
+	.init = net_ns_net_init,
+	.exit = net_ns_net_exit,
+};
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -412,6 +427,8 @@ static int __init net_ns_init(void)
 
 	mutex_unlock(&net_mutex);
 
+	register_pernet_subsys(&net_ns_ops);
+
 	return 0;
 }
 
@@ -640,11 +657,18 @@ static int netns_install(struct nsproxy *nsproxy, void *ns)
 	return 0;
 }
 
+static unsigned int netns_inum(void *ns)
+{
+	struct net *net = ns;
+	return net->proc_inum;
+}
+
 const struct proc_ns_operations netns_operations = {
 	.name		= "net",
 	.type		= CLONE_NEWNET,
 	.get		= netns_get,
 	.put		= netns_put,
 	.install	= netns_install,
+	.inum		= netns_inum,
 };
 #endif
-- 
cgit v1.2.3-70-g09d2