From e656d8a6f7fdf7612d2f5771f0ddfca9487f59d9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 10 Jul 2010 14:52:49 -0700 Subject: procfs: Use the proc generic infrastructure for proc/self. I had visions at one point of splitting proc into two filesystems. If that had happened proc/self being the the part of proc that actually deals with pids would have been a nice cleanup. As it is proc/self requires a lot of unnecessary infrastructure for a single file. The only user visible change is that a mounted /proc for a pid namespace that is dead now shows a broken proc symlink, instead of being completely invisible. I don't think anyone will notice or care. Signed-off-by: Eric W. Biederman --- fs/proc/root.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/proc/root.c') diff --git a/fs/proc/root.c b/fs/proc/root.c index 9889a92d2e0..5da984959ed 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -169,6 +169,7 @@ void __init proc_root_init(void) return; } + proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); -- cgit v1.2.3-70-g09d2 From ae06c7c83fc6e97ba247a261921c101960f3d28f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 10 Jul 2010 15:23:34 -0700 Subject: procfs: Don't cache a pid in the root inode. Now that we have s_fs_info pointing to our pid namespace the original reason for the proc root inode having a struct pid is gone. Caching a pid in the root inode has led to some complicated code. Now that we don't need the struct pid, just remove it. Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 11 +---------- fs/proc/root.c | 8 -------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'fs/proc/root.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index cbe454e94af..6177fc238fd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2714,19 +2714,12 @@ static int fake_filldir(void *buf, const char *name, int namelen, /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr; - struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out_no_task; - nr = filp->f_pos - FIRST_PROCESS_ENTRY; - - reaper = get_proc_task(filp->f_path.dentry->d_inode); - if (!reaper) - goto out_no_task; + goto out; ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; @@ -2747,8 +2740,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) } filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; out: - put_task_struct(reaper); -out_no_task: return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 5da984959ed..13ef6247e7a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -100,7 +100,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, int err; struct super_block *sb; struct pid_namespace *ns; - struct proc_inode *ei; char *options; if (flags & MS_KERNMOUNT) { @@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - return dget(sb->s_root); } -- cgit v1.2.3-70-g09d2 From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 14:51:53 -0800 Subject: pidns: Use task_active_pid_ns where appropriate The expressions tsk->nsproxy->pid_ns and task_active_pid_ns aka ns_of_pid(task_pid(tsk)) should have the same number of cache line misses with the practical difference that ns_of_pid(task_pid(tsk)) is released later in a processes life. Furthermore by using task_active_pid_ns it becomes trivial to write an unshare implementation for the the pid namespace. So I have used task_active_pid_ns everywhere I can. In fork since the pid has not yet been attached to the process I use ns_of_pid, to achieve the same effect. Signed-off-by: Eric W. Biederman --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 +- drivers/staging/android/binder.c | 3 ++- fs/hppfs/hppfs.c | 2 +- fs/proc/root.c | 2 +- kernel/cgroup.c | 2 +- kernel/events/core.c | 2 +- kernel/fork.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid.c | 8 ++++---- kernel/signal.c | 2 +- kernel/sysctl_binary.c | 2 +- 12 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs/proc/root.c') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 965d381abd7..25db92a8e1c 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1094,7 +1094,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - current->nsproxy->pid_ns->last_pid); + task_active_pid_ns(current)->last_pid); return 0; } diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 79ccfe6c707..7fc71c62826 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req) void mconsole_proc(struct mc_request *req) { - struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; + struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; char *buf; int len; struct file *file; diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 5d4610babd8..a97bbcd1c9e 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "binder.h" @@ -2344,7 +2345,7 @@ retry: if (t->from) { struct task_struct *sender = t->from->proc->tsk; tr.sender_pid = task_tgid_nr_ns(sender, - current->nsproxy->pid_ns); + task_active_pid_ns(current)); } else { tr.sender_pid = 0; } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 78f21f8dc2e..43b315f2002 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) struct vfsmount *proc_mnt; int err = -ENOENT; - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); if (IS_ERR(proc_mnt)) goto out; diff --git a/fs/proc/root.c b/fs/proc/root.c index 13ef6247e7a..fc1609321a7 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -106,7 +106,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620d..0dbfba2efa7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3390,7 +3390,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134..738f3564e83 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..7798c247f4b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + ns_of_pid(pid)->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ca27d2c5264..acc92680381 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -84,7 +84,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; diff --git a/kernel/pid.c b/kernel/pid.c index 2a624f1486e..3a5f238c1ca 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -429,7 +429,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -484,7 +484,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -495,7 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d..b2445d86f22 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1752,7 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4..5a638445050 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) -- cgit v1.2.3-70-g09d2 From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2012 10:33:47 -0700 Subject: pidns: Make the pidns proc mount/umount logic obvious. Track the number of pids in the proc hash table. When the number of pids goes to 0 schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach pid is alwasy called with the rtnl_lock held free_pid is not allowed to sleep, so the work to unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 4 ---- fs/proc/root.c | 5 ----- include/linux/pid_namespace.h | 2 ++ kernel/fork.c | 2 -- kernel/pid.c | 21 +++++++++++++++++---- kernel/pid_namespace.c | 14 +++++++------- 6 files changed, 26 insertions(+), 22 deletions(-) (limited to 'fs/proc/root.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6177fc238fd..7621dc51cff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, diff --git a/fs/proc/root.c b/fs/proc/root.c index fc1609321a7..f2f251158d3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -155,11 +155,6 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c89c9cfcd24..4c96acdb248 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,6 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -32,6 +33,7 @@ struct pid_namespace { struct bsd_acct_struct *bacct; #endif struct user_namespace *user_ns; + struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ diff --git a/kernel/fork.c b/kernel/fork.c index 7798c247f4b..666dc8b0660 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1476,8 +1476,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) diff --git a/kernel/pid.c b/kernel/pid.c index 3a5f238c1ca..e957f8b0913 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -270,8 +271,12 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + hlist_del_rcu(&upid->pid_chain); + if (--upid->ns->nr_hashed == 0) + schedule_work(&upid->ns->proc_work); + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: @@ -570,6 +582,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b2604950aa5..84591cfeefc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -72,6 +72,12 @@ err_alloc: return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 @@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); - put_user_ns(user_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: -- cgit v1.2.3-70-g09d2 From 4f326c0064b20b78b8041f4d2f6fe188a1129f18 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 27 Jul 2012 05:56:48 -0700 Subject: userns: Allow unprivilged mounts of proc and sysfs - The context in which proc and sysfs are mounted have no effect on the the uid/gid of their files so no conversion is needed except allowing the mount. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/proc/root.c | 1 + fs/sysfs/mount.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs/proc/root.c') diff --git a/fs/proc/root.c b/fs/proc/root.c index f2f251158d3..c6e9fac26ba 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -145,6 +145,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 71eb7e25392..db940a9be04 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) -- cgit v1.2.3-70-g09d2