From 8823c079ba7136dc1948d6f6dcb5f8022bde438e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:49:36 -0800 Subject: vfs: Add setns support for the mount namespace setns support for the mount namespace is a little tricky as an arbitrary decision must be made about what to set fs->root and fs->pwd to, as there is no expectation of a relationship between the two mount namespaces. Therefore I arbitrarily find the root mount point, and follow every mount on top of it to find the top of the mount stack. Then I set fs->root and fs->pwd to that location. The topmost root of the mount stack seems like a reasonable place to be. Bind mount support for the mount namespace inodes has the possibility of creating circular dependencies between mount namespaces. Circular dependencies can result in loops that prevent mount namespaces from every being freed. I avoid creating those circular dependencies by adding a sequence number to the mount namespace and require all bind mounts be of a younger mount namespace into an older mount namespace. Add a helper function proc_ns_inode so it is possible to detect when we are attempting to bind mound a namespace inode. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/namespace.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 24960626bb6..d287e7e7464 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,6 +20,7 @@ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ #include +#include #include "pnode.h" #include "internal.h" @@ -1308,6 +1309,26 @@ static int mount_is_safe(struct path *path) #endif } +static bool mnt_ns_loop(struct path *path) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? + */ + struct inode *inode = path->dentry->d_inode; + struct proc_inode *ei; + struct mnt_namespace *mnt_ns; + + if (!proc_ns_inode(inode)) + return false; + + ei = PROC_I(inode); + if (ei->ns_ops != &mntns_operations) + return false; + + mnt_ns = ei->ns; + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; +} + struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { @@ -1655,6 +1676,10 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; + err = -EINVAL; + if (mnt_ns_loop(&old_path)) + goto out; + err = lock_mount(path); if (err) goto out; @@ -2261,6 +2286,15 @@ dput_out: return retval; } +/* + * Assign a sequence number so we can detect when we attempt to bind + * mount a reference to an older mount namespace into the current + * mount namespace, preventing reference counting loops. A 64bit + * number incrementing at 10Ghz will take 12,427 years to wrap which + * is effectively never, so we can ignore the possibility. + */ +static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); + static struct mnt_namespace *alloc_mnt_ns(void) { struct mnt_namespace *new_ns; @@ -2268,6 +2302,7 @@ static struct mnt_namespace *alloc_mnt_ns(void) new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); @@ -2681,3 +2716,63 @@ bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } + +static void *mntns_get(struct task_struct *task) +{ + struct mnt_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->mnt_ns; + get_mnt_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void mntns_put(void *ns) +{ + put_mnt_ns(ns); +} + +static int mntns_install(struct nsproxy *nsproxy, void *ns) +{ + struct fs_struct *fs = current->fs; + struct mnt_namespace *mnt_ns = ns; + struct path root; + + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_CHROOT)) + return -EINVAL; + + if (fs->users != 1) + return -EINVAL; + + get_mnt_ns(mnt_ns); + put_mnt_ns(nsproxy->mnt_ns); + nsproxy->mnt_ns = mnt_ns; + + /* Find the root */ + root.mnt = &mnt_ns->root->mnt; + root.dentry = mnt_ns->root->mnt.mnt_root; + path_get(&root); + while(d_mountpoint(root.dentry) && follow_down_one(&root)) + ; + + /* Update the pwd and root */ + set_fs_pwd(fs, &root); + set_fs_root(fs, &root); + + path_put(&root); + return 0; +} + +const struct proc_ns_operations mntns_operations = { + .name = "mnt", + .type = CLONE_NEWNS, + .get = mntns_get, + .put = mntns_put, + .install = mntns_install, +}; -- cgit v1.2.3-70-g09d2 From 771b1371686e0a63e938ada28de020b9a0040f55 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 21:08:32 -0700 Subject: vfs: Add a user namespace reference from struct mnt_namespace This will allow for support for unprivileged mounts in a new user namespace. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/mount.h | 1 + fs/namespace.c | 24 ++++++++++++++++-------- include/linux/mnt_namespace.h | 3 ++- kernel/nsproxy.c | 2 +- 4 files changed, 20 insertions(+), 10 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index e9c37dd3d00..630fafc616b 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -6,6 +6,7 @@ struct mnt_namespace { atomic_t count; struct mount * root; struct list_head list; + struct user_namespace *user_ns; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; int event; diff --git a/fs/namespace.c b/fs/namespace.c index d287e7e7464..207c7ba84ad 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2286,6 +2287,12 @@ dput_out: return retval; } +static void free_mnt_ns(struct mnt_namespace *ns) +{ + put_user_ns(ns->user_ns); + kfree(ns); +} + /* * Assign a sequence number so we can detect when we attempt to bind * mount a reference to an older mount namespace into the current @@ -2295,7 +2302,7 @@ dput_out: */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); -static struct mnt_namespace *alloc_mnt_ns(void) +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; @@ -2308,6 +2315,7 @@ static struct mnt_namespace *alloc_mnt_ns(void) INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); new_ns->event = 0; + new_ns->user_ns = get_user_ns(user_ns); return new_ns; } @@ -2316,7 +2324,7 @@ static struct mnt_namespace *alloc_mnt_ns(void) * copied from the namespace of the passed in task structure. */ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct fs_struct *fs) + struct user_namespace *user_ns, struct fs_struct *fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; @@ -2324,7 +2332,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, struct mount *old = mnt_ns->root; struct mount *new; - new_ns = alloc_mnt_ns(); + new_ns = alloc_mnt_ns(user_ns); if (IS_ERR(new_ns)) return new_ns; @@ -2333,7 +2341,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); if (IS_ERR(new)) { up_write(&namespace_sem); - kfree(new_ns); + free_mnt_ns(new_ns); return ERR_CAST(new); } new_ns->root = new; @@ -2374,7 +2382,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, } struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, - struct fs_struct *new_fs) + struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; @@ -2384,7 +2392,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (!(flags & CLONE_NEWNS)) return ns; - new_ns = dup_mnt_ns(ns, new_fs); + new_ns = dup_mnt_ns(ns, user_ns, new_fs); put_mnt_ns(ns); return new_ns; @@ -2396,7 +2404,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, */ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) { - struct mnt_namespace *new_ns = alloc_mnt_ns(); + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); if (!IS_ERR(new_ns)) { struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; @@ -2682,7 +2690,7 @@ void put_mnt_ns(struct mnt_namespace *ns) br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); - kfree(ns); + free_mnt_ns(ns); } struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 5a8e3903d77..12b2ab51032 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -4,9 +4,10 @@ struct mnt_namespace; struct fs_struct; +struct user_namespace; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, - struct fs_struct *); + struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); extern const struct file_operations proc_mounts_operations; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b8d4d8709d7..7f8b051fc19 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -66,7 +66,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, task_cred_xxx(tsk, user_ns), new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; -- cgit v1.2.3-70-g09d2 From 7a472ef4be8387bc05a42e16309b02c8ca943a40 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Jul 2012 13:13:04 -0700 Subject: vfs: Only support slave subtrees across different user namespaces Sharing mount subtress with mount namespaces created by unprivileged users allows unprivileged mounts created by unprivileged users to propagate to mount namespaces controlled by privileged users. Prevent nasty consequences by changing shared subtrees to slave subtress when an unprivileged users creates a new mount namespace. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 11 ++++++++--- fs/pnode.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 207c7ba84ad..4dfcaf05d17 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -786,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); - if (flag & (CL_SLAVE | CL_PRIVATE)) + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; @@ -807,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, list_add_tail(&mnt->mnt_instance, &sb->s_mounts); br_write_unlock(&vfsmount_lock); - if (flag & CL_SLAVE) { + if ((flag & CL_SLAVE) || + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); @@ -2331,6 +2332,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, struct mount *p, *q; struct mount *old = mnt_ns->root; struct mount *new; + int copy_flags; new_ns = alloc_mnt_ns(user_ns); if (IS_ERR(new_ns)) @@ -2338,7 +2340,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, down_write(&namespace_sem); /* First pass: copy the tree topology */ - new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); + copy_flags = CL_COPY_ALL | CL_EXPIRE; + if (user_ns != mnt_ns->user_ns) + copy_flags |= CL_SHARED_TO_SLAVE; + new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); free_mnt_ns(new_ns); diff --git a/fs/pnode.h b/fs/pnode.h index 65c60979d54..19b853a3445 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -22,6 +22,7 @@ #define CL_COPY_ALL 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 +#define CL_SHARED_TO_SLAVE 0x20 static inline void set_mnt_shared(struct mount *mnt) { -- cgit v1.2.3-70-g09d2 From 0c55cfc4166d9a0f38de779bd4d75a90afbe7734 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 21:42:03 -0700 Subject: vfs: Allow unprivileged manipulation of the mount namespace. - Add a filesystem flag to mark filesystems that are safe to mount as an unprivileged user. - Add a filesystem flag to mark filesystems that don't need MNT_NODEV when mounted by an unprivileged user. - Relax the permission checks to allow unprivileged users that have CAP_SYS_ADMIN permissions in the user namespace referred to by the current mount namespace to be allowed to mount, unmount, and move filesystems. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 69 ++++++++++++++++++++++++++++++++++-------------------- include/linux/fs.h | 2 ++ 2 files changed, 45 insertions(+), 26 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 4dfcaf05d17..9ddc86f9322 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1269,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) goto dput_and_out; retval = do_umount(mnt, flags); @@ -1295,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static int mount_is_safe(struct path *path) { - if (capable(CAP_SYS_ADMIN)) + if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -1633,7 +1633,7 @@ static int do_change_type(struct path *path, int flag) int type; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (path->dentry != path->mnt->mnt_root) @@ -1797,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct mount *p; struct mount *old; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1884,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return ERR_PTR(err); } -static struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) -{ - struct file_system_type *type = get_fs_type(fstype); - struct vfsmount *mnt; - if (!type) - return ERR_PTR(-ENODEV); - mnt = vfs_kern_mount(type, flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); - put_filesystem(type); - return mnt; -} - /* * add a mount into a namespace's mount tree */ @@ -1944,20 +1929,46 @@ unlock: * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *type, int flags, +static int do_new_mount(struct path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { + struct file_system_type *type; + struct user_namespace *user_ns; struct vfsmount *mnt; int err; - if (!type) + if (!fstype) return -EINVAL; /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) + user_ns = real_mount(path->mnt)->mnt_ns->user_ns; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - mnt = do_kern_mount(type, flags, name, data); + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + if (user_ns != &init_user_ns) { + if (!(type->fs_flags & FS_USERNS_MOUNT)) { + put_filesystem(type); + return -EPERM; + } + /* Only in special cases allow devices from mounts + * created outside the initial user namespace. + */ + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { + flags |= MS_NODEV; + mnt_flags |= MNT_NODEV; + } + } + + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + + put_filesystem(type); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -2549,7 +2560,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, struct mount *new_mnt, *root_mnt; int error; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; error = user_path_dir(new_root, &new); @@ -2631,8 +2642,13 @@ static void __init init_mount_tree(void) struct vfsmount *mnt; struct mnt_namespace *ns; struct path root; + struct file_system_type *type; - mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); + type = get_fs_type("rootfs"); + if (!type) + panic("Can't find rootfs type"); + mnt = vfs_kern_mount(type, 0, "rootfs", NULL); + put_filesystem(type); if (IS_ERR(mnt)) panic("Can't create rootfs"); @@ -2757,7 +2773,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct mnt_namespace *mnt_ns = ns; struct path root; - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_CHROOT)) + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_CHROOT)) return -EINVAL; if (fs->users != 1) diff --git a/include/linux/fs.h b/include/linux/fs.h index b33cfc97b9c..5037aa6817f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1812,6 +1812,8 @@ struct file_system_type { #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 +#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ +#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, -- cgit v1.2.3-70-g09d2 From ae11e0f18482bfe0cd83b9b61434ea7e0bd94e25 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Thu, 13 Sep 2012 16:38:03 +0800 Subject: userns: fix return value on mntns_install() failure Change return value from -EINVAL to -EPERM when the permission check fails. Signed-off-by: Zhao Hongjiang Signed-off-by: Eric W. Biederman --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 9ddc86f9322..cab78a74aca 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2775,7 +2775,7 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !nsown_capable(CAP_SYS_CHROOT)) - return -EINVAL; + return -EPERM; if (fs->users != 1) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 98f842e675f96ffac96e6c50315790912b2812be Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 10:21:48 -0700 Subject: proc: Usable inode numbers for the namespace file descriptors. Assign a unique proc inode to each namespace, and use that inode number to ensure we only allocate at most one proc inode for every namespace in proc. A single proc inode per namespace allows userspace to test to see if two processes are in the same namespace. This has been a long requested feature and only blocked because a naive implementation would put the id in a global space and would ultimately require having a namespace for the names of namespaces, making migration and certain virtualization tricks impossible. We still don't have per superblock inode numbers for proc, which appears necessary for application unaware checkpoint/restart and migrations (if the application is using namespace file descriptors) but that is now allowd by the design if it becomes important. I have preallocated the ipc and uts initial proc inode numbers so their structures can be statically initialized. Signed-off-by: Eric W. Biederman --- fs/mount.h | 1 + fs/namespace.c | 14 ++++++++++++++ fs/proc/namespaces.c | 24 ++++++++++++++---------- include/linux/ipc_namespace.h | 2 ++ include/linux/pid_namespace.h | 1 + include/linux/proc_fs.h | 7 ++++++- include/linux/user_namespace.h | 1 + include/linux/utsname.h | 1 + include/net/net_namespace.h | 2 ++ init/version.c | 2 ++ ipc/msgutil.c | 2 ++ ipc/namespace.c | 16 ++++++++++++++++ kernel/pid.c | 1 + kernel/pid_namespace.c | 12 ++++++++++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 15 +++++++++++++++ kernel/utsname.c | 17 ++++++++++++++++- net/core/net_namespace.c | 24 ++++++++++++++++++++++++ 18 files changed, 132 insertions(+), 12 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index 630fafc616b..cd500798040 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -4,6 +4,7 @@ struct mnt_namespace { atomic_t count; + unsigned int proc_inum; struct mount * root; struct list_head list; struct user_namespace *user_ns; diff --git a/fs/namespace.c b/fs/namespace.c index cab78a74aca..c1bbe86f492 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2301,6 +2301,7 @@ dput_out: static void free_mnt_ns(struct mnt_namespace *ns) { + proc_free_inum(ns->proc_inum); put_user_ns(ns->user_ns); kfree(ns); } @@ -2317,10 +2318,16 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + int ret; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); + ret = proc_alloc_inum(&new_ns->proc_inum); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; @@ -2799,10 +2806,17 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int mntns_inum(void *ns) +{ + struct mnt_namespace *mnt_ns = ns; + return mnt_ns->proc_inum; +} + const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, + .inum = mntns_inum, }; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 7a6d8d69cdb..b7a47196c8c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -82,7 +82,7 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, return ERR_PTR(-ENOMEM); } - inode = new_inode(sb); + inode = iget_locked(sb, ns_ops->inum(ns)); if (!inode) { dput(dentry); ns_ops->put(ns); @@ -90,13 +90,17 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, } ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } d_set_d_op(dentry, &ns_dentry_operations); result = d_instantiate_unique(dentry, inode); @@ -162,12 +166,12 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ns) goto out_put_task; - snprintf(name, sizeof(name), "%s", ns_ops->name); + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); len = strlen(name); if (len > buflen) len = buflen; - if (copy_to_user(buffer, ns_ops->name, len)) + if (copy_to_user(buffer, name, len)) len = -EFAULT; ns_ops->put(ns); diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index f03af702a39..fe771978e87 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -67,6 +67,8 @@ struct ipc_namespace { /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; + + unsigned int proc_inum; }; extern struct ipc_namespace init_ipc_ns; diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 4c96acdb248..bf285999273 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -37,6 +37,7 @@ struct pid_namespace { kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ + unsigned int proc_inum; }; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index bf1d000fbba..2e24018b7ce 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -28,7 +28,11 @@ struct mm_struct; */ enum { - PROC_ROOT_INO = 1, + PROC_ROOT_INO = 1, + PROC_IPC_INIT_INO = 0xEFFFFFFFU, + PROC_UTS_INIT_INO = 0xEFFFFFFEU, + PROC_USER_INIT_INO = 0xEFFFFFFDU, + PROC_PID_INIT_INO = 0xEFFFFFFCU, }; /* @@ -263,6 +267,7 @@ struct proc_ns_operations { void *(*get)(struct task_struct *task); void (*put)(void *ns); int (*install)(struct nsproxy *nsproxy, void *ns); + unsigned int (*inum)(void *ns); }; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 17651f08d67..b9bd2e6c73c 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -25,6 +25,7 @@ struct user_namespace { struct user_namespace *parent; kuid_t owner; kgid_t group; + unsigned int proc_inum; }; extern struct user_namespace init_user_ns; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 221f4a0a750..239e27733d6 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -23,6 +23,7 @@ struct uts_namespace { struct kref kref; struct new_utsname name; struct user_namespace *user_ns; + unsigned int proc_inum; }; extern struct uts_namespace init_uts_ns; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c5a43f56b79..de644bcd861 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -56,6 +56,8 @@ struct net { struct user_namespace *user_ns; /* Owning user namespace */ + unsigned int proc_inum; + struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; diff --git a/init/version.c b/init/version.c index 86fe0ccb997..58170f18912 100644 --- a/init/version.c +++ b/init/version.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_KALLSYMS #define version(a) Version_ ## a @@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, + .proc_inum = PROC_UTS_INIT_INO, }; EXPORT_SYMBOL_GPL(init_uts_ns); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 26143d377c9..6471f1bdae9 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "util.h" @@ -30,6 +31,7 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .count = ATOMIC_INIT(1), .user_ns = &init_user_ns, + .proc_inum = PROC_IPC_INIT_INO, }; atomic_t nr_ipc_ns = ATOMIC_INIT(1); diff --git a/ipc/namespace.c b/ipc/namespace.c index 72c86827779..cf3386a51de 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + atomic_set(&ns->count, 1); err = mq_init_ns(ns); if (err) { + proc_free_inum(ns->proc_inum); kfree(ns); return ERR_PTR(err); } @@ -111,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -172,10 +180,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *new) return 0; } +static unsigned int ipcns_inum(void *vp) +{ + struct ipc_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, + .inum = ipcns_inum, }; diff --git a/kernel/pid.c b/kernel/pid.c index 6e8da291de4..3026ddae0a3 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -80,6 +80,7 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, + .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 68508d33063..560da0dab23 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -107,6 +107,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); @@ -133,6 +137,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); @@ -345,12 +350,19 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .inum = pidns_inum, }; static __init int pid_namespaces_init(void) diff --git a/kernel/user.c b/kernel/user.c index 750acffbe9e..33acb5e53a5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -51,6 +52,7 @@ struct user_namespace init_user_ns = { }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 89f6eaed067..f5975ccf934 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -71,6 +72,12 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + kref_init(&ns->kref); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; @@ -103,6 +110,7 @@ void free_user_ns(struct kref *kref) container_of(kref, struct user_namespace, kref); parent = ns->parent; + proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -808,12 +816,19 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) return commit_creds(cred); } +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, + .inum = userns_inum, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index fdc619eb61e..f6336d51d64 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -36,11 +36,18 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); @@ -77,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -114,11 +122,18 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ec2870b44c1..2e9a3132b8d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -381,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ + return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ + proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { + .init = net_ns_net_init, + .exit = net_ns_net_exit, +}; + static int __init net_ns_init(void) { struct net_generic *ng; @@ -412,6 +427,8 @@ static int __init net_ns_init(void) mutex_unlock(&net_mutex); + register_pernet_subsys(&net_ns_ops); + return 0; } @@ -640,11 +657,18 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int netns_inum(void *ns) +{ + struct net *net = ns; + return net->proc_inum; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .inum = netns_inum, }; #endif -- cgit v1.2.3-70-g09d2 From 5e4a08476b50fa39210fca82e03325cc46b9c235 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 14 Dec 2012 07:55:36 -0800 Subject: userns: Require CAP_SYS_ADMIN for most uses of setns. Andy Lutomirski found a nasty little bug in the permissions of setns. With unprivileged user namespaces it became possible to create new namespaces without privilege. However the setns calls were relaxed to only require CAP_SYS_ADMIN in the user nameapce of the targed namespace. Which made the following nasty sequence possible. pid = clone(CLONE_NEWUSER | CLONE_NEWNS); if (pid == 0) { /* child */ system("mount --bind /home/me/passwd /etc/passwd"); } else if (pid != 0) { /* parent */ char path[PATH_MAX]; snprintf(path, sizeof(path), "/proc/%u/ns/mnt"); fd = open(path, O_RDONLY); setns(fd, 0); system("su -"); } Prevent this possibility by requiring CAP_SYS_ADMIN in the current user namespace when joing all but the user namespace. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 ++- ipc/namespace.c | 3 ++- kernel/pid_namespace.c | 3 ++- kernel/utsname.c | 3 ++- net/core/net_namespace.c | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index c1bbe86f492..398a50ff243 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2781,7 +2781,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_CHROOT)) + !nsown_capable(CAP_SYS_CHROOT) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; if (fs->users != 1) diff --git a/ipc/namespace.c b/ipc/namespace.c index cf3386a51de..7c1fa451b0b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -170,7 +170,8 @@ static void ipcns_put(void *ns) static int ipcns_install(struct nsproxy *nsproxy, void *new) { struct ipc_namespace *ns = new; - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; /* Ditch state from the old ipc namespace */ diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 560da0dab23..fdbd0cdf271 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -325,7 +325,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = ns; - if (!ns_capable(new->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/utsname.c b/kernel/utsname.c index f6336d51d64..08b197e8c48 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -113,7 +113,8 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) { struct uts_namespace *ns = new; - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2e9a3132b8d..8acce01b6da 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -649,7 +649,8 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) { struct net *net = ns; - if (!ns_capable(net->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); -- cgit v1.2.3-70-g09d2 From 1e75529e3c6c18dc535f38454173c4f2dfa99685 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 16 Nov 2012 17:23:50 +0800 Subject: vfs, freeze: use ACCESS_ONCE() to guard access to ->mnt_flags The compiler may optimize the while loop and make the check just be done once, so we should use ACCESS_ONCE() to guard access to ->mnt_flags Signed-off-by: Miao Xie Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 398a50ff243..55605c55278 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -313,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will -- cgit v1.2.3-70-g09d2