From 728dba3a39c66b3d8ac889ddbe38b5b1c264aec3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Feb 2014 19:13:49 -0800 Subject: namespaces: Use task_lock and not rcu to protect nsproxy The synchronous syncrhonize_rcu in switch_task_namespaces makes setns a sufficiently expensive system call that people have complained. Upon inspect nsproxy no longer needs rcu protection for remote reads. remote reads are rare. So optimize for same process reads and write by switching using rask_lock instead. This yields a simpler to understand lock, and a faster setns system call. In particular this fixes a performance regression observed by Rafael David Tinoco . This is effectively a revert of Pavel Emelyanov's commit cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter from 2007. The race this originialy fixed no longer exists as do_notify_parent uses task_active_pid_ns(parent) instead of parent->nsproxy. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd88..7187d01329c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2972,13 +2972,13 @@ static void *mntns_get(struct task_struct *task) struct mnt_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->mnt_ns; get_mnt_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } -- cgit v1.2.3-70-g09d2 From a6138db815df5ee542d848318e5dae681590fccd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 16:26:53 -0700 Subject: mnt: Only change user settable mount flags in remount Kenton Varda discovered that by remounting a read-only bind mount read-only in a user namespace the MNT_LOCK_READONLY bit would be cleared, allowing an unprivileged user to the remount a read-only mount read-write. Correct this by replacing the mask of mount flags to preserve with a mask of mount flags that may be changed, and preserve all others. This ensures that any future bugs with this mask and remount will fail in an easy to detect way where new mount flags simply won't change. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 2 +- include/linux/mount.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 7187d01329c..cb40449ea0d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1937,7 +1937,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = do_remount_sb(sb, flags, data, 0); if (!err) { lock_mount_hash(); - mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); unlock_mount_hash(); diff --git a/include/linux/mount.h b/include/linux/mount.h index 839bac27090..b637a89e1fa 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -42,7 +42,9 @@ struct mnt_namespace; * flag, consider how it interacts with shared mounts. */ #define MNT_SHARED_MASK (MNT_UNBINDABLE) -#define MNT_PROPAGATION_MASK (MNT_SHARED | MNT_UNBINDABLE) +#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ + | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ + | MNT_READONLY) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) -- cgit v1.2.3-70-g09d2 From 07b645589dcda8b7a5249e096fece2a67556f0f4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 17:10:56 -0700 Subject: mnt: Move the test for MNT_LOCK_READONLY from change_mount_flags into do_remount There are no races as locked mount flags are guaranteed to never change. Moving the test into do_remount makes it more visible, and ensures all filesystem remounts pass the MNT_LOCK_READONLY permission check. This second case is not an issue today as filesystem remounts are guarded by capable(CAP_DAC_ADMIN) and thus will always fail in less privileged mount namespaces, but it could become an issue in the future. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index cb40449ea0d..1105a577a14 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1896,9 +1896,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; - if (mnt->mnt_flags & MNT_LOCK_READONLY) - return -EPERM; - if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@ -1924,6 +1921,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags, if (path->dentry != path->mnt->mnt_root) return -EINVAL; + /* Don't allow changing of locked mnt flags. + * + * No locks need to be held here while testing the various + * MNT_LOCK flags because those flags can never be cleared + * once they are set. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(mnt_flags & MNT_READONLY)) { + return -EPERM; + } err = security_sb_remount(sb, data); if (err) return err; -- cgit v1.2.3-70-g09d2 From 9566d6742852c527bf5af38af5cbb878dad75705 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 17:26:07 -0700 Subject: mnt: Correct permission checks in do_remount While invesgiating the issue where in "mount --bind -oremount,ro ..." would result in later "mount --bind -oremount,rw" succeeding even if the mount started off locked I realized that there are several additional mount flags that should be locked and are not. In particular MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, and the atime flags in addition to MNT_READONLY should all be locked. These flags are all per superblock, can all be changed with MS_BIND, and should not be changable if set by a more privileged user. The following additions to the current logic are added in this patch. - nosuid may not be clearable by a less privileged user. - nodev may not be clearable by a less privielged user. - noexec may not be clearable by a less privileged user. - atime flags may not be changeable by a less privileged user. The logic with atime is that always setting atime on access is a global policy and backup software and auditing software could break if atime bits are not updated (when they are configured to be updated), and serious performance degradation could result (DOS attack) if atime updates happen when they have been explicitly disabled. Therefore an unprivileged user should not be able to mess with the atime bits set by a more privileged user. The additional restrictions are implemented with the addition of MNT_LOCK_NOSUID, MNT_LOCK_NODEV, MNT_LOCK_NOEXEC, and MNT_LOCK_ATIME mnt flags. Taken together these changes and the fixes for MNT_LOCK_READONLY should make it safe for an unprivileged user to create a user namespace and to call "mount --bind -o remount,... ..." without the danger of mount flags being changed maliciously. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 36 +++++++++++++++++++++++++++++++++--- include/linux/mount.h | 5 +++++ 2 files changed, 38 insertions(+), 3 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 1105a577a14..dd9c93b5a9d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -890,8 +890,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ - if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + if (flag & CL_UNPRIVILEGED) { + mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; + + if (mnt->mnt.mnt_flags & MNT_READONLY) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + if (mnt->mnt.mnt_flags & MNT_NODEV) + mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; + + if (mnt->mnt.mnt_flags & MNT_NOSUID) + mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; + + if (mnt->mnt.mnt_flags & MNT_NOEXEC) + mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; + } /* Don't allow unprivileged users to reveal what is under a mount */ if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) @@ -1931,6 +1944,23 @@ static int do_remount(struct path *path, int flags, int mnt_flags, !(mnt_flags & MNT_READONLY)) { return -EPERM; } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && + !(mnt_flags & MNT_NOEXEC)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { + return -EPERM; + } + err = security_sb_remount(sb, data); if (err) return err; @@ -2129,7 +2159,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, */ if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { flags |= MS_NODEV; - mnt_flags |= MNT_NODEV; + mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } } diff --git a/include/linux/mount.h b/include/linux/mount.h index b637a89e1fa..b0c1e6574e7 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -45,12 +45,17 @@ struct mnt_namespace; #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ | MNT_READONLY) +#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 +#define MNT_LOCK_ATIME 0x040000 +#define MNT_LOCK_NOEXEC 0x080000 +#define MNT_LOCK_NOSUID 0x100000 +#define MNT_LOCK_NODEV 0x200000 #define MNT_LOCK_READONLY 0x400000 #define MNT_LOCKED 0x800000 #define MNT_DOOMED 0x1000000 -- cgit v1.2.3-70-g09d2 From ffbc6f0ead47fa5a1dc9642b0331cb75c20a640e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 17:36:04 -0700 Subject: mnt: Change the default remount atime from relatime to the existing value Since March 2009 the kernel has treated the state that if no MS_..ATIME flags are passed then the kernel defaults to relatime. Defaulting to relatime instead of the existing atime state during a remount is silly, and causes problems in practice for people who don't specify any MS_...ATIME flags and to get the default filesystem atime setting. Those users may encounter a permission error because the default atime setting does not work. A default that does not work and causes permission problems is ridiculous, so preserve the existing value to have a default atime setting that is always guaranteed to work. Using the default atime setting in this way is particularly interesting for applications built to run in restricted userspace environments without /proc mounted, as the existing atime mount options of a filesystem can not be read from /proc/mounts. In practice this fixes user space that uses the default atime setting on remount that are broken by the permission checks keeping less privileged users from changing more privileged users atime settings. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index dd9c93b5a9d..7886176232c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2473,6 +2473,14 @@ long do_mount(const char *dev_name, const char *dir_name, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + /* The default atime for remount is preservation */ + if ((flags & MS_REMOUNT) && + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | + MS_STRICTATIME)) == 0)) { + mnt_flags &= ~MNT_ATIME_MASK; + mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; + } + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); -- cgit v1.2.3-70-g09d2 From 1d023284c31a4e40a94d5bbcb7dbb7a35ee0bcbc Mon Sep 17 00:00:00 2001 From: Ken Helias Date: Wed, 6 Aug 2014 16:09:16 -0700 Subject: list: fix order of arguments for hlist_add_after(_rcu) All other add functions for lists have the new item as first argument and the position where it is added as second argument. This was changed for no good reason in this function and makes using it unnecessary confusing. The name was changed to hlist_add_behind() to cause unconverted code to generate a compile error instead of using the wrong parameter order. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Ken Helias Cc: "Paul E. McKenney" Acked-by: Jeff Kirsher [intel driver bits] Cc: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/whatisRCU.txt | 2 +- drivers/gpu/drm/drm_hashtab.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 2 +- drivers/staging/lustre/lustre/libcfs/hash.c | 4 ++-- fs/namespace.c | 2 +- fs/notify/inode_mark.c | 2 +- fs/notify/vfsmount_mark.c | 2 +- include/linux/list.h | 4 ++-- include/linux/rculist.h | 8 ++++---- net/batman-adv/fragmentation.c | 2 +- net/bridge/br_multicast.c | 2 +- net/ipv4/fib_trie.c | 2 +- net/ipv6/addrlabel.c | 2 +- net/xfrm/xfrm_policy.c | 4 ++-- 15 files changed, 21 insertions(+), 21 deletions(-) (limited to 'fs/namespace.c') diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 49b8551a3b6..e48c57f1943 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -818,7 +818,7 @@ RCU pointer/list update: list_add_tail_rcu list_del_rcu list_replace_rcu - hlist_add_after_rcu + hlist_add_behind_rcu hlist_add_before_rcu hlist_add_head_rcu hlist_del_rcu diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c index 7e4bae760e2..c3b80fd65d6 100644 --- a/drivers/gpu/drm/drm_hashtab.c +++ b/drivers/gpu/drm/drm_hashtab.c @@ -125,7 +125,7 @@ int drm_ht_insert_item(struct drm_open_hash *ht, struct drm_hash_item *item) parent = &entry->head; } if (parent) { - hlist_add_after_rcu(parent, &item->head); + hlist_add_behind_rcu(&item->head, parent); } else { hlist_add_head_rcu(&item->head, h_list); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 681a9e81ff5..e8ba7470700 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1948,7 +1948,7 @@ static int i40e_update_ethtool_fdir_entry(struct i40e_vsi *vsi, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 94a1c07efeb..e4100b5737b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2517,7 +2517,7 @@ static int ixgbe_update_ethtool_fdir_entry(struct ixgbe_adapter *adapter, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &adapter->fdir_filter_list); diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c index 5dde7941829..8ef1deb59d4 100644 --- a/drivers/staging/lustre/lustre/libcfs/hash.c +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@ -351,7 +351,7 @@ cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_t, dh_head); if (dh->dh_tail != NULL) /* not empty */ - hlist_add_after(dh->dh_tail, hnode); + hlist_add_behind(hnode, dh->dh_tail); else /* empty list */ hlist_add_head(hnode, &dh->dh_head); dh->dh_tail = hnode; @@ -406,7 +406,7 @@ cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_dep_t, dd_head); if (dh->dd_tail != NULL) /* not empty */ - hlist_add_after(dh->dd_tail, hnode); + hlist_add_behind(hnode, dh->dd_tail); else /* empty list */ hlist_add_head(hnode, &dh->dd_head); dh->dd_tail = hnode; diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd88..2a1447c946e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,7 +798,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 74825be65b7..9ce062218de 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); + hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 68ca5a8704b..ac851e8376b 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); + hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/include/linux/list.h b/include/linux/list.h index 624ec7f4829..cbbb96fcead 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -654,8 +654,8 @@ static inline void hlist_add_before(struct hlist_node *n, *(n->pprev) = n; } -static inline void hlist_add_after(struct hlist_node *prev, - struct hlist_node *n) +static inline void hlist_add_behind(struct hlist_node *n, + struct hlist_node *prev) { n->next = prev->next; prev->next = n; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8183b46fbaa..372ad5e0dcb 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -432,9 +432,9 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, } /** - * hlist_add_after_rcu - * @prev: the existing element to add the new element after. + * hlist_add_behind_rcu * @n: the new element to add to the hash list. + * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist @@ -449,8 +449,8 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ -static inline void hlist_add_after_rcu(struct hlist_node *prev, - struct hlist_node *n) +static inline void hlist_add_behind_rcu(struct hlist_node *n, + struct hlist_node *prev) { n->next = prev->next; n->pprev = &prev->next; diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 022d18ab27a..52c43f90422 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -188,7 +188,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_last'. */ if (likely(frag_entry_last)) { - hlist_add_after(&frag_entry_last->list, &frag_entry_new->list); + hlist_add_behind(&frag_entry_last->list, &frag_entry_new->list); chain->size += skb->len - hdr_size; chain->timestamp = jiffies; ret = true; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b4845f4b2bb..7751c92c8c5 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1174,7 +1174,7 @@ static void br_multicast_add_router(struct net_bridge *br, } if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5aa4c7..e9cb2588e41 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -940,7 +940,7 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) last = li; } if (last) - hlist_add_after_rcu(&last->hlist, &new->hlist); + hlist_add_behind_rcu(&new->hlist, &last->hlist); else hlist_add_before_rcu(&new->hlist, &li->hlist); } diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 731e1e1722d..fd0dc47f471 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -277,7 +277,7 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) last = p; } if (last) - hlist_add_after_rcu(&last->list, &newp->list); + hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0525d78ba32..beeed602aeb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -389,7 +389,7 @@ redo: if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -654,7 +654,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); -- cgit v1.2.3-70-g09d2 From 215752fce31c80f3b3a1530bc7cddb3ba6a69b3a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 06:23:41 -0400 Subject: acct: get rid of acct_list Put these suckers on per-vfsmount and per-superblock lists instead. Note: right now it's still acct_lock for everything, but that's going to change. Signed-off-by: Al Viro --- fs/mount.h | 1 + fs/namespace.c | 2 +- fs/super.c | 2 +- include/linux/acct.h | 6 +-- include/linux/fs.h | 1 + kernel/acct.c | 135 +++++++++++++++++++++------------------------------ 6 files changed, 62 insertions(+), 85 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index d55297f2fa0..0a2d1458681 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -56,6 +56,7 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; + struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd88..22e530addfa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -956,7 +956,7 @@ put_again: mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); - acct_auto_close_mnt(&mnt->mnt); + acct_auto_close_mnt(&mnt->mnt_pins); goto put_again; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { diff --git a/fs/super.c b/fs/super.c index d20d5b11ded..52ed93eb63d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -703,7 +703,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) #endif if (flags & MS_RDONLY) - acct_auto_close(sb); + acct_auto_close(&sb->s_pins); shrink_dcache_sb(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); diff --git a/include/linux/acct.h b/include/linux/acct.h index 4a5b7cb5607..65a4f889182 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -24,14 +24,14 @@ struct super_block; struct pacct_struct; struct pid_namespace; extern int acct_parm[]; /* for sysctl */ -extern void acct_auto_close_mnt(struct vfsmount *m); -extern void acct_auto_close(struct super_block *sb); +extern void acct_auto_close(struct hlist_head *); +extern void acct_auto_close_mnt(struct hlist_head *); extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); #else -#define acct_auto_close_mnt(x) do { } while (0) #define acct_auto_close(x) do { } while (0) +#define acct_auto_close_mnt(x) do { } while (0) #define acct_collect(x,y) do { } while (0) #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b7d57cf786..17f70872a4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1250,6 +1250,7 @@ struct super_block { /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; + struct hlist_head s_pins; /* * Keep the lru lists last in the structure so they always sit on their diff --git a/kernel/acct.c b/kernel/acct.c index 019f012a3c6..21fbb3c27c2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,6 +59,7 @@ #include #include /* sector_div */ #include +#include <../fs/mount.h> /* will go away when we refactor */ /* * These constants control the amount of freespace that suspend and @@ -79,16 +80,16 @@ static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { long count; + struct hlist_node s_list; + struct hlist_node m_list; struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct list_head list; }; static DEFINE_SPINLOCK(acct_lock); -static LIST_HEAD(acct_list); /* * Check the amount of free space and suspend/resume accordingly. @@ -133,25 +134,33 @@ static void acct_put(struct bsd_acct_struct *p) spin_unlock(&acct_lock); } -static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p) +static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) +{ + res->count++; + spin_unlock(&acct_lock); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + spin_lock(&acct_lock); + if (!--res->count) + kfree(res); + return NULL; + } + return res; +} + +static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; spin_lock(&acct_lock); again: - res = *p; - if (res) - res->count++; - spin_unlock(&acct_lock); - if (res) { - mutex_lock(&res->lock); - if (!res->ns) { - mutex_unlock(&res->lock); - spin_lock(&acct_lock); - if (!--res->count) - kfree(res); - goto again; - } + if (!ns->bacct) { + spin_unlock(&acct_lock); + return NULL; } + res = __acct_get(ns->bacct); + if (!res) + goto again; return res; } @@ -162,7 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct, struct file *file = acct->file; struct pid_namespace *ns = acct->ns; spin_lock(&acct_lock); - list_del(&acct->list); + hlist_del(&acct->m_list); + hlist_del(&acct->s_list); mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); do_acct_process(acct); @@ -170,8 +180,10 @@ static void acct_kill(struct bsd_acct_struct *acct, spin_lock(&acct_lock); ns->bacct = new; if (new) { - mnt_pin(new->file->f_path.mnt); - list_add(&new->list, &acct_list); + struct vfsmount *m = new->file->f_path.mnt; + mnt_pin(m); + hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); } acct->ns = NULL; mutex_unlock(&acct->lock); @@ -218,14 +230,15 @@ static int acct_on(struct filename *pathname) mutex_init(&acct->lock); mnt = file->f_path.mnt; - old = acct_get(&ns->bacct); + old = acct_get(ns); if (old) { acct_kill(old, acct); } else { spin_lock(&acct_lock); ns->bacct = acct; mnt_pin(mnt); - list_add(&acct->list, &acct_list); + hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); + hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); } mntput(mnt); /* it's pinned, now give up active reference */ @@ -261,79 +274,41 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL); + acct_kill(acct_get(task_active_pid_ns(current)), NULL); } return error; } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) +void acct_auto_close_mnt(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.mnt == m) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + m_list)), NULL); + } spin_unlock(&acct_lock); } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. - */ -void acct_auto_close(struct super_block *sb) +void acct_auto_close(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.dentry->d_sb == sb) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + s_list)), NULL); + } spin_unlock(&acct_lock); } void acct_exit_ns(struct pid_namespace *ns) { - acct_kill(acct_get(&ns->bacct), NULL); + acct_kill(acct_get(ns), NULL); } /* @@ -602,7 +577,7 @@ void acct_collect(long exitcode, int group_dead) static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { - struct bsd_acct_struct *acct = acct_get(&ns->bacct); + struct bsd_acct_struct *acct = acct_get(ns); if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); -- cgit v1.2.3-70-g09d2 From 8fa1f1c2bd86007beb4a4845e6087ac4a704dc80 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 May 2014 18:22:52 -0400 Subject: make fs/{namespace,super}.c forget about acct.h These externs belong in fs/internal.h. Rename (they are not acct-specific anymore) and move them over there. Signed-off-by: Al Viro --- fs/fs_pin.c | 9 +++++---- fs/internal.h | 6 ++++++ fs/namespace.c | 3 +-- fs/super.c | 3 +-- include/linux/acct.h | 2 -- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/fs_pin.c b/fs/fs_pin.c index f3ce0b874a4..9368236ca10 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -1,6 +1,7 @@ #include #include #include +#include "internal.h" #include "mount.h" static void pin_free_rcu(struct rcu_head *head) @@ -32,13 +33,13 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) spin_unlock(&pin_lock); } -void acct_auto_close_mnt(struct hlist_head *list) +void mnt_pin_kill(struct mount *m) { while (1) { struct hlist_node *p; struct fs_pin *pin; rcu_read_lock(); - p = ACCESS_ONCE(list->first); + p = ACCESS_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; @@ -54,13 +55,13 @@ void acct_auto_close_mnt(struct hlist_head *list) } } -void acct_auto_close(struct hlist_head *list) +void sb_pin_kill(struct super_block *sb) { while (1) { struct hlist_node *p; struct fs_pin *pin; rcu_read_lock(); - p = ACCESS_ONCE(list->first); + p = ACCESS_ONCE(sb->s_pins.first); if (!p) { rcu_read_unlock(); break; diff --git a/fs/internal.h b/fs/internal.h index 9a2edba87c2..e325b4f9c79 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -143,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, * pipe.c */ extern const struct file_operations pipefifo_fops; + +/* + * fs_pin.c + */ +extern void sb_pin_kill(struct super_block *sb); +extern void mnt_pin_kill(struct mount *m); diff --git a/fs/namespace.c b/fs/namespace.c index 22e530addfa..0e4ce51c527 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -16,7 +16,6 @@ #include #include #include -#include /* acct_auto_close_mnt */ #include /* init_rootfs */ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ @@ -956,7 +955,7 @@ put_again: mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); - acct_auto_close_mnt(&mnt->mnt_pins); + mnt_pin_kill(mnt); goto put_again; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { diff --git a/fs/super.c b/fs/super.c index a369f8964dc..a371ce6aa91 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -707,7 +706,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (remount_ro) { if (sb->s_pins.first) { up_write(&sb->s_umount); - acct_auto_close(&sb->s_pins); + sb_pin_kill(sb); down_write(&sb->s_umount); if (!sb->s_root) return 0; diff --git a/include/linux/acct.h b/include/linux/acct.h index 137837929db..dccc2d4fe7d 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -32,8 +32,6 @@ extern void acct_exit_ns(struct pid_namespace *); #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) #endif -extern void acct_auto_close(struct hlist_head *); -extern void acct_auto_close_mnt(struct hlist_head *); /* * ACCT_VERSION numbers as yet defined: -- cgit v1.2.3-70-g09d2 From 3064c3563ba4c23e2c7a47254ec056ed9ba0098a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 09:12:31 -0400 Subject: death to mnt_pinned Rather than playing silly buggers with vfsmount refcounts, just have acct_on() ask fs/namespace.c for internal clone of file->f_path.mnt and replace it with said clone. Then attach the pin to original vfsmount. Voila - the clone will be alive until the file gets closed, making sure that underlying superblock remains active, etc., and we can drop the original vfsmount, so that it's not kept busy. If the file lives until the final mntput of the original vfsmount, we'll notice that there's an fs_pin (one in bsd_acct_struct that holds that file) and mnt_pin_kill() will take it out. Since ->kill() is synchronous, we won't proceed past that point until these files are closed (and private clones of our vfsmount are gone), so we get the same ordering warranties we used to get. mnt_pin()/mnt_unpin()/->mnt_pinned is gone now, and good riddance - it never became usable outside of kernel/acct.c (and racy wrt umount even there). Signed-off-by: Al Viro --- fs/mount.h | 1 - fs/namespace.c | 35 +++++++++-------------------------- include/linux/mount.h | 4 ++-- kernel/acct.c | 24 +++++++++++++++++++----- 4 files changed, 30 insertions(+), 34 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index 0a2d1458681..6740a621552 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -55,7 +55,6 @@ struct mount { int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ - int mnt_pinned; struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/namespace.c b/fs/namespace.c index 0e4ce51c527..65af9d0e0d6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -937,7 +937,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, static void mntput_no_expire(struct mount *mnt) { -put_again: rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@ -950,14 +949,6 @@ put_again: unlock_mount_hash(); return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - rcu_read_unlock(); - unlock_mount_hash(); - mnt_pin_kill(mnt); - goto put_again; - } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); @@ -980,6 +971,8 @@ put_again: * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); @@ -1007,25 +1000,15 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mnt_pin(struct vfsmount *mnt) +struct vfsmount *mnt_clone_internal(struct path *path) { - lock_mount_hash(); - real_mount(mnt)->mnt_pinned++; - unlock_mount_hash(); -} -EXPORT_SYMBOL(mnt_pin); - -void mnt_unpin(struct vfsmount *m) -{ - struct mount *mnt = real_mount(m); - lock_mount_hash(); - if (mnt->mnt_pinned) { - mnt_add_count(mnt, 1); - mnt->mnt_pinned--; - } - unlock_mount_hash(); + struct mount *p; + p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); + if (IS_ERR(p)) + return ERR_CAST(p); + p->mnt.mnt_flags |= MNT_INTERNAL; + return &p->mnt; } -EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) { diff --git a/include/linux/mount.h b/include/linux/mount.h index 839bac27090..864b120c134 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -62,6 +62,7 @@ struct vfsmount { }; struct file; /* forward dec */ +struct path; extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); @@ -70,8 +71,7 @@ extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); -extern void mnt_pin(struct vfsmount *mnt); -extern void mnt_unpin(struct vfsmount *mnt); +extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); struct file_system_type; diff --git a/kernel/acct.c b/kernel/acct.c index a7993a6cb60..2e6cf818021 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -154,7 +154,6 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; - mnt_unpin(file->f_path.mnt); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -196,9 +195,10 @@ static void acct_pin_kill(struct fs_pin *pin) static int acct_on(struct filename *pathname) { struct file *file; - struct vfsmount *mnt; + struct vfsmount *mnt, *internal; struct pid_namespace *ns = task_active_pid_ns(current); struct bsd_acct_struct *acct, *old; + int err; acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); if (!acct) @@ -222,6 +222,21 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return -EIO; } + internal = mnt_clone_internal(&file->f_path); + if (IS_ERR(internal)) { + kfree(acct); + filp_close(file, NULL); + return PTR_ERR(internal); + } + err = mnt_want_write(internal); + if (err) { + mntput(internal); + kfree(acct); + filp_close(file, NULL); + return err; + } + mnt = file->f_path.mnt; + file->f_path.mnt = internal; atomic_long_set(&acct->pin.count, 1); acct->pin.kill = acct_pin_kill; @@ -229,8 +244,6 @@ static int acct_on(struct filename *pathname) acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); - mnt = file->f_path.mnt; - mnt_pin(mnt); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ pin_insert(&acct->pin, mnt); @@ -240,7 +253,8 @@ static int acct_on(struct filename *pathname) else ns->bacct = acct; mutex_unlock(&acct->lock); - mntput(mnt); /* it's pinned, now give up active reference */ + mnt_drop_write(mnt); + mntput(mnt); return 0; } -- cgit v1.2.3-70-g09d2 From 12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Aug 2014 03:44:55 -0400 Subject: fix copy_tree() regression Since 3.14 we had copy_tree() get the shadowing wrong - if we had one vfsmount shadowing another (i.e. if A is a slave of B, C is mounted on A/foo, then D got mounted on B/foo creating D' on A/foo shadowed by C), copy_tree() of A would make a copy of D' shadow the the copy of C, not the other way around. It's easy to fix, fortunately - just make sure that mount follows the one that shadows it in mnt_child as well as in mnt_hash, and when copy_tree() decides to attach a new mount, check if the last child it has added to the same parent should be shadowing the new one. And if it should, just use the same logics commit_tree() has - put the new mount into the hash and children lists right after the one that should shadow it. Cc: stable@vger.kernel.org [3.14 and later] Signed-off-by: Al Viro --- fs/namespace.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 65af9d0e0d6..be3f6f23a47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -778,6 +778,20 @@ static void attach_mnt(struct mount *mnt, list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } +static void attach_shadowed(struct mount *mnt, + struct mount *parent, + struct mount *shadows) +{ + if (shadows) { + hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + list_add(&mnt->mnt_child, &shadows->mnt_child); + } else { + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } +} + /* * vfsmount lock must be held for write */ @@ -796,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); - if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); - else - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -1474,6 +1483,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { + struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1495,7 +1505,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + mnt_set_mountpoint(parent, p->mnt_mp, q); + if (!list_empty(&parent->mnt_mounts)) { + t = list_last_entry(&parent->mnt_mounts, + struct mount, mnt_child); + if (t->mnt_mp != p->mnt_mp) + t = NULL; + } + attach_shadowed(q, parent, t); unlock_mount_hash(); } } -- cgit v1.2.3-70-g09d2 From 88b368f27a094277143d8ecd5a056116f6a41520 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 18 Aug 2014 15:09:26 -0400 Subject: get rid of propagate_umount() mistakenly treating slaves as busy. The check in __propagate_umount() ("has somebody explicitly mounted something on that slave?") is done *before* taking the already doomed victims out of the child lists. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namespace.c | 4 +++- fs/pnode.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index a01c7730e9a..3273177873f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1253,6 +1253,9 @@ void umount_tree(struct mount *mnt, int how) hlist_add_head(&p->mnt_hash, &tmp_list); } + hlist_for_each_entry(p, &tmp_list, mnt_hash) + list_del_init(&p->mnt_child); + if (how) propagate_umount(&tmp_list); @@ -1263,7 +1266,6 @@ void umount_tree(struct mount *mnt, int how) p->mnt_ns = NULL; if (how < 2) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { put_mountpoint(p->mnt_mp); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ diff --git a/fs/pnode.c b/fs/pnode.c index 302bf22c4a3..aae331a5d03 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt) * other children */ if (child && list_empty(&child->mnt_mounts)) { + list_del_init(&child->mnt_child); hlist_del_init_rcu(&child->mnt_hash); hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); } -- cgit v1.2.3-70-g09d2 From 81b6b06197606b4bef4e427a197aeb808e8d89e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Aug 2014 18:32:05 -0400 Subject: fix EBUSY on umount() from MNT_SHRINKABLE We need the parents of victims alive until namespace_unlock() gets to dput() of the (ex-)mountpoints. However, that screws up the "is it busy" checks in case when we have shrinkable mounts that need to be killed. Solution: go ahead and decrement refcounts of parents right in umount_tree(), increment them again just before dropping rwsem in namespace_unlock() (and let the loop in the end of namespace_unlock() finally drop those references for good, as we do now). Parents can't get freed until we drop rwsem - at least one reference is kept until then, both in case when parent is among the victims and when it is not. So they'll still be around when we get to namespace_unlock(). Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Al Viro --- fs/namespace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 3273177873f..ef42d9bee21 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1217,6 +1217,11 @@ static void namespace_unlock(void) head.first->pprev = &head.first; INIT_HLIST_HEAD(&unmounted); + /* undo decrements we'd done in umount_tree() */ + hlist_for_each_entry(mnt, &head, mnt_hash) + if (mnt->mnt_ex_mountpoint.mnt) + mntget(mnt->mnt_ex_mountpoint.mnt); + up_write(&namespace_sem); synchronize_rcu(); @@ -1268,6 +1273,7 @@ void umount_tree(struct mount *mnt, int how) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; if (mnt_has_parent(p)) { put_mountpoint(p->mnt_mp); + mnt_add_count(p->mnt_parent, -1); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; -- cgit v1.2.3-70-g09d2 From a1480dcc3c706e309a88884723446f2e84fedd5b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 8 Oct 2014 12:32:47 -0700 Subject: fs: Add a missing permission check to do_umount Accessing do_remount_sb should require global CAP_SYS_ADMIN, but only one of the two call sites was appropriately protected. Fixes CVE-2014-7975. Signed-off-by: Andy Lutomirski --- fs/namespace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index ef42d9bee21..7f67b463a5b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1356,6 +1356,8 @@ static int do_umount(struct mount *mnt, int flags) * Special case for "unmounting" root ... * we just try to remount it readonly. */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; down_write(&sb->s_umount); if (!(sb->s_flags & MS_RDONLY)) retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); -- cgit v1.2.3-70-g09d2 From 9ea459e110df32e60a762f311f7939eaa879601d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Aug 2014 13:08:20 -0400 Subject: delayed mntput On final mntput() we want fs shutdown to happen before return to userland; however, the only case where we want it happen right there (i.e. where task_work_add won't do) is MNT_INTERNAL victim. Those have to be fully synchronous - failure halfway through module init might count on having vfsmount killed right there. Fortunately, final mntput on MNT_INTERNAL vfsmounts happens on shallow stack. So we handle those synchronously and do an analog of delayed fput logics for everything else. As the result, we are guaranteed that fs shutdown will always happen on shallow stack. Signed-off-by: Al Viro --- fs/mount.h | 5 ++++- fs/namespace.c | 71 +++++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 57 insertions(+), 19 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index 6740a621552..8f2a14ae38a 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -29,7 +29,10 @@ struct mount { struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; - struct rcu_head mnt_rcu; + union { + struct rcu_head mnt_rcu; + struct llist_node mnt_llist; + }; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; #else diff --git a/fs/namespace.c b/fs/namespace.c index ef42d9bee21..044134315f9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "pnode.h" #include "internal.h" @@ -957,6 +958,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return ERR_PTR(err); } +static void cleanup_mnt(struct mount *mnt) +{ + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + /* + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. + */ + WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void __cleanup_mnt(struct rcu_head *head) +{ + cleanup_mnt(container_of(head, struct mount, mnt_rcu)); +} + +static LLIST_HEAD(delayed_mntput_list); +static void delayed_mntput(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&delayed_mntput_list); + struct llist_node *next; + + for (; node; node = next) { + next = llist_next(node); + cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); + } +} +static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); + static void mntput_no_expire(struct mount *mnt) { rcu_read_lock(); @@ -982,24 +1023,18 @@ static void mntput_no_expire(struct mount *mnt) list_del(&mnt->mnt_instance); unlock_mount_hash(); - /* - * This probably indicates that somebody messed - * up a mnt_want/drop_write() pair. If this - * happens, the filesystem was probably unable - * to make r/w->r/o transitions. - */ - /* - * The locking used to deal with mnt_count decrement provides barriers, - * so mnt_get_writers() below is safe. - */ - WARN_ON(mnt_get_writers(mnt)); - if (unlikely(mnt->mnt_pins.first)) - mnt_pin_kill(mnt); - fsnotify_vfsmount_delete(&mnt->mnt); - dput(mnt->mnt.mnt_root); - deactivate_super(mnt->mnt.mnt_sb); - mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); + if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { + struct task_struct *task = current; + if (likely(!(task->flags & PF_KTHREAD))) { + init_task_work(&mnt->mnt_rcu, __cleanup_mnt); + if (!task_work_add(task, &mnt->mnt_rcu, true)) + return; + } + if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) + schedule_delayed_work(&delayed_mntput_work, 1); + return; + } + cleanup_mnt(mnt); } void mntput(struct vfsmount *mnt) -- cgit v1.2.3-70-g09d2 From 7af1364ffa64db61e386628594836e13d2ef04b5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 4 Oct 2013 19:15:13 -0700 Subject: vfs: Don't allow overwriting mounts in the current mount namespace In preparation for allowing mountpoints to be renamed and unlinked in remote filesystems and in other mount namespaces test if on a dentry there is a mount in the local mount namespace before allowing it to be renamed or unlinked. The primary motivation here are old versions of fusermount unmount which is not safe if the a path can be renamed or unlinked while it is verifying the mount is safe to unmount. More recent versions are simpler and safer by simply using UMOUNT_NOFOLLOW when unmounting a mount in a directory owned by an arbitrary user. Miklos Szeredi reports this is approach is good enough to remove concerns about new kernels mixed with old versions of fusermount. A secondary motivation for restrictions here is that it removing empty directories that have non-empty mount points on them appears to violate the rule that rmdir can not remove empty directories. As Linus Torvalds pointed out this is useful for programs (like git) that test if a directory is empty with rmdir. Therefore this patch arranges to enforce the existing mount point semantics for local mount namespace. v2: Rewrote the test to be a drop in replacement for d_mountpoint v3: Use bool instead of int as the return type of is_local_mountpoint Reviewed-by: Miklos Szeredi Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/mount.h | 9 +++++++++ fs/namei.c | 6 +++++- fs/namespace.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index 8f2a14ae38a..8c6a2a65125 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -115,3 +115,12 @@ struct proc_mounts { #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) extern const struct seq_operations mounts_op; + +extern bool __is_local_mountpoint(struct dentry *dentry); +static inline bool is_local_mountpoint(struct dentry *dentry) +{ + if (!d_mountpoint(dentry)) + return false; + + return __is_local_mountpoint(dentry); +} diff --git a/fs/namei.c b/fs/namei.c index a7b05bf82d3..a3a14b033b0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3565,6 +3565,8 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) mutex_lock(&dentry->d_inode->i_mutex); error = -EBUSY; + if (is_local_mountpoint(dentry)) + goto out; if (d_mountpoint(dentry)) goto out; @@ -3681,7 +3683,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate return -EPERM; mutex_lock(&target->i_mutex); - if (d_mountpoint(dentry)) + if (is_local_mountpoint(dentry) || d_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); @@ -4126,6 +4128,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, mutex_lock(&target->i_mutex); error = -EBUSY; + if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) + goto out; if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; diff --git a/fs/namespace.c b/fs/namespace.c index 044134315f9..77ffdb82f63 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -667,6 +667,41 @@ struct vfsmount *lookup_mnt(struct path *path) return m; } +/* + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the + * current mount namespace. + * + * The common case is dentries are not mountpoints at all and that + * test is handled inline. For the slow case when we are actually + * dealing with a mountpoint of some kind, walk through all of the + * mounts in the current mount namespace and test to see if the dentry + * is a mountpoint. + * + * The mount_hashtable is not usable in the context because we + * need to identify all mounts that may be in the current mount + * namespace not just a mount that happens to have some specified + * parent mount. + */ +bool __is_local_mountpoint(struct dentry *dentry) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool is_covered = false; + + if (!d_mountpoint(dentry)) + goto out; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + is_covered = (mnt->mnt_mountpoint == dentry); + if (is_covered) + break; + } + up_read(&namespace_sem); +out: + return is_covered; +} + static struct mountpoint *new_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); -- cgit v1.2.3-70-g09d2 From 0a5eb7c8189922e86a840972cd0b57e41de6f031 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Sep 2013 19:37:01 -0700 Subject: vfs: Keep a list of mounts on a mount point To spot any possible problems call BUG if a mountpoint is put when it's list of mounts is not empty. AV: use hlist instead of list_head Reviewed-by: Miklos Szeredi Signed-off-by: Eric W. Biederman Signed-off-by: Al Viro --- fs/mount.h | 2 ++ fs/namespace.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index 8c6a2a65125..68bb03ed7f1 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -21,6 +21,7 @@ struct mnt_pcp { struct mountpoint { struct hlist_node m_hash; struct dentry *m_dentry; + struct hlist_head m_list; int m_count; }; @@ -51,6 +52,7 @@ struct mount { struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ + struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/namespace.c b/fs/namespace.c index 77ffdb82f63..99572dd0833 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -225,6 +225,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_HLIST_NODE(&mnt->mnt_mp_list); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@ -731,6 +732,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) mp->m_dentry = dentry; mp->m_count = 1; hlist_add_head(&mp->m_hash, chain); + INIT_HLIST_HEAD(&mp->m_list); return mp; } @@ -738,6 +740,7 @@ static void put_mountpoint(struct mountpoint *mp) { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; + BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@ -784,6 +787,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); + hlist_del_init(&mnt->mnt_mp_list); put_mountpoint(mnt->mnt_mp); mnt->mnt_mp = NULL; } @@ -800,6 +804,7 @@ void mnt_set_mountpoint(struct mount *mnt, child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; + hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } /* @@ -1342,6 +1347,7 @@ void umount_tree(struct mount *mnt, int how) if (how < 2) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; if (mnt_has_parent(p)) { + hlist_del_init(&p->mnt_mp_list); put_mountpoint(p->mnt_mp); mnt_add_count(p->mnt_parent, -1); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ -- cgit v1.2.3-70-g09d2 From e2dfa935464272395b4f35f4cc74ffcc87418b84 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 24 Feb 2014 17:32:34 -0800 Subject: vfs: factor out lookup_mountpoint from new_mountpoint I am shortly going to add a new user of struct mountpoint that needs to look up existing entries but does not want to create a struct mountpoint if one does not exist. Therefore to keep the code simple and easy to read split out lookup_mountpoint from new_mountpoint. Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/namespace.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 99572dd0833..88fc3f4d77e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -703,11 +703,10 @@ out: return is_covered; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; - int ret; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { @@ -718,6 +717,14 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) return mp; } } + return NULL; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret; mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!mp) @@ -1818,7 +1825,9 @@ retry: namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = new_mountpoint(dentry); + struct mountpoint *mp = lookup_mountpoint(dentry); + if (!mp) + mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); mutex_unlock(&dentry->d_inode->i_mutex); -- cgit v1.2.3-70-g09d2 From 80b5dce8c59b0de1ed6e403b8298e02dcb4db64b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Oct 2013 01:31:18 -0700 Subject: vfs: Add a function to lazily unmount all mounts from any dentry. The new function detach_mounts comes in two pieces. The first piece is a static inline test of d_mounpoint that returns immediately without taking any locks if d_mounpoint is not set. In the common case when mountpoints are absent this allows the vfs to continue running with it's same cacheline foot print. The second piece of detach_mounts __detach_mounts actually does the work and it assumes that a mountpoint is present so it is slow and takes namespace_sem for write, and then locks the mount hash (aka mount_lock) after a struct mountpoint has been found. With those two locks held each entry on the list of mounts on a mountpoint is selected and lazily unmounted until all of the mount have been lazily unmounted. v7: Wrote a proper change description and removed the changelog documenting deleted wrong turns. Signed-off-by: Eric W. Biederman Signed-off-by: Al Viro --- fs/mount.h | 9 +++++++++ fs/namespace.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index 68bb03ed7f1..f82c6284090 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -87,6 +87,15 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern bool legitimize_mnt(struct vfsmount *, unsigned); +extern void __detach_mounts(struct dentry *dentry); + +static inline void detach_mounts(struct dentry *dentry) +{ + if (!d_mountpoint(dentry)) + return; + __detach_mounts(dentry); +} + static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); diff --git a/fs/namespace.c b/fs/namespace.c index 88fc3f4d77e..00e5b1efa59 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1468,6 +1468,37 @@ static int do_umount(struct mount *mnt, int flags) return retval; } +/* + * __detach_mounts - lazily unmount all mounts on the specified dentry + * + * During unlink, rmdir, and d_drop it is possible to loose the path + * to an existing mountpoint, and wind up leaking the mount. + * detach_mounts allows lazily unmounting those mounts instead of + * leaking them. + * + * The caller may hold dentry->d_inode->i_mutex. + */ +void __detach_mounts(struct dentry *dentry) +{ + struct mountpoint *mp; + struct mount *mnt; + + namespace_lock(); + mp = lookup_mountpoint(dentry); + if (!mp) + goto out_unlock; + + lock_mount_hash(); + while (!hlist_empty(&mp->m_list)) { + mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); + umount_tree(mnt, 2); + } + unlock_mount_hash(); + put_mountpoint(mp); +out_unlock: + namespace_unlock(); +} + /* * Is the caller allowed to modify his namespace? */ -- cgit v1.2.3-70-g09d2 From b8850d1fa8e2f6653e57daf6d08e58c5f5eb2c85 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Thu, 28 Aug 2014 11:26:03 -0600 Subject: fs: namespace: suppress 'may be used uninitialized' warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gcc version 4.9.1 compiler complains Even though it isn't possible for these variables to not get initialized before they are used. fs/namespace.c: In function ‘SyS_mount’: fs/namespace.c:2720:8: warning: ‘kernel_dev’ may be used uninitialized in this function [-Wmaybe-uninitialized] ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, ^ fs/namespace.c:2699:8: note: ‘kernel_dev’ was declared here char *kernel_dev; ^ fs/namespace.c:2720:8: warning: ‘kernel_type’ may be used uninitialized in this function [-Wmaybe-uninitialized] ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, ^ fs/namespace.c:2697:8: note: ‘kernel_type’ was declared here char *kernel_type; ^ Fix the warnings by simplifying copy_mount_string() as suggested by Al Viro. Cc: Alexander Viro Signed-off-by: Tim Gardner Signed-off-by: Al Viro --- fs/compat.c | 10 ++++++---- fs/internal.h | 2 +- fs/namespace.c | 26 ++++++++------------------ 3 files changed, 15 insertions(+), 23 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/compat.c b/fs/compat.c index 66d3d3c6b4b..6205c247a6e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -797,8 +797,9 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, struct filename *dir; int retval; - retval = copy_mount_string(type, &kernel_type); - if (retval < 0) + kernel_type = copy_mount_string(type); + retval = PTR_ERR(kernel_type); + if (IS_ERR(kernel_type)) goto out; dir = getname(dir_name); @@ -806,8 +807,9 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, if (IS_ERR(dir)) goto out1; - retval = copy_mount_string(dev_name, &kernel_dev); - if (retval < 0) + kernel_dev = copy_mount_string(dev_name); + retval = PTR_ERR(kernel_dev); + if (IS_ERR(kernel_dev)) goto out2; retval = copy_mount_options(data, &data_page); diff --git a/fs/internal.h b/fs/internal.h index e325b4f9c79..bd4ac19f4d8 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -51,7 +51,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); -extern int copy_mount_string(const void __user *, char **); +extern char *copy_mount_string(const void __user *); extern struct vfsmount *lookup_mnt(struct path *); extern int finish_automount(struct vfsmount *, struct path *); diff --git a/fs/namespace.c b/fs/namespace.c index 00e5b1efa59..abd3abb5261 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2514,21 +2514,9 @@ int copy_mount_options(const void __user * data, unsigned long *where) return 0; } -int copy_mount_string(const void __user *data, char **where) +char *copy_mount_string(const void __user *data) { - char *tmp; - - if (!data) { - *where = NULL; - return 0; - } - - tmp = strndup_user(data, PAGE_SIZE); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - - *where = tmp; - return 0; + return data ? strndup_user(data, PAGE_SIZE) : NULL; } /* @@ -2794,8 +2782,9 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char *kernel_dev; unsigned long data_page; - ret = copy_mount_string(type, &kernel_type); - if (ret < 0) + kernel_type = copy_mount_string(type); + ret = PTR_ERR(kernel_type); + if (IS_ERR(kernel_type)) goto out_type; kernel_dir = getname(dir_name); @@ -2804,8 +2793,9 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, goto out_dir; } - ret = copy_mount_string(dev_name, &kernel_dev); - if (ret < 0) + kernel_dev = copy_mount_string(dev_name); + ret = PTR_ERR(kernel_dev); + if (IS_ERR(kernel_dev)) goto out_dev; ret = copy_mount_options(data, &data_page); -- cgit v1.2.3-70-g09d2 From 5e6123f3477e4260fb14392f0a88f1a842fa4d42 Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Sun, 14 Sep 2014 22:15:10 +0900 Subject: vfs: move getname() from callers to do_mount() It would make more sense to pass char __user * instead of char * in callers of do_mount() and do getname() inside do_mount(). Suggested-by: Al Viro Signed-off-by: Seunghun Lee Signed-off-by: Al Viro --- arch/alpha/kernel/osf_sys.c | 23 ++++++++++------------- fs/compat.c | 20 ++++++-------------- fs/namespace.c | 19 +++---------------- include/linux/fs.h | 3 ++- 4 files changed, 21 insertions(+), 44 deletions(-) (limited to 'fs/namespace.c') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 1402fcc11c2..f9c732e1828 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -446,7 +446,8 @@ struct procfs_args { * unhappy with OSF UFS. [CHECKME] */ static int -osf_ufs_mount(const char *dirname, struct ufs_args __user *args, int flags) +osf_ufs_mount(const char __user *dirname, + struct ufs_args __user *args, int flags) { int retval; struct cdfs_args tmp; @@ -466,7 +467,8 @@ osf_ufs_mount(const char *dirname, struct ufs_args __user *args, int flags) } static int -osf_cdfs_mount(const char *dirname, struct cdfs_args __user *args, int flags) +osf_cdfs_mount(const char __user *dirname, + struct cdfs_args __user *args, int flags) { int retval; struct cdfs_args tmp; @@ -486,7 +488,8 @@ osf_cdfs_mount(const char *dirname, struct cdfs_args __user *args, int flags) } static int -osf_procfs_mount(const char *dirname, struct procfs_args __user *args, int flags) +osf_procfs_mount(const char __user *dirname, + struct procfs_args __user *args, int flags) { struct procfs_args tmp; @@ -500,28 +503,22 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path, int, flag, void __user *, data) { int retval; - struct filename *name; - name = getname(path); - retval = PTR_ERR(name); - if (IS_ERR(name)) - goto out; switch (typenr) { case 1: - retval = osf_ufs_mount(name->name, data, flag); + retval = osf_ufs_mount(path, data, flag); break; case 6: - retval = osf_cdfs_mount(name->name, data, flag); + retval = osf_cdfs_mount(path, data, flag); break; case 9: - retval = osf_procfs_mount(name->name, data, flag); + retval = osf_procfs_mount(path, data, flag); break; default: retval = -EINVAL; printk("osf_mount(%ld, %x)\n", typenr, flag); } - putname(name); - out: + return retval; } diff --git a/fs/compat.c b/fs/compat.c index 6205c247a6e..b13df99f353 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -794,7 +794,6 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, char *kernel_type; unsigned long data_page; char *kernel_dev; - struct filename *dir; int retval; kernel_type = copy_mount_string(type); @@ -802,19 +801,14 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, if (IS_ERR(kernel_type)) goto out; - dir = getname(dir_name); - retval = PTR_ERR(dir); - if (IS_ERR(dir)) - goto out1; - kernel_dev = copy_mount_string(dev_name); retval = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) - goto out2; + goto out1; retval = copy_mount_options(data, &data_page); if (retval < 0) - goto out3; + goto out2; retval = -EINVAL; @@ -823,19 +817,17 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, do_ncp_super_data_conv((void *)data_page); } else if (!strcmp(kernel_type, NFS4_NAME)) { if (do_nfs4_super_data_conv((void *) data_page)) - goto out4; + goto out3; } } - retval = do_mount(kernel_dev, dir->name, kernel_type, + retval = do_mount(kernel_dev, dir_name, kernel_type, flags, (void*)data_page); - out4: - free_page(data_page); out3: - kfree(kernel_dev); + free_page(data_page); out2: - putname(dir); + kfree(kernel_dev); out1: kfree(kernel_type); out: diff --git a/fs/namespace.c b/fs/namespace.c index abd3abb5261..348562f14e9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2533,7 +2533,7 @@ char *copy_mount_string(const void __user *data) * Therefore, if this magic number is present, it carries no information * and must be discarded. */ -long do_mount(const char *dev_name, const char *dir_name, +long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; @@ -2545,15 +2545,11 @@ long do_mount(const char *dev_name, const char *dir_name, flags &= ~MS_MGC_MSK; /* Basic sanity checks */ - - if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) - return -EINVAL; - if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; /* ... and get the mountpoint */ - retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); + retval = user_path(dir_name, &path); if (retval) return retval; @@ -2778,7 +2774,6 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, { int ret; char *kernel_type; - struct filename *kernel_dir; char *kernel_dev; unsigned long data_page; @@ -2787,12 +2782,6 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, if (IS_ERR(kernel_type)) goto out_type; - kernel_dir = getname(dir_name); - if (IS_ERR(kernel_dir)) { - ret = PTR_ERR(kernel_dir); - goto out_dir; - } - kernel_dev = copy_mount_string(dev_name); ret = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) @@ -2802,15 +2791,13 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, if (ret < 0) goto out_data; - ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, + ret = do_mount(kernel_dev, dir_name, kernel_type, flags, (void *) data_page); free_page(data_page); out_data: kfree(kernel_dev); out_dev: - putname(kernel_dir); -out_dir: kfree(kernel_type); out_type: return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index 75bdd51ec9b..92148361bef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1855,7 +1855,8 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char *, const char *, unsigned long, void *); +extern long do_mount(const char *, const char __user *, + const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, -- cgit v1.2.3-70-g09d2 From 0d0826019e529f21c84687521d03f60cd241ca7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 8 Oct 2014 10:42:27 -0700 Subject: mnt: Prevent pivot_root from creating a loop in the mount tree Andy Lutomirski recently demonstrated that when chroot is used to set the root path below the path for the new ``root'' passed to pivot_root the pivot_root system call succeeds and leaks mounts. In examining the code I see that starting with a new root that is below the current root in the mount tree will result in a loop in the mount tree after the mounts are detached and then reattached to one another. Resulting in all kinds of ugliness including a leak of that mounts involved in the leak of the mount loop. Prevent this problem by ensuring that the new mount is reachable from the current root of the mount tree. [Added stable cc. Fixes CVE-2014-7970. --Andy] Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski Reviewed-by: Andy Lutomirski Link: http://lkml.kernel.org/r/87bnpmihks.fsf@x220.int.ebiederm.org Signed-off-by: "Eric W. Biederman" Signed-off-by: Andy Lutomirski --- fs/namespace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index ef42d9bee21..74647c2fe69 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2820,6 +2820,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; + /* make certain new is below the root */ + if (!is_path_reachable(new_mnt, new.dentry, &root)) + goto out4; root_mp->m_count++; /* pin it so it won't go away */ lock_mount_hash(); detach_mnt(new_mnt, &parent_path); -- cgit v1.2.3-70-g09d2 From c771d683a62e5d36bc46036f5c07f4f5bb7dda61 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: introduce clone_private_mount() Overlayfs needs a private clone of the mount, so create a function for this and export to modules. Signed-off-by: Miklos Szeredi --- fs/namespace.c | 27 +++++++++++++++++++++++++++ include/linux/mount.h | 3 +++ 2 files changed, 30 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index fbba8b17330..5b66b2b3624 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { diff --git a/include/linux/mount.h b/include/linux/mount.h index 9262e4bf0cc..c2c561dc011 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,9 @@ extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); +struct path; +extern struct vfsmount *clone_private_mount(struct path *path); + struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, -- cgit v1.2.3-70-g09d2