summaryrefslogtreecommitdiffstats
path: root/fs/namespace.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c367
1 files changed, 288 insertions, 79 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index a72eaabfe8f..d1edf26025d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/acct.h>
@@ -139,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
mnt->mnt_group_id = 0;
}
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_add_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+ this_cpu_add(mnt->mnt_pcp->mnt_count, n);
+#else
+ preempt_disable();
+ mnt->mnt_count += n;
+ preempt_enable();
+#endif
+}
+
+static inline void mnt_set_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+ this_cpu_write(mnt->mnt_pcp->mnt_count, n);
+#else
+ mnt->mnt_count = n;
+#endif
+}
+
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_inc_count(struct vfsmount *mnt)
+{
+ mnt_add_count(mnt, 1);
+}
+
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_dec_count(struct vfsmount *mnt)
+{
+ mnt_add_count(mnt, -1);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+unsigned int mnt_get_count(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ unsigned int count = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
+ }
+
+ return count;
+#else
+ return mnt->mnt_count;
+#endif
+}
+
struct vfsmount *alloc_vfsmnt(const char *name)
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -155,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
goto out_free_id;
}
- atomic_set(&mnt->mnt_count, 1);
+#ifdef CONFIG_SMP
+ mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+ if (!mnt->mnt_pcp)
+ goto out_free_devname;
+
+ this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+#else
+ mnt->mnt_count = 1;
+ mnt->mnt_writers = 0;
+#endif
+
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -167,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
-#ifdef CONFIG_SMP
- mnt->mnt_writers = alloc_percpu(int);
- if (!mnt->mnt_writers)
- goto out_free_devname;
-#else
- mnt->mnt_writers = 0;
-#endif
}
return mnt;
@@ -217,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt)
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-static inline void inc_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_inc_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
- (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+ this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
mnt->mnt_writers++;
#endif
}
-static inline void dec_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_dec_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
- (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+ this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
mnt->mnt_writers--;
#endif
}
-static unsigned int count_mnt_writers(struct vfsmount *mnt)
+static unsigned int mnt_get_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
unsigned int count = 0;
int cpu;
for_each_possible_cpu(cpu) {
- count += *per_cpu_ptr(mnt->mnt_writers, cpu);
+ count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
}
return count;
@@ -274,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt)
int ret = 0;
preempt_disable();
- inc_mnt_writers(mnt);
+ mnt_inc_writers(mnt);
/*
- * The store to inc_mnt_writers must be visible before we pass
+ * The store to mnt_inc_writers must be visible before we pass
* MNT_WRITE_HOLD loop below, so that the slowpath can see our
* incremented count after it has set MNT_WRITE_HOLD.
*/
@@ -290,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt)
*/
smp_rmb();
if (__mnt_is_readonly(mnt)) {
- dec_mnt_writers(mnt);
+ mnt_dec_writers(mnt);
ret = -EROFS;
goto out;
}
@@ -318,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt)
if (__mnt_is_readonly(mnt))
return -EROFS;
preempt_disable();
- inc_mnt_writers(mnt);
+ mnt_inc_writers(mnt);
preempt_enable();
return 0;
}
@@ -352,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
void mnt_drop_write(struct vfsmount *mnt)
{
preempt_disable();
- dec_mnt_writers(mnt);
+ mnt_dec_writers(mnt);
preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -385,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
* we're counting up here.
*/
- if (count_mnt_writers(mnt) > 0)
+ if (mnt_get_writers(mnt) > 0)
ret = -EBUSY;
else
mnt->mnt_flags |= MNT_READONLY;
@@ -419,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
#ifdef CONFIG_SMP
- free_percpu(mnt->mnt_writers);
+ free_percpu(mnt->mnt_pcp);
#endif
kmem_cache_free(mnt_cache, mnt);
}
@@ -493,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
}
/*
+ * Clear dentry's mounted state if it has no remaining mounts.
+ * vfsmount_lock must be held for write.
+ */
+static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
+{
+ unsigned u;
+
+ for (u = 0; u < HASH_SIZE; u++) {
+ struct vfsmount *p;
+
+ list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
+ if (p->mnt_mountpoint == dentry)
+ return;
+ }
+ }
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_MOUNTED;
+ spin_unlock(&dentry->d_lock);
+}
+
+/*
* vfsmount lock must be held for write
*/
static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
@@ -503,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
mnt->mnt_mountpoint = mnt->mnt_root;
list_del_init(&mnt->mnt_child);
list_del_init(&mnt->mnt_hash);
- old_path->dentry->d_mounted--;
+ dentry_reset_mounted(old_path->mnt, old_path->dentry);
}
/*
@@ -514,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
{
child_mnt->mnt_parent = mntget(mnt);
child_mnt->mnt_mountpoint = dget(dentry);
- dentry->d_mounted++;
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_MOUNTED;
+ spin_unlock(&dentry->d_lock);
}
/*
@@ -528,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
}
+static inline void __mnt_make_longterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&mnt->mnt_longterm);
+#endif
+}
+
+/* needs vfsmount lock for write */
+static inline void __mnt_make_shortterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&mnt->mnt_longterm);
+#endif
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -541,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
BUG_ON(parent == mnt);
list_add_tail(&head, &mnt->mnt_list);
- list_for_each_entry(m, &head, mnt_list)
+ list_for_each_entry(m, &head, mnt_list) {
m->mnt_ns = n;
+ __mnt_make_longterm(m);
+ }
+
list_splice(&head, n->list.prev);
list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -595,7 +696,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
goto out_free;
}
- mnt->mnt_flags = old->mnt_flags;
+ mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
mnt->mnt_root = dget(root);
@@ -630,9 +731,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
return NULL;
}
-static inline void __mntput(struct vfsmount *mnt)
+static inline void mntfree(struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;
+
/*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
@@ -640,38 +742,69 @@ static inline void __mntput(struct vfsmount *mnt)
* to make r/w->r/o transitions.
*/
/*
- * atomic_dec_and_lock() used to deal with ->mnt_count decrements
- * provides barriers, so count_mnt_writers() below is safe. AV
+ * The locking used to deal with mnt_count decrement provides barriers,
+ * so mnt_get_writers() below is safe.
*/
- WARN_ON(count_mnt_writers(mnt));
+ WARN_ON(mnt_get_writers(mnt));
fsnotify_vfsmount_delete(mnt);
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
}
-void mntput_no_expire(struct vfsmount *mnt)
+static void mntput_no_expire(struct vfsmount *mnt)
{
-repeat:
- if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+put_again:
+#ifdef CONFIG_SMP
+ br_read_lock(vfsmount_lock);
+ if (likely(atomic_read(&mnt->mnt_longterm))) {
+ mnt_dec_count(mnt);
+ br_read_unlock(vfsmount_lock);
return;
+ }
+ br_read_unlock(vfsmount_lock);
+
br_write_lock(vfsmount_lock);
- if (!atomic_dec_and_test(&mnt->mnt_count)) {
+ mnt_dec_count(mnt);
+ if (mnt_get_count(mnt)) {
br_write_unlock(vfsmount_lock);
return;
}
- if (likely(!mnt->mnt_pinned)) {
- br_write_unlock(vfsmount_lock);
- __mntput(mnt);
+#else
+ mnt_dec_count(mnt);
+ if (likely(mnt_get_count(mnt)))
return;
+ br_write_lock(vfsmount_lock);
+#endif
+ if (unlikely(mnt->mnt_pinned)) {
+ mnt_add_count(mnt, mnt->mnt_pinned + 1);
+ mnt->mnt_pinned = 0;
+ br_write_unlock(vfsmount_lock);
+ acct_auto_close_mnt(mnt);
+ goto put_again;
}
- atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
- mnt->mnt_pinned = 0;
br_write_unlock(vfsmount_lock);
- acct_auto_close_mnt(mnt);
- goto repeat;
+ mntfree(mnt);
}
-EXPORT_SYMBOL(mntput_no_expire);
+
+void mntput(struct vfsmount *mnt)
+{
+ if (mnt) {
+ /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+ if (unlikely(mnt->mnt_expiry_mark))
+ mnt->mnt_expiry_mark = 0;
+ mntput_no_expire(mnt);
+ }
+}
+EXPORT_SYMBOL(mntput);
+
+struct vfsmount *mntget(struct vfsmount *mnt)
+{
+ if (mnt)
+ mnt_inc_count(mnt);
+ return mnt;
+}
+EXPORT_SYMBOL(mntget);
void mnt_pin(struct vfsmount *mnt)
{
@@ -679,19 +812,17 @@ void mnt_pin(struct vfsmount *mnt)
mnt->mnt_pinned++;
br_write_unlock(vfsmount_lock);
}
-
EXPORT_SYMBOL(mnt_pin);
void mnt_unpin(struct vfsmount *mnt)
{
br_write_lock(vfsmount_lock);
if (mnt->mnt_pinned) {
- atomic_inc(&mnt->mnt_count);
+ mnt_inc_count(mnt);
mnt->mnt_pinned--;
}
br_write_unlock(vfsmount_lock);
}
-
EXPORT_SYMBOL(mnt_unpin);
static inline void mangle(struct seq_file *m, const char *s)
@@ -986,12 +1117,13 @@ int may_umount_tree(struct vfsmount *mnt)
int minimum_refs = 0;
struct vfsmount *p;
- br_read_lock(vfsmount_lock);
+ /* write lock needed for mnt_get_count */
+ br_write_lock(vfsmount_lock);
for (p = mnt; p; p = next_mnt(p, mnt)) {
- actual_refs += atomic_read(&p->mnt_count);
+ actual_refs += mnt_get_count(p);
minimum_refs += 2;
}
- br_read_unlock(vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
if (actual_refs > minimum_refs)
return 0;
@@ -1018,10 +1150,10 @@ int may_umount(struct vfsmount *mnt)
{
int ret = 1;
down_read(&namespace_sem);
- br_read_lock(vfsmount_lock);
+ br_write_lock(vfsmount_lock);
if (propagate_mount_busy(mnt, 2))
ret = 0;
- br_read_unlock(vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_read(&namespace_sem);
return ret;
}
@@ -1058,26 +1190,29 @@ void release_mounts(struct list_head *head)
*/
void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
+ LIST_HEAD(tmp_list);
struct vfsmount *p;
for (p = mnt; p; p = next_mnt(p, mnt))
- list_move(&p->mnt_hash, kill);
+ list_move(&p->mnt_hash, &tmp_list);
if (propagate)
- propagate_umount(kill);
+ propagate_umount(&tmp_list);
- list_for_each_entry(p, kill, mnt_hash) {
+ list_for_each_entry(p, &tmp_list, mnt_hash) {
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
+ __mnt_make_shortterm(p);
list_del_init(&p->mnt_child);
if (p->mnt_parent != p) {
p->mnt_parent->mnt_ghosts++;
- p->mnt_mountpoint->d_mounted--;
+ dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
}
change_mnt_propagation(p, MS_PRIVATE);
}
+ list_splice(&tmp_list, kill);
}
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1103,8 +1238,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
flags & (MNT_FORCE | MNT_DETACH))
return -EINVAL;
- if (atomic_read(&mnt->mnt_count) != 2)
+ /*
+ * probably don't strictly need the lock here if we examined
+ * all race cases, but it's a slowpath.
+ */
+ br_write_lock(vfsmount_lock);
+ if (mnt_get_count(mnt) != 2) {
+ br_write_unlock(vfsmount_lock);
return -EBUSY;
+ }
+ br_write_unlock(vfsmount_lock);
if (!xchg(&mnt->mnt_expiry_mark, 1))
return -EAGAIN;
@@ -1668,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
return err;
down_write(&namespace_sem);
- while (d_mountpoint(path->dentry) &&
- follow_down(path))
- ;
+ err = follow_down(path, true);
+ if (err < 0)
+ goto out;
+
err = -EINVAL;
if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
goto out;
@@ -1728,6 +1872,8 @@ out:
return err;
}
+static int do_add_mount(struct vfsmount *, struct path *, int);
+
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
@@ -1736,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
int mnt_flags, char *name, void *data)
{
struct vfsmount *mnt;
+ int err;
if (!type)
return -EINVAL;
@@ -1744,21 +1891,51 @@ static int do_new_mount(struct path *path, char *type, int flags,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- lock_kernel();
mnt = do_kern_mount(type, flags, name, data);
- unlock_kernel();
if (IS_ERR(mnt))
return PTR_ERR(mnt);
- return do_add_mount(mnt, path, mnt_flags, NULL);
+ err = do_add_mount(mnt, path, mnt_flags);
+ if (err)
+ mntput(mnt);
+ return err;
+}
+
+int finish_automount(struct vfsmount *m, struct path *path)
+{
+ int err;
+ /* The new mount record should have at least 2 refs to prevent it being
+ * expired before we get a chance to add it
+ */
+ BUG_ON(mnt_get_count(m) < 2);
+
+ if (m->mnt_sb == path->mnt->mnt_sb &&
+ m->mnt_root == path->dentry) {
+ err = -ELOOP;
+ goto fail;
+ }
+
+ err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+ if (!err)
+ return 0;
+fail:
+ /* remove m from any expiration list it may be on */
+ if (!list_empty(&m->mnt_expire)) {
+ down_write(&namespace_sem);
+ br_write_lock(vfsmount_lock);
+ list_del_init(&m->mnt_expire);
+ br_write_unlock(vfsmount_lock);
+ up_write(&namespace_sem);
+ }
+ mntput(m);
+ mntput(m);
+ return err;
}
/*
* add a mount into a namespace's mount tree
- * - provide the option of adding the new mount to an expiration list
*/
-int do_add_mount(struct vfsmount *newmnt, struct path *path,
- int mnt_flags, struct list_head *fslist)
+static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
{
int err;
@@ -1766,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
down_write(&namespace_sem);
/* Something was mounted here while we slept */
- while (d_mountpoint(path->dentry) &&
- follow_down(path))
- ;
+ err = follow_down(path, true);
+ if (err < 0)
+ goto unlock;
+
err = -EINVAL;
if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
goto unlock;
@@ -1784,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
goto unlock;
newmnt->mnt_flags = mnt_flags;
- if ((err = graft_tree(newmnt, path)))
- goto unlock;
-
- if (fslist) /* add to the specified expiration list */
- list_add_tail(&newmnt->mnt_expire, fslist);
-
- up_write(&namespace_sem);
- return 0;
+ err = graft_tree(newmnt, path);
unlock:
up_write(&namespace_sem);
- mntput(newmnt);
return err;
}
-EXPORT_SYMBOL_GPL(do_add_mount);
+/**
+ * mnt_set_expiry - Put a mount on an expiration list
+ * @mnt: The mount to list.
+ * @expiry_list: The list to add the mount to.
+ */
+void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
+{
+ down_write(&namespace_sem);
+ br_write_lock(vfsmount_lock);
+
+ list_add_tail(&mnt->mnt_expire, expiry_list);
+
+ br_write_unlock(vfsmount_lock);
+ up_write(&namespace_sem);
+}
+EXPORT_SYMBOL(mnt_set_expiry);
/*
* process a list of expirable mountpoints with the intent of discarding any
@@ -2088,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
return new_ns;
}
+void mnt_make_longterm(struct vfsmount *mnt)
+{
+ __mnt_make_longterm(mnt);
+}
+
+void mnt_make_shortterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
+ return;
+ br_write_lock(vfsmount_lock);
+ atomic_dec(&mnt->mnt_longterm);
+ br_write_unlock(vfsmount_lock);
+#endif
+}
+
/*
* Allocate a new namespace structure and populate it with contents
* copied from the namespace of the passed in task structure.
@@ -2125,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
q = new_ns->root;
while (p) {
q->mnt_ns = new_ns;
+ __mnt_make_longterm(q);
if (fs) {
if (p == fs->root.mnt) {
- rootmnt = p;
fs->root.mnt = mntget(q);
+ __mnt_make_longterm(q);
+ mnt_make_shortterm(p);
+ rootmnt = p;
}
if (p == fs->pwd.mnt) {
- pwdmnt = p;
fs->pwd.mnt = mntget(q);
+ __mnt_make_longterm(q);
+ mnt_make_shortterm(p);
+ pwdmnt = p;
}
}
p = next_mnt(p, mnt_ns->root);
@@ -2176,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
new_ns = alloc_mnt_ns();
if (!IS_ERR(new_ns)) {
mnt->mnt_ns = new_ns;
+ __mnt_make_longterm(mnt);
new_ns->root = mnt;
list_add(&new_ns->list, &new_ns->root->mnt_list);
}
@@ -2330,6 +2537,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
touch_mnt_namespace(current->nsproxy->mnt_ns);
br_write_unlock(vfsmount_lock);
chroot_fs_refs(&root, &new);
+
error = 0;
path_put(&root_parent);
path_put(&parent_path);
@@ -2356,6 +2564,7 @@ static void __init init_mount_tree(void)
mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
if (IS_ERR(mnt))
panic("Can't create rootfs");
+
ns = create_mnt_ns(mnt);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");