From ecfdb33d1fbc7e6e095ba24dac2930208494e734 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:44:49 -0400 Subject: acct: encode_comp_t(0) is 0, fortunately... There was an amusing bogosity in ac_rw calculation - it tried to do encode_comp_t(encode_comp_t(0) / 1024). Seeing that comp_t is a 3-bit exponent + 13-bit mantissa... it's a good thing that 0 is represented by all-bits-clear. The history of that one is interesting - it was introduced in 2.1.68pre1, when acct.c had been reworked and moved to separate file. Two months later (2.1.86) somebody has noticed that the sucker won't compile - there was no task_struct::io_usage. At which point the ac_io calculation had changed from encode_comp_t(current->io_usage) to encode_comp_t(0) and the bug in the next line (absolutely real back then, had it ever managed to compile) become a harmless bogosity. Looks like nobody has ever noticed until now. Anyway, let's bury that idiocy now that it got noticed. 17 years is long enough... Signed-off-by: Al Viro --- kernel/acct.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index a1844f14c6d..807ebc5d833 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -531,9 +531,6 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(¤t->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); /* * Get freeze protection. If the fs is frozen, just skip the write -- cgit v1.2.3-70-g09d2 From ed44724b79d8e03a40665436019cf22baba80d30 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:37:20 -0400 Subject: acct: switch to __kernel_write() Signed-off-by: Al Viro --- fs/internal.h | 1 - include/linux/fs.h | 1 + kernel/acct.c | 31 ++++++++++++------------------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 46574240746..9a2edba87c2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, /* * read_write.c */ -extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern int rw_verify_area(int, struct file *, const loff_t *, size_t); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index e11d60cc867..4b7d57cf786 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2335,6 +2335,7 @@ extern int do_pipe_flags(int *, int); extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t); +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ diff --git a/kernel/acct.c b/kernel/acct.c index 807ebc5d833..8082d9875d6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -456,12 +456,16 @@ static void do_acct_process(struct bsd_acct_struct *acct, { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; - mm_segment_t fs; unsigned long flim; u64 elapsed, run_time; struct tty_struct *tty; const struct cred *orig_cred; + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); @@ -536,25 +540,14 @@ static void do_acct_process(struct bsd_acct_struct *acct, * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. */ - if (!file_start_write_trylock(file)) - goto out; - /* - * Kernel segment override to datasegment and write it - * to the accounting file. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); - file_end_write(file); + if (file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + file_end_write(file); + } out: + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; revert_creds(orig_cred); } -- cgit v1.2.3-70-g09d2 From cdd37e23092c3c6fbbb2e611f8c3d18e676bf28f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Apr 2014 23:45:53 -0400 Subject: separate namespace-independent parts of filling acct_t Signed-off-by: Al Viro --- kernel/acct.c | 98 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 8082d9875d6..efa891beeaa 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -448,42 +448,20 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = ¤t->signal->pacct; - acct_t ac; - unsigned long flim; u64 elapsed, run_time; struct tty_struct *tty; - const struct cred *orig_cred; - - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct, file)) - goto out; /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset(&ac, 0, sizeof(acct_t)); + memset(ac, 0, sizeof(acct_t)); - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ run_time = ktime_get_ns(); @@ -491,27 +469,66 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); + ac->ac_etime = encode_float(elapsed); #else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? + ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? (unsigned long) elapsed : (unsigned long) -1l); #endif #if ACCT_VERSION==1 || ACCT_VERSION==2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; + ac->ac_etime_hi = etime >> 16; + ac->ac_etime_lo = (u16) etime; } #endif do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; + ac->ac_btime = get_seconds() - elapsed; +#if ACCT_VERSION==2 + ac->ac_ahz = AHZ; +#endif + + spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ + ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; + ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_flag = pacct->ac_flag; + ac->ac_mem = encode_comp_t(pacct->ac_mem); + ac->ac_minflt = encode_comp_t(pacct->ac_minflt); + ac->ac_majflt = encode_comp_t(pacct->ac_majflt); + ac->ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(¤t->sighand->siglock); +} +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *file) +{ + acct_t ac; + unsigned long flim; + const struct cred *orig_cred; + + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); + + /* + * First check to see if there is enough free_space to continue + * the process accounting system. + */ + if (!check_free_space(acct, file)) + goto out; + + fill_ac(&ac); /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif #if ACCT_VERSION==1 || ACCT_VERSION==2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; @@ -523,19 +540,6 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); #endif - - spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; /* Safe as we hold the siglock */ - ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock_irq(¤t->sighand->siglock); - /* * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. -- cgit v1.2.3-70-g09d2 From e25ff11ff16aba000dfe9e568d867e5142c31f16 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 May 2014 05:12:09 -0400 Subject: split the slow path in acct_process() off Signed-off-by: Al Viro --- kernel/acct.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index efa891beeaa..51188603b25 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -599,34 +599,35 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(¤t->sighand->siglock); } -static void acct_process_in_ns(struct pid_namespace *ns) +static void slow_acct_process(struct pid_namespace *ns) { - struct file *file = NULL; - struct bsd_acct_struct *acct; + for ( ; ns; ns = ns->parent) { + struct file *file = NULL; + struct bsd_acct_struct *acct; - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - return; + acct = ns->bacct; + /* + * accelerate the common fastpath: + */ + if (!acct || !acct->file) + continue; - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { + spin_lock(&acct_lock); + file = acct->file; + if (unlikely(!file)) { + spin_unlock(&acct_lock); + continue; + } + get_file(file); spin_unlock(&acct_lock); - return; - } - get_file(file); - spin_unlock(&acct_lock); - do_acct_process(acct, ns, file); - fput(file); + do_acct_process(acct, ns, file); + fput(file); + } } /** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. + * acct_process * * handles process accounting for an exiting task */ @@ -639,6 +640,11 @@ void acct_process(void) * alive and holds its namespace, which in turn holds * its parent. */ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { + struct bsd_acct_struct *acct = ns->bacct; + if (acct && acct->file) + break; + } + if (unlikely(ns)) + slow_acct_process(ns); } -- cgit v1.2.3-70-g09d2 From 795a2f22a8eaf749e20a11271a8821bf04ac6d90 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 May 2014 05:23:41 -0400 Subject: acct() should honour the limits from the very beginning We need to check free space on the first write to freshly opened log. Signed-off-by: Al Viro --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 51188603b25..87773725a0d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -180,8 +180,8 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (file) { acct->file = file; acct->ns = ns; - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - acct->active = 1; + acct->needcheck = jiffies; + acct->active = 0; list_add(&acct->list, &acct_list); } if (old_acct) { -- cgit v1.2.3-70-g09d2 From 9df7fa16ee956bf0cdf4a711eac827be92d584bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 May 2014 06:49:45 -0400 Subject: acct: serialize acct_on() brute-force - on a global mutex that isn't nested into anything. Signed-off-by: Al Viro --- kernel/acct.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/acct.c b/kernel/acct.c index 87773725a0d..08963a29287 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -241,6 +241,8 @@ static int acct_on(struct filename *pathname) return 0; } +static DEFINE_MUTEX(acct_on_mutex); + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -263,7 +265,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name) struct filename *tmp = getname(name); if (IS_ERR(tmp)) return PTR_ERR(tmp); + mutex_lock(&acct_on_mutex); error = acct_on(tmp); + mutex_unlock(&acct_on_mutex); putname(tmp); } else { struct bsd_acct_struct *acct; -- cgit v1.2.3-70-g09d2 From b8f00e6be46f4c9a112e05fd692712873c4c4048 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:51:03 -0400 Subject: acct: new lifetime rules Do not reuse bsd_acct_struct after closing the damn thing. Structure lifetime is controlled by refcount now. We also have a mutex in there, held over closing and writing (the file is O_APPEND, so we are not losing any concurrency). As the result, we do not need to bother with get_file()/fput() on log write anymore. Moreover, do_acct_process() only needs acct itself; file and pidns are picked from it. Killed instances are distinguished by having NULL ->ns. Refcount is protected by acct_lock; anybody taking the mutex needs to grab a reference first. The things will get a lot simpler in the next commits - this is just the minimal chunk switching to the new lifetime rules. Signed-off-by: Al Viro --- kernel/acct.c | 220 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 114 insertions(+), 106 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 08963a29287..f9ef9db55c0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,15 +75,11 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct); -/* - * This structure is used so that all the data protected by lock - * can be placed in the same cache line as the lock. This primes - * the cache line to have the data after getting the lock. - */ struct bsd_acct_struct { + long count; + struct mutex lock; int active; unsigned long needcheck; struct file *file; @@ -157,39 +153,59 @@ out: return res; } -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). - * - * NOTE: acct_lock MUST be held on entry and exit. - */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) +static void acct_put(struct bsd_acct_struct *p) { - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - acct->active = 0; - acct->file = NULL; - acct->ns = NULL; - list_del(&acct->list); - } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = jiffies; - acct->active = 0; - list_add(&acct->list, &acct_list); + spin_lock(&acct_lock); + if (!--p->count) + kfree(p); + spin_unlock(&acct_lock); +} + +static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p) +{ + struct bsd_acct_struct *res; + spin_lock(&acct_lock); +again: + res = *p; + if (res) + res->count++; + spin_unlock(&acct_lock); + if (res) { + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + spin_lock(&acct_lock); + if (!--res->count) + kfree(res); + goto again; + } } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); + return res; +} + +static void acct_kill(struct bsd_acct_struct *acct, + struct bsd_acct_struct *new) +{ + if (acct) { + struct file *file = acct->file; + struct pid_namespace *ns = acct->ns; + spin_lock(&acct_lock); + list_del(&acct->list); + mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); + do_acct_process(acct); + filp_close(file, NULL); spin_lock(&acct_lock); + ns->bacct = new; + if (new) { + mnt_pin(new->file->f_path.mnt); + list_add(&new->list, &acct_list); + } + acct->ns = NULL; + mutex_unlock(&acct->lock); + if (!(acct->count -= 2)) + kfree(acct); + spin_unlock(&acct_lock); } } @@ -197,47 +213,50 @@ static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; + struct pid_namespace *ns = task_active_pid_ns(current); + struct bsd_acct_struct *acct, *old; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; /* Difference from BSD - they don't do O_APPEND */ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) + if (IS_ERR(file)) { + kfree(acct); return PTR_ERR(file); + } if (!S_ISREG(file_inode(file)->i_mode)) { + kfree(acct); filp_close(file, NULL); return -EACCES; } if (!file->f_op->write) { + kfree(acct); filp_close(file, NULL); return -EIO; } - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } - } + acct->count = 1; + acct->file = file; + acct->needcheck = jiffies; + acct->ns = ns; + mutex_init(&acct->lock); + mnt = file->f_path.mnt; - spin_lock(&acct_lock); - if (ns->bacct == NULL) { + old = acct_get(&ns->bacct); + if (old) { + acct_kill(old, acct); + } else { + spin_lock(&acct_lock); ns->bacct = acct; - acct = NULL; + mnt_pin(mnt); + list_add(&acct->list, &acct_list); + spin_unlock(&acct_lock); } - - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); - return 0; } @@ -270,15 +289,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); + acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL); } return error; @@ -298,8 +309,19 @@ void acct_auto_close_mnt(struct vfsmount *m) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); + if (acct->file->f_path.mnt == m) { + acct->count++; + spin_unlock(&acct_lock); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + spin_lock(&acct_lock); + if (!--acct->count) + kfree(acct); + goto restart; + } + acct_kill(acct, NULL); + spin_lock(&acct_lock); goto restart; } spin_unlock(&acct_lock); @@ -319,8 +341,19 @@ void acct_auto_close(struct super_block *sb) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.dentry->d_sb == sb) { - acct_file_reopen(acct, NULL, NULL); + if (acct->file->f_path.dentry->d_sb == sb) { + acct->count++; + spin_unlock(&acct_lock); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + spin_lock(&acct_lock); + if (!--acct->count) + kfree(acct); + goto restart; + } + acct_kill(acct, NULL); + spin_lock(&acct_lock); goto restart; } spin_unlock(&acct_lock); @@ -328,17 +361,7 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct = ns->bacct; - - if (acct == NULL) - return; - - spin_lock(&acct_lock); - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - - kfree(acct); + acct_kill(acct_get(&ns->bacct), NULL); } /* @@ -507,12 +530,13 @@ static void fill_ac(acct_t *ac) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void do_acct_process(struct bsd_acct_struct *acct) { acct_t ac; unsigned long flim; const struct cred *orig_cred; + struct pid_namespace *ns = acct->ns; + struct file *file = acct->file; /* * Accounting records are not subject to resource limits. @@ -606,27 +630,12 @@ void acct_collect(long exitcode, int group_dead) static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - continue; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - continue; + struct bsd_acct_struct *acct = acct_get(&ns->bacct); + if (acct) { + do_acct_process(acct); + mutex_unlock(&acct->lock); + acct_put(acct); } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); } } @@ -645,8 +654,7 @@ void acct_process(void) * its parent. */ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { - struct bsd_acct_struct *acct = ns->bacct; - if (acct && acct->file) + if (ns->bacct) break; } if (unlikely(ns)) -- cgit v1.2.3-70-g09d2 From 54a4d58a6459a93fc6ee898354b3d2ffb80dd03a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:24:18 -0400 Subject: acct: simplify check_free_space() a) file can't be NULL b) file can't be changed under us c) all writes are serialized by acct->lock; no need to mess with spinlock there. Signed-off-by: Al Viro --- kernel/acct.c | 50 +++++++++++--------------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index f9ef9db55c0..019f012a3c6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -93,64 +93,36 @@ static LIST_HEAD(acct_list); /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct, struct file *file) +static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - int res; - int act; - u64 resume; - u64 suspend; - spin_lock(&acct_lock); - res = acct->active; - if (!file || time_is_before_jiffies(acct->needcheck)) + if (time_is_before_jiffies(acct->needcheck)) goto out; - spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(&file->f_path, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - do_div(suspend, 100); - do_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. - */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act > 0; + if (vfs_statfs(&acct->file->f_path, &sbuf)) goto out; - } if (acct->active) { - if (act < 0) { + u64 suspend = sbuf.f_blocks * SUSPEND; + do_div(suspend, 100); + if (sbuf.f_bavail <= suspend) { acct->active = 0; printk(KERN_INFO "Process accounting paused\n"); } } else { - if (act > 0) { + u64 resume = sbuf.f_blocks * RESUME; + do_div(resume, 100); + if (sbuf.f_bavail >= resume) { acct->active = 1; printk(KERN_INFO "Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - res = acct->active; out: - spin_unlock(&acct_lock); - return res; + return acct->active; } static void acct_put(struct bsd_acct_struct *p) @@ -550,7 +522,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) * First check to see if there is enough free_space to continue * the process accounting system. */ - if (!check_free_space(acct, file)) + if (!check_free_space(acct)) goto out; fill_ac(&ac); -- cgit v1.2.3-70-g09d2 From 215752fce31c80f3b3a1530bc7cddb3ba6a69b3a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 06:23:41 -0400 Subject: acct: get rid of acct_list Put these suckers on per-vfsmount and per-superblock lists instead. Note: right now it's still acct_lock for everything, but that's going to change. Signed-off-by: Al Viro --- fs/mount.h | 1 + fs/namespace.c | 2 +- fs/super.c | 2 +- include/linux/acct.h | 6 +-- include/linux/fs.h | 1 + kernel/acct.c | 135 +++++++++++++++++++++------------------------------ 6 files changed, 62 insertions(+), 85 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index d55297f2fa0..0a2d1458681 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -56,6 +56,7 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; + struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd88..22e530addfa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -956,7 +956,7 @@ put_again: mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); - acct_auto_close_mnt(&mnt->mnt); + acct_auto_close_mnt(&mnt->mnt_pins); goto put_again; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { diff --git a/fs/super.c b/fs/super.c index d20d5b11ded..52ed93eb63d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -703,7 +703,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) #endif if (flags & MS_RDONLY) - acct_auto_close(sb); + acct_auto_close(&sb->s_pins); shrink_dcache_sb(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); diff --git a/include/linux/acct.h b/include/linux/acct.h index 4a5b7cb5607..65a4f889182 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -24,14 +24,14 @@ struct super_block; struct pacct_struct; struct pid_namespace; extern int acct_parm[]; /* for sysctl */ -extern void acct_auto_close_mnt(struct vfsmount *m); -extern void acct_auto_close(struct super_block *sb); +extern void acct_auto_close(struct hlist_head *); +extern void acct_auto_close_mnt(struct hlist_head *); extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); #else -#define acct_auto_close_mnt(x) do { } while (0) #define acct_auto_close(x) do { } while (0) +#define acct_auto_close_mnt(x) do { } while (0) #define acct_collect(x,y) do { } while (0) #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b7d57cf786..17f70872a4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1250,6 +1250,7 @@ struct super_block { /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; + struct hlist_head s_pins; /* * Keep the lru lists last in the structure so they always sit on their diff --git a/kernel/acct.c b/kernel/acct.c index 019f012a3c6..21fbb3c27c2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,6 +59,7 @@ #include #include /* sector_div */ #include +#include <../fs/mount.h> /* will go away when we refactor */ /* * These constants control the amount of freespace that suspend and @@ -79,16 +80,16 @@ static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { long count; + struct hlist_node s_list; + struct hlist_node m_list; struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct list_head list; }; static DEFINE_SPINLOCK(acct_lock); -static LIST_HEAD(acct_list); /* * Check the amount of free space and suspend/resume accordingly. @@ -133,25 +134,33 @@ static void acct_put(struct bsd_acct_struct *p) spin_unlock(&acct_lock); } -static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p) +static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) +{ + res->count++; + spin_unlock(&acct_lock); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + spin_lock(&acct_lock); + if (!--res->count) + kfree(res); + return NULL; + } + return res; +} + +static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; spin_lock(&acct_lock); again: - res = *p; - if (res) - res->count++; - spin_unlock(&acct_lock); - if (res) { - mutex_lock(&res->lock); - if (!res->ns) { - mutex_unlock(&res->lock); - spin_lock(&acct_lock); - if (!--res->count) - kfree(res); - goto again; - } + if (!ns->bacct) { + spin_unlock(&acct_lock); + return NULL; } + res = __acct_get(ns->bacct); + if (!res) + goto again; return res; } @@ -162,7 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct, struct file *file = acct->file; struct pid_namespace *ns = acct->ns; spin_lock(&acct_lock); - list_del(&acct->list); + hlist_del(&acct->m_list); + hlist_del(&acct->s_list); mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); do_acct_process(acct); @@ -170,8 +180,10 @@ static void acct_kill(struct bsd_acct_struct *acct, spin_lock(&acct_lock); ns->bacct = new; if (new) { - mnt_pin(new->file->f_path.mnt); - list_add(&new->list, &acct_list); + struct vfsmount *m = new->file->f_path.mnt; + mnt_pin(m); + hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); } acct->ns = NULL; mutex_unlock(&acct->lock); @@ -218,14 +230,15 @@ static int acct_on(struct filename *pathname) mutex_init(&acct->lock); mnt = file->f_path.mnt; - old = acct_get(&ns->bacct); + old = acct_get(ns); if (old) { acct_kill(old, acct); } else { spin_lock(&acct_lock); ns->bacct = acct; mnt_pin(mnt); - list_add(&acct->list, &acct_list); + hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); + hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); } mntput(mnt); /* it's pinned, now give up active reference */ @@ -261,79 +274,41 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL); + acct_kill(acct_get(task_active_pid_ns(current)), NULL); } return error; } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) +void acct_auto_close_mnt(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.mnt == m) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + m_list)), NULL); + } spin_unlock(&acct_lock); } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. - */ -void acct_auto_close(struct super_block *sb) +void acct_auto_close(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.dentry->d_sb == sb) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + s_list)), NULL); + } spin_unlock(&acct_lock); } void acct_exit_ns(struct pid_namespace *ns) { - acct_kill(acct_get(&ns->bacct), NULL); + acct_kill(acct_get(ns), NULL); } /* @@ -602,7 +577,7 @@ void acct_collect(long exitcode, int group_dead) static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { - struct bsd_acct_struct *acct = acct_get(&ns->bacct); + struct bsd_acct_struct *acct = acct_get(ns); if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); -- cgit v1.2.3-70-g09d2 From 2798d4ce61601808b965253d60624bbf201b51b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:04:28 -0400 Subject: acct: get rid of acct_lock for acct->count * make acct->count atomic and acct freeing - rcu-delayed. * instead of grabbing acct_lock around the places where we take a reference, do that under rcu_read_lock() with atomic_long_inc_not_zero(). * have the new acct locked before making ns->bacct point to it Signed-off-by: Al Viro --- kernel/acct.c | 85 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 21fbb3c27c2..6fd375f1562 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -79,9 +79,14 @@ int acct_parm[3] = {4, 2, 30}; static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { - long count; - struct hlist_node s_list; - struct hlist_node m_list; + atomic_long_t count; + union { + struct { + struct hlist_node s_list; + struct hlist_node m_list; + }; + struct rcu_head rcu; + }; struct mutex lock; int active; unsigned long needcheck; @@ -89,6 +94,11 @@ struct bsd_acct_struct { struct pid_namespace *ns; }; +static void acct_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct bsd_acct_struct, rcu)); +} + static DEFINE_SPINLOCK(acct_lock); /* @@ -128,22 +138,22 @@ out: static void acct_put(struct bsd_acct_struct *p) { - spin_lock(&acct_lock); - if (!--p->count) - kfree(p); - spin_unlock(&acct_lock); + if (atomic_long_dec_and_test(&p->count)) + call_rcu(&p->rcu, acct_free_rcu); } static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) { - res->count++; - spin_unlock(&acct_lock); + if (!atomic_long_inc_not_zero(&res->count)) { + rcu_read_unlock(); + cpu_relax(); + return NULL; + } + rcu_read_unlock(); mutex_lock(&res->lock); if (!res->ns) { mutex_unlock(&res->lock); - spin_lock(&acct_lock); - if (!--res->count) - kfree(res); + acct_put(res); return NULL; } return res; @@ -152,13 +162,15 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; - spin_lock(&acct_lock); again: - if (!ns->bacct) { - spin_unlock(&acct_lock); + smp_rmb(); + rcu_read_lock(); + res = ACCESS_ONCE(ns->bacct); + if (!res) { + rcu_read_unlock(); return NULL; } - res = __acct_get(ns->bacct); + res = __acct_get(res); if (!res) goto again; return res; @@ -170,26 +182,27 @@ static void acct_kill(struct bsd_acct_struct *acct, if (acct) { struct file *file = acct->file; struct pid_namespace *ns = acct->ns; + do_acct_process(acct); + mnt_unpin(file->f_path.mnt); + filp_close(file, NULL); spin_lock(&acct_lock); hlist_del(&acct->m_list); hlist_del(&acct->s_list); - mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); - do_acct_process(acct); - filp_close(file, NULL); - spin_lock(&acct_lock); ns->bacct = new; if (new) { struct vfsmount *m = new->file->f_path.mnt; mnt_pin(m); + spin_lock(&acct_lock); hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&acct_lock); + mutex_unlock(&new->lock); } acct->ns = NULL; + atomic_long_dec(&acct->count); mutex_unlock(&acct->lock); - if (!(acct->count -= 2)) - kfree(acct); - spin_unlock(&acct_lock); + acct_put(acct); } } @@ -223,7 +236,7 @@ static int acct_on(struct filename *pathname) return -EIO; } - acct->count = 1; + atomic_long_set(&acct->count, 1); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; @@ -231,15 +244,17 @@ static int acct_on(struct filename *pathname) mnt = file->f_path.mnt; old = acct_get(ns); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ if (old) { acct_kill(old, acct); } else { - spin_lock(&acct_lock); ns->bacct = acct; + spin_lock(&acct_lock); mnt_pin(mnt); hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); + mutex_unlock(&acct->lock); } mntput(mnt); /* it's pinned, now give up active reference */ return 0; @@ -282,28 +297,32 @@ SYSCALL_DEFINE1(acct, const char __user *, name) void acct_auto_close_mnt(struct hlist_head *list) { + rcu_read_lock(); while (1) { - spin_lock(&acct_lock); - if (!list->first) + struct hlist_node *p = ACCESS_ONCE(list->first); + if (!p) break; - acct_kill(__acct_get(hlist_entry(list->first, + acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, m_list)), NULL); + rcu_read_lock(); } - spin_unlock(&acct_lock); + rcu_read_unlock(); } void acct_auto_close(struct hlist_head *list) { + rcu_read_lock(); while (1) { - spin_lock(&acct_lock); - if (!list->first) + struct hlist_node *p = ACCESS_ONCE(list->first); + if (!p) break; - acct_kill(__acct_get(hlist_entry(list->first, + acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, s_list)), NULL); + rcu_read_lock(); } - spin_unlock(&acct_lock); + rcu_read_unlock(); } void acct_exit_ns(struct pid_namespace *ns) -- cgit v1.2.3-70-g09d2 From 0aec09d049d7e994eba54bad4376dd8f58eab797 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:32:06 -0400 Subject: drop ->s_umount around acct_auto_close() just repeat the frozen check after regaining it, and check that sb is still alive. If several threads hit acct_auto_close() at the same time, acct_auto_close() will survive that just fine. And we really don't want to play with writes and closing the file with ->s_umount held exclusive - it's a deadlock country. Signed-off-by: Al Viro --- fs/super.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/super.c b/fs/super.c index 52ed93eb63d..a369f8964dc 100644 --- a/fs/super.c +++ b/fs/super.c @@ -702,12 +702,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return -EACCES; #endif - if (flags & MS_RDONLY) - acct_auto_close(&sb->s_pins); - shrink_dcache_sb(sb); - remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + if (remount_ro) { + if (sb->s_pins.first) { + up_write(&sb->s_umount); + acct_auto_close(&sb->s_pins); + down_write(&sb->s_umount); + if (!sb->s_root) + return 0; + if (sb->s_writers.frozen != SB_UNFROZEN) + return -EBUSY; + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + } + } + shrink_dcache_sb(sb); + /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if (remount_ro) { -- cgit v1.2.3-70-g09d2 From 17c0a5aaffa63da6b5c73a31e36616bdcd12d143 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:35:19 -0400 Subject: make acct_kill() wait for file closing. Do actual closing of file via schedule_work(). And use __fput_sync() there. Signed-off-by: Al Viro --- kernel/acct.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 6fd375f1562..d9ebc96b112 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -92,6 +92,8 @@ struct bsd_acct_struct { unsigned long needcheck; struct file *file; struct pid_namespace *ns; + struct work_struct work; + struct completion done; }; static void acct_free_rcu(struct rcu_head *head) @@ -176,15 +178,27 @@ again: return res; } +static void close_work(struct work_struct *work) +{ + struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); + struct file *file = acct->file; + mnt_unpin(file->f_path.mnt); + if (file->f_op->flush) + file->f_op->flush(file, NULL); + __fput_sync(file); + complete(&acct->done); +} + static void acct_kill(struct bsd_acct_struct *acct, struct bsd_acct_struct *new) { if (acct) { - struct file *file = acct->file; struct pid_namespace *ns = acct->ns; do_acct_process(acct); - mnt_unpin(file->f_path.mnt); - filp_close(file, NULL); + INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); + schedule_work(&acct->work); + wait_for_completion(&acct->done); spin_lock(&acct_lock); hlist_del(&acct->m_list); hlist_del(&acct->s_list); -- cgit v1.2.3-70-g09d2 From 215748e67d893169de9e62c3416e9e035e9e9c5f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:51:29 -0400 Subject: acct: move mnt_pin() upwards. Signed-off-by: Al Viro --- kernel/acct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index d9ebc96b112..2d9e04d9899 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -206,7 +206,6 @@ static void acct_kill(struct bsd_acct_struct *acct, ns->bacct = new; if (new) { struct vfsmount *m = new->file->f_path.mnt; - mnt_pin(m); spin_lock(&acct_lock); hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); @@ -256,6 +255,7 @@ static int acct_on(struct filename *pathname) acct->ns = ns; mutex_init(&acct->lock); mnt = file->f_path.mnt; + mnt_pin(mnt); old = acct_get(ns); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ @@ -264,7 +264,6 @@ static int acct_on(struct filename *pathname) } else { ns->bacct = acct; spin_lock(&acct_lock); - mnt_pin(mnt); hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); -- cgit v1.2.3-70-g09d2 From 1629d0eb3ead0e0c49e4402049ec7b5b31b81cd7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 08:00:52 -0400 Subject: start carving bsd_acct_struct up pull generic parts into struct fs_pin. Eventually we want those to replace mnt_pin()/mnt_unpin() mess; that stuff will move to fs/*. Signed-off-by: Al Viro --- kernel/acct.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 2d9e04d9899..afeaaa6f49b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -78,7 +78,7 @@ int acct_parm[3] = {4, 2, 30}; */ static void do_acct_process(struct bsd_acct_struct *acct); -struct bsd_acct_struct { +struct fs_pin { atomic_long_t count; union { struct { @@ -87,6 +87,10 @@ struct bsd_acct_struct { }; struct rcu_head rcu; }; +}; + +struct bsd_acct_struct { + struct fs_pin pin; struct mutex lock; int active; unsigned long needcheck; @@ -96,9 +100,9 @@ struct bsd_acct_struct { struct completion done; }; -static void acct_free_rcu(struct rcu_head *head) +static void pin_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct bsd_acct_struct, rcu)); + kfree(container_of(head, struct fs_pin, rcu)); } static DEFINE_SPINLOCK(acct_lock); @@ -138,15 +142,15 @@ out: return acct->active; } -static void acct_put(struct bsd_acct_struct *p) +static void pin_put(struct fs_pin *p) { if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, acct_free_rcu); + call_rcu(&p->rcu, pin_free_rcu); } static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) { - if (!atomic_long_inc_not_zero(&res->count)) { + if (!atomic_long_inc_not_zero(&res->pin.count)) { rcu_read_unlock(); cpu_relax(); return NULL; @@ -155,7 +159,7 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) mutex_lock(&res->lock); if (!res->ns) { mutex_unlock(&res->lock); - acct_put(res); + pin_put(&res->pin); return NULL; } return res; @@ -200,22 +204,22 @@ static void acct_kill(struct bsd_acct_struct *acct, schedule_work(&acct->work); wait_for_completion(&acct->done); spin_lock(&acct_lock); - hlist_del(&acct->m_list); - hlist_del(&acct->s_list); + hlist_del(&acct->pin.m_list); + hlist_del(&acct->pin.s_list); spin_unlock(&acct_lock); ns->bacct = new; if (new) { struct vfsmount *m = new->file->f_path.mnt; spin_lock(&acct_lock); - hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); - hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); + hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins); + hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins); spin_unlock(&acct_lock); mutex_unlock(&new->lock); } acct->ns = NULL; - atomic_long_dec(&acct->count); + atomic_long_dec(&acct->pin.count); mutex_unlock(&acct->lock); - acct_put(acct); + pin_put(&acct->pin); } } @@ -249,7 +253,7 @@ static int acct_on(struct filename *pathname) return -EIO; } - atomic_long_set(&acct->count, 1); + atomic_long_set(&acct->pin.count, 1); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; @@ -264,8 +268,8 @@ static int acct_on(struct filename *pathname) } else { ns->bacct = acct; spin_lock(&acct_lock); - hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); - hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); + hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins); + hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); mutex_unlock(&acct->lock); } @@ -317,7 +321,7 @@ void acct_auto_close_mnt(struct hlist_head *list) break; acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, - m_list)), NULL); + pin.m_list)), NULL); rcu_read_lock(); } rcu_read_unlock(); @@ -332,7 +336,7 @@ void acct_auto_close(struct hlist_head *list) break; acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, - s_list)), NULL); + pin.s_list)), NULL); rcu_read_lock(); } rcu_read_unlock(); @@ -613,7 +617,7 @@ static void slow_acct_process(struct pid_namespace *ns) if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); - acct_put(acct); + pin_put(&acct->pin); } } } -- cgit v1.2.3-70-g09d2 From efb170c22867cdc6f770de441bdefecec6712199 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 08:39:04 -0400 Subject: take fs_pin stuff to fs/* Add a new field to fs_pin - kill(pin). That's what umount and r/o remount will be calling for all pins attached to vfsmount and superblock resp. Called after bumping the refcount, so it won't go away under us. Dropping the refcount is responsibility of the instance. All generic stuff moved to fs/fs_pin.c; the next step will rip all the knowledge of kernel/acct.c from fs/super.c and fs/namespace.c. After that - death to mnt_pin(); it was intended to be usable as generic mechanism for code that wants to attach objects to vfsmount, so that they would not make the sucker busy and would get killed on umount. Never got it right; it remained acct.c-specific all along. Now it's very close to being killable. Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/fs_pin.c | 77 ++++++++++++++++++++++++++++++ include/linux/acct.h | 6 +-- include/linux/fs_pin.h | 17 +++++++ kernel/acct.c | 127 +++++++++++++------------------------------------ 5 files changed, 129 insertions(+), 100 deletions(-) create mode 100644 fs/fs_pin.c create mode 100644 include/linux/fs_pin.h diff --git a/fs/Makefile b/fs/Makefile index 4030cbfbc9a..90c88529892 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o + stack.o fs_struct.o statfs.o fs_pin.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/fs_pin.c b/fs/fs_pin.c new file mode 100644 index 00000000000..f3ce0b874a4 --- /dev/null +++ b/fs/fs_pin.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include "mount.h" + +static void pin_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct fs_pin, rcu)); +} + +static DEFINE_SPINLOCK(pin_lock); + +void pin_put(struct fs_pin *p) +{ + if (atomic_long_dec_and_test(&p->count)) + call_rcu(&p->rcu, pin_free_rcu); +} + +void pin_remove(struct fs_pin *pin) +{ + spin_lock(&pin_lock); + hlist_del(&pin->m_list); + hlist_del(&pin->s_list); + spin_unlock(&pin_lock); +} + +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + spin_lock(&pin_lock); + hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&pin_lock); +} + +void acct_auto_close_mnt(struct hlist_head *list) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(list->first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, m_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} + +void acct_auto_close(struct hlist_head *list) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(list->first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, s_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} diff --git a/include/linux/acct.h b/include/linux/acct.h index 65a4f889182..137837929db 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -24,18 +24,16 @@ struct super_block; struct pacct_struct; struct pid_namespace; extern int acct_parm[]; /* for sysctl */ -extern void acct_auto_close(struct hlist_head *); -extern void acct_auto_close_mnt(struct hlist_head *); extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); #else -#define acct_auto_close(x) do { } while (0) -#define acct_auto_close_mnt(x) do { } while (0) #define acct_collect(x,y) do { } while (0) #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) #endif +extern void acct_auto_close(struct hlist_head *); +extern void acct_auto_close_mnt(struct hlist_head *); /* * ACCT_VERSION numbers as yet defined: diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h new file mode 100644 index 00000000000..f66525e72cc --- /dev/null +++ b/include/linux/fs_pin.h @@ -0,0 +1,17 @@ +#include + +struct fs_pin { + atomic_long_t count; + union { + struct { + struct hlist_node s_list; + struct hlist_node m_list; + }; + struct rcu_head rcu; + }; + void (*kill)(struct fs_pin *); +}; + +void pin_put(struct fs_pin *); +void pin_remove(struct fs_pin *); +void pin_insert(struct fs_pin *, struct vfsmount *); diff --git a/kernel/acct.c b/kernel/acct.c index afeaaa6f49b..a7993a6cb60 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,7 +59,7 @@ #include #include /* sector_div */ #include -#include <../fs/mount.h> /* will go away when we refactor */ +#include /* * These constants control the amount of freespace that suspend and @@ -78,17 +78,6 @@ int acct_parm[3] = {4, 2, 30}; */ static void do_acct_process(struct bsd_acct_struct *acct); -struct fs_pin { - atomic_long_t count; - union { - struct { - struct hlist_node s_list; - struct hlist_node m_list; - }; - struct rcu_head rcu; - }; -}; - struct bsd_acct_struct { struct fs_pin pin; struct mutex lock; @@ -100,13 +89,6 @@ struct bsd_acct_struct { struct completion done; }; -static void pin_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct fs_pin, rcu)); -} - -static DEFINE_SPINLOCK(acct_lock); - /* * Check the amount of free space and suspend/resume accordingly. */ @@ -142,29 +124,6 @@ out: return acct->active; } -static void pin_put(struct fs_pin *p) -{ - if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, pin_free_rcu); -} - -static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) -{ - if (!atomic_long_inc_not_zero(&res->pin.count)) { - rcu_read_unlock(); - cpu_relax(); - return NULL; - } - rcu_read_unlock(); - mutex_lock(&res->lock); - if (!res->ns) { - mutex_unlock(&res->lock); - pin_put(&res->pin); - return NULL; - } - return res; -} - static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; @@ -176,9 +135,18 @@ again: rcu_read_unlock(); return NULL; } - res = __acct_get(res); - if (!res) + if (!atomic_long_inc_not_zero(&res->pin.count)) { + rcu_read_unlock(); + cpu_relax(); goto again; + } + rcu_read_unlock(); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + pin_put(&res->pin); + goto again; + } return res; } @@ -203,19 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct, init_completion(&acct->done); schedule_work(&acct->work); wait_for_completion(&acct->done); - spin_lock(&acct_lock); - hlist_del(&acct->pin.m_list); - hlist_del(&acct->pin.s_list); - spin_unlock(&acct_lock); + pin_remove(&acct->pin); ns->bacct = new; - if (new) { - struct vfsmount *m = new->file->f_path.mnt; - spin_lock(&acct_lock); - hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins); - hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins); - spin_unlock(&acct_lock); - mutex_unlock(&new->lock); - } acct->ns = NULL; atomic_long_dec(&acct->pin.count); mutex_unlock(&acct->lock); @@ -223,6 +180,19 @@ static void acct_kill(struct bsd_acct_struct *acct, } } +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct; + acct = container_of(pin, struct bsd_acct_struct, pin); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + pin_put(pin); + acct = NULL; + } + acct_kill(acct, NULL); +} + static int acct_on(struct filename *pathname) { struct file *file; @@ -254,25 +224,22 @@ static int acct_on(struct filename *pathname) } atomic_long_set(&acct->pin.count, 1); + acct->pin.kill = acct_pin_kill; acct->file = file; acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); mnt = file->f_path.mnt; mnt_pin(mnt); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ + pin_insert(&acct->pin, mnt); old = acct_get(ns); - mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ - if (old) { + if (old) acct_kill(old, acct); - } else { + else ns->bacct = acct; - spin_lock(&acct_lock); - hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins); - hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins); - spin_unlock(&acct_lock); - mutex_unlock(&acct->lock); - } + mutex_unlock(&acct->lock); mntput(mnt); /* it's pinned, now give up active reference */ return 0; } @@ -312,36 +279,6 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return error; } -void acct_auto_close_mnt(struct hlist_head *list) -{ - rcu_read_lock(); - while (1) { - struct hlist_node *p = ACCESS_ONCE(list->first); - if (!p) - break; - acct_kill(__acct_get(hlist_entry(p, - struct bsd_acct_struct, - pin.m_list)), NULL); - rcu_read_lock(); - } - rcu_read_unlock(); -} - -void acct_auto_close(struct hlist_head *list) -{ - rcu_read_lock(); - while (1) { - struct hlist_node *p = ACCESS_ONCE(list->first); - if (!p) - break; - acct_kill(__acct_get(hlist_entry(p, - struct bsd_acct_struct, - pin.s_list)), NULL); - rcu_read_lock(); - } - rcu_read_unlock(); -} - void acct_exit_ns(struct pid_namespace *ns) { acct_kill(acct_get(ns), NULL); -- cgit v1.2.3-70-g09d2 From 8fa1f1c2bd86007beb4a4845e6087ac4a704dc80 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 May 2014 18:22:52 -0400 Subject: make fs/{namespace,super}.c forget about acct.h These externs belong in fs/internal.h. Rename (they are not acct-specific anymore) and move them over there. Signed-off-by: Al Viro --- fs/fs_pin.c | 9 +++++---- fs/internal.h | 6 ++++++ fs/namespace.c | 3 +-- fs/super.c | 3 +-- include/linux/acct.h | 2 -- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/fs_pin.c b/fs/fs_pin.c index f3ce0b874a4..9368236ca10 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -1,6 +1,7 @@ #include #include #include +#include "internal.h" #include "mount.h" static void pin_free_rcu(struct rcu_head *head) @@ -32,13 +33,13 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) spin_unlock(&pin_lock); } -void acct_auto_close_mnt(struct hlist_head *list) +void mnt_pin_kill(struct mount *m) { while (1) { struct hlist_node *p; struct fs_pin *pin; rcu_read_lock(); - p = ACCESS_ONCE(list->first); + p = ACCESS_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; @@ -54,13 +55,13 @@ void acct_auto_close_mnt(struct hlist_head *list) } } -void acct_auto_close(struct hlist_head *list) +void sb_pin_kill(struct super_block *sb) { while (1) { struct hlist_node *p; struct fs_pin *pin; rcu_read_lock(); - p = ACCESS_ONCE(list->first); + p = ACCESS_ONCE(sb->s_pins.first); if (!p) { rcu_read_unlock(); break; diff --git a/fs/internal.h b/fs/internal.h index 9a2edba87c2..e325b4f9c79 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -143,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, * pipe.c */ extern const struct file_operations pipefifo_fops; + +/* + * fs_pin.c + */ +extern void sb_pin_kill(struct super_block *sb); +extern void mnt_pin_kill(struct mount *m); diff --git a/fs/namespace.c b/fs/namespace.c index 22e530addfa..0e4ce51c527 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -16,7 +16,6 @@ #include #include #include -#include /* acct_auto_close_mnt */ #include /* init_rootfs */ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ @@ -956,7 +955,7 @@ put_again: mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); - acct_auto_close_mnt(&mnt->mnt_pins); + mnt_pin_kill(mnt); goto put_again; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { diff --git a/fs/super.c b/fs/super.c index a369f8964dc..a371ce6aa91 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -707,7 +706,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (remount_ro) { if (sb->s_pins.first) { up_write(&sb->s_umount); - acct_auto_close(&sb->s_pins); + sb_pin_kill(sb); down_write(&sb->s_umount); if (!sb->s_root) return 0; diff --git a/include/linux/acct.h b/include/linux/acct.h index 137837929db..dccc2d4fe7d 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -32,8 +32,6 @@ extern void acct_exit_ns(struct pid_namespace *); #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) #endif -extern void acct_auto_close(struct hlist_head *); -extern void acct_auto_close_mnt(struct hlist_head *); /* * ACCT_VERSION numbers as yet defined: -- cgit v1.2.3-70-g09d2 From 3064c3563ba4c23e2c7a47254ec056ed9ba0098a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 09:12:31 -0400 Subject: death to mnt_pinned Rather than playing silly buggers with vfsmount refcounts, just have acct_on() ask fs/namespace.c for internal clone of file->f_path.mnt and replace it with said clone. Then attach the pin to original vfsmount. Voila - the clone will be alive until the file gets closed, making sure that underlying superblock remains active, etc., and we can drop the original vfsmount, so that it's not kept busy. If the file lives until the final mntput of the original vfsmount, we'll notice that there's an fs_pin (one in bsd_acct_struct that holds that file) and mnt_pin_kill() will take it out. Since ->kill() is synchronous, we won't proceed past that point until these files are closed (and private clones of our vfsmount are gone), so we get the same ordering warranties we used to get. mnt_pin()/mnt_unpin()/->mnt_pinned is gone now, and good riddance - it never became usable outside of kernel/acct.c (and racy wrt umount even there). Signed-off-by: Al Viro --- fs/mount.h | 1 - fs/namespace.c | 35 +++++++++-------------------------- include/linux/mount.h | 4 ++-- kernel/acct.c | 24 +++++++++++++++++++----- 4 files changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 0a2d1458681..6740a621552 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -55,7 +55,6 @@ struct mount { int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ - int mnt_pinned; struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/namespace.c b/fs/namespace.c index 0e4ce51c527..65af9d0e0d6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -937,7 +937,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, static void mntput_no_expire(struct mount *mnt) { -put_again: rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@ -950,14 +949,6 @@ put_again: unlock_mount_hash(); return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - rcu_read_unlock(); - unlock_mount_hash(); - mnt_pin_kill(mnt); - goto put_again; - } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); @@ -980,6 +971,8 @@ put_again: * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); @@ -1007,25 +1000,15 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mnt_pin(struct vfsmount *mnt) +struct vfsmount *mnt_clone_internal(struct path *path) { - lock_mount_hash(); - real_mount(mnt)->mnt_pinned++; - unlock_mount_hash(); -} -EXPORT_SYMBOL(mnt_pin); - -void mnt_unpin(struct vfsmount *m) -{ - struct mount *mnt = real_mount(m); - lock_mount_hash(); - if (mnt->mnt_pinned) { - mnt_add_count(mnt, 1); - mnt->mnt_pinned--; - } - unlock_mount_hash(); + struct mount *p; + p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); + if (IS_ERR(p)) + return ERR_CAST(p); + p->mnt.mnt_flags |= MNT_INTERNAL; + return &p->mnt; } -EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) { diff --git a/include/linux/mount.h b/include/linux/mount.h index 839bac27090..864b120c134 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -62,6 +62,7 @@ struct vfsmount { }; struct file; /* forward dec */ +struct path; extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); @@ -70,8 +71,7 @@ extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); -extern void mnt_pin(struct vfsmount *mnt); -extern void mnt_unpin(struct vfsmount *mnt); +extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); struct file_system_type; diff --git a/kernel/acct.c b/kernel/acct.c index a7993a6cb60..2e6cf818021 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -154,7 +154,6 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; - mnt_unpin(file->f_path.mnt); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -196,9 +195,10 @@ static void acct_pin_kill(struct fs_pin *pin) static int acct_on(struct filename *pathname) { struct file *file; - struct vfsmount *mnt; + struct vfsmount *mnt, *internal; struct pid_namespace *ns = task_active_pid_ns(current); struct bsd_acct_struct *acct, *old; + int err; acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); if (!acct) @@ -222,6 +222,21 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return -EIO; } + internal = mnt_clone_internal(&file->f_path); + if (IS_ERR(internal)) { + kfree(acct); + filp_close(file, NULL); + return PTR_ERR(internal); + } + err = mnt_want_write(internal); + if (err) { + mntput(internal); + kfree(acct); + filp_close(file, NULL); + return err; + } + mnt = file->f_path.mnt; + file->f_path.mnt = internal; atomic_long_set(&acct->pin.count, 1); acct->pin.kill = acct_pin_kill; @@ -229,8 +244,6 @@ static int acct_on(struct filename *pathname) acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); - mnt = file->f_path.mnt; - mnt_pin(mnt); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ pin_insert(&acct->pin, mnt); @@ -240,7 +253,8 @@ static int acct_on(struct filename *pathname) else ns->bacct = acct; mutex_unlock(&acct->lock); - mntput(mnt); /* it's pinned, now give up active reference */ + mnt_drop_write(mnt); + mntput(mnt); return 0; } -- cgit v1.2.3-70-g09d2 From 2577d92ebd28dd9b3dacdfad6dcd81be0d21bbdf Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Thu, 31 Jul 2014 09:28:36 +1000 Subject: kernel/acct.c: fix coding style warnings and errors Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/acct.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 2e6cf818021..b4c667d22e7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -108,14 +108,14 @@ static int check_free_space(struct bsd_acct_struct *acct) do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -280,6 +280,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); @@ -337,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -350,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 < 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -429,16 +431,17 @@ static void fill_ac(acct_t *ac) run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac->ac_etime = encode_float(elapsed); #else ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac->ac_etime_hi = etime >> 16; ac->ac_etime_lo = (u16) etime; } @@ -491,12 +494,12 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -530,6 +533,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { -- cgit v1.2.3-70-g09d2 From 7177a9c4b509eb357cc450256bc3cf39f1a1e639 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:30 +0200 Subject: fs: call rename2 if exists Christoph Hellwig suggests: 1) make vfs_rename call ->rename2 if it exists instead of ->rename 2) switch all filesystems that you're adding NOREPLACE support for to use ->rename2 3) see how many ->rename instances we'll have left after a few iterations of 2. Signed-off-by: Miklos Szeredi Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ext4/namei.c | 1 - fs/fuse/dir.c | 7 ------- fs/namei.c | 5 +++-- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3520ab8a663..b147a67baa0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = { .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, - .rename = ext4_rename, .rename2 = ext4_rename2, .setattr = ext4_setattr, .setxattr = generic_setxattr, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0c6048247a3..de1d84af9f7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) -{ - return fuse_rename2(olddir, oldent, newdir, newent, 0); -} - static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = { .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, - .rename = fuse_rename, .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, diff --git a/fs/namei.c b/fs/namei.c index 9eb787e5c16..0ff23cecb1b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4075,7 +4075,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!old_dir->i_op->rename) + if (!old_dir->i_op->rename && !old_dir->i_op->rename2) return -EPERM; if (flags && !old_dir->i_op->rename2) @@ -4134,10 +4134,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; } - if (!flags) { + if (!old_dir->i_op->rename2) { error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); } else { + WARN_ON(old_dir->i_op->rename != NULL); error = old_dir->i_op->rename2(old_dir, old_dentry, new_dir, new_dentry, flags); } -- cgit v1.2.3-70-g09d2 From a0dbc56610b3e157f19241404e738744b7e7877e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:31 +0200 Subject: bad_inode: add ->rename2() so we return -EIO instead of -EINVAL. Signed-off-by: Miklos Szeredi Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/bad_inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 7c93953030f..afd2b4408ad 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, return -EIO; } -static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { return -EIO; } @@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops = .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, - .rename = bad_inode_rename, + .rename2 = bad_inode_rename2, .readlink = bad_inode_readlink, /* follow_link must be no-op, otherwise unmounting this inode won't work */ -- cgit v1.2.3-70-g09d2 From 80ace85c915d0f41016f82917218997b72431258 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:32 +0200 Subject: btrfs: add RENAME_NOREPLACE RENAME_NOREPLACE is trivial to implement for most filesystems: switch over to ->rename2() and check for the supported flags. The rest is done by the VFS. Signed-off-by: Miklos Szeredi Cc: Chris Mason Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3668048e16f..3183742d6f0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8476,6 +8476,16 @@ out_notrans: return ret; } +static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; @@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, - .rename = btrfs_rename, + .rename2 = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, -- cgit v1.2.3-70-g09d2 From 3b69ff51d087d265aa4af3a532fc4f20bf33e718 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:33 +0200 Subject: shmem: support RENAME_NOREPLACE Implement ->rename2 instead of ->rename. Signed-off-by: Miklos Szeredi Acked-by: Hugh Dickins Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- mm/shmem.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index af68b15a8fc..fe959181f99 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2054,11 +2054,14 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ -static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode); + if (flags & ~(RENAME_NOREPLACE)) + return -EINVAL; + if (!simple_empty(new_dentry)) return -ENOTEMPTY; @@ -2741,7 +2744,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, - .rename = shmem_rename, + .rename2 = shmem_rename2, .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR -- cgit v1.2.3-70-g09d2 From 37456771c58be10dd813fb4510035d0d67a969aa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:34 +0200 Subject: shmem: support RENAME_EXCHANGE This is really simple in tmpfs since the VFS already takes care of shuffling the dentries. Just adjust nlink on parent directories and touch c & mtimes. Signed-off-by: Miklos Szeredi Acked-by: Hugh Dickins Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- mm/shmem.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index fe959181f99..0f018002dd6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2048,6 +2048,28 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) return shmem_unlink(dir, dentry); } +static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +{ + bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode); + + if (old_dir != new_dir && old_is_dir != new_is_dir) { + if (old_is_dir) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } else { + drop_nlink(new_dir); + inc_nlink(old_dir); + } + } + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + old_dentry->d_inode->i_ctime = + new_dentry->d_inode->i_ctime = CURRENT_TIME; + + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if @@ -2059,9 +2081,12 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode); - if (flags & ~(RENAME_NOREPLACE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; + if (flags & RENAME_EXCHANGE) + return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (!simple_empty(new_dentry)) return -ENOTEMPTY; -- cgit v1.2.3-70-g09d2 From 9a423bb6e3577bb372942edfb5d9d26632741d43 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:35 +0200 Subject: hostfs: support rename flags Support RENAME_NOREPLACE and RENAME_EXCHANGE flags on hostfs if the underlying filesystem supports it. Since renameat2(2) is not yet in any libc, use syscall(2) to invoke the renameat2 syscall. Signed-off-by: Miklos Szeredi Cc: Richard Weinberger Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/hostfs/hostfs.h | 1 + fs/hostfs/hostfs_kern.c | 30 ++++++++++++++++++++---------- fs/hostfs/hostfs_user.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 9c88da0e855..4fcd40d6f30 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major, extern int link_file(const char *from, const char *to); extern int hostfs_do_readlink(char *file, char *buf, int size); extern int rename_file(char *from, char *to); +extern int rename2_file(char *from, char *to, unsigned int flags); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index bb529f3b7f2..fd62cae0fdc 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int hostfs_rename(struct inode *from_ino, struct dentry *from, - struct inode *to_ino, struct dentry *to) +static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { - char *from_name, *to_name; + char *old_name, *new_name; int err; - if ((from_name = dentry_name(from)) == NULL) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + old_name = dentry_name(old_dentry); + if (old_name == NULL) return -ENOMEM; - if ((to_name = dentry_name(to)) == NULL) { - __putname(from_name); + new_name = dentry_name(new_dentry); + if (new_name == NULL) { + __putname(old_name); return -ENOMEM; } - err = rename_file(from_name, to_name); - __putname(from_name); - __putname(to_name); + if (!flags) + err = rename_file(old_name, new_name); + else + err = rename2_file(old_name, new_name, flags); + + __putname(old_name); + __putname(new_name); return err; } @@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = { .mkdir = hostfs_mkdir, .rmdir = hostfs_rmdir, .mknod = hostfs_mknod, - .rename = hostfs_rename, + .rename2 = hostfs_rename2, .permission = hostfs_permission, .setattr = hostfs_setattr, }; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 67838f3aa20..9765dab95cb 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "hostfs.h" #include @@ -360,6 +361,33 @@ int rename_file(char *from, char *to) return 0; } +int rename2_file(char *from, char *to, unsigned int flags) +{ + int err; + +#ifndef SYS_renameat2 +# ifdef __x86_64__ +# define SYS_renameat2 316 +# endif +# ifdef __i386__ +# define SYS_renameat2 353 +# endif +#endif + +#ifdef SYS_renameat2 + err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags); + if (err < 0) { + if (errno != ENOSYS) + return -errno; + else + return -EINVAL; + } + return 0; +#else + return -EINVAL; +#endif +} + int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, -- cgit v1.2.3-70-g09d2 From 7c33d5972ce382bcc506d16235f1e9b7d22cbef8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2014 15:15:36 +0200 Subject: cifs: support RENAME_NOREPLACE This flag gives CIFS the ability to support its native rename semantics. Implementation is simple: just bail out before trying to hack around the noreplace semantics. Signed-off-by: Miklos Szeredi Cc: Steve French Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 2 +- fs/cifs/cifsfs.h | 4 ++-- fs/cifs/inode.c | 14 ++++++++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 88839806742..ac4f260155c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -848,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = { .link = cifs_hardlink, .mkdir = cifs_mkdir, .rmdir = cifs_rmdir, - .rename = cifs_rename, + .rename2 = cifs_rename2, .permission = cifs_permission, /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 70f178a7c75..ed58c88f5f5 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename(struct inode *, struct dentry *, struct inode *, - struct dentry *); +extern int cifs_rename2(struct inode *, struct dentry *, struct inode *, + struct dentry *, unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a174605f6af..bec0a0831be 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1627,8 +1627,9 @@ do_rename_exit: } int -cifs_rename(struct inode *source_dir, struct dentry *source_dentry, - struct inode *target_dir, struct dentry *target_dentry) +cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, + struct inode *target_dir, struct dentry *target_dentry, + unsigned int flags) { char *from_name = NULL; char *to_name = NULL; @@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry, unsigned int xid; int rc, tmprc; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + cifs_sb = CIFS_SB(source_dir->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry, rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); + /* + * No-replace is the natural behavior for CIFS, so skip unlink hacks. + */ + if (flags & RENAME_NOREPLACE) + goto cifs_rename_exit; + if (rc == -EEXIST && tcon->unix_ext) { /* * Are src and dst hardlinks of same inode? We can only tell -- cgit v1.2.3-70-g09d2 From b8faf035ea9d0011b04856a8198e2e212d93346a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 4 Aug 2014 17:06:29 +1000 Subject: VFS: allow ->d_manage() to declare -EISDIR in rcu_walk mode. In REF-walk mode, ->d_manage can return -EISDIR to indicate that the dentry is not really a mount trap (or even a mount point) and that any mounts or any DCACHE_NEED_AUTOMOUNT flag should be ignored. RCU-walk mode doesn't currently support this, so if there is a dentry with DCACHE_NEED_AUTOMOUNT set but which shouldn't be a mount-trap, lookup_fast() will always drop in REF-walk mode. With this patch, an -EISDIR from ->d_manage will always cause mounts and automounts to be ignored, both in REF-walk and RCU-walk. Bug-fixed-by: Dan Carpenter Cc: Ian Kent Signed-off-by: NeilBrown Signed-off-by: Al Viro --- Documentation/filesystems/vfs.txt | 3 ++- fs/namei.c | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index a1d0d7a3016..61d65cc65c5 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -1053,7 +1053,8 @@ struct dentry_operations { If the 'rcu_walk' parameter is true, then the caller is doing a pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, and the caller can be asked to leave it and call again by returning - -ECHILD. + -ECHILD. -EISDIR may also be returned to tell pathwalk to + ignore d_automount or any mounts. This function is only used if DCACHE_MANAGE_TRANSIT is set on the dentry being transited from. diff --git a/fs/namei.c b/fs/namei.c index 0ff23cecb1b..8a217c48f6d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline bool managed_dentry_might_block(struct dentry *dentry) +static inline int managed_dentry_rcu(struct dentry *dentry) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && - dentry->d_op->d_manage(dentry, true) < 0); + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + dentry->d_op->d_manage(dentry, true) : 0; } /* @@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - if (unlikely(managed_dentry_might_block(path->dentry))) + switch (managed_dentry_rcu(path->dentry)) { + case -ECHILD: + default: return false; + case -EISDIR: + return true; + case 0: + break; + } if (!d_mountpoint(path->dentry)) - return true; + return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) @@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return read_seqretry(&mount_lock, nd->m_seq); + return read_seqretry(&mount_lock, nd->m_seq) && + !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } static int follow_dotdot_rcu(struct nameidata *nd) @@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd, } path->mnt = mnt; path->dentry = dentry; - if (unlikely(!__follow_mount_rcu(nd, path, inode))) - goto unlazy; - if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - goto unlazy; - return 0; + if (likely(__follow_mount_rcu(nd, path, inode))) + return 0; unlazy: if (unlazy_walk(nd, dentry)) return -ECHILD; -- cgit v1.2.3-70-g09d2 From d03b29a271eb1d6de5af0f46cf0e7487e9e9284b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Feb 2014 16:52:33 -0500 Subject: namei: trivial fix to vfs_rename_dir comment Looks like the directory loop check is actually done in renameat? Whatever, leave this out rather than trying to keep it up to date with the code. Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 8a217c48f6d..a996bb48dfa 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4024,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: - * a) we can get into loop creation. Check is done in is_subdir(). + * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another -- cgit v1.2.3-70-g09d2 From 3f70bd51cb4405dc5cf8624292ffa474679fc9c7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Feb 2014 14:11:26 -0500 Subject: dcache: move d_splice_alias Just a trivial move to locate it near (similar) d_materialise_unique code and save some forward references in a following patch. Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/dcache.c | 104 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 06f65857a85..8bdae36a095 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1853,58 +1853,6 @@ struct dentry *d_obtain_alias(struct inode *inode) } EXPORT_SYMBOL(d_obtain_alias); -/** - * d_splice_alias - splice a disconnected dentry into the tree if one exists - * @inode: the inode which may have a disconnected dentry - * @dentry: a negative dentry which we want to point to the inode. - * - * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and - * DCACHE_DISCONNECTED), then d_move that in place of the given dentry - * and return it, else simply d_add the inode to the dentry and return NULL. - * - * This is needed in the lookup routine of any filesystem that is exportable - * (via knfsd) so that we can build dcache paths to directories effectively. - * - * If a dentry was found and moved, then it is returned. Otherwise NULL - * is returned. This matches the expected return value of ->lookup. - * - * Cluster filesystems may call this function with a negative, hashed dentry. - * In that case, we know that the inode will be a regular file, and also this - * will only occur during atomic_open. So we need to check for the dentry - * being already hashed only in the final case. - */ -struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) -{ - struct dentry *new = NULL; - - if (IS_ERR(inode)) - return ERR_CAST(inode); - - if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - new = __d_find_alias(inode, 1); - if (new) { - BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); - spin_unlock(&inode->i_lock); - security_d_instantiate(new, inode); - d_move(new, dentry); - iput(inode); - } else { - /* already taking inode->i_lock, so d_add() by hand */ - __d_instantiate(dentry, inode); - spin_unlock(&inode->i_lock); - security_d_instantiate(dentry, inode); - d_rehash(dentry); - } - } else { - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) - d_rehash(dentry); - } - return new; -} -EXPORT_SYMBOL(d_splice_alias); - /** * d_add_ci - lookup or allocate new dentry with case-exact name * @inode: the inode case-insensitive lookup has found @@ -2696,6 +2644,58 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) /* anon->d_lock still locked, returns locked */ } +/** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and + * DCACHE_DISCONNECTED), then d_move that in place of the given dentry + * and return it, else simply d_add the inode to the dentry and return NULL. + * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. + */ +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *new = NULL; + + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (inode && S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); + new = __d_find_alias(inode, 1); + if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); + spin_unlock(&inode->i_lock); + security_d_instantiate(new, inode); + d_move(new, dentry); + iput(inode); + } else { + /* already taking inode->i_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); + security_d_instantiate(dentry, inode); + d_rehash(dentry); + } + } else { + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) + d_rehash(dentry); + } + return new; +} +EXPORT_SYMBOL(d_splice_alias); + /** * d_materialise_unique - introduce an inode into the tree * @dentry: candidate dentry -- cgit v1.2.3-70-g09d2 From 75a2352d0110960aeee1a28ddc09a55f97c99100 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Feb 2014 17:45:56 -0500 Subject: dcache: close d_move race in d_splice_alias d_splice_alias will d_move an IS_ROOT() directory dentry into place if one exists. This should be safe as long as the dentry remains IS_ROOT, but I can't see what guarantees that: once we drop the i_lock all we hold here is the i_mutex on an unrelated parent directory. Instead copy the logic of d_materialise_unique. Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/dcache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index 8bdae36a095..8c09db9bb2a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2676,9 +2676,14 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) new = __d_find_alias(inode, 1); if (new) { BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); + write_seqlock(&rename_lock); + __d_materialise_dentry(dentry, new); + write_sequnlock(&rename_lock); + __d_drop(new); + _d_rehash(new); + spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); - d_move(new, dentry); iput(inode); } else { /* already taking inode->i_lock, so d_add() by hand */ -- cgit v1.2.3-70-g09d2 From 908790fa3b779d37365e6b28e3aa0f6e833020c3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Feb 2014 17:58:42 -0500 Subject: dcache: d_splice_alias mustn't create directory aliases Currently if d_splice_alias finds a directory with an alias that is not IS_ROOT or not DCACHE_DISCONNECTED, it creates a duplicate directory. Duplicate directory dentries are unacceptable; it is better just to error out. (In the case of a local filesystem the most likely case is filesystem corruption: for example, perhaps two directories point to the same child directory, and the other parent has already been found and cached.) Note that distributed filesystems may encounter this case in normal operation if a remote host moves a directory to a location different from the one we last cached in the dcache. For that reason, such filesystems should instead use d_materialise_unique, which tries to move the old directory alias to the right place instead of erroring out. Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/dcache.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 8c09db9bb2a..a191eebf1d6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2653,6 +2653,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) * DCACHE_DISCONNECTED), then d_move that in place of the given dentry * and return it, else simply d_add the inode to the dentry and return NULL. * + * If a non-IS_ROOT directory is found, the filesystem is corrupt, and + * we should error out: directories can't have multiple aliases. + * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * @@ -2673,9 +2676,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (inode && S_ISDIR(inode->i_mode)) { spin_lock(&inode->i_lock); - new = __d_find_alias(inode, 1); + new = __d_find_any_alias(inode); if (new) { - BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); + if (!IS_ROOT(new) || !(new->d_flags & DCACHE_DISCONNECTED)) { + spin_unlock(&inode->i_lock); + dput(new); + return ERR_PTR(-EIO); + } write_seqlock(&rename_lock); __d_materialise_dentry(dentry, new); write_sequnlock(&rename_lock); -- cgit v1.2.3-70-g09d2 From da093a9b76efca0a7a217af538929e1ecb204466 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 17 Feb 2014 18:03:57 -0500 Subject: dcache: d_splice_alias should ignore DCACHE_DISCONNECTED Any IS_ROOT() alias should be safe to use; there's nothing special about DCACHE_DISCONNECTED dentries. Note that this is in fact useful for filesystems such as btrfs which can legimately encounter a directory with a preexisting IS_ROOT alias on a lookup that crosses into a subvolume. (Those aliases are currently marked DCACHE_DISCONNECTED--but not really for any good reason, and we'll change that soon.) Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index a191eebf1d6..3ed09536399 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2649,9 +2649,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * - * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and - * DCACHE_DISCONNECTED), then d_move that in place of the given dentry - * and return it, else simply d_add the inode to the dentry and return NULL. + * If inode is a directory and has an IS_ROOT alias, then d_move that in + * place of the given dentry and return it, else simply d_add the inode + * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. @@ -2678,7 +2678,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) spin_lock(&inode->i_lock); new = __d_find_any_alias(inode); if (new) { - if (!IS_ROOT(new) || !(new->d_flags & DCACHE_DISCONNECTED)) { + if (!IS_ROOT(new)) { spin_unlock(&inode->i_lock); dput(new); return ERR_PTR(-EIO); -- cgit v1.2.3-70-g09d2 From 1a0a397e41cb1bf70cfe45fd0eeff08c7c501ec0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 14 Feb 2014 17:35:37 -0500 Subject: dcache: d_obtain_alias callers don't all want DISCONNECTED There are a few d_obtain_alias callers that are using it to get the root of a filesystem which may already have an alias somewhere else. This is not the same as the filehandle-lookup case, and none of them actually need DCACHE_DISCONNECTED set. It isn't really a serious problem, but it would really be clearer if we reserved DCACHE_DISCONNECTED for those cases where it's actually needed. In the btrfs case this was causing a spurious printk from nfsd/nfsfh.c:fh_verify when it found an unexpected DCACHE_DISCONNECTED dentry. Josef worked around this by unsetting DCACHE_DISCONNECTED manually in 3a0dfa6a12e "Btrfs: unset DCACHE_DISCONNECTED when mounting default subvol", and this replaces that workaround. Cc: Josef Bacik Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/btrfs/super.c | 9 +------ fs/ceph/super.c | 2 +- fs/dcache.c | 69 +++++++++++++++++++++++++++++++++++--------------- fs/nfs/getroot.c | 2 +- fs/nilfs2/super.c | 2 +- include/linux/dcache.h | 1 + 6 files changed, 54 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8e16bca69c5..67b48b9a03e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -922,13 +921,7 @@ setup_root: return dget(sb->s_root); } - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_DISCONNECTED; - spin_unlock(&dentry->d_lock); - } - return dentry; + return d_obtain_root(inode); } static int btrfs_fill_super(struct super_block *sb, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 06150fd745a..f6e12377335 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, goto out; } } else { - root = d_obtain_alias(inode); + root = d_obtain_root(inode); } ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); diff --git a/fs/dcache.c b/fs/dcache.c index 3ed09536399..63d556c0e69 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1781,25 +1781,7 @@ struct dentry *d_find_any_alias(struct inode *inode) } EXPORT_SYMBOL(d_find_any_alias); -/** - * d_obtain_alias - find or allocate a dentry for a given inode - * @inode: inode to allocate the dentry for - * - * Obtain a dentry for an inode resulting from NFS filehandle conversion or - * similar open by handle operations. The returned dentry may be anonymous, - * or may have a full name (if the inode was already in the cache). - * - * When called on a directory inode, we must ensure that the inode only ever - * has one dentry. If a dentry is found, that is returned instead of - * allocating a new one. - * - * On successful return, the reference to the inode has been transferred - * to the dentry. In case of an error the reference on the inode is released. - * To make it easier to use in export operations a %NULL or IS_ERR inode may - * be passed in and will be the error will be propagate to the return value, - * with a %NULL @inode replaced by ERR_PTR(-ESTALE). - */ -struct dentry *d_obtain_alias(struct inode *inode) +struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) { static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; @@ -1830,7 +1812,10 @@ struct dentry *d_obtain_alias(struct inode *inode) } /* attach a disconnected dentry */ - add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; + add_flags = d_flags_for_inode(inode); + + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; spin_lock(&tmp->d_lock); tmp->d_inode = inode; @@ -1851,8 +1836,52 @@ struct dentry *d_obtain_alias(struct inode *inode) iput(inode); return res; } + +/** + * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain a dentry for an inode resulting from NFS filehandle conversion or + * similar open by handle operations. The returned dentry may be anonymous, + * or may have a full name (if the inode was already in the cache). + * + * When called on a directory inode, we must ensure that the inode only ever + * has one dentry. If a dentry is found, that is returned instead of + * allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is released. + * To make it easier to use in export operations a %NULL or IS_ERR inode may + * be passed in and the error will be propagated to the return value, + * with a %NULL @inode replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_alias(struct inode *inode) +{ + return __d_obtain_alias(inode, 1); +} EXPORT_SYMBOL(d_obtain_alias); +/** + * d_obtain_root - find or allocate a dentry for a given inode + * @inode: inode to allocate the dentry for + * + * Obtain an IS_ROOT dentry for the root of a filesystem. + * + * We must ensure that directory inodes only ever have one dentry. If a + * dentry is found, that is returned instead of allocating a new one. + * + * On successful return, the reference to the inode has been transferred + * to the dentry. In case of an error the reference on the inode is + * released. A %NULL or IS_ERR inode may be passed in and will be the + * error will be propagate to the return value, with a %NULL @inode + * replaced by ERR_PTR(-ESTALE). + */ +struct dentry *d_obtain_root(struct inode *inode) +{ + return __d_obtain_alias(inode, 0); +} +EXPORT_SYMBOL(d_obtain_root); + /** * d_add_ci - lookup or allocate new dentry with case-exact name * @inode: the inode case-insensitive lookup has found diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b94f80420a5..880618a8b04 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - ret = d_obtain_alias(inode); + ret = d_obtain_root(inode); if (IS_ERR(ret)) { dprintk("nfs_get_root: get root dentry failed\n"); goto out; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8c532b2ca3a..ac914994dfe 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb, iput(inode); } } else { - dentry = d_obtain_alias(inode); + dentry = d_obtain_root(inode); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto failed_dentry; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3c7ec327ebd..e4ae2ad48d0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -249,6 +249,7 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); +extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); extern void shrink_dcache_for_umount(struct super_block *); -- cgit v1.2.3-70-g09d2 From 52ed46f0fa88243887b823d24ccb9fcf47a735b3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Jan 2014 11:15:51 -0500 Subject: dcache: remove unused d_find_alias parameter Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/dcache.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 63d556c0e69..5c5f3bd9af5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question - * @want_discon: flag, used by d_splice_alias, to request - * that only a DISCONNECTED alias be returned. * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. @@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent); * of a filesystem. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one unless @want_discon is set, - * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. + * any other hashed alias over that one. */ -static struct dentry *__d_find_alias(struct inode *inode, int want_discon) +static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias, *discon_alias; @@ -756,7 +753,7 @@ again: if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - } else if (!want_discon) { + } else { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; @@ -787,7 +784,7 @@ struct dentry *d_find_alias(struct inode *inode) if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode, 0); + de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; @@ -2765,7 +2762,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) struct dentry *alias; /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode, 0); + alias = __d_find_alias(inode); if (alias) { actual = alias; write_seqlock(&rename_lock); -- cgit v1.2.3-70-g09d2 From 8d80d7dabe9668965574669afbd31733f7b0fe9b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 16 Jan 2014 17:17:31 -0500 Subject: dcache: d_find_alias needn't recheck IS_ROOT && DCACHE_DISCONNECTED If we get to this point and discover the dentry is not a root dentry, or not DCACHE_DISCONNECTED--great, we always prefer that anyway. Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/dcache.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 5c5f3bd9af5..85a2aad3dcb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -765,12 +765,9 @@ again: alias = discon_alias; spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) { - __dget_dlock(alias); - spin_unlock(&alias->d_lock); - return alias; - } + __dget_dlock(alias); + spin_unlock(&alias->d_lock); + return alias; } spin_unlock(&alias->d_lock); goto again; -- cgit v1.2.3-70-g09d2 From 9635389534a765c8357d988e53d323bf033dcdd0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Feb 2014 12:31:31 -0500 Subject: exportfs: update Exporting documentation Minor documentation updates: - refer to d_obtain_alias rather than d_alloc_anon - explain when to use d_splice_alias and when d_materialise_unique. - cut some details of d_splice_alias/d_materialise_unique implementation. Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- Documentation/filesystems/nfs/Exporting | 38 ++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting index e543b1a619c..c8f036a9b13 100644 --- a/Documentation/filesystems/nfs/Exporting +++ b/Documentation/filesystems/nfs/Exporting @@ -66,23 +66,31 @@ b/ A per-superblock list "s_anon" of dentries which are the roots of c/ Helper routines to allocate anonymous dentries, and to help attach loose directory dentries at lookup time. They are: - d_alloc_anon(inode) will return a dentry for the given inode. + d_obtain_alias(inode) will return a dentry for the given inode. If the inode already has a dentry, one of those is returned. If it doesn't, a new anonymous (IS_ROOT and DCACHE_DISCONNECTED) dentry is allocated and attached. In the case of a directory, care is taken that only one dentry can ever be attached. - d_splice_alias(inode, dentry) will make sure that there is a - dentry with the same name and parent as the given dentry, and - which refers to the given inode. - If the inode is a directory and already has a dentry, then that - dentry is d_moved over the given dentry. - If the passed dentry gets attached, care is taken that this is - mutually exclusive to a d_alloc_anon operation. - If the passed dentry is used, NULL is returned, else the used - dentry is returned. This corresponds to the calling pattern of - ->lookup. - + d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode) + will introduce a new dentry into the tree; either the passed-in + dentry or a preexisting alias for the given inode (such as an + anonymous one created by d_obtain_alias), if appropriate. The two + functions differ in their handling of directories with preexisting + aliases: + d_splice_alias will use any existing IS_ROOT dentry, but it will + return -EIO rather than try to move a dentry with a different + parent. This is appropriate for local filesystems, which + should never see such an alias unless the filesystem is + corrupted somehow (for example, if two on-disk directory + entries refer to the same directory.) + d_materialise_unique will attempt to move any dentry. This is + appropriate for distributed filesystems, where finding a + directory other than where we last cached it may be a normal + consequence of concurrent operations on other hosts. + Both functions return NULL when the passed-in dentry is used, + following the calling convention of ->lookup. + Filesystem Issues ----------------- @@ -120,12 +128,12 @@ struct which has the following members: fh_to_dentry (mandatory) Given a filehandle fragment, this should find the implied object and - create a dentry for it (possibly with d_alloc_anon). + create a dentry for it (possibly with d_obtain_alias). fh_to_parent (optional but strongly recommended) Given a filehandle fragment, this should find the parent of the - implied object and create a dentry for it (possibly with d_alloc_anon). - May fail if the filehandle fragment is too small. + implied object and create a dentry for it (possibly with + d_obtain_alias). May fail if the filehandle fragment is too small. get_parent (optional but strongly recommended) When given a dentry for a directory, this should return a dentry for -- cgit v1.2.3-70-g09d2 From 95ad5c291313b66a98a44dc92b57e0b37c1dd589 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Mar 2014 12:19:23 -0400 Subject: dcache: d_splice_alias should detect loops I believe this can only happen in the case of a corrupted filesystem. So -EIO looks like the appropriate error. Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/dcache.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 85a2aad3dcb..ad137005cda 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2706,6 +2706,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) dput(new); return ERR_PTR(-EIO); } + if (d_ancestor(new, dentry)) { + spin_unlock(&inode->i_lock); + dput(new); + return ERR_PTR(-EIO); + } write_seqlock(&rename_lock); __d_materialise_dentry(dentry, new); write_sequnlock(&rename_lock); -- cgit v1.2.3-70-g09d2 From 49c7dd287adffc972e6dd6cf7011d63c7c5c2e10 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 31 Jul 2014 17:59:02 -0400 Subject: fs: mark __d_obtain_alias static Signed-off-by: Fengguang Wu Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index ad137005cda..d30ce699ae4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1775,7 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode) } EXPORT_SYMBOL(d_find_any_alias); -struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) +static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) { static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; -- cgit v1.2.3-70-g09d2 From c7f3888ad7f0932a87fb76e6e4edff2a90cc7920 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Jun 2014 20:34:33 -0400 Subject: switch iov_iter_get_pages() to passing maximal number of pages ... instead of maximal size. Signed-off-by: Al Viro --- fs/direct-io.c | 2 +- fs/fuse/file.c | 4 ++-- include/linux/uio.h | 2 +- mm/iov_iter.c | 17 ++++++++--------- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 17e39b047de..c3116404ab4 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { ssize_t ret; - ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, + ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES, &sdio->from); if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 40ac2628ddc..912061ac4ba 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, while (nbytes < *nbytesp && req->num_pages < req->max_pages) { unsigned npages; size_t start; - unsigned n = req->max_pages - req->num_pages; ssize_t ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], - n * PAGE_SIZE, &start); + req->max_pages - req->num_pages, + &start); if (ret < 0) return ret; diff --git a/include/linux/uio.h b/include/linux/uio.h index 09a7cffc224..48d64e6ab29 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -84,7 +84,7 @@ unsigned long iov_iter_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, - size_t maxsize, size_t *start); + unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 7b5dbd1517b..ab88dc0ea1d 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction, EXPORT_SYMBOL(iov_iter_init); static ssize_t get_pages_iovec(struct iov_iter *i, - struct page **pages, size_t maxsize, + struct page **pages, unsigned maxpages, size_t *start) { size_t offset = i->iov_offset; @@ -323,10 +323,10 @@ static ssize_t get_pages_iovec(struct iov_iter *i, len = iov->iov_len - offset; if (len > i->count) len = i->count; - if (len > maxsize) - len = maxsize; addr = (unsigned long)iov->iov_base + offset; len += *start = addr & (PAGE_SIZE - 1); + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = (len + PAGE_SIZE - 1) / PAGE_SIZE; res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); @@ -588,15 +588,14 @@ static unsigned long alignment_bvec(const struct iov_iter *i) } static ssize_t get_pages_bvec(struct iov_iter *i, - struct page **pages, size_t maxsize, + struct page **pages, unsigned maxpages, size_t *start) { const struct bio_vec *bvec = i->bvec; size_t len = bvec->bv_len - i->iov_offset; if (len > i->count) len = i->count; - if (len > maxsize) - len = maxsize; + /* can't be more than PAGE_SIZE */ *start = bvec->bv_offset + i->iov_offset; get_page(*pages = bvec->bv_page); @@ -712,13 +711,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) EXPORT_SYMBOL(iov_iter_alignment); ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, size_t maxsize, + struct page **pages, unsigned maxpages, size_t *start) { if (i->type & ITER_BVEC) - return get_pages_bvec(i, pages, maxsize, start); + return get_pages_bvec(i, pages, maxpages, start); else - return get_pages_iovec(i, pages, maxsize, start); + return get_pages_iovec(i, pages, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages); -- cgit v1.2.3-70-g09d2 From 60bb45297f7551833346c5cebc6d483ea17ea5f2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Aug 2014 12:39:16 -0400 Subject: __generic_file_write_iter(): fix handling of sync error after DIO If DIO results in short write and sync write fails, we want to bugger off whether the DIO part has written anything or not; the logics on the return will take care of the right return value. Cc: stable@vger.kernel.org [3.16] Reported-by: Anton Altaparmakov Signed-off-by: Al Viro --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 900edfaf6df..8163e043949 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2584,7 +2584,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (unlikely(status < 0) && !written) { + if (unlikely(status < 0)) { err = status; goto out; } -- cgit v1.2.3-70-g09d2 From 12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Aug 2014 03:44:55 -0400 Subject: fix copy_tree() regression Since 3.14 we had copy_tree() get the shadowing wrong - if we had one vfsmount shadowing another (i.e. if A is a slave of B, C is mounted on A/foo, then D got mounted on B/foo creating D' on A/foo shadowed by C), copy_tree() of A would make a copy of D' shadow the the copy of C, not the other way around. It's easy to fix, fortunately - just make sure that mount follows the one that shadows it in mnt_child as well as in mnt_hash, and when copy_tree() decides to attach a new mount, check if the last child it has added to the same parent should be shadowing the new one. And if it should, just use the same logics commit_tree() has - put the new mount into the hash and children lists right after the one that should shadow it. Cc: stable@vger.kernel.org [3.14 and later] Signed-off-by: Al Viro --- fs/namespace.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 65af9d0e0d6..be3f6f23a47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -778,6 +778,20 @@ static void attach_mnt(struct mount *mnt, list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } +static void attach_shadowed(struct mount *mnt, + struct mount *parent, + struct mount *shadows) +{ + if (shadows) { + hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + list_add(&mnt->mnt_child, &shadows->mnt_child); + } else { + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } +} + /* * vfsmount lock must be held for write */ @@ -796,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); - if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); - else - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -1474,6 +1483,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { + struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1495,7 +1505,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + mnt_set_mountpoint(parent, p->mnt_mp, q); + if (!list_empty(&parent->mnt_mounts)) { + t = list_last_entry(&parent->mnt_mounts, + struct mount, mnt_child); + if (t->mnt_mp != p->mnt_mp) + t = NULL; + } + attach_shadowed(q, parent, t); unlock_mount_hash(); } } -- cgit v1.2.3-70-g09d2